Skip to content

pixano_inference.impls.transformers.grounding_dino

Grounding DINO zero-shot detection model.

GroundingDINOModel(config)

Bases: DetectionModel

Native Ray Serve model for Grounding DINO zero-shot detection.

model_params contract:

  • path (str, required): HuggingFace model ID or local checkpoint path.
  • processor_config (dict, optional): Kwargs for AutoProcessor.from_pretrained.
  • config (dict, optional): Kwargs for AutoModelForZeroShotObjectDetection.from_pretrained.

Parameters:

    config (ModelDeploymentConfig, required): Model deployment configuration.

Source code in pixano_inference/impls/transformers/grounding_dino.py
Source code in pixano_inference/impls/transformers/grounding_dino.py
def __init__(self, config: ModelDeploymentConfig) -> None:
    """Create the deployment wrapper without loading any weights.

    The heavy objects (HF model and processor) are created lazily by
    ``load_model`` and released by ``unload``.

    Args:
        config: Model deployment configuration.
    """
    super().__init__(config)
    # Populated by load_model(); reset to None by unload().
    self._model: Any = None
    self._processor: Any = None

metadata property

Model metadata.

load_model()

Load the Grounding DINO model and processor.

Source code in pixano_inference/impls/transformers/grounding_dino.py
def load_model(self) -> None:
    """Load the Grounding DINO model and processor.

    Reads ``model_params`` from the deployment configuration:

      * ``path`` (required): HuggingFace model ID or local checkpoint path.
      * ``processor_config`` (optional): kwargs for ``AutoProcessor.from_pretrained``.
      * ``config`` (optional): kwargs for
        ``AutoModelForZeroShotObjectDetection.from_pretrained``.

    Raises:
        ValueError: If ``path`` is missing from ``model_params``.
    """
    from pixano_inference.utils.package import assert_transformers_installed

    assert_transformers_installed()

    import torch
    from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

    params = dict(self._config.model_params)
    try:
        path = params.pop("path")
    except KeyError:
        # A bare KeyError from pop() is opaque to operators; state the contract.
        raise ValueError(
            "GroundingDINOModel requires 'path' in model_params "
            "(HuggingFace model ID or local checkpoint path)."
        ) from None
    processor_config = params.pop("processor_config", {})
    model_config = params.pop("config", {})
    if params:
        # Unknown keys are almost always typos; surface them instead of dropping silently.
        logger.warning(
            "GroundingDINOModel ignoring unknown model_params keys: %s", sorted(params)
        )

    device = resolve_device(self._config)

    self._processor = AutoProcessor.from_pretrained(path, **processor_config)
    self._model = AutoModelForZeroShotObjectDetection.from_pretrained(
        path, device_map=device, **model_config
    )
    # eval() disables dropout/batchnorm updates; torch.compile speeds up repeated inference.
    self._model = torch.compile(self._model.eval())

    logger.info("GroundingDINOModel '%s' loaded on %s", self.model_name, device)

predict(input)

Run zero-shot detection.

Parameters:

    input (DetectionInput, required): Detection input with image, classes, and thresholds.

Returns:

    DetectionOutput: Detection output with boxes, scores, and classes.

Raises:

    ValueError: If classes is not provided (GroundingDINO is open-vocabulary only).

Source code in pixano_inference/impls/transformers/grounding_dino.py
def predict(self, input: DetectionInput) -> DetectionOutput:
    """Run zero-shot detection.

    Args:
        input: Detection input with image, classes, and thresholds.

    Returns:
        Detection output with integer xyxy boxes, confidence scores, and
        the matched text labels.

    Raises:
        ValueError: If ``classes`` is not provided (GroundingDINO is open-vocab only).
    """
    import torch

    from pixano_inference.utils.media import convert_string_to_image

    if input.classes is None:
        raise ValueError("GroundingDINOModel requires 'classes' (open-vocabulary model).")

    pil_image = convert_string_to_image(input.image)
    classes = input.classes

    # Grounding DINO expects one text prompt with phrases separated by periods
    # (e.g. "a cat. a dog.").  The HF zero-shot detection docs recommend that
    # the prompt also *end* with a period, so append one if it is missing.
    # (classes is known non-None here; the previous None-handling branch was dead code.)
    classes_str = ". ".join(classes) if isinstance(classes, list) else classes
    if classes_str and not classes_str.rstrip().endswith("."):
        classes_str = f"{classes_str}."

    with torch.inference_mode():
        inputs = self._processor(images=pil_image, text=classes_str, return_tensors="pt").to(
            self._model.device
        )
        outputs = self._model(**inputs)

        # Processor convention: target_sizes entries are (height, width).
        target_size = (pil_image.height, pil_image.width)
        result = self._processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            box_threshold=input.box_threshold,
            text_threshold=input.text_threshold,
            target_sizes=[target_size],
        )[0]

    return DetectionOutput(
        boxes=[[int(round(coord, 0)) for coord in box.tolist()] for box in result["boxes"]],
        scores=result["scores"].tolist() if hasattr(result["scores"], "tolist") else result["scores"],
        classes=result["labels"],
    )

unload()

Free resources.

Source code in pixano_inference/impls/transformers/grounding_dino.py
def unload(self) -> None:
    """Free model and processor references and release cached GPU memory.

    Safe to call multiple times; also safe when torch is not installed or
    no CUDA device is present (cache clearing is best-effort).
    """
    if self._model is not None:
        del self._model
        self._model = None
    if self._processor is not None:
        del self._processor
        self._processor = None
    gc.collect()
    try:
        import torch

        # Only touch the CUDA allocator when CUDA is actually usable; avoids
        # needless CUDA interaction on CPU-only hosts.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except Exception:
        # Best-effort cleanup: a failure here must never fail the unload itself.
        pass