
pixano_inference.models.transformers

Inference models for Transformers.

TransformerModel(name, path, processor, model)

Bases: BaseInferenceModel

Inference model for transformers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `name` | `str` | Name of the model. | *required* |
| `path` | `Path \| str` | Path to the model or its Hugging Face Hub identifier. | *required* |
| `processor` | `'ProcessorMixin'` | Processor for the model. | *required* |
| `model` | `'PreTrainedModel'` | Model for the inference. | *required* |
Source code in pixano_inference/models/transformers.py
def __init__(self, name: str, path: Path | str, processor: "ProcessorMixin", model: "PreTrainedModel"):
    """Initialize the model.

    Args:
        name: Name of the model.
        path: Path to the model or its Hugging Face Hub identifier.
        processor: Processor for the model.
        model: Model for the inference.
    """
    assert_transformers_installed()

    super().__init__(name, provider="transformers")
    self.processor = processor
    self.path = path
    self.model = model.eval()
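
A minimal instantiation sketch, assuming a SAM checkpoint from the Hugging Face Hub; the `facebook/sam-vit-base` identifier and the variable names are illustrative assumptions, not part of this API:

```python
# Sketch: wrap an assumed SAM processor/model pair in a TransformerModel.
from transformers import SamModel, SamProcessor

from pixano_inference.models.transformers import TransformerModel

path = "facebook/sam-vit-base"  # assumed checkpoint, for illustration only
sam = TransformerModel(
    name="sam",
    path=path,
    processor=SamProcessor.from_pretrained(path),
    model=SamModel.from_pretrained(path),
)
```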

metadata property

Return the metadata of the model.

delete()

Delete the model.

Source code in pixano_inference/models/transformers.py
def delete(self):
    """Delete the model."""
    del self.model
    del self.processor
    gc.collect()
    torch.cuda.empty_cache()

image_mask_generation(image, image_embedding=None, points=None, labels=None, boxes=None, num_multimask_outputs=3, multimask_output=True, return_image_embedding=False, **kwargs)

Generate a mask from the image.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `image` | `'Tensor' \| Image` | Image for the generation. | *required* |
| `image_embedding` | `'Tensor' \| None` | Image embeddings for the generation. | `None` |
| `points` | `list[list[list[int]]] \| None` | Points for the mask generation. The first dimension is the number of prompts, the second the number of points per mask, and the third the coordinates of the points. | `None` |
| `labels` | `list[list[int]] \| None` | Labels for the mask generation. The first dimension is the number of prompts, the second the number of labels per mask. | `None` |
| `boxes` | `list[list[int]] \| None` | Boxes for the mask generation. The first dimension is the number of prompts, the second the coordinates of the boxes. | `None` |
| `num_multimask_outputs` | `int` | Number of masks to generate per prediction. | `3` |
| `multimask_output` | `bool` | Whether to generate multiple masks per prediction. | `True` |
| `return_image_embedding` | `bool` | Whether to return the image embedding. | `False` |
| `kwargs` | `Any` | Additional keyword arguments. | `{}` |
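To make the nesting of the prompt arguments concrete, here is an illustrative sketch; the coordinate values are invented, and the 1 = foreground / 0 = background label semantics follow SAM's point-prompt convention:

```python
# Illustrative prompt shapes (values are made up):
points = [[[450, 600], [500, 650]]]  # 1 prompt, 2 points, each as (x, y)
labels = [[1, 0]]                    # 1 prompt, 2 labels (SAM convention: 1 = foreground, 0 = background)
boxes = [[75, 275, 1725, 850]]       # 1 prompt, one box as (x_min, y_min, x_max, y_max)
```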
Source code in pixano_inference/models/transformers.py
def image_mask_generation(
    self,
    image: "Tensor" | Image,
    image_embedding: "Tensor" | None = None,
    points: list[list[list[int]]] | None = None,
    labels: list[list[int]] | None = None,
    boxes: list[list[int]] | None = None,
    num_multimask_outputs: int = 3,
    multimask_output: bool = True,
    return_image_embedding: bool = False,
    **kwargs: Any,
) -> ImageMaskGenerationOutput:
    """Generate a mask from the image.

    Args:
        image: Image for the generation.
        image_embedding: Image embeddings for the generation.
        points: Points for the mask generation. The first dimension is the number of prompts, the
            second the number of points per mask, and the third the coordinates of the points.
        labels: Labels for the mask generation. The first dimension is the number of prompts, the second
            the number of labels per mask.
        boxes: Boxes for the mask generation. The first dimension is the number of prompts, the second
            the coordinates of the boxes.
        num_multimask_outputs: Number of masks to generate per prediction.
        multimask_output: Whether to generate multiple masks per prediction.
        return_image_embedding: Whether to return the image embedding.
        kwargs: Additional keyword arguments.
    """
    with torch.inference_mode():
        inputs = self.processor(
            image,
            input_points=[points] if points is not None else None,
            input_boxes=[boxes] if boxes is not None else None,
            input_labels=[labels] if labels is not None else None,
            return_tensors="pt",
        ).to(self.model.device, dtype=self.model.dtype)

        if return_image_embedding:
            if image_embedding is None:  # Compute image embeddings if not provided
                image_embedding = self.model.get_image_embeddings(inputs["pixel_values"])

        if image_embedding is not None:
            if image_embedding.ndim == 3:
                image_embedding = image_embedding.unsqueeze(0)
            inputs.pop("pixel_values", None)
            inputs.update({"image_embeddings": image_embedding.to(self.model.device, dtype=self.model.dtype)})

        outputs = self.model(
            **inputs, num_multimask_outputs=num_multimask_outputs, multimask_output=multimask_output, **kwargs
        )

        masks = (
            self.processor.image_processor.post_process_masks(
                outputs.pred_masks, inputs["original_sizes"], inputs["reshaped_input_sizes"]
            )
        )[0].cpu()
        return ImageMaskGenerationOutput(
            masks=[
                [CompressedRLE(**encode_mask_to_rle(mask)) for mask in prediction_masks]
                for prediction_masks in masks
            ],
            scores=NDArrayFloat.from_torch(outputs.iou_scores[0].cpu()),
            image_embedding=(
                NDArrayFloat.from_torch(image_embedding[0].cpu()) if return_image_embedding else None
            ),
        )
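
A minimal usage sketch, assuming the SAM-wrapped `sam` instance from the constructor example above; the image path and point coordinates are hypothetical:

```python
from PIL import Image

image = Image.open("example.jpg")  # hypothetical input image

output = sam.image_mask_generation(
    image,
    points=[[[450, 600]]],  # one prompt with a single point
    labels=[[1]],           # foreground label for that point
    return_image_embedding=True,
)
rle_masks = output.masks[0]         # CompressedRLE masks for the first prompt
iou_scores = output.scores          # IoU score per mask (NDArrayFloat)
embedding = output.image_embedding  # set because return_image_embedding=True
```

Keeping the returned embedding lets subsequent prompts on the same image skip the image encoder: convert it back to a tensor and pass it via `image_embedding`.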

image_zero_shot_detection(image, classes, box_threshold, text_threshold, **kwargs)

Perform zero-shot detection on an image.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `image` | `'Tensor' \| Image` | The image. | *required* |
| `classes` | `str` | The list of classes to detect in the format 'class1. class2'. | *required* |
| `box_threshold` | `float` | The threshold for bounding box detection. | *required* |
| `text_threshold` | `float` | The threshold for class identification during the zero-shot learning phase. | *required* |
| `kwargs` | `Any` | Additional arguments. | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `ImageZeroShotDetectionOutput` | The output of the image zero-shot detection task. |

Source code in pixano_inference/models/transformers.py
def image_zero_shot_detection(
    self,
    image: "Tensor" | Image,
    classes: str,
    box_threshold: float,
    text_threshold: float,
    **kwargs: Any,
) -> ImageZeroShotDetectionOutput:
    """Perform zero shot detection on an image.

    Args:
        image: The image.
        classes: The list of classes to detect in the format 'class1. class2'.
        box_threshold: The threshold for bounding boxes detection.
        text_threshold: The threshold for class identification during the zero-shot learning phase.
        kwargs: Additional arguments.

    Returns:
        The output of image zero-shot detection task.
    """
    with torch.inference_mode():
        inputs = self.processor(images=image, text=classes, return_tensors="pt").to(self.model.device)

        outputs = self.model(**inputs)

        target_size = (
            (image.shape[-2], image.shape[-1]) if isinstance(image, torch.Tensor) else (image.height, image.width)
        )

        result = self.processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            box_threshold=box_threshold,
            text_threshold=text_threshold,
            target_sizes=[target_size],
        )[0]

        return ImageZeroShotDetectionOutput(
            boxes=[[int(round(x, 0)) for x in box.tolist()] for box in result["boxes"]],
            scores=result["scores"],
            classes=result["labels"],
        )
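
A minimal usage sketch, assuming a Grounding-DINO-style checkpoint; the `IDEA-Research/grounding-dino-tiny` identifier, image path, and threshold values are illustrative assumptions:

```python
from PIL import Image
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

from pixano_inference.models.transformers import TransformerModel

path = "IDEA-Research/grounding-dino-tiny"  # assumed checkpoint
detector = TransformerModel(
    name="grounding-dino",
    path=path,
    processor=AutoProcessor.from_pretrained(path),
    model=AutoModelForZeroShotObjectDetection.from_pretrained(path),
)

result = detector.image_zero_shot_detection(
    Image.open("street.jpg"),        # hypothetical image
    classes="a car. a pedestrian.",  # classes separated by '. '
    box_threshold=0.4,
    text_threshold=0.3,
)
print(result.boxes, result.scores, result.classes)
```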

text_image_conditional_generation(prompt, images, generation_config=None, **kwargs)

Generate text from an image and a prompt.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `prompt` | `str \| list[dict[str, Any]]` | Prompt for the generation. | *required* |
| `images` | `list['Tensor']` | Images for the generation. | *required* |
| `generation_config` | `'GenerationConfig' \| None` | Configuration for the generation as Hugging Face's GenerationConfig. | `None` |
| `kwargs` | `Any` | Additional keyword arguments. | `{}` |
Source code in pixano_inference/models/transformers.py
def text_image_conditional_generation(
    self,
    prompt: str | list[dict[str, Any]],
    images: list["Tensor"],
    generation_config: "GenerationConfig" | None = None,
    **kwargs: Any,
) -> TextImageConditionalGenerationOutput:
    """Generate text from an image and a prompt.

    Args:
        prompt: Prompt for the generation.
        images: Images for the generation.
        generation_config: Configuration for the generation as Hugging Face's GenerationConfig.
        kwargs: Additional keyword arguments.
    """
    with torch.inference_mode():
        if generation_config is None:
            generation_config = GenerationConfig()

        generation_config = self._fill_generation_config(generation_config, **kwargs)

        if isinstance(prompt, list):
            prompt = self.processor.apply_chat_template(prompt, add_generation_prompt=True)

        inputs = self.processor(prompt, images, return_tensors="pt").to(self.model.device)
        generate_ids = self.model.generate(**inputs, generation_config=generation_config)

        total_tokens: int = generate_ids.shape[1]
        prompt_tokens: int = inputs["input_ids"].shape[1]
        completion_tokens: int = total_tokens - prompt_tokens

        output = self.processor.decode(
            generate_ids[0, prompt_tokens:], skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        return TextImageConditionalGenerationOutput(
            generated_text=output,
            usage=UsageConditionalGeneration(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
            ),
            generation_config=generation_config.to_diff_dict(),
        )
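
A minimal usage sketch, assuming a LLaVA-style checkpoint; the `llava-hf/llava-1.5-7b-hf` identifier and the chat message layout are illustrative assumptions, and the exact message format depends on the model's chat template:

```python
from torchvision.io import read_image
from transformers import AutoProcessor, GenerationConfig, LlavaForConditionalGeneration

from pixano_inference.models.transformers import TransformerModel

path = "llava-hf/llava-1.5-7b-hf"  # assumed checkpoint
vlm = TransformerModel(
    name="llava",
    path=path,
    processor=AutoProcessor.from_pretrained(path),
    model=LlavaForConditionalGeneration.from_pretrained(path),
)

# Chat-style prompt: rendered through the processor's chat template
# because it is passed as a list of messages rather than a string.
messages = [
    {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}],
    }
]
output = vlm.text_image_conditional_generation(
    prompt=messages,
    images=[read_image("example.jpg")],  # hypothetical image, loaded as a tensor
    generation_config=GenerationConfig(max_new_tokens=64),
)
print(output.generated_text)
print(output.usage.total_tokens)
```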