pixano_inference.providers.transformers

Provider for Hugging Face Transformers models.

TransformersProvider(*args, **kwargs)

Bases: ModelProvider

Provider for Hugging Face Transformers models.

Source code in pixano_inference/providers/transformers.py
def __init__(self, *args: Any, **kwargs: Any) -> None:
    """Initialize the transformer provider."""
    assert_transformers_installed()
    super().__init__(*args, **kwargs)
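
A minimal instantiation sketch (hypothetical, assuming `transformers` is installed and that the base `ModelProvider` requires no mandatory arguments):

from pixano_inference.providers.transformers import TransformersProvider

# Instantiating the provider only checks that `transformers` is importable;
# models are loaded later through `load_model`.
provider = TransformersProvider()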

image_mask_generation(request, model, *args, **kwargs)

Generate a mask from the image.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `request` | `ImageMaskGenerationRequest` | Request for the generation. | required |
| `model` | `TransformerModel` | Model to use for the generation. | required |
| `args` | `Any` | Additional arguments. | `()` |
| `kwargs` | `Any` | Additional keyword arguments. | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `ImageMaskGenerationOutput` | Output of the generation. |

Source code in pixano_inference/providers/transformers.py
def image_mask_generation(
    self,
    request: ImageMaskGenerationRequest,
    model: TransformerModel,  # type: ignore[override]
    *args: Any,
    **kwargs: Any,
) -> ImageMaskGenerationOutput:
    """Generate a mask from the image.

    Args:
        request: Request for the generation.
        model: Model to use for the generation.
        args: Additional arguments.
        kwargs: Additional keyword arguments.

    Returns:
        Output of the generation.
    """
    request_input = request.to_input()
    image = convert_string_to_image(request_input.image)

    image_embedding = (
        vector_to_tensor(request_input.image_embedding) if request_input.image_embedding is not None else None
    )

    model_input = request_input.model_dump(exclude=["image", "image_embedding"])
    model_input["image"] = image
    model_input["image_embedding"] = image_embedding
    output = model.image_mask_generation(**model_input)
    return output
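
A hedged call sketch (the request's constructor fields are assumptions inferred from the attributes read above; `provider` and `model` come from an earlier `load_model` call):

request = ImageMaskGenerationRequest(
    image="https://example.com/cat.jpg",  # decoded with convert_string_to_image
    image_embedding=None,  # optionally, a precomputed embedding vector
)
output = provider.image_mask_generation(request, model)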

image_zero_shot_detection(request, model, *args, **kwargs)

Perform zero-shot image detection.

Source code in pixano_inference/providers/transformers.py
def image_zero_shot_detection(
    self,
    request: ImageZeroShotDetectionRequest,
    model: TransformerModel,  # type: ignore[override]
    *args: Any,
    **kwargs: Any,
) -> ImageZeroShotDetectionOutput:
    """Perform zero-shot image detection."""
    request_input = request.to_input()

    image = convert_string_to_image(request_input.image)
    classes = request.classes
    if isinstance(classes, list):
        classes = ". ".join(classes)

    model_input = request_input.model_dump(exclude=["image", "classes"])
    model_input["image"] = image
    model_input["classes"] = classes

    output = model.image_zero_shot_detection(**model_input)
    return output
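
The ". "-joined class string follows the text-prompt convention of grounded open-vocabulary detectors such as Grounding DINO. A sketch of a call (constructor fields are assumptions inferred from the attributes read above):

request = ImageZeroShotDetectionRequest(
    image="https://example.com/street.jpg",
    classes=["car", "bicycle", "traffic light"],  # joined into "car. bicycle. traffic light"
)
output = provider.image_zero_shot_detection(request, model)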

load_model(name, task, device, path=None, processor_config={}, config={})

Load a model from transformers.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `name` | `str` | Name of the model. | required |
| `task` | `Task \| str` | Task of the model. | required |
| `device` | `torch.device` | Device to use for the model. | required |
| `path` | `Path \| str \| None` | Path to the model or its Hugging Face Hub identifier. | `None` |
| `processor_config` | `dict` | Configuration for the processor. | `{}` |
| `config` | `dict` | Configuration for the model. | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `TransformerModel` | Loaded model. |

Source code in pixano_inference/providers/transformers.py
def load_model(
    self,
    name: str,
    task: Task | str,
    device: "torch.device",
    path: Path | str | None = None,
    processor_config: dict = {},
    config: dict = {},
) -> TransformerModel:
    """Load a model from transformers.

    Args:
        name: Name of the model.
        task: Task of the model.
        device: Device to use for the model.
        path: Path to the model or its Hugging Face Hub identifier.
        processor_config: Configuration for the processor.
        config: Configuration for the model.

    Returns:
        Loaded model.
    """
    if path is None:
        raise ValueError("Path is required to load a model from transformers.")
    if isinstance(task, str):
        task = str_to_task(task)
    processor = AutoProcessor.from_pretrained(path, **processor_config)

    # Replace a plain quantization dict with a BitsAndBytesConfig instance
    # so that `from_pretrained` applies bitsandbytes quantization.
    if (quantization_config := config.pop("quantization_config", None)) is not None:
        quantization_config = BitsAndBytesConfig(**quantization_config)
        config["quantization_config"] = quantization_config

    model = get_transformer_automodel_from_pretrained(path, task, device_map=device, **config)
    if model is None:
        if task in [NLPTask.CONDITONAL_GENERATION, MultimodalImageNLPTask.CONDITIONAL_GENERATION]:
            model = get_conditional_generation_transformer_from_pretrained(name, path, device_map=device, **config)
        else:
            raise ValueError(f"Task {task} is not supported by the transformers provider.")

    model = model.eval()
    model = torch.compile(model)

    our_model = TransformerModel(name, path, processor, model)
    return our_model
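
A hedged sketch of a full load (the task enum and its import path are assumptions; "facebook/sam-vit-base" is a real Hub checkpoint):

import torch

from pixano_inference.providers.transformers import TransformersProvider
from pixano_inference.tasks import ImageTask  # hypothetical import path

provider = TransformersProvider()
model = provider.load_model(
    name="sam",
    task=ImageTask.MASK_GENERATION,
    device=torch.device("cuda"),
    path="facebook/sam-vit-base",
    # Optional: a plain dict here is converted to a BitsAndBytesConfig above.
    config={"quantization_config": {"load_in_4bit": True}},
)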

text_image_conditional_generation(request, model, *args, **kwargs)

Generate text from an image and a prompt.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `request` | `TextImageConditionalGenerationRequest` | Request for text-image conditional generation. | required |
| `model` | `TransformerModel` | Model for text-image conditional generation. | required |
| `args` | `Any` | Additional arguments. | `()` |
| `kwargs` | `Any` | Additional keyword arguments. | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `TextImageConditionalGenerationOutput` | Output of text-image conditional generation. |

Source code in pixano_inference/providers/transformers.py
def text_image_conditional_generation(
    self,
    request: TextImageConditionalGenerationRequest,
    model: TransformerModel,  # type: ignore[override]
    *args: Any,
    **kwargs: Any,
) -> TextImageConditionalGenerationOutput:
    """Generate text from an image and a prompt.

    Args:
        request: Request for text-image conditional generation.
        model: Model for text-image conditional generation.
        args: Additional arguments.
        kwargs: Additional keyword arguments.

    Returns:
        Output of text-image conditional generation.
    """
    model_input = request.to_input()

    images: list[Image] | None
    if model_input.images is None:
        if isinstance(model_input.prompt, str):
            raise ValueError("Images must be provided if the prompt is a string.")
        images = []
        for message in model_input.prompt:
            new_content = []
            for content in message["content"]:
                if content["type"] == "image_url":
                    images.append(convert_string_to_image(content["image_url"]["url"]))
                    new_content.append({"type": "image"})
                else:
                    new_content.append(content)
            message["content"] = new_content

    else:
        images = (
            [convert_string_to_image(image) for image in model_input.images]
            if len(model_input.images) > 0
            else None
        )

    model_input_dump = model_input.model_dump()
    model_input_dump["images"] = images
    output = model.text_image_conditional_generation(**model_input_dump)
    return output
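
The chat-style prompt the method rewrites is visible in the loop above; a sketch (the URL is a placeholder):

prompt = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}},
            {"type": "text", "text": "Describe the image."},
        ],
    }
]
# Each "image_url" entry is decoded with convert_string_to_image and replaced
# by a bare {"type": "image"} placeholder before the model consumes the prompt.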

get_conditional_generation_transformer_from_pretrained(name, path, **model_kwargs)

Get a conditional generation transformer from transformers by dispatching on the model name.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `name` | `str` | Name of the model. | required |
| `path` | `Path \| str \| None` | Path to the model or its Hugging Face Hub identifier. | required |
| `model_kwargs` | `Any` | Additional keyword arguments for the model. | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `PreTrainedModel` | Model from Transformers. |

Source code in pixano_inference/providers/transformers.py
def get_conditional_generation_transformer_from_pretrained(
    name: str, path: Path | str | None, **model_kwargs: Any
) -> PreTrainedModel:
    """Get a transformer model from transformers using automodel.

    Args:
        name: Name of the model.
        path: Path to the model or its Hugging Face Hub identifier.
        model_kwargs: Additional keyword arguments for the model.

    Returns:
        Model from Transformers.
    """
    name = name.lower()
    if "llava" in name:
        if "next" in name:
            if "video" in name:
                from transformers import LlavaNextVideoForConditionalGeneration

                model = LlavaNextVideoForConditionalGeneration.from_pretrained(path, **model_kwargs)
            else:
                from transformers import LlavaNextForConditionalGeneration

                model = LlavaNextForConditionalGeneration.from_pretrained(path, **model_kwargs)
        else:
            from transformers import LlavaForConditionalGeneration

            model = LlavaForConditionalGeneration.from_pretrained(path, **model_kwargs)
    else:
        raise ValueError(f"Model {name} not supported.")
    return model
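
A short illustration of the name-based dispatch ("llava-hf/llava-v1.6-mistral-7b-hf" is a real Hub checkpoint; the dtype kwarg is optional):

# "llava" + "next" + "video" in the name -> LlavaNextVideoForConditionalGeneration
# "llava" + "next" in the name           -> LlavaNextForConditionalGeneration
# any other "llava" name                 -> LlavaForConditionalGeneration
model = get_conditional_generation_transformer_from_pretrained(
    name="llava-next",
    path="llava-hf/llava-v1.6-mistral-7b-hf",
    torch_dtype="float16",
)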

get_transformer_automodel_from_pretrained(pretrained_model_name_or_path, task, **model_kwargs)

Get a transformer model from transformers using the task-appropriate AutoModel class.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pretrained_model_name_or_path` | `str \| Path` | Name or path of the pretrained model. | required |
| `task` | `Task` | Task of the model. | required |
| `model_kwargs` | `Any` | Additional keyword arguments for the model. | `{}` |

Returns:

| Type | Description |
| --- | --- |
| `PreTrainedModel \| None` | Model from Transformers, or `None` if no AutoModel class matches the task. |

Source code in pixano_inference/providers/transformers.py
def get_transformer_automodel_from_pretrained(
    pretrained_model_name_or_path: str | Path, task: Task, **model_kwargs: Any
):
    """Get a transformer model from transformers using automodel.

    Args:
        pretrained_model_name_or_path: Name or path of the pretrained model.
        task: Task of the model.
        model_kwargs: Additional keyword arguments for the model.
    """
    assert_transformers_installed()
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    if isinstance(task, ImageTask):
        match task:
            case ImageTask.CLASSIFICATION:
                from transformers import AutoModelForImageClassification

                return AutoModelForImageClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case ImageTask.DEPTH_ESTIMATION:
                from transformers import AutoModelForDepthEstimation

                return AutoModelForDepthEstimation.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case ImageTask.FEATURE_EXTRACTION:
                from transformers import AutoModel

                return AutoModel.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case ImageTask.KEYPOINT_DETECTION:
                from transformers import AutoModelForKeypointDetection

                return AutoModelForKeypointDetection.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case ImageTask.MASK_GENERATION:
                from transformers import AutoModelForMaskGeneration

                return AutoModelForMaskGeneration.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case ImageTask.OBJECT_DETECTION:
                from transformers import AutoModelForObjectDetection

                return AutoModelForObjectDetection.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case ImageTask.SEMANTIC_SEGMENTATION:
                from transformers import AutoModelForSemanticSegmentation

                return AutoModelForSemanticSegmentation.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case ImageTask.INSTANCE_SEGMENTATION:
                from transformers import AutoModelForInstanceSegmentation

                return AutoModelForInstanceSegmentation.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case ImageTask.UNIVERSAL_SEGMENTATION:
                from transformers import AutoModelForUniversalSegmentation

                return AutoModelForUniversalSegmentation.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case ImageTask.ZERO_SHOT_CLASSIFICATION:
                from transformers import AutoModelForZeroShotImageClassification

                return AutoModelForZeroShotImageClassification.from_pretrained(
                    pretrained_model_name_or_path, **model_kwargs
                )
            case ImageTask.ZERO_SHOT_DETECTION:
                from transformers import AutoModelForZeroShotObjectDetection

                return AutoModelForZeroShotObjectDetection.from_pretrained(
                    pretrained_model_name_or_path, **model_kwargs
                )
    elif isinstance(task, NLPTask):
        match task:
            case NLPTask.CAUSAL_LM:
                from transformers import AutoModelForCausalLM

                return AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case NLPTask.MASKED_LM:
                from transformers import AutoModelForMaskedLM

                return AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case NLPTask.MASK_GENERATION:
                from transformers import AutoModelForMaskGeneration

                return AutoModelForMaskGeneration.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case NLPTask.SEQUENCE_CLASSIFICATION:
                from transformers import AutoModelForSequenceClassification

                return AutoModelForSequenceClassification.from_pretrained(
                    pretrained_model_name_or_path, **model_kwargs
                )
            case NLPTask.MULTIPLE_CHOICE:
                from transformers import AutoModelForMultipleChoice

                return AutoModelForMultipleChoice.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case NLPTask.NEXT_SENTENCE_PREDICTION:
                from transformers import AutoModelForNextSentencePrediction

                return AutoModelForNextSentencePrediction.from_pretrained(
                    pretrained_model_name_or_path, **model_kwargs
                )
            case NLPTask.TOKEN_CLASSIFICATION:
                from transformers import AutoModelForTokenClassification

                return AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case NLPTask.QUESTION_ANSWERING:
                from transformers import AutoModelForQuestionAnswering

                return AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case NLPTask.TEXT_ENCODING:
                from transformers import AutoModelForTextEncoding

                return AutoModelForTextEncoding.from_pretrained(pretrained_model_name_or_path, **model_kwargs)
            case _:
                raise ValueError(f"Task {task} not supported.")
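
A minimal sketch (the task enum import path is an assumption; "facebook/detr-resnet-50" is a real Hub checkpoint for DETR object detection):

from pixano_inference.tasks import ImageTask  # hypothetical import path

detector = get_transformer_automodel_from_pretrained(
    "facebook/detr-resnet-50",
    ImageTask.OBJECT_DETECTION,
)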