
pixano_inference.transformers.clip

CLIP(pretrained_model='openai/clip-vit-base-patch32', model_id='')

Bases: InferenceModel

CLIP: Connecting text and images

Attributes:

| Name | Type | Description |
|------|------|-------------|
| name | str | Model name |
| model_id | str | Model ID |
| device | str | Model GPU or CPU device (e.g. "cuda", "cpu") |
| description | str | Model description |
| model | CLIPModel | CLIP model |
| processor | CLIPProcessor | CLIP processor |
| tokenizer | CLIPTokenizerFast | CLIP tokenizer |
| pretrained_model | str | Pretrained model name or path |

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| pretrained_model | str | Pretrained model name or path | 'openai/clip-vit-base-patch32' |
| model_id | str | Previously used ID, generate new ID if "". Defaults to "". | '' |
Source code in pixano_inference/transformers/clip.py
def __init__(
    self,
    pretrained_model: str = "openai/clip-vit-base-patch32",
    model_id: str = "",
) -> None:
    """Initialize model

    Args:
        pretrained_model (str): Pretrained model name or path
        model_id (str, optional): Previously used ID, generate new ID if "". Defaults to "".
    """

    super().__init__(
        name="CLIP",
        model_id=model_id,
        device="cpu",
        description=f"From HuggingFace Transformers. CLIP: Connecting text and images. {pretrained_model}.",
    )

    # Model
    self.model = CLIPModel.from_pretrained(pretrained_model)
    self.processor = CLIPProcessor.from_pretrained(pretrained_model)
    self.tokenizer = CLIPTokenizerFast.from_pretrained(pretrained_model)

    # Model name or path
    self.pretrained_model = pretrained_model
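
As a minimal usage sketch (assuming the package is installed and the checkpoint can be downloaded from the Hugging Face Hub), the class can be instantiated with its default arguments:

from pixano_inference.transformers.clip import CLIP

# Load the default CLIP checkpoint; an empty model_id means a new ID is generated.
model = CLIP(pretrained_model="openai/clip-vit-base-patch32")

print(model.name)         # "CLIP"
print(model.device)       # "cpu", as set in __init__
print(model.description)  # mentions the pretrained checkpoint name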

precompute_embeddings(batch, views, uri_prefix)

Precompute embeddings for a batch

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| batch | RecordBatch | Input batch | required |
| views | list[str] | Dataset views | required |
| uri_prefix | str | URI prefix for media files | required |

Returns:

| Type | Description |
|------|-------------|
| list[dict] | Embedding rows |

Source code in pixano_inference/transformers/clip.py
def precompute_embeddings(
    self,
    batch: pa.RecordBatch,
    views: list[str],
    uri_prefix: str,
) -> list[dict]:
    """Embedding precomputing for a batch

    Args:
        batch (pa.RecordBatch): Input batch
        views (list[str]): Dataset views
        uri_prefix (str): URI prefix for media files

    Returns:
        list[dict]: Embedding rows
    """

    rows = [
        {
            "id": batch["id"][x].as_py(),
        }
        for x in range(batch.num_rows)
    ]

    for view in views:
        # Iterate manually
        for x in range(batch.num_rows):
            # Preprocess image
            im: Image = Image.from_dict(batch[view][x].as_py())
            im.uri_prefix = uri_prefix
            im = im.as_pillow()

            # Inference
            inputs = self.processor(images=im, padding=True, return_tensors="pt")
            image_features = self.model.get_image_features(**inputs)
            vect = image_features.detach().numpy()[0]

            # Process model outputs
            rows[x][view] = vect

    return rows
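
A hedged sketch of calling this method: the exact batch schema comes from a Pixano dataset (an "id" column plus one column per view holding Image dictionaries), so the batch, view name, and URI prefix below are placeholders rather than values prescribed by the API.

# `batch` is assumed to be a pa.RecordBatch read from a Pixano dataset,
# with an "id" column and an "image" view column.
rows = model.precompute_embeddings(
    batch=batch,
    views=["image"],                 # names of the dataset views to embed
    uri_prefix="/path/to/media",     # hypothetical root folder for media files
)

# One dict per batch row: the item id plus one embedding vector per view.
print(rows[0]["id"], rows[0]["image"].shape)  # e.g. (512,) for clip-vit-base-patch32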

semantic_search(query)

Process semantic search query with CLIP

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| query | str | Search query text | required |

Returns:

| Type | Description |
|------|-------------|
| ndarray | Search query vector |

Source code in pixano_inference/transformers/clip.py
def semantic_search(self, query: str) -> np.ndarray:
    """Process semantic search query with CLIP

    Args:
        query (str): Search query text

    Returns:
        np.ndarray: Search query vector
    """

    inputs = self.tokenizer([query], padding=True, return_tensors="pt")
    text_features = self.model.get_text_features(**inputs)

    return text_features.detach().numpy()[0]
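
The query vector returned here can be compared against image embeddings produced by precompute_embeddings, for instance with a cosine similarity ranking (a sketch using NumPy; image_embeddings is an assumed array of stacked image vectors, not part of this API):

import numpy as np

query_vector = model.semantic_search("a photo of a dog")

# image_embeddings: hypothetical (n_images, embedding_dim) array built from
# the vectors returned by precompute_embeddings.
scores = image_embeddings @ query_vector / (
    np.linalg.norm(image_embeddings, axis=1) * np.linalg.norm(query_vector)
)
best_match = int(np.argmax(scores))  # index of the most similar image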