pixano.data.dataset.dataset_item

DatasetItem(**data)

Bases: BaseModel

DatasetItem: an item of a Pixano dataset, grouping its features, views, objects, and embeddings.

Attributes:

    Name         Type                       Description
    ----         ----                       -----------
    id           str                        Item ID
    original_id  str                        Item original ID
    split        str                        Item split
    features     dict[str, ItemFeature]     Item features
    views        dict[str, ItemView]        Item views
    objects      dict[str, ItemObject]      Item objects
    embeddings   dict[str, ItemEmbedding]   Item embeddings

Raises ValidationError if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Source code in pydantic/main.py
def __init__(self, /, **data: Any) -> None:
    """Create a new model by parsing and validating input data from keyword arguments.

    Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be
    validated to form a valid model.

    `self` is explicitly positional-only to allow `self` as a field name.
    """
    # `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks
    __tracebackhide__ = True
    validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)
    if self is not validated_self:
        warnings.warn(
            'A custom validator is returning a value other than `self`.\n'
            "Returning anything other than `self` from a top level model validator isn't supported when validating via `__init__`.\n"
            'See the `model_validator` docs (https://docs.pydantic.dev/latest/concepts/validators/#model-validators) for more details.',
            category=None,
        )
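Example: constructing a DatasetItem directly. A minimal sketch: the feature name "category" and its value are hypothetical, and the ItemFeature constructor is assumed to accept the name, dtype, and value fields that to_pyarrow() reads below. Passing only id and split is grounded in from_pyarrow(), which constructs the item the same way.

from pixano.data.dataset.dataset_item import DatasetItem
from pixano.data import ItemFeature  # assumed import path

item = DatasetItem(
    id="item_001",  # hypothetical item ID
    split="train",
    features={
        # hypothetical feature; the dtype spelling is an assumption
        "category": ItemFeature(name="category", dtype="str", value="cat"),
    },
)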

delete_objects(ds_tables)

Delete removed objects from dataset item

Parameters:

    Name       Type                               Description      Default
    ----       ----                               -----------      -------
    ds_tables  dict[str, dict[str, LanceTable]]   Dataset tables   required
Source code in pixano/data/dataset/dataset_item.py
def delete_objects(
    self,
    ds_tables: dict[str, dict[str, lancedb.db.LanceTable]],
):
    """Delete remove objects from dataset item

    Args:
        ds_tables (dict[str, dict[str, lancedb.db.LanceTable]]): Dataset tables
    """

    # Get current item objects
    current_obj_tables = {}
    for table_name, table in ds_tables["objects"].items():
        media_scanner = table.to_lance().scanner(filter=f"item_id in ('{self.id}')")
        current_obj_tables[table_name] = media_scanner.to_table().to_pylist()

    # Check if objects have been deleted
    for table_name, current_obj_table in current_obj_tables.items():
        for current_obj in current_obj_table:
            # If the object is no longer part of this item, it has been deleted
            if current_obj["id"] not in self.objects:
                # Remove object from table
                ds_tables["objects"][table_name].delete(
                    f"id in ('{current_obj['id']}')"
                )

                # Clear change history to prevent dataset from becoming too large
                ds_tables["objects"][table_name].to_lance().cleanup_old_versions()

from_pyarrow(pyarrow_item, info, media_dir, media_features=False, model_id=None) staticmethod

Format a PyArrow item as a DatasetItem

Parameters:

    Name            Type                          Description                                       Default
    ----            ----                          -----------                                       -------
    pyarrow_item    dict[str, dict[str, Table]]   PyArrow item                                      required
    info            DatasetInfo                   Dataset info                                      required
    media_dir       Path                          Dataset media directory                           required
    media_features  bool                          Load media features like image width and height   False
                                                  (slow for large item batches)
    model_id        str                           Model ID (ONNX file path) of embeddings to load   None

Returns:

    Type          Description
    ----          -----------
    DatasetItem   Formatted item

Source code in pixano/data/dataset/dataset_item.py
@staticmethod
def from_pyarrow(
    pyarrow_item: dict[str, dict[str, pa.Table]],
    info: DatasetInfo,
    media_dir: Path,
    media_features: bool = False,
    model_id: str = None,
) -> "DatasetItem":
    """Format PyArrow item

    Args:
        pyarrow_item (dict[str, dict[str, pa.Table]]): PyArrow item
        info (DatasetInfo): Dataset info
        media_dir (Path): Dataset media directory
        media_features (bool, optional): Load media features like image width and height (slow for large item batches). Defaults to False.
        model_id (str, optional): Model ID (ONNX file path) of embeddings to load. Defaults to None.

    Returns:
        DatasetItem: Formatted item
    """

    item_info = pyarrow_item["main"]["db"].to_pylist()[0]

    # Create item
    item = DatasetItem(
        id=item_info["id"],
        split=item_info["split"],
    )

    for group_name, table_group in info.tables.items():
        # Main table
        if group_name == "main":
            # Item features
            item.features = ItemFeature.from_pyarrow(
                pyarrow_item["main"]["db"],
                Fields(table_group[0].fields).to_schema(),
            )

        # Media tables
        if group_name == "media" and "media" in pyarrow_item:
            item.views = {}
            for table in table_group:
                item.views = item.views | ItemView.from_pyarrow(
                    pyarrow_item["media"][table.name],
                    Fields(table.fields).to_schema(),
                    media_dir,
                    media_features,
                )

        # Objects
        if group_name == "objects" and "objects" in pyarrow_item:
            item.objects = {}
            for table in table_group:
                item.objects = item.objects | ItemObject.from_pyarrow(
                    pyarrow_item["objects"][table.name],
                    Fields(table.fields).to_schema(),
                    table.source,
                )

        # Active Learning
        if group_name == "active_learning" and "active_learning" in pyarrow_item:
            for table in table_group:
                al_features = ItemFeature.from_pyarrow(
                    pyarrow_item["active_learning"][table.name],
                    Fields(table.fields).to_schema(),
                )
                item.features = item.features | al_features

        # Segmentation embeddings
        if group_name == "embeddings" and "embeddings" in pyarrow_item:
            item.embeddings = {}
            for table in table_group:
                # Guard against model_id=None before matching the table source
                if model_id and table.source.lower() in model_id.lower():
                    item.embeddings = item.embeddings | ItemEmbedding.from_pyarrow(
                        pyarrow_item["embeddings"][table.name],
                        Fields(table.fields).to_schema(),
                    )

    return item
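Example usage (illustrative; the variable names are placeholders). pyarrow_item maps table group names to per-table pa.Table objects, mirroring the structure read above:

from pathlib import Path

item = DatasetItem.from_pyarrow(
    pyarrow_item,  # e.g. {"main": {"db": pa.Table}, "objects": {...}, ...}
    info=dataset_info,  # DatasetInfo describing the dataset's table groups
    media_dir=Path("media"),
    media_features=True,  # also read image width/height (slower for large batches)
    model_id="sam_vit_b.onnx",  # hypothetical ONNX file name matched against table.source
)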

to_pyarrow()

Return DatasetItem in PyArrow format

Returns:

    Type             Description
    ----             -----------
    dict[str, Any]   Item in PyArrow format

Source code in pixano/data/dataset/dataset_item.py
def to_pyarrow(self) -> dict[str, Any]:
    """Return DatasetItem in PyArrow format

    Returns:
        dict[str, Any]: Item in PyArrow format
    """

    pyarrow_item = {}

    # ID
    pyarrow_item["id"] = self.id
    pyarrow_item["split"] = self.split

    # Features
    if self.features is not None:
        # Add features
        for feat in self.features.values():
            pyarrow_item[feat.name] = (
                field_to_python(feat.dtype)(feat.value)
                if feat.value is not None
                else None
            )

        # Check feature types
        for feat in self.features.values():
            if pyarrow_item[feat.name] is not None and not isinstance(
                pyarrow_item[feat.name], field_to_python(feat.dtype)
            ):
                raise ValueError(
                    f"Feature {feat.name} of object {self.id} is of type {type(self.features[feat.name].value)} instead of type {field_to_python(feat.dtype)}"
                )

    return pyarrow_item
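Example usage, mirroring what update() does below: the returned dict can be wrapped into a single-row pa.Table against the destination table's schema.

import pyarrow as pa

pyarrow_item = item.to_pyarrow()
table_item = pa.Table.from_pylist([pyarrow_item], schema=ds_table.schema)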

update(ds_table)

Update dataset item

Parameters:

    Name      Type         Description   Default
    ----      ----         -----------   -------
    ds_table  LanceTable   Item table    required
Source code in pixano/data/dataset/dataset_item.py
def update(
    self,
    ds_table: lancedb.db.LanceTable,
):
    """Update dataset item

    Args:
        ds_table (lancedb.db.LanceTable): Item table
    """

    # Convert item to PyArrow
    pyarrow_item = self.to_pyarrow()
    table_item = pa.Table.from_pylist(
        [pyarrow_item],
        schema=ds_table.schema,
    )

    # Update item
    ds_table.delete(f"id in ('{self.id}')")
    ds_table.add(table_item, mode="append")

    # Clear change history to prevent dataset from becoming too large
    ds_table.to_lance().cleanup_old_versions()
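Example usage (a hedged sketch; the feature name and table variable are hypothetical): edit the in-memory item, then rewrite its row in the table.

item.features["category"].value = "dog"  # hypothetical feature edit
item.update(main_table)  # main_table: the item's lancedb.db.LanceTable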