`pixano.datasets.dataset_schema`

`DatasetItem(created_at=None, updated_at=None, **data)`

Bases: BaseModel

Dataset Item.

It is a Pydantic model that represents an item in a dataset.

Attributes:

Name	Type	Description
`id`	`str`	The unique identifier of the item.
`split`	`str`	The split of the item.
`created_at`	`datetime`	The creation date of the item.
`updated_at`	`datetime`	The last modification date of the item.

Raises ValidationError if the input data cannot be validated to form a valid model.

self is explicitly positional-only to allow self as a field name.

Parameters:

Name	Type	Description	Default
`created_at`	`datetime \| None`	The creation date of the object.	`None`
`updated_at`	`datetime \| None`	The last modification date of the object.	`None`
`data`	`Any`	The data of the object validated by Pydantic.	`{}`

Source code in pixano/datasets/dataset_schema.py

def __init__(self, /, created_at: datetime | None = None, updated_at: datetime | None = None, **data: Any) -> None:
    """Build a dataset item after normalizing its two timestamps.

    Both timestamps are passed through `validate_and_init_create_at_and_update_at`; the pair
    it returns is stored in `data` under `"created_at"` and `"updated_at"` before the parent
    initializer is invoked with the merged keyword arguments.

    Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be
    validated to form a valid model.

    `self` is explicitly positional-only to allow `self` as a field name.

    Args:
        created_at: The creation date of the object.
        updated_at: The last modification date of the object.
        data: The data of the object validated by Pydantic.
    """
    timestamps = validate_and_init_create_at_and_update_at(created_at, updated_at)
    # The helper returns a (created_at, updated_at) pair; unpack it straight into the payload.
    data["created_at"], data["updated_at"] = timestamps
    super().__init__(**data)

`from_dataset_schema(dataset_schema, exclude_embeddings=True)` `staticmethod`

Create a dataset item model based on the schema.

Parameters:

Name	Type	Description	Default
`dataset_schema`	`DatasetSchema`	The dataset schema.	required
`exclude_embeddings`	`bool`	Exclude embeddings from the dataset item model to reduce the size.	`True`

Returns:

Type	Description
`type[DatasetItem]`	The dataset item model

Source code in pixano/datasets/dataset_schema.py

@staticmethod
def from_dataset_schema(dataset_schema: DatasetSchema, exclude_embeddings: bool = True) -> type["DatasetItem"]:
    """Derive a `DatasetItem` model class from a dataset schema.

    Every table related to the item table becomes an optional field of the generated model,
    and every field of the item schema becomes a required field.

    Args:
        dataset_schema: The dataset schema.
        exclude_embeddings: Skip the tables of the embedding group so the generated model
            stays small.

    Returns:
        The dataset item model.
    """
    item_key = SchemaGroup.ITEM.value
    item_model = dataset_schema.schemas[item_key]
    field_definitions: dict[str, Any] = {}

    # A missing "item" entry simply yields no related tables.
    for table_name, relation in dataset_schema.relations.get(item_key, {}).items():
        if exclude_embeddings and table_name in dataset_schema.groups[SchemaGroup.EMBEDDING]:
            continue
        # Related tables get a default because an item may lack a given view or entity.
        table_type = dataset_schema.schemas[table_name]
        if relation == SchemaRelation.ONE_TO_MANY:
            field_definitions[table_name] = (list[table_type], [])  # type: ignore[valid-type]
        else:
            field_definitions[table_name] = (table_type | None, None)

    # Item metadata is always expected to be retrieved, hence no default value.
    field_definitions.update((name, (info.annotation, ...)) for name, info in item_model.model_fields.items())

    return create_model("DatasetItem", **field_definitions, __base__=DatasetItem)

`from_schemas_data(schemas_data)` `staticmethod`

Create a DatasetItem from schemas data.

Parameters:

Name	Type	Description	Default
`cls`	`DatasetItem`	The DatasetItem class.	required
`schemas_data`	`dict[str, BaseSchema \| list[BaseSchema] \| None]`	Schemas data.	required

Returns:

Type	Description
`DatasetItem`	The created DatasetItem.

Source code in pixano/datasets/dataset_schema.py

@staticmethod
def from_schemas_data(
    cls: "DatasetItem", schemas_data: dict[str, BaseSchema | list[BaseSchema] | None]
) -> "DatasetItem":
    """Create a DatasetItem from schemas data.

    The entry stored under the item key is dumped with `model_dump()` and its fields are
    merged with the remaining entries; item fields win on a name clash. The mapping passed
    by the caller is left untouched.

    Args:
        cls: The DatasetItem class.
        schemas_data: Schemas data. Must contain the item schema entry.

    Returns:
        The created DatasetItem.

    Raises:
        ValueError: If the item schema entry is missing from `schemas_data`.
    """
    item_key = SchemaGroup.ITEM.value
    if item_key not in schemas_data:
        raise ValueError("Item schema data not found.")

    item_fields = schemas_data[item_key].model_dump()  # type: ignore[union-attr]
    # Build a fresh dict instead of popping from / updating the caller's mapping.
    init_kwargs = {name: value for name, value in schemas_data.items() if name != item_key}
    init_kwargs.update(item_fields)
    return cls(**init_kwargs)

`get_sub_dataset_item(selected_fields)` `classmethod`

Create a new dataset item based on the selected fields of the original dataset item.

Note

The id and split fields are always included in the sub dataset item.

Note

The sub dataset item does not have the methods and config of the original dataset item.

Parameters:

Name	Type	Description	Default
`selected_fields`	`list[str]`	The selected fields.	required

Returns:

Type	Description
`type[Self]`	The sub dataset item.

Source code in pixano/datasets/dataset_schema.py

@classmethod
def get_sub_dataset_item(cls, selected_fields: list[str]) -> type[Self]:
    """Build a reduced dataset item model holding only a subset of this model's fields.

    Each kept field reuses its annotation and default from this class, except that a
    `tuple[...]` annotation is rebuilt as a variable-length tuple of its first type argument.

    Note:
        The id and split fields are always included in the sub dataset item.

    Note:
        The sub dataset item does not have the methods and config of the original
        dataset item.

    Args:
        selected_fields: The selected fields.

    Returns:
        The sub dataset item.
    """
    always_kept = ["id", "split"]
    kept_fields = {}
    for name, info in cls.model_fields.items():
        if name not in selected_fields and name not in always_kept:
            continue

        annotation = info.annotation
        if isinstance(annotation, GenericAlias) and annotation.__origin__ is tuple:
            origin = annotation.__origin__
            annotation = origin[annotation.__args__[0], ...]  # type: ignore[index]
        kept_fields[name] = (annotation, info.default)

    SubDatasetItem: type[DatasetItem] = create_model(cls.__name__, **kept_fields, __base__=DatasetItem)
    return SubDatasetItem

`model_copy(*, dataset, deep=False)`

Returns a copy of the model.

Parameters:

Name	Type	Description	Default
`dataset`	`Dataset`	The dataset where the DatasetItem belongs.	required
`deep`		Set to `True` to make a deep copy of the model.	`False`

Returns:

Type	Description
`Self`	New model instance.

Source code in pixano/datasets/dataset_schema.py

def model_copy(self, *, dataset: "Dataset", deep=False) -> Self:
    """Returns a copy of the model.

    Args:
        dataset: The dataset where the DatasetItem belongs.
        deep: Set to `True` to make a deep copy of the model.

    Returns:
        New model instance.
    """
    # Actual copy done by each schema to call our own model_copy method
    data: dict[str, BaseSchema | list[BaseSchema] | None] = self.to_schemas_data(dataset.schema)
    copied_data: dict[str, BaseSchema | list[BaseSchema] | None] = {}
    for key, value in data.items():
        if isinstance(value, list):
            copied_data[key] = [item.model_copy(deep=deep) for item in value]
        elif value is not None:
            copied_data[key] = value.model_copy(deep=deep)
        else:
            copied_data[key] = None
    copy_item = self.from_schemas_data(self.__class__, copied_data)  # type: ignore[arg-type]
    return copy_item

`model_dump(exclude_timestamps=False, **kwargs)`

Dump the model to a dictionary.

Parameters:

Name	Type	Description	Default
`exclude_timestamps`	`bool`	Exclude timestamps "created_at" and "updated_at" from the model dump. Useful for comparing models without timestamps.	`False`
`kwargs`	`Any`	Arguments for pydantic `BaseModel.model_dump()`.	`{}`

Returns:

Type	Description
`dict[str, Any]`	The model dump.

Source code in pixano/datasets/dataset_schema.py

def model_dump(self, exclude_timestamps: bool = False, **kwargs: Any) -> dict[str, Any]:
    """Dump the model to a dictionary, optionally without timestamps.

    Args:
        exclude_timestamps: Remove the "created_at" and "updated_at" keys from the dump, from
            every dict value of the dump and from every dict found directly inside a list
            value. Deeper nesting is left as is. Useful for comparing models without
            timestamps.
        kwargs: Arguments for pydantic `BaseModel.model_dump()`.

    Returns:
        The model dump.
    """
    dumped = super().model_dump(**kwargs)
    if not exclude_timestamps:
        return dumped

    def _drop_timestamps(mapping: dict[str, Any]) -> None:
        mapping.pop("created_at", None)
        mapping.pop("updated_at", None)

    _drop_timestamps(dumped)
    for entry in dumped.values():
        if isinstance(entry, dict):
            _drop_timestamps(entry)
        elif isinstance(entry, list):  # Only one level deep.
            for element in entry:
                if isinstance(element, dict):
                    _drop_timestamps(element)
    return dumped

`to_dataset_schema()` `classmethod`

Convert a DatasetItem to a DatasetSchema.

Source code in pixano/datasets/dataset_schema.py

@classmethod
def to_dataset_schema(cls) -> DatasetSchema:
    """Build the DatasetSchema corresponding to this DatasetItem class.

    Returns:
        The result of `DatasetSchema.from_dataset_item` applied to this class.
    """
    derived_schema = DatasetSchema.from_dataset_item(cls)
    return derived_schema

`to_schemas_data(dataset_schema)`

Convert DatasetItem to schemas data.

Parameters:

Name	Type	Description	Default
`dataset_schema`	`DatasetSchema`	DatasetSchema to convert to.	required

Returns:

Type	Description
`dict[str, BaseSchema \| list[BaseSchema] \| None]`	Schemas data.

Source code in pixano/datasets/dataset_schema.py

def to_schemas_data(self, dataset_schema: DatasetSchema) -> dict[str, BaseSchema | list[BaseSchema] | None]:
    """Convert DatasetItem to schemas data.

    Fields whose name is a table of `dataset_schema` are returned as is under that name.
    All remaining fields are gathered and used to instantiate the item schema, which is
    stored under the item key.

    Args:
        dataset_schema: DatasetSchema to convert to.

    Returns:
        Schemas data.
    """
    item_key = SchemaGroup.ITEM.value
    schemas_data = {}
    item_data = {}
    # Read the field definitions from the class: accessing `model_fields` on an instance is
    # deprecated since Pydantic 2.11.
    for field_name in type(self).model_fields:
        if field_name in dataset_schema.schemas:
            schemas_data[field_name] = getattr(self, field_name)
        else:
            item_data[field_name] = getattr(self, field_name)
    schemas_data[item_key] = dataset_schema.schemas[item_key](**item_data)
    return schemas_data

`DatasetSchema(**data)`

Bases: BaseModel

A dataset schema that defines the tables and the relations between them.

Attributes:

Name	Type	Description
`schemas`	`dict[str, type[BaseSchema]]`	The mapping between the table names and their schema.
`relations`	`dict[str, dict[str, SchemaRelation]]`	The relations between the item table and the other tables.
`groups`	`dict[SchemaGroup, set[str]]`	The groups of tables. It is filled automatically based on the schemas.

Source code in pydantic/main.py

def __init__(self, /, **data: Any) -> None:
    """Create a new model by parsing and validating input data from keyword arguments.

    Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be
    validated to form a valid model.

    `self` is explicitly positional-only to allow `self` as a field name.

    Args:
        data: The field values, validated in place against this instance.
    """
    # `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks
    __tracebackhide__ = True
    # Validation populates `self` directly (it is passed as `self_instance`) and returns the
    # resulting object.
    validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)
    # A different object coming back means a validator replaced the instance; `__init__`
    # cannot hand that object to the caller, so only a warning is emitted.
    if self is not validated_self:
        warnings.warn(
            'A custom validator is returning a value other than `self`.\n'
            "Returning anything other than `self` from a top level model validator isn't supported when validating via `__init__`.\n"
            'See the `model_validator` docs (https://docs.pydantic.dev/latest/concepts/validators/#model-validators) for more details.',
            # Attribute the warning to the code that called `__init__`, not to this function.
            stacklevel=2,
        )

`add_schema(table_name, schema, relation_item)`

Add a schema to the dataset schema.

Parameters:

Name	Type	Description	Default
`table_name`	`str`	Name of the table to add to the dataset schema.	required
`schema`	`type[BaseSchema]`	Schema of the table.	required
`relation_item`	`SchemaRelation`	Relationship with the item schema.	required

Returns:

Type	Description
`Self`	The dataset schema.

Source code in pixano/datasets/dataset_schema.py

def add_schema(self, table_name: str, schema: type[BaseSchema], relation_item: SchemaRelation) -> Self:
    """Register a new table in the dataset schema.

    The table name is normalized with `format_table_name`, the table is added to the first
    group whose base type `schema` derives from, and the relation is recorded in both
    directions: `relation_item` from the table to the item table, and its inverse from the
    item table to the table.

    Args:
        table_name: Name of the table to add to the dataset schema.
        schema: Schema of the table.
        relation_item: Relationship with the item schema.

    Returns:
        The dataset schema.

    Raises:
        ValueError: If the table already exists, if `schema` is not a subclass of
            `BaseSchema`, if `relation_item` is not a `SchemaRelation`, or if `schema` matches
            no known group.
    """
    table_name = self.format_table_name(table_name)

    if table_name in self.schemas:
        raise ValueError(f"Table {table_name} already exists in the schemas.")
    if not issubclass(schema, BaseSchema):
        raise ValueError(f"Schema {schema} should be a subclass of BaseSchema.")
    if not isinstance(relation_item, SchemaRelation):
        raise ValueError(f"Invalid relation {relation_item}.")

    for group, group_type in _SCHEMA_GROUP_TO_SCHEMA_DICT.items():
        if issubclass(schema, group_type):
            self.groups[group].add(table_name)
            break
    else:
        # No group accepted the schema type.
        raise ValueError(f"Invalid table type {schema}")

    self.schemas[table_name] = schema

    # Relation seen from the item table, keyed by the relation seen from the new table.
    item_side_relation = {
        SchemaRelation.ONE_TO_ONE: SchemaRelation.ONE_TO_ONE,
        SchemaRelation.ONE_TO_MANY: SchemaRelation.MANY_TO_ONE,
        SchemaRelation.MANY_TO_ONE: SchemaRelation.ONE_TO_MANY,
        SchemaRelation.MANY_TO_MANY: SchemaRelation.MANY_TO_MANY,
    }.get(relation_item)
    if item_side_relation is not None:
        item_key = SchemaGroup.ITEM.value
        self.relations[item_key][table_name] = item_side_relation
        self.relations[table_name] = {item_key: relation_item}
    return self

`deserialize(dataset_schema_json)` `staticmethod`

Deserialize the dataset schema.

Parameters:

Name	Type	Description	Default
`dataset_schema_json`	`dict[str, dict[str, Any]]`	Serialized dataset schema.	required

Returns:

Type	Description
`DatasetSchema`	The dataset schema.

Source code in pixano/datasets/dataset_schema.py

@staticmethod
def deserialize(dataset_schema_json: dict[str, dict[str, Any]]) -> "DatasetSchema":
    """Rebuild a dataset schema from its serialized form.

    Relation values are converted to `SchemaRelation`, group names to `SchemaGroup` (with
    their tables collected in a set), and each table schema is restored with
    `BaseSchema.deserialize`.

    Args:
        dataset_schema_json: Serialized dataset schema with the "relations", "schemas" and
            "groups" keys.

    Returns:
        The dataset schema.
    """
    relations = {}
    for source_table, targets in dataset_schema_json["relations"].items():
        relations[source_table] = {target: SchemaRelation(kind) for target, kind in targets.items()}

    groups = {SchemaGroup(name): set(tables) for name, tables in dataset_schema_json["groups"].items()}

    schemas = {
        table_name: BaseSchema.deserialize(serialized)
        for table_name, serialized in dataset_schema_json["schemas"].items()
    }

    return DatasetSchema(relations=relations, schemas=schemas, groups=groups)

`format_table_name(table_name)` `staticmethod`

Format table name.

It converts the table name to lowercase and replaces spaces with underscores.

Parameters:

Name	Type	Description	Default
`table_name`	`str`	Table name.	required

Returns:

Type	Description
`str`	the formatted table name.

Source code in pixano/datasets/dataset_schema.py

@staticmethod
def format_table_name(table_name: str) -> str:
    """Format table name.

    It converts the table name to lowercase and replaces spaces with underscores.

    Args:
        table_name: Table name.

    Returns:
        the formatted table name.
    """
    return table_name.lower().replace(" ", "_")

`from_dataset_item(dataset_item)` `staticmethod`

Create a dataset schema from a DatasetItem.

Parameters:

Name	Type	Description	Default
`dataset_item`	`type[DatasetItem]`	The dataset item.	required

Returns:

Type	Description
`DatasetSchema`	The dataset schema.

Source code in pixano/datasets/dataset_schema.py

@staticmethod
def from_dataset_item(dataset_item: type["DatasetItem"]) -> "DatasetSchema":
    """Create a dataset schema from a [DatasetItem][pixano.datasets.DatasetItem].

    Each field of the dataset item is classified as follows:

    - a list or tuple of a registered schema becomes a table in a one-to-many relation with
      the item table;
    - a registered schema becomes a table in a one-to-one relation with the item table;
    - anything else becomes an attribute of the generated item schema.

    Args:
        dataset_item: The dataset item.

    Returns:
        The dataset schema.
    """
    item_key = SchemaGroup.ITEM.value
    registered_schemas = tuple(_SCHEMA_REGISTRY.values())

    item_attributes = {}
    table_schemas = {}
    relations: dict[str, Any] = {item_key: {}}

    def _link(table_name, table_type, item_to_table, table_to_item) -> None:
        # Record the table together with both directions of its relation to the item table.
        table_schemas[table_name] = table_type
        relations[item_key][table_name] = item_to_table
        relations[table_name] = {item_key: table_to_item}

    for name, info in dataset_item.model_fields.items():
        annotation = info.annotation

        if not isinstance(annotation, GenericAlias):
            if issubclass(annotation, registered_schemas):
                _link(name, annotation, SchemaRelation.ONE_TO_ONE, SchemaRelation.ONE_TO_ONE)
            else:
                # Default case: item attribute
                item_attributes[name] = (annotation, ...)
            continue

        element_type = annotation.__args__[0]
        if annotation.__origin__ not in [list, tuple]:
            # NOTE(review): other generics are reduced to their first type argument; confirm
            # this is intended for aliases such as dict[K, V].
            item_attributes[name] = (element_type, ...)  # type: ignore[valid-type]
        elif issubclass(element_type, registered_schemas):
            _link(name, element_type, SchemaRelation.ONE_TO_MANY, SchemaRelation.MANY_TO_ONE)
        else:
            # Collections of plain values stay on the item, always typed as a list.
            item_attributes[name] = (list[element_type], ...)  # type: ignore[valid-type]

    table_schemas[item_key] = create_model("Item", **item_attributes, __base__=Item)

    return DatasetSchema(relations=relations, schemas=table_schemas)

`from_json(json_fp)` `staticmethod`

Read a dataset schema from JSON file.

Parameters:

Name	Type	Description	Default
`json_fp`	`Path`	JSON file path	required

Returns:

Type	Description
`DatasetSchema`	The dataset schema.

Source code in pixano/datasets/dataset_schema.py

@staticmethod
def from_json(
    json_fp: Path,
) -> "DatasetSchema":
    """Load a dataset schema from a JSON file.

    The file is read as UTF-8 text, parsed as JSON and handed to `DatasetSchema.deserialize`.

    Args:
        json_fp: JSON file path.

    Returns:
        The dataset schema.
    """
    raw_text = json_fp.read_text(encoding="utf-8")
    return DatasetSchema.deserialize(json.loads(raw_text))

`get_table_group(table_name)`

Get the group of a table.

Parameters:

Name	Type	Description	Default
`table_name`	`str`	Table name.	required

Returns:

Type	Description
`SchemaGroup`	The group of the table.

Source code in pixano/datasets/dataset_schema.py

def get_table_group(self, table_name: str) -> SchemaGroup:
    """Look up the group a table belongs to.

    Args:
        table_name: Table name.

    Returns:
        The first group whose tables contain `table_name`.

    Raises:
        ValueError: If no group contains the table.
    """
    owning_group = next((group for group, members in self.groups.items() if table_name in members), None)
    if owning_group is None:
        raise ValueError(f"Table {table_name} not found in groups.")
    return owning_group

`serialize()`

Serialize the dataset schema.

The serialized schema is a dictionary with the following format:

    {
        "relations": {
            "item": {
                "image": "one_to_one",
            }
        },
        "schemas": {
            "table1": {
                "schema": "CustomItem",
                "base_schema": "Item",
                "fields": {
                    "id": {
                        "type": "str",
                        "collection": False
                    },
                    "split": {
                        "type": "str",
                        "collection": False
                    },
                    ...
                }
            }
        }
    }

Returns:

Type	Description
`dict[str, dict[str, Any]]`	The serialized dataset schema.

Source code in pixano/datasets/dataset_schema.py

@model_serializer
def serialize(self) -> dict[str, dict[str, Any]]:
    """Serialize the dataset schema.

    Relations are stored by their enum value, groups by their enum value with their tables
    as a list, and each table schema through its own `serialize()`.

    The serialized schema is a dictionary with the following format:
    {
        "relations": {
            "item": {
                "image": "one_to_one",
            }
        },
        "schemas": {
            "table1": {
                "schema": "CustomItem",
                "base_schema": "Item",
                "fields": {
                    "id": {
                        "type": "str",
                        "collection": False
                    },
                    "split": {
                        "type": "str",
                        "collection": False
                    },
                    ...
                }
            }
        },
        "groups": {
            ...
        }
    }

    Returns:
        The serialized dataset schema.
    """
    relations = {}
    for source_table, targets in self.relations.items():
        relations[source_table] = {target: kind.value for target, kind in targets.items()}

    groups = {group.value: list(tables) for group, tables in self.groups.items()}

    schemas = {table_name: table_schema.serialize() for table_name, table_schema in self.schemas.items()}

    return {"relations": relations, "schemas": schemas, "groups": groups}

`to_json(json_fp)`

Save DatasetSchema to json file.

Source code in pixano/datasets/dataset_schema.py

def to_json(self, json_fp: Path) -> None:
    """Write the serialized DatasetSchema to a JSON file.

    When the file already exists, the "schema" value of every table present in both the old
    file and the new content is carried over from the old file.

    Args:
        json_fp: Path of the JSON file to write.
    """
    previous = json.loads(json_fp.read_text(encoding="utf-8")) if json_fp.exists() else None
    content = self.serialize()

    if previous is not None:
        # Keep the schema field from the old json content for custom schemas.
        for table_name, table_spec in content["schemas"].items():
            if table_name in previous["schemas"]:
                table_spec["schema"] = previous["schemas"][table_name]["schema"]

    json_fp.write_text(json.dumps(content, indent=4), encoding="utf-8")

`SchemaRelation`

Bases: Enum

Relation between tables.

Attributes:

Name	Type	Description
`ONE_TO_MANY`		One to many relation.
`MANY_TO_ONE`		Many to one relation.
`ONE_TO_ONE`		One to one relation.
`MANY_TO_MANY`		Many to many relation.

pixano.datasets.dataset_schema

DatasetItem(created_at=None, updated_at=None, **data)

from_dataset_schema(dataset_schema, exclude_embeddings=True) staticmethod

from_schemas_data(schemas_data) staticmethod

get_sub_dataset_item(selected_fields) classmethod

model_copy(*, dataset, deep=False)

model_dump(exclude_timestamps=False, **kwargs)

to_dataset_schema() classmethod

to_schemas_data(dataset_schema)

DatasetSchema(**data)

add_schema(table_name, schema, relation_item)

deserialize(dataset_schema_json) staticmethod

format_table_name(table_name) staticmethod

from_dataset_item(dataset_item) staticmethod

from_json(json_fp) staticmethod

get_table_group(table_name)

serialize()

to_json(json_fp)

SchemaRelation

`pixano.datasets.dataset_schema`

`DatasetItem(created_at=None, updated_at=None, **data)`

`from_dataset_schema(dataset_schema, exclude_embeddings=True)` `staticmethod`

`from_schemas_data(schemas_data)` `staticmethod`

`get_sub_dataset_item(selected_fields)` `classmethod`

`model_copy(*, dataset, deep=False)`

`model_dump(exclude_timestamps=False, **kwargs)`

`to_dataset_schema()` `classmethod`

`to_schemas_data(dataset_schema)`

`DatasetSchema(**data)`

`add_schema(table_name, schema, relation_item)`

`deserialize(dataset_schema_json)` `staticmethod`

`format_table_name(table_name)` `staticmethod`

`from_dataset_item(dataset_item)` `staticmethod`

`from_json(json_fp)` `staticmethod`

`get_table_group(table_name)`

`serialize()`

`to_json(json_fp)`

`SchemaRelation`