Skip to content

pixano.datasets.builders.folders.base

FolderBaseBuilder(source_dir, target_dir, dataset_item, info, url_prefix=None)

Bases: DatasetBuilder

This is a class for building datasets based on a folder structure.

The folder structure should be as follows
  • source_dir/{split}/{item}.{ext}
  • source_dir/{split}/metadata.jsonl

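The metadata file should be a jsonl file with the following format (each entry is matched to its file through the key named after the view schema, whose value must equal the file name including its extension, and entity data is read from the key named after the entity schema):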

    [
        {
            "item": "item1",
            "metadata1": "value1",
            "metadata2": "value2",
            ...
            "entities": {
                "attr1": [val1, val2, ...],
                "attr2": [val1, val2, ...],
                ...
            }
        },
        {
            "item": "item2",
            "metadata1": "value1",
            "metadata2": "value2",
            ...
            "entities": {
                "attr1": [val1, val2, ...],
                "attr2": [val1, val2, ...],
                ...
            }
        },
        ...
    ]

Note

Only one view and one entity are supported in folder builders.

Attributes:

Name Type Description
source_dir

The source directory for the dataset.

view_name

The name of the view schema.

view_schema type[View]

The schema of the view.

entity_name

The name of the entities schema.

entity_schema type[Entity]

The schema of the entities.

METADATA_FILENAME str

The metadata filename.

EXTENSIONS list[str]

The list of supported extensions.

Parameters:

Name Type Description Default
source_dir Path | str

The source directory for the dataset.

required
target_dir Path | str

The target directory for the dataset.

required
dataset_item type[DatasetItem]

The dataset item schema.

required
info DatasetInfo

User information (name, description, ...) for the dataset.

required
url_prefix Path | str | None

The path to build relative URLs for the views. Useful to build dataset libraries to pass the relative path from the media directory.

None
Source code in pixano/datasets/builders/folders/base.py
def __init__(
    self,
    source_dir: Path | str,
    target_dir: Path | str,
    dataset_item: type[DatasetItem],
    info: DatasetInfo,
    url_prefix: Path | str | None = None,
) -> None:
    """Initialize the `FolderBaseBuilder`.

    The schemas exposed by the parent builder are inspected to locate the single view schema and the single
    entity schema that folder based builders work with.

    Args:
        source_dir: The source directory for the dataset.
        target_dir: The target directory for the dataset.
        dataset_item: The dataset item schema.
        info: User information (name, description, ...) for the dataset.
        url_prefix: The path to build relative URLs for the views. Useful to build dataset libraries to pass the
            relative path from the media directory. Defaults to the current directory (`Path(".")`).

    Raises:
        ValueError: If more than one view schema or more than one entity schema is found, or if either of them
            is missing.
    """
    super().__init__(target_dir=target_dir, dataset_item=dataset_item, info=info)
    self.source_dir = Path(source_dir)
    self.url_prefix = Path(".") if url_prefix is None else Path(url_prefix)

    # Walk the schemas once, remembering the unique view and the unique entity schema.
    found_view_name = None
    found_entity_name = None
    for schema_name, schema_cls in self.schemas.items():
        if issubclass(schema_cls, View):
            if found_view_name is not None:
                raise ValueError("Only one view schema is supported in folder based builders.")
            found_view_name, found_view_schema = schema_name, schema_cls
        if issubclass(schema_cls, Entity):
            if found_entity_name is not None:
                raise ValueError("Only one entity schema is supported in folder based builders.")
            found_entity_name, found_entity_schema = schema_name, schema_cls

    # Both kinds of schema are mandatory; the schema classes are only bound when their name is.
    if found_view_name is None or found_entity_name is None:
        raise ValueError("View and entity schemas must be defined in the schemas argument.")

    self.view_name = found_view_name
    self.view_schema: type[View] = found_view_schema
    self.entity_name = found_entity_name
    self.entity_schema: type[Entity] = found_entity_schema

generate_data()

Generate data from the source directory.

Returns:

Type Description
Iterator[dict[str, BaseSchema | list[BaseSchema]]]

An iterator over the data following the dataset schemas.

Source code in pixano/datasets/builders/folders/base.py
def generate_data(
    self,
) -> Iterator[dict[str, BaseSchema | list[BaseSchema]]]:
    """Generate data from the source directory.

    Each non-hidden sub-directory of ``source_dir`` is handled as a split. For every file of a split whose
    suffix is in ``EXTENSIONS``, the metadata entry whose ``view_name`` value equals the file name is used to
    create the item and its view. When that entry also holds data under ``entity_name``, entities and their
    annotations are created as well and yielded alongside the item and the view.

    Returns:
        An iterator over the data following the dataset schemas.

    Raises:
        KeyError: If a metadata entry has no ``view_name`` key.
        ValueError: If no metadata entry matches a view file.
    """
    source_id = None
    for split in self.source_dir.glob("*"):
        if not split.is_dir() or split.name.startswith("."):
            continue

        metadata = self._read_metadata(split / self.METADATA_FILENAME)

        # Index the entries by view file name once per split so that each file is resolved with a single
        # lookup instead of a scan over the whole metadata list. `setdefault` keeps the first entry when a
        # name is duplicated; non-string values can never equal a file name and are left out.
        metadata_by_view = {}
        for entry in metadata:
            view_key = entry[self.view_name]
            if isinstance(view_key, str):
                metadata_by_view.setdefault(view_key, entry)

        for view_file in split.glob("*"):
            # only consider {split}/{item}.{ext} files
            if not view_file.is_file() or view_file.suffix not in self.EXTENSIONS:
                continue

            # retrieve item metadata in metadata file
            item_metadata = metadata_by_view.get(view_file.name)
            if item_metadata is None:
                raise ValueError(f"Metadata not found for {view_file}")

            # extract entity metadata from item metadata (removed so it is not passed to the item)
            entities_data = item_metadata.pop(self.entity_name, None)

            # create item
            item = self._create_item(split.name, **item_metadata)

            # create view
            view = self._create_view(item, view_file, self.view_schema)

            if entities_data is None:
                yield {
                    self.item_schema_name: item,
                    self.view_name: view,
                }
                continue

            # register the source lazily, the first time entities are encountered
            if source_id is None:
                source_id = self.add_source("Builder", SourceKind.OTHER)

            # create entities and their annotations
            entities, annotations = self._create_entities(item, view, entities_data, source_id)

            yield {
                self.item_schema_name: item,
                self.view_name: view,
                self.entity_name: entities,
                **annotations,
            }