pixano.datasets.builders.folders.base

FolderBaseBuilder(media_dir, library_dir, info, dataset_path, dataset_item=None, use_image_name_as_id=False)

Bases: DatasetBuilder

This is a class for building datasets based on a folder structure.

The folder structure should be as follows:
  • source_dir/{split}/{item}.{ext}
  • source_dir/{split}/metadata.jsonl

The metadata file should be a jsonl file whose records have the following format (shown here pretty-printed as a list for readability; in the file itself, each record sits on its own line):

    [
        {
            "item": "item1",
            "metadata1": "value1",
            "metadata2": "value2",
            ...
            "entities": {
                "attr1": [val1, val2, ...],
                "attr2": [val1, val2, ...],
                ...
            }
        },
        {
            "item": "item2",
            "metadata1": "value1",
            "metadata2": "value2",
            ...
            "entities": {
                "attr1": [val1, val2, ...],
                "attr2": [val1, val2, ...],
                ...
            }
        },
        ...
    ]
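As a concrete sketch, the snippet below writes such a metadata file for one split. The key names (image, location, objects) and the attribute name (category) are illustrative assumptions, not names required by Pixano: they must match the view, item-metadata and entity fields declared in your DatasetItem schema (see the construction sketch further down). The interpretation of the keys follows generate_data below: keys matching the view schema name hold media file path(s), keys matching an entity schema name hold per-entity attribute lists, and remaining keys become item metadata.

```python
import json
from pathlib import Path

# Hypothetical layout: media_dir="media", dataset_path="my_dataset", split="train".
split_dir = Path("media/my_dataset/train")
split_dir.mkdir(parents=True, exist_ok=True)

# One record per item. "image" (view), "objects" (entity) and "location"
# (item metadata) are illustrative names, not Pixano requirements.
records = [
    {
        "image": "train/image_000.jpg",
        "location": "lab",
        "objects": {"category": ["cat", "dog"]},
    },
    {
        "image": "train/image_001.jpg",
        "location": "garden",
        "objects": {"category": ["bird"]},
    },
]

# Write one JSON record per line, following the usual JSON Lines convention.
with (split_dir / "metadata.jsonl").open("w", encoding="utf-8") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")
```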

Note

Only one view is supported in folder builders. If a list of images is given for the view, they are combined into a mosaic.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| source_dir | | The source directory for the dataset. |
| view_name | | The name of the view schema. |
| view_schema | | The schema of the view. |
| entity_name | | The name of the entities schema. |
| entity_schema | | The schema of the entities. |
| METADATA_FILENAME | str | The metadata filename. |
| EXTENSIONS | list[str] | The list of supported extensions. |
| WORKSPACE_TYPE | | The workspace type of the dataset. Subclasses should override this attribute if the workspace is known. |

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| media_dir | Path \| str | The global media directory. | required |
| library_dir | Path \| str | The global directory for the Pixano datasets library. | required |
| dataset_item | type[DatasetItem] \| None | The dataset item schema. | None |
| info | DatasetInfo | User information (name, description, ...) for the dataset. | required |
| dataset_path | Path \| str | Path to the dataset, relative to media_dir. | required |
| use_image_name_as_id | bool | If True, use the image base name as the image id. Images MUST have unique base names. When no metadata file exists, the base name is also used as the item id; otherwise 'item_#' is used. This allows image embeddings to be reused after a dataset overwrite. | False |
Source code in pixano/datasets/builders/folders/base.py
def __init__(
    self,
    media_dir: Path | str,
    library_dir: Path | str,
    info: DatasetInfo,
    dataset_path: Path | str,
    dataset_item: type[DatasetItem] | None = None,
    use_image_name_as_id: bool = False,
) -> None:
    """Initialize the `FolderBaseBuilder`.

    Args:
        media_dir: The global media directory.
        library_dir: The global directory for Pixano datasets library.
        dataset_item: The dataset item schema.
        info: User informations (name, description, ...) for the dataset.
        dataset_path: Path to dataset, relative to media_dir.
        use_image_name_as_id: If True, use image base name as image id.
                              Images MUST have unique base names.
                              When no metadata file exists, also use it as item id,
                              else, use 'item_#'
                              This allows to reuse image embeddings after dataset overwrite.
    """
    info.workspace = self.WORKSPACE_TYPE
    if self.DEFAULT_SCHEMA is not None and dataset_item is None:
        dataset_item = self.DEFAULT_SCHEMA
    if dataset_item is None:
        raise ValueError("A schema is required.")

    self.use_image_name_as_id = use_image_name_as_id

    self.media_dir = Path(media_dir)
    dataset_path = Path(dataset_path)
    self.source_dir = self.media_dir / dataset_path
    if not self.source_dir.is_dir():
        raise ValueError("A source path (media_dir / dataset_path) is required.")

    target_dir = Path(library_dir) / "_".join(dataset_path.parts)
    super().__init__(target_dir=target_dir, dataset_item=dataset_item, info=info)

    self.views_schema: dict[str, type[View]] = {}
    self.entities_schema: dict[str, type[Entity]] = {}
    self.annotations_schema: dict[str, type[Annotation]] = {}

    for k, s in self.schemas.items():
        if is_view(s):
            self.views_schema.update({k: s})
        elif is_entity(s):
            self.entities_schema.update({k: s})
        elif is_annotation(s):
            self.annotations_schema.update({k: s})
    if not self.views_schema or not self.entities_schema:
        raise ValueError("At least one View and one Entity schema must be defined in the schemas argument.")

    # TODO - allow multiview in base FolderBuilder
    if len(self.views_schema) > 1:
        raise ValueError("Only one view schema is supported in folder based builders.")

generate_data()

Generate data from the source directory.

Returns:

| Type | Description |
| --- | --- |
| Iterator[dict[str, BaseSchema \| list[BaseSchema]]] | An iterator over the data following the dataset schemas. |

Source code in pixano/datasets/builders/folders/base.py
def generate_data(
    self,
) -> Iterator[dict[str, BaseSchema | list[BaseSchema]]]:
    """Generate data from the source directory.

    Returns:
        An iterator over the data following the dataset schemas.
    """
    self.source_id = self.add_source("Builder", SourceKind.OTHER)
    for split in self.source_dir.glob("*"):
        if not split.is_dir() or split.name.startswith("."):
            continue

        try:
            dataset_pieces = self._read_metadata(split / self.METADATA_FILENAME)
        except FileNotFoundError:
            dataset_pieces = None

        if dataset_pieces is None:
            for view_file in sorted(split.glob("**/*")):
                # only consider {split}/**/{item}.{ext} files
                if not view_file.is_file() or view_file.suffix not in self.EXTENSIONS:
                    continue
                # create item with default values for custom fields
                custom_item_metadata = self._build_default_custom_metadata_item()
                item = self._create_item(split.name, view_file.stem, **custom_item_metadata)
                # create view
                view_name_nojsonl, view_schema_nojsonl = list(self.views_schema.items())[0]  # only one view
                view = self._create_view(item, view_file, view_schema_nojsonl)
                yield {
                    self.item_schema_name: item,
                    view_name_nojsonl: view,
                }
                # if schema contain a Conversation, add one
                for entity_name, entity_schema_nojsonl in self.entities_schema.items():
                    if entity_schema_nojsonl is not None and is_conversation(entity_schema_nojsonl):
                        default_view_ref = ViewRef(id=view.id, name=view_name_nojsonl)
                        conversation = create_conversation(
                            id=shortuuid.uuid(),
                            kind="vqa",
                            item_ref=ItemRef(id=item.id),
                            view_ref=default_view_ref,
                        )
                        yield {"conversations": conversation}

            continue

        for i, dataset_piece in enumerate(dataset_pieces):
            item_metadata = {}
            for k in dataset_piece.keys():
                if (
                    k not in self.views_schema
                    and k not in self.entities_schema
                    and k not in self.annotations_schema
                ):
                    item_metadata.update({k: dataset_piece.get(k, None)})
            for k in item_metadata.keys():
                dataset_piece.pop(k, None)

            # create item
            item = self._create_item(
                split.name, id=f"item_{split.name}_{i}" if self.use_image_name_as_id else None, **item_metadata
            )

            # create view
            views_data: list[tuple[str, View]] = []
            for k, v in dataset_piece.items():
                if k in self.views_schema:
                    view_name = k
                    view_schema = self.views_schema.get(view_name)
                    if view_schema is not None:
                        if isinstance(v, list):
                            if len(v) == 0:
                                continue
                            if len(v) > 1:
                                # create a mosaic from item images
                                mosaic_file = mosaic(self.source_dir, split.name, v, view_name)
                                view_file = self.source_dir / mosaic_file
                                if not view_file.is_file():  # no split path in metadata.jsonl
                                    view_file = self.source_dir / split.name / mosaic_file
                            else:
                                view_file = self.source_dir / Path(v[0])
                                if not view_file.is_file():  # no split path in metadata.jsonl
                                    view_file = self.source_dir / split.name / Path(v[0])
                            if view_file.is_file() and view_file.suffix in self.EXTENSIONS:
                                view = self._create_view(item, view_file, view_schema)
                                views_data.append((view_name, view))
                        else:
                            view_file = self.source_dir / (
                                Path(v) if split.name == Path(v).parts[0] else split / Path(v)
                            )
                            if view_file.is_file() and view_file.suffix in self.EXTENSIONS:
                                view = self._create_view(item, view_file, view_schema)
                                views_data.append((view_name, view))

            all_entities_data: dict[str, list[Entity]] = defaultdict(list)
            all_annotations_data: dict[str, list[Annotation]] = defaultdict(list)
            for k, v in dataset_piece.items():
                if k in self.entities_schema and v is not None:
                    entity_name = k
                    raw_entities_data = v
                    entity_schema = self.entities_schema.get(entity_name)
                    if entity_schema is not None:
                        if is_conversation(entity_schema):
                            entities_data, annotations_data = self._create_vqa_entities(
                                item, views_data, entity_name, entity_schema, raw_entities_data
                            )
                        else:  # classic entity
                            entities_data, annotations_data = self._create_objects_entities(
                                item, views_data, entity_name, entity_schema, raw_entities_data
                            )

                        for name, entities in entities_data.items():
                            all_entities_data[name].extend(entities)

                        for name, annotations in annotations_data.items():
                            all_annotations_data[name].extend(annotations)

            yield {self.item_schema_name: item}
            for view_name, view in views_data:
                yield {view_name: view}

            if all_entities_data is None:
                continue

            yield all_entities_data
            yield all_annotations_data
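The loop below only illustrates the shape of what generate_data yields, assuming a builder instance like the sketch above; in normal use you would call the dataset-building entry point inherited from DatasetBuilder rather than iterating manually. Each yielded dict maps a table/schema name to a single object or a list of objects.

```python
from collections import defaultdict

# Group generated rows by table name to inspect what the builder produces.
rows_per_table: dict[str, list] = defaultdict(list)
for batch in builder.generate_data():
    for table_name, data in batch.items():
        if isinstance(data, list):
            rows_per_table[table_name].extend(data)
        else:
            rows_per_table[table_name].append(data)

# e.g. {"item": 2, "image": 2, "objects": 3, ...} for the two-record metadata file above
print({name: len(rows) for name, rows in rows_per_table.items()})
```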