def generate_data(
    self,
) -> Iterator[dict[str, BaseSchema | list[BaseSchema]]]:
    """Generate data from the source directory.

    Every non-hidden sub-directory of ``self.source_dir`` is treated as a
    split. The split's annotation file (``self.METADATA_FILENAME``) is read
    with ``self._read_annotations`` and each returned record is turned into
    one item, its views and, when present, its entities and annotations.

    Record keys found in none of ``self.views_schema``,
    ``self.entities_schema`` or ``self.annotations_schema`` are removed from
    the record and forwarded to ``self._create_item`` as item metadata.

    Yields:
        For each record, in order: ``{self.item_schema_name: item}``, then one
        ``{view_name: view}`` per created view, then the mapping of entities
        and the mapping of annotations (each only when it is non-empty).

    Raises:
        ValueError: If a split's annotation file cannot be read, or if a view
            name is mapped to ``None`` in ``self.views_schema``.
    """
    source_id = None
    for split in self.source_dir.glob("*"):
        # Only visible directories are splits.
        if not split.is_dir() or split.name.startswith("."):
            continue

        metadata_file = split / self.METADATA_FILENAME
        try:
            dataset_pieces = self._read_annotations(metadata_file)
        except Exception as err:
            # Chain the original error so the root cause stays in the traceback.
            raise ValueError(f"Could not read annotation file {metadata_file}") from err

        for dataset_piece in dataset_pieces:
            # Keys unknown to every schema are item-level metadata.
            item_metadata = {
                key: value
                for key, value in dataset_piece.items()
                if key not in self.views_schema
                and key not in self.entities_schema
                and key not in self.annotations_schema
            }
            for key in item_metadata:
                dataset_piece.pop(key, None)

            # create item
            item = self._create_item(split.name, **item_metadata)

            views_data: list[tuple[str, View]] = []
            all_entities_data: dict[str, list[Entity]] = defaultdict(list)
            all_annotations_data: dict[str, list[Annotation]] = defaultdict(list)

            # create views
            for view_name, files in dataset_piece.items():
                if view_name not in self.views_schema:
                    continue
                view_schema = self.views_schema.get(view_name)
                if view_schema is None:
                    raise ValueError("View schema must be defined in the schemas argument.")
                # Only non-empty lists of files describe a view; an empty list
                # has no file to load, so it is skipped instead of failing on
                # files[0].
                if not isinstance(files, list) or not files:
                    continue

                if len(files) > 1:
                    # create a mosaic from item images
                    mosaic_file = mosaic(self.source_dir, split.name, files, view_name)
                    view_file = self.source_dir / mosaic_file
                else:
                    view_file = self.source_dir / Path(files[0])
                    if not view_file.is_file():  # no split path in metadata.jsonl
                        # `split` comes from source_dir.glob() and already
                        # contains source_dir; prefixing source_dir again
                        # would duplicate it when source_dir is relative.
                        view_file = split / Path(files[0])

                if view_file.is_file() and view_file.suffix in self.EXTENSIONS:
                    view = self._create_vqa_view(item, view_file, view_schema)
                    views_data.append((view_name, view))

            # create entities and their annotations
            for entity_name, raw_entities_data in dataset_piece.items():
                if entity_name not in self.entities_schema:
                    continue
                if source_id is None:
                    # The source is registered once, on first use, and then
                    # shared by every record of every split.
                    source_id = self.add_source("Builder", SourceKind.OTHER)

                entities_data, annotations_data = self._create_vqa_entities(
                    item, views_data, entity_name, raw_entities_data, source_id
                )
                for name, entities in entities_data.items():
                    all_entities_data[name].extend(entities)
                for name, annotations in annotations_data.items():
                    all_annotations_data[name].extend(annotations)

            yield {self.item_schema_name: item}
            for view_name, view in views_data:
                yield {view_name: view}

            # These are defaultdicts and therefore never None: test for
            # emptiness so that mappings carrying no data are not emitted.
            if all_entities_data:
                yield all_entities_data
            if all_annotations_data:
                yield all_annotations_data