Skip to content

pixano.data.exporters.coco_exporter

COCOExporter(input_dir)

Bases: Exporter

Exporter class for COCO instances dataset

Attributes:

Name Type Description
dataset Dataset

Dataset to export

coco_json dict[str, Any]

Dataset split in COCO format

Parameters:

Name Type Description Default
input_dir Path

Input dataset directory

required
Source code in pixano/data/exporters/exporter.py
def __init__(
    self,
    input_dir: Path,
):
    """Initialize the exporter from a dataset directory.

    Args:
        input_dir (Path): Input dataset directory
    """

    # Open the Pixano dataset that subsequent export calls will read from
    self.dataset = Dataset(input_dir)

export_dataset(export_dir, splits=None, objects_sources=None, copy=True)

Export dataset back to original format

Parameters:

Name Type Description Default
export_dir Path

Export directory

required
splits list[str]

Dataset splits to export, all if None. Defaults to None.

None
objects_sources list[str]

Objects sources to export, all if None. Defaults to None.

None
copy bool

True to copy files to export directory. Defaults to True.

True
Source code in pixano/data/exporters/coco_exporter.py
def export_dataset(
    self,
    export_dir: Path,
    splits: list[str] = None,
    objects_sources: list[str] = None,
    copy: bool = True,
):
    """Export dataset back to COCO instances format.

    Writes one ``instances_<split>.json`` file per split into an
    ``annotations [...]`` subdirectory of `export_dir`, and optionally
    copies the media directory alongside it.

    Args:
        export_dir (Path): Export directory
        splits (list[str], optional): Dataset splits to export, all if None. Defaults to None.
        objects_sources (list[str], optional): Objects sources to export, all if None. Defaults to None.
        copy (bool, optional): True to copy files to export directory. Defaults to True.

    Raises:
        ValueError: If the dataset has no splits or no objects tables to export.
    """

    # If no splits provided, select all splits
    if splits is None:
        splits = self.dataset.info.splits
        # If no splits, there is nothing to export
        if not splits:
            raise ValueError("Dataset has no splits to export.")

    # If no object sources provided, select all object tables
    if objects_sources is None:
        objects_sources = [
            table.source for table in self.dataset.info.tables["objects"]
        ]
        # If no object tables, there is nothing to export
        if not objects_sources:
            raise ValueError("Dataset has no objects tables to export.")

    # Create export directory
    ann_dir = export_dir / f"annotations [{', '.join(objects_sources)}]"
    ann_dir.mkdir(parents=True, exist_ok=True)

    self._category_id_count = 0  # used if no category_id in dataset (TODO: get from prebuilt coco id/name mapping)

    # Build the COCO categories list once: it is derived from the dataset's
    # feature values and does not depend on the split being exported.
    # BUGFIX: `categories` was previously unbound (NameError) when only
    # "category" was defined without a matching "category_id".
    categories = []
    features_values = self.dataset.info.features_values
    if (
        features_values
        and features_values.objects
        and "category" in features_values.objects
    ):
        if "category_id" in features_values.objects:
            categories = [
                {"id": id, "name": name, "supercategory": ""}
                for id, name in zip(
                    features_values.objects["category_id"].values,
                    features_values.objects["category"].values,
                )
            ]
        else:
            # No explicit ids: fall back to enumeration order
            categories = [
                {"id": i, "name": val}
                for i, val in enumerate(features_values.objects["category"].values)
            ]

    batch_size = 1024

    # Iterate on splits
    with tqdm(desc="Processing dataset", total=self.dataset.num_rows) as progress:
        for split in splits:
            # Create COCO json skeleton for this split
            self.coco_json = {
                "info": {
                    "description": self.dataset.info.name,
                    "url": "N/A",
                    "version": f"v{datetime.datetime.now().strftime('%y%m%d.%H%M%S')}",
                    "year": datetime.date.today().year,
                    "contributor": "Exported from Pixano",
                    "date_created": datetime.date.today().isoformat(),
                },
                # BUGFIX: COCO format key is "licenses", not "licences"
                "licenses": [
                    {
                        "url": "N/A",
                        "id": 1,
                        "name": "Unknown",
                    },
                ],
                "images": [],
                "annotations": [],
                "categories": categories,
            }

            for batch_index in range(ceil(self.dataset.num_rows / batch_size)):
                # NOTE(review): `limit` grows with batch_index — this assumes
                # Dataset.load_items treats `limit` as an absolute end row
                # rather than a per-batch row count; TODO confirm, otherwise
                # batches overlap and should use limit=batch_size.
                items = self.dataset.load_items(
                    limit=min(
                        self.dataset.num_rows,
                        (batch_index + 1) * batch_size,
                    ),
                    offset=batch_index * batch_size,
                )

                # Iterate on items, keeping only those of the current split
                for item in items:
                    if item.split == split:
                        # Export item
                        self._export_item(item, objects_sources)
                        # Update progress bar
                        progress.update(1)

            # Save COCO format .json file
            with open(
                ann_dir / f"instances_{split}.json", "w", encoding="utf-8"
            ) as f:
                json.dump(self.coco_json, f)

    # Copy media directory (skip when it would copy onto itself)
    if copy:
        if (
            self.dataset.media_dir.exists()
            and self.dataset.media_dir != export_dir / "media"
        ):
            shutil.copytree(
                self.dataset.media_dir, export_dir / "media", dirs_exist_ok=True
            )