Skip to content

pixano.analytics.feature_statistics

categorical_stats(df, split, field_name)

Compute feature categorical statistics

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame

required
split str

DataFrame split

required
field_name str

Selected field

required

Returns:

Type Description
list[dict]

Feature statistics

Source code in pixano/analytics/feature_statistics.py
def categorical_stats(df: pd.DataFrame, split: str, field_name: str) -> list[dict]:
    """Count the occurrences of each distinct value of a categorical field

    Args:
        df (pd.DataFrame): DataFrame holding the field to count
        split (str): Split name, copied into every returned entry
        field_name (str): Column whose distinct values are counted

    Returns:
        list[dict]: One entry per distinct value, with the value stored under
            the field name, its number of rows under "counts" and the split
            name under "split"
    """

    occurrences = df.value_counts(subset=field_name)

    stats = []
    for value, count in occurrences.items():
        stats.append({field_name: value, "counts": count, "split": split})
    return stats

compute_additional_data(data_table)

Convert Table to DataFrame and add resolution and aspect ratio

Parameters:

Name Type Description Default
data_table Table

Input Table

required

Returns:

Type Description
DataFrame

DataFrame with added resolution and aspect ratio, or None if the Table has no "width" or "height" column

Source code in pixano/analytics/feature_statistics.py
def compute_additional_data(data_table: pa.Table) -> pd.DataFrame:
    """Convert Table to DataFrame and add resolution and aspect ratio

    Args:
        data_table (pa.Table): Input Table

    Returns:
        pd.DataFrame: DataFrame with the "width" and "height" columns of the
            Table plus "resolution" (e.g. "640x480") and "aspect_ratio"
            (reduced fraction, e.g. "4:3") columns, or None if the Table has
            no "width" or "height" column

    Raises:
        ZeroDivisionError: If a row has a height of 0
    """

    # Without both dimensions there is nothing to compute
    if not all(p in data_table.column_names for p in ["width", "height"]):
        return None

    # Only convert the columns we need (image columns can't be converted to pandas)
    data = data_table.select(["width", "height"]).to_pandas()

    # Build the derived columns from plain lists instead of a row-wise apply:
    # apply(axis=1) creates one Series per row and does not reliably return a
    # Series on an empty frame, which makes the column assignment fail
    sizes = list(zip(data["width"], data["height"]))
    data["resolution"] = [str(w) + "x" + str(h) for w, h in sizes]
    data["aspect_ratio"] = [
        str(Fraction(w, h)).replace("/", ":") for w, h in sizes
    ]

    return data

compute_stats(df, split, feature)

Compute feature statistics

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame

required
split str

DataFrame split

required
feature dict

Selected feature

required

Returns:

Type Description
list[dict]

Feature statistics

Source code in pixano/analytics/feature_statistics.py
def compute_stats(df: pd.DataFrame, split: str, feature: dict[str, Any]) -> list[dict]:
    """Dispatch to the statistics function matching the feature type

    Args:
        df (pd.DataFrame): Input DataFrame
        split (str): DataFrame split
        feature (dict): Feature description, with a "type" key, a "name" key
            and, for numerical features, an optional "range" key

    Returns:
        list[dict]: Feature statistics, empty if the feature type is neither
            "categorical" nor "numerical"
    """

    kind = feature["type"]

    if kind == "categorical":
        stats = categorical_stats(df, split, feature["name"])
    elif kind == "numerical":
        # A missing "range" is passed on as None
        stats = numerical_stats(df, split, feature["name"], feature.get("range"))
    else:
        # Unsupported feature types yield no statistics
        stats = []

    return stats

numerical_stats(df, split, field_name, field_range=None)

Compute feature numerical statistics

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame

required
split str

DataFrame split

required
field_name str

Selected field

required
field_range list[float]

Selected field range. Defaults to None.

None

Returns:

Type Description
list[dict]

Feature statistics

Source code in pixano/analytics/feature_statistics.py
def numerical_stats(
    df: pd.DataFrame, split: str, field_name: str, field_range: list[float] = None
) -> list[dict]:
    """Compute feature numerical statistics

    The field values are binned with np.histogram using its default number
    of bins. Missing values (NaN / None) are ignored.

    Args:
        df (pd.DataFrame): Input DataFrame
        split (str): DataFrame split
        field_name (str): Selected field
        field_range (list[float], optional): Selected field range, as
            [lower, upper]. Defaults to None, in which case the range is
            taken from the data.

    Returns:
        list[dict]: One entry per bin, with the bin edges under "bin_start"
            and "bin_end", the number of values in the bin under "counts"
            and the split name under "split"
    """

    # Drop missing values first: with an autodetected range, a single NaN
    # makes np.histogram raise, while an explicit range silently ignores it
    values = df[field_name].dropna()

    counts, bins = np.histogram(values, range=field_range)

    # bins has one more element than counts: pair each count with its edges
    return [
        {
            "bin_start": float(start),
            "bin_end": float(end),
            "counts": int(count),
            "split": split,
        }
        for count, start, end in zip(counts, bins[:-1], bins[1:])
    ]

objects_table_to_df(data_table, field)

Convert a field from the objects column to a DataFrame

Parameters:

Name Type Description Default
data_table Table

Table with an objects column

required
field str

Selected field from the objects column

required

Returns:

Type Description
DataFrame

Selected field as DataFrame

Source code in pixano/analytics/feature_statistics.py
def objects_table_to_df(data_table: pa.Table, field: str) -> pd.DataFrame:
    """Extract one field of every object in the objects column as a DataFrame

    Args:
        data_table (pa.Table): Table with an objects column
        field (str): Field to read from each object

    Returns:
        pd.DataFrame: DataFrame with a single column named after the field,
            holding one row per object

    Raises:
        ValueError: If a ValueError occurs during the conversion
    """

    try:
        objects_column = data_table.select(["objects"]).to_pandas()["objects"]

        # Flatten the per-row object lists into one record per object
        records = []
        for row_objects in objects_column:
            for obj in row_objects:
                records.append({field: obj[field]})

        return pd.DataFrame.from_dict(records)
    except ValueError as e:
        raise ValueError("Unable to convert table Pandas DataFrame") from e