Source code for decent_bench.benchmark._metric_result

from collections.abc import Mapping, Sequence
from dataclasses import dataclass
from typing import Literal

import pandas as pd

from decent_bench.algorithms import Algorithm
from decent_bench.benchmark._compute.compute_tables import aggregate_table_metrics
from decent_bench.metrics import Metric
from decent_bench.metrics._metrics_view import NetworkMetricsView
from decent_bench.networks import Network



[docs]
@dataclass
class MetricResult:
    """
    Result of metric computation, containing raw data and statistics across agents and trials.

    This class is used to store the computed metrics from a benchmark execution.
    It is returned by the :func:`~decent_bench.benchmark.compute_metrics` function and contains
    all the information about the computed metrics, including agent-level metrics, table statistics,
    and plot data for visualization.

    * `network_views`: contains the raw network-level metrics for each algorithm, organized by algorithm where
        each algorithm maps to a sequence of trials, with each trial containing a
        :class:`~decent_bench.metrics.NetworkMetricsView`.
    * `raw_table_results`: contains raw metric evaluations in a dictionary mapping Metric to pandas.DataFrame. Each
        DataFrame has columns (algorithm, trial, agent, value). Table metrics are evaluated only at the *most recent*
        iteration reached during benchmarking.
    * `raw_plot_results`: contains raw metric evaluations in a dictionary mapping Metric to pandas.DataFrame. Each
        DataFrame has columns (algorithm, trial, agent, iteration, value). Plot metrics are evaluated at *all*
        iterations reached during benchmarking.
    * `table_results`: contains the aggregated results in a pandas.DataFrame with columns
        (metric, algorithm, statistic, mean, std).
    * `plot_results`: contains the aggregated results in a pandas.DataFrame with columns
        (metric, algorithm, iteration, mean, min, max).

    `table_results` can be recomputed with a new set of statistics across agents by using :meth:`update_table_results`.

    Use the properties `algorithms`, `table_metrics`, `plot_metrics` and `iterations` to check for which algorithms,
    metrics and iterations the object stores data. Note that these properties assume that the attributes were
    generated consistently by the backend; no sanity check is performed, so altering any of the attributes might
    lead to unexpected results.
    """

    network_views: Mapping[Algorithm[Network], Sequence[NetworkMetricsView]] | None
    raw_table_results: Mapping[Metric, pd.DataFrame] | None
    raw_plot_results: Mapping[Metric, pd.DataFrame] | None
    table_results: pd.DataFrame | None
    plot_results: pd.DataFrame | None

    def update_table_results(self, statistics_across_agents: list[str] | None) -> pd.DataFrame | None:
        """
        Recompute aggregated table statistics from stored raw table results.

        Args:
            statistics_across_agents: forwarded unchanged to ``aggregate_table_metrics`` together with
                ``raw_table_results``.

        Returns:
            The freshly aggregated table, which is also stored in ``table_results``. ``None`` is returned when
            ``raw_table_results`` is ``None`` or empty; in that case ``table_results`` is left untouched.

        """
        if not self.raw_table_results:
            return None
        self.table_results = aggregate_table_metrics(self.raw_table_results, statistics_across_agents)
        return self.table_results

    @property
    def algorithms(self) -> list[str]:
        """
        Return ``name`` of available algorithms.

        The names can be used for filtering in :func:`~decent_bench.benchmark.display_metrics`.
        """
        # An empty ``network_views`` mapping carries no information, so fall back to the raw results.
        if self.network_views:
            return sorted({algorithm.name for algorithm in self.network_views})
        data = self._which_attribute()
        if not data:
            return []
        # Collect names from every stored frame rather than trusting the first one to be complete.
        names: set[str] = set()
        for frame in data.values():
            names.update(frame["algorithm"].drop_duplicates().tolist())
        return sorted(names)

    @property
    def table_metrics(self) -> list[str]:
        """
        Return ``description`` of available table metrics.

        The descriptions can be used for filtering in :func:`~decent_bench.benchmark.display_metrics`.
        """
        data = self._which_attribute("table")
        if not data:
            return []
        return sorted({metric.description for metric in data})

    @property
    def plot_metrics(self) -> list[str]:
        """
        Return ``description`` of available plot metrics.

        The descriptions can be used for filtering in :func:`~decent_bench.benchmark.display_metrics`.
        """
        data = self._which_attribute("plot")
        if not data:
            return []
        return sorted({metric.description for metric in data})

    @property
    def iterations(self) -> list[int]:
        """Return all the iterations that were reached in at least one trial by at least one algorithm."""
        if not self.raw_plot_results:
            return []
        reached: set[int] = set()
        for frame in self.raw_plot_results.values():
            # ``Series.tolist`` yields native Python scalars, matching the ``list[int]`` annotation.
            reached.update(frame["iteration"].drop_duplicates().tolist())
        return sorted(reached)

    def _which_attribute(self, type_: Literal["all", "table", "plot"] = "all") -> Mapping[Metric, pd.DataFrame] | None:
        """
        Select the raw result mapping to inspect.

        Args:
            type_: ``"table"`` or ``"plot"`` restricts the choice to the corresponding raw results; ``"all"``
                considers the table results first and the plot results second.

        Returns:
            The first candidate mapping that actually holds data, or ``None`` if every candidate is ``None``
            or empty.

        """
        if type_ == "all":
            candidates = [self.raw_table_results, self.raw_plot_results]
        elif type_ == "table":
            candidates = [self.raw_table_results]
        else:
            candidates = [self.raw_plot_results]
        # Skip empty mappings as well as ``None`` so that an empty table mapping cannot hide plot data.
        return next((candidate for candidate in candidates if candidate), None)