Source code for decent_bench.datasets._dataset_handler
from __future__ import annotations
from abc import ABC, abstractmethod
from collections.abc import Sequence
from decent_bench.utils.types import Dataset
[docs]
class DatasetHandler(ABC):
"""
Abstract wrapper for datasets used in decentralized optimization benchmark problems.
This class provides an interface for accessing datasets in a partitioned format
for decentralized optimization scenarios. Rather than storing the data directly,
:class:`DatasetHandler` implementations act as wrappers that return data in the required
format when queried.
In decentralized optimization, the dataset is typically divided among multiple
agents in a network, where each agent has access to only a subset (partition)
of the complete dataset. This class abstracts that partitioning scheme.
When defining benchmark problems, a DatasetHandler instance can be used to:
- Provide local datasets to each agent in the network via :meth:`get_partitions`
- Define the overall optimization problem (e.g., empirical risk minimization)
- Serve as a test set for evaluating decentralized algorithms on the full dataset
(e.g. via :meth:`get_datapoints`) by defining the `test_data` field of
:class:`~decent_bench.benchmark.BenchmarkProblem`.
Data Structure:
The dataset consists of datapoints, where each datapoint is a tuple of
(features, targets). Features and targets are represented as :class:`~decent_bench.utils.array.Array`
objects or framework-specific tensor objects in special cases. For unsupervised learning,
targets are usually None. Partitions are sequences of such datapoints,
allowing users to easily distribute local datasets among agents.
Note:
Implementations may load data from various sources (files, generators,
synthetic data, etc) and are not required to store all datapoints in memory.
"""
@property
@abstractmethod
def n_samples(self) -> int:
"""Total number of datapoints in the dataset."""
@property
@abstractmethod
def n_partitions(self) -> int:
"""Total number of partitions in the dataset."""
@property
@abstractmethod
def n_features(self) -> int:
"""Number of feature dimensions."""
@property
@abstractmethod
def n_targets(self) -> int:
"""Number of target dimensions."""
[docs]
@abstractmethod
def get_datapoints(self) -> Dataset:
"""
Return all datapoints in the dataset.
Can be used for evaluation on the full dataset or creation of test datasets.
"""
[docs]
@abstractmethod
def get_partitions(self) -> Sequence[Dataset]:
"""
Return the dataset divided into partitions for distribution among agents.
This method provides the core partitioning functionality for decentralized
optimization. Each partition represents the local dataset of an agent in
the network.
Returns:
Sequence[Dataset]: Sequence of Dataset objects, where each partition is a list of
(features, targets) tuples.
"""
def __len__(self) -> int:
"""Return the number of datapoints in the dataset."""
return self.n_samples