Skip to content

Index

ipw.datasets

Dataset implementations bundled with Intelligence Per Watt.

Datasets register themselves with ipw.core.DatasetRegistry.

DatasetProvider

Bases: ABC

Base interface for providing prompts to the profiler.

Source code in intelligence-per-watt/src/ipw/datasets/base.py
class DatasetProvider(ABC):
    """Abstract source of prompts consumed by the profiler.

    Concrete providers must implement ``iter_records`` and ``size``.
    ``score`` and ``verify_requirements`` ship with default implementations
    that subclasses may override.
    """

    dataset_id: str
    dataset_name: str

    # Default evaluation settings; an individual dataset may override them.
    eval_client: str | None = "openai"
    eval_base_url: str | None = "https://api.openai.com/v1"
    eval_model: str | None = "gpt-5-nano-2025-08-07"

    def __iter__(self) -> Iterator[DatasetRecord]:
        """Iterate over whatever ``iter_records`` produces."""
        records = self.iter_records()
        return iter(records)

    @abstractmethod
    def iter_records(self) -> Iterable[DatasetRecord]:
        """Produce the dataset's records, in execution order."""

    @abstractmethod
    def size(self) -> int:
        """Report how many records the dataset holds."""

    def score(
        self,
        record: DatasetRecord,
        response: str,
        *,
        eval_client: Optional[InferenceClient] = None,
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """
        Judge whether one model response is correct.

        The base implementation always raises; datasets that support scoring
        are expected to override this method.

        Args:
            record: Dataset record holding the problem and its reference answer
            response: Model output to be evaluated
            eval_client: Inference client for LLM-based judging, if any

        Returns:
            An (is_correct, metadata) pair:
            - is_correct: True/False when scored, None when unscorable
            - metadata: evaluation details specific to the scoring method

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        reason = "score() is not implemented for this dataset"
        raise NotImplementedError(reason)

    def verify_requirements(self) -> list[str]:
        """
        List the requirements this dataset is missing (e.g., absent env vars).
        The base implementation reports none: it returns a new empty list.
        """
        unmet: list[str] = []
        return unmet

iter_records() abstractmethod

Yield dataset records in the order they should be executed.

Source code in intelligence-per-watt/src/ipw/datasets/base.py
@abstractmethod
def iter_records(self) -> Iterable[DatasetRecord]:
    """Yield dataset records in the order they should be executed.

    Abstract: there is no default implementation, so every concrete provider
    must supply one. ``DatasetProvider.__iter__`` wraps the returned iterable
    with ``iter()``.
    """

size() abstractmethod

Return the number of records.

Source code in intelligence-per-watt/src/ipw/datasets/base.py
@abstractmethod
def size(self) -> int:
    """Return the number of records.

    Abstract: there is no default implementation, so every concrete provider
    must supply one.
    """

score(record, response, *, eval_client=None)

Compute correctness for a single model response.

Parameters:

Name Type Description Default
record DatasetRecord

The dataset record containing problem and reference answer

required
response str

The model's response to evaluate

required
eval_client Optional[InferenceClient]

Optional inference client to use for LLM-based judging

None

Returns:

Type Description
Tuple[Optional[bool], Dict[str, object]]

(is_correct, metadata) tuple where:
  • is_correct: True/False if scored, None if unscorable
  • metadata: method-specific evaluation details
Source code in intelligence-per-watt/src/ipw/datasets/base.py
def score(
    self,
    record: DatasetRecord,
    response: str,
    *,
    eval_client: Optional[InferenceClient] = None,
) -> Tuple[Optional[bool], Dict[str, object]]:
    """
    Judge whether one model response is correct.

    This default implementation always raises; datasets that support scoring
    are expected to override it.

    Args:
        record: Dataset record holding the problem and its reference answer
        response: Model output to be evaluated
        eval_client: Inference client for LLM-based judging, if any

    Returns:
        An (is_correct, metadata) pair:
        - is_correct: True/False when scored, None when unscorable
        - metadata: evaluation details specific to the scoring method

    Raises:
        NotImplementedError: Always, in this default implementation.
    """
    reason = "score() is not implemented for this dataset"
    raise NotImplementedError(reason)

verify_requirements()

Return a list of unmet requirements for this dataset (e.g., missing env vars). An empty list means all required preconditions are satisfied.

Source code in intelligence-per-watt/src/ipw/datasets/base.py
def verify_requirements(self) -> list[str]:
    """
    List the requirements this dataset is missing (e.g., absent env vars).
    This default implementation reports none: it returns a new empty list.
    """
    unmet: list[str] = []
    return unmet

ensure_registered()

Import built-in dataset providers to populate the registry.

Source code in intelligence-per-watt/src/ipw/datasets/__init__.py
def ensure_registered() -> None:
    """Import built-in dataset providers to populate the registry.

    The core dataset modules are imported unconditionally, so an import
    failure in any of them propagates to the caller. ``terminalbench`` is
    treated as optional: an ``ImportError`` raised while importing it is
    logged at DEBUG level and otherwise ignored.
    """
    # Local import keeps this function self-contained; logging is stdlib.
    import logging

    from . import (  # noqa: F401
        frames,
        gaia,
        hle,
        ipw,
        mmlu_pro,
        simpleqa,
        supergpqa,
        swebench,
        swefficiency,
    )

    # Optional-dependency datasets: import errors must not prevent the rest
    # of the registry from being populated. The failure is still recorded at
    # DEBUG level so a missing (or broken) optional provider can be diagnosed
    # instead of vanishing without a trace.
    try:
        from . import terminalbench  # noqa: F401
    except ImportError as exc:
        logging.getLogger(__name__).debug(
            "Optional dataset 'terminalbench' was not registered: %s", exc
        )