Skip to content

Index

ipw.datasets.ipw

Intelligence Per Watt dataset provider and bundled data.

IPWDataset

Bases: DatasetProvider

Dataset provider for the bundled Intelligence Per Watt benchmark.

Source code in intelligence-per-watt/src/ipw/datasets/ipw/ipw.py
@DatasetRegistry.register("ipw")
class IPWDataset(DatasetProvider):
    """Dataset provider for the bundled Intelligence Per Watt benchmark.

    All records are loaded eagerly in ``__init__`` — either from a saved
    HuggingFace dataset directory or from a JSONL file — validated, and
    cached as an immutable tuple so repeated iteration is cheap.
    """

    dataset_name = "Intelligence Per Watt"
    dataset_id = "ipw"

    # Maps a source dataset identifier (stored in each record's embedded
    # metadata) to the registered evaluation-handler key used for scoring.
    # Class-level so the table is built once, not on every score() call.
    _VERIFICATION_MAPPING: Dict[str, str] = {
        "allenai/WildChat": "wildchat",
        "facebook/natural_reasoning": "natural_reasoning",
        "lmsys/lmsys-chat-1m": "wildchat",
    }

    def __init__(self) -> None:
        """Locate the bundled dataset and cache all valid records.

        Raises:
            FileNotFoundError: if the default dataset path does not exist.
        """
        self._path = _default_dataset_path()
        if not self._path.exists():
            raise FileNotFoundError(f"Dataset location not found: {self._path}")
        # Materialize once; iter_records() then hands out cheap iterators.
        self._records = tuple(self._load_all_records())

    def iter_records(self) -> Iterable[DatasetRecord]:
        """Return an iterator over the cached, validated records."""
        return iter(self._records)

    def _load_all_records(self) -> Iterator[DatasetRecord]:
        """Yield validated records from a dataset directory or JSONL file."""
        if self._path.is_dir():
            # The directory loader filters invalid records internally.
            yield from self._load_from_dataset_dir(self._path)
        else:
            yield from (
                record
                for record in self._load_from_jsonl(self._path)
                if self._is_valid(record)
            )

    def _load_from_dataset_dir(self, directory: Path) -> Iterator[DatasetRecord]:
        """Yield valid records from a HuggingFace ``save_to_disk`` directory.

        If the directory holds a ``DatasetDict``, only its first split is
        used.
        """
        dataset = load_from_disk(str(directory))
        if isinstance(dataset, dict):
            # DatasetDict: take the first split (insertion order).
            hf_dataset = next(iter(dataset.values()))
        else:
            hf_dataset = dataset

        raw_records: Iterable[MutableMapping[str, Any]] = (
            hf_dataset if isinstance(hf_dataset, list) else hf_dataset.to_list()
        )
        for raw in raw_records:
            record = self._parse_record(raw)
            if self._is_valid(record):
                yield record

    def _load_from_jsonl(self, file_path: Path) -> Iterator[DatasetRecord]:
        """Yield one (unvalidated) record per non-blank JSONL line."""
        with file_path.open("r", encoding="utf-8") as handle:
            for line in handle:
                stripped = line.strip()
                if not stripped:
                    continue
                raw: Dict[str, Any] = json.loads(stripped)
                yield self._parse_record(raw)

    def _parse_record(self, raw: Dict[str, Any]) -> DatasetRecord:
        """Normalize a raw mapping into a ``DatasetRecord``.

        Falls back across alternate field names ("prompt", "expected_answer")
        and defaults the subject to "general". The complete raw mapping is
        preserved as ``dataset_metadata`` so ``score`` can recover the
        source-dataset configuration later.
        """
        problem = str(raw.get("problem") or raw.get("prompt") or "").strip()
        answer = str(raw.get("answer") or raw.get("expected_answer") or "").strip()
        subject = str(raw.get("subject") or "general").strip() or "general"

        dataset_metadata = dict(raw)
        return DatasetRecord(
            problem=problem,
            answer=answer,
            subject=subject,
            dataset_metadata=dataset_metadata,
        )

    def _is_valid(self, record: DatasetRecord) -> bool:
        """A record is usable only when all four fields are non-empty."""
        return bool(
            record.problem
            and record.answer
            and record.subject
            and record.dataset_metadata
        )

    def size(self) -> int:
        """Return the number of cached records."""
        return len(self._records)

    def verify_requirements(self) -> list[str]:
        """Return human-readable setup issues (empty list when ready)."""
        issues: list[str] = []
        if not (os.getenv("IPW_EVAL_API_KEY") or os.getenv("OPENAI_API_KEY")):
            issues.append(
                "Missing evaluation API key. Set IPW_EVAL_API_KEY (preferred) or OPENAI_API_KEY for scoring."
            )
        return issues

    def score(
        self,
        record: DatasetRecord,
        response: str,
        *,
        eval_client: Optional[InferenceClient] = None,
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """
        Delegate scoring to a dataset-specific evaluation handler based on the
        embedded metadata in this mixed dataset.

        Args:
            record: Dataset record whose embedded metadata identifies the
                source dataset.
            response: The model answer to evaluate.
            eval_client: Optional pre-built judge client; a default is
                created when omitted.

        Returns:
            ``(is_correct, eval_metadata)`` as produced by the handler.

        Raises:
            RuntimeError: if the record carries no serialized
                ``dataset_metadata`` string, or its source dataset has no
                registered evaluation method.
        """
        raw_meta = record.dataset_metadata.get("dataset_metadata")
        if not isinstance(raw_meta, str):
            raise RuntimeError("Missing or invalid 'dataset_metadata' field for scoring.")

        meta = json.loads(raw_meta)
        config = meta.get("config") or {}
        dataset_name = config.get("dataset_name")

        evaluation_method = self._VERIFICATION_MAPPING.get(dataset_name)
        if not evaluation_method:
            raise RuntimeError(
                f"Could not determine evaluation method for dataset: {dataset_name}. "
                f"Supported datasets: {', '.join(sorted(self._VERIFICATION_MAPPING.keys()))}"
            )

        handler = self._resolve_handler(evaluation_method, eval_client)

        is_correct, eval_meta = handler.evaluate(
            problem=record.problem,
            reference=record.answer,
            model_answer=response,
            metadata=meta,
        )
        return is_correct, eval_meta

    def _resolve_handler(
        self, evaluation_method: str, eval_client: Optional[InferenceClient]
    ):
        """Return an evaluation handler, building a default judge client if needed.

        NOTE(review): ``self.eval_client`` / ``self.eval_base_url`` /
        ``self.eval_model`` are not defined anywhere in this class —
        presumably inherited from DatasetProvider; confirm before relying
        on these defaults.
        """
        judge_client = eval_client or ClientRegistry.create(
            self.eval_client or "openai",
            base_url=self.eval_base_url or "https://api.openai.com/v1",
            model=self.eval_model or "gpt-5-nano-2025-08-07",
        )
        return EvaluationRegistry.create(evaluation_method, client=judge_client)

score(record, response, *, eval_client=None)

Delegate scoring to a dataset-specific evaluation handler based on the embedded metadata in this mixed dataset.

Source code in intelligence-per-watt/src/ipw/datasets/ipw/ipw.py
def score(
    self,
    record: DatasetRecord,
    response: str,
    *,
    eval_client: Optional[InferenceClient] = None,
) -> Tuple[Optional[bool], Dict[str, object]]:
    """Score *response* by dispatching to the evaluation handler that
    matches the source dataset recorded in the record's embedded metadata.

    Raises RuntimeError when the serialized metadata is absent or the
    source dataset has no registered evaluation method.
    """
    # The mixed-dataset record carries its original metadata as a JSON string.
    raw_meta = record.dataset_metadata.get("dataset_metadata")
    if not isinstance(raw_meta, str):
        raise RuntimeError("Missing or invalid 'dataset_metadata' field for scoring.")

    parsed_meta = json.loads(raw_meta)
    source_config = parsed_meta.get("config") or {}
    source_name = source_config.get("dataset_name")

    # Source-dataset identifier -> evaluation-handler key.
    handler_by_dataset = {
        "allenai/WildChat": "wildchat",
        "facebook/natural_reasoning": "natural_reasoning",
        "lmsys/lmsys-chat-1m": "wildchat",
    }
    method_key = handler_by_dataset.get(source_name)
    if not method_key:
        raise RuntimeError(
            f"Could not determine evaluation method for dataset: {source_name}. "
            f"Supported datasets: {', '.join(sorted(handler_by_dataset.keys()))}"
        )

    evaluator = self._resolve_handler(method_key, eval_client)

    # Handler returns the (is_correct, eval_metadata) pair we expose as-is.
    is_correct, eval_meta = evaluator.evaluate(
        problem=record.problem,
        reference=record.answer,
        model_answer=response,
        metadata=parsed_meta,
    )
    return is_correct, eval_meta