Adding Datasets¶
Datasets provide the workload for profiling. To add a new dataset, subclass DatasetProvider and register it with DatasetRegistry.
Step 1: Create the Dataset File¶
Create a new file in intelligence-per-watt/src/ipw/datasets/:
# ipw/datasets/my_benchmark.py
from __future__ import annotations
import os
from typing import Dict, Iterable, List, MutableMapping, Optional, Tuple
from datasets import load_dataset
from ..clients.base import InferenceClient
from ..core.registry import ClientRegistry, DatasetRegistry, EvaluationRegistry
from ..core.types import DatasetRecord
from .base import DatasetProvider
@DatasetRegistry.register("my-benchmark")
class MyBenchmarkDataset(DatasetProvider):
    """My custom benchmark dataset."""

    # Registry identifiers; dataset_id matches the name registered above.
    dataset_id = "my-benchmark"
    dataset_name = "My Benchmark"
    evaluation_method = "my-benchmark"  # Maps to EvaluationRegistry

    # Evaluation settings: defaults used when no judge client is injected
    # into score() (see _resolve_handler).
    eval_client = "openai"
    eval_base_url = "https://api.openai.com/v1"
    eval_model = "gpt-5-nano-2025-08-07"

    _hf_path = "my-org/my-benchmark"  # HuggingFace dataset path
    _default_split = "test"

    def __init__(
        self,
        *,
        split: Optional[str] = None,
        max_samples: Optional[int] = None,
    ) -> None:
        """Eagerly load and convert the dataset.

        Args:
            split: HuggingFace split to load; defaults to ``_default_split``.
            max_samples: If given, truncate the split to this many rows.
        """
        self._split = split or self._default_split
        self._max_samples = max_samples
        # Records are materialized once here, so iter_records() and size()
        # are cheap and deterministic afterwards.
        self._records: Tuple[DatasetRecord, ...] = tuple(self._build_records())

    def iter_records(self) -> Iterable[DatasetRecord]:
        """Yield the dataset's records in load order."""
        return iter(self._records)

    def size(self) -> int:
        """Return the total number of records."""
        return len(self._records)

    def verify_requirements(self) -> list[str]:
        """Return descriptions of unmet requirements; empty means ready."""
        issues: list[str] = []
        # The LLM judge needs an API key; either env var is accepted.
        if not (os.getenv("IPW_EVAL_API_KEY") or os.getenv("OPENAI_API_KEY")):
            issues.append(
                "Missing evaluation API key. Set IPW_EVAL_API_KEY or OPENAI_API_KEY."
            )
        return issues

    def score(
        self,
        record: DatasetRecord,
        response: str,
        *,
        eval_client: Optional[InferenceClient] = None,
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Score one model response via the registered evaluation handler.

        Args:
            record: The dataset record the response answers.
            response: The model's raw answer text.
            eval_client: Optional pre-built judge client; when omitted, one
                is created from the class-level eval_* defaults.

        Returns:
            ``(is_correct, metadata)`` where ``is_correct`` may be ``None``
            if the response is unscorable.
        """
        handler = self._resolve_handler(eval_client)
        return handler.evaluate(
            problem=record.problem,
            reference=record.answer,
            model_answer=response,
            metadata=record.dataset_metadata,
        )

    def _resolve_handler(self, eval_client: Optional[InferenceClient]):
        # Prefer an injected judge client; otherwise build one from the
        # class defaults. The `or` fallbacks guard against subclasses
        # setting these attributes to None.
        judge_client = eval_client or ClientRegistry.create(
            self.eval_client or "openai",
            base_url=self.eval_base_url or "https://api.openai.com/v1",
            model=self.eval_model or "gpt-5-nano-2025-08-07",
        )
        return EvaluationRegistry.create(self.evaluation_method, client=judge_client)

    def _build_records(self) -> List[DatasetRecord]:
        """Download the HF split and convert rows, dropping invalid ones."""
        dataset = load_dataset(self._hf_path, split=self._split)
        rows = list(dataset)
        if self._max_samples is not None:
            rows = rows[: self._max_samples]
        records: List[DatasetRecord] = []
        for raw in rows:
            record = self._convert_row(raw)
            # Rows missing a question or answer are silently skipped.
            if record is not None:
                records.append(record)
        return records

    def _convert_row(self, raw: dict) -> Optional[DatasetRecord]:
        """Map one raw HF row to a DatasetRecord, or None if unusable."""
        question = str(raw.get("question", "")).strip()
        answer = str(raw.get("answer", "")).strip()
        if not question or not answer:
            return None
        return DatasetRecord(
            problem=question,
            answer=answer,
            subject=str(raw.get("category", "general")),
            dataset_metadata={
                "dataset_name": self.dataset_name,
                "id": raw.get("id"),
            },
        )
Step 2: Register the Import¶
Add your module to ipw/datasets/__init__.py:
def ensure_registered() -> None:
    """Import every dataset module so its @DatasetRegistry.register runs."""
    from . import (
        # ... existing imports ...
        my_benchmark,  # Add this: the import triggers registration
    )
If your dataset has optional dependencies, wrap the import in a try/except ImportError so that a missing package skips registration instead of breaking the whole package.
Step 3: Create an Evaluation Handler (Optional)¶
If your dataset needs custom scoring logic, create an evaluation handler:
# ipw/evaluation/my_benchmark.py
from ..core.registry import EvaluationRegistry
from .base import EvaluationHandler
@EvaluationRegistry.register("my-benchmark")
class MyBenchmarkEvaluator(EvaluationHandler):
    """Case-insensitive exact-match evaluator for the my-benchmark dataset."""

    evaluation_method = "my-benchmark"

    def evaluate(self, *, problem, reference, model_answer, metadata):
        """Return (is_correct, details) by comparing normalized answers."""
        def _norm(text):
            # Normalize: trim surrounding whitespace, then lowercase.
            return text.strip().lower()

        is_correct = _norm(reference) == _norm(model_answer)
        return is_correct, {"method": "exact_match"}
Register it in ipw/evaluation/__init__.py.
Step 4: Test¶
# Verify registration
ipw list datasets
# Run a profile
ipw profile --dataset my-benchmark --client ollama --model llama3.2:1b \
--client-base-url http://localhost:11434 --max-queries 10
Key Methods¶
iter_records()¶
Must yield DatasetRecord objects. Each record contains:
problem — the prompt to send to the model
answer — the reference/ground-truth answer
subject — a category label (used for grouping in analysis)
dataset_metadata — arbitrary metadata (preserved in output)
size()¶
Return the total number of records. Used for progress bars and validation.
score()¶
Score a single model response. Returns (is_correct, metadata):
is_correct: True, False, or None (if unscorable)
metadata: dictionary with evaluation details
For MCQ datasets, scoring can be done without an LLM judge (exact match on the selected option). For open-ended datasets, delegate to an EvaluationHandler that uses an LLM judge.
verify_requirements()¶
Return a list of unmet requirements (e.g., missing API keys, missing packages). An empty list means the dataset is ready. Called before profiling starts.
Prompt Formatting¶
Format prompts to get the best results from the model. Common patterns:
# Multiple choice: present lettered options and ask for the letter only.
prompt = f"{question}\n\nOptions:\n{formatted_options}\n\nRespond with the correct letter."

# Open-ended QA: explicitly request a concise answer.
prompt = f"Answer the following question concisely:\n\n{question}"

# With context: supply the context before asking the question.
prompt = f"Given the following context:\n{context}\n\nAnswer: {question}"
Existing Datasets for Reference¶
ipw/datasets/mmlu_pro.py — MCQ with option formatting
ipw/datasets/gaia.py — agentic with file attachments and caching
ipw/datasets/swebench.py — coding with variant support
ipw/datasets/simpleqa.py — simple factual QA