Skip to content

swebench

ipw.datasets.swebench

SWEBenchDataset

Bases: DatasetProvider

SWE-bench dataset (princeton-nlp/SWE-bench_Verified).

Supports two variants: `verified` (the full 500-task dataset) and `verified_mini` (a 50-task subset).

Source code in intelligence-per-watt/src/ipw/datasets/swebench.py
@DatasetRegistry.register("swebench")
class SWEBenchDataset(DatasetProvider):
    """SWE-bench dataset (princeton-nlp/SWE-bench_Verified).

    Supports two variants:
    - ``verified``: Full 500-task dataset
    - ``verified_mini``: 50-task subset

    Rows are loaded and converted eagerly in ``__init__``; the resulting
    records are kept in a tuple and served by ``iter_records`` / ``size``.
    """

    dataset_id = "swebench"
    dataset_name = "SWE-bench"
    evaluation_method = "swebench"

    _default_split = "test"
    _default_variant = "verified_mini"

    # SWE-bench correctness is determined by test execution, not LLM judge.
    eval_client: str | None = None
    eval_base_url: str | None = None
    eval_model: str | None = None

    def __init__(
        self,
        *,
        variant: Optional[str] = None,
        split: Optional[str] = None,
        max_samples: Optional[int] = None,
    ) -> None:
        """Load the requested variant/split and convert it to records.

        Args:
            variant: Key into ``_DATASET_PATHS``; a falsy value selects
                ``_default_variant``.
            split: Split name handed to ``load_dataset``; a falsy value
                selects ``_default_split``.
            max_samples: Upper bound on the number of raw rows taken from the
                start of the split. ``None`` means no limit.

        Raises:
            ValueError: If the variant is unknown or ``max_samples`` is
                negative.
        """
        self._variant = variant or self._default_variant
        if self._variant not in _DATASET_PATHS:
            raise ValueError(
                f"Unknown SWE-bench variant '{self._variant}'. "
                f"Choose from: {list(_DATASET_PATHS)}"
            )
        # A negative bound would be interpreted by the slice in
        # ``_load_raw_rows`` as "drop rows from the end", which is never what
        # a sample cap means. Reject it up front, before any data is loaded.
        if max_samples is not None and max_samples < 0:
            raise ValueError(
                f"max_samples must be non-negative, got {max_samples}"
            )
        self._split = split or self._default_split
        self._max_samples = max_samples
        self._records: Tuple[DatasetRecord, ...] = tuple(self._build_records())

    def iter_records(self) -> Iterable[DatasetRecord]:
        """Return a fresh iterator over the records built at construction."""
        return iter(self._records)

    def size(self) -> int:
        """Return the number of records built at construction."""
        return len(self._records)

    # ------------------------------------------------------------------
    # Dataset loading
    # ------------------------------------------------------------------

    def _build_records(self) -> List[DatasetRecord]:
        """Convert every raw row, dropping rows ``_convert_row`` rejects."""
        converted = (self._convert_row(raw) for raw in self._load_raw_rows())
        return [record for record in converted if record is not None]

    def _load_raw_rows(self) -> Sequence[MutableMapping[str, object]]:
        """Load the configured split and return its rows as mutable mappings.

        ``max_samples`` is applied to the raw rows, i.e. before
        ``_convert_row`` filters out incomplete ones, so the final record
        count may be smaller than ``max_samples``.
        """
        hf_path = _DATASET_PATHS[self._variant]
        dataset = load_dataset(hf_path, split=self._split)
        rows: Sequence[MutableMapping[str, object]]
        # Use the dataset's own ``to_list()`` when it offers one; otherwise
        # fall back to plain iteration.
        if hasattr(dataset, "to_list"):
            rows = dataset.to_list()
        else:
            rows = list(dataset)
        if self._max_samples is not None:
            rows = rows[: self._max_samples]
        # Rows that are not already mutable mappings are copied into dicts.
        return [
            row if isinstance(row, MutableMapping) else dict(row)
            for row in rows
        ]

    def _convert_row(self, raw: MutableMapping[str, object]) -> Optional[DatasetRecord]:
        """Turn one raw row into a ``DatasetRecord``.

        Returns ``None`` when the row has no ``instance_id`` or its
        ``problem_statement`` is empty after stripping whitespace.
        """
        instance_id = str(raw.get("instance_id") or "")
        repo = str(raw.get("repo") or "")
        # The problem is the issue description
        problem_statement = str(raw.get("problem_statement") or "").strip()

        if not instance_id or not problem_statement:
            return None

        # The "answer" is the ground-truth patch
        patch = str(raw.get("patch") or "")

        # The raw values are handed to ``_parse_test_list`` unchanged; their
        # on-disk format is not visible here.
        fail_to_pass = _parse_test_list(raw.get("FAIL_TO_PASS", ""))
        pass_to_pass = _parse_test_list(raw.get("PASS_TO_PASS", ""))

        metadata: MutableMapping[str, object] = {
            "dataset_name": self.dataset_name,
            "instance_id": instance_id,
            "repo": repo,
            "base_commit": raw.get("base_commit"),
            "hints_text": raw.get("hints_text"),
            "version": raw.get("version"),
            "test_patch": raw.get("test_patch"),
            "created_at": raw.get("created_at"),
            "environment_setup_commit": raw.get("environment_setup_commit"),
            "fail_to_pass": fail_to_pass,
            "pass_to_pass": pass_to_pass,
            "difficulty": raw.get("difficulty"),
            "variant": self._variant,
        }

        return DatasetRecord(
            problem=problem_statement,
            answer=patch,
            subject=repo,
            dataset_metadata=metadata,
        )