Skip to content

gpqa

ipw.evaluation.gpqa

SuperGPQAHandler

Bases: BaseMCQHandler

Evaluation for SuperGPQA tasks.

Source code in intelligence-per-watt/src/ipw/evaluation/gpqa.py
@EvaluationRegistry.register("supergpqa")
class SuperGPQAHandler(BaseMCQHandler):
    """Multiple-choice evaluation handler registered under ``"supergpqa"``.

    All scoring is delegated to ``_evaluate_mcq`` (not defined in this
    class; presumably inherited from ``BaseMCQHandler`` -- confirm there).
    """

    evaluation_method = "supergpqa"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Evaluate one SuperGPQA response by forwarding to ``_evaluate_mcq``.

        Args:
            problem: The question text, passed through unchanged.
            reference: The gold answer; forwarded as ``reference_letter``.
            model_answer: The model's raw response, passed through unchanged.
            metadata: Per-sample metadata, passed through unchanged.

        Returns:
            Whatever ``_evaluate_mcq`` returns, unmodified. Per the
            annotation this is an optional boolean verdict paired with a
            dictionary of details.
        """
        # Original author's note: the upstream SuperGPQA dataset keeps the
        # correct letter in ``answer_letter``, while the mixed IPW dataset
        # already supplies that letter as ``reference``. Hence no conversion
        # is applied before delegating.
        gold_letter = reference
        outcome = self._evaluate_mcq(
            problem=problem,
            reference_letter=gold_letter,
            model_answer=model_answer,
            metadata=metadata,
        )
        return outcome

GPQAHandler

Bases: BaseMCQHandler

Evaluation for GPQA tasks.

Source code in intelligence-per-watt/src/ipw/evaluation/gpqa.py
@EvaluationRegistry.register("gpqa")
class GPQAHandler(BaseMCQHandler):
    """Multiple-choice evaluation handler registered under ``"gpqa"``.

    All scoring is delegated to ``_evaluate_mcq`` (not defined in this
    class; presumably inherited from ``BaseMCQHandler`` -- confirm there).
    """

    evaluation_method = "gpqa"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Evaluate one GPQA response by forwarding to ``_evaluate_mcq``.

        Args:
            problem: The question text, passed through unchanged.
            reference: The gold answer; forwarded as ``reference_letter``.
            model_answer: The model's raw response, passed through unchanged.
            metadata: Per-sample metadata, passed through unchanged.

        Returns:
            Whatever ``_evaluate_mcq`` returns, unmodified. Per the
            annotation this is an optional boolean verdict paired with a
            dictionary of details.
        """
        # Original author's note: the base GPQA dataset normalizes every
        # correct answer to letter "A", so the IPW "answer" field is expected
        # to already be the gold letter. NOTE(review): nothing here checks
        # that assumption -- ``reference`` is forwarded as-is.
        gold_letter = reference
        outcome = self._evaluate_mcq(
            problem=problem,
            reference_letter=gold_letter,
            model_answer=model_answer,
            metadata=metadata,
        )
        return outcome