Skip to content

mmlu_pro

ipw.evaluation.mmlu_pro

MMLUProHandler

Bases: BaseMCQHandler

Evaluation for MMLU-Pro multiple-choice tasks.

Source code in intelligence-per-watt/src/ipw/evaluation/mmlu_pro.py
@EvaluationRegistry.register("mmlu-pro")
class MMLUProHandler(BaseMCQHandler):
    """Multiple-choice evaluation handler for MMLU-Pro tasks.

    Registered with ``EvaluationRegistry`` under the key ``"mmlu-pro"``.
    The grading itself is delegated to ``_evaluate_mcq``, which is not
    defined in this class (presumably inherited from ``BaseMCQHandler``).
    """

    evaluation_method = "mmlu-pro"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Grade one MMLU-Pro answer by delegating to ``_evaluate_mcq``.

        All arguments are keyword-only and are forwarded unchanged.

        Args:
            problem: The question text.
            reference: The reference answer; for MMLU-Pro this is already
                an option letter (A/B/C/...), so it is passed through
                directly as ``reference_letter``.
            model_answer: The model's answer to be graded.
            metadata: Extra per-sample data handed to ``_evaluate_mcq``.

        Returns:
            Whatever ``_evaluate_mcq`` returns for these inputs.
        """
        # The only adaptation needed is renaming ``reference`` to the
        # ``reference_letter`` keyword that the shared MCQ routine takes.
        mcq_arguments = {
            "problem": problem,
            "reference_letter": reference,
            "model_answer": model_answer,
            "metadata": metadata,
        }
        return self._evaluate_mcq(**mcq_arguments)