Skip to content

hle

ipw.evaluation.hle

HLEHandler

Bases: EvaluationHandler

LLM-as-judge evaluation for Humanity's Last Exam (CORRECT / INCORRECT / NOT_ATTEMPTED).

Source code in intelligence-per-watt/src/ipw/evaluation/hle.py
@EvaluationRegistry.register("hle")
class HLEHandler(EvaluationHandler):
    """LLM-as-judge evaluation for Humanity's Last Exam.

    The judge grade is one of CORRECT / INCORRECT / NOT_ATTEMPTED; only
    CORRECT is reported as a correct answer.
    """

    evaluation_method = "hle"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Grade ``model_answer`` against ``reference`` with the judge client.

        Args:
            problem: Question text substituted into the grader prompt.
            reference: Ground-truth answer substituted into the grader prompt.
            model_answer: Response being graded.
            metadata: Not read by this handler.

        Returns:
            A ``(verdict, details)`` pair. ``verdict`` is ``True`` only when
            the parsed grade equals ``"CORRECT"``, ``False`` for any other
            grade or for an empty/whitespace-only ``model_answer``, and
            ``None`` when ``reference`` is empty or when the judge call or
            the parsing of its output raised. ``details`` carries the grade
            and raw judge output, a ``reason``, or an ``error`` string.

        Raises:
            RuntimeError: If ``self._client`` has no ``chat`` attribute.
        """
        if not model_answer or not model_answer.strip():
            return False, {"grade": "NOT_ATTEMPTED", "reason": "empty_response"}

        if not reference or not reference.strip():
            return None, {"reason": "no_ground_truth"}

        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "HLEHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )

        prompt = _GRADER_TEMPLATE.format(
            question=problem,
            target=reference,
            predicted=model_answer,
        )

        try:
            raw = self._client.chat(
                system_prompt="",
                user_prompt=prompt,
                temperature=0.0,
                max_output_tokens=1024,
            )

            grade = _parse_grade(raw)
            is_correct = grade == "CORRECT"

            meta: Dict[str, object] = {
                "grade": grade,
                "raw_judge_output": raw,
            }

            # Capture only the remainder of the *same* line. The previous
            # pattern used ``\s*``, which also consumes newlines, so a blank
            # "extracted_final_answer:" field silently captured the following
            # line (e.g. the judge's reasoning) as the extracted answer.
            extracted = re.search(
                r"^extracted_final_answer:(.*)", raw, re.MULTILINE
            )
            if extracted:
                answer = extracted.group(1).strip()
                # A blank field means the judge extracted nothing; omit the key
                # rather than record an empty or bogus answer.
                if answer:
                    meta["extracted_answer"] = answer

            return is_correct, meta

        except Exception as exc:
            # Best-effort scoring: a judge failure must not abort the run, but
            # keep the traceback in the log so the failure can be diagnosed.
            LOGGER.exception("HLE scoring failed: %s", exc)
            return None, {"error": str(exc)}