Skip to content

Index

ipw.evaluation

EvaluationHandler

Bases: ABC

Base class for per-dataset evaluation strategies.

Source code in intelligence-per-watt/src/ipw/evaluation/base.py
class EvaluationHandler(ABC):
    """Base class for per-dataset evaluation strategies."""

    # Short identifier for the strategy (e.g. "gpqa", "hle"); concrete
    # subclasses set this as a class attribute.
    evaluation_method: str

    def __init__(self, client: InferenceClient) -> None:
        # Handlers require a client for LLM-based judging
        self._client = client

    @abstractmethod
    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """
        Evaluate a single model answer.

        Args:
            problem: The question/task text that was shown to the model.
            reference: The ground-truth answer to grade against.
            model_answer: The model's raw response text.
            metadata: Extra per-example data (e.g. answer options).

        Returns:
            (is_correct, metadata)
            - is_correct: True/False if a decision could be made, or None
              if the example is not scorable.
            - metadata: method-specific payload (e.g., extracted answers,
              judge explanation, or reasons for being unscorable).
        """

evaluate(*, problem, reference, model_answer, metadata) abstractmethod

Evaluate a single model answer.

Returns:

Type Description
Tuple[Optional[bool], Dict[str, object]]

(is_correct, metadata)
  • is_correct (Optional[bool]): True/False if a decision could be made, or None if the example is not scorable.
  • metadata (Dict[str, object]): method-specific payload (e.g., extracted answers, judge explanation, or reasons for being unscorable).
Source code in intelligence-per-watt/src/ipw/evaluation/base.py
@abstractmethod
def evaluate(
    self,
    *,
    problem: str,
    reference: str,
    model_answer: str,
    metadata: Dict[str, object],
) -> Tuple[Optional[bool], Dict[str, object]]:
    """
    Evaluate a single model answer.

    Args:
        problem: The question/task text that was shown to the model.
        reference: The ground-truth answer to grade against.
        model_answer: The model's raw response text.
        metadata: Extra per-example data (e.g. answer options).

    Returns:
        (is_correct, metadata)
        - is_correct: True/False if a decision could be made, or None
          if the example is not scorable.
        - metadata: method-specific payload (e.g., extracted answers,
          judge explanation, or reasons for being unscorable).
    """

FRAMESHandler

Bases: EvaluationHandler

LLM-as-judge evaluation for FRAMES multi-hop factual retrieval.

Source code in intelligence-per-watt/src/ipw/evaluation/frames.py
@EvaluationRegistry.register("frames")
class FRAMESHandler(EvaluationHandler):
    """LLM-as-judge evaluation for FRAMES multi-hop factual retrieval."""

    evaluation_method = "frames"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Grade a FRAMES answer by prompting the judge LLM."""
        # An empty prediction counts as wrong; a missing reference is unscorable.
        if not (model_answer and model_answer.strip()):
            return False, {"reason": "empty_response"}
        if not (reference and reference.strip()):
            return None, {"reason": "no_ground_truth"}

        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "FRAMESHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )

        grader_prompt = _GRADER_TEMPLATE.format(
            question=problem,
            ground_truth=reference,
            predicted_answer=model_answer,
        )

        try:
            judge_output = self._client.chat(
                system_prompt="",
                user_prompt=grader_prompt,
                temperature=0.0,
                max_output_tokens=1024,
            )

            # Prefer the structured "correct: yes/no" line; otherwise fall back
            # to scanning the whole reply for TRUE/FALSE tokens.
            verdict = re.search(
                r"^correct:\s*(yes|no)", judge_output, re.MULTILINE | re.IGNORECASE
            )
            if verdict is not None:
                is_correct = verdict.group(1).lower() == "yes"
            else:
                upper = judge_output.upper().strip()
                if "TRUE" in upper:
                    is_correct = True
                elif "FALSE" in upper:
                    is_correct = False
                else:
                    LOGGER.warning("Could not parse grade from response: %s", judge_output[:50])
                    is_correct = False

            details: Dict[str, object] = {
                "raw_judge_output": judge_output,
            }
            answer_line = re.search(
                r"^extracted_final_answer:\s*(.+)", judge_output, re.MULTILINE
            )
            if answer_line is not None:
                details["extracted_answer"] = answer_line.group(1).strip()

            return is_correct, details

        except Exception as exc:
            LOGGER.error("FRAMES scoring failed: %s", exc)
            return None, {"error": str(exc)}

GAIAHandler

Bases: EvaluationHandler

GAIA evaluation: exact match with normalization + LLM fallback.

Source code in intelligence-per-watt/src/ipw/evaluation/gaia.py
@EvaluationRegistry.register("gaia")
class GAIAHandler(EvaluationHandler):
    """GAIA evaluation: exact match with normalization + LLM fallback."""

    evaluation_method = "gaia"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Score a GAIA answer: cheap exact match first, LLM judge second."""
        if not (model_answer and model_answer.strip()):
            return False, {"reason": "empty_response"}
        if not (reference and reference.strip()):
            return None, {"reason": "no_ground_truth"}

        # Fast path: normalized exact match needs no API call.
        if _exact_match(model_answer, reference):
            return True, {"match_type": "exact"}

        # Without a chat-capable client there is no semantic fallback.
        if not hasattr(self._client, "chat"):
            return False, {
                "match_type": "exact_failed",
                "reason": "no_llm_client_for_fallback",
            }

        try:
            judge_prompt = _LLM_FALLBACK_PROMPT.format(
                question=problem or "(No question provided)",
                response=model_answer,
                ground_truth=reference,
            )
            judge_output = self._client.chat(
                system_prompt="",
                user_prompt=judge_prompt,
                temperature=0.0,
                max_output_tokens=1024,
            )

            # A structured "correct: yes/no" line wins; otherwise accept a bare
            # CORRECT verdict only when INCORRECT is absent.
            verdict = re.search(
                r"^correct:\s*(yes|no)", judge_output, re.MULTILINE | re.IGNORECASE
            )
            if verdict is not None:
                is_correct = verdict.group(1).lower() == "yes"
            else:
                upper = judge_output.upper()
                is_correct = "CORRECT" in upper and "INCORRECT" not in upper

            details: Dict[str, object] = {
                "match_type": "llm_fallback",
                "raw_judge_output": judge_output,
            }
            answer_line = re.search(
                r"^extracted_final_answer:\s*(.+)", judge_output, re.MULTILINE
            )
            if answer_line is not None:
                details["extracted_answer"] = answer_line.group(1).strip()

            return is_correct, details

        except Exception as exc:
            LOGGER.error("GAIA LLM fallback failed: %s", exc)
            return False, {
                "match_type": "llm_fallback_error",
                "error": str(exc),
            }

GPQAHandler

Bases: BaseMCQHandler

Evaluation for GPQA tasks.

Source code in intelligence-per-watt/src/ipw/evaluation/gpqa.py
@EvaluationRegistry.register("gpqa")
class GPQAHandler(BaseMCQHandler):
    """Evaluation for GPQA tasks."""

    evaluation_method = "gpqa"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Delegate to the shared MCQ grader, using the reference letter as-is."""
        # The base GPQA dataset normalizes every correct answer to letter "A",
        # so the IPW "answer" field already holds the gold letter.
        return self._evaluate_mcq(
            metadata=metadata,
            model_answer=model_answer,
            problem=problem,
            reference_letter=reference,
        )

SuperGPQAHandler

Bases: BaseMCQHandler

Evaluation for SuperGPQA tasks.

Source code in intelligence-per-watt/src/ipw/evaluation/gpqa.py
@EvaluationRegistry.register("supergpqa")
class SuperGPQAHandler(BaseMCQHandler):
    """Evaluation for SuperGPQA tasks."""

    evaluation_method = "supergpqa"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Delegate to the shared MCQ grader, using the reference letter as-is."""
        # The original SuperGPQA dataset stores the correct letter as
        # answer_letter; in the mixed IPW dataset, reference is that letter.
        return self._evaluate_mcq(
            metadata=metadata,
            model_answer=model_answer,
            problem=problem,
            reference_letter=reference,
        )

HLEHandler

Bases: EvaluationHandler

LLM-as-judge evaluation for Humanity's Last Exam (CORRECT / INCORRECT / NOT_ATTEMPTED).

Source code in intelligence-per-watt/src/ipw/evaluation/hle.py
@EvaluationRegistry.register("hle")
class HLEHandler(EvaluationHandler):
    """LLM-as-judge evaluation for Humanity's Last Exam (CORRECT / INCORRECT / NOT_ATTEMPTED)."""

    evaluation_method = "hle"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Grade an HLE answer via the judge LLM's three-way verdict."""
        if not (model_answer and model_answer.strip()):
            return False, {"grade": "NOT_ATTEMPTED", "reason": "empty_response"}
        if not (reference and reference.strip()):
            return None, {"reason": "no_ground_truth"}

        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "HLEHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )

        grader_prompt = _GRADER_TEMPLATE.format(
            question=problem,
            target=reference,
            predicted=model_answer,
        )

        try:
            judge_output = self._client.chat(
                system_prompt="",
                user_prompt=grader_prompt,
                temperature=0.0,
                max_output_tokens=1024,
            )

            # Only an explicit CORRECT verdict counts as a pass.
            grade = _parse_grade(judge_output)

            details: Dict[str, object] = {
                "grade": grade,
                "raw_judge_output": judge_output,
            }
            answer_line = re.search(
                r"^extracted_final_answer:\s*(.+)", judge_output, re.MULTILINE
            )
            if answer_line is not None:
                details["extracted_answer"] = answer_line.group(1).strip()

            return grade == "CORRECT", details

        except Exception as exc:
            LOGGER.error("HLE scoring failed: %s", exc)
            return None, {"error": str(exc)}

Math500Handler

Bases: EvaluationHandler

Evaluation for MATH-500-style problems.

Two-stage approach:

1. First, uses LLM to verify mathematical equivalence (with retry logic)
2. If LLM verification fails, falls back to extraction + normalization

Source code in intelligence-per-watt/src/ipw/evaluation/math500.py
@EvaluationRegistry.register("math-500")
class Math500Handler(EvaluationHandler):
    """
    Evaluation for MATH-500-style problems.

    Two-stage approach:
    1. First, uses LLM to verify mathematical equivalence (with retry logic)
    2. If LLM verification fails, falls back to extraction + normalization
    """
    evaluation_method = "math-500"

    EXTRACT_PROMPT = r"""Extract the mathematical answer from the solution. The answer will typically be inside a \boxed{} command.
If there are multiple boxed expressions, extract the final one. Return only the mathematical expression without any surrounding text.

Example 1:
Input: Therefore, $x = \boxed{5}$ is the solution.
Output: 5

Example 2:
Input: The final answer is $\boxed{\frac{\sqrt{3}}{2}}$.
Output: \frac{\sqrt{3}}{2}

Example 3:
Input: We get $\boxed{x = 2}$ and $\boxed{y = 3}$, so $\boxed{x + y = 5}$.
Output: 5

Rules:
- Return ONLY the final numeric/algebraic expression
- Do NOT include headings or words (e.g., 'Step', 'Solution')
- Do NOT include LaTeX delimiters like $ ... $, \( ... \), or \[ ... \]
- If no clear final answer is present, return NONE"""

    VERIFY_PROMPT = r"""Compare these two mathematical solutions and determine if they are equivalent. Focus on:
1. The final numerical or mathematical answer (typically in a \boxed{} command)
2. Mathematical equivalence (e.g., 1/2 = 0.5 = \frac{1}{2})
3. Different but valid solution methods that arrive at the same result

Output format (strict):
Return ONLY a compact JSON object with these exact keys and types, no extra text:
{"explanation": string, "is_correct": boolean}
- explanation: short description on why they match or not.
- is_correct: true if equivalent, false otherwise."""

    def _extract_answer_with_llm(self, text: str) -> Optional[str]:
        """Extract answer using LLM.

        Returns:
            The normalized extracted expression, or None when the LLM
            reports no answer ("NONE") or extraction fails.
        """
        # Assumes self._client is an OpenAIClient or compatible with a .chat() helper
        if not hasattr(self._client, "chat"):
            raise RuntimeError("Math500Handler requires a client with a .chat() helper (e.g. OpenAIClient)")

        try:
            raw_response = self._client.chat(
                system_prompt=self.EXTRACT_PROMPT,
                user_prompt=text,
                temperature=0.0,
                max_output_tokens=100,
            )

            extracted = raw_response.strip()
            normalized = normalize_math_answer(extracted)

            # The extraction prompt returns the literal "NONE" when no clear
            # final answer exists; treat that as an extraction failure.
            if normalized and normalized.upper() != "NONE":
                return normalized
            return None

        except Exception as exc:
            # Lazy %-style logging args, consistent with the other handlers.
            LOGGER.error("Error extracting math answer: %s", exc)
            return None

    def _verify_with_llm(self, model_answer: str, reference: str) -> Tuple[Optional[bool], Optional[str]]:
        """Verify mathematical equivalence using LLM judge.

        Returns:
            (verdict, explanation); (None, None) means no usable verdict
            (malformed JSON, missing/non-boolean "is_correct", or API error).
        """
        if not hasattr(self._client, "chat"):
            raise RuntimeError("Math500Handler requires a client with a .chat() helper (e.g. OpenAIClient)")

        try:
            user_prompt = f"Solution 1:\n{model_answer}\n\nSolution 2:\n{reference}\n\nRespond with JSON only."

            raw_response = self._client.chat(
                system_prompt=self.VERIFY_PROMPT,
                user_prompt=user_prompt,
                temperature=0.0,
                max_output_tokens=500,
            )

            content = raw_response.strip()

            # Extract first JSON object and parse
            m = re.search(r"\{[\s\S]*\}", content)
            if not m:
                return None, None

            try:
                data = json.loads(m.group(0))
            except Exception:
                return None, None

            is_correct = data.get("is_correct")
            explanation = data.get("explanation", "")

            # Only accept a strict boolean verdict; anything else is unusable.
            if isinstance(is_correct, bool):
                return is_correct, str(explanation).replace('\n', ' ')

            return None, None

        except Exception as exc:
            LOGGER.error("Error verifying math answer: %s", exc)
            return None, None

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """
        Two-stage evaluation:
        1. Try LLM-based verification (with retries)
        2. Fall back to extraction + normalization if LLM fails
        """

        # Stage 1: Try LLM verification with retries. The judge may emit
        # unparsable output; a None verdict triggers a retry.
        NUM_RETRIES = 3
        judge_result = None
        judge_explanation = None

        for _ in range(NUM_RETRIES):
            judge_result, judge_explanation = self._verify_with_llm(model_answer, reference)
            if judge_result is not None:
                break

        if judge_result is not None:
            return judge_result, {
                "method": "llm_verify",
                "explanation": judge_explanation,
            }

        # Stage 2: Fallback to extraction + normalization
        LOGGER.warning("MATH | LLM Judge failed, falling back to extraction")

        extracted_model = self._extract_answer_with_llm(model_answer)
        extracted_ref = self._extract_answer_with_llm(reference)

        # Also try direct normalization as additional fallback
        if not extracted_model:
            extracted_model = normalize_math_answer(model_answer)
        if not extracted_ref:
            extracted_ref = normalize_math_answer(reference)

        # If either side yields nothing, the example cannot be scored.
        if not extracted_model or not extracted_ref:
            return None, {
                "reason": "unscorable_math_answer",
                "extracted_model": extracted_model,
                "extracted_ref": extracted_ref,
            }

        is_correct = extracted_model == extracted_ref
        return is_correct, {
            "method": "extraction_fallback",
            "extracted_model": extracted_model,
            "extracted_ref": extracted_ref,
        }

evaluate(*, problem, reference, model_answer, metadata)

Two-stage evaluation:

1. Try LLM-based verification (with retries)
2. Fall back to extraction + normalization if LLM fails

Source code in intelligence-per-watt/src/ipw/evaluation/math500.py
def evaluate(
    self,
    *,
    problem: str,
    reference: str,
    model_answer: str,
    metadata: Dict[str, object],
) -> Tuple[Optional[bool], Dict[str, object]]:
    """
    Two-stage evaluation:
    1. Try LLM-based verification (with retries)
    2. Fall back to extraction + normalization if LLM fails
    """

    # Stage 1: Try LLM verification with retries
    NUM_RETRIES = 3
    judge_result = None
    judge_explanation = None

    # A None verdict means the judge output was unusable; retry up to
    # NUM_RETRIES times before giving up on the LLM stage.
    for attempt in range(NUM_RETRIES):
        judge_result, judge_explanation = self._verify_with_llm(model_answer, reference)
        if judge_result is not None:
            break

    if judge_result is not None:
        return judge_result, {
            "method": "llm_verify",
            "explanation": judge_explanation,
        }

    # Stage 2: Fallback to extraction + normalization
    LOGGER.warning("MATH | LLM Judge failed, falling back to extraction")

    extracted_model = self._extract_answer_with_llm(model_answer)
    extracted_ref = self._extract_answer_with_llm(reference)

    # Also try direct normalization as additional fallback
    if not extracted_model:
        extracted_model = normalize_math_answer(model_answer)
    if not extracted_ref:
        extracted_ref = normalize_math_answer(reference)

    # If either side yields nothing, the example cannot be scored.
    if not extracted_model or not extracted_ref:
        return None, {
            "reason": "unscorable_math_answer",
            "extracted_model": extracted_model,
            "extracted_ref": extracted_ref,
        }

    is_correct = extracted_model == extracted_ref
    return is_correct, {
        "method": "extraction_fallback",
        "extracted_model": extracted_model,
        "extracted_ref": extracted_ref,
    }

BaseMCQHandler

Bases: EvaluationHandler

Shared logic for multiple-choice (letter-based) evaluations.

Source code in intelligence-per-watt/src/ipw/evaluation/mcq.py
class BaseMCQHandler(EvaluationHandler):
    """Shared logic for multiple-choice (letter-based) evaluations."""

    def _valid_letters_from_options(self, metadata: Dict[str, object]) -> Optional[str]:
        """Derive the valid letter string ("ABC...") from metadata["options"].

        Returns None when options are absent or not a non-empty list.
        """
        options = metadata.get("options")
        if isinstance(options, list) and options:
            n = len(options)
            return "".join(chr(ord("A") + i) for i in range(n))
        return None

    def _extract_answer_with_llm(
        self,
        problem: str,
        model_answer: str,
        valid_letters: str,
    ) -> Optional[str]:
        """
        Use the evaluation LLM to extract the answer letter from the model response.

        Returns:
            The extracted letter if it is in ``valid_letters``, else None.
        """
        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "MCQ handler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )

        # Last letter of the valid set, used only to phrase the prompt range.
        last_letter = chr(ord('A') + len(valid_letters) - 1) if valid_letters else 'D'

        system_prompt = (
            f"You are an answer extraction assistant. Extract the final multiple choice answer "
            f"from the response. Return ONLY a single letter (A-{last_letter}). "
            f"If no valid answer letter is found, return 'NONE'."
        )

        user_prompt = f"Problem: {problem}\nResponse: {model_answer}\n\nExtract the final answer letter:"

        try:
            raw_response = self._client.chat(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                temperature=0.0,
                max_output_tokens=5,
            )

            extracted = raw_response.strip().upper()

            # Try to extract letter from response (handles "The answer is: A" etc.)
            answer_match = re.search(r'(?:THE ANSWER IS:?\s*)?([A-Z])', extracted, re.IGNORECASE)
            if answer_match:
                extracted = answer_match.group(1).upper()

            # Validate it's in the valid set
            if extracted in valid_letters:
                return extracted

            return None

        except Exception as exc:
            # Lazy %-style logging args, consistent with the other handlers.
            LOGGER.error("Error in LLM-based answer extraction: %s", exc)
            return None

    def _evaluate_mcq(
        self,
        *,
        problem: str,
        reference_letter: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Compare the LLM-extracted choice letter against the reference letter.

        Returns (is_correct, meta); is_correct is None when no reference letter
        is available or no choice letter could be extracted.
        """
        ref = (reference_letter or "").strip().upper()
        if not ref:
            return None, {"reason": "missing_reference_letter"}

        valid_letters = self._valid_letters_from_options(metadata)
        if valid_letters is None:
            # Fallback if options not available
            valid_letters = "ABCD"

        candidate = self._extract_answer_with_llm(problem, model_answer, valid_letters)
        if not candidate:
            return None, {"reason": "no_choice_letter_extracted"}

        is_correct = candidate == ref
        meta = {
            "reference_letter": ref,
            "candidate_letter": candidate,
            "valid_letters": valid_letters,
        }
        return is_correct, meta

MMLUProHandler

Bases: BaseMCQHandler

Evaluation for MMLU-Pro multiple-choice tasks.

Source code in intelligence-per-watt/src/ipw/evaluation/mmlu_pro.py
@EvaluationRegistry.register("mmlu-pro")
class MMLUProHandler(BaseMCQHandler):
    """Evaluation for MMLU-Pro multiple-choice tasks."""

    evaluation_method = "mmlu-pro"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Delegate to the shared MCQ grader; MMLU-Pro references are letters."""
        # The dataset stores the gold answer directly as a choice letter
        # (A/B/C/...), so it can be forwarded without translation.
        return self._evaluate_mcq(
            metadata=metadata,
            model_answer=model_answer,
            problem=problem,
            reference_letter=reference,
        )

NaturalReasoningHandler

Bases: EvaluationHandler

LLM-based evaluation comparing a response to a ground-truth answer.

Source code in intelligence-per-watt/src/ipw/evaluation/natural_reasoning.py
@EvaluationRegistry.register("natural_reasoning")
@EvaluationRegistry.register("reasoning")  # Alias used in IPW dataset
class NaturalReasoningHandler(EvaluationHandler):
    """LLM-based evaluation comparing a response to a ground-truth answer."""

    evaluation_method = "natural_reasoning"

    GROUND_TRUTH_PROMPT = """
        You are evaluating the technical accuracy of a model's response by comparing it against a reference answer.

        TASK: Determine if the model's response is CORRECT or INCORRECT based on whether it matches the reference answer.

        EVALUATION CRITERIA:
        1. The model's answer must arrive at the same conclusion as the reference answer
        2. Mathematical results must match those in the reference (allowing for equivalent forms)
        3. For factual questions, the core facts must align with the reference
        4. For reasoning problems, the final answer must match even if the approach differs

        IMPORTANT RULES:
        - The reference answer is your ground truth - treat it as the correct answer
        - Different wording or approach is acceptable IF the conclusion matches the reference
        - This is a binary decision: TRUE if correct, FALSE if incorrect
        - No partial credit - the answer either matches the reference or it doesn't
        - Ignore differences in formatting, verbosity, or explanation style
        - Focus ONLY on whether the core answer aligns with the reference

        Question: {question}

        Reference Answer: {reference_answer}

        Model Response: {model_response}

        EVALUATION STEPS:
        1. Identify the core answer in the reference response
        2. Identify the core answer in the model's response  
        3. Compare: Do they reach the same conclusion?
        4. If numerical: Are the values equivalent?
        5. If factual: Do the key facts match?
        6. If reasoning-based: Is the final answer the same?

        VERDICT: Return TRUE if the model's answer matches the reference answer, FALSE otherwise.

        Final Verdict (respond with only TRUE or FALSE):
        """

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Judge the model answer against the reference via a TRUE/FALSE LLM verdict.

        Returns:
            (is_correct, meta); is_correct is None when the judge output
            cannot be parsed as exactly TRUE or FALSE.
        """
        # Guard against reference being None as well as empty/whitespace:
        # calling .strip() directly on None would raise AttributeError. This
        # mirrors the "not reference or not reference.strip()" check used by
        # the other handlers in this package.
        if not reference or not reference.strip():
            raise RuntimeError("NaturalReasoningHandler requires a non-empty reference answer.")

        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "NaturalReasoningHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )

        prompt = self.GROUND_TRUTH_PROMPT.format(
            question=problem,
            model_response=model_answer,
            reference_answer=reference,
        )

        raw = self._client.chat(
            system_prompt="",
            user_prompt=prompt,
            temperature=0.0,
            max_output_tokens=32,
        )

        # Match 'true' / 'false' as the entire response, case-insensitive, allowing for whitespace.
        m = re.search(r"^\s*(true|false)\s*$", raw, flags=re.IGNORECASE)
        meta = {
            "evaluation_mode": "ground_truth",
            "raw_judge_output": raw,
        }
        if not m:
            meta["unscorable"] = True
            return None, meta

        norm = m.group(1).lower()
        meta["parsed_label"] = norm
        result = norm == "true"
        return result, meta

SimpleQAHandler

Bases: EvaluationHandler

LLM-as-judge evaluation for SimpleQA (CORRECT / INCORRECT / NOT_ATTEMPTED).

Source code in intelligence-per-watt/src/ipw/evaluation/simpleqa.py
@EvaluationRegistry.register("simpleqa")
class SimpleQAHandler(EvaluationHandler):
    """LLM-as-judge evaluation for SimpleQA (CORRECT / INCORRECT / NOT_ATTEMPTED)."""

    evaluation_method = "simpleqa"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Grade a SimpleQA answer via the judge LLM's three-way verdict."""
        if not (model_answer and model_answer.strip()):
            return False, {"grade": "NOT_ATTEMPTED", "reason": "empty_response"}
        if not (reference and reference.strip()):
            return None, {"reason": "no_ground_truth"}

        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "SimpleQAHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )

        grader_prompt = _GRADER_TEMPLATE.format(
            question=problem,
            target=reference,
            predicted=model_answer,
        )

        try:
            judge_output = self._client.chat(
                system_prompt="",
                user_prompt=grader_prompt,
                temperature=0.0,
                max_output_tokens=1024,
            )

            # Only an explicit CORRECT verdict counts as a pass.
            grade = _parse_grade(judge_output)

            details: Dict[str, object] = {
                "grade": grade,
                "raw_judge_output": judge_output,
            }
            answer_line = re.search(
                r"^extracted_final_answer:\s*(.+)", judge_output, re.MULTILINE
            )
            if answer_line is not None:
                details["extracted_answer"] = answer_line.group(1).strip()

            return grade == "CORRECT", details

        except Exception as exc:
            LOGGER.error("SimpleQA scoring failed: %s", exc)
            return None, {"error": str(exc)}

TerminalBenchHandler

Bases: EvaluationHandler

LLM-as-judge evaluation for TerminalBench tasks.

Source code in intelligence-per-watt/src/ipw/evaluation/terminalbench.py
@EvaluationRegistry.register("terminalbench")
class TerminalBenchHandler(EvaluationHandler):
    """LLM-as-judge evaluation for TerminalBench tasks."""

    evaluation_method = "terminalbench"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Score terminal output: whitespace-stripped exact match, then LLM judge."""
        if not (model_answer and model_answer.strip()):
            return False, {"reason": "empty_response"}
        if not (reference and reference.strip()):
            return None, {"reason": "no_expected_output"}

        # Cheap path: the stripped outputs agree verbatim.
        if model_answer.strip() == reference.strip():
            return True, {"match_type": "exact"}

        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "TerminalBenchHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )

        grader_prompt = _GRADER_TEMPLATE.format(
            question=problem,
            expected_output=reference,
            predicted_answer=model_answer,
        )

        try:
            judge_output = self._client.chat(
                system_prompt="",
                user_prompt=grader_prompt,
                temperature=0.0,
                max_output_tokens=1024,
            )

            # A structured "correct: yes/no" line wins; otherwise accept a bare
            # CORRECT verdict only when INCORRECT is absent.
            verdict = re.search(
                r"^correct:\s*(yes|no)", judge_output, re.MULTILINE | re.IGNORECASE
            )
            if verdict is not None:
                is_correct = verdict.group(1).lower() == "yes"
            else:
                upper = judge_output.upper()
                is_correct = "CORRECT" in upper and "INCORRECT" not in upper

            return is_correct, {
                "match_type": "llm_judge",
                "raw_judge_output": judge_output,
            }

        except Exception as exc:
            LOGGER.error("TerminalBench scoring failed: %s", exc)
            return None, {"error": str(exc)}

WildChatHandler

Bases: EvaluationHandler

LLM-based semantic equivalence check for free-form chat responses.

Source code in intelligence-per-watt/src/ipw/evaluation/wildchat.py
@EvaluationRegistry.register("wildchat")
@EvaluationRegistry.register("chat")  # Alias used in IPW dataset
class WildChatHandler(EvaluationHandler):
    """LLM-based semantic equivalence check for free-form chat responses."""

    evaluation_method = "wildchat"

    SYSTEM_PROMPT = """You are an impartial judge evaluating the quality of two AI-assistant replies to the same user prompt.

Step 1 – Generate your own answer
Write the response *you* would give to the user. Keep it separate from later analysis.

Step 2 – Decide the query type
Classify the user prompt as either
• **Subjective / open-ended** (creative writing, opinion, advice, brainstorming)
• **Objective / technical** (code, math, logical derivations with a single correct outcome)
If uncertain, default to "Subjective".

Step 3 – Score each assistant with the correct rubric

| Query type | Criteria |
|------------|----------|
| Subjective / open-ended | 1. Correctness / factual soundness 2. Helpfulness 3. Relevance 4. Conciseness 5. Creativity & novelty |
| Objective / technical   | 1. Correctness only |

When using the multi-criteria rubric, note strengths and weaknesses for **each** dimension.
When using the single-criterion rubric, focus exclusively on factual / functional accuracy and ignore style or flair.

Step 4 – Compare & justify
Explain which assistant is better and why, correcting any mistakes you find. Highlight missing but important details. **Be concise.**

Step 5 – Verdict
1. Assistant A is significantly better: [[A>>B]]
2. Assistant A is slightly better: [[A>B]]
3. Tie, Assistant A is equal: [[A=B]]
4. Assistant B is slightly better: [[B>A]]
5. Assistant B is significantly better: [[B>>A]]

Choose exactly one token from: `[[A>>B]]`, `[[A>B]]`, `[[A=B]]`, `[[B>A]]`, `[[B>>A]]`.

---

### Output format (strict)
Return **only** a JSON object that matches the provided schema:

<Your Response To The User Prompt>

```json
{
"query_type": "<query type>",
"explanation": "<multi-criteria explanation> | <single-criteria explanation> (if query_type is \"Objective / technical\")",
"verdict": "<one verdict token from: [[A>>B]], [[A>B]], [[A=B]], [[B>A]], [[B>>A]]>"
}
```"""

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Judge ``model_answer`` against ``reference`` via pairwise LLM comparison.

        The judge is queried twice — once with the generated answer in the
        "Assistant A" slot and once in the "Assistant B" slot — to cancel out
        position bias. The answer counts as correct if it wins or ties in at
        least one ordering.

        Args:
            problem: The original user prompt shown to both assistants.
            reference: The reference (gold) answer; must be non-empty.
            model_answer: The candidate answer produced by the evaluated model.
            metadata: Per-example metadata (unused by this handler, kept for
                the ``EvaluationHandler`` interface).

        Returns:
            ``(is_correct, meta)`` — ``is_correct`` is ``None`` when a verdict
            token could not be extracted or mapped; ``meta`` carries both raw
            judge transcripts (or a ``reason`` when no verdict was found).

        Raises:
            RuntimeError: If ``reference`` is empty/whitespace-only.
        """
        if not reference.strip():
            raise RuntimeError("WildChatHandler requires a non-empty reference answer.")

        # Perform two comparisons: (model_answer vs reference) and (reference vs model_answer).
        # This accounts for possible asymmetry in the evaluation. Self-comparison is not a concern,
        # as this function is always called with distinct candidate and reference answers.
        verdict1, response1 = self._get_judge_verdict(problem, model_answer, reference)
        verdict2, response2 = self._get_judge_verdict(problem, reference, model_answer)

        if verdict1 is None or verdict2 is None:
            # At least one judge call produced no parsable [[...]] verdict token.
            return None, {
                "reason": "missing_verdicts",
                "verdict1": verdict1,
                "verdict2": verdict2,
            }

        result1 = self._verdict_to_bool(verdict1, generated_is_a=True)
        result2 = self._verdict_to_bool(verdict2, generated_is_a=False)

        # Both outcomes return the same transcript payload; build it once.
        meta: Dict[str, object] = {
            "generated_as_a": {"verdict": verdict1, "response": response1},
            "generated_as_b": {"verdict": verdict2, "response": response2},
        }
        if result1 is None or result2 is None:
            # The regex matched a token (e.g. "A>=B") that is not present in
            # the verdict maps; treat the example as unscorable.
            return None, meta
        # Correct if the generated answer wins or ties in either ordering.
        return result1 or result2, meta

    def _get_judge_verdict(
        self, problem: str, response_a: str, response_b: str
    ) -> Tuple[Optional[str], Optional[str]]:
        """Run one judge call and return ``(verdict_token, raw_output)``.

        The verdict token (e.g. ``"A>B"``) is extracted from the first
        ``[[...]]`` marker in the judge's reply; ``None`` if absent. The raw
        (stripped) judge output is always returned for the metadata payload.

        Raises:
            RuntimeError: If the configured client lacks a ``.chat()`` helper.
        """
        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "WildChatHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )

        prompt = (
            f"<|User Prompt|>\n{problem}\n\n"
            f"<|The Start of Assistant A's Answer|>\n{response_a}\n"
            f"<|The End of Assistant A's Answer|>\n\n"
            f"<|The Start of Assistant B's Answer|>\n{response_b}\n"
            f"<|The End of Assistant B's Answer|>"
        )

        raw = self._client.chat(
            system_prompt=self.SYSTEM_PROMPT,
            user_prompt=prompt,
            temperature=0.0,  # deterministic judging
            max_output_tokens=1024,
        )
        content = raw.strip()

        verdict_match = re.search(r"\[\[([AB][><=]{1,2}[AB])\]\]", content)
        if verdict_match:
            return verdict_match.group(1), content

        return None, content

    def _verdict_to_bool(self, verdict: Optional[str], generated_is_a: bool) -> Optional[bool]:
        """Map a verdict token to correctness of the *generated* answer.

        A tie (``A=B``) counts as correct in either position. Returns ``None``
        for an unknown token or missing verdict.
        """
        if not verdict:
            return None

        # Generated answer occupied the "Assistant A" slot.
        verdict_map_a = {
            "A>>B": True,
            "A>B": True,
            "A=B": True,
            "B>A": False,
            "B>>A": False,
        }

        # Generated answer occupied the "Assistant B" slot.
        verdict_map_b = {
            "A>>B": False,
            "A>B": False,
            "A=B": True,
            "B>A": True,
            "B>>A": True,
        }

        verdict_map = verdict_map_a if generated_is_a else verdict_map_b
        return verdict_map.get(verdict)