Skip to content

terminalbench

ipw.evaluation.terminalbench

TerminalBenchHandler

Bases: EvaluationHandler

LLM-as-judge evaluation for TerminalBench tasks.

Source code in intelligence-per-watt/src/ipw/evaluation/terminalbench.py
@EvaluationRegistry.register("terminalbench")
class TerminalBenchHandler(EvaluationHandler):
    """LLM-as-judge evaluation for TerminalBench tasks.

    Grading proceeds in stages: empty answers fail immediately, a missing
    reference yields an undecided result, a whitespace-insensitive exact
    match passes without calling the judge, and everything else is sent to
    the judge model through ``self._client.chat``.
    """

    evaluation_method = "terminalbench"

    # Structured verdict line emitted by the judge ("correct: yes" / "correct: no").
    # Leading whitespace, list/quote/heading markers and markdown emphasis around
    # the label or the value are tolerated so that e.g. "**correct:** no" is read
    # as an explicit verdict instead of falling through to the keyword heuristic,
    # which would see "CORRECT" and wrongly grade the answer as correct.
    _VERDICT_RE = re.compile(
        r"^[ \t>*_`#-]*correct[ \t*_`]*:[\s*_`]*(yes|no)",
        re.MULTILINE | re.IGNORECASE,
    )

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Grade ``model_answer`` against ``reference`` for one task.

        Args:
            problem: Task text, inserted into the grader prompt as the question.
            reference: Expected output for the task.
            model_answer: Output produced by the model under evaluation.
            metadata: Accepted for interface compatibility; not read here.

        Returns:
            A ``(verdict, details)`` pair. ``verdict`` is ``True``/``False``
            when a decision was reached and ``None`` when the answer could
            not be graded (no reference, or the judge call failed).
            ``details`` carries a ``reason``, ``match_type`` or ``error`` key
            describing how the verdict was obtained.

        Raises:
            RuntimeError: If ``self._client`` has no ``chat`` attribute.
        """
        if not model_answer or not model_answer.strip():
            return False, {"reason": "empty_response"}

        if not reference or not reference.strip():
            return None, {"reason": "no_expected_output"}

        # Cheap path: an exact match (ignoring surrounding whitespace) needs no judge.
        if model_answer.strip() == reference.strip():
            return True, {"match_type": "exact"}

        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "TerminalBenchHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )

        prompt = _GRADER_TEMPLATE.format(
            question=problem,
            expected_output=reference,
            predicted_answer=model_answer,
        )

        # Best-effort boundary: any failure while calling or parsing the judge is
        # logged and reported as an undecided result rather than propagated.
        try:
            raw = self._client.chat(
                system_prompt="",
                user_prompt=prompt,
                temperature=0.0,
                max_output_tokens=1024,
            )

            structured_match = self._VERDICT_RE.search(raw)
            if structured_match:
                is_correct = structured_match.group(1).lower() == "yes"
            else:
                # No verdict line at all: fall back to a keyword heuristic.
                # "INCORRECT" contains "CORRECT", hence the explicit exclusion.
                upper_raw = raw.upper()
                is_correct = "CORRECT" in upper_raw and "INCORRECT" not in upper_raw

            meta: Dict[str, object] = {
                "match_type": "llm_judge",
                "raw_judge_output": raw,
            }
            return is_correct, meta

        except Exception as exc:
            LOGGER.error("TerminalBench scoring failed: %s", exc)
            return None, {"error": str(exc)}