@EvaluationRegistry.register("simpleqa")
class SimpleQAHandler(EvaluationHandler):
    """LLM-as-judge evaluation for SimpleQA (CORRECT / INCORRECT / NOT_ATTEMPTED)."""

    evaluation_method = "simpleqa"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Grade *model_answer* against *reference* with an LLM judge.

        Args:
            problem: The question that was posed to the model.
            reference: Ground-truth answer; grading is skipped when empty.
            model_answer: The model's response to be graded.
            metadata: Per-example metadata (accepted for interface parity;
                not consulted by this handler).

        Returns:
            ``(is_correct, meta)`` where ``is_correct`` is ``True`` only when
            the judge returns ``CORRECT``, ``False`` for other grades (and for
            empty responses, scored ``NOT_ATTEMPTED``), and ``None`` when
            grading could not be performed (no ground truth, or judge failure).
            ``meta`` carries the grade and raw judge output on success, or a
            ``reason``/``error`` entry otherwise.

        Raises:
            RuntimeError: If the configured client has no ``chat()`` helper.
        """
        # An empty/whitespace-only response is scored NOT_ATTEMPTED, not judged.
        if not model_answer or not model_answer.strip():
            return False, {"grade": "NOT_ATTEMPTED", "reason": "empty_response"}
        # Without a reference answer there is nothing to grade against.
        if not reference or not reference.strip():
            return None, {"reason": "no_ground_truth"}
        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "SimpleQAHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )
        prompt = _GRADER_TEMPLATE.format(
            question=problem,
            target=reference,
            predicted=model_answer,
        )
        try:
            # temperature=0.0 keeps the judge as deterministic as possible.
            raw = self._client.chat(
                system_prompt="",
                user_prompt=prompt,
                temperature=0.0,
                max_output_tokens=1024,
            )
            grade = _parse_grade(raw)
            is_correct = grade == "CORRECT"
            meta: Dict[str, object] = {
                "grade": grade,
                "raw_judge_output": raw,
            }
            # Surface the judge's own extracted answer when the output follows
            # the grader template ("extracted_final_answer: ..." line).
            extracted = re.search(
                r"^extracted_final_answer:\s*(.+)", raw, re.MULTILINE
            )
            if extracted:
                meta["extracted_answer"] = extracted.group(1).strip()
            return is_correct, meta
        except Exception as exc:
            # Judge call or parse failed: report it but don't crash the run.
            # logger.exception preserves the traceback, unlike .error.
            LOGGER.exception("SimpleQA scoring failed: %s", exc)
            return None, {"error": str(exc)}