@EvaluationRegistry.register("frames")
class FRAMESHandler(EvaluationHandler):
    """LLM-as-judge evaluation for FRAMES multi-hop factual retrieval.

    Formats the question, ground truth, and model answer into
    ``_GRADER_TEMPLATE``, sends it to the judge via ``self._client.chat``,
    and parses a yes/no verdict from the reply.
    """

    evaluation_method = "frames"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Grade ``model_answer`` against ``reference`` with an LLM judge.

        Args:
            problem: The question posed to the model.
            reference: Ground-truth answer; grading is skipped if empty.
            model_answer: The model's response to grade.
            metadata: Extra per-sample data (currently unused here).

        Returns:
            ``(is_correct, meta)`` — ``True``/``False`` when a verdict was
            reached, or ``None`` when grading was impossible (no ground
            truth, or the judge call raised).

        Raises:
            RuntimeError: If the configured client has no ``.chat()`` helper.
        """
        if not model_answer or not model_answer.strip():
            return False, {"reason": "empty_response"}
        if not reference or not reference.strip():
            return None, {"reason": "no_ground_truth"}
        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "FRAMESHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )
        prompt = _GRADER_TEMPLATE.format(
            question=problem,
            ground_truth=reference,
            predicted_answer=model_answer,
        )
        # Keep the try narrow: only the remote judge call is expected to
        # fail. Bugs in the local parsing code should surface as real
        # exceptions rather than being misreported as scoring errors.
        try:
            raw = self._client.chat(
                system_prompt="",
                user_prompt=prompt,
                temperature=0.0,
                max_output_tokens=1024,
            )
        except Exception as exc:
            # .exception() records the full traceback, unlike .error().
            LOGGER.exception("FRAMES scoring failed: %s", exc)
            return None, {"error": str(exc)}
        return self._parse_judge_output(raw)

    @staticmethod
    def _parse_judge_output(raw: str) -> Tuple[bool, Dict[str, object]]:
        """Extract the verdict (and any extracted answer) from the judge reply.

        Prefers the structured ``correct: yes|no`` line; falls back to a
        standalone TRUE/FALSE token; defaults to ``False`` if unparseable.
        """
        structured_match = re.search(
            r"^correct:\s*(yes|no)", raw, re.MULTILINE | re.IGNORECASE
        )
        if structured_match:
            is_correct = structured_match.group(1).lower() == "yes"
        else:
            # Word boundaries prevent false positives on e.g. "UNTRUE";
            # TRUE is checked first, matching the original precedence.
            response_upper = raw.upper()
            if re.search(r"\bTRUE\b", response_upper):
                is_correct = True
            elif re.search(r"\bFALSE\b", response_upper):
                is_correct = False
            else:
                LOGGER.warning("Could not parse grade from response: %s", raw[:50])
                is_correct = False
        meta: Dict[str, object] = {
            "raw_judge_output": raw,
        }
        extracted = re.search(
            r"^extracted_final_answer:\s*(.+)", raw, re.MULTILINE
        )
        if extracted:
            meta["extracted_answer"] = extracted.group(1).strip()
        return is_correct, meta