Skip to content

frames

ipw.evaluation.frames

FRAMESHandler

Bases: EvaluationHandler

LLM-as-judge evaluation for FRAMES multi-hop factual retrieval.

Source code in intelligence-per-watt/src/ipw/evaluation/frames.py
@EvaluationRegistry.register("frames")
class FRAMESHandler(EvaluationHandler):
    """LLM-as-judge evaluation for FRAMES multi-hop factual retrieval.

    The judge is sent ``_GRADER_TEMPLATE`` filled in with the question, the
    reference answer and the candidate answer; its free-text reply is then
    parsed for a verdict.
    """

    evaluation_method = "frames"

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Grade ``model_answer`` against ``reference`` using the judge client.

        Args:
            problem: The question text, substituted into the grader prompt.
            reference: The ground-truth answer.
            model_answer: The answer to be graded.
            metadata: Not read by this handler.

        Returns:
            A ``(verdict, details)`` pair:

            * ``(False, {"reason": "empty_response"})`` when ``model_answer``
              is empty or whitespace only (the judge is not called).
            * ``(None, {"reason": "no_ground_truth"})`` when ``reference`` is
              empty or whitespace only (the judge is not called).
            * ``(None, {"error": ...})`` when the judge call or the parsing of
              its reply raises.
            * Otherwise ``(bool, details)`` where ``details`` holds
              ``"raw_judge_output"`` and, when the judge emitted an
              ``extracted_final_answer:`` line, ``"extracted_answer"``.
              A reply with no recognizable verdict is graded ``False``.

        Raises:
            RuntimeError: If ``self._client`` has no ``chat`` attribute.
        """
        if not model_answer or not model_answer.strip():
            return False, {"reason": "empty_response"}

        if not reference or not reference.strip():
            return None, {"reason": "no_ground_truth"}

        # NOTE(review): ``self._client`` is not assigned in this class;
        # presumably the base class provides it -- confirm.
        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "FRAMESHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )

        prompt = _GRADER_TEMPLATE.format(
            question=problem,
            ground_truth=reference,
            predicted_answer=model_answer,
        )

        # Deliberately broad boundary: a failing judge call (or a reply that
        # is not a string) must yield an "ungraded" result, not abort the
        # caller.
        try:
            raw = self._client.chat(
                system_prompt="",
                user_prompt=prompt,
                temperature=0.0,
                max_output_tokens=1024,
            )

            is_correct = self._parse_frames_verdict(raw)
            if is_correct is None:
                LOGGER.warning("Could not parse grade from response: %s", raw[:50])
                # Conservative default: an unreadable verdict counts as wrong.
                is_correct = False

            meta: Dict[str, object] = {
                "raw_judge_output": raw,
            }
            extracted = re.search(
                r"^extracted_final_answer:\s*(.+)", raw, re.MULTILINE
            )
            if extracted:
                meta["extracted_answer"] = extracted.group(1).strip()

            return is_correct, meta

        except Exception as exc:
            LOGGER.error("FRAMES scoring failed: %s", exc)
            return None, {"error": str(exc)}

    @staticmethod
    def _parse_frames_verdict(raw: str) -> Optional[bool]:
        """Extract the judge's verdict from ``raw``; ``None`` if none is found.

        A line starting with ``correct: yes`` / ``correct: no`` (any case)
        takes precedence. Otherwise the reply is scanned for the standalone
        words TRUE / FALSE (any case).
        """
        structured_match = re.search(
            r"^correct:\s*(yes|no)", raw, re.MULTILINE | re.IGNORECASE
        )
        if structured_match:
            return structured_match.group(1).lower() == "yes"

        # Word boundaries keep words such as "UNTRUE" or "CONSTRUE" from
        # being read as a TRUE verdict, which a plain substring test would do.
        verdicts = re.findall(r"\b(TRUE|FALSE)\b", raw.upper())
        if not verdicts:
            return None
        # When both words occur (e.g. reasoning followed by a decision), the
        # last one is taken as the judge's final decision.
        return verdicts[-1] == "TRUE"