Skip to content

wildchat

ipw.evaluation.wildchat

WildChatHandler

Bases: EvaluationHandler

LLM-based semantic equivalence check for free-form chat responses.

Source code in intelligence-per-watt/src/ipw/evaluation/wildchat.py
@EvaluationRegistry.register("wildchat")
@EvaluationRegistry.register("chat")  # Alias used in IPW dataset
class WildChatHandler(EvaluationHandler):
    """LLM-judged pairwise comparison of a free-form chat response against a reference.

    The judge model is shown the user prompt plus two answers (labelled A and B)
    and must emit one verdict token such as ``[[A>B]]``. Each evaluation queries
    the judge twice, swapping which slot holds the generated answer, and the
    generated answer passes if either ordering rates it at least as good as the
    reference (a tie counts as a pass).
    """

    evaluation_method = "wildchat"

    SYSTEM_PROMPT = """You are an impartial judge evaluating the quality of two AI-assistant replies to the same user prompt.

Step 1 – Generate your own answer
Write the response *you* would give to the user. Keep it separate from later analysis.

Step 2 – Decide the query type
Classify the user prompt as either
• **Subjective / open-ended** (creative writing, opinion, advice, brainstorming)
• **Objective / technical** (code, math, logical derivations with a single correct outcome)
If uncertain, default to "Subjective".

Step 3 – Score each assistant with the correct rubric

| Query type | Criteria |
|------------|----------|
| Subjective / open-ended | 1. Correctness / factual soundness 2. Helpfulness 3. Relevance 4. Conciseness 5. Creativity & novelty |
| Objective / technical   | 1. Correctness only |

When using the multi-criteria rubric, note strengths and weaknesses for **each** dimension.
When using the single-criterion rubric, focus exclusively on factual / functional accuracy and ignore style or flair.

Step 4 – Compare & justify
Explain which assistant is better and why, correcting any mistakes you find. Highlight missing but important details. **Be concise.**

Step 5 – Verdict
1. Assistant A is significantly better: [[A>>B]]
2. Assistant A is slightly better: [[A>B]]
3. Tie, Assistant A is equal: [[A=B]]
4. Assistant B is slightly better: [[B>A]]
5. Assistant B is significantly better: [[B>>A]]

Choose exactly one token from: `[[A>>B]]`, `[[A>B]]`, `[[A=B]]`, `[[B>A]]`, `[[B>>A]]`.

---

### Output format (strict)
Return **only** a JSON object that matches the provided schema:

<Your Response To The User Prompt>

```json
{
"query_type": "<query type>",
"explanation": "<multi-criteria explanation> | <single-criteria explanation> (if query_type is \"Objective / technical\")",
"verdict": "<one verdict token from: [[A>>B]], [[A>B]], [[A=B]], [[B>A]], [[B>>A]]>"
}
```"""

    # Matches any "[[X op Y]]" token; deliberately permissive so that malformed
    # tokens (e.g. "[[A<B]]") are still captured and later rejected by the
    # lookup tables below instead of being reported as "no verdict at all".
    _VERDICT_PATTERN = re.compile(r"\[\[([AB][><=]{1,2}[AB])\]\]")

    # Verdict -> "generated answer is at least as good as the reference",
    # for the ordering where the generated answer occupies slot A.
    _PASS_WHEN_GENERATED_IS_A = {
        "A>>B": True,
        "A>B": True,
        "A=B": True,
        "B>A": False,
        "B>>A": False,
    }

    # Same mapping for the ordering where the generated answer occupies slot B.
    _PASS_WHEN_GENERATED_IS_B = {
        "A>>B": False,
        "A>B": False,
        "A=B": True,
        "B>A": True,
        "B>>A": True,
    }

    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """Judge ``model_answer`` against ``reference`` in both A/B orderings.

        Args:
            problem: The user prompt both answers respond to.
            reference: The reference answer; must contain non-whitespace text.
            model_answer: The generated answer under evaluation.
            metadata: Per-sample metadata; not used by this handler.

        Returns:
            ``(result, meta)``. ``result`` is ``True`` when at least one ordering
            rates the generated answer as better than or tied with the reference,
            ``False`` when both orderings prefer the reference, and ``None`` when
            a verdict is missing or unrecognised. ``meta`` always carries the
            extracted verdict and raw judge response for each ordering, plus a
            ``"reason"`` key whenever ``result`` is ``None``.

        Raises:
            RuntimeError: If ``reference`` is empty or whitespace-only, or if the
                configured client has no ``chat`` helper.
        """
        if not reference or not reference.strip():
            raise RuntimeError("WildChatHandler requires a non-empty reference answer.")

        # Query the judge once per ordering to offset position asymmetry in its
        # preferences. Callers are expected to pass distinct candidate and
        # reference answers, so self-comparison is not handled specially.
        verdict1, response1 = self._get_judge_verdict(problem, model_answer, reference)
        verdict2, response2 = self._get_judge_verdict(problem, reference, model_answer)

        meta: Dict[str, object] = {
            "generated_as_a": {"verdict": verdict1, "response": response1},
            "generated_as_b": {"verdict": verdict2, "response": response2},
        }

        if verdict1 is None or verdict2 is None:
            # Keep the raw responses alongside the legacy keys so a failed
            # extraction can be diagnosed from the stored metadata.
            return None, {
                "reason": "missing_verdicts",
                "verdict1": verdict1,
                "verdict2": verdict2,
                **meta,
            }

        result1 = self._verdict_to_bool(verdict1, generated_is_a=True)
        result2 = self._verdict_to_bool(verdict2, generated_is_a=False)

        if result1 is None or result2 is None:
            # A token was extracted but is not one of the five valid verdicts.
            return None, {"reason": "unrecognized_verdict", **meta}

        return result1 or result2, meta

    def _get_judge_verdict(
        self, problem: str, response_a: str, response_b: str
    ) -> Tuple[Optional[str], Optional[str]]:
        """Ask the judge to compare two answers and extract its verdict token.

        Args:
            problem: The user prompt.
            response_a: Answer shown to the judge as "Assistant A".
            response_b: Answer shown to the judge as "Assistant B".

        Returns:
            ``(verdict, content)`` where ``verdict`` is the text inside the last
            ``[[...]]`` token of the judge output (e.g. ``"A>B"``), or ``None``
            if no token was found, and ``content`` is the stripped judge output
            (an empty string when the client returned nothing).

        Raises:
            RuntimeError: If ``self._client`` has no ``chat`` attribute.
        """
        if not hasattr(self._client, "chat"):
            raise RuntimeError(
                "WildChatHandler requires a client with a .chat() helper (e.g. OpenAIClient)."
            )

        prompt = (
            f"<|User Prompt|>\n{problem}\n\n"
            f"<|The Start of Assistant A's Answer|>\n{response_a}\n"
            f"<|The End of Assistant A's Answer|>\n\n"
            f"<|The Start of Assistant B's Answer|>\n{response_b}\n"
            f"<|The End of Assistant B's Answer|>"
        )

        raw = self._client.chat(
            system_prompt=self.SYSTEM_PROMPT,
            user_prompt=prompt,
            temperature=0.0,
            max_output_tokens=1024,
        )
        # An empty completion is treated as "no verdict" rather than crashing.
        content = (raw or "").strip()

        # The requested format puts the verdict last, after the judge's own
        # answer and its explanation, so the final token is the authoritative
        # one even if earlier text happens to quote a verdict token.
        tokens = self._VERDICT_PATTERN.findall(content)
        if tokens:
            return tokens[-1], content

        return None, content

    def _verdict_to_bool(self, verdict: Optional[str], generated_is_a: bool) -> Optional[bool]:
        """Translate a verdict token into pass/fail for the generated answer.

        Args:
            verdict: Verdict text without brackets (e.g. ``"B>>A"``), or ``None``.
            generated_is_a: ``True`` if the generated answer was shown as
                Assistant A, ``False`` if it was shown as Assistant B.

        Returns:
            ``True`` if the generated answer was rated better than or equal to
            the reference, ``False`` if the reference was preferred, and ``None``
            for an empty or unrecognised verdict.
        """
        if not verdict:
            return None

        verdict_map = (
            self._PASS_WHEN_GENERATED_IS_A
            if generated_is_a
            else self._PASS_WHEN_GENERATED_IS_B
        )
        return verdict_map.get(verdict)