Skip to content

base

ipw.evaluation.base

EvaluationHandler

Bases: ABC

Base class for per-dataset evaluation strategies.

Source code in intelligence-per-watt/src/ipw/evaluation/base.py
class EvaluationHandler(ABC):
    """Base class for per-dataset evaluation strategies.

    Concrete handlers subclass this and implement ``evaluate``. Every
    instance keeps the inference client it was constructed with on
    ``self._client``.
    """

    # Declared here without a value; presumably each concrete handler sets it
    # to a string naming its evaluation method -- confirm against subclasses.
    evaluation_method: str

    def __init__(self, client: InferenceClient) -> None:
        """Store the inference client on the handler.

        Args:
            client: Inference client kept on ``self._client``.
        """
        # Handlers require a client for LLM-based judging
        self._client = client

    @abstractmethod
    def evaluate(
        self,
        *,
        problem: str,
        reference: str,
        model_answer: str,
        metadata: Dict[str, object],
    ) -> Tuple[Optional[bool], Dict[str, object]]:
        """
        Evaluate a single model answer.

        All arguments are keyword-only. The descriptions below are inferred
        from the parameter names; confirm against the concrete handlers.

        Args:
            problem: Text of the problem being scored.
            reference: Reference answer for the problem.
            model_answer: The model's answer to evaluate.
            metadata: Additional per-example data passed to the handler.

        Returns:
            is_correct: True/False if a decision could be made, or None
                if the example is not scorable.
            metadata: Method-specific payload (e.g., extracted answers,
                judge explanation, or reasons for being unscorable).
        """

evaluate(*, problem, reference, model_answer, metadata) abstractmethod

Evaluate a single model answer.

Returns:

Type Description
Tuple[Optional[bool], Dict[str, object]]

(is_correct, metadata)

  • is_correct (Optional[bool]): True/False if a decision could be made, or None if the example is not scorable.
  • metadata (Dict[str, object]): method-specific payload (e.g., extracted answers, judge explanation, or reasons for being unscorable).
Source code in intelligence-per-watt/src/ipw/evaluation/base.py
@abstractmethod
def evaluate(
    self,
    *,
    problem: str,
    reference: str,
    model_answer: str,
    metadata: Dict[str, object],
) -> Tuple[Optional[bool], Dict[str, object]]:
    """
    Evaluate a single model answer.

    All arguments are keyword-only. The descriptions below are inferred
    from the parameter names; confirm against the concrete handlers.

    Args:
        problem: Text of the problem being scored.
        reference: Reference answer for the problem.
        model_answer: The model's answer to evaluate.
        metadata: Additional per-example data passed to the handler.

    Returns:
        is_correct: True/False if a decision could be made, or None
            if the example is not scorable.
        metadata: Method-specific payload (e.g., extracted answers,
            judge explanation, or reasons for being unscorable).
    """