Index

`ipw.execution` ¶

Execution pipeline components for Intelligence Per Watt profiling runs.

`AgenticRunner` ¶

Orchestrate multi-turn agent runs with energy telemetry correlation.

Similar to ProfilerRunner but designed for agentic workloads where a single query may involve multiple LLM turns and tool calls.

Source code in intelligence-per-watt/src/ipw/execution/agentic_runner.py

class AgenticRunner:
    """Orchestrate multi-turn agent runs with energy telemetry correlation.

    Similar to ProfilerRunner but designed for agentic workloads where a single
    query may involve multiple LLM turns and tool calls.

    Every processed query produces one ``QueryTrace`` (per-turn detail) and one
    ``ProfilingRecord`` (aggregate metrics). Both are accumulated in memory and
    exposed through the ``traces`` and ``records`` properties.
    """

    # A debug progress line is logged every time this many queries complete.
    _FLUSH_INTERVAL = 50

    def __init__(
        self,
        agent: BaseAgent,
        dataset: DatasetProvider,
        telemetry_session: Optional[TelemetrySession] = None,
        config: Optional[dict[str, Any]] = None,
        event_recorder: Optional[EventRecorder] = None,
    ) -> None:
        """Store the collaborators used by a run.

        Args:
            agent: Agent whose ``run(problem)`` is invoked once per record.
            dataset: Iterable dataset that also exposes ``size()``.
            telemetry_session: Source of telemetry samples. When None, no
                energy or power figures are attached to the traces.
            config: Run configuration; only the ``"model"`` key is read here.
            event_recorder: Recorder the agent emits events into. A fresh
                ``EventRecorder`` is created when None is given.
        """
        self._agent = agent
        self._dataset = dataset
        self._telemetry = telemetry_session
        self._config = config or {}
        self._event_recorder = (
            event_recorder if event_recorder is not None else EventRecorder()
        )
        self._traces: list[QueryTrace] = []
        self._records: list[ProfilingRecord] = []

    async def run(self, max_queries: Optional[int] = None) -> list[QueryTrace]:
        """Run the agent over the dataset, collecting traces and telemetry.

        Args:
            max_queries: Maximum number of queries to process. None means all.
                Any falsy value (including 0) also falls back to the full
                dataset size.

        Returns:
            List of QueryTrace objects with energy-correlated telemetry. This
            is the runner's internal list, not a copy.
        """
        total = max_queries or self._dataset.size()
        model = self._config.get("model", "unknown")

        with tqdm(total=total, desc="Agent run", unit="query") as progress:
            for index, record in enumerate(self._dataset):
                if index >= total:
                    break

                trace = await self._run_single_query(index, record, model)
                self._traces.append(trace)

                # Build profiling record for energy data persistence
                self._records.append(
                    self._build_profiling_record(record, trace, model)
                )

                processed = len(self._traces)
                if processed % self._FLUSH_INTERVAL == 0:
                    LOGGER.debug("Processed %d/%d queries", processed, total)

                progress.update(1)

        return self._traces

    async def _run_single_query(
        self,
        index: int,
        record: DatasetRecord,
        model: str,
    ) -> QueryTrace:
        """Run a single query through the agent with telemetry capture.

        Args:
            index: Zero-based position of the record; used to build the
                ``q0000``-style query id.
            record: Dataset record whose ``problem`` is sent to the agent.
            model: Model name (not used while running the query itself).

        Returns:
            A completed trace with per-turn data, or a trace with
            ``completed=False`` and the exception text as the response when
            the agent raises.
        """
        query_id = f"q{index:04d}"
        workload_type = str(record.dataset_metadata.get("workload_type", "agentic"))

        # The timed window opens here. Nothing but the recorder reset sits
        # between this timestamp and the agent call, so the measured wall
        # clock is not padded with bookkeeping work.
        start_time = time.time()
        self._event_recorder.clear()

        # Run the agent
        try:
            result: AgentRunResult = self._agent.run(record.problem)
        except Exception as exc:
            # A failing query must not abort the whole run; record it as an
            # incomplete trace and carry on with the next record.
            LOGGER.warning("Agent failed on query %s: %s", query_id, exc)
            end_time = time.time()
            return QueryTrace(
                query_id=query_id,
                workload_type=workload_type,
                query_text=record.problem,
                response_text=str(exc),
                total_wall_clock_s=end_time - start_time,
                completed=False,
            )

        end_time = time.time()

        # Collect telemetry samples for this query window
        readings: list[TelemetrySample] = []
        if self._telemetry:
            readings = list(self._telemetry.window(start_time, end_time))

        # Build turn traces from event recorder
        events = self._event_recorder.get_events()
        turns = self._build_turn_traces(events, readings)

        trace = QueryTrace(
            query_id=query_id,
            workload_type=workload_type,
            query_text=record.problem,
            response_text=result.content,
            turns=turns,
            total_wall_clock_s=end_time - start_time,
            completed=True,
        )

        # Correlate energy data with trace
        return self._correlate_energy(trace, readings)

    @staticmethod
    def _finite_values(values) -> list[float]:
        """Return the entries of *values* that are not None and are finite."""
        return [v for v in values if v is not None and math.isfinite(v)]

    @staticmethod
    def _counter_delta(values) -> Optional[float]:
        """Return last-minus-first of a cumulative counter series.

        None is returned when fewer than two usable values exist or when the
        difference is negative.
        """
        usable = AgenticRunner._finite_values(values)
        if len(usable) < 2:
            return None
        delta = usable[-1] - usable[0]
        return delta if delta >= 0 else None

    @staticmethod
    def _mean_or_none(values) -> Optional[float]:
        """Return the mean of the usable entries of *values*, or None if none."""
        usable = AgenticRunner._finite_values(values)
        return statistics.mean(usable) if usable else None

    def _turn_energy(
        self,
        readings: list[TelemetrySample],
        start: Optional[float],
        end: float,
    ) -> tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
        """Summarise the samples whose timestamp lies in ``[start, end]``.

        Returns:
            ``(gpu_energy_joules, cpu_energy_joules, gpu_power_avg_watts,
            cpu_power_avg_watts)``; every entry is None when the turn has no
            start timestamp or there are no readings.
        """
        if start is None or not readings:
            return None, None, None, None

        window = [s.reading for s in readings if start <= s.timestamp <= end]
        return (
            self._counter_delta(r.energy_joules for r in window),
            self._counter_delta(r.cpu_energy_joules for r in window),
            self._mean_or_none(r.power_watts for r in window),
            self._mean_or_none(r.cpu_power_watts for r in window),
        )

    def _build_turn_traces(
        self,
        events: list,
        readings: list[TelemetrySample],
    ) -> list[TurnTrace]:
        """Build TurnTrace objects from recorded events.

        A turn is closed by each ``LM_INFERENCE_END`` event. Tool calls that
        ended since the previous turn was closed are attached to the turn being
        closed. Tool latencies are keyed by tool name, so a repeated call to
        the same tool within one turn overwrites the earlier latency.

        Args:
            events: Recorded events, read in the order given.
            readings: Telemetry samples covering the whole query window.

        Returns:
            One TurnTrace per completed inference. If events exist but no
            inference completed, a single synthetic turn carrying whatever
            tool calls were seen. An empty list when there are no events.
        """
        turns: list[TurnTrace] = []
        turn_start: Optional[float] = None
        pending_tools: list[str] = []
        pending_latencies: dict[str, float] = {}
        tool_start_times: dict[str, float] = {}

        for event in events:
            etype = event.event_type

            if etype == EventType.LM_INFERENCE_START:
                turn_start = event.timestamp

            elif etype == EventType.LM_INFERENCE_END:
                # An END without a matching START yields a zero-length turn.
                wall_clock = 0.0
                if turn_start is not None:
                    wall_clock = event.timestamp - turn_start

                gpu_energy, cpu_energy, gpu_power, cpu_power = self._turn_energy(
                    readings, turn_start, event.timestamp
                )

                turns.append(
                    TurnTrace(
                        turn_index=len(turns),
                        input_tokens=event.metadata.get("prompt_tokens", 0),
                        output_tokens=event.metadata.get("completion_tokens", 0),
                        tools_called=list(pending_tools),
                        tool_latencies_s=dict(pending_latencies),
                        wall_clock_s=wall_clock,
                        gpu_energy_joules=gpu_energy,
                        cpu_energy_joules=cpu_energy,
                        gpu_power_avg_watts=gpu_power,
                        cpu_power_avg_watts=cpu_power,
                    )
                )

                # Reset for next turn
                turn_start = None
                pending_tools = []
                pending_latencies = {}

            elif etype == EventType.TOOL_CALL_START:
                tool_name = event.metadata.get("tool", "unknown")
                tool_start_times[tool_name] = event.timestamp

            elif etype == EventType.TOOL_CALL_END:
                tool_name = event.metadata.get("tool", "unknown")
                pending_tools.append(tool_name)
                started_at = tool_start_times.pop(tool_name, None)
                if started_at is not None:
                    pending_latencies[tool_name] = event.timestamp - started_at

        # If there were events but no complete turn, create a synthetic one
        if not turns and events:
            turns.append(
                TurnTrace(
                    turn_index=0,
                    tools_called=pending_tools,
                    tool_latencies_s=pending_latencies,
                )
            )

        return turns

    def _correlate_energy(
        self,
        trace: QueryTrace,
        readings: list[TelemetrySample],
    ) -> QueryTrace:
        """Correlate energy readings with the trace at the query level.

        If no turn received GPU energy from its own event window, the energy
        of the whole query is spread over the turns in proportion to each
        turn's wall-clock time. When the turns carry no wall-clock time at all
        (for example a single synthetic turn), the energy is split evenly so
        that the query's measured energy is not silently discarded.

        Args:
            trace: Trace whose turns are updated in place.
            readings: Telemetry samples covering the whole query window.

        Returns:
            The same ``trace`` object.
        """
        if not readings or not trace.turns:
            return trace

        # Per-turn figures derived from events take precedence.
        if any(t.gpu_energy_joules is not None for t in trace.turns):
            return trace

        # Compute total query energy
        total_gpu_energy = self._counter_delta(
            s.reading.energy_joules for s in readings
        )
        total_cpu_energy = self._counter_delta(
            s.reading.cpu_energy_joules for s in readings
        )

        total_wall = sum(t.wall_clock_s for t in trace.turns)
        even_share = 1.0 / len(trace.turns)
        for turn in trace.turns:
            if total_wall > 0:
                fraction = turn.wall_clock_s / total_wall
            else:
                fraction = even_share
            if total_gpu_energy is not None:
                turn.gpu_energy_joules = total_gpu_energy * fraction
            if total_cpu_energy is not None:
                turn.cpu_energy_joules = total_cpu_energy * fraction

        return trace

    def _build_profiling_record(
        self,
        record: DatasetRecord,
        trace: QueryTrace,
        model: str,
    ) -> ProfilingRecord:
        """Build a ProfilingRecord from a completed query trace.

        Args:
            record: Source dataset record (problem, answer, metadata, subject).
            trace: Trace produced for that record.
            model: Key under which answers and metrics are stored.

        Returns:
            A record holding token, latency, energy and cost aggregates.
            Compute, memory, power and temperature metrics are left at their
            default-constructed values.
        """
        total_input_tokens = trace.total_input_tokens
        total_output_tokens = trace.total_output_tokens
        total_seconds = trace.total_wall_clock_s

        # Energy metrics from trace
        gpu_energy = trace.total_gpu_energy_joules
        # A sum of exactly 0.0 (including "no turn had CPU energy") is
        # reported as None.
        cpu_energy = sum(
            (
                t.cpu_energy_joules
                for t in trace.turns
                if t.cpu_energy_joules is not None
            ),
            0.0,
        ) or None

        energy_metrics = EnergyMetrics(
            per_query_joules=gpu_energy,
            total_joules=gpu_energy,
            cpu_per_query_joules=cpu_energy,
            cpu_total_joules=cpu_energy,
        )

        # Latency: only derivable when tokens were produced in non-zero time.
        per_token_ms = None
        throughput = None
        if total_output_tokens > 0 and total_seconds > 0:
            per_token_ms = (total_seconds * 1000.0) / total_output_tokens
            throughput = total_output_tokens / total_seconds

        latency_metrics = LatencyMetrics(
            per_token_ms=per_token_ms,
            throughput_tokens_per_sec=throughput,
            total_query_seconds=total_seconds,
        )

        # Cost
        cost_metrics = CostMetrics(total_cost_usd=trace.total_cost_usd)

        model_metrics = ModelMetrics(
            compute_metrics=ComputeMetrics(),
            energy_metrics=energy_metrics,
            latency_metrics=latency_metrics,
            memory_metrics=MemoryMetrics(),
            power_metrics=PowerMetrics(),
            temperature_metrics=MetricStats(),
            token_metrics=TokenMetrics(
                input=total_input_tokens,
                output=total_output_tokens,
                total=total_input_tokens + total_output_tokens,
            ),
            cost=cost_metrics,
            lm_response=trace.response_text,
        )

        return ProfilingRecord(
            problem=record.problem,
            answer=record.answer,
            dataset_metadata=dict(record.dataset_metadata),
            subject=record.subject,
            model_answers={model: trace.response_text},
            model_metrics={model: model_metrics},
        )

    @property
    def traces(self) -> list[QueryTrace]:
        """Return collected traces."""
        return list(self._traces)

    @property
    def records(self) -> list[ProfilingRecord]:
        """Return collected profiling records."""
        return list(self._records)

`traces` `property` ¶

Return collected traces.

`records` `property` ¶

Return collected profiling records.

`run(max_queries=None)` `async` ¶

Run the agent over the dataset, collecting traces and telemetry.

Parameters:

Name	Type	Description	Default
`max_queries`	`Optional[int]`	Maximum number of queries to process. None means all.	`None`

Returns:

Type	Description
`list[QueryTrace]`	List of QueryTrace objects with energy-correlated telemetry.

Source code in intelligence-per-watt/src/ipw/execution/agentic_runner.py

async def run(self, max_queries: Optional[int] = None) -> list[QueryTrace]:
    """Drive the agent across the dataset and gather one trace per query.

    Args:
        max_queries: Upper bound on the number of queries to process. None
            means the whole dataset.

    Returns:
        The accumulated QueryTrace objects, each carrying energy-correlated
        telemetry.
    """
    limit = max_queries or self._dataset.size()
    model_name = self._config.get("model", "unknown")

    with tqdm(total=limit, desc="Agent run", unit="query") as bar:
        for position, item in enumerate(self._dataset):
            # Stop once the requested number of queries has been handled.
            if position >= limit:
                break

            query_trace = await self._run_single_query(position, item, model_name)
            self._traces.append(query_trace)

            # Keep a profiling record alongside the trace so the energy data
            # can be persisted later.
            self._records.append(
                self._build_profiling_record(item, query_trace, model_name)
            )

            done = len(self._traces)
            if done % self._FLUSH_INTERVAL == 0:
                LOGGER.debug("Processed %d/%d queries", done, limit)

            bar.update(1)

    return self._traces

`ProfilerRunner` ¶

Coordinate dataset iteration, inference calls, telemetry capture, and persistence.

Source code in intelligence-per-watt/src/ipw/execution/runner.py

class ProfilerRunner:
    """Coordinate dataset iteration, inference calls, telemetry capture, and persistence."""

    # Accumulated records are re-persisted every time this many have been built.
    _FLUSH_INTERVAL = 100
    # Upper bound (seconds) on how long `_prime_hardware_metadata` polls for samples.
    _HARDWARE_PRIME_TIMEOUT_SECONDS = 2.0
    # Sleep (seconds) between polls while priming hardware metadata.
    _HARDWARE_PRIME_POLL_INTERVAL_SECONDS = 0.05

    # The runner is intentionally a slim orchestrator, but it still handles a
    # fair amount of coordination work:
    #
    # 1. Resolve dataset / client implementations from the registries so that we
    #    only depend on the registry surface, not the old resolution helpers.
    # 2. Spin up the `TelemetrySession`, which hides the threaded sampling loop
    #    that continuously pulls energy/power/memory readings into a rolling
    #    buffer while the run executes.
    # 3. For each dataset record, send the request to the client, collect the
    #    telemetry samples that overlap the query window, and transform the raw
    #    response + telemetry into the strongly typed `ProfilingRecord` payload
    #    defined in `ipw.execution.types`.
    # 4. Accumulate all records in-memory and write a HuggingFace dataset to the
    #    configured output directory once the run completes, along with a
    #    `summary.json` containing run metadata and aggregate energy totals.
    #
    # The actual measurements and conversions stay localized to helper methods
    # (`_compute_energy_metrics`, `_stat_summary`, etc.) so that the control flow
    # remains readable. Any future refactor (e.g., streaming writes or different
    # telemetry aggregation) should only need to touch the helpers and the final
    # persistence step.

    def __init__(self, config: ProfilerConfig) -> None:
        """Store the configuration and initialise empty per-run state.

        Args:
            config: Profiler configuration read throughout the run (model,
                dataset/client ids and params, output directory, limits).
        """
        self._config = config
        # Records built so far; persisted as a whole by `_persist_records`.
        self._records: list[ProfilingRecord] = []
        # Resolved once by `_get_output_path` and then reused.
        self._output_path: Optional[Path] = None
        # Set once `_ensure_output_prepared` has handled the output directory.
        self._output_prepared: bool = False
        # Filled in from telemetry samples by `_update_hardware_metadata`.
        self._hardware_label: Optional[str] = None
        self._system_info: Optional[SystemInfo] = None
        self._gpu_info: Optional[GpuInfo] = None
        # GPU energy counter bookkeeping maintained by `_compute_energy_metrics`.
        self._baseline_energy: Optional[float] = None
        self._last_energy_total: Optional[float] = None
        # Ensures the overwrite prompt is shown at most once.
        self._overwrite_confirmed: bool = False

    def run(self) -> None:
        """Execute a full profiling run.

        Resolves the dataset and client, verifies the client is ready,
        processes the records inside a `TelemetrySession`, and persists the
        results if any record was produced. The client is closed in all cases.

        Raises:
            RuntimeError: Propagated from dataset/client resolution, the
                client readiness check, or a declined overwrite prompt.
        """
        dataset = self._resolve_dataset(
            self._config.dataset_id, self._config.dataset_params
        )
        # Bound before the `try` so the `finally` clause can always reference it.
        client: InferenceClient | None = None
        collector = EnergyMonitorCollector()

        try:
            client = self._resolve_client(
                self._config.client_id,
                self._config.client_base_url,
                self._config.client_params,
            )

            self._ensure_client_ready(client)

            with TelemetrySession(collector) as telemetry:
                self._process_records(dataset, client, telemetry)

            # Nothing was recorded, so there is nothing to write.
            if not self._records:
                return

            self._persist_records(dataset)
        finally:
            self._close_client(client)

    def _process_records(
        self,
        dataset,
        client,
        telemetry: TelemetrySession,
    ) -> None:
        """Send each dataset record to the client and build a profiling record.

        Args:
            dataset: Iterable dataset that also exposes `size()`.
            client: Inference client passed on to `_invoke_client`.
            telemetry: Active session used to fetch the samples overlapping
                each query's time window.
        """
        # A falsy `max_queries` (None or 0) falls back to the dataset size.
        total_queries = self._config.max_queries or dataset.size()
        iterator = enumerate(dataset)
        # Prime hardware metadata early so the output directory label is accurate.
        self._prime_hardware_metadata(telemetry)
        # Prepare output directory (and confirm overwrite) before any inference.
        self._ensure_output_prepared(dataset)
        with tqdm(total=total_queries, desc="Profiling", unit="query") as progress:
            for index, record in iterator:
                if index >= total_queries:
                    break
                # Timestamps bracket the client call and define the telemetry window.
                start = time.time()
                response = self._invoke_client(client, record)
                end = time.time()
                samples = list(telemetry.window(start, end))
                built = self._build_record(index, record, response, samples, start, end)
                if built is not None:
                    self._records.append(built)
                    # Periodic flush: rewrites everything accumulated so far.
                    if len(self._records) % self._FLUSH_INTERVAL == 0:
                        self._persist_records(dataset)
                progress.update(1)

    def _build_record(
        self,
        index: int,
        record: DatasetRecord,
        response: Response,
        samples: Sequence[TelemetrySample],
        start_time: float,
        end_time: float,
    ) -> Optional[ProfilingRecord]:
        """Turn one response plus its telemetry samples into a ProfilingRecord.

        Args:
            index: Position of the record in the run (not used in this method).
            record: Source dataset record.
            response: Client response; `usage`, `content` and
                `time_to_first_token_ms` are read from it.
            samples: Telemetry samples collected for the query window.
            start_time: Timestamp taken just before the client call.
            end_time: Timestamp taken just after the client call.

        Returns:
            The populated record. The annotation allows None, but every path
            through this method returns a record.
        """
        # Also refreshes system/GPU info and the hardware label as a side effect.
        self._update_hardware_metadata(samples)
        telemetry_readings = [sample.reading for sample in samples]

        energy_metrics = self._compute_energy_metrics(telemetry_readings)
        power_stats = _stat_summary(
            [reading.power_watts for reading in telemetry_readings]
        )
        cpu_power_stats = _stat_summary(
            [reading.cpu_power_watts for reading in telemetry_readings]
        )
        temperature_stats = _stat_summary(
            [reading.temperature_celsius for reading in telemetry_readings]
        )
        cpu_memory_stats = _stat_summary(
            [reading.cpu_memory_usage_mb for reading in telemetry_readings]
        )
        gpu_memory_stats = _stat_summary(
            [reading.gpu_memory_usage_mb for reading in telemetry_readings]
        )
        compute_util_stats = _stat_summary(
            [reading.gpu_compute_utilization_pct for reading in telemetry_readings]
        )
        memory_bw_util_stats = _stat_summary(
            [reading.gpu_memory_bandwidth_utilization_pct for reading in telemetry_readings]
        )
        tensor_util_stats = _stat_summary(
            [reading.gpu_tensor_core_utilization_pct for reading in telemetry_readings]
        )

        memory_used_gb = _max_gb(
            [reading.gpu_memory_usage_mb for reading in telemetry_readings]
        )
        memory_total_gb = _max_gb(
            [reading.gpu_memory_total_mb for reading in telemetry_readings]
        )

        usage = response.usage
        # Clamp so a clock step backwards cannot produce a negative duration.
        total_seconds = max(end_time - start_time, 0.0)

        # Defensive: ensure token counts are valid integers
        prompt_tokens = usage.prompt_tokens if usage.prompt_tokens is not None else 0
        completion_tokens = (
            usage.completion_tokens if usage.completion_tokens is not None else 0
        )

        # Per-token latency and throughput need both tokens and elapsed time.
        per_token_ms = None
        throughput_tokens = None
        if completion_tokens > 0 and total_seconds > 0:
            per_token_ms = (total_seconds * 1000.0) / completion_tokens
            throughput_tokens = completion_tokens / total_seconds

        latency_metrics = LatencyMetrics(
            per_token_ms=per_token_ms,
            throughput_tokens_per_sec=throughput_tokens,
            # The response reports milliseconds; the metric is stored in seconds.
            time_to_first_token_seconds=(
                response.time_to_first_token_ms / 1000.0
                if response.time_to_first_token_ms is not None
                else None
            ),
            total_query_seconds=total_seconds,
        )

        model_name = self._config.model

        hardware_utilization = HardwareUtilization(
            gpu=HardwareUtilizationGpu(
                compute_utilization_pct=compute_util_stats.avg,
                memory_bandwidth_utilization_pct=memory_bw_util_stats.avg,
                tensor_core_utilization_pct=tensor_util_stats.avg,
                memory_used_gb=memory_used_gb,
                memory_total_gb=memory_total_gb,
            ),
            derived=HardwareUtilizationDerived(),
        )

        model_metrics = ModelMetrics(
            compute_metrics=ComputeMetrics(),
            energy_metrics=energy_metrics,
            latency_metrics=latency_metrics,
            memory_metrics=MemoryMetrics(
                cpu_mb=cpu_memory_stats,
                gpu_mb=gpu_memory_stats,
            ),
            # `total_watts` is populated with a copy of the per-query statistics.
            power_metrics=PowerMetrics(
                gpu=PowerComponentMetrics(
                    per_query_watts=power_stats,
                    total_watts=MetricStats(
                        avg=power_stats.avg,
                        max=power_stats.max,
                        median=power_stats.median,
                        min=power_stats.min,
                    ),
                ),
                cpu=PowerComponentMetrics(
                    per_query_watts=cpu_power_stats,
                    total_watts=MetricStats(
                        avg=cpu_power_stats.avg,
                        max=cpu_power_stats.max,
                        median=cpu_power_stats.median,
                        min=cpu_power_stats.min,
                    ),
                ),
            ),
            temperature_metrics=temperature_stats,
            token_metrics=TokenMetrics(
                input=prompt_tokens,
                output=completion_tokens,
                total=prompt_tokens + completion_tokens,
            ),
            hardware_utilization=hardware_utilization,
            gpu_info=self._gpu_info,
            system_info=self._system_info,
            lm_response=response.content,
        )

        record_payload = ProfilingRecord(
            problem=record.problem,
            answer=record.answer,
            dataset_metadata=dict(record.dataset_metadata),
            subject=record.subject,
            model_answers={model_name: response.content},
            model_metrics={model_name: model_metrics},
        )

        return record_payload

    def _compute_energy_metrics(
        self, readings: Sequence[TelemetryReading]
    ) -> EnergyMetrics:
        """Compute energy metrics from telemetry readings.

        Energy values should be monotonically increasing cumulative counters.
        Negative deltas indicate counter reset or data anomaly and are treated as None.

        The "total" fields of the returned metrics are set to the same value
        as the corresponding per-query fields.
        """
        # GPU energy
        gpu_energy_values = [
            reading.energy_joules
            for reading in readings
            if reading.energy_joules is not None
        ]
        gpu_per_query = self._compute_energy_delta(gpu_energy_values)

        # CPU energy
        cpu_energy_values = [
            reading.cpu_energy_joules
            for reading in readings
            if reading.cpu_energy_joules is not None
        ]
        cpu_per_query = self._compute_energy_delta(cpu_energy_values)

        # ANE energy (macOS only)
        ane_energy_values = [
            reading.ane_energy_joules
            for reading in readings
            if reading.ane_energy_joules is not None
        ]
        ane_per_query = self._compute_energy_delta(ane_energy_values)

        # Maintain baseline tracking for GPU (backward compat)
        if gpu_energy_values:
            start_value = gpu_energy_values[0]
            end_value = gpu_energy_values[-1]
            if self._baseline_energy is None:
                self._baseline_energy = start_value
            # A start value below the previous window's final value means the
            # counter went backwards, so the baseline restarts from here.
            if (
                self._last_energy_total is not None
                and start_value < self._last_energy_total
            ):
                self._baseline_energy = start_value
            self._last_energy_total = end_value

        return EnergyMetrics(
            per_query_joules=gpu_per_query,
            total_joules=gpu_per_query,
            cpu_per_query_joules=cpu_per_query,
            cpu_total_joules=cpu_per_query,
            ane_per_query_joules=ane_per_query,
            ane_total_joules=ane_per_query,
        )

    def _compute_energy_delta(
        self, energy_values: list[float]
    ) -> Optional[float]:
        """Compute energy delta from a list of cumulative energy values.

        Only the first and last values are inspected. Returns None when the
        list is empty, when either endpoint is non-finite or negative, or when
        the difference is negative. A single-element list yields 0.0.
        """
        if not energy_values:
            return None

        start_value = energy_values[0]
        end_value = energy_values[-1]

        # Validate energy values are finite and non-negative
        if not (
            math.isfinite(start_value)
            and math.isfinite(end_value)
            and start_value >= 0
            and end_value >= 0
        ):
            return None

        per_query_delta = end_value - start_value
        return per_query_delta if per_query_delta >= 0 else None

    def _update_hardware_metadata(self, readings: Sequence[TelemetrySample]) -> None:
        """Refresh cached system/GPU info and the hardware label from samples.

        The last sample carrying each piece of info wins. The label is only
        replaced while it is still unset or equal to "UNKNOWN_HW".
        """
        for sample in readings:
            reading = sample.reading
            if reading.system_info is not None:
                self._system_info = reading.system_info
            if reading.gpu_info is not None:
                self._gpu_info = reading.gpu_info

        candidate = derive_hardware_label(self._system_info, self._gpu_info)
        if candidate and (self._hardware_label in (None, "UNKNOWN_HW")):
            self._hardware_label = candidate

    def _get_output_path(self, dataset_label: str | None = None) -> Path:
        """Return the run's output directory, resolving it on first use.

        The directory is named `profile_<hardware>_<model>_<dataset>` and lives
        under the configured output directory, or under a `runs` directory
        four levels above this file when none is configured. The result is
        cached, so `dataset_label` only matters on the first call.
        """
        if self._output_path is not None:
            return self._output_path

        hardware_label = self._hardware_label or "UNKNOWN_HW"
        model_slug = _slugify_model(self._config.model)
        dataset_segment = dataset_label or self._config.dataset_id or "dataset"
        dataset_segment = str(dataset_segment).strip() or "dataset"
        default_runs_dir = Path(__file__).resolve().parents[4] / "runs"
        base_dir = self._config.output_dir or default_runs_dir
        profile_dir = f"profile_{hardware_label}_{model_slug}_{dataset_segment}".strip("_")

        output_path = Path(base_dir) / profile_dir

        # Record the label actually used in the path (possibly the fallback).
        self._hardware_label = hardware_label
        self._output_path = output_path
        return output_path

    def _invoke_client(self, client, record: DatasetRecord) -> Response:
        """Send the record's problem to the client and return its response.

        The configured additional parameters are copied and forwarded as
        keyword arguments to `stream_chat_completion`.
        """
        payload: MutableMapping[str, object] = dict(self._config.additional_parameters)
        return client.stream_chat_completion(
            self._config.model, record.problem, **payload
        )

    def _resolve_dataset(self, dataset_id: str, params: Mapping[str, Any]):
        """Look up the dataset class in the registry and instantiate it.

        Raises:
            RuntimeError: If the id is not registered, or if instantiating the
                class with `params` raises TypeError.
        """
        try:
            dataset_cls = DatasetRegistry.get(dataset_id)
        except KeyError as exc:
            raise RuntimeError(f"Unknown dataset '{dataset_id}'") from exc

        try:
            return dataset_cls(**params)
        except TypeError as exc:
            raise RuntimeError(
                f"Failed to instantiate dataset '{dataset_id}' with params {params!r}: {exc}"
            ) from exc

    def _resolve_client(
        self,
        client_id: str,
        base_url: str | None,
        params: Mapping[str, Any],
    ) -> InferenceClient:
        """Look up the client class in the registry and instantiate it.

        The class is called with `base_url` as its first positional argument
        followed by `params` as keyword arguments.

        Raises:
            RuntimeError: If the id is not registered, or if instantiating the
                class raises TypeError.
        """
        try:
            client_cls = ClientRegistry.get(client_id)
        except KeyError as exc:
            raise RuntimeError(f"Unknown client '{client_id}'") from exc

        try:
            return client_cls(base_url, **params)
        except TypeError as exc:
            raise RuntimeError(
                f"Failed to instantiate client '{client_id}' with params {params!r}: {exc}"
            ) from exc

    def _ensure_client_ready(self, client: InferenceClient) -> None:
        """Check the client's health and prepare it for the configured model.

        Raises:
            RuntimeError: If `client.health()` returns a falsy value.
        """
        if not client.health():
            raise RuntimeError(
                f"Client '{client.client_name}' at {getattr(client, 'base_url', '')} is unavailable"
            )
        client.prepare(self._config.model)

    def _close_client(self, client: InferenceClient | None) -> None:
        """Close the client if it exposes a callable `close`, best effort.

        A failure while closing is logged as a warning and not re-raised.
        """
        if client is None:
            return
        close_fn = getattr(client, "close", None)
        if callable(close_fn):
            try:
                close_fn()
            except Exception:
                LOGGER.warning("Failed to close inference client cleanly", exc_info=True)

    def _prime_hardware_metadata(self, telemetry: TelemetrySession) -> None:
        """Wait briefly for telemetry samples so hardware labels are stable."""
        # A TypeError from `readings()` is treated the same as "no samples yet".
        try:
            initial_samples = list(telemetry.readings() or [])
        except TypeError:
            initial_samples = []
        if initial_samples:
            self._update_hardware_metadata(initial_samples)
        if self._hardware_label not in (None, "UNKNOWN_HW"):
            return

        # NOTE(review): these guards skip the polling loop unless `telemetry`
        # is a genuine TelemetrySession instance; presumably this avoids
        # sleeping when a stand-in object is supplied (e.g. in tests) - confirm.
        session_type = TelemetrySession
        if not isinstance(session_type, type):
            return
        if not isinstance(telemetry, session_type):
            return

        # Poll until a usable hardware label appears or the timeout elapses.
        deadline = time.time() + self._HARDWARE_PRIME_TIMEOUT_SECONDS
        while time.time() < deadline:
            try:
                samples = list(telemetry.readings() or [])
            except TypeError:
                samples = []
            if samples:
                self._update_hardware_metadata(samples)
                if self._hardware_label not in (None, "UNKNOWN_HW"):
                    return
            time.sleep(self._HARDWARE_PRIME_POLL_INTERVAL_SECONDS)

    def _ensure_output_prepared(self, dataset) -> Path:
        """Resolve and prepare the output directory once per run."""
        if self._output_prepared:
            return self._get_output_path()

        # Prefer the dataset's own name/id over the configured dataset id.
        dataset_label = (
            getattr(dataset, "dataset_name", None)
            or getattr(dataset, "dataset_id", None)
            or self._config.dataset_id
        )
        output_path = self._get_output_path(
            str(dataset_label).strip() or self._config.dataset_id
        )
        # An existing directory is removed, but only after the user confirms;
        # `_confirm_overwrite` raises if they decline.
        if output_path.exists():
            self._confirm_overwrite(output_path)
            shutil.rmtree(output_path)
        # Only the parent is created here, not the output directory itself.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        self._output_prepared = True
        return output_path

    def _persist_records(self, dataset) -> None:
        """Write all accumulated records and a `summary.json` to the output directory.

        Every call serialises the complete in-memory record list, so periodic
        flushes rewrite earlier records as well. Does nothing when no records
        have been collected.
        """
        if not self._records:
            return
        output_path = self._ensure_output_prepared(dataset)

        dataset_obj = Dataset.from_list([asdict(record) for record in self._records])
        dataset_obj.save_to_disk(str(output_path))
        # Guarantees the directory exists before the summary file is written.
        output_path.mkdir(parents=True, exist_ok=True)

        summary = {
            "model": self._config.model,
            "profiler_config": _jsonify(asdict(self._config)),
            "dataset": getattr(dataset, "dataset_id", self._config.dataset_id),
            "dataset_name": getattr(dataset, "dataset_name", None),
            "hardware_label": self._hardware_label,
            "generated_at": time.time(),
            "total_queries": len(self._records),
            "system_info": asdict(self._system_info) if self._system_info else None,
            "gpu_info": asdict(self._gpu_info) if self._gpu_info else None,
            "output_dir": str(output_path),
            "versions": _get_versions(),
        }
        summary_path = output_path / "summary.json"
        # `default=str` stringifies any value json cannot serialise natively.
        summary_path.write_text(json.dumps(summary, indent=2, default=str))

    def _confirm_overwrite(self, output_path: Path) -> None:
        """Prompt before overwriting an existing output directory.

        The prompt defaults to "no" and is shown at most once per runner.

        Raises:
            RuntimeError: If the user declines the overwrite.
        """
        if self._overwrite_confirmed:
            return

        prompt = (
            f"Output directory already exists at {output_path}. "
            "Overwrite it? This will remove existing run data."
        )
        proceed = click.confirm(prompt, default=False)

        if not proceed:
            raise RuntimeError(
                f"Profiling aborted to avoid overwriting existing output at {output_path}."
            )
        self._overwrite_confirmed = True

`export_hf_dataset(traces, path)` ¶

Export traces as a HuggingFace Arrow dataset.

Parameters:

Name	Type	Description	Default
`traces`	`list[QueryTrace]`	List of QueryTrace objects to export.	required
`path`	`Path`	Output directory for the Arrow dataset.	required

Returns:

Type	Description
`Path`	The path to the saved dataset directory.

Source code in intelligence-per-watt/src/ipw/execution/exporters.py

def export_hf_dataset(traces: list[QueryTrace], path: Path) -> Path:
    """Save the given traces to disk as a HuggingFace dataset.

    The traces are converted with ``QueryTrace.to_hf_dataset`` and the result
    is written with ``save_to_disk``.

    Args:
        traces: QueryTrace objects to export.
        path: Destination directory for the dataset.

    Returns:
        The same ``path`` that was passed in.
    """
    # Convert first so a failing conversion leaves the filesystem untouched.
    dataset = QueryTrace.to_hf_dataset(traces)
    destination = str(path)

    # Only the parent directory is created here.
    path.parent.mkdir(parents=True, exist_ok=True)
    dataset.save_to_disk(destination)
    return path

`export_jsonl(traces, path)` ¶

Export traces as JSONL (one JSON object per line).

Parameters:

Name	Type	Description	Default
`traces`	`list[QueryTrace]`	List of QueryTrace objects to export.	required
`path`	`Path`	Output file path. Parent directories are created if needed.	required

Returns:

Type	Description
`Path`	The path to the written file.

Source code in intelligence-per-watt/src/ipw/execution/exporters.py

def export_jsonl(traces: list[QueryTrace], path: Path) -> Path:
    """Write every trace to ``path`` as one JSON document per line.

    Each trace is serialized from the result of its ``to_dict()`` method.
    Missing parent directories are created, and an existing file at ``path``
    is truncated.

    Args:
        traces: QueryTrace objects to export.
        path: Destination file.

    Returns:
        The same ``path`` that was passed in.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w") as sink:
        # Lines are produced lazily, so traces are serialized one at a time.
        sink.writelines(json.dumps(item.to_dict()) + "\n" for item in traces)
    return path

`export_summary_json(traces, config, path)` ¶

Export aggregate summary as JSON.

Parameters:

Name	Type	Description	Default
`traces`	`list[QueryTrace]`	List of QueryTrace objects.	required
`config`	`dict[str, Any]`	Run configuration dictionary.	required
`path`	`Path`	Output file path.	required

Returns:

Type	Description
`Path`	The path to the written file.

Source code in intelligence-per-watt/src/ipw/execution/exporters.py

def export_summary_json(
    traces: list[QueryTrace],
    config: dict[str, Any],
    path: Path,
) -> Path:
    """Write run-wide totals and per-query averages to a JSON file.

    Counts, tokens and wall-clock time are summed over all traces. GPU energy
    and cost are summed over the traces that report a value; CPU energy is
    summed over the turns that report a value. An energy or cost total is
    ``None`` when no value was reported at all. Averages divide by the total
    number of traces; with no traces the numeric averages are ``0`` and the
    GPU average is ``None``.

    Args:
        traces: QueryTrace objects to aggregate.
        config: Run configuration, embedded verbatim under ``"config"``.
        path: Destination file; missing parent directories are created.

    Returns:
        The same ``path`` that was passed in.
    """

    def _sum_or_none(values: list) -> Any:
        """Add up ``values``, or return None when there is nothing to add."""
        return sum(values) if values else None

    query_count = len(traces)
    completed_count = len([trace for trace in traces if trace.completed])
    turn_count = sum(trace.num_turns for trace in traces)
    tool_call_count = sum(trace.total_tool_calls for trace in traces)

    input_token_count = sum(trace.total_input_tokens for trace in traces)
    output_token_count = sum(trace.total_output_tokens for trace in traces)

    wall_clock_total = sum(trace.total_wall_clock_s for trace in traces)

    gpu_energy_total = _sum_or_none(
        [
            trace.total_gpu_energy_joules
            for trace in traces
            if trace.total_gpu_energy_joules is not None
        ]
    )

    # CPU energy is only recorded per turn: sum within each trace first, then
    # across traces, skipping traces without any CPU reading.
    measured_per_trace = (
        [
            turn.cpu_energy_joules
            for turn in trace.turns
            if turn.cpu_energy_joules is not None
        ]
        for trace in traces
    )
    cpu_energy_total = _sum_or_none(
        [sum(measured) for measured in measured_per_trace if measured]
    )

    cost_total = _sum_or_none(
        [
            trace.total_cost_usd
            for trace in traces
            if trace.total_cost_usd is not None
        ]
    )

    if query_count > 0:
        mean_turns = turn_count / query_count
        mean_wall_clock = wall_clock_total / query_count
        mean_gpu_energy = (
            None if gpu_energy_total is None else gpu_energy_total / query_count
        )
    else:
        # Nothing to average over.
        mean_turns = 0
        mean_wall_clock = 0
        mean_gpu_energy = None

    totals = {
        "queries": query_count,
        "completed": completed_count,
        "turns": turn_count,
        "tool_calls": tool_call_count,
        "input_tokens": input_token_count,
        "output_tokens": output_token_count,
        "wall_clock_s": wall_clock_total,
        "gpu_energy_joules": gpu_energy_total,
        "cpu_energy_joules": cpu_energy_total,
        "cost_usd": cost_total,
    }
    averages = {
        "turns_per_query": mean_turns,
        "wall_clock_per_query_s": mean_wall_clock,
        "gpu_energy_per_query_joules": mean_gpu_energy,
    }
    summary = {
        "generated_at": time.time(),
        "config": config,
        "totals": totals,
        "averages": averages,
    }

    path.parent.mkdir(parents=True, exist_ok=True)
    # default=str stringifies any config value json cannot encode natively.
    path.write_text(json.dumps(summary, indent=2, default=str))
    return path

`derive_hardware_label(system_info, gpu_info)` ¶

Return a concise hardware label using GPU or CPU identifiers.

Source code in intelligence-per-watt/src/ipw/execution/hardware.py

def derive_hardware_label(
    system_info: Optional[SystemInfo | Mapping[str, object]],
    gpu_info: Optional[GpuInfo | Mapping[str, object]],
) -> str:
    """Build a short hardware label from the GPU name or the CPU brand string.

    The GPU ``name`` and the CPU ``cpu_brand`` fields are each split into
    alphanumeric tokens and reduced to a single label. Selection order:

    1. the GPU label, when it contains a digit;
    2. the CPU label, when it contains a digit;
    3. any GPU label, then any CPU label;
    4. the literal ``"UNKNOWN_HW"``.

    Args:
        system_info: Object or mapping providing ``cpu_brand``; may be None.
        gpu_info: Object or mapping providing ``name``; may be None.

    Returns:
        The chosen label, or ``"UNKNOWN_HW"`` when neither source yields one.
    """

    def _contains_alpha(text: str) -> bool:
        return any(map(str.isalpha, text))

    def _contains_digit(text: str) -> bool:
        return any(map(str.isdigit, text))

    def _tokenize(raw: Optional[str]) -> Sequence[str]:
        """Split ``raw`` into maximal runs of alphanumeric characters."""
        pieces: list[str] = []
        if not raw:
            return pieces
        run = ""
        for ch in raw:
            if ch.isalnum():
                run += ch
            elif run:
                pieces.append(run)
                run = ""
        if run:
            pieces.append(run)
        return pieces

    def _uppercase_if_caseless(label: str) -> str:
        """Upper-case labels whose letters contain no capital at all."""
        letters = [ch for ch in label if ch.isalpha()]
        if letters and all(not ch.isupper() for ch in letters):
            return label.upper()
        return label

    def _pairs_with_next(token: str) -> bool:
        """Return True when token should be combined with the next token."""
        digit_present = _contains_digit(token)
        if not _contains_alpha(token):
            return False
        # Long purely alphabetic words are never glued to the next token.
        if len(token) > 4 and not digit_present:
            return False
        return token.isupper() or token.islower() or digit_present

    def _join_with_next(token: str, following: Optional[str]) -> Optional[str]:
        """Concatenate token and following when they form one identifier."""
        if not following or not _contains_digit(following):
            return None
        token_alpha = _contains_alpha(token)
        token_digit = _contains_digit(token)
        if not (token_alpha or _contains_alpha(following)):
            return None
        # A purely numeric token never starts a combined label.
        if token_digit and not token_alpha:
            return None
        if not (token_digit or _pairs_with_next(token)):
            return None
        return token + following

    def _label_from(tokens: Sequence[str]) -> tuple[Optional[str], bool]:
        """Return (label, label_contains_digit) for a token sequence."""
        if not tokens:
            return None, False

        digit_token: Optional[str] = None
        last_alpha_token: Optional[str] = None

        followers = [*tokens[1:], None]
        for token, following in zip(tokens, followers):
            merged = _join_with_next(token, following)
            if merged:
                # The first successful combination wins outright.
                chosen = _uppercase_if_caseless(merged)
                return chosen, _contains_digit(chosen)

            if digit_token is None and _contains_digit(token):
                digit_token = token
            if _contains_alpha(token):
                last_alpha_token = token

        # Fallbacks: first token with a digit, then last token with a letter,
        # then simply the final token.
        if digit_token is not None:
            fallback = digit_token
        elif last_alpha_token is not None:
            fallback = last_alpha_token
        else:
            fallback = tokens[-1]
        chosen = _uppercase_if_caseless(fallback)
        return chosen, _contains_digit(chosen)

    def _field_text(obj: Optional[object], attr: str) -> str:
        """Read ``attr`` from a mapping or an object as a string ('' if absent)."""
        if obj is None:
            return ""
        if isinstance(obj, MappingABC):
            value = cast(Mapping[str, object], obj).get(attr)
            return "" if value is None else str(value)
        return str(getattr(obj, attr, "") or "")

    gpu_label: Optional[str] = None
    gpu_has_digit = False
    gpu_name = _field_text(gpu_info, "name")
    if gpu_name:
        gpu_label, gpu_has_digit = _label_from(_tokenize(gpu_name))
        if gpu_label and gpu_has_digit:
            return gpu_label

    cpu_label: Optional[str] = None
    cpu_has_digit = False
    cpu_brand = _field_text(system_info, "cpu_brand")
    if cpu_brand:
        cpu_label, cpu_has_digit = _label_from(_tokenize(cpu_brand))
        if cpu_label and cpu_has_digit and not gpu_has_digit:
            return cpu_label

    # Neither label carries a digit: take whatever exists, GPU first.
    return gpu_label or cpu_label or "UNKNOWN_HW"

Index

ipw.execution ¶

AgenticRunner ¶

traces property ¶

records property ¶

run(max_queries=None) async ¶

ProfilerRunner ¶

export_hf_dataset(traces, path) ¶

export_jsonl(traces, path) ¶

export_summary_json(traces, config, path) ¶

derive_hardware_label(system_info, gpu_info) ¶

`ipw.execution` ¶

`AgenticRunner` ¶

`traces` `property` ¶

`records` `property` ¶

`run(max_queries=None)` `async` ¶

`ProfilerRunner` ¶

`export_hf_dataset(traces, path)` ¶

`export_jsonl(traces, path)` ¶

`export_summary_json(traces, config, path)` ¶

`derive_hardware_label(system_info, gpu_info)` ¶