From c244f636711578ef7cccd769a88a4fdfc6ccf1e9 Mon Sep 17 00:00:00 2001 From: Chathurangi Shyalika Date: Sun, 28 Jun 2026 20:22:41 -0400 Subject: [PATCH] fixing opencode token reporting Signed-off-by: Chathurangi Shyalika --- src/agent/opencode_agent/runner.py | 53 +++++++++++++-- src/agent/opencode_agent/tests/test_runner.py | 36 ++++++++++ src/evaluation/metrics.py | 67 +++++++++++++++++++ src/evaluation/tests/test_metrics.py | 46 +++++++++++++ 4 files changed, 196 insertions(+), 6 deletions(-) diff --git a/src/agent/opencode_agent/runner.py b/src/agent/opencode_agent/runner.py index 4457c050..442bd1f6 100644 --- a/src/agent/opencode_agent/runner.py +++ b/src/agent/opencode_agent/runner.py @@ -288,12 +288,55 @@ def _walk_dicts(value: Any): yield from _walk_dicts(child) +def _token_count(value: Any) -> int: + if isinstance(value, bool): + return 0 + if isinstance(value, int): + return value + if isinstance(value, float) and value.is_integer(): + return int(value) + if isinstance(value, str): + try: + return int(value) + except ValueError: + return 0 + return 0 + + +def _opencode_usage_tokens(tokens: dict[str, Any]) -> tuple[int, int]: + """Return input/output totals from OpenCode's step-finish token schema.""" + cache = tokens.get("cache") if isinstance(tokens.get("cache"), dict) else {} + input_tokens = ( + _token_count(tokens.get("input")) + + _token_count(cache.get("read")) + + _token_count(cache.get("write")) + ) + output_tokens = _token_count(tokens.get("output")) + _token_count( + tokens.get("reasoning") + ) + return input_tokens, output_tokens + + def _usage_from_events(events: list[dict[str, Any]]) -> tuple[int, int]: - """Extract a conservative max token usage from possibly cumulative events.""" + """Extract token usage from OpenCode events. + + OpenCode emits per-step usage as ``tokens.input`` / ``tokens.output`` plus + cache and reasoning buckets. Older/test fixtures may use SDK-style usage + names, which are treated as possibly cumulative and deduplicated by max. + """ input_tokens = 0 output_tokens = 0 + sdk_input_tokens = 0 + sdk_output_tokens = 0 for event in events: for item in _walk_dicts(event): + tokens = item.get("tokens") + if isinstance(tokens, dict): + in_value, out_value = _opencode_usage_tokens(tokens) + input_tokens += in_value + output_tokens += out_value + continue + in_value = ( item.get("input_tokens") or item.get("inputTokens") @@ -306,11 +349,9 @@ def _usage_from_events(events: list[dict[str, Any]]) -> tuple[int, int]: or item.get("completion_tokens") or item.get("completionTokens") ) - if isinstance(in_value, int): - input_tokens = max(input_tokens, in_value) - if isinstance(out_value, int): - output_tokens = max(output_tokens, out_value) - return input_tokens, output_tokens + sdk_input_tokens = max(sdk_input_tokens, _token_count(in_value)) + sdk_output_tokens = max(sdk_output_tokens, _token_count(out_value)) + return input_tokens or sdk_input_tokens, output_tokens or sdk_output_tokens def _build_trajectory_from_events( diff --git a/src/agent/opencode_agent/tests/test_runner.py b/src/agent/opencode_agent/tests/test_runner.py index d1391df6..4f861132 100644 --- a/src/agent/opencode_agent/tests/test_runner.py +++ b/src/agent/opencode_agent/tests/test_runner.py @@ -179,6 +179,42 @@ def test_build_trajectory_from_text_and_tool_parts(): assert trajectory.turns[0].tool_calls[0].name == "iot_get_asset" +def test_build_trajectory_from_opencode_step_finish_usage(): + events = [ + { + "type": "step_finish", + "part": { + "type": "step-finish", + "tokens": { + "input": 13084, + "output": 33, + "reasoning": 61, + "cache": {"read": 128, "write": 2}, + }, + }, + }, + { + "type": "step_finish", + "part": { + "type": "step-finish", + "tokens": { + "input": 209, + "output": 84, + "reasoning": 17, + "cache": {"read": 13184, "write": 0}, + }, + }, + }, + ] + + answer, trajectory = _build_trajectory_from_events(events, []) + + assert answer == "" + assert len(trajectory.turns) == 1 + assert trajectory.turns[0].input_tokens == 26607 + assert trajectory.turns[0].output_tokens == 195 + + def test_runner_defaults(): runner = OpenCodeAgentRunner(server_paths={}, model="opencode/gpt-5") assert runner._model_id == "opencode/gpt-5" diff --git a/src/evaluation/metrics.py b/src/evaluation/metrics.py index 325074a7..9450c310 100644 --- a/src/evaluation/metrics.py +++ b/src/evaluation/metrics.py @@ -39,6 +39,10 @@ def _from_sdk_trajectory(traj: dict, model: str) -> OpsMetrics: turns = traj.get("turns", []) or [] tokens_in = sum(int(t.get("input_tokens") or 0) for t in turns) tokens_out = sum(int(t.get("output_tokens") or 0) for t in turns) + if tokens_in == 0 and tokens_out == 0: + raw_events = traj.get("raw_events") or [] + if isinstance(raw_events, list): + tokens_in, tokens_out = _usage_from_raw_events(raw_events) durations_ms = [t.get("duration_ms") for t in turns if t.get("duration_ms") is not None] duration_ms = sum(durations_ms) if durations_ms else None @@ -61,6 +65,69 @@ def _from_sdk_trajectory(traj: dict, model: str) -> OpsMetrics: ) +def _walk_dicts(value: Any): + if isinstance(value, dict): + yield value + for child in value.values(): + yield from _walk_dicts(child) + elif isinstance(value, list): + for child in value: + yield from _walk_dicts(child) + + +def _token_count(value: Any) -> int: + if isinstance(value, bool): + return 0 + if isinstance(value, int): + return value + if isinstance(value, float) and value.is_integer(): + return int(value) + if isinstance(value, str): + try: + return int(value) + except ValueError: + return 0 + return 0 + + +def _usage_from_raw_events(events: list[Any]) -> tuple[int, int]: + """Extract token usage from raw event schemas kept for parser fallback.""" + input_tokens = 0 + output_tokens = 0 + sdk_input_tokens = 0 + sdk_output_tokens = 0 + for event in events: + for item in _walk_dicts(event): + tokens = item.get("tokens") + if isinstance(tokens, dict): + cache = tokens.get("cache") if isinstance(tokens.get("cache"), dict) else {} + input_tokens += ( + _token_count(tokens.get("input")) + + _token_count(cache.get("read")) + + _token_count(cache.get("write")) + ) + output_tokens += _token_count(tokens.get("output")) + _token_count( + tokens.get("reasoning") + ) + continue + + in_value = ( + item.get("input_tokens") + or item.get("inputTokens") + or item.get("prompt_tokens") + or item.get("promptTokens") + ) + out_value = ( + item.get("output_tokens") + or item.get("outputTokens") + or item.get("completion_tokens") + or item.get("completionTokens") + ) + sdk_input_tokens = max(sdk_input_tokens, _token_count(in_value)) + sdk_output_tokens = max(sdk_output_tokens, _token_count(out_value)) + return input_tokens or sdk_input_tokens, output_tokens or sdk_output_tokens + + def _from_plan_execute(steps: list[Any], model: str) -> OpsMetrics: # plan-execute persists ``list[StepResult]``; the dataclass exposes # ``server`` / ``tool`` / ``response`` fields but no per-step token diff --git a/src/evaluation/tests/test_metrics.py b/src/evaluation/tests/test_metrics.py index 21f097b1..d1bce483 100644 --- a/src/evaluation/tests/test_metrics.py +++ b/src/evaluation/tests/test_metrics.py @@ -43,6 +43,52 @@ def test_handles_none_trajectory(self, make_persisted_record): rec = PersistedTrajectory.from_raw(make_persisted_record(trajectory=None)) assert metrics_from_trajectory(rec) == OpsMetrics() + def test_sdk_trajectory_falls_back_to_raw_event_tokens(self, make_persisted_record): + rec = PersistedTrajectory.from_raw( + make_persisted_record( + trajectory={ + "turns": [ + { + "index": 0, + "text": "answer", + "tool_calls": [], + "input_tokens": 0, + "output_tokens": 0, + } + ], + "raw_events": [ + { + "type": "step_finish", + "part": { + "tokens": { + "input": 13084, + "output": 33, + "reasoning": 61, + "cache": {"read": 128, "write": 2}, + } + }, + }, + { + "type": "step_finish", + "part": { + "tokens": { + "input": 209, + "output": 84, + "reasoning": 17, + "cache": {"read": 13184, "write": 0}, + } + }, + }, + ], + } + ) + ) + + m = metrics_from_trajectory(rec) + + assert m.tokens_in == 26607 + assert m.tokens_out == 195 + def test_plan_execute_list_trajectory(self, make_persisted_record): rec = PersistedTrajectory.from_raw( make_persisted_record(