From c244f636711578ef7cccd769a88a4fdfc6ccf1e9 Mon Sep 17 00:00:00 2001
From: Chathurangi Shyalika
 <chathurangishyalika@Chathurangis-MacBook-Pro.local>
Date: Sun, 28 Jun 2026 20:22:41 -0400
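Subject: [PATCH] Fix OpenCode token reporting

Sum the per-step `tokens` buckets (input, output, reasoning, cache
read/write) that OpenCode emits, keeping the max-based handling of
SDK-style usage fields as a fallback. Evaluation metrics now fall back
to `raw_events` when the persisted turns report zero tokens.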

Signed-off-by: Chathurangi Shyalika <chathurangishyalika@Chathurangis-MacBook-Pro.local>
---
 src/agent/opencode_agent/runner.py            | 53 +++++++++++++--
 src/agent/opencode_agent/tests/test_runner.py | 36 ++++++++++
 src/evaluation/metrics.py                     | 67 +++++++++++++++++++
 src/evaluation/tests/test_metrics.py          | 46 +++++++++++++
 4 files changed, 196 insertions(+), 6 deletions(-)

diff --git a/src/agent/opencode_agent/runner.py b/src/agent/opencode_agent/runner.py
index 4457c050..442bd1f6 100644
--- a/src/agent/opencode_agent/runner.py
+++ b/src/agent/opencode_agent/runner.py
@@ -288,12 +288,55 @@ def _walk_dicts(value: Any):
             yield from _walk_dicts(child)
 
 
+def _token_count(value: Any) -> int:  # coerce a raw usage value to int, else 0
+    if isinstance(value, bool):  # bool is an int subclass; never treat it as a count
+        return 0
+    if isinstance(value, int):
+        return value  # returned as-is, including negative values
+    if isinstance(value, float) and value.is_integer():  # 33.0 -> 33; 33.5 -> 0 below
+        return int(value)
+    if isinstance(value, str):
+        try:
+            return int(value)
+        except ValueError:  # text that int() cannot parse counts as 0
+            return 0
+    return 0  # None, non-integral floats, and every other type
+
+
+def _opencode_usage_tokens(tokens: dict[str, Any]) -> tuple[int, int]:
+    """Return (input, output) totals for one OpenCode step-finish ``tokens`` dict."""
+    cache = tokens.get("cache") if isinstance(tokens.get("cache"), dict) else {}
+    input_tokens = (  # input side = "input" + cache "read" + cache "write"
+        _token_count(tokens.get("input"))
+        + _token_count(cache.get("read"))
+        + _token_count(cache.get("write"))
+    )
+    output_tokens = _token_count(tokens.get("output")) + _token_count(
+        tokens.get("reasoning")
+    )  # output side = "output" + "reasoning"
+    return input_tokens, output_tokens
+
+
 def _usage_from_events(events: list[dict[str, Any]]) -> tuple[int, int]:
-    """Extract a conservative max token usage from possibly cumulative events."""
+    """Extract token usage from OpenCode events.
+
+    OpenCode emits per-step usage as ``tokens.input`` / ``tokens.output`` plus
+    cache and reasoning buckets. Older/test fixtures may use SDK-style usage
+    names, which are treated as possibly cumulative and deduplicated by max.
+    """
     input_tokens = 0
     output_tokens = 0
+    sdk_input_tokens = 0
+    sdk_output_tokens = 0
     for event in events:
         for item in _walk_dicts(event):
+            tokens = item.get("tokens")
+            if isinstance(tokens, dict):
+                in_value, out_value = _opencode_usage_tokens(tokens)
+                input_tokens += in_value
+                output_tokens += out_value
+                continue
+
             in_value = (
                 item.get("input_tokens")
                 or item.get("inputTokens")
@@ -306,11 +349,9 @@ def _usage_from_events(events: list[dict[str, Any]]) -> tuple[int, int]:
                 or item.get("completion_tokens")
                 or item.get("completionTokens")
             )
-            if isinstance(in_value, int):
-                input_tokens = max(input_tokens, in_value)
-            if isinstance(out_value, int):
-                output_tokens = max(output_tokens, out_value)
-    return input_tokens, output_tokens
+            sdk_input_tokens = max(sdk_input_tokens, _token_count(in_value))
+            sdk_output_tokens = max(sdk_output_tokens, _token_count(out_value))
+    return input_tokens or sdk_input_tokens, output_tokens or sdk_output_tokens
 
 
 def _build_trajectory_from_events(
diff --git a/src/agent/opencode_agent/tests/test_runner.py b/src/agent/opencode_agent/tests/test_runner.py
index d1391df6..4f861132 100644
--- a/src/agent/opencode_agent/tests/test_runner.py
+++ b/src/agent/opencode_agent/tests/test_runner.py
@@ -179,6 +179,42 @@ def test_build_trajectory_from_text_and_tool_parts():
     assert trajectory.turns[0].tool_calls[0].name == "iot_get_asset"
 
 
+def test_build_trajectory_from_opencode_step_finish_usage():
+    events = [  # two step_finish events, each with its own "tokens" buckets
+        {
+            "type": "step_finish",
+            "part": {
+                "type": "step-finish",
+                "tokens": {
+                    "input": 13084,
+                    "output": 33,
+                    "reasoning": 61,
+                    "cache": {"read": 128, "write": 2},
+                },
+            },
+        },
+        {
+            "type": "step_finish",
+            "part": {
+                "type": "step-finish",
+                "tokens": {
+                    "input": 209,
+                    "output": 84,
+                    "reasoning": 17,
+                    "cache": {"read": 13184, "write": 0},
+                },
+            },
+        },
+    ]
+
+    answer, trajectory = _build_trajectory_from_events(events, [])
+
+    assert answer == ""
+    assert len(trajectory.turns) == 1
+    assert trajectory.turns[0].input_tokens == 26607  # (13084+128+2) + (209+13184+0)
+    assert trajectory.turns[0].output_tokens == 195  # (33+61) + (84+17)
+
+
 def test_runner_defaults():
     runner = OpenCodeAgentRunner(server_paths={}, model="opencode/gpt-5")
     assert runner._model_id == "opencode/gpt-5"
diff --git a/src/evaluation/metrics.py b/src/evaluation/metrics.py
index 325074a7..9450c310 100644
--- a/src/evaluation/metrics.py
+++ b/src/evaluation/metrics.py
@@ -39,6 +39,10 @@ def _from_sdk_trajectory(traj: dict, model: str) -> OpsMetrics:
     turns = traj.get("turns", []) or []
     tokens_in = sum(int(t.get("input_tokens") or 0) for t in turns)
     tokens_out = sum(int(t.get("output_tokens") or 0) for t in turns)
+    if tokens_in == 0 and tokens_out == 0:
+        raw_events = traj.get("raw_events") or []
+        if isinstance(raw_events, list):
+            tokens_in, tokens_out = _usage_from_raw_events(raw_events)
 
     durations_ms = [t.get("duration_ms") for t in turns if t.get("duration_ms") is not None]
     duration_ms = sum(durations_ms) if durations_ms else None
@@ -61,6 +65,69 @@ def _from_sdk_trajectory(traj: dict, model: str) -> OpsMetrics:
     )
 
 
+def _walk_dicts(value: Any):  # yield each dict found via dict values/lists, parent first
+    if isinstance(value, dict):
+        yield value
+        for child in value.values():  # only values are descended into, not keys
+            yield from _walk_dicts(child)
+    elif isinstance(value, list):
+        for child in value:
+            yield from _walk_dicts(child)  # any other type yields nothing
+
+
+def _token_count(value: Any) -> int:  # coerce a raw usage value to int, else 0
+    if isinstance(value, bool):  # bool is an int subclass; never treat it as a count
+        return 0
+    if isinstance(value, int):
+        return value  # returned as-is, including negative values
+    if isinstance(value, float) and value.is_integer():  # 33.0 -> 33; 33.5 -> 0 below
+        return int(value)
+    if isinstance(value, str):
+        try:
+            return int(value)
+        except ValueError:  # text that int() cannot parse counts as 0
+            return 0
+    return 0  # None, non-integral floats, and every other type
+
+
+def _usage_from_raw_events(events: list[Any]) -> tuple[int, int]:
+    """Return summed ``tokens`` usage; a zero side falls back to the SDK-key max."""
+    input_tokens = 0  # running sums over every dict that holds a "tokens" dict
+    output_tokens = 0
+    sdk_input_tokens = 0  # running maxima over SDK-style usage keys
+    sdk_output_tokens = 0
+    for event in events:
+        for item in _walk_dicts(event):
+            tokens = item.get("tokens")
+            if isinstance(tokens, dict):
+                cache = tokens.get("cache") if isinstance(tokens.get("cache"), dict) else {}
+                input_tokens += (  # "input" + cache "read" + cache "write"
+                    _token_count(tokens.get("input"))
+                    + _token_count(cache.get("read"))
+                    + _token_count(cache.get("write"))
+                )
+                output_tokens += _token_count(tokens.get("output")) + _token_count(
+                    tokens.get("reasoning")
+                )  # "output" + "reasoning"
+                continue  # this dict's SDK-style keys, if any, are not read
+            # SDK-style aliases: first truthy value per side, combined by max (not sum).
+            in_value = (
+                item.get("input_tokens")
+                or item.get("inputTokens")
+                or item.get("prompt_tokens")
+                or item.get("promptTokens")
+            )
+            out_value = (
+                item.get("output_tokens")
+                or item.get("outputTokens")
+                or item.get("completion_tokens")
+                or item.get("completionTokens")
+            )
+            sdk_input_tokens = max(sdk_input_tokens, _token_count(in_value))
+            sdk_output_tokens = max(sdk_output_tokens, _token_count(out_value))
+    return input_tokens or sdk_input_tokens, output_tokens or sdk_output_tokens
+
+
 def _from_plan_execute(steps: list[Any], model: str) -> OpsMetrics:
     # plan-execute persists ``list[StepResult]``; the dataclass exposes
     # ``server`` / ``tool`` / ``response`` fields but no per-step token
diff --git a/src/evaluation/tests/test_metrics.py b/src/evaluation/tests/test_metrics.py
index 21f097b1..d1bce483 100644
--- a/src/evaluation/tests/test_metrics.py
+++ b/src/evaluation/tests/test_metrics.py
@@ -43,6 +43,52 @@ def test_handles_none_trajectory(self, make_persisted_record):
         rec = PersistedTrajectory.from_raw(make_persisted_record(trajectory=None))
         assert metrics_from_trajectory(rec) == OpsMetrics()
 
+    def test_sdk_trajectory_falls_back_to_raw_event_tokens(self, make_persisted_record):
+        rec = PersistedTrajectory.from_raw(
+            make_persisted_record(
+                trajectory={
+                    "turns": [
+                        {
+                            "index": 0,
+                            "text": "answer",
+                            "tool_calls": [],
+                            "input_tokens": 0,  # the only turn reports zero usage
+                            "output_tokens": 0,
+                        }
+                    ],
+                    "raw_events": [  # usage is only present in these raw events
+                        {
+                            "type": "step_finish",
+                            "part": {
+                                "tokens": {
+                                    "input": 13084,
+                                    "output": 33,
+                                    "reasoning": 61,
+                                    "cache": {"read": 128, "write": 2},
+                                }
+                            },
+                        },
+                        {
+                            "type": "step_finish",
+                            "part": {
+                                "tokens": {
+                                    "input": 209,
+                                    "output": 84,
+                                    "reasoning": 17,
+                                    "cache": {"read": 13184, "write": 0},
+                                }
+                            },
+                        },
+                    ],
+                }
+            )
+        )
+
+        m = metrics_from_trajectory(rec)
+
+        assert m.tokens_in == 26607  # (13084+128+2) + (209+13184+0)
+        assert m.tokens_out == 195  # (33+61) + (84+17)
+
     def test_plan_execute_list_trajectory(self, make_persisted_record):
         rec = PersistedTrajectory.from_raw(
             make_persisted_record(