Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 47 additions & 6 deletions src/agent/opencode_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,12 +288,55 @@ def _walk_dicts(value: Any):
yield from _walk_dicts(child)


def _token_count(value: Any) -> int:
if isinstance(value, bool):
return 0
if isinstance(value, int):
return value
if isinstance(value, float) and value.is_integer():
return int(value)
if isinstance(value, str):
try:
return int(value)
except ValueError:
return 0
return 0


def _opencode_usage_tokens(tokens: dict[str, Any]) -> tuple[int, int]:
    """Collapse one ``tokens`` mapping into ``(input, output)`` totals.

    The input side is ``input`` plus the ``read`` and ``write`` entries of the
    nested ``cache`` mapping; the output side is ``output`` plus
    ``reasoning``. A missing or non-dict ``cache`` contributes nothing, and
    every individual value is normalised through ``_token_count``.
    """
    cache_bucket = tokens.get("cache")
    if not isinstance(cache_bucket, dict):
        cache_bucket = {}

    prompt_side = sum(
        _token_count(raw)
        for raw in (
            tokens.get("input"),
            cache_bucket.get("read"),
            cache_bucket.get("write"),
        )
    )
    completion_side = sum(
        _token_count(tokens.get(field)) for field in ("output", "reasoning")
    )
    return prompt_side, completion_side


def _usage_from_events(events: list[dict[str, Any]]) -> tuple[int, int]:
"""Extract a conservative max token usage from possibly cumulative events."""
"""Extract token usage from OpenCode events.

OpenCode emits per-step usage as ``tokens.input`` / ``tokens.output`` plus
cache and reasoning buckets. Older/test fixtures may use SDK-style usage
names, which are treated as possibly cumulative and deduplicated by max.
"""
input_tokens = 0
output_tokens = 0
sdk_input_tokens = 0
sdk_output_tokens = 0
for event in events:
for item in _walk_dicts(event):
tokens = item.get("tokens")
if isinstance(tokens, dict):
in_value, out_value = _opencode_usage_tokens(tokens)
input_tokens += in_value
output_tokens += out_value
continue

in_value = (
item.get("input_tokens")
or item.get("inputTokens")
Expand All @@ -306,11 +349,9 @@ def _usage_from_events(events: list[dict[str, Any]]) -> tuple[int, int]:
or item.get("completion_tokens")
or item.get("completionTokens")
)
if isinstance(in_value, int):
input_tokens = max(input_tokens, in_value)
if isinstance(out_value, int):
output_tokens = max(output_tokens, out_value)
return input_tokens, output_tokens
sdk_input_tokens = max(sdk_input_tokens, _token_count(in_value))
sdk_output_tokens = max(sdk_output_tokens, _token_count(out_value))
return input_tokens or sdk_input_tokens, output_tokens or sdk_output_tokens


def _build_trajectory_from_events(
Expand Down
36 changes: 36 additions & 0 deletions src/agent/opencode_agent/tests/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,42 @@ def test_build_trajectory_from_text_and_tool_parts():
assert trajectory.turns[0].tool_calls[0].name == "iot_get_asset"


def test_build_trajectory_from_opencode_step_finish_usage():
    """Two step-finish events have their token buckets summed into one turn."""
    step_usages = [
        {
            "input": 13084,
            "output": 33,
            "reasoning": 61,
            "cache": {"read": 128, "write": 2},
        },
        {
            "input": 209,
            "output": 84,
            "reasoning": 17,
            "cache": {"read": 13184, "write": 0},
        },
    ]
    events = [
        {
            "type": "step_finish",
            "part": {"type": "step-finish", "tokens": usage},
        }
        for usage in step_usages
    ]

    answer, trajectory = _build_trajectory_from_events(events, [])

    assert answer == ""
    assert len(trajectory.turns) == 1
    only_turn = trajectory.turns[0]
    # 13084 + 128 + 2 + 209 + 13184 + 0
    assert only_turn.input_tokens == 26607
    # 33 + 61 + 84 + 17
    assert only_turn.output_tokens == 195


def test_runner_defaults():
runner = OpenCodeAgentRunner(server_paths={}, model="opencode/gpt-5")
assert runner._model_id == "opencode/gpt-5"
Expand Down
67 changes: 67 additions & 0 deletions src/evaluation/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ def _from_sdk_trajectory(traj: dict, model: str) -> OpsMetrics:
turns = traj.get("turns", []) or []
tokens_in = sum(int(t.get("input_tokens") or 0) for t in turns)
tokens_out = sum(int(t.get("output_tokens") or 0) for t in turns)
if tokens_in == 0 and tokens_out == 0:
raw_events = traj.get("raw_events") or []
if isinstance(raw_events, list):
tokens_in, tokens_out = _usage_from_raw_events(raw_events)

durations_ms = [t.get("duration_ms") for t in turns if t.get("duration_ms") is not None]
duration_ms = sum(durations_ms) if durations_ms else None
Expand All @@ -61,6 +65,69 @@ def _from_sdk_trajectory(traj: dict, model: str) -> OpsMetrics:
)


def _walk_dicts(value: Any):
if isinstance(value, dict):
yield value
for child in value.values():
yield from _walk_dicts(child)
elif isinstance(value, list):
for child in value:
yield from _walk_dicts(child)


def _token_count(value: Any) -> int:
if isinstance(value, bool):
return 0
if isinstance(value, int):
return value
if isinstance(value, float) and value.is_integer():
return int(value)
if isinstance(value, str):
try:
return int(value)
except ValueError:
return 0
return 0


def _usage_from_raw_events(events: list[Any]) -> tuple[int, int]:
    """Derive ``(input, output)`` token totals from raw events.

    Every dict reachable from each event is inspected. A dict whose
    ``tokens`` entry is itself a dict is summed: input + cache read + cache
    write on the input side, output + reasoning on the output side. Any other
    dict is checked for SDK-style usage keys, of which only the largest value
    seen is kept. Each returned total is the summed figure when it is
    non-zero and the SDK-style maximum otherwise.
    """
    prompt_keys = ("input_tokens", "inputTokens", "prompt_tokens", "promptTokens")
    completion_keys = (
        "output_tokens",
        "outputTokens",
        "completion_tokens",
        "completionTokens",
    )

    def _first_truthy(record, keys):
        # Mirrors an ``a or b or c or d`` chain: first truthy value, else the last.
        found = None
        for key in keys:
            found = record.get(key)
            if found:
                break
        return found

    summed_in = 0
    summed_out = 0
    peak_in = 0
    peak_out = 0
    for event in events:
        for record in _walk_dicts(event):
            bucket = record.get("tokens")
            if not isinstance(bucket, dict):
                peak_in = max(
                    peak_in, _token_count(_first_truthy(record, prompt_keys))
                )
                peak_out = max(
                    peak_out, _token_count(_first_truthy(record, completion_keys))
                )
                continue

            cached = bucket.get("cache")
            if not isinstance(cached, dict):
                cached = {}
            summed_in += (
                _token_count(bucket.get("input"))
                + _token_count(cached.get("read"))
                + _token_count(cached.get("write"))
            )
            summed_out += _token_count(bucket.get("output")) + _token_count(
                bucket.get("reasoning")
            )
    return summed_in or peak_in, summed_out or peak_out


def _from_plan_execute(steps: list[Any], model: str) -> OpsMetrics:
# plan-execute persists ``list[StepResult]``; the dataclass exposes
# ``server`` / ``tool`` / ``response`` fields but no per-step token
Expand Down
46 changes: 46 additions & 0 deletions src/evaluation/tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,52 @@ def test_handles_none_trajectory(self, make_persisted_record):
rec = PersistedTrajectory.from_raw(make_persisted_record(trajectory=None))
assert metrics_from_trajectory(rec) == OpsMetrics()

def test_sdk_trajectory_falls_back_to_raw_event_tokens(self, make_persisted_record):
    """Zero per-turn usage makes the metrics come from ``raw_events`` instead."""
    zero_usage_turn = {
        "index": 0,
        "text": "answer",
        "tool_calls": [],
        "input_tokens": 0,
        "output_tokens": 0,
    }
    step_usages = [
        {
            "input": 13084,
            "output": 33,
            "reasoning": 61,
            "cache": {"read": 128, "write": 2},
        },
        {
            "input": 209,
            "output": 84,
            "reasoning": 17,
            "cache": {"read": 13184, "write": 0},
        },
    ]
    raw_events = [
        {"type": "step_finish", "part": {"tokens": usage}}
        for usage in step_usages
    ]
    persisted = make_persisted_record(
        trajectory={"turns": [zero_usage_turn], "raw_events": raw_events}
    )
    rec = PersistedTrajectory.from_raw(persisted)

    m = metrics_from_trajectory(rec)

    # 13084 + 128 + 2 + 209 + 13184 + 0
    assert m.tokens_in == 26607
    # 33 + 61 + 84 + 17
    assert m.tokens_out == 195

def test_plan_execute_list_trajectory(self, make_persisted_record):
rec = PersistedTrajectory.from_raw(
make_persisted_record(
Expand Down