Skip to content

Commit b950a22

Browse files
authored
fix(guard): use UTF-8 safe string truncation in output preview logging (#3713)
## Summary Fixes #3711 — the WASM guard panics on multi-byte UTF-8 characters in tool response previews, poisoning the entire session. ## Root cause `label_response` in `lib.rs` truncates serialized JSON for debug logging using byte-index slicing (`&output_json[..500]`). When byte 500 falls in the middle of a multi-byte UTF-8 code point (CJK = 3 bytes, emoji = 4 bytes), Rust panics with "byte index is not a char boundary". Since this runs inside the WASM guest, the panic becomes a trap that permanently poisons the guard instance. ## Changes **New helper** — `safe_preview(s, max_bytes) -> &str`: - Walks backwards from the byte limit with `str::is_char_boundary()` to find the nearest valid character boundary at or before the limit - Returns the full string when it fits within the limit **Three call sites fixed** in `label_response`: | Location | Before | After | |----------|--------|-------| | Path-specific output preview (~L808) | `&output_json[..500]` — **panics** | `safe_preview(&output_json, PREVIEW_MAX_BYTES)` | | General output preview (~L939) | `&output_json[..500]` — **panics** | `safe_preview(&output_json, PREVIEW_MAX_BYTES)` | | Input preview (~L752) | `from_utf8(&bytes[..500])` — silent drop | `from_utf8` on the first `PREVIEW_MAX_BYTES` bytes with a `valid_up_to()` fallback — logs the valid UTF-8 prefix | **8 unit tests** covering: - ASCII strings (under, at, and over the 500-byte limit) - CJK characters (3-byte UTF-8) crossing the boundary - Emoji (4-byte UTF-8) crossing the boundary - Accented characters (2-byte UTF-8) at exact boundary - Mixed ASCII + CJK content simulating real JSON payloads - Empty string edge case ## Evidence Discovered in `moeru-ai/airi` PR triage workflow ([run #24311673575](https://github.com/moeru-ai/airi/actions/runs/24311673575)) — a PR with a Chinese body caused the guard to crash, and all subsequent MCP calls failed with "WASM guard is unavailable after a previous trap". ## Verification `make agent-finished` passes — all unit + integration tests green.
2 parents 20a28d2 + c0dc2a4 commit b950a22

File tree

1 file changed

+118
-13
lines changed
  • guards/github-guard/rust-guard/src

1 file changed

+118
-13
lines changed

guards/github-guard/rust-guard/src/lib.rs

Lines changed: 118 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,23 @@ use std::sync::Mutex;
2626
const POLICY_SCOPE_ALL: &str = "all";
2727
const POLICY_SCOPE_PUBLIC: &str = "public";
2828

29+
/// Maximum number of bytes to include in a log preview of serialized JSON.
30+
const PREVIEW_MAX_BYTES: usize = 500;
31+
32+
/// Returns the longest prefix of `s` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary.
///
/// When `s` already fits (`s.len() <= max_bytes`) it is returned unchanged.
/// Otherwise the cut point moves backwards from `max_bytes` until it no longer
/// splits a multi-byte character, so the slice below can never panic.
fn safe_preview(s: &str, max_bytes: usize) -> &str {
    if s.len() > max_bytes {
        // Offset 0 is always a character boundary, so the search always finds
        // a cut point; the fallback only keeps this path free of `unwrap`.
        let cut = (0..=max_bytes)
            .rev()
            .find(|&idx| s.is_char_boundary(idx))
            .unwrap_or(0);
        &s[..cut]
    } else {
        s
    }
}
45+
2946
/// Global policy context for WASM runtime entry points.
3047
///
3148
/// `label_agent` stores the parsed policy here; `label_resource` and
@@ -748,9 +765,22 @@ pub extern "C" fn label_response(
748765
// Read input bytes
749766
let input_bytes = unsafe { slice::from_raw_parts(input_ptr as *const u8, input_len as usize) };
750767

751-
// Log first 500 chars of input to debug structure
752-
let preview_len = std::cmp::min(500, input_bytes.len());
753-
if let Ok(preview) = std::str::from_utf8(&input_bytes[..preview_len]) {
768+
// Log a bounded preview of the input for debugging.
769+
// Only decode up to PREVIEW_MAX_BYTES so logging stays cheap, and
770+
// if the prefix ends mid-codepoint, fall back to the valid UTF-8 prefix.
771+
let preview_bytes = &input_bytes[..input_bytes.len().min(PREVIEW_MAX_BYTES)];
772+
let preview = match std::str::from_utf8(preview_bytes) {
773+
Ok(s) => s,
774+
Err(e) => {
775+
let valid_up_to = e.valid_up_to();
776+
if valid_up_to == 0 {
777+
""
778+
} else {
779+
std::str::from_utf8(&preview_bytes[..valid_up_to]).unwrap_or("")
780+
}
781+
}
782+
};
783+
if !preview.is_empty() {
754784
log_info(&format!(" input_preview={}", preview));
755785
}
756786

@@ -804,11 +834,7 @@ pub extern "C" fn label_response(
804834
};
805835

806836
// Log output preview for debugging
807-
let output_preview = if output_json.len() > 500 {
808-
&output_json[..500]
809-
} else {
810-
&output_json
811-
};
837+
let output_preview = safe_preview(&output_json, PREVIEW_MAX_BYTES);
812838
log_info(&format!(" path_output_preview={}", output_preview));
813839

814840
if output_json.len() as u32 > output_size {
@@ -935,11 +961,7 @@ pub extern "C" fn label_response(
935961
};
936962

937963
// Log output preview for debugging
938-
let output_preview = if output_json.len() > 500 {
939-
&output_json[..500]
940-
} else {
941-
&output_json
942-
};
964+
let output_preview = safe_preview(&output_json, PREVIEW_MAX_BYTES);
943965
log_info(&format!(" output_preview={}", output_preview));
944966

945967
if output_json.len() as u32 > output_size {
@@ -1177,4 +1199,87 @@ mod tests {
11771199
final_integrity
11781200
);
11791201
}
1202+
1203+
// === UTF-8 safe preview tests (issue #3711) ===
1204+
1205+
#[test]
fn test_safe_preview_ascii_under_limit() {
    // A short ASCII input fits inside the limit and must come back unchanged.
    let input = "hello";
    let preview = safe_preview(input, 500);
    assert_eq!(preview, "hello");
}
1210+
1211+
#[test]
fn test_safe_preview_ascii_at_limit() {
    // Exactly 500 bytes of ASCII: the input fits, so nothing is cut off.
    let exact = "a".repeat(500);
    let preview = safe_preview(&exact, 500);
    assert_eq!(preview, exact.as_str());
}
1216+
1217+
#[test]
fn test_safe_preview_ascii_over_limit() {
    // 600 bytes of single-byte characters: every offset is a boundary, so the
    // preview is cut at exactly the 500-byte limit.
    let long_input = "a".repeat(600);
    let preview = safe_preview(&long_input, 500);
    assert_eq!(preview.len(), 500);
}
1222+
1223+
#[test]
fn test_safe_preview_cjk_boundary() {
    // "中" is 3 bytes in UTF-8 and 500 is not a multiple of 3, so byte 500
    // lands inside a character: 166 chars = 498 bytes, 167 chars = 501 bytes.
    let cjk = "中".repeat(167);
    assert_eq!(cjk.len(), 501);

    // The preview must back off to the last boundary at or before byte 500,
    // which is byte 498 (166 complete characters).
    let preview = safe_preview(&cjk, 500);
    assert_eq!(preview.len(), 498);
    assert_eq!(preview.chars().count(), 166);
}
1236+
1237+
#[test]
fn test_safe_preview_emoji_boundary() {
    // "🎉" is 4 bytes in UTF-8, so 125 emojis end exactly on byte 500 and the
    // 126th occupies bytes 500..504.
    let emoji = "🎉".repeat(126);
    assert_eq!(emoji.len(), 504);

    // Byte 500 is already a boundary: the preview keeps 125 whole emojis.
    let preview = safe_preview(&emoji, 500);
    assert_eq!(preview.len(), 500);
    assert_eq!(preview.chars().count(), 125);
}
1249+
1250+
#[test]
fn test_safe_preview_mixed_content_near_boundary() {
    // Simulates a JSON payload of the form {"body":"<padding>中中中中中"} in
    // which a CJK value straddles byte 500.
    let prefix = "{\"body\":\""; // 9 bytes
    let padding = "x".repeat(489); // 489 bytes — running total: 498
    let cjk_tail = "中中中中中"; // 5 × 3 = 15 bytes — running total: 513

    // The closing `"}` adds 2 bytes (`}}` in the format string is an escaped
    // `}`), giving 515 bytes in total. Pin the exact size so that the offsets
    // asserted below cannot silently drift if the fixture is edited.
    let json = format!("{}{}{}\"}}", prefix, padding, cjk_tail);
    assert_eq!(json.len(), 515);

    let preview = safe_preview(&json, 500);
    // The first CJK character occupies bytes 498..501, so byte 500 is
    // mid-character and the preview must back off to byte 498.
    assert_eq!(preview.len(), 498);
    // Being a `&str`, the preview is valid UTF-8 by construction; it ends with
    // the last padding byte because the whole CJK tail was dropped.
    assert!(preview.ends_with('x'));
}
1268+
1269+
#[test]
fn test_safe_preview_empty_string() {
    // Edge case: an empty input yields an empty preview.
    let preview = safe_preview("", 500);
    assert_eq!(preview, "");
}
1273+
1274+
#[test]
fn test_safe_preview_two_byte_chars() {
    // "é" is 2 bytes in UTF-8, so 250 characters end exactly on byte 500 and
    // the 251st character starts at byte 500.
    let accented = "é".repeat(251);
    assert_eq!(accented.len(), 502);

    // Byte 500 is a boundary: the preview keeps 250 whole characters.
    let preview = safe_preview(&accented, 500);
    assert_eq!(preview.len(), 500);
    assert_eq!(preview.chars().count(), 250);
}
11801285
}

0 commit comments

Comments
 (0)