fix(guard): use UTF-8 safe string truncation in output preview logging

lpcox · Copilot · lpcox · commit 25f52d03a2aa · 2026-04-13T08:49:33.000-07:00
The label_response function panics when serialized JSON contains multi-byte UTF-8 characters (CJK, emoji, etc.) and byte index 500 falls in the middle of a code point. The panic causes a WASM trap that permanently poisons the guard instance — all subsequent MCP calls to that server fail for the rest of the session. Extract a safe_preview(s, max_bytes) helper that uses str::floor_char_boundary() (stable since Rust 1.91) to find the nearest valid character boundary at or before the limit. Replace all three preview truncation sites in label_response: - Line ~808: path-specific output preview (was panicking) - Line ~939: general output preview (was panicking) - Line ~752: input preview (was silently dropping the log) Add 8 unit tests covering ASCII, CJK (3-byte), emoji (4-byte), accented (2-byte), mixed content, empty strings, and boundary conditions. Fixes #3711 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/guards/github-guard/rust-guard/src/lib.rs b/guards/github-guard/rust-guard/src/lib.rs
@@ -26,6 +26,19 @@ use std::sync::Mutex;
 const POLICY_SCOPE_ALL: &str = "all";
 const POLICY_SCOPE_PUBLIC: &str = "public";
 
+/// Maximum number of bytes of an input or output string shown in a log preview.
+const PREVIEW_MAX_BYTES: usize = 500;
+
+/// Truncate `s` to at most `max_bytes` bytes, ending on a valid UTF-8 character
+/// boundary. Returns `s` unchanged when `s.len() <= max_bytes`.
+fn safe_preview(s: &str, max_bytes: usize) -> &str {
+    if s.len() <= max_bytes {
+        return s;
+    }
+    let end = s.floor_char_boundary(max_bytes); // largest char boundary <= max_bytes
+    &s[..end]
+}
+
 /// Global policy context for WASM runtime entry points.
 ///
 /// `label_agent` stores the parsed policy here; `label_resource` and
@@ -748,9 +761,9 @@ pub extern "C" fn label_response(
     // Read input bytes
     let input_bytes = unsafe { slice::from_raw_parts(input_ptr as *const u8, input_len as usize) };
 
-    // Log first 500 chars of input to debug structure
-    let preview_len = std::cmp::min(500, input_bytes.len());
-    if let Ok(preview) = std::str::from_utf8(&input_bytes[..preview_len]) {
+    // Log first 500 bytes of input to debug structure (safe on char boundary)
+    if let Ok(full_str) = std::str::from_utf8(input_bytes) {
+        let preview = safe_preview(full_str, PREVIEW_MAX_BYTES);
         log_info(&format!("    input_preview={}", preview));
     }
 
@@ -804,11 +817,7 @@ pub extern "C" fn label_response(
         };
 
         // Log output preview for debugging
-        let output_preview = if output_json.len() > 500 {
-            &output_json[..500]
-        } else {
-            &output_json
-        };
+        let output_preview = safe_preview(&output_json, PREVIEW_MAX_BYTES);
         log_info(&format!("    path_output_preview={}", output_preview));
 
         if output_json.len() as u32 > output_size {
@@ -935,11 +944,7 @@ pub extern "C" fn label_response(
     };
 
     // Log output preview for debugging
-    let output_preview = if output_json.len() > 500 {
-        &output_json[..500]
-    } else {
-        &output_json
-    };
+    let output_preview = safe_preview(&output_json, PREVIEW_MAX_BYTES);
     log_info(&format!("    output_preview={}", output_preview));
 
     if output_json.len() as u32 > output_size {
@@ -1177,4 +1182,87 @@ mod tests {
             final_integrity
         );
     }
+
+    // === UTF-8 safe preview tests (issue #3711) ===
+
+    #[test]
+    fn test_safe_preview_ascii_under_limit() {
+        let s = "hello";
+        assert_eq!(safe_preview(s, 500), "hello"); // 5 bytes < 500: returned whole
+    }
+
+    #[test]
+    fn test_safe_preview_ascii_at_limit() {
+        let s = "a".repeat(500); // exactly at the limit, so no truncation is expected
+        assert_eq!(safe_preview(&s, 500), s.as_str());
+    }
+
+    #[test]
+    fn test_safe_preview_ascii_over_limit() {
+        let s = "a".repeat(600);
+        assert_eq!(safe_preview(&s, 500).len(), 500); // every ASCII byte is a char boundary
+    }
+
+    #[test]
+    fn test_safe_preview_cjk_boundary() {
+        // Each "中" is 3 bytes in UTF-8. Build a string where byte 500 falls in
+        // the middle of a character (500 is not divisible by 3):
+        // 166 chars = 498 bytes, 167 chars = 501 bytes.
+        let cjk = "中".repeat(167); // 501 bytes
+        assert_eq!(cjk.len(), 501);
+
+        let preview = safe_preview(&cjk, 500);
+        // Must truncate to 498 bytes (166 chars) — the last char boundary at or before 500.
+        assert_eq!(preview.len(), 498);
+        assert_eq!(preview.chars().count(), 166);
+    }
+
+    #[test]
+    fn test_safe_preview_emoji_boundary() {
+        // 🎉 is 4 bytes in UTF-8. 125 emojis = 500 bytes, so byte 500 is itself a
+        // char boundary: with 126 emojis (504 bytes) the 126th is dropped whole, not split.
+        let emoji = "🎉".repeat(126); // 504 bytes
+        assert_eq!(emoji.len(), 504);
+
+        let preview = safe_preview(&emoji, 500);
+        // Must truncate to exactly 500 bytes (125 complete emojis).
+        assert_eq!(preview.len(), 500);
+        assert_eq!(preview.chars().count(), 125);
+    }
+
+    #[test]
+    fn test_safe_preview_mixed_content_near_boundary() {
+        // Simulate a JSON string with ASCII keys and a CJK value crossing byte 500.
+        // {"body":"<padding>中中中..."}
+        let prefix = "{\"body\":\"";      // 9 bytes
+        let padding = "x".repeat(489);    // 489 bytes — total so far: 498
+        let cjk_tail = "中中中中中";       // 5 × 3 = 15 bytes — total: 513
+
+        let json = format!("{}{}{}\"}}",  prefix, padding, cjk_tail);
+        assert!(json.len() > 500);
+
+        let preview = safe_preview(&json, 500);
+        // Byte 498 is the start of the first CJK char (498..501). Byte 500 is
+        // mid-character, so floor_char_boundary(500) should give 498.
+        assert_eq!(preview.len(), 498);
+        // The cut lands right after the ASCII padding, so the preview ends in 'x'.
+        assert!(preview.ends_with('x'));
+    }
+
+    #[test]
+    fn test_safe_preview_empty_string() {
+        assert_eq!(safe_preview("", 500), ""); // zero-length input is returned as-is
+    }
+
+    #[test]
+    fn test_safe_preview_two_byte_chars() {
+        // é is 2 bytes in UTF-8. 250 chars = 500 bytes (exact boundary).
+        // 251 chars = 502 bytes; byte 500 starts the 251st char, so nothing is split.
+        let accented = "é".repeat(251); // 502 bytes
+        assert_eq!(accented.len(), 502);
+
+        let preview = safe_preview(&accented, 500);
+        assert_eq!(preview.len(), 500);
+        assert_eq!(preview.chars().count(), 250);
+    }
 }