From 346f5cc1dfa676288b5e51720437dec19355217d Mon Sep 17 00:00:00 2001 From: Gandalf the Greybeard Date: Thu, 23 Apr 2026 23:28:28 +0000 Subject: [PATCH] fix: prevent UTF-8 corruption when RTK truncation splits multi-byte codepoints (FAR-19) The trunc function in the RTK filter script now walks back from the truncation point past continuation bytes and checks whether the full codepoint fits, avoiding replacement characters from mid-codepoint slicing. Co-Authored-By: Paperclip --- src/server/job-manifest.test.ts | 22 ++++++++++++++++++++++ src/server/job-manifest.ts | 3 ++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/server/job-manifest.test.ts b/src/server/job-manifest.test.ts index eccf02e..36bb852 100644 --- a/src/server/job-manifest.test.ts +++ b/src/server/job-manifest.test.ts @@ -802,6 +802,28 @@ describe("buildJobManifest", () => { expect(filterScript).toContain("tool_result"); }); + it("filter script truncates without corrupting multi-byte UTF-8", () => { + // "中" is U+4E2D, 3 bytes in UTF-8: E4 B8 AD + // With MAX=5, two "中" (6 bytes) should truncate to one (3 bytes), not + // produce a replacement character from slicing mid-codepoint. + const setup = buildRtkSetupCommands(5); + const b64Matches = [...setup.matchAll(/Buffer\.from\('([A-Za-z0-9+/=]+)','base64'\)/g)]; + const filterScript = Buffer.from(b64Matches[0]![1], "base64").toString("utf-8"); + + // Extract the trunc function from the filter script and evaluate it + const fnMatch = filterScript.match(/(function trunc\(s\)\{.*\})(?=const tr=)/); + expect(fnMatch).toBeTruthy(); + // eslint-disable-next-line no-eval + const trunc = eval(`(()=>{const MAX=5;${fnMatch![1]};return trunc;})()`); + + const result = trunc("中中"); + expect(result).not.toContain("�"); + expect(result).toContain("中"); + expect(result).toContain("truncated by paperclip-rtk"); + // Should report bytes from the actual truncation point, not MAX + expect(result).toContain("3 bytes truncated"); + }); + it("filter script handles array content (block format)", () => { const setup = buildRtkSetupCommands(50000); const b64Matches = [...setup.matchAll(/Buffer\.from\('([A-Za-z0-9+/=]+)','base64'\)/g)]; diff --git a/src/server/job-manifest.ts b/src/server/job-manifest.ts index 438953d..cc59383 100644 --- a/src/server/job-manifest.ts +++ b/src/server/job-manifest.ts @@ -47,7 +47,8 @@ export function buildRtkSetupCommands(maxOutputBytes: number): string { `if(typeof s!=='string')return s;`, `const b=Buffer.from(s,'utf-8');`, `if(b.length<=MAX)return s;`, - `return b.slice(0,MAX).toString('utf-8')+'\\n[...'+(b.length-MAX)+' bytes truncated by paperclip-rtk]';`, + `let e=MAX;if(e>0){let p=e-1;while(p>0&&(b[p]&0xC0)===0x80)p--;const l=b[p];let n=1;if((l&0xE0)===0xC0)n=2;else if((l&0xF0)===0xE0)n=3;else if((l&0xF8)===0xF0)n=4;if(p+n>e)e=p;}`, + `return b.slice(0,e).toString('utf-8')+'\\n[...'+(b.length-e)+' bytes truncated by paperclip-rtk]';`, `}`, `const tr=o&&(o.tool_response||o.tool_result);`, `if(tr){`,