fix: prevent UTF-8 corruption when RTK truncation splits multi-byte codepoints (FAR-19)
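The trunc function in the RTK filter script now walks back from the truncation point past any UTF-8 continuation bytes to the lead byte of the last codepoint and checks whether that whole codepoint fits within the limit; if it does not, the cut moves back to the start of that codepoint. This avoids replacement characters (U+FFFD) from mid-codepoint slicing, and the reported truncated-byte count is now measured from the actual cut point rather than from MAX.

Co-Authored-By: Paperclip <noreply@paperclip.ing>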
This commit is contained in:
@@ -802,6 +802,28 @@ describe("buildJobManifest", () => {
|
||||
expect(filterScript).toContain("tool_result");
|
||||
});
|
||||
|
||||
it("filter script truncates without corrupting multi-byte UTF-8", () => {
|
||||
// "中" is U+4E2D, 3 bytes in UTF-8: E4 B8 AD
|
||||
// With MAX=5, two "中" (6 bytes) should truncate to one (3 bytes), not
|
||||
// produce a replacement character from slicing mid-codepoint.
|
||||
const setup = buildRtkSetupCommands(5);
|
||||
const b64Matches = [...setup.matchAll(/Buffer\.from\('([A-Za-z0-9+/=]+)','base64'\)/g)];
|
||||
const filterScript = Buffer.from(b64Matches[0]![1], "base64").toString("utf-8");
|
||||
|
||||
// Extract the trunc function from the filter script and evaluate it
|
||||
const fnMatch = filterScript.match(/(function trunc\(s\)\{.*\})(?=const tr=)/);
|
||||
expect(fnMatch).toBeTruthy();
|
||||
// eslint-disable-next-line no-eval
|
||||
const trunc = eval(`(()=>{const MAX=5;${fnMatch![1]};return trunc;})()`);
|
||||
|
||||
const result = trunc("中中");
|
||||
expect(result).not.toContain("�");
|
||||
expect(result).toContain("中");
|
||||
expect(result).toContain("truncated by paperclip-rtk");
|
||||
// Should report bytes from the actual truncation point, not MAX
|
||||
expect(result).toContain("3 bytes truncated");
|
||||
});
|
||||
|
||||
it("filter script handles array content (block format)", () => {
|
||||
const setup = buildRtkSetupCommands(50000);
|
||||
const b64Matches = [...setup.matchAll(/Buffer\.from\('([A-Za-z0-9+/=]+)','base64'\)/g)];
|
||||
|
||||
@@ -47,7 +47,8 @@ export function buildRtkSetupCommands(maxOutputBytes: number): string {
|
||||
`if(typeof s!=='string')return s;`,
|
||||
`const b=Buffer.from(s,'utf-8');`,
|
||||
`if(b.length<=MAX)return s;`,
|
||||
`return b.slice(0,MAX).toString('utf-8')+'\\n[...'+(b.length-MAX)+' bytes truncated by paperclip-rtk]';`,
|
||||
`let e=MAX;if(e>0){let p=e-1;while(p>0&&(b[p]&0xC0)===0x80)p--;const l=b[p];let n=1;if((l&0xE0)===0xC0)n=2;else if((l&0xF0)===0xE0)n=3;else if((l&0xF8)===0xF0)n=4;if(p+n>e)e=p;}`,
|
||||
`return b.slice(0,e).toString('utf-8')+'\\n[...'+(b.length-e)+' bytes truncated by paperclip-rtk]';`,
|
||||
`}`,
|
||||
`const tr=o&&(o.tool_response||o.tool_result);`,
|
||||
`if(tr){`,
|
||||
|
||||
Reference in New Issue
Block a user