Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit dbe5898

Browse filesBrowse files
mcollinaaduh95
authored andcommitted
stream: fix UTF-8 character corruption in fast-utf8-stream
Fix releaseWritingBuf() to correctly handle partial writes that split multi-byte UTF-8 characters. The previous implementation incorrectly converted byte counts to character counts, causing: - 3-byte characters (CJK) to be silently dropped - 4-byte characters (emoji) to leave lone surrogates in the buffer The fix backs up from the byte position to find a valid UTF-8 character boundary by checking for continuation bytes (pattern 10xxxxxx), then decodes the properly-aligned bytes to get the correct character count. Also fixes a typo where this._asyncDrainScheduled was used instead of the private field this.#asyncDrainScheduled. Fixes: #61744 PR-URL: #61745 Reviewed-By: Yagiz Nizipli <yagiz@nizipli.com> Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Colin Ihrig <cjihrig@gmail.com> Reviewed-By: Paolo Insogna <paolo@cowtech.it>
1 parent bd34e53 commit dbe5898
Copy full SHA for dbe5898

2 files changed

+340-6Lines changed: 340 additions & 6 deletions

File tree

Expand file treeCollapse file tree
Open diff view settings
Filter options
Expand file treeCollapse file tree
Open diff view settings
Collapse file

‎lib/internal/streams/fast-utf8-stream.js‎

Copy file name to clipboardExpand all lines: lib/internal/streams/fast-utf8-stream.js
+18-6Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ class Utf8Stream extends EventEmitter {
237237

238238
this.on('newListener', (name) => {
239239
if (name === 'drain') {
240-
this._asyncDrainScheduled = false;
240+
this.#asyncDrainScheduled = false;
241241
}
242242
});
243243

@@ -894,11 +894,23 @@ class Utf8Stream extends EventEmitter {
894894
* @returns {{writingBuf: string | Buffer, len: number}} released writingBuf and length
895895
*/
896896
function releaseWritingBuf(writingBuf, len, n) {
897-
// if Buffer.byteLength is equal to n, that means writingBuf contains no multi-byte character
898-
if (typeof writingBuf === 'string' && Buffer.byteLength(writingBuf) !== n) {
899-
// Since the fs.write callback parameter `n` means how many bytes the passed of string
900-
// We calculate the original string length for avoiding the multi-byte character issue
901-
n = Buffer.from(writingBuf).subarray(0, n).toString().length;
897+
if (typeof writingBuf === 'string') {
898+
const byteLength = Buffer.byteLength(writingBuf);
899+
if (byteLength !== n) {
900+
// Since fs.write returns the number of bytes written, we need to find
901+
// how many complete characters fit within those n bytes.
902+
// If a partial write splits a multi-byte UTF-8 character, we must back up
903+
// to the start of that character to avoid data corruption.
904+
const buf = Buffer.from(writingBuf);
905+
// Back up from position n to find a valid UTF-8 character boundary.
906+
// UTF-8 continuation bytes have the pattern 10xxxxxx (0x80-0xBF).
907+
// We need to find the start of the character that was split.
908+
while (n > 0 && (buf[n] & 0xC0) === 0x80) {
909+
n--;
910+
}
911+
// Decode the properly-aligned bytes to get the character count.
912+
n = buf.subarray(0, n).toString().length;
913+
}
902914
}
903915
len = MathMax(len - n, 0);
904916
writingBuf = writingBuf.slice(n);
Collapse file
+322Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,322 @@
1+
'use strict';
2+
3+
// Tests for UTF-8 character preservation when partial writes split multi-byte characters.
4+
// See: https://github.com/nodejs/node/issues/61744
5+
6+
const common = require('../common');
7+
const tmpdir = require('../common/tmpdir');
8+
const assert = require('node:assert');
9+
const {
10+
openSync,
11+
write,
12+
writeSync,
13+
} = require('node:fs');
14+
const { Utf8Stream } = require('node:fs');
15+
const { join } = require('node:path');
16+
const { isMainThread } = require('node:worker_threads');
17+
18+
tmpdir.refresh();
19+
if (isMainThread) {
20+
process.umask(0o000);
21+
}
22+
23+
let fileCounter = 0;
24+
25+
function getTempFile() {
26+
return join(tmpdir.path, `fastutf8stream-partial-${process.pid}-${Date.now()}-${fileCounter++}.log`);
27+
}
28+
29+
runTests(false);
30+
runTests(true);
31+
32+
function runTests(sync) {
33+
// Test 1: Partial write splitting a 3-byte UTF-8 character (CJK)
34+
// "abc中def" where "中" is 3 bytes (E4 B8 AD)
35+
// Simulate partial write of 4 bytes: "abc" (3 bytes) + first byte of "中"
36+
// The remaining buffer should be "中def" (not "def")
37+
{
38+
const dest = getTempFile();
39+
const fd = openSync(dest, 'w');
40+
41+
let firstWrite = true;
42+
const writtenChunks = [];
43+
const fsOverride = {};
44+
45+
if (sync) {
46+
fsOverride.writeSync = common.mustCall((...args) => {
47+
const data = args[1];
48+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
49+
if (firstWrite) {
50+
firstWrite = false;
51+
// Simulate partial write: only 4 bytes written out of 9
52+
// This splits the 3-byte "中" character
53+
return 4;
54+
}
55+
return writeSync(...args);
56+
}, 2);
57+
} else {
58+
fsOverride.write = common.mustCall((...args) => {
59+
const data = args[1];
60+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
61+
const callback = args[args.length - 1];
62+
if (firstWrite) {
63+
firstWrite = false;
64+
// Simulate partial write: only 4 bytes written out of 9
65+
process.nextTick(callback, null, 4);
66+
return;
67+
}
68+
return write(...args);
69+
}, 2);
70+
}
71+
72+
const stream = new Utf8Stream({
73+
fd,
74+
sync,
75+
minLength: 0,
76+
fs: fsOverride,
77+
});
78+
79+
stream.on('ready', common.mustCall(() => {
80+
stream.write('abc中def');
81+
stream.end();
82+
83+
stream.on('finish', common.mustCall(() => {
84+
// Verify the second chunk contains the preserved CJK character
85+
assert.strictEqual(writtenChunks.length, 2);
86+
assert.strictEqual(writtenChunks[0], 'abc中def'); // First attempt
87+
assert.strictEqual(writtenChunks[1], '中def'); // Retry with preserved char
88+
}));
89+
}));
90+
}
91+
92+
// Test 2: Partial write splitting a 4-byte UTF-8 character (emoji)
93+
// "hello🌍world" where "🌍" is 4 bytes (F0 9F 8C 8D)
94+
// Simulate partial write of 7 bytes: "hello" (5 bytes) + first 2 bytes of "🌍"
95+
// The remaining buffer should be "🌍world" (not a lone surrogate + "world")
96+
{
97+
const dest = getTempFile();
98+
const fd = openSync(dest, 'w');
99+
100+
let firstWrite = true;
101+
const writtenChunks = [];
102+
const fsOverride = {};
103+
104+
if (sync) {
105+
fsOverride.writeSync = common.mustCall((...args) => {
106+
const data = args[1];
107+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
108+
if (firstWrite) {
109+
firstWrite = false;
110+
// Simulate partial write: only 7 bytes written
111+
return 7;
112+
}
113+
return writeSync(...args);
114+
}, 2);
115+
} else {
116+
fsOverride.write = common.mustCall((...args) => {
117+
const data = args[1];
118+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
119+
const callback = args[args.length - 1];
120+
if (firstWrite) {
121+
firstWrite = false;
122+
process.nextTick(callback, null, 7);
123+
return;
124+
}
125+
return write(...args);
126+
}, 2);
127+
}
128+
129+
const stream = new Utf8Stream({
130+
fd,
131+
sync,
132+
minLength: 0,
133+
fs: fsOverride,
134+
});
135+
136+
stream.on('ready', common.mustCall(() => {
137+
stream.write('hello🌍world');
138+
stream.end();
139+
140+
stream.on('finish', common.mustCall(() => {
141+
assert.strictEqual(writtenChunks.length, 2);
142+
assert.strictEqual(writtenChunks[0], 'hello🌍world'); // First attempt
143+
assert.strictEqual(writtenChunks[1], '🌍world'); // Retry with preserved emoji
144+
145+
// Verify no lone surrogates in the retry chunk
146+
const retryChunk = writtenChunks[1];
147+
for (let i = 0; i < retryChunk.length; i++) {
148+
const code = retryChunk.charCodeAt(i);
149+
if (code >= 0xD800 && code <= 0xDBFF) {
150+
// High surrogate - next must be low surrogate
151+
const next = retryChunk.charCodeAt(i + 1);
152+
assert.ok(next >= 0xDC00 && next <= 0xDFFF,
153+
`Found lone high surrogate at position ${i}`);
154+
i++; // Skip the low surrogate we just verified
155+
} else if (code >= 0xDC00 && code <= 0xDFFF) {
156+
// Low surrogate without preceding high surrogate
157+
assert.fail(`Found lone low surrogate at position ${i}: 0x${code.toString(16)}`);
158+
}
159+
}
160+
}));
161+
}));
162+
}
163+
164+
// Test 3: Partial write at exactly 0 bytes (edge case)
165+
{
166+
const dest = getTempFile();
167+
const fd = openSync(dest, 'w');
168+
169+
let firstWrite = true;
170+
const writtenChunks = [];
171+
const fsOverride = {};
172+
173+
if (sync) {
174+
fsOverride.writeSync = common.mustCall((...args) => {
175+
const data = args[1];
176+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
177+
if (firstWrite) {
178+
firstWrite = false;
179+
return 0; // No bytes written
180+
}
181+
return writeSync(...args);
182+
}, 2);
183+
} else {
184+
fsOverride.write = common.mustCall((...args) => {
185+
const data = args[1];
186+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
187+
const callback = args[args.length - 1];
188+
if (firstWrite) {
189+
firstWrite = false;
190+
process.nextTick(callback, null, 0);
191+
return;
192+
}
193+
return write(...args);
194+
}, 2);
195+
}
196+
197+
const stream = new Utf8Stream({
198+
fd,
199+
sync,
200+
minLength: 0,
201+
fs: fsOverride,
202+
});
203+
204+
stream.on('ready', common.mustCall(() => {
205+
stream.write('中文');
206+
stream.end();
207+
208+
stream.on('finish', common.mustCall(() => {
209+
assert.strictEqual(writtenChunks.length, 2);
210+
assert.strictEqual(writtenChunks[0], '中文');
211+
assert.strictEqual(writtenChunks[1], '中文'); // Entire string retried
212+
}));
213+
}));
214+
}
215+
216+
// Test 4: Partial write splitting between characters (not mid-character)
217+
// This should work the same as before - no character preservation needed
218+
{
219+
const dest = getTempFile();
220+
const fd = openSync(dest, 'w');
221+
222+
let firstWrite = true;
223+
const writtenChunks = [];
224+
const fsOverride = {};
225+
226+
if (sync) {
227+
fsOverride.writeSync = common.mustCall((...args) => {
228+
const data = args[1];
229+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
230+
if (firstWrite) {
231+
firstWrite = false;
232+
// Write exactly 3 bytes ("abc"), which is a clean character boundary
233+
return 3;
234+
}
235+
return writeSync(...args);
236+
}, 2);
237+
} else {
238+
fsOverride.write = common.mustCall((...args) => {
239+
const data = args[1];
240+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
241+
const callback = args[args.length - 1];
242+
if (firstWrite) {
243+
firstWrite = false;
244+
process.nextTick(callback, null, 3);
245+
return;
246+
}
247+
return write(...args);
248+
}, 2);
249+
}
250+
251+
const stream = new Utf8Stream({
252+
fd,
253+
sync,
254+
minLength: 0,
255+
fs: fsOverride,
256+
});
257+
258+
stream.on('ready', common.mustCall(() => {
259+
stream.write('abc中def');
260+
stream.end();
261+
262+
stream.on('finish', common.mustCall(() => {
263+
assert.strictEqual(writtenChunks.length, 2);
264+
assert.strictEqual(writtenChunks[0], 'abc中def');
265+
assert.strictEqual(writtenChunks[1], '中def'); // Remaining after 3 bytes
266+
}));
267+
}));
268+
}
269+
270+
// Test 5: Single multi-byte character with partial write of 1 byte
271+
{
272+
const dest = getTempFile();
273+
const fd = openSync(dest, 'w');
274+
275+
let firstWrite = true;
276+
const writtenChunks = [];
277+
const fsOverride = {};
278+
279+
if (sync) {
280+
fsOverride.writeSync = common.mustCall((...args) => {
281+
const data = args[1];
282+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
283+
if (firstWrite) {
284+
firstWrite = false;
285+
// Write only 1 byte of a 3-byte character
286+
return 1;
287+
}
288+
return writeSync(...args);
289+
}, 2);
290+
} else {
291+
fsOverride.write = common.mustCall((...args) => {
292+
const data = args[1];
293+
writtenChunks.push(typeof data === 'string' ? data : data.toString());
294+
const callback = args[args.length - 1];
295+
if (firstWrite) {
296+
firstWrite = false;
297+
process.nextTick(callback, null, 1);
298+
return;
299+
}
300+
return write(...args);
301+
}, 2);
302+
}
303+
304+
const stream = new Utf8Stream({
305+
fd,
306+
sync,
307+
minLength: 0,
308+
fs: fsOverride,
309+
});
310+
311+
stream.on('ready', common.mustCall(() => {
312+
stream.write('中');
313+
stream.end();
314+
315+
stream.on('finish', common.mustCall(() => {
316+
assert.strictEqual(writtenChunks.length, 2);
317+
assert.strictEqual(writtenChunks[0], '中');
318+
assert.strictEqual(writtenChunks[1], '中'); // Full character retried
319+
}));
320+
}));
321+
}
322+
}

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.