Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 4e99bf6

Browse files
mscdex authored and Fishrock123 committed
string_decoder: fix bad utf8 character handling
This commit fixes an issue when extra utf8 continuation bytes appear at the end of a chunk of data, causing miscalculations to be made when checking how many bytes are needed to decode a complete character.

Fixes: #7308
PR-URL: #7310
Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Fedor Indutny <fedor.indutny@gmail.com>
1 parent 4000e0e commit 4e99bf6
Copy full SHA for 4e99bf6

File tree

Expand file treeCollapse file tree

2 files changed

+62
-15
lines changed
Open diff view settings
Filter options
Expand file treeCollapse file tree

2 files changed

+62
-15
lines changed
Open diff view settings
Collapse file

‎lib/string_decoder.js‎

Copy file name to clipboardExpand all lines: lib/string_decoder.js
+61 -14 — Lines changed: 61 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,10 @@ function StringDecoder(encoding) {
4545
case 'utf16le':
4646
this.text = utf16Text;
4747
this.end = utf16End;
48-
// fall through
48+
nb = 4;
49+
break;
4950
case 'utf8':
51+
this.fillLast = utf8FillLast;
5052
nb = 4;
5153
break;
5254
case 'base64':
@@ -88,7 +90,7 @@ StringDecoder.prototype.end = utf8End;
8890
// Returns only complete characters in a Buffer
8991
StringDecoder.prototype.text = utf8Text;
9092

91-
// Attempts to complete a partial character using bytes from a Buffer
93+
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
9294
StringDecoder.prototype.fillLast = function(buf) {
9395
if (this.lastNeed <= buf.length) {
9496
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
@@ -112,38 +114,83 @@ function utf8CheckByte(byte) {
112114
return -1;
113115
}
114116

115-
// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
116-
// character, returning the total number of bytes needed to complete the partial
117-
// character (if applicable).
117+
// Checks at most 3 bytes at the end of a Buffer in order to detect an
118+
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
119+
// needed to complete the UTF-8 character (if applicable) are returned.
118120
function utf8CheckIncomplete(self, buf, i) {
119121
var j = buf.length - 1;
120122
if (j < i)
121123
return 0;
122-
var nb = utf8CheckByte(buf[j--]);
124+
var nb = utf8CheckByte(buf[j]);
123125
if (nb >= 0) {
124126
if (nb > 0)
125-
self.lastNeed = nb + 1 - (buf.length - j);
127+
self.lastNeed = nb - 1;
126128
return nb;
127129
}
128-
if (j < i)
130+
if (--j < i)
129131
return 0;
130-
nb = utf8CheckByte(buf[j--]);
132+
nb = utf8CheckByte(buf[j]);
131133
if (nb >= 0) {
132134
if (nb > 0)
133-
self.lastNeed = nb + 1 - (buf.length - j);
135+
self.lastNeed = nb - 2;
134136
return nb;
135137
}
136-
if (j < i)
138+
if (--j < i)
137139
return 0;
138-
nb = utf8CheckByte(buf[j--]);
140+
nb = utf8CheckByte(buf[j]);
139141
if (nb >= 0) {
140-
if (nb > 0)
141-
self.lastNeed = nb + 1 - (buf.length - j);
142+
if (nb > 0) {
143+
if (nb === 2)
144+
nb = 0;
145+
else
146+
self.lastNeed = nb - 3;
147+
}
142148
return nb;
143149
}
144150
return 0;
145151
}
146152

153+
// Validates as many continuation bytes for a multi-byte UTF-8 character as
154+
// needed or are available. If we see a non-continuation byte where we expect
155+
// one, we "replace" the validated continuation bytes we've seen so far with
156+
// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding
157+
// behavior. The continuation byte check is included three times in the case
158+
// where all of the continuation bytes for a character exist in the same buffer.
159+
// It is also done this way as a slight performance increase instead of using a
160+
// loop.
161+
function utf8CheckExtraBytes(self, buf, p) {
162+
if ((buf[0] & 0xC0) !== 0x80) {
163+
self.lastNeed = 0;
164+
return '\ufffd'.repeat(p);
165+
}
166+
if (self.lastNeed > 1 && buf.length > 1) {
167+
if ((buf[1] & 0xC0) !== 0x80) {
168+
self.lastNeed = 1;
169+
return '\ufffd'.repeat(p + 1);
170+
}
171+
if (self.lastNeed > 2 && buf.length > 2) {
172+
if ((buf[2] & 0xC0) !== 0x80) {
173+
self.lastNeed = 2;
174+
return '\ufffd'.repeat(p + 2);
175+
}
176+
}
177+
}
178+
}
179+
180+
// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
181+
function utf8FillLast(buf) {
182+
const p = this.lastTotal - this.lastNeed;
183+
var r = utf8CheckExtraBytes(this, buf, p);
184+
if (r !== undefined)
185+
return r;
186+
if (this.lastNeed <= buf.length) {
187+
buf.copy(this.lastChar, p, 0, this.lastNeed);
188+
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
189+
}
190+
buf.copy(this.lastChar, p, 0, buf.length);
191+
this.lastNeed -= buf.length;
192+
}
193+
147194
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
148195
// partial character, the character's bytes are buffered until the required
149196
// number of bytes are available.
Collapse file

‎test/parallel/test-string-decoder.js‎

Copy file name to clipboardExpand all lines: test/parallel/test-string-decoder.js
+1-1Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ assert.strictEqual(decoder.write(Buffer.from('\ufffd\ufffd\ufffd')),
5555
assert.strictEqual(decoder.end(), '');
5656

5757
decoder = new StringDecoder('utf8');
58-
assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
58+
assert.strictEqual(decoder.write(Buffer.from('EFBFBDE2', 'hex')), '\ufffd');
5959
assert.strictEqual(decoder.end(), '\ufffd');
6060

6161

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.