Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit d99336a

Browse filesBrowse files
skomskijasnell
authored andcommitted
src: replace naive search in Buffer::IndexOf
Adds the string search implementation from v8 which uses naive search if pattern length < 8 or to a specific badness then uses Boyer-Moore-Horspool Added benchmark shows the expected improvements Added option to use ucs2 encoding with Buffer::IndexOf Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Trevor Norris <trev.norris@gmail.com> PR-URL: #2539
1 parent 3eaa593 commit d99336a
Copy full SHA for d99336a

File tree

Expand file treeCollapse file tree

8 files changed

+4935
-60
lines changed
Open diff view settings
Filter options
Expand file treeCollapse file tree

8 files changed

+4935
-60
lines changed
Open diff view settings
Collapse file
+38Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
var common = require('../common.js');
2+
var fs = require('fs');
3+
4+
var bench = common.createBenchmark(main, {
5+
search: ['@', 'SQ', '10x', '--l', 'Alice', 'Gryphon', 'Panther',
6+
'Ou est ma chatte?', 'found it very', 'among mad people',
7+
'neighbouring pool', 'Soo--oop', 'aaaaaaaaaaaaaaaaa',
8+
'venture to go near the house till she had brought herself down to',
9+
'</i> to the Caterpillar'],
10+
encoding: ['undefined', 'utf8', 'ucs2', 'binary'],
11+
type: ['buffer', 'string'],
12+
iter: [1]
13+
});
14+
15+
function main(conf) {
16+
var iter = (conf.iter) * 100000;
17+
var aliceBuffer = fs.readFileSync(__dirname + '/../fixtures/alice.html');
18+
var search = conf.search;
19+
var encoding = conf.encoding;
20+
21+
if (encoding === 'undefined') {
22+
encoding = undefined;
23+
}
24+
25+
if (encoding === 'ucs2') {
26+
aliceBuffer = new Buffer(aliceBuffer.toString(), encoding);
27+
}
28+
29+
if (conf.type === 'buffer') {
30+
search = new Buffer(new Buffer(search).toString(), encoding);
31+
}
32+
33+
bench.start();
34+
for (var i = 0; i < iter; i++) {
35+
aliceBuffer.indexOf(search, 0, encoding);
36+
}
37+
bench.end(iter);
38+
}
Collapse file

‎benchmark/fixtures/alice.html‎

Copy file name to clipboardExpand all lines: benchmark/fixtures/alice.html
+3,865Lines changed: 3865 additions & 0 deletions
Large diffs are not rendered by default.
Collapse file

‎lib/buffer.js‎

Copy file name to clipboardExpand all lines: lib/buffer.js
+39-6Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -414,20 +414,53 @@ Buffer.prototype.compare = function compare(b) {
414414
return binding.compare(this, b);
415415
};
416416

417+
function slowIndexOf(buffer, val, byteOffset, encoding) {
418+
var loweredCase = false;
419+
for (;;) {
420+
switch (encoding) {
421+
case 'utf8':
422+
case 'utf-8':
423+
case 'ucs2':
424+
case 'ucs-2':
425+
case 'utf16le':
426+
case 'utf-16le':
427+
case 'binary':
428+
return binding.indexOfString(buffer, val, byteOffset, encoding);
417429

418-
Buffer.prototype.indexOf = function indexOf(val, byteOffset) {
430+
case 'base64':
431+
case 'ascii':
432+
case 'hex':
433+
return binding.indexOfBuffer(
434+
buffer, Buffer(val, encoding), byteOffset, encoding);
435+
436+
default:
437+
if (loweredCase) {
438+
throw new TypeError('Unknown encoding: ' + encoding);
439+
}
440+
441+
encoding = ('' + encoding).toLowerCase();
442+
loweredCase = true;
443+
}
444+
}
445+
}
446+
447+
Buffer.prototype.indexOf = function indexOf(val, byteOffset, encoding) {
419448
if (byteOffset > 0x7fffffff)
420449
byteOffset = 0x7fffffff;
421450
else if (byteOffset < -0x80000000)
422451
byteOffset = -0x80000000;
423452
byteOffset >>= 0;
424453

425-
if (typeof val === 'string')
426-
return binding.indexOfString(this, val, byteOffset);
427-
if (val instanceof Buffer)
428-
return binding.indexOfBuffer(this, val, byteOffset);
429-
if (typeof val === 'number')
454+
if (typeof val === 'string') {
455+
if (encoding === undefined) {
456+
return binding.indexOfString(this, val, byteOffset, encoding);
457+
}
458+
return slowIndexOf(this, val, byteOffset, encoding);
459+
} else if (val instanceof Buffer) {
460+
return binding.indexOfBuffer(this, val, byteOffset, encoding);
461+
} else if (typeof val === 'number') {
430462
return binding.indexOfNumber(this, val, byteOffset);
463+
}
431464

432465
throw new TypeError('val must be string, number or Buffer');
433466
};
Collapse file

‎node.gyp‎

Copy file name to clipboardExpand all lines: node.gyp
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@
169169
'src/util.h',
170170
'src/util-inl.h',
171171
'src/util.cc',
172+
'src/string_search.cc',
172173
'deps/http_parser/http_parser.h',
173174
'deps/v8/include/v8.h',
174175
'deps/v8/include/v8-debug.h',
Collapse file

‎src/node_buffer.cc‎

Copy file name to clipboardExpand all lines: src/node_buffer.cc
+124-54Lines changed: 124 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "env.h"
55
#include "env-inl.h"
66
#include "string_bytes.h"
7+
#include "string_search.h"
78
#include "util.h"
89
#include "util-inl.h"
910
#include "v8-profiler.h"
@@ -792,87 +793,156 @@ void Compare(const FunctionCallbackInfo<Value> &args) {
792793
}
793794

794795

795-
int32_t IndexOf(const char* haystack,
796-
size_t h_length,
797-
const char* needle,
798-
size_t n_length) {
799-
CHECK_GE(h_length, n_length);
800-
// TODO(trevnorris): Implement Boyer-Moore string search algorithm.
801-
for (size_t i = 0; i < h_length - n_length + 1; i++) {
802-
if (haystack[i] == needle[0]) {
803-
if (memcmp(haystack + i, needle, n_length) == 0)
804-
return i;
805-
}
806-
}
807-
return -1;
808-
}
809-
810-
811796
void IndexOfString(const FunctionCallbackInfo<Value>& args) {
812797
ASSERT(args[1]->IsString());
813798
ASSERT(args[2]->IsNumber());
814799

800+
enum encoding enc = ParseEncoding(args.GetIsolate(),
801+
args[3],
802+
UTF8);
803+
815804
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
816805
SPREAD_ARG(args[0], ts_obj);
817806

818-
node::Utf8Value str(args.GetIsolate(), args[1]);
819-
int32_t offset_i32 = args[2]->Int32Value();
820-
uint32_t offset;
807+
Local<String> needle = args[1].As<String>();
808+
const char* haystack = ts_obj_data;
809+
const size_t haystack_length = ts_obj_length;
810+
const size_t needle_length = needle->Utf8Length();
811+
812+
813+
if (needle_length == 0 || haystack_length == 0) {
814+
return args.GetReturnValue().Set(-1);
815+
}
816+
817+
int64_t offset_i64 = args[2]->IntegerValue();
818+
size_t offset = 0;
821819

822-
if (offset_i32 < 0) {
823-
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
820+
if (offset_i64 < 0) {
821+
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0) {
824822
offset = 0;
825-
else
826-
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
823+
} else {
824+
offset = static_cast<size_t>(haystack_length + offset_i64);
825+
}
827826
} else {
828-
offset = static_cast<uint32_t>(offset_i32);
827+
offset = static_cast<size_t>(offset_i64);
829828
}
830829

831-
if (str.length() == 0 ||
832-
ts_obj_length == 0 ||
833-
(offset != 0 && str.length() + offset <= str.length()) ||
834-
str.length() + offset > ts_obj_length)
830+
if (haystack_length < offset || needle_length + offset > haystack_length) {
835831
return args.GetReturnValue().Set(-1);
832+
}
836833

837-
int32_t r =
838-
IndexOf(ts_obj_data + offset, ts_obj_length - offset, *str, str.length());
839-
args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
840-
}
834+
size_t result = haystack_length;
835+
836+
if (enc == UCS2) {
837+
String::Value needle_value(needle);
838+
if (*needle_value == nullptr)
839+
return args.GetReturnValue().Set(-1);
840+
841+
if (haystack_length < 2 || needle_value.length() < 1) {
842+
return args.GetReturnValue().Set(-1);
843+
}
844+
845+
result = SearchString(reinterpret_cast<const uint16_t*>(haystack),
846+
haystack_length / 2,
847+
reinterpret_cast<const uint16_t*>(*needle_value),
848+
needle_value.length(),
849+
offset / 2);
850+
result *= 2;
851+
} else if (enc == UTF8) {
852+
String::Utf8Value needle_value(needle);
853+
if (*needle_value == nullptr)
854+
return args.GetReturnValue().Set(-1);
855+
856+
result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
857+
haystack_length,
858+
reinterpret_cast<const uint8_t*>(*needle_value),
859+
needle_length,
860+
offset);
861+
} else if (enc == BINARY) {
862+
uint8_t* needle_data = static_cast<uint8_t*>(malloc(needle_length));
863+
if (needle_data == nullptr) {
864+
return args.GetReturnValue().Set(-1);
865+
}
866+
needle->WriteOneByte(
867+
needle_data, 0, needle_length, String::NO_NULL_TERMINATION);
868+
869+
result = SearchString(reinterpret_cast<const uint8_t*>(haystack),
870+
haystack_length,
871+
needle_data,
872+
needle_length,
873+
offset);
874+
free(needle_data);
875+
}
841876

877+
args.GetReturnValue().Set(
878+
result == haystack_length ? -1 : static_cast<int>(result));
879+
}
842880

843881
void IndexOfBuffer(const FunctionCallbackInfo<Value>& args) {
844882
ASSERT(args[1]->IsObject());
845883
ASSERT(args[2]->IsNumber());
846884

885+
enum encoding enc = ParseEncoding(args.GetIsolate(),
886+
args[3],
887+
UTF8);
888+
847889
THROW_AND_RETURN_UNLESS_BUFFER(Environment::GetCurrent(args), args[0]);
848890
SPREAD_ARG(args[0], ts_obj);
849891
SPREAD_ARG(args[1], buf);
850-
const int32_t offset_i32 = args[2]->Int32Value();
851-
uint32_t offset;
852892

853893
if (buf_length > 0)
854894
CHECK_NE(buf_data, nullptr);
855895

856-
if (offset_i32 < 0) {
857-
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
896+
const char* haystack = ts_obj_data;
897+
const size_t haystack_length = ts_obj_length;
898+
const char* needle = buf_data;
899+
const size_t needle_length = buf_length;
900+
901+
if (needle_length == 0 || haystack_length == 0) {
902+
return args.GetReturnValue().Set(-1);
903+
}
904+
905+
int64_t offset_i64 = args[2]->IntegerValue();
906+
size_t offset = 0;
907+
908+
if (offset_i64 < 0) {
909+
if (offset_i64 + static_cast<int64_t>(haystack_length) < 0)
858910
offset = 0;
859911
else
860-
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
912+
offset = static_cast<size_t>(haystack_length + offset_i64);
861913
} else {
862-
offset = static_cast<uint32_t>(offset_i32);
914+
offset = static_cast<size_t>(offset_i64);
863915
}
864916

865-
if (buf_length == 0 ||
866-
ts_obj_length == 0 ||
867-
(offset != 0 && buf_length + offset <= buf_length) ||
868-
buf_length + offset > ts_obj_length)
917+
if (haystack_length < offset || needle_length + offset > haystack_length) {
869918
return args.GetReturnValue().Set(-1);
919+
}
870920

871-
int32_t r =
872-
IndexOf(ts_obj_data + offset, ts_obj_length - offset, buf_data, buf_length);
873-
args.GetReturnValue().Set(r == -1 ? -1 : static_cast<int32_t>(r + offset));
874-
}
921+
size_t result = haystack_length;
875922

923+
if (enc == UCS2) {
924+
if (haystack_length < 2 || needle_length < 2) {
925+
return args.GetReturnValue().Set(-1);
926+
}
927+
result = SearchString(
928+
reinterpret_cast<const uint16_t*>(haystack),
929+
haystack_length / 2,
930+
reinterpret_cast<const uint16_t*>(needle),
931+
needle_length / 2,
932+
offset / 2);
933+
result *= 2;
934+
} else {
935+
result = SearchString(
936+
reinterpret_cast<const uint8_t*>(haystack),
937+
haystack_length,
938+
reinterpret_cast<const uint8_t*>(needle),
939+
needle_length,
940+
offset);
941+
}
942+
943+
args.GetReturnValue().Set(
944+
result == haystack_length ? -1 : static_cast<int>(result));
945+
}
876946

877947
void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
878948
ASSERT(args[1]->IsNumber());
@@ -882,25 +952,25 @@ void IndexOfNumber(const FunctionCallbackInfo<Value>& args) {
882952
SPREAD_ARG(args[0], ts_obj);
883953

884954
uint32_t needle = args[1]->Uint32Value();
885-
int32_t offset_i32 = args[2]->Int32Value();
886-
uint32_t offset;
955+
int64_t offset_i64 = args[2]->IntegerValue();
956+
size_t offset;
887957

888-
if (offset_i32 < 0) {
889-
if (offset_i32 + static_cast<int32_t>(ts_obj_length) < 0)
958+
if (offset_i64 < 0) {
959+
if (offset_i64 + static_cast<int64_t>(ts_obj_length) < 0)
890960
offset = 0;
891961
else
892-
offset = static_cast<uint32_t>(ts_obj_length + offset_i32);
962+
offset = static_cast<size_t>(ts_obj_length + offset_i64);
893963
} else {
894-
offset = static_cast<uint32_t>(offset_i32);
964+
offset = static_cast<size_t>(offset_i64);
895965
}
896966

897967
if (ts_obj_length == 0 || offset + 1 > ts_obj_length)
898968
return args.GetReturnValue().Set(-1);
899969

900970
void* ptr = memchr(ts_obj_data + offset, needle, ts_obj_length - offset);
901971
char* ptr_char = static_cast<char*>(ptr);
902-
args.GetReturnValue().Set(
903-
ptr ? static_cast<int32_t>(ptr_char - ts_obj_data) : -1);
972+
args.GetReturnValue().Set(ptr ? static_cast<int>(ptr_char - ts_obj_data)
973+
: -1);
904974
}
905975

906976

Collapse file

‎src/string_search.cc‎

Copy file name to clipboard
+10Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#include "string_search.h"
2+
3+
namespace node {
4+
namespace stringsearch {
5+
6+
int StringSearchBase::kBadCharShiftTable[kUC16AlphabetSize];
7+
int StringSearchBase::kGoodSuffixShiftTable[kBMMaxShift + 1];
8+
int StringSearchBase::kSuffixTable[kBMMaxShift + 1];
9+
}
10+
} // namespace node::stringsearch

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.