Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 30ebe42

Browse files
authored
[opt](bloomfilter index) optimize memory usage for bloom filter index writer (#45833)
### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Optimize memory usage when adding string values to the bloom filter index. The writer now stores a uint64 hash of each string value instead of the string value itself, which is expected to save a lot of memory, especially for long text.
1 parent 6b51e9d commit 30ebe42
Copy full SHA for 30ebe42

File tree

Expand file treeCollapse file tree

3 files changed

+56
-12
lines changed
Filter options
Expand file treeCollapse file tree

3 files changed

+56
-12
lines changed

‎be/src/olap/rowset/segment_v2/bloom_filter.h

Copy file name to clipboardExpand all lines: be/src/olap/rowset/segment_v2/bloom_filter.h
+10Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,16 @@ class BloomFilter {
167167
return hash_code;
168168
}
169169

170+
// Hashes the `size` bytes starting at `buf` using the algorithm selected by `strategy`.
//
// Returns the 64-bit hash code when `strategy` is HASH_MURMUR3_X64_64 (computed with
// DEFAULT_SEED); any other strategy yields Status::InvalidArgument.
static Result<uint64_t> hash(const char* buf, uint32_t size, HashStrategyPB strategy) {
    // Reject unsupported strategies up front so the happy path stays unindented.
    if (strategy != HASH_MURMUR3_X64_64) {
        return Status::InvalidArgument("invalid strategy:{}", strategy);
    }

    uint64_t digest;
    murmur_hash3_x64_64(buf, size, DEFAULT_SEED, &digest);
    return digest;
}
179+
170180
virtual void add_bytes(const char* buf, uint32_t size) {
171181
if (buf == nullptr) {
172182
*_has_null = true;

‎be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp

Copy file name to clipboardExpand all lines: be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
+16-11Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,10 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter {
7878
for (int i = 0; i < count; ++i) {
7979
if (_values.find(*v) == _values.end()) {
8080
if constexpr (_is_slice_type()) {
81-
CppType new_value;
82-
RETURN_IF_CATCH_EXCEPTION(_type_info->deep_copy(&new_value, v, &_arena));
83-
_values.insert(new_value);
81+
const auto* s = reinterpret_cast<const Slice*>(v);
82+
auto hash =
83+
DORIS_TRY(BloomFilter::hash(s->data, s->size, _bf_options.strategy));
84+
_hash_values.insert(hash);
8485
} else if constexpr (_is_int128()) {
8586
int128_t new_value;
8687
memcpy(&new_value, v, sizeof(PackedInt128));
@@ -99,25 +100,28 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter {
99100
Status flush() override {
100101
std::unique_ptr<BloomFilter> bf;
101102
RETURN_IF_ERROR(BloomFilter::create(BLOCK_BLOOM_FILTER, &bf));
102-
RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy));
103-
bf->set_has_null(_has_null);
104-
for (auto& v : _values) {
105-
if constexpr (_is_slice_type()) {
106-
auto* s = (Slice*)&v;
107-
bf->add_bytes(s->data, s->size);
108-
} else {
103+
if constexpr (_is_slice_type()) {
104+
RETURN_IF_ERROR(bf->init(_hash_values.size(), _bf_options.fpp, _bf_options.strategy));
105+
for (const auto& h : _hash_values) {
106+
bf->add_hash(h);
107+
}
108+
} else {
109+
RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy));
110+
for (auto& v : _values) {
109111
bf->add_bytes((char*)&v, sizeof(CppType));
110112
}
111113
}
114+
bf->set_has_null(_has_null);
112115
_bf_buffer_size += bf->size();
113116
_bfs.push_back(std::move(bf));
114117
_values.clear();
118+
_hash_values.clear();
115119
_has_null = false;
116120
return Status::OK();
117121
}
118122

119123
Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) override {
120-
if (_values.size() > 0) {
124+
if (_values.size() > 0 || !_hash_values.empty()) {
121125
RETURN_IF_ERROR(flush());
122126
}
123127
index_meta->set_type(BLOOM_FILTER_INDEX);
@@ -166,6 +170,7 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter {
166170
// distinct values
167171
ValueDict _values;
168172
std::vector<std::unique_ptr<BloomFilter>> _bfs;
173+
std::set<uint64_t> _hash_values;
169174
};
170175

171176
} // namespace

‎be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp

Copy file name to clipboardExpand all lines: be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp
+30-1Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,12 @@ Status test_bloom_filter_index_reader_writer_template(
180180
}
181181
// test nullptr
182182
EXPECT_TRUE(bf->test_bytes(nullptr, 1));
183-
183+
if (is_slice_type) {
184+
Slice* value = (Slice*)(not_exist_value);
185+
EXPECT_FALSE(bf->test_bytes(value->data, value->size));
186+
} else {
187+
EXPECT_FALSE(bf->test_bytes((char*)not_exist_value, sizeof(CppType)));
188+
}
184189
delete reader;
185190
}
186191
return Status::OK();
@@ -803,5 +808,29 @@ TEST_F(BloomFilterIndexReaderWriterTest, test_bloom_filter_fpp_multiple) {
803808
test_bloom_filter_fpp(fpp);
804809
}
805810
}
811+
812+
// Round-trips 3072 VARCHAR slices of 256 bytes each through the bloom filter index
// writer/reader template and expects the whole run to succeed. Every slice is filled
// with a single repeated lowercase letter chosen by its index modulo 26.
TEST_F(BloomFilterIndexReaderWriterTest, test_slice_memory_usage) {
    size_t num = 1024 * 3;
    const size_t slice_size = 256;

    // One contiguous backing buffer; each Slice below points at its own 256-byte window.
    std::vector<char> data_buffer;
    data_buffer.resize(num * slice_size);

    std::vector<Slice> slice_vals(num);
    size_t idx = 0;
    for (Slice& entry : slice_vals) {
        char* window = data_buffer.data() + idx * slice_size;
        memset(window, 'a' + (idx % 26), slice_size);

        entry.data = window;
        entry.size = slice_size;
        ++idx;
    }

    // A value that was never added, handed to the template as the "not exist" probe.
    std::string not_exist_str = "not_exist_val";
    Slice not_exist_value(not_exist_str);

    auto st = test_bloom_filter_index_reader_writer_template<FieldType::OLAP_FIELD_TYPE_VARCHAR>(
            "bloom_filter_large_slices", slice_vals.data(), num, 1, &not_exist_value, true, false);
    EXPECT_TRUE(st.ok());
}
806835
} // namespace segment_v2
807836
} // namespace doris

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.