Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit c6461a4

Browse filesBrowse files
authored
perf(storage): use google_crc32c.value for checksums (#16719)
Updates _ReadResumptionStrategy to use google_crc32c.value(data) instead of manually converting Checksum(data).digest() to an integer. Updated unit tests to mock google_crc32c.value accordingly. Test code ``` import timeit import os import google_crc32c from google_crc32c import Checksum def method1(data): return int.from_bytes(Checksum(data).digest(), "big") def method2(data): return google_crc32c.value(data) def benchmark(): # Testing larger sizes and more iterations # 1KB, 1MB, 10MB, 100MB data_sizes = [1024, 1024 * 1024, 10 * 1024 * 1024, 100 * 1024 * 1024] for size in data_sizes: data = os.urandom(size) print(f"\nData size: {size / (1024*1024):.2f} MB" if size >= 1024*1024 else f"\nData size: {size / 1024:.2f} KB") # Correctness check res1 = method1(data) res2 = method2(data) assert res1 == res2, f"Failed for size {size}: {res1} != {res2}" print("Assertion passed: results are equal.") # Increase iterations for more stable results, with a minimum of 1000 if size <= 1024: number = 100000 elif size <= 1024 * 1024: number = 10000 else: number = 1000 t1 = timeit.timeit(lambda: method1(data), number=number) t2 = timeit.timeit(lambda: method2(data), number=number) avg1 = t1 / number avg2 = t2 / number print(f"Method 1 (Checksum(data).digest()): {t1:.6f} s total ({avg1:.8f} s/call)") print(f"Method 2 (google_crc32c.value(data)): {t2:.6f} s total ({avg2:.8f} s/call)") print(f"Improvement: {(t1 - t2) / t1 * 100:.2f}%") if __name__ == "__main__": print(f"google_crc32c implementation: {google_crc32c.implementation}") benchmark() ``` Output ``` google_crc32c implementation: c Data size: 1.00 KB Assertion passed: results are equal. Method 1 (Checksum(data).digest()): 0.046088 s total (0.00000046 s/call) Method 2 (google_crc32c.value(data)): 0.016062 s total (0.00000016 s/call) Improvement: 65.15% Data size: 1.00 MB Assertion passed: results are equal. Method 1 (Checksum(data).digest()): 0.464044 s total (0.00004640 s/call) Method 2 (google_crc32c.value(data)): 0.439121 s total (0.00004391 s/call) Improvement: 5.37% Data size: 10.00 MB Assertion passed: results are equal. Method 1 (Checksum(data).digest()): 0.450953 s total (0.00045095 s/call) Method 2 (google_crc32c.value(data)): 0.445793 s total (0.00044579 s/call) Improvement: 1.14% Data size: 100.00 MB Assertion passed: results are equal. Method 1 (Checksum(data).digest()): 6.287893 s total (0.00628789 s/call) Method 2 (google_crc32c.value(data)): 6.095833 s total (0.00609583 s/call) Improvement: 3.05% ```
1 parent 7c8412a commit c6461a4
Copy full SHA for c6461a4

2 files changed

+12-20Lines changed: 12 additions & 20 deletions

File tree

Expand file treeCollapse file tree
Open diff view settings
Filter options
Expand file treeCollapse file tree
Open diff view settings
Collapse file

‎packages/google-cloud-storage/google/cloud/storage/asyncio/retry/reads_resumption_strategy.py‎

Copy file name to clipboardExpand all lines: packages/google-cloud-storage/google/cloud/storage/asyncio/retry/reads_resumption_strategy.py
+2-2Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import logging
1616
from typing import IO, Any, Dict, List
1717

18-
from google_crc32c import Checksum
18+
import google_crc32c
1919

2020
from google.cloud import _storage_v2 as storage_v2
2121
from google.cloud.storage.asyncio.retry._helpers import (
@@ -127,7 +127,7 @@ def update_state_from_response(
127127

128128
if checksummed_data.HasField("crc32c"):
129129
server_checksum = checksummed_data.crc32c
130-
client_checksum = int.from_bytes(Checksum(data).digest(), "big")
130+
client_checksum = google_crc32c.value(data)
131131
if server_checksum != client_checksum:
132132
raise DataCorruption(
133133
response,
Collapse file

‎packages/google-cloud-storage/tests/unit/asyncio/test_async_multi_range_downloader.py‎

Copy file name to clipboardExpand all lines: packages/google-cloud-storage/tests/unit/asyncio/test_async_multi_range_downloader.py
+10-18Lines changed: 10 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -324,10 +324,10 @@ def test_init_raises_if_crc32c_c_extension_is_missing(self, mock_google_crc32c):
324324
)
325325

326326
@pytest.mark.asyncio
327-
@mock.patch("google.cloud.storage.asyncio.retry.reads_resumption_strategy.Checksum")
328-
async def test_download_ranges_raises_on_checksum_mismatch(
329-
self, mock_checksum_class
330-
):
327+
@mock.patch(
328+
"google.cloud.storage.asyncio.retry.reads_resumption_strategy.google_crc32c.value"
329+
)
330+
async def test_download_ranges_raises_on_checksum_mismatch(self, mock_crc32c_value):
331331
from google.cloud.storage.asyncio._stream_multiplexer import _StreamMultiplexer
332332
from google.cloud.storage.asyncio.async_multi_range_downloader import (
333333
AsyncMultiRangeDownloader,
@@ -340,8 +340,7 @@ async def test_download_ranges_raises_on_checksum_mismatch(
340340

341341
test_data = b"some-data"
342342
server_checksum = 12345
343-
mock_checksum_instance = mock_checksum_class.return_value
344-
mock_checksum_instance.digest.return_value = (54321).to_bytes(4, "big")
343+
mock_crc32c_value.return_value = 54321
345344

346345
mock_response = _storage_v2.BidiReadObjectResponse(
347346
object_data_ranges=[
@@ -372,7 +371,7 @@ async def test_download_ranges_raises_on_checksum_mismatch(
372371
await mrd.download_ranges([(0, len(test_data), BytesIO())])
373372

374373
assert "Checksum mismatch" in str(exc_info.value)
375-
mock_checksum_class.assert_called_once_with(test_data)
374+
mock_crc32c_value.assert_called_once_with(test_data)
376375

377376
@mock.patch(
378377
"google.cloud.storage.asyncio.async_multi_range_downloader.AsyncMultiRangeDownloader.open",
@@ -575,18 +574,11 @@ async def staged_recv():
575574
# Act
576575
buffer = BytesIO()
577576

578-
# Patch Checksum where it is likely used (reads_resumption_strategy or similar),
579-
# but actually if we use google_crc32c directly, we should patch that or provide valid CRC.
580-
# Since we can't reliably predict where Checksum is imported/used without more digging,
581-
# let's provide a valid CRC for b"data".
582-
# Checksum(b"data").digest() -> needs to match crc32c=123.
583-
# But we can't force b"data" to have crc=123.
584-
# So we MUST patch Checksum.
585-
# It is used in google.cloud.storage.asyncio.retry.reads_resumption_strategy
577+
# Patch google_crc32c.value where it is used in reads_resumption_strategy
586578
with mock.patch(
587-
"google.cloud.storage.asyncio.retry.reads_resumption_strategy.Checksum"
588-
) as mock_chk:
589-
mock_chk.return_value.digest.return_value = (123).to_bytes(4, "big")
579+
"google.cloud.storage.asyncio.retry.reads_resumption_strategy.google_crc32c.value"
580+
) as mock_crc_value:
581+
mock_crc_value.return_value = 123
590582
await mock_mrd.download_ranges([(0, 4, buffer)])
591583

592584
# Assert

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.