From 1af7934bfd99a6a34e2912e98329775f37ae53cb Mon Sep 17 00:00:00 2001 From: Martin Torp Date: Wed, 10 Jun 2026 13:59:31 +0200 Subject: [PATCH] Retry transient full-scan upload failures (502/503/504/408, dropped connections) (#232) * Retry transient full-scan upload failures (502/503/504/408, dropped connections) A full-scan upload can fail transiently at the gateway/connection level - an HTTP 502/503/504/408, a dropped or reset connection, or a client-side timeout - without the server having created the scan. The CLI previously made exactly one attempt, so an entire run (including a completed reachability analysis) died on a single transient failure even though a retried upload almost always succeeds. create_full_scan now retries the fullscans POST up to 3 total attempts with increasing waits (~10s, then ~30s, plus jitter) on transient failures only: APIBadGateway (502), APIConnectionError, APITimeout, and catch-all APIFailure whose embedded original_status_code is 408/503/504. Dedicated 4xx classes, catch-all 400s, and error payloads are never retried. In these failure modes the server never finished reading the request body, so no scan was created and a retry does not duplicate one; in the rare case where a gateway timeout races a request the server later completes, the extra scan is benign and superseded by the retry (as if the CLI had run twice). The retry loop lives inside the existing try/finally so the brotli-compressed .socket.facts.json.br temp files survive until every attempt has finished; fullscans.post rebuilds its lazy file loaders from the plain paths on every call, so re-invoking it per attempt is safe. Assisted-by: Claude Code:claude-opus-4-8 * docs: drop the 'retry almost always succeeds' claim from retry comments * Move transient-error classification into the SDK; simplify retry loop Address review feedback on the upload retry: - The retry decision now delegates to APIFailure.is_transient_error() (socketdev>=3.3.0, SocketDev/socket-sdk-python#93), which classifies by the HTTP status code the SDK records when raising. The CLI no longer encodes the SDK's exception hierarchy or parses status codes out of message text, so SDK restructuring can't silently break the classification. - The backoff schedule is now the single source of truth for the loop: FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0, None), where each entry is the wait before the next attempt and the final None re-raises instead of retrying. FULL_SCAN_UPLOAD_MAX_ATTEMPTS is computed from its length. Note: uv.lock is intentionally not regenerated yet - socketdev 3.3.0 must be released to PyPI first (blocked on socket-sdk-python#93). * Lock socketdev 3.3.0 socketdev 3.3.0 is now released, unblocking the >=3.3.0 floor bump. --- CHANGELOG.md | 19 ++ pyproject.toml | 4 +- socketsecurity/__init__.py | 2 +- socketsecurity/core/__init__.py | 44 ++++- tests/unit/test_full_scan_retry.py | 285 +++++++++++++++++++++++++++++ uv.lock | 10 +- 6 files changed, 355 insertions(+), 9 deletions(-) create mode 100644 tests/unit/test_full_scan_retry.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 060d486..e8d3d05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,24 @@ # Changelog +## 2.4.8 + +### Fixed: retry transient full-scan upload failures + +- The full-scan upload (`POST /orgs//full-scans`) now retries transient + gateway/connection failures — HTTP 502/503/504/408, dropped or reset connections, and + request timeouts — up to 3 total attempts with increasing waits (~10s, then ~30s, plus + jitter). Such failures are intermittent and a retried upload almost always succeeds. + In these failure modes the server never finished reading the request body, so no scan + was created and a retry does not duplicate one; in the rare case where a gateway + timeout races a request the server later completes, the extra scan is benign and + superseded by the retried one (as if the CLI had run twice). + Non-transient errors (400/401/403/404/429 and error payloads) are never retried. Each + retry logs a warning explaining what failed and when the next attempt happens. +- Requires `socketdev>=3.3.0`: the SDK now records the HTTP status code on the exceptions + it raises and owns the transient-vs-deterministic classification + (`APIFailure.is_transient_error()`), so the CLI no longer parses status codes out of + exception message text. + ## 2.4.7 ### Changed: pin @coana-tech/cli version; auto-update is now opt-in diff --git a/pyproject.toml b/pyproject.toml index 6bed831..7876f8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "hatchling.build" [project] name = "socketsecurity" -version = "2.4.7" +version = "2.4.8" requires-python = ">= 3.11" license = {"file" = "LICENSE"} dependencies = [ @@ -16,7 +16,7 @@ dependencies = [ 'GitPython', 'packaging', 'python-dotenv', - "socketdev>=3.2.1,<4.0.0", + "socketdev>=3.3.0,<4.0.0", "bs4>=0.0.2", "markdown>=3.10", "brotli>=1.0.9; platform_python_implementation == 'CPython'", diff --git a/socketsecurity/__init__.py b/socketsecurity/__init__.py index 91c7faf..9f1797f 100644 --- a/socketsecurity/__init__.py +++ b/socketsecurity/__init__.py @@ -1,3 +1,3 @@ __author__ = 'socket.dev' -__version__ = '2.4.7' +__version__ = '2.4.8' USER_AGENT = f'SocketPythonCLI/{__version__}' diff --git a/socketsecurity/core/__init__.py b/socketsecurity/core/__init__.py index 20ff28f..4e6c032 100644 --- a/socketsecurity/core/__init__.py +++ b/socketsecurity/core/__init__.py @@ -1,5 +1,6 @@ import logging import os +import random import re import sys import tarfile @@ -76,6 +77,21 @@ TIER1_FINALIZE_MAX_ATTEMPTS = 3 TIER1_FINALIZE_BACKOFF_SECONDS = 1.0 +# Full scan upload retry policy. An upload can fail transiently at the gateway/connection +# level (an HTTP 502/503/504/408, a dropped or reset connection, or a client-side timeout) +# without the server having created the scan; the SDK classifies those failures via +# APIFailure.is_transient_error() (socketdev>=3.3.0). In these failure modes no scan was +# created, so a retry does not duplicate one. (A duplicate is possible only if a gateway +# timeout races a request the server later completes; that is benign - the retried scan +# supersedes the orphaned one, same as running the CLI twice.) +# +# Each schedule entry is the wait before the next attempt once the current one fails (plus +# a little jitter so a fleet of CI jobs hitting the same failure doesn't retry in +# lock-step); the final None means the last attempt's failure is re-raised, not retried. +FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS = (10.0, 30.0, None) +FULL_SCAN_UPLOAD_MAX_ATTEMPTS = len(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS) +FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS = 2.0 + def _humanize_alert_type(alert_type: str) -> str: """Convert a camelCase/PascalCase alert type into a Title-Cased label. @@ -787,7 +803,33 @@ def create_full_scan(self, files: List[str], params: FullScanParams, base_paths: # facts file under the per-file upload size cap. See _compress_facts_files_for_upload. upload_files, compressed_temp_files = self._compress_facts_files_for_upload(files) try: - res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths) + # Retry transient gateway/timeout failures (502/503/504/408, dropped connections, + # timeouts) with increasing waits. In these failure modes the server never finished + # reading the request body, so no scan was created and a retry does not duplicate + # one (see the retry-policy comment above FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS). + # fullscans.post() rebuilds its lazy file loaders from the plain paths in + # upload_files on every call, so simply calling it again per attempt is safe. The + # loop must stay inside this try so the temp .br files (cleaned up in the finally + # below) outlive every attempt. + for attempt, backoff_seconds in enumerate(FULL_SCAN_UPLOAD_BACKOFF_SCHEDULE_SECONDS, start=1): + try: + res = self.sdk.fullscans.post(upload_files, params, use_types=True, use_lazy_loading=True, max_open_files=50, base_paths=base_paths) + break + except APIFailure as error: + if backoff_seconds is None or not error.is_transient_error(): + raise + wait_seconds = backoff_seconds + random.uniform( + 0, FULL_SCAN_UPLOAD_BACKOFF_JITTER_SECONDS + ) + # SDK error messages can span many lines (path + response headers); the + # first line carries the status, which is all the warning needs. + error_summary = str(error).strip().splitlines()[0] if str(error).strip() else "" + log.warning( + f"Full scan upload failed with {type(error).__name__}({error_summary}), " + f"retrying in {wait_seconds:.0f}s " + f"(attempt {attempt + 1}/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})" + ) + time.sleep(wait_seconds) finally: for temp_file in compressed_temp_files: try: diff --git a/tests/unit/test_full_scan_retry.py b/tests/unit/test_full_scan_retry.py new file mode 100644 index 0000000..b31bb11 --- /dev/null +++ b/tests/unit/test_full_scan_retry.py @@ -0,0 +1,285 @@ +"""Tests for the full-scan upload retry on transient gateway/connection failures. + +A `POST /orgs//full-scans` upload can fail transiently (an HTTP 502/503/504/408, a +dropped or reset connection, or a timeout) without the server having created the scan. +`Core.create_full_scan` retries the failures the SDK classifies as transient +(`APIFailure.is_transient_error()`, socketdev>=3.3.0); these tests cover the retry +decision, the loop bounds, and that the temporary brotli-compressed facts files survive +until every attempt has finished. +""" + +import logging +from unittest.mock import MagicMock + +import pytest +from socketdev.exceptions import ( + APIAccessDenied, + APIBadGateway, + APIConnectionError, + APIFailure, + APIResourceNotFound, + APITimeout, +) +from socketdev.fullscans import FullScanMetadata + +from socketsecurity.core import ( + FULL_SCAN_UPLOAD_MAX_ATTEMPTS, + SOCKET_FACTS_BROTLI_FILENAME, + SOCKET_FACTS_FILENAME, + Core, +) + + +def _success_response(): + metadata = FullScanMetadata( + id="scan-1", + created_at="2026-01-01T00:00:00Z", + updated_at="2026-01-01T00:00:00Z", + organization_id="org-1", + repository_id="repo-1", + branch="main", + html_report_url="https://socket.dev/report", + ) + response = MagicMock() + response.success = True + response.data = metadata + return response + + +# Catch-all APIFailure as the SDK raises it for statuses without a dedicated class +# (socketdev/core/api.py); the recorded status_code drives is_transient_error(). +def _catch_all_failure(status_code: int) -> APIFailure: + return APIFailure( + f"Bad Request: HTTP original_status_code:{status_code}\n" + f"Path: https://api.socket.dev/v0/orgs/org/full-scans\n\n" + f"Headers:\ncf-ray: abc123", + status_code=status_code, + ) + + +@pytest.fixture +def core_with_mock_sdk(): + # Build a Core without running org setup; we only exercise create_full_scan. + core = Core.__new__(Core) + core.sdk = MagicMock() + core.cli_config = None # skip the tier1 finalize branch + return core + + +@pytest.fixture(autouse=True) +def no_sleep(mocker): + return mocker.patch("socketsecurity.core.time.sleep") + + +def test_upload_succeeds_first_try(core_with_mock_sdk, tmp_path, no_sleep): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.return_value = _success_response() + + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +def test_upload_retries_on_502_then_succeeds( + core_with_mock_sdk, tmp_path, no_sleep, caplog +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + APIBadGateway(), + APIBadGateway(), + _success_response(), + ] + + with caplog.at_level(logging.WARNING, logger="socketdev"): + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 3 + assert no_sleep.call_count == 2 # waits before attempts 2 and 3 + retry_warnings = [r for r in caplog.records if "retrying in" in r.message] + assert len(retry_warnings) == 2 + assert "APIBadGateway" in retry_warnings[0].message + assert f"(attempt 2/{FULL_SCAN_UPLOAD_MAX_ATTEMPTS})" in retry_warnings[0].message + + +def test_upload_raises_after_exhausting_attempts( + core_with_mock_sdk, tmp_path, no_sleep +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = APIBadGateway() + + with pytest.raises(APIBadGateway): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert ( + core_with_mock_sdk.sdk.fullscans.post.call_count + == FULL_SCAN_UPLOAD_MAX_ATTEMPTS + ) + + +@pytest.mark.parametrize("status_code", [408, 503, 504]) +def test_upload_retries_on_catch_all_transient_statuses( + core_with_mock_sdk, tmp_path, no_sleep, status_code +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + _catch_all_failure(status_code), + _success_response(), + ] + + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 2 + + +@pytest.mark.parametrize("error_class", [APIConnectionError, APITimeout]) +def test_upload_retries_on_connection_level_errors( + core_with_mock_sdk, tmp_path, no_sleep, error_class +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + error_class(), + _success_response(), + ] + + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert full_scan.id == "scan-1" + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 2 + + +def test_upload_does_not_retry_on_400(core_with_mock_sdk, tmp_path, no_sleep): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = _catch_all_failure(400) + + with pytest.raises(APIFailure): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +@pytest.mark.parametrize( + "error_class,status_code", [(APIAccessDenied, 401), (APIResourceNotFound, 404)] +) +def test_upload_does_not_retry_on_dedicated_4xx_classes( + core_with_mock_sdk, tmp_path, no_sleep, error_class, status_code +): + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = error_class( + status_code=status_code + ) + + with pytest.raises(error_class): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +def test_upload_does_not_retry_on_error_payload(core_with_mock_sdk, tmp_path, no_sleep): + # A response that came back but reports failure (res.success False) is not transient. + manifest = tmp_path / "package.json" + manifest.write_text("{}") + failed = MagicMock() + failed.success = False + failed.message = "tarball too large" + failed.status = 200 + core_with_mock_sdk.sdk.fullscans.post.return_value = failed + + with pytest.raises(Exception, match="tarball too large"): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == 1 + no_sleep.assert_not_called() + + +def test_temp_br_file_survives_retries_and_is_cleaned_after( + core_with_mock_sdk, tmp_path, no_sleep +): + # The brotli-compressed facts sibling must stay on disk across every retry attempt + # (the SDK re-reads it per call) and only be deleted once all attempts finished. + facts = tmp_path / SOCKET_FACTS_FILENAME + facts.write_text('{"components": []}') + compressed = tmp_path / SOCKET_FACTS_BROTLI_FILENAME + br_present_per_attempt = [] + + def post_side_effect(upload_files, *args, **kwargs): + br_present_per_attempt.append(compressed.is_file()) + assert str(compressed) in upload_files + if len(br_present_per_attempt) < 3: + raise APIBadGateway() + return _success_response() + + core_with_mock_sdk.sdk.fullscans.post.side_effect = post_side_effect + + full_scan = core_with_mock_sdk.create_full_scan([str(facts)], MagicMock()) + + assert full_scan.id == "scan-1" + assert br_present_per_attempt == [True, True, True] + assert not compressed.exists() # cleaned up after the attempts finished + assert facts.is_file() # the original facts file is never touched + + +def test_temp_br_file_cleaned_after_exhausted_retries( + core_with_mock_sdk, tmp_path, no_sleep +): + facts = tmp_path / SOCKET_FACTS_FILENAME + facts.write_text('{"components": []}') + compressed = tmp_path / SOCKET_FACTS_BROTLI_FILENAME + core_with_mock_sdk.sdk.fullscans.post.side_effect = APIBadGateway() + + with pytest.raises(APIBadGateway): + core_with_mock_sdk.create_full_scan([str(facts)], MagicMock()) + + assert ( + core_with_mock_sdk.sdk.fullscans.post.call_count + == FULL_SCAN_UPLOAD_MAX_ATTEMPTS + ) + assert not compressed.exists() + + +class _StubFailure(APIFailure): + """An APIFailure whose transience is fixed, regardless of class or status code.""" + + def __init__(self, transient: bool): + super().__init__("stub failure") + self._transient = transient + + def is_transient_error(self) -> bool: + return self._transient + + +@pytest.mark.parametrize("transient,expected_calls", [(True, 2), (False, 1)]) +def test_retry_decision_delegates_to_sdk_classification( + core_with_mock_sdk, tmp_path, no_sleep, transient, expected_calls +): + # The CLI encodes no knowledge of the SDK's exception hierarchy or status codes: + # the retry decision is exactly APIFailure.is_transient_error(). (The transient / + # non-transient truth table itself is tested in the SDK, next to the code that + # raises the exceptions.) + manifest = tmp_path / "package.json" + manifest.write_text("{}") + core_with_mock_sdk.sdk.fullscans.post.side_effect = [ + _StubFailure(transient), + _success_response(), + ] + + if transient: + full_scan = core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + assert full_scan.id == "scan-1" + else: + with pytest.raises(_StubFailure): + core_with_mock_sdk.create_full_scan([str(manifest)], MagicMock()) + + assert core_with_mock_sdk.sdk.fullscans.post.call_count == expected_calls diff --git a/uv.lock b/uv.lock index 29cd641..0ed1361 100644 --- a/uv.lock +++ b/uv.lock @@ -1270,20 +1270,20 @@ wheels = [ [[package]] name = "socketdev" -version = "3.2.1" +version = "3.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "requests" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/21/65/07df2bf6e490c56544fb06e4cfde059b2572fdd5b02ff352c766b1d5f7ce/socketdev-3.2.1.tar.gz", hash = "sha256:7db910a98628473e8ec06822deb01b6bd465b385e9e8ea405f2b7526e8258074", size = 179279, upload-time = "2026-06-03T18:08:19.806Z" } +sdist = { url = "https://files.pythonhosted.org/packages/25/30/16155f7f27d18274f364b3bd3506ee45d17f53fc8938aaea9a618054449b/socketdev-3.3.0.tar.gz", hash = "sha256:3d60bd4ac3201e9d581b1fe02bf2e6aef1b90c13ae75d15a8664aa9ef966734e", size = 181519, upload-time = "2026-06-10T11:41:17.942Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/53/01/fff70923755b3a187ca971189fb078a2aaedcad42d682abfdd06f3445def/socketdev-3.2.1-py3-none-any.whl", hash = "sha256:6dc762d78baea8011dc22f2afe49c84c926e640a6879bd7b58c3abdd4e29e8bb", size = 67266, upload-time = "2026-06-03T18:08:18.029Z" }, + { url = "https://files.pythonhosted.org/packages/33/dd/25622e033182e8c744d2420bb4f056206edc096a1e5ce8e4af4b0a0c0791/socketdev-3.3.0-py3-none-any.whl", hash = "sha256:513c045ce42bdd6cc2bb66a527f5863e0c399e56dbdcb1832cd5d94a5fb1a5e4", size = 67956, upload-time = "2026-06-10T11:41:16.534Z" }, ] [[package]] name = "socketsecurity" -version = "2.4.7" +version = "2.4.8" source = { editable = "." } dependencies = [ { name = "brotli", marker = "platform_python_implementation == 'CPython'" }, @@ -1340,7 +1340,7 @@ requires-dist = [ { name = "python-dotenv" }, { name = "requests" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.3.0" }, - { name = "socketdev", specifier = ">=3.2.1,<4.0.0" }, + { name = "socketdev", specifier = ">=3.3.0,<4.0.0" }, { name = "twine", marker = "extra == 'dev'" }, { name = "uv", marker = "extra == 'dev'", specifier = ">=0.1.0" }, ]