From 10f7cceefdacec0ece7c2a34fd7322069c73b1d4 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Thu, 10 Oct 2024 13:00:59 +0200
Subject: [PATCH 1/5] fix: use HttpHeaders type in Scrapy integration

---
 .pre-commit-config.yaml      | 6 ------
 src/apify/scrapy/requests.py | 3 ++-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4a0af384..f12158e9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,12 +13,6 @@ repos:
         language: system
         pass_filenames: false

-      - id: unit-tests
-        name: Run unit tests
-        entry: make unit-tests
-        language: system
-        pass_filenames: false
-
       - id: check-changelog-entry
         name: Check changelog entry
         entry: make check-changelog-entry
diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py
index dec279cf..2687a5e7 100644
--- a/src/apify/scrapy/requests.py
+++ b/src/apify/scrapy/requests.py
@@ -16,6 +16,7 @@
     ) from exc

 from crawlee import Request as CrawleeRequest
+from crawlee._types import HttpHeaders
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id

@@ -79,7 +80,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest

     # Convert Scrapy's headers to a dictionary and store them in the apify_request
     if isinstance(scrapy_request.headers, Headers):
-        apify_request.headers = dict(scrapy_request.headers.to_unicode_dict())
+        apify_request.headers = HttpHeaders(scrapy_request.headers.to_unicode_dict())
     else:
         Actor.log.warning(
             f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'

From 0c3f36da0f38c82feeec9e6d43d6af9aeab70096 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Thu, 10 Oct 2024 13:09:34 +0200
Subject: [PATCH 2/5] fix: standby_url is optional

---
 src/apify/_configuration.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py
index 71e50547..4078589b 100644
--- a/src/apify/_configuration.py
+++ b/src/apify/_configuration.py
@@ -8,6 +8,7 @@
 from typing_extensions import deprecated

 from crawlee._utils.models import timedelta_ms
+from crawlee._utils.urls import validate_http_url
 from crawlee.configuration import Configuration as CrawleeConfiguration


@@ -262,12 +263,13 @@ class Configuration(CrawleeConfiguration):
     ] = 4321

     standby_url: Annotated[
-        str,
+        str | None,
+        BeforeValidator(validate_http_url),
         Field(
             alias='actor_standby_url',
             description='URL for accessing web servers of Actor runs in Standby mode',
         ),
-    ]
+    ] = None

     token: Annotated[
         str | None,

From d765cd5ece0b85e306c6c2cdc50c3143dd2d7b5f Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Thu, 10 Oct 2024 13:26:34 +0200
Subject: [PATCH 3/5] fix actor env helpers test

---
 tests/unit/actor/test_actor_env_helpers.py | 216 ++++++++++----------
 1 file changed, 108 insertions(+), 108 deletions(-)

diff --git a/tests/unit/actor/test_actor_env_helpers.py b/tests/unit/actor/test_actor_env_helpers.py
index 51ed7257..94b83a7a 100644
--- a/tests/unit/actor/test_actor_env_helpers.py
+++ b/tests/unit/actor/test_actor_env_helpers.py
@@ -23,115 +23,115 @@
     import pytest


-class TestIsAtHome:
-    async def test_is_at_home_local(self) -> None:
-        async with Actor as actor:
-            is_at_home = actor.is_at_home()
-            assert is_at_home is False
-
-    async def test_is_at_home_on_apify(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        print('setenv')
-        monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'true')
-        async with Actor as actor:
-            is_at_home = actor.is_at_home()
-            assert is_at_home is True
-
-
-class TestGetEnv:
-    async def test_get_env_use_env_vars(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        ignored_env_vars = {
-            ApifyEnvVars.INPUT_KEY,
-            ApifyEnvVars.MEMORY_MBYTES,
-            ApifyEnvVars.STARTED_AT,
-            ApifyEnvVars.TIMEOUT_AT,
-            ApifyEnvVars.DEFAULT_DATASET_ID,
-            ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID,
-            ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID,
-            ApifyEnvVars.SDK_LATEST_VERSION,
-            ApifyEnvVars.LOG_FORMAT,
-            ApifyEnvVars.LOG_LEVEL,
-        }
-
-        legacy_env_vars = {
-            ApifyEnvVars.ACT_ID: ActorEnvVars.ID,
-            ApifyEnvVars.ACT_RUN_ID: ActorEnvVars.RUN_ID,
-            ApifyEnvVars.ACTOR_ID: ActorEnvVars.ID,
-            ApifyEnvVars.ACTOR_BUILD_ID: ActorEnvVars.BUILD_ID,
-            ApifyEnvVars.ACTOR_BUILD_NUMBER: ActorEnvVars.BUILD_NUMBER,
-            ApifyEnvVars.ACTOR_RUN_ID: ActorEnvVars.RUN_ID,
-            ApifyEnvVars.ACTOR_TASK_ID: ActorEnvVars.TASK_ID,
-            ApifyEnvVars.CONTAINER_URL: ActorEnvVars.WEB_SERVER_URL,
-            ApifyEnvVars.CONTAINER_PORT: ActorEnvVars.WEB_SERVER_PORT,
-        }
-
-        # Set up random env vars
-        expected_get_env: dict[str, Any] = {}
-        expected_get_env[ApifyEnvVars.LOG_LEVEL.name.lower()] = 'INFO'
-
-        for int_env_var in INTEGER_ENV_VARS:
-            if int_env_var in ignored_env_vars:
-                continue
-
-            int_get_env_var = int_env_var.name.lower()
-            expected_get_env[int_get_env_var] = random.randint(1, 99999)
-            monkeypatch.setenv(int_env_var, f'{expected_get_env[int_get_env_var]}')
-
-        for float_env_var in FLOAT_ENV_VARS:
-            if float_env_var in ignored_env_vars:
-                continue
-
-            float_get_env_var = float_env_var.name.lower()
-            expected_get_env[float_get_env_var] = random.random()
-            monkeypatch.setenv(float_env_var, f'{expected_get_env[float_get_env_var]}')
-
-        for bool_env_var in BOOL_ENV_VARS:
-            if bool_env_var in ignored_env_vars:
-                continue
-
-            bool_get_env_var = bool_env_var.name.lower()
-            expected_get_env[bool_get_env_var] = random.choice([True, False])
-            monkeypatch.setenv(bool_env_var, f'{"true" if expected_get_env[bool_get_env_var] else "false"}')
-
-        for datetime_env_var in DATETIME_ENV_VARS:
-            if datetime_env_var in ignored_env_vars:
-                continue
-
-            datetime_get_env_var = datetime_env_var.name.lower()
-            expected_get_env[datetime_get_env_var] = datetime.now(TzInfo(0))  # type: ignore
-            monkeypatch.setenv(
-                datetime_env_var,
-                expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
-            )
-
-        for string_env_var in STRING_ENV_VARS:
-            if string_env_var in ignored_env_vars:
-                continue
-
-            string_get_env_var = string_env_var.name.lower()
-            expected_get_env[string_get_env_var] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
-            monkeypatch.setenv(string_env_var, expected_get_env[string_get_env_var])
-
-        # We need this override so that the actor doesn't fail when connecting to the platform events websocket
-        monkeypatch.delenv(ActorEnvVars.EVENTS_WEBSOCKET_URL)
-        monkeypatch.delenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL)
-        expected_get_env[ActorEnvVars.EVENTS_WEBSOCKET_URL.name.lower()] = None
-        expected_get_env[ApifyEnvVars.ACTOR_EVENTS_WS_URL.name.lower()] = None
-
-        # Adjust expectations for timedelta fields
-        for env_name, env_value in expected_get_env.items():
-            if env_name.endswith('_millis'):
-                expected_get_env[env_name] = timedelta(milliseconds=env_value)
-
-        # Convert dedicated_cpus to float
-        expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float(
-            expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()]
-        )
-
-        # Update expectations for legacy configuration
-        for old_name, new_name in legacy_env_vars.items():
-            expected_get_env[old_name.name.lower()] = expected_get_env[new_name.name.lower()]
-
-        await Actor.init()
-        assert Actor.get_env() == expected_get_env
-
-        await Actor.exit()
+async def test_is_at_home_local() -> None:
+    async with Actor as actor:
+        is_at_home = actor.is_at_home()
+        assert is_at_home is False
+
+
+async def test_is_at_home_on_apify(monkeypatch: pytest.MonkeyPatch) -> None:
+    print('setenv')
+    monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'true')
+    async with Actor as actor:
+        is_at_home = actor.is_at_home()
+        assert is_at_home is True
+
+
+async def test_get_env_use_env_vars(monkeypatch: pytest.MonkeyPatch) -> None:
+    ignored_env_vars = {
+        ApifyEnvVars.INPUT_KEY,
+        ApifyEnvVars.MEMORY_MBYTES,
+        ApifyEnvVars.STARTED_AT,
+        ApifyEnvVars.TIMEOUT_AT,
+        ApifyEnvVars.DEFAULT_DATASET_ID,
+        ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID,
+        ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID,
+        ApifyEnvVars.SDK_LATEST_VERSION,
+        ApifyEnvVars.LOG_FORMAT,
+        ApifyEnvVars.LOG_LEVEL,
+        ActorEnvVars.STANDBY_PORT,
+    }
+
+    legacy_env_vars = {
+        ApifyEnvVars.ACT_ID: ActorEnvVars.ID,
+        ApifyEnvVars.ACT_RUN_ID: ActorEnvVars.RUN_ID,
+        ApifyEnvVars.ACTOR_ID: ActorEnvVars.ID,
+        ApifyEnvVars.ACTOR_BUILD_ID: ActorEnvVars.BUILD_ID,
+        ApifyEnvVars.ACTOR_BUILD_NUMBER: ActorEnvVars.BUILD_NUMBER,
+        ApifyEnvVars.ACTOR_RUN_ID: ActorEnvVars.RUN_ID,
+        ApifyEnvVars.ACTOR_TASK_ID: ActorEnvVars.TASK_ID,
+        ApifyEnvVars.CONTAINER_URL: ActorEnvVars.WEB_SERVER_URL,
+        ApifyEnvVars.CONTAINER_PORT: ActorEnvVars.WEB_SERVER_PORT,
+    }
+
+    # Set up random env vars
+    expected_get_env: dict[str, Any] = {}
+    expected_get_env[ApifyEnvVars.LOG_LEVEL.name.lower()] = 'INFO'
+
+    for int_env_var in INTEGER_ENV_VARS:
+        if int_env_var in ignored_env_vars:
+            continue
+
+        int_get_env_var = int_env_var.name.lower()
+        expected_get_env[int_get_env_var] = random.randint(1, 99999)
+        monkeypatch.setenv(int_env_var, f'{expected_get_env[int_get_env_var]}')
+
+    for float_env_var in FLOAT_ENV_VARS:
+        if float_env_var in ignored_env_vars:
+            continue
+
+        float_get_env_var = float_env_var.name.lower()
+        expected_get_env[float_get_env_var] = random.random()
+        monkeypatch.setenv(float_env_var, f'{expected_get_env[float_get_env_var]}')
+
+    for bool_env_var in BOOL_ENV_VARS:
+        if bool_env_var in ignored_env_vars:
+            continue
+
+        bool_get_env_var = bool_env_var.name.lower()
+        expected_get_env[bool_get_env_var] = random.choice([True, False])
+        monkeypatch.setenv(bool_env_var, f'{"true" if expected_get_env[bool_get_env_var] else "false"}')
+
+    for datetime_env_var in DATETIME_ENV_VARS:
+        if datetime_env_var in ignored_env_vars:
+            continue
+
+        datetime_get_env_var = datetime_env_var.name.lower()
+        expected_get_env[datetime_get_env_var] = datetime.now(TzInfo(0))  # type: ignore
+        monkeypatch.setenv(
+            datetime_env_var,
+            expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
+        )
+
+    for string_env_var in STRING_ENV_VARS:
+        if string_env_var in ignored_env_vars:
+            continue
+
+        string_get_env_var = string_env_var.name.lower()
+        expected_get_env[string_get_env_var] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
+        monkeypatch.setenv(string_env_var, expected_get_env[string_get_env_var])
+
+    # We need this override so that the actor doesn't fail when connecting to the platform events websocket
+    monkeypatch.delenv(ActorEnvVars.EVENTS_WEBSOCKET_URL)
+    monkeypatch.delenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL)
+    expected_get_env[ActorEnvVars.EVENTS_WEBSOCKET_URL.name.lower()] = None
+    expected_get_env[ApifyEnvVars.ACTOR_EVENTS_WS_URL.name.lower()] = None
+
+    # Adjust expectations for timedelta fields
+    for env_name, env_value in expected_get_env.items():
+        if env_name.endswith('_millis'):
+            expected_get_env[env_name] = timedelta(milliseconds=env_value)
+
+    # Convert dedicated_cpus to float
+    expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float(
+        expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()]
+    )
+
+    # Update expectations for legacy configuration
+    for old_name, new_name in legacy_env_vars.items():
+        expected_get_env[old_name.name.lower()] = expected_get_env[new_name.name.lower()]
+
+    await Actor.init()
+    assert Actor.get_env() == expected_get_env
+
+    await Actor.exit()

From 06688d8c6d2b674a2bad4f557742542efe7a1019 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Tue, 15 Oct 2024 09:10:28 +0200
Subject: [PATCH 4/5] tests are fixed

---
 .pre-commit-config.yaml                              |  6 +++---
 pyproject.toml                                       |  1 +
 src/apify/_configuration.py                          |  4 ++--
 src/apify/scrapy/requests.py                         |  8 +-------
 tests/unit/scrapy/requests/test_to_apify_request.py  |  4 +++-
 tests/unit/scrapy/requests/test_to_scrapy_request.py | 10 +++++-----
 6 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f12158e9..c1cdf523 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,8 +19,8 @@ repos:
         language: system
         pass_filenames: false

-      - id: check-version-conflict
-        name: Check version conflict
-        entry: make check-version-conflict
+      - id: check-version-availability
+        name: Check version availability
+        entry: make check-version-availability
         language: system
         pass_filenames: false
diff --git a/pyproject.toml b/pyproject.toml
index f8c8e72b..5149ca2d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -162,6 +162,7 @@ max-branches = 18

 [tool.pytest.ini_options]
 addopts = "-ra"
+asyncio_default_fixture_loop_scope = "function"
 asyncio_mode = "auto"
 timeout = 1200

diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py
index 4078589b..685cc53a 100644
--- a/src/apify/_configuration.py
+++ b/src/apify/_configuration.py
@@ -263,13 +263,13 @@ class Configuration(CrawleeConfiguration):
     ] = 4321

     standby_url: Annotated[
-        str | None,
+        str,
         BeforeValidator(validate_http_url),
         Field(
             alias='actor_standby_url',
             description='URL for accessing web servers of Actor runs in Standby mode',
         ),
-    ] = None
+    ] = 'http://localhost'

     token: Annotated[
         str | None,
diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py
index 2687a5e7..fbca4dfc 100644
--- a/src/apify/scrapy/requests.py
+++ b/src/apify/scrapy/requests.py
@@ -165,13 +165,7 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:

     # Add optional 'headers' field
     if apify_request.headers:
-        if isinstance(cast(Any, apify_request.headers), dict):
-            scrapy_request.headers = Headers(apify_request.headers)
-        else:
-            Actor.log.warning(
-                'apify_request[headers] is not an instance of the dict class, '
-                f'apify_request[headers] = {apify_request.headers}',
-            )
+        scrapy_request.headers |= Headers(apify_request.headers)

     # Add optional 'userData' field
     if apify_request.user_data:
diff --git a/tests/unit/scrapy/requests/test_to_apify_request.py b/tests/unit/scrapy/requests/test_to_apify_request.py
index 2cd61dfb..3da084dd 100644
--- a/tests/unit/scrapy/requests/test_to_apify_request.py
+++ b/tests/unit/scrapy/requests/test_to_apify_request.py
@@ -4,6 +4,8 @@
 from scrapy import Request, Spider
 from scrapy.http.headers import Headers

+from crawlee._types import HttpHeaders
+
 from apify.scrapy.requests import to_apify_request


@@ -36,7 +38,7 @@ def test__to_apify_request__headers(spider: Spider) -> None:
     apify_request = to_apify_request(scrapy_request, spider)

     assert apify_request is not None
-    assert apify_request.headers == dict(scrapy_request_headers.to_unicode_dict())
+    assert apify_request.headers == HttpHeaders(scrapy_request_headers.to_unicode_dict())


 def test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None:
diff --git a/tests/unit/scrapy/requests/test_to_scrapy_request.py b/tests/unit/scrapy/requests/test_to_scrapy_request.py
index 4cc69196..5b72d380 100644
--- a/tests/unit/scrapy/requests/test_to_scrapy_request.py
+++ b/tests/unit/scrapy/requests/test_to_scrapy_request.py
@@ -4,9 +4,9 @@
 import pytest
 from scrapy import Request, Spider
-from scrapy.http.headers import Headers

 from crawlee import Request as CrawleeRequest
+from crawlee._types import HttpHeaders

 from apify.scrapy.requests import to_scrapy_request


@@ -47,7 +47,7 @@ def test__to_scrapy_request__without_reconstruction_with_optional_fields(spider:
         method='GET',
         unique_key='https://crawlee.dev',
         id='fvwscO2UJLdr10B',
-        headers={'Authorization': 'Bearer access_token'},
+        headers=HttpHeaders({'Authorization': 'Bearer access_token'}),
         user_data={'some_user_data': 'test'},
     )

@@ -58,7 +58,7 @@ def test__to_scrapy_request__without_reconstruction_with_optional_fields(spider:
     assert apify_request.method == scrapy_request.method
     assert apify_request.id == scrapy_request.meta.get('apify_request_id')
     assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key')
-    assert Headers(apify_request.headers) == scrapy_request.headers
+    assert apify_request.headers.get('authorization') == scrapy_request.headers.get('authorization').decode()
     assert apify_request.user_data == scrapy_request.meta.get('userData')


@@ -91,7 +91,7 @@ def test__to_scrapy_request__with_reconstruction_with_optional_fields(spider: Sp
         method='GET',
         id='fvwscO2UJLdr10B',
         unique_key='https://apify.com',
-        headers={'Authorization': 'Bearer access_token'},
+        headers=HttpHeaders({'Authorization': 'Bearer access_token'}),
         user_data={
             'some_user_data': 'hello',
             'scrapy_request': 'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n',  # noqa: E501
@@ -105,7 +105,7 @@ def test__to_scrapy_request__with_reconstruction_with_optional_fields(spider: Sp
     assert apify_request.method == scrapy_request.method
     assert apify_request.id == scrapy_request.meta.get('apify_request_id')
     assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key')
-    assert Headers(apify_request.headers) == scrapy_request.headers
+    assert apify_request.headers.get('authorization') == scrapy_request.headers.get('authorization').decode()
     assert apify_request.user_data == scrapy_request.meta.get('userData')

From 09a4317328d0dcda66d3fb87ba8a6d9531d511de Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Tue, 15 Oct 2024 09:16:05 +0200
Subject: [PATCH 5/5] raise minimum required crawlee version

---
 pyproject.toml               | 2 +-
 src/apify/scrapy/requests.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5149ca2d..9c59f53d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,7 +48,7 @@ keywords = [
 python = "^3.9"
 apify-client = ">=1.8.1"
 apify-shared = ">=1.1.2"
-crawlee = ">=0.3.5"
+crawlee = ">=0.3.8"
 cryptography = ">=42.0.0"
 httpx = ">=0.27.0"
 lazy-object-proxy = ">=1.10.0"
diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py
index fbca4dfc..ec6bf997 100644
--- a/src/apify/scrapy/requests.py
+++ b/src/apify/scrapy/requests.py
@@ -78,7 +78,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
         id=request_id,
     )

-    # Convert Scrapy's headers to a dictionary and store them in the apify_request
+    # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
     if isinstance(scrapy_request.headers, Headers):
         apify_request.headers = HttpHeaders(scrapy_request.headers.to_unicode_dict())
     else:
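
Illustration (not part of the patch series): a minimal sketch of the header round-trip these commits settle on, assuming scrapy and crawlee>=0.3.8 are installed. The header value and variable names are made up; the lookups mirror what the updated unit tests assert.

    from crawlee._types import HttpHeaders
    from scrapy.http.headers import Headers

    # Scrapy keeps header values as bytes internally; to_unicode_dict() exposes
    # them as str -> str, which HttpHeaders accepts directly (as in to_apify_request).
    scrapy_headers = Headers({'Authorization': 'Bearer access_token'})
    apify_headers = HttpHeaders(scrapy_headers.to_unicode_dict())

    # HttpHeaders lookups are case-insensitive, which the updated tests rely on.
    assert apify_headers.get('authorization') == 'Bearer access_token'

    # The reverse direction (as in to_scrapy_request) merges the mapping back into
    # Scrapy headers, where values come back as bytes.
    scrapy_headers |= Headers(apify_headers)
    assert scrapy_headers.get('authorization').decode() == 'Bearer access_token'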