fix: use HttpHeaders type in Scrapy integration #289

Merged
merged 5 commits into from Oct 15, 2024
.pre-commit-config.yaml: 12 changes (3 additions, 9 deletions)
@@ -13,20 +13,14 @@ repos:
language: system
pass_filenames: false

- id: unit-tests
name: Run unit tests
entry: make unit-tests
language: system
pass_filenames: false

- id: check-changelog-entry
name: Check changelog entry
entry: make check-changelog-entry
language: system
pass_filenames: false

- id: check-version-conflict
name: Check version conflict
entry: make check-version-conflict
- id: check-version-availability
name: Check version availability
entry: make check-version-availability
language: system
pass_filenames: false
pyproject.toml: 3 changes (2 additions, 1 deletion)
@@ -48,7 +48,7 @@ keywords = [
python = "^3.9"
apify-client = ">=1.8.1"
apify-shared = ">=1.1.2"
crawlee = ">=0.3.5"
crawlee = ">=0.3.8"
cryptography = ">=42.0.0"
httpx = ">=0.27.0"
lazy-object-proxy = ">=1.10.0"
@@ -162,6 +162,7 @@ max-branches = 18

[tool.pytest.ini_options]
addopts = "-ra"
asyncio_default_fixture_loop_scope = "function"
asyncio_mode = "auto"
timeout = 1200

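A side note on the pytest options added above: recent pytest-asyncio releases warn when the default fixture loop scope is left unset, which is why asyncio_default_fixture_loop_scope is pinned alongside the existing asyncio_mode = "auto". As a rough sketch (the test name and body are illustrative, not from this repository), auto mode lets a plain coroutine be collected as a test without an explicit marker:

import asyncio


async def test_sleep_yields_control() -> None:
    # With asyncio_mode = "auto", pytest-asyncio collects this coroutine as a
    # test without an @pytest.mark.asyncio marker; fixtures default to a
    # function-scoped event loop per asyncio_default_fixture_loop_scope.
    assert await asyncio.sleep(0) is None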
src/apify/_configuration.py: 4 changes (3 additions, 1 deletion)
@@ -8,6 +8,7 @@
from typing_extensions import deprecated

from crawlee._utils.models import timedelta_ms
from crawlee._utils.urls import validate_http_url
from crawlee.configuration import Configuration as CrawleeConfiguration


@@ -263,11 +264,12 @@ class Configuration(CrawleeConfiguration):

standby_url: Annotated[
str,
BeforeValidator(validate_http_url),
Field(
alias='actor_standby_url',
description='URL for accessing web servers of Actor runs in Standby mode',
),
]
] = 'http://localhost'

token: Annotated[
str | None,
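For context on the standby_url change above, the field now gets a default of 'http://localhost' and a BeforeValidator that checks the value before assignment. A minimal, self-contained sketch of that pattern follows; check_url is a hypothetical stand-in for crawlee's validate_http_url, whose exact behaviour is not shown in this diff.

from typing import Annotated

from pydantic import BaseModel, BeforeValidator, Field


def check_url(value: str) -> str:
    # Stand-in validator: the real field uses crawlee._utils.urls.validate_http_url.
    if not value.startswith(('http://', 'https://')):
        raise ValueError(f'Invalid HTTP URL: {value!r}')
    return value


class StandbyConfig(BaseModel):
    standby_url: Annotated[
        str,
        BeforeValidator(check_url),
        Field(description='URL for accessing web servers of Actor runs in Standby mode'),
    ] = 'http://localhost'


assert StandbyConfig().standby_url == 'http://localhost'
assert StandbyConfig(standby_url='https://example.com').standby_url == 'https://example.com'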
src/apify/scrapy/requests.py: 13 changes (4 additions, 9 deletions)
@@ -16,6 +16,7 @@
) from exc

from crawlee import Request as CrawleeRequest
from crawlee._types import HttpHeaders
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id

@@ -77,9 +78,9 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
id=request_id,
)

# Convert Scrapy's headers to a dictionary and store them in the apify_request
# Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
if isinstance(scrapy_request.headers, Headers):
apify_request.headers = dict(scrapy_request.headers.to_unicode_dict())
apify_request.headers = HttpHeaders(scrapy_request.headers.to_unicode_dict())
else:
Actor.log.warning(
f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
@@ -164,13 +165,7 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:

# Add optional 'headers' field
if apify_request.headers:
if isinstance(cast(Any, apify_request.headers), dict):
scrapy_request.headers = Headers(apify_request.headers)
else:
Actor.log.warning(
'apify_request[headers] is not an instance of the dict class, '
f'apify_request[headers] = {apify_request.headers}',
)
scrapy_request.headers |= Headers(apify_request.headers)

# Add optional 'userData' field
if apify_request.user_data:
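As a rough illustration of the conversion this module now performs, the snippet below round-trips headers between the two types. It assumes HttpHeaders accepts a plain dict of strings (as in crawlee) and that Scrapy's Headers can be merged in place with |=, which is what the new to_scrapy_request code above relies on.

from crawlee._types import HttpHeaders
from scrapy.http.headers import Headers

scrapy_headers = Headers({'Authorization': 'Bearer access_token'})

# Scrapy -> Apify: Scrapy stores header values as bytes, so decode to
# unicode before building the HttpHeaders instance.
apify_headers = HttpHeaders(scrapy_headers.to_unicode_dict())

# Apify -> Scrapy: merge the converted headers back into a Headers object,
# mirroring `scrapy_request.headers |= Headers(apify_request.headers)`.
restored = Headers()
restored |= Headers(dict(apify_headers))

assert restored.get('Authorization') == b'Bearer access_token'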
tests/unit/actor/test_actor_env_helpers.py: 216 changes (108 additions, 108 deletions)
@@ -23,115 +23,115 @@
import pytest


class TestIsAtHome:
async def test_is_at_home_local(self) -> None:
async with Actor as actor:
is_at_home = actor.is_at_home()
assert is_at_home is False

async def test_is_at_home_on_apify(self, monkeypatch: pytest.MonkeyPatch) -> None:
print('setenv')
monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'true')
async with Actor as actor:
is_at_home = actor.is_at_home()
assert is_at_home is True


class TestGetEnv:
async def test_get_env_use_env_vars(self, monkeypatch: pytest.MonkeyPatch) -> None:
ignored_env_vars = {
ApifyEnvVars.INPUT_KEY,
ApifyEnvVars.MEMORY_MBYTES,
ApifyEnvVars.STARTED_AT,
ApifyEnvVars.TIMEOUT_AT,
ApifyEnvVars.DEFAULT_DATASET_ID,
ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID,
ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID,
ApifyEnvVars.SDK_LATEST_VERSION,
ApifyEnvVars.LOG_FORMAT,
ApifyEnvVars.LOG_LEVEL,
}

legacy_env_vars = {
ApifyEnvVars.ACT_ID: ActorEnvVars.ID,
ApifyEnvVars.ACT_RUN_ID: ActorEnvVars.RUN_ID,
ApifyEnvVars.ACTOR_ID: ActorEnvVars.ID,
ApifyEnvVars.ACTOR_BUILD_ID: ActorEnvVars.BUILD_ID,
ApifyEnvVars.ACTOR_BUILD_NUMBER: ActorEnvVars.BUILD_NUMBER,
ApifyEnvVars.ACTOR_RUN_ID: ActorEnvVars.RUN_ID,
ApifyEnvVars.ACTOR_TASK_ID: ActorEnvVars.TASK_ID,
ApifyEnvVars.CONTAINER_URL: ActorEnvVars.WEB_SERVER_URL,
ApifyEnvVars.CONTAINER_PORT: ActorEnvVars.WEB_SERVER_PORT,
}

# Set up random env vars
expected_get_env: dict[str, Any] = {}
expected_get_env[ApifyEnvVars.LOG_LEVEL.name.lower()] = 'INFO'

for int_env_var in INTEGER_ENV_VARS:
if int_env_var in ignored_env_vars:
continue

int_get_env_var = int_env_var.name.lower()
expected_get_env[int_get_env_var] = random.randint(1, 99999)
monkeypatch.setenv(int_env_var, f'{expected_get_env[int_get_env_var]}')

for float_env_var in FLOAT_ENV_VARS:
if float_env_var in ignored_env_vars:
continue

float_get_env_var = float_env_var.name.lower()
expected_get_env[float_get_env_var] = random.random()
monkeypatch.setenv(float_env_var, f'{expected_get_env[float_get_env_var]}')

for bool_env_var in BOOL_ENV_VARS:
if bool_env_var in ignored_env_vars:
continue

bool_get_env_var = bool_env_var.name.lower()
expected_get_env[bool_get_env_var] = random.choice([True, False])
monkeypatch.setenv(bool_env_var, f'{"true" if expected_get_env[bool_get_env_var] else "false"}')

for datetime_env_var in DATETIME_ENV_VARS:
if datetime_env_var in ignored_env_vars:
continue

datetime_get_env_var = datetime_env_var.name.lower()
expected_get_env[datetime_get_env_var] = datetime.now(TzInfo(0)) # type: ignore
monkeypatch.setenv(
datetime_env_var,
expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
)

for string_env_var in STRING_ENV_VARS:
if string_env_var in ignored_env_vars:
continue

string_get_env_var = string_env_var.name.lower()
expected_get_env[string_get_env_var] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
monkeypatch.setenv(string_env_var, expected_get_env[string_get_env_var])

# We need this override so that the actor doesn't fail when connecting to the platform events websocket
monkeypatch.delenv(ActorEnvVars.EVENTS_WEBSOCKET_URL)
monkeypatch.delenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL)
expected_get_env[ActorEnvVars.EVENTS_WEBSOCKET_URL.name.lower()] = None
expected_get_env[ApifyEnvVars.ACTOR_EVENTS_WS_URL.name.lower()] = None

# Adjust expectations for timedelta fields
for env_name, env_value in expected_get_env.items():
if env_name.endswith('_millis'):
expected_get_env[env_name] = timedelta(milliseconds=env_value)

# Convert dedicated_cpus to float
expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float(
expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()]
async def test_is_at_home_local() -> None:
async with Actor as actor:
is_at_home = actor.is_at_home()
assert is_at_home is False


async def test_is_at_home_on_apify(monkeypatch: pytest.MonkeyPatch) -> None:
print('setenv')
monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'true')
async with Actor as actor:
is_at_home = actor.is_at_home()
assert is_at_home is True


async def test_get_env_use_env_vars(monkeypatch: pytest.MonkeyPatch) -> None:
ignored_env_vars = {
ApifyEnvVars.INPUT_KEY,
ApifyEnvVars.MEMORY_MBYTES,
ApifyEnvVars.STARTED_AT,
ApifyEnvVars.TIMEOUT_AT,
ApifyEnvVars.DEFAULT_DATASET_ID,
ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID,
ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID,
ApifyEnvVars.SDK_LATEST_VERSION,
ApifyEnvVars.LOG_FORMAT,
ApifyEnvVars.LOG_LEVEL,
ActorEnvVars.STANDBY_PORT,
}

legacy_env_vars = {
ApifyEnvVars.ACT_ID: ActorEnvVars.ID,
ApifyEnvVars.ACT_RUN_ID: ActorEnvVars.RUN_ID,
ApifyEnvVars.ACTOR_ID: ActorEnvVars.ID,
ApifyEnvVars.ACTOR_BUILD_ID: ActorEnvVars.BUILD_ID,
ApifyEnvVars.ACTOR_BUILD_NUMBER: ActorEnvVars.BUILD_NUMBER,
ApifyEnvVars.ACTOR_RUN_ID: ActorEnvVars.RUN_ID,
ApifyEnvVars.ACTOR_TASK_ID: ActorEnvVars.TASK_ID,
ApifyEnvVars.CONTAINER_URL: ActorEnvVars.WEB_SERVER_URL,
ApifyEnvVars.CONTAINER_PORT: ActorEnvVars.WEB_SERVER_PORT,
}

# Set up random env vars
expected_get_env: dict[str, Any] = {}
expected_get_env[ApifyEnvVars.LOG_LEVEL.name.lower()] = 'INFO'

for int_env_var in INTEGER_ENV_VARS:
if int_env_var in ignored_env_vars:
continue

int_get_env_var = int_env_var.name.lower()
expected_get_env[int_get_env_var] = random.randint(1, 99999)
monkeypatch.setenv(int_env_var, f'{expected_get_env[int_get_env_var]}')

for float_env_var in FLOAT_ENV_VARS:
if float_env_var in ignored_env_vars:
continue

float_get_env_var = float_env_var.name.lower()
expected_get_env[float_get_env_var] = random.random()
monkeypatch.setenv(float_env_var, f'{expected_get_env[float_get_env_var]}')

for bool_env_var in BOOL_ENV_VARS:
if bool_env_var in ignored_env_vars:
continue

bool_get_env_var = bool_env_var.name.lower()
expected_get_env[bool_get_env_var] = random.choice([True, False])
monkeypatch.setenv(bool_env_var, f'{"true" if expected_get_env[bool_get_env_var] else "false"}')

for datetime_env_var in DATETIME_ENV_VARS:
if datetime_env_var in ignored_env_vars:
continue

datetime_get_env_var = datetime_env_var.name.lower()
expected_get_env[datetime_get_env_var] = datetime.now(TzInfo(0)) # type: ignore
monkeypatch.setenv(
datetime_env_var,
expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
)

# Update expectations for legacy configuration
for old_name, new_name in legacy_env_vars.items():
expected_get_env[old_name.name.lower()] = expected_get_env[new_name.name.lower()]
for string_env_var in STRING_ENV_VARS:
if string_env_var in ignored_env_vars:
continue

await Actor.init()
assert Actor.get_env() == expected_get_env
string_get_env_var = string_env_var.name.lower()
expected_get_env[string_get_env_var] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
monkeypatch.setenv(string_env_var, expected_get_env[string_get_env_var])

await Actor.exit()
# We need this override so that the actor doesn't fail when connecting to the platform events websocket
monkeypatch.delenv(ActorEnvVars.EVENTS_WEBSOCKET_URL)
monkeypatch.delenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL)
expected_get_env[ActorEnvVars.EVENTS_WEBSOCKET_URL.name.lower()] = None
expected_get_env[ApifyEnvVars.ACTOR_EVENTS_WS_URL.name.lower()] = None

# Adjust expectations for timedelta fields
for env_name, env_value in expected_get_env.items():
if env_name.endswith('_millis'):
expected_get_env[env_name] = timedelta(milliseconds=env_value)

# Convert dedicated_cpus to float
expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float(
expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()]
)

# Update expectations for legacy configuration
for old_name, new_name in legacy_env_vars.items():
expected_get_env[old_name.name.lower()] = expected_get_env[new_name.name.lower()]

await Actor.init()
assert Actor.get_env() == expected_get_env

await Actor.exit()
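The legacy_env_vars mapping above is what makes Actor.get_env() report the same value under both the new Actor-prefixed name and its legacy alias. A hedged, minimal version of that expectation (reusing the imports this test module already depends on) could look like this:

import pytest

from apify import Actor
from apify_shared.consts import ActorEnvVars, ApifyEnvVars


async def test_web_server_url_legacy_alias(monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.setenv(ActorEnvVars.WEB_SERVER_URL, 'http://localhost:4321')
    async with Actor:
        env = Actor.get_env()
    # The same URL should appear under both the new and the legacy key.
    assert env[ActorEnvVars.WEB_SERVER_URL.name.lower()] == 'http://localhost:4321'
    assert env[ApifyEnvVars.CONTAINER_URL.name.lower()] == 'http://localhost:4321'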
tests/unit/scrapy/requests/test_to_apify_request.py: 4 changes (3 additions, 1 deletion)
@@ -4,6 +4,8 @@
from scrapy import Request, Spider
from scrapy.http.headers import Headers

from crawlee._types import HttpHeaders

from apify.scrapy.requests import to_apify_request


@@ -36,7 +38,7 @@ def test__to_apify_request__headers(spider: Spider) -> None:
apify_request = to_apify_request(scrapy_request, spider)

assert apify_request is not None
assert apify_request.headers == dict(scrapy_request_headers.to_unicode_dict())
assert apify_request.headers == HttpHeaders(scrapy_request_headers.to_unicode_dict())


def test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None:
tests/unit/scrapy/requests/test_to_scrapy_request.py: 10 changes (5 additions, 5 deletions)
@@ -4,9 +4,9 @@

import pytest
from scrapy import Request, Spider
from scrapy.http.headers import Headers

from crawlee import Request as CrawleeRequest
from crawlee._types import HttpHeaders

from apify.scrapy.requests import to_scrapy_request

@@ -47,7 +47,7 @@ def test__to_scrapy_request__without_reconstruction_with_optional_fields(spider:
method='GET',
unique_key='https://crawlee.dev',
id='fvwscO2UJLdr10B',
headers={'Authorization': 'Bearer access_token'},
headers=HttpHeaders({'Authorization': 'Bearer access_token'}),
user_data={'some_user_data': 'test'},
)

@@ -58,7 +58,7 @@ def test__to_scrapy_request__without_reconstruction_with_optional_fields(spider:
assert apify_request.method == scrapy_request.method
assert apify_request.id == scrapy_request.meta.get('apify_request_id')
assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key')
assert Headers(apify_request.headers) == scrapy_request.headers
assert apify_request.headers.get('authorization') == scrapy_request.headers.get('authorization').decode()
assert apify_request.user_data == scrapy_request.meta.get('userData')


@@ -91,7 +91,7 @@ def test__to_scrapy_request__with_reconstruction_with_optional_fields(spider: Sp
method='GET',
id='fvwscO2UJLdr10B',
unique_key='https://apify.com',
headers={'Authorization': 'Bearer access_token'},
headers=HttpHeaders({'Authorization': 'Bearer access_token'}),
user_data={
'some_user_data': 'hello',
'scrapy_request': 'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501
Expand All @@ -105,7 +105,7 @@ def test__to_scrapy_request__with_reconstruction_with_optional_fields(spider: Sp
assert apify_request.method == scrapy_request.method
assert apify_request.id == scrapy_request.meta.get('apify_request_id')
assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key')
assert Headers(apify_request.headers) == scrapy_request.headers
assert apify_request.headers.get('authorization') == scrapy_request.headers.get('authorization').decode()
assert apify_request.user_data == scrapy_request.meta.get('userData')


Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.