fix: use HttpHeaders type in Scrapy integration #289

Merged
merged 5 commits into from Oct 15, 2024
.pre-commit-config.yaml: 12 changes (3 additions, 9 deletions)
@@ -13,20 +13,14 @@ repos:
language: system
pass_filenames: false

- id: unit-tests
name: Run unit tests
entry: make unit-tests
language: system
pass_filenames: false

- id: check-changelog-entry
name: Check changelog entry
entry: make check-changelog-entry
language: system
pass_filenames: false

- id: check-version-conflict
name: Check version conflict
entry: make check-version-conflict
- id: check-version-availability
name: Check version availability
entry: make check-version-availability
language: system
pass_filenames: false
pyproject.toml: 3 changes (2 additions, 1 deletion)
@@ -48,7 +48,7 @@ keywords = [
python = "^3.9"
apify-client = ">=1.8.1"
apify-shared = ">=1.1.2"
crawlee = ">=0.3.5"
crawlee = ">=0.3.8"
cryptography = ">=42.0.0"
httpx = ">=0.27.0"
lazy-object-proxy = ">=1.10.0"
@@ -162,6 +162,7 @@ max-branches = 18

[tool.pytest.ini_options]
addopts = "-ra"
asyncio_default_fixture_loop_scope = "function"
asyncio_mode = "auto"
timeout = 1200

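A side note on the pytest options added above: recent pytest-asyncio releases warn when the default fixture loop scope is left unset, which is why asyncio_default_fixture_loop_scope is pinned alongside the existing asyncio_mode = "auto". As a rough sketch (the test name and body are illustrative, not from this repository), auto mode lets a plain coroutine be collected as a test without an explicit marker:

import asyncio


async def test_sleep_yields_control() -> None:
    # With asyncio_mode = "auto", pytest-asyncio collects this coroutine as a
    # test without an @pytest.mark.asyncio marker; fixtures default to a
    # function-scoped event loop per asyncio_default_fixture_loop_scope.
    assert await asyncio.sleep(0) is None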
src/apify/_configuration.py: 4 changes (3 additions, 1 deletion)
@@ -8,6 +8,7 @@
from typing_extensions import deprecated

from crawlee._utils.models import timedelta_ms
from crawlee._utils.urls import validate_http_url
from crawlee.configuration import Configuration as CrawleeConfiguration


@@ -263,11 +264,12 @@ class Configuration(CrawleeConfiguration):

standby_url: Annotated[
str,
BeforeValidator(validate_http_url),
Field(
alias='actor_standby_url',
description='URL for accessing web servers of Actor runs in Standby mode',
),
]
] = 'http://localhost'

token: Annotated[
str | None,
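For context on the standby_url change above, the field now gets a default of 'http://localhost' and a BeforeValidator that checks the value before assignment. A minimal, self-contained sketch of that pattern follows; check_url is a hypothetical stand-in for crawlee's validate_http_url, whose exact behaviour is not shown in this diff.

from typing import Annotated

from pydantic import BaseModel, BeforeValidator, Field


def check_url(value: str) -> str:
    # Stand-in validator: the real field uses crawlee._utils.urls.validate_http_url.
    if not value.startswith(('http://', 'https://')):
        raise ValueError(f'Invalid HTTP URL: {value!r}')
    return value


class StandbyConfig(BaseModel):
    standby_url: Annotated[
        str,
        BeforeValidator(check_url),
        Field(description='URL for accessing web servers of Actor runs in Standby mode'),
    ] = 'http://localhost'


assert StandbyConfig().standby_url == 'http://localhost'
assert StandbyConfig(standby_url='https://example.com').standby_url == 'https://example.com'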
src/apify/scrapy/requests.py: 13 changes (4 additions, 9 deletions)
@@ -16,6 +16,7 @@
) from exc

from crawlee import Request as CrawleeRequest
from crawlee._types import HttpHeaders
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id

@@ -77,9 +78,9 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
id=request_id,
)

# Convert Scrapy's headers to a dictionary and store them in the apify_request
# Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
if isinstance(scrapy_request.headers, Headers):
apify_request.headers = dict(scrapy_request.headers.to_unicode_dict())
apify_request.headers = HttpHeaders(scrapy_request.headers.to_unicode_dict())
else:
Actor.log.warning(
f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
@@ -164,13 +165,7 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:

# Add optional 'headers' field
if apify_request.headers:
if isinstance(cast(Any, apify_request.headers), dict):
scrapy_request.headers = Headers(apify_request.headers)
else:
Actor.log.warning(
'apify_request[headers] is not an instance of the dict class, '
f'apify_request[headers] = {apify_request.headers}',
)
scrapy_request.headers |= Headers(apify_request.headers)

# Add optional 'userData' field
if apify_request.user_data:
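As a rough illustration of the conversion this module now performs, the snippet below round-trips headers between the two types. It assumes HttpHeaders accepts a plain dict of strings (as in crawlee) and that Scrapy's Headers can be merged in place with |=, which is what the new to_scrapy_request code above relies on.

from crawlee._types import HttpHeaders
from scrapy.http.headers import Headers

scrapy_headers = Headers({'Authorization': 'Bearer access_token'})

# Scrapy -> Apify: Scrapy stores header values as bytes, so decode to
# unicode before building the HttpHeaders instance.
apify_headers = HttpHeaders(scrapy_headers.to_unicode_dict())

# Apify -> Scrapy: merge the converted headers back into a Headers object,
# mirroring `scrapy_request.headers |= Headers(apify_request.headers)`.
restored = Headers()
restored |= Headers(dict(apify_headers))

assert restored.get('Authorization') == b'Bearer access_token'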
tests/unit/actor/test_actor_env_helpers.py: 216 changes (108 additions, 108 deletions)
@@ -23,115 +23,115 @@
import pytest


class TestIsAtHome:
async def test_is_at_home_local(self) -> None:
async with Actor as actor:
is_at_home = actor.is_at_home()
assert is_at_home is False

async def test_is_at_home_on_apify(self, monkeypatch: pytest.MonkeyPatch) -> None:
print('setenv')
monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'true')
async with Actor as actor:
is_at_home = actor.is_at_home()
assert is_at_home is True


class TestGetEnv:
async def test_get_env_use_env_vars(self, monkeypatch: pytest.MonkeyPatch) -> None:
ignored_env_vars = {
ApifyEnvVars.INPUT_KEY,
ApifyEnvVars.MEMORY_MBYTES,
ApifyEnvVars.STARTED_AT,
ApifyEnvVars.TIMEOUT_AT,
ApifyEnvVars.DEFAULT_DATASET_ID,
ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID,
ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID,
ApifyEnvVars.SDK_LATEST_VERSION,
ApifyEnvVars.LOG_FORMAT,
ApifyEnvVars.LOG_LEVEL,
}

legacy_env_vars = {
ApifyEnvVars.ACT_ID: ActorEnvVars.ID,
ApifyEnvVars.ACT_RUN_ID: ActorEnvVars.RUN_ID,
ApifyEnvVars.ACTOR_ID: ActorEnvVars.ID,
ApifyEnvVars.ACTOR_BUILD_ID: ActorEnvVars.BUILD_ID,
ApifyEnvVars.ACTOR_BUILD_NUMBER: ActorEnvVars.BUILD_NUMBER,
ApifyEnvVars.ACTOR_RUN_ID: ActorEnvVars.RUN_ID,
ApifyEnvVars.ACTOR_TASK_ID: ActorEnvVars.TASK_ID,
ApifyEnvVars.CONTAINER_URL: ActorEnvVars.WEB_SERVER_URL,
ApifyEnvVars.CONTAINER_PORT: ActorEnvVars.WEB_SERVER_PORT,
}

# Set up random env vars
expected_get_env: dict[str, Any] = {}
expected_get_env[ApifyEnvVars.LOG_LEVEL.name.lower()] = 'INFO'

for int_env_var in INTEGER_ENV_VARS:
if int_env_var in ignored_env_vars:
continue

int_get_env_var = int_env_var.name.lower()
expected_get_env[int_get_env_var] = random.randint(1, 99999)
monkeypatch.setenv(int_env_var, f'{expected_get_env[int_get_env_var]}')

for float_env_var in FLOAT_ENV_VARS:
if float_env_var in ignored_env_vars:
continue

float_get_env_var = float_env_var.name.lower()
expected_get_env[float_get_env_var] = random.random()
monkeypatch.setenv(float_env_var, f'{expected_get_env[float_get_env_var]}')

for bool_env_var in BOOL_ENV_VARS:
if bool_env_var in ignored_env_vars:
continue

bool_get_env_var = bool_env_var.name.lower()
expected_get_env[bool_get_env_var] = random.choice([True, False])
monkeypatch.setenv(bool_env_var, f'{"true" if expected_get_env[bool_get_env_var] else "false"}')

for datetime_env_var in DATETIME_ENV_VARS:
if datetime_env_var in ignored_env_vars:
continue

datetime_get_env_var = datetime_env_var.name.lower()
expected_get_env[datetime_get_env_var] = datetime.now(TzInfo(0)) # type: ignore
monkeypatch.setenv(
datetime_env_var,
expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
)

for string_env_var in STRING_ENV_VARS:
if string_env_var in ignored_env_vars:
continue

string_get_env_var = string_env_var.name.lower()
expected_get_env[string_get_env_var] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
monkeypatch.setenv(string_env_var, expected_get_env[string_get_env_var])

# We need this override so that the actor doesn't fail when connecting to the platform events websocket
monkeypatch.delenv(ActorEnvVars.EVENTS_WEBSOCKET_URL)
monkeypatch.delenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL)
expected_get_env[ActorEnvVars.EVENTS_WEBSOCKET_URL.name.lower()] = None
expected_get_env[ApifyEnvVars.ACTOR_EVENTS_WS_URL.name.lower()] = None

# Adjust expectations for timedelta fields
for env_name, env_value in expected_get_env.items():
if env_name.endswith('_millis'):
expected_get_env[env_name] = timedelta(milliseconds=env_value)

# Convert dedicated_cpus to float
expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float(
expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()]
async def test_is_at_home_local() -> None:
async with Actor as actor:
is_at_home = actor.is_at_home()
assert is_at_home is False


async def test_is_at_home_on_apify(monkeypatch: pytest.MonkeyPatch) -> None:
print('setenv')
monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'true')
async with Actor as actor:
is_at_home = actor.is_at_home()
assert is_at_home is True


async def test_get_env_use_env_vars(monkeypatch: pytest.MonkeyPatch) -> None:
ignored_env_vars = {
ApifyEnvVars.INPUT_KEY,
ApifyEnvVars.MEMORY_MBYTES,
ApifyEnvVars.STARTED_AT,
ApifyEnvVars.TIMEOUT_AT,
ApifyEnvVars.DEFAULT_DATASET_ID,
ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID,
ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID,
ApifyEnvVars.SDK_LATEST_VERSION,
ApifyEnvVars.LOG_FORMAT,
ApifyEnvVars.LOG_LEVEL,
ActorEnvVars.STANDBY_PORT,
}

legacy_env_vars = {
ApifyEnvVars.ACT_ID: ActorEnvVars.ID,
ApifyEnvVars.ACT_RUN_ID: ActorEnvVars.RUN_ID,
ApifyEnvVars.ACTOR_ID: ActorEnvVars.ID,
ApifyEnvVars.ACTOR_BUILD_ID: ActorEnvVars.BUILD_ID,
ApifyEnvVars.ACTOR_BUILD_NUMBER: ActorEnvVars.BUILD_NUMBER,
ApifyEnvVars.ACTOR_RUN_ID: ActorEnvVars.RUN_ID,
ApifyEnvVars.ACTOR_TASK_ID: ActorEnvVars.TASK_ID,
ApifyEnvVars.CONTAINER_URL: ActorEnvVars.WEB_SERVER_URL,
ApifyEnvVars.CONTAINER_PORT: ActorEnvVars.WEB_SERVER_PORT,
}

# Set up random env vars
expected_get_env: dict[str, Any] = {}
expected_get_env[ApifyEnvVars.LOG_LEVEL.name.lower()] = 'INFO'

for int_env_var in INTEGER_ENV_VARS:
if int_env_var in ignored_env_vars:
continue

int_get_env_var = int_env_var.name.lower()
expected_get_env[int_get_env_var] = random.randint(1, 99999)
monkeypatch.setenv(int_env_var, f'{expected_get_env[int_get_env_var]}')

for float_env_var in FLOAT_ENV_VARS:
if float_env_var in ignored_env_vars:
continue

float_get_env_var = float_env_var.name.lower()
expected_get_env[float_get_env_var] = random.random()
monkeypatch.setenv(float_env_var, f'{expected_get_env[float_get_env_var]}')

for bool_env_var in BOOL_ENV_VARS:
if bool_env_var in ignored_env_vars:
continue

bool_get_env_var = bool_env_var.name.lower()
expected_get_env[bool_get_env_var] = random.choice([True, False])
monkeypatch.setenv(bool_env_var, f'{"true" if expected_get_env[bool_get_env_var] else "false"}')

for datetime_env_var in DATETIME_ENV_VARS:
if datetime_env_var in ignored_env_vars:
continue

datetime_get_env_var = datetime_env_var.name.lower()
expected_get_env[datetime_get_env_var] = datetime.now(TzInfo(0)) # type: ignore
monkeypatch.setenv(
datetime_env_var,
expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
)

# Update expectations for legacy configuration
for old_name, new_name in legacy_env_vars.items():
expected_get_env[old_name.name.lower()] = expected_get_env[new_name.name.lower()]
for string_env_var in STRING_ENV_VARS:
if string_env_var in ignored_env_vars:
continue

await Actor.init()
assert Actor.get_env() == expected_get_env
string_get_env_var = string_env_var.name.lower()
expected_get_env[string_get_env_var] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
monkeypatch.setenv(string_env_var, expected_get_env[string_get_env_var])

await Actor.exit()
# We need this override so that the actor doesn't fail when connecting to the platform events websocket
monkeypatch.delenv(ActorEnvVars.EVENTS_WEBSOCKET_URL)
monkeypatch.delenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL)
expected_get_env[ActorEnvVars.EVENTS_WEBSOCKET_URL.name.lower()] = None
expected_get_env[ApifyEnvVars.ACTOR_EVENTS_WS_URL.name.lower()] = None

# Adjust expectations for timedelta fields
for env_name, env_value in expected_get_env.items():
if env_name.endswith('_millis'):
expected_get_env[env_name] = timedelta(milliseconds=env_value)

# Convert dedicated_cpus to float
expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float(
expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()]
)

# Update expectations for legacy configuration
for old_name, new_name in legacy_env_vars.items():
expected_get_env[old_name.name.lower()] = expected_get_env[new_name.name.lower()]

await Actor.init()
assert Actor.get_env() == expected_get_env

await Actor.exit()
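The legacy_env_vars mapping above is what makes Actor.get_env() report the same value under both the new Actor-prefixed name and its legacy alias. A hedged, minimal version of that expectation (reusing the imports this test module already depends on) could look like this:

import pytest

from apify import Actor
from apify_shared.consts import ActorEnvVars, ApifyEnvVars


async def test_web_server_url_legacy_alias(monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.setenv(ActorEnvVars.WEB_SERVER_URL, 'http://localhost:4321')
    async with Actor:
        env = Actor.get_env()
    # The same URL should appear under both the new and the legacy key.
    assert env[ActorEnvVars.WEB_SERVER_URL.name.lower()] == 'http://localhost:4321'
    assert env[ApifyEnvVars.CONTAINER_URL.name.lower()] == 'http://localhost:4321'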
tests/unit/scrapy/requests/test_to_apify_request.py: 4 changes (3 additions, 1 deletion)
@@ -4,6 +4,8 @@
from scrapy import Request, Spider
from scrapy.http.headers import Headers

from crawlee._types import HttpHeaders

from apify.scrapy.requests import to_apify_request


@@ -36,7 +38,7 @@ def test__to_apify_request__headers(spider: Spider) -> None:
apify_request = to_apify_request(scrapy_request, spider)

assert apify_request is not None
assert apify_request.headers == dict(scrapy_request_headers.to_unicode_dict())
assert apify_request.headers == HttpHeaders(scrapy_request_headers.to_unicode_dict())


def test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None:
tests/unit/scrapy/requests/test_to_scrapy_request.py: 10 changes (5 additions, 5 deletions)
@@ -4,9 +4,9 @@

import pytest
from scrapy import Request, Spider
from scrapy.http.headers import Headers

from crawlee import Request as CrawleeRequest
from crawlee._types import HttpHeaders

from apify.scrapy.requests import to_scrapy_request

@@ -47,7 +47,7 @@ def test__to_scrapy_request__without_reconstruction_with_optional_fields(spider:
method='GET',
unique_key='https://crawlee.dev',
id='fvwscO2UJLdr10B',
headers={'Authorization': 'Bearer access_token'},
headers=HttpHeaders({'Authorization': 'Bearer access_token'}),
user_data={'some_user_data': 'test'},
)

@@ -58,7 +58,7 @@ def test__to_scrapy_request__without_reconstruction_with_optional_fields(spider:
assert apify_request.method == scrapy_request.method
assert apify_request.id == scrapy_request.meta.get('apify_request_id')
assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key')
assert Headers(apify_request.headers) == scrapy_request.headers
assert apify_request.headers.get('authorization') == scrapy_request.headers.get('authorization').decode()
assert apify_request.user_data == scrapy_request.meta.get('userData')


@@ -91,7 +91,7 @@ def test__to_scrapy_request__with_reconstruction_with_optional_fields(spider: Sp
method='GET',
id='fvwscO2UJLdr10B',
unique_key='https://apify.com',
headers={'Authorization': 'Bearer access_token'},
headers=HttpHeaders({'Authorization': 'Bearer access_token'}),
user_data={
'some_user_data': 'hello',
'scrapy_request': 'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501
Expand All @@ -105,7 +105,7 @@ def test__to_scrapy_request__with_reconstruction_with_optional_fields(spider: Sp
assert apify_request.method == scrapy_request.method
assert apify_request.id == scrapy_request.meta.get('apify_request_id')
assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key')
assert Headers(apify_request.headers) == scrapy_request.headers
assert apify_request.headers.get('authorization') == scrapy_request.headers.get('authorization').decode()
assert apify_request.user_data == scrapy_request.meta.get('userData')


Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.