Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 51be03d

Browse files
committed
feat: Upgrade to Crawlee v0.5
1 parent ccba8d1 commit 51be03d
Copy full SHA for 51be03d

File tree

Expand file tree / Collapse file tree

12 files changed

+363
-145
lines changed
Filter options
Expand file tree / Collapse file tree

12 files changed

+363
-145
lines changed

‎poetry.lock

Copy file name to clipboard | Expand all lines: poetry.lock
+150 -38 | Lines changed: 150 additions & 38 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎pyproject.toml

Copy file name to clipboardExpand all lines: pyproject.toml
+1-1Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ keywords = [
4545
python = "^3.9"
4646
apify-client = ">=1.8.1"
4747
apify-shared = ">=1.2.1"
48-
crawlee = "~0.4.0"
48+
crawlee = "~0.5.0"
4949
cryptography = ">=42.0.0"
5050
httpx = ">=0.27.0"
5151
lazy-object-proxy = ">=1.10.0"

‎src/apify/_actor.py

Copy file name to clipboardExpand all lines: src/apify/_actor.py
+34-25Lines changed: 34 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
from apify_client import ApifyClientAsync
1414
from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
1515
from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
16-
from crawlee import service_container
16+
from crawlee import service_locator
1717
from crawlee.events._types import Event, EventMigratingData, EventPersistStateData
18+
from crawlee.memory_storage_client import MemoryStorageClient
1819

1920
from apify._configuration import Configuration
2021
from apify._consts import EVENT_LISTENERS_TIMEOUT
@@ -71,17 +72,22 @@ def __init__(
7172
self._configure_logging = configure_logging
7273
self._apify_client = self.new_client()
7374

74-
self._event_manager: EventManager
75-
if self._configuration.is_at_home:
76-
self._event_manager = PlatformEventManager(
77-
config=self._configuration,
78-
persist_state_interval=self._configuration.persist_state_interval,
75+
# We need to keep both local & cloud storage clients because of the `force_cloud` option.
76+
self._local_storage_client = MemoryStorageClient.from_config(config=self.config)
77+
self._cloud_storage_client = ApifyStorageClient.from_config(config=self.config)
78+
79+
# Set the event manager based on whether the Actor is running on the platform or locally.
80+
self._event_manager = (
81+
PlatformEventManager(
82+
config=self.config,
83+
persist_state_interval=self.config.persist_state_interval,
7984
)
80-
else:
81-
self._event_manager = LocalEventManager(
82-
system_info_interval=self._configuration.system_info_interval,
83-
persist_state_interval=self._configuration.persist_state_interval,
85+
if self.is_at_home()
86+
else LocalEventManager(
87+
system_info_interval=self.config.system_info_interval,
88+
persist_state_interval=self.config.persist_state_interval,
8489
)
90+
)
8591

8692
self._is_initialized = False
8793

@@ -95,7 +101,7 @@ async def __aenter__(self) -> Self:
95101
executing the block code, the `Actor.fail` method is called.
96102
"""
97103
if self._configure_logging:
98-
_configure_logging(self._configuration)
104+
_configure_logging()
99105

100106
await self.init()
101107
return self
@@ -184,18 +190,17 @@ async def init(self) -> None:
184190
if self._is_initialized:
185191
raise RuntimeError('The Actor was already initialized!')
186192

187-
if self._configuration.token:
188-
service_container.set_cloud_storage_client(ApifyStorageClient(configuration=self._configuration))
193+
self._is_exiting = False
194+
self._was_final_persist_state_emitted = False
189195

190-
if self._configuration.is_at_home:
191-
service_container.set_default_storage_client_type('cloud')
196+
# Register services in the service locator.
197+
if self.is_at_home():
198+
service_locator.set_storage_client(self._cloud_storage_client)
192199
else:
193-
service_container.set_default_storage_client_type('local')
200+
service_locator.set_storage_client(self._local_storage_client)
194201

195-
service_container.set_event_manager(self._event_manager)
196-
197-
self._is_exiting = False
198-
self._was_final_persist_state_emitted = False
202+
service_locator.set_event_manager(self.event_manager)
203+
service_locator.set_configuration(self.configuration)
199204

200205
self.log.info('Initializing Actor...')
201206
self.log.info('System info', extra=get_system_info())
@@ -245,7 +250,6 @@ async def finalize() -> None:
245250
await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout)
246251

247252
await self._event_manager.__aexit__(None, None, None)
248-
cast(dict, service_container._services).clear() # noqa: SLF001
249253

250254
await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds())
251255
self._is_initialized = False
@@ -349,11 +353,13 @@ async def open_dataset(
349353
self._raise_if_not_initialized()
350354
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
351355

356+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
357+
352358
return await Dataset.open(
353359
id=id,
354360
name=name,
355361
configuration=self._configuration,
356-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
362+
storage_client=storage_client,
357363
)
358364

359365
async def open_key_value_store(
@@ -381,12 +387,13 @@ async def open_key_value_store(
381387
"""
382388
self._raise_if_not_initialized()
383389
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
390+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
384391

385392
return await KeyValueStore.open(
386393
id=id,
387394
name=name,
388395
configuration=self._configuration,
389-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
396+
storage_client=storage_client,
390397
)
391398

392399
async def open_request_queue(
@@ -417,11 +424,13 @@ async def open_request_queue(
417424
self._raise_if_not_initialized()
418425
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)
419426

427+
storage_client = self._cloud_storage_client if force_cloud else service_locator.get_storage_client()
428+
420429
return await RequestQueue.open(
421430
id=id,
422431
name=name,
423432
configuration=self._configuration,
424-
storage_client=service_container.get_storage_client(client_type='cloud' if force_cloud else None),
433+
storage_client=storage_client,
425434
)
426435

427436
async def push_data(self, data: dict | list[dict]) -> None:
@@ -963,7 +972,7 @@ async def create_proxy_configuration(
963972
password: str | None = None,
964973
groups: list[str] | None = None,
965974
country_code: str | None = None,
966-
proxy_urls: list[str] | None = None,
975+
proxy_urls: list[str | None] | None = None,
967976
new_url_function: _NewUrlFunction | None = None,
968977
) -> ProxyConfiguration | None:
969978
"""Create a ProxyConfiguration object with the passed proxy configuration.

‎src/apify/_configuration.py

Copy file name to clipboardExpand all lines: src/apify/_configuration.py
+12Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
from datetime import datetime, timedelta
4+
from logging import getLogger
45
from typing import Annotated, Any
56

67
from pydantic import AliasChoices, BeforeValidator, Field
@@ -12,6 +13,8 @@
1213

1314
from apify._utils import docs_group
1415

16+
logger = getLogger(__name__)
17+
1518

1619
def _transform_to_list(value: Any) -> list[str] | None:
1720
if value is None:
@@ -353,6 +356,15 @@ class Configuration(CrawleeConfiguration):
353356
),
354357
] = None
355358

359+
@classmethod
360+
def get_global_configuration(cls) -> Configuration:
361+
"""Retrieve the global instance of the configuration.
362+
363+
Mostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`
364+
instead.
365+
"""
366+
return cls()
367+
356368

357369
# Monkey-patch the base class so that it works with the extended configuration
358370
CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore[method-assign]

‎src/apify/_proxy_configuration.py

Copy file name to clipboardExpand all lines: src/apify/_proxy_configuration.py
+3-3Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,9 @@ def __init__(
111111
password: str | None = None,
112112
groups: list[str] | None = None,
113113
country_code: str | None = None,
114-
proxy_urls: list[str] | None = None,
114+
proxy_urls: list[str | None] | None = None,
115115
new_url_function: _NewUrlFunction | None = None,
116-
tiered_proxy_urls: list[list[str]] | None = None,
116+
tiered_proxy_urls: list[list[str | None]] | None = None,
117117
_actor_config: Configuration | None = None,
118118
_apify_client: ApifyClientAsync | None = None,
119119
) -> None:
@@ -148,7 +148,7 @@ def __init__(
148148
' "groups" or "country_code".'
149149
)
150150

151-
if proxy_urls and any('apify.com' in url for url in proxy_urls):
151+
if proxy_urls and any('apify.com' in (url or '') for url in proxy_urls):
152152
logger.warning(
153153
'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties '
154154
'instead of `proxy_urls`.\n'

‎src/apify/apify_storage_client/_apify_storage_client.py

Copy file name to clipboardExpand all lines: src/apify/apify_storage_client/_apify_storage_client.py
+11-1Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
15
from typing_extensions import override
26

37
from apify_client import ApifyClientAsync
48
from crawlee._utils.crypto import crypto_random_object_id
59
from crawlee.base_storage_client import BaseStorageClient
610

7-
from apify._configuration import Configuration
811
from apify._utils import docs_group
912
from apify.apify_storage_client._dataset_client import DatasetClient
1013
from apify.apify_storage_client._dataset_collection_client import DatasetCollectionClient
@@ -13,6 +16,9 @@
1316
from apify.apify_storage_client._request_queue_client import RequestQueueClient
1417
from apify.apify_storage_client._request_queue_collection_client import RequestQueueCollectionClient
1518

19+
if TYPE_CHECKING:
20+
from apify._configuration import Configuration
21+
1622

1723
@docs_group('Classes')
1824
class ApifyStorageClient(BaseStorageClient):
@@ -29,6 +35,10 @@ def __init__(self, *, configuration: Configuration) -> None:
2935
)
3036
self._configuration = configuration
3137

38+
@classmethod
39+
def from_config(cls, config: Configuration) -> ApifyStorageClient:
40+
return cls(configuration=config)
41+
3242
@override
3343
def dataset(self, id: str) -> DatasetClient:
3444
return DatasetClient(self._apify_client.dataset(id))

‎src/apify/log.py

Copy file name to clipboardExpand all lines: src/apify/log.py
+4-8Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
11
from __future__ import annotations
22

33
import logging
4-
from typing import TYPE_CHECKING
54

65
from apify_shared.utils import ignore_docs
76
from crawlee._log_config import CrawleeLogFormatter, configure_logger, get_configured_log_level
87

9-
if TYPE_CHECKING:
10-
from apify import Configuration
11-
128
# Name of the logger used throughout the library (resolves to 'apify')
139
logger_name = __name__.split('.')[0]
1410

@@ -21,11 +17,11 @@ class ActorLogFormatter(CrawleeLogFormatter): # noqa: D101 (Inherited from pare
2117
pass
2218

2319

24-
def _configure_logging(configuration: Configuration) -> None:
20+
def _configure_logging() -> None:
2521
apify_client_logger = logging.getLogger('apify_client')
26-
configure_logger(apify_client_logger, configuration, remove_old_handlers=True)
22+
configure_logger(apify_client_logger, remove_old_handlers=True)
2723

28-
level = get_configured_log_level(configuration)
24+
level = get_configured_log_level()
2925

3026
# Keep apify_client logger quiet unless debug logging is requested
3127
if level > logging.DEBUG:
@@ -42,4 +38,4 @@ def _configure_logging(configuration: Configuration) -> None:
4238

4339
# Use configured log level for apify logger
4440
apify_logger = logging.getLogger('apify')
45-
configure_logger(apify_logger, configuration, remove_old_handlers=True)
41+
configure_logger(apify_logger, remove_old_handlers=True)

‎tests/integration/conftest.py

Copy file name to clipboardExpand all lines: tests/integration/conftest.py
+61-11Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,15 @@
77
import sys
88
import textwrap
99
from pathlib import Path
10-
from typing import TYPE_CHECKING, Any, Callable, Protocol, cast
10+
from typing import TYPE_CHECKING, Any, Callable, Protocol
1111

1212
import pytest
1313
from filelock import FileLock
1414

1515
from apify_client import ApifyClientAsync
16-
from apify_shared.consts import ActorJobStatus, ActorSourceType
16+
from apify_shared.consts import ActorJobStatus, ActorSourceType, ApifyEnvVars
17+
from crawlee import service_locator
18+
from crawlee.storages import _creation_management
1719

1820
import apify._actor
1921
from ._utils import generate_unique_resource_name
@@ -29,19 +31,67 @@
2931
_SDK_ROOT_PATH = Path(__file__).parent.parent.parent.resolve()
3032

3133

32-
@pytest.fixture(autouse=True)
33-
def _reset_and_patch_default_instances() -> None:
34-
"""Reset the used singletons and patch the default storage client with a temporary directory.
34+
@pytest.fixture
35+
def prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Callable[[], None]:
36+
"""Prepare the testing environment by resetting the global state before each test.
37+
38+
This fixture ensures that the global state of the package is reset to a known baseline before each test runs.
39+
It also configures a temporary storage directory for test isolation.
40+
41+
Args:
42+
monkeypatch: Test utility provided by pytest for patching.
43+
tmp_path: A unique temporary directory path provided by pytest for test isolation.
3544
36-
To isolate the tests, we need to reset the used singletons before each test case. We also patch the default
37-
storage client with a tmp_path.
45+
Returns:
46+
A callable that prepares the test environment.
3847
"""
39-
from crawlee import service_container
4048

41-
cast(dict, service_container._services).clear()
42-
delattr(apify._actor.Actor, '__wrapped__')
49+
def _prepare_test_env() -> None:
50+
delattr(apify._actor.Actor, '__wrapped__')
51+
52+
# Set the environment variable for the local storage directory to the temporary path.
53+
monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path))
54+
55+
# Reset the flags in the service locator to indicate that no services are explicitly set. This ensures
56+
# a clean state, as services might have been set during a previous test and not reset properly.
57+
service_locator._configuration_was_set = False
58+
service_locator._storage_client_was_set = False
59+
service_locator._event_manager_was_set = False
60+
61+
# Reset the services in the service locator.
62+
service_locator._configuration = None
63+
service_locator._event_manager = None
64+
service_locator._storage_client = None
65+
66+
# Clear creation-related caches to ensure no state is carried over between tests.
67+
monkeypatch.setattr(_creation_management, '_cache_dataset_by_id', {})
68+
monkeypatch.setattr(_creation_management, '_cache_dataset_by_name', {})
69+
monkeypatch.setattr(_creation_management, '_cache_kvs_by_id', {})
70+
monkeypatch.setattr(_creation_management, '_cache_kvs_by_name', {})
71+
monkeypatch.setattr(_creation_management, '_cache_rq_by_id', {})
72+
monkeypatch.setattr(_creation_management, '_cache_rq_by_name', {})
73+
74+
# Verify that the test environment was set up correctly.
75+
assert os.environ.get(ApifyEnvVars.LOCAL_STORAGE_DIR) == str(tmp_path)
76+
assert service_locator._configuration_was_set is False
77+
assert service_locator._storage_client_was_set is False
78+
assert service_locator._event_manager_was_set is False
79+
80+
return _prepare_test_env
81+
82+
83+
@pytest.fixture(autouse=True)
84+
def _isolate_test_environment(prepare_test_env: Callable[[], None]) -> None:
85+
"""Isolate the testing environment by resetting global state before and after each test.
86+
87+
This fixture ensures that each test starts with a clean slate and that any modifications during the test
88+
do not affect subsequent tests. It runs automatically for all tests.
89+
90+
Args:
91+
prepare_test_env: Fixture to prepare the environment before each test.
92+
"""
4393

44-
# TODO: StorageClientManager local storage client purge # noqa: TD003
94+
prepare_test_env()
4595

4696

4797
@pytest.fixture

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.