From 62ca235e55002a4f87de05fcceebd32aa0a28a5c Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 29 Jan 2025 09:50:44 +0100 Subject: [PATCH 1/4] fix: Fix RQ usage in Scrapy scheduler --- src/apify/_actor.py | 2 +- src/apify/scrapy/middlewares/apify_proxy.py | 2 +- src/apify/scrapy/scheduler.py | 15 +++++++++++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 6aa45708..71e3b6e2 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -889,7 +889,7 @@ async def reboot( The system stops the current container and starts a new one, with the same run ID and default storages. Args: - event_listeners_timeout: How long should the Actor wait for Actor event listeners to finish before exiting + event_listeners_timeout: How long should the Actor wait for Actor event listeners to finish before exiting. custom_after_sleep: How long to sleep for after the reboot, to wait for the container to be stopped. """ self._raise_if_not_initialized() diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index b1dc2b88..f81be3c4 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -62,7 +62,7 @@ def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> Apify if use_apify_proxy is not True: Actor.log.warning( 'ApifyHttpProxyMiddleware is not going to be used. Actor input field ' - '"proxyConfiguration.useApifyProxy" is probably set to False.' + '"proxyConfiguration.useApifyProxy" is set to False.' ) raise NotConfigured diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index da79ac64..8fa5194d 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -3,6 +3,8 @@ import traceback from typing import TYPE_CHECKING +from crawlee.storage_clients import MemoryStorageClient + from apify._configuration import Configuration from apify.apify_storage_client import ApifyStorageClient @@ -52,8 +54,17 @@ def open(self, spider: Spider) -> None: # this has to be named "open" self.spider = spider async def open_queue() -> RequestQueue: - custom_loop_apify_client = ApifyStorageClient(configuration=Configuration.get_global_configuration()) - return await RequestQueue.open(storage_client=custom_loop_apify_client) + config = Configuration.get_global_configuration() + + # Use the ApifyStorageClient if the Actor is running on the Apify platform, + # otherwise use the MemoryStorageClient. + storage_client = ( + ApifyStorageClient.from_config(config) + if config.is_at_home + else MemoryStorageClient.from_config(config) + ) + + return await RequestQueue.open(storage_client=storage_client) try: self._rq = nested_event_loop.run_until_complete(open_queue()) From a379eb44e49200f1f0b595a3b9d0a90e11387340 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 29 Jan 2025 09:56:50 +0100 Subject: [PATCH 2/4] linter --- src/apify/scrapy/scheduler.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 8fa5194d..7d93388f 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -59,9 +59,7 @@ async def open_queue() -> RequestQueue: # Use the ApifyStorageClient if the Actor is running on the Apify platform, # otherwise use the MemoryStorageClient. storage_client = ( - ApifyStorageClient.from_config(config) - if config.is_at_home - else MemoryStorageClient.from_config(config) + ApifyStorageClient.from_config(config) if config.is_at_home else MemoryStorageClient.from_config(config) ) return await RequestQueue.open(storage_client=storage_client) From e1b60c7e5b028ca0c958b4e51b46f9f0e04a663f Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 29 Jan 2025 10:43:53 +0100 Subject: [PATCH 3/4] improve apify proxy middleware --- src/apify/scrapy/middlewares/apify_proxy.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index f81be3c4..3a5244d6 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -78,6 +78,11 @@ async def process_request(self, request: Request, spider: Spider) -> None: Raises: ValueError: If username and password are not provided in the proxy URL. """ + # Do not use proxy for robots.txt, as it causes 403 Forbidden. + if request.url.endswith('/robots.txt'): + request.meta.pop('proxy', None) + return + Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}') url = await self._get_new_proxy_url() From 660b65858b1147a457ddf7e98e59985342fe594e Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 29 Jan 2025 14:11:16 +0100 Subject: [PATCH 4/4] redo proxy changes --- src/apify/scrapy/middlewares/apify_proxy.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index 3a5244d6..f81be3c4 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -78,11 +78,6 @@ async def process_request(self, request: Request, spider: Spider) -> None: Raises: ValueError: If username and password are not provided in the proxy URL. """ - # Do not use proxy for robots.txt, as it causes 403 Forbidden. - if request.url.endswith('/robots.txt'): - request.meta.pop('proxy', None) - return - Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}, spider={spider}') url = await self._get_new_proxy_url()