From 715b65e838d5018825cf8dd4b8b366d08802d4f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Mon, 19 Dec 2022 18:32:53 +0100 Subject: [PATCH 01/23] feat: Implement MemoryStorage and local storage clients --- setup.py | 3 + src/apify/_types.py | 6 + src/apify/_utils.py | 66 ++- src/apify/actor.py | 4 +- src/apify/consts.py | 11 + .../memory_storage/file_storage_utils.py | 112 +++++ src/apify/memory_storage/memory_storage.py | 129 ++++++ .../memory_storage/resource_clients/_utils.py | 63 +++ .../resource_clients/dataset.py | 386 +++++++++++++++++ .../resource_clients/dataset_collection.py | 45 ++ .../resource_clients/key_value_store.py | 406 ++++++++++++++++++ .../key_value_store_collection.py | 45 ++ .../resource_clients/request_queue.py | 344 +++++++++++++++ .../request_queue_collection.py | 45 ++ 14 files changed, 1662 insertions(+), 3 deletions(-) create mode 100644 src/apify/_types.py create mode 100644 src/apify/memory_storage/file_storage_utils.py create mode 100644 src/apify/memory_storage/memory_storage.py create mode 100644 src/apify/memory_storage/resource_clients/_utils.py create mode 100644 src/apify/memory_storage/resource_clients/dataset.py create mode 100644 src/apify/memory_storage/resource_clients/dataset_collection.py create mode 100644 src/apify/memory_storage/resource_clients/key_value_store.py create mode 100644 src/apify/memory_storage/resource_clients/key_value_store_collection.py create mode 100644 src/apify/memory_storage/resource_clients/request_queue.py create mode 100644 src/apify/memory_storage/resource_clients/request_queue_collection.py diff --git a/setup.py b/setup.py index e00b3c64..422c3c57 100644 --- a/setup.py +++ b/setup.py @@ -54,6 +54,8 @@ 'psutil ~= 5.9.4', 'pyee ~= 9.0.4', 'websockets ~= 10.4', + 'aiofiles ~= 22.1.0', + 'aioshutil ~= 1.2', ], extras_require={ 'dev': [ @@ -72,6 +74,7 @@ 'sphinx ~= 5.3.0', 'sphinx-autodoc-typehints ~= 1.19.5', 'sphinx-markdown-builder == 0.5.4', # pinned to 0.5.4, because 0.5.5 has a formatting bug + 'types-aiofiles ~= 22.1.0.4', 'types-psutil ~= 5.9.5.5', 'types-setuptools ~= 65.6.0.1', ], diff --git a/src/apify/_types.py b/src/apify/_types.py new file mode 100644 index 00000000..66a6a0b9 --- /dev/null +++ b/src/apify/_types.py @@ -0,0 +1,6 @@ +from typing import Any, Dict, List, Union + +# Type for representing json-serializable values +# It's close enough to the real thing supported by json.parse, and the best we can do until mypy supports recursive types +# It was suggested in a discussion with (and approved by) Guido van Rossum, so I'd consider it correct enough +JSONSerializable = Union[str, int, float, bool, None, Dict[str, Any], List[Any]] diff --git a/src/apify/_utils.py b/src/apify/_utils.py index 2ed45e90..e3bf9a9e 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -1,12 +1,14 @@ import asyncio +import errno import inspect import os import sys import time from datetime import datetime, timezone -from typing import Any, Callable, Generic, Optional, TypeVar, Union, cast +from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union, cast import psutil +from aiofiles.os import remove from apify_client import __version__ as client_version from ._version import __version__ as sdk_version @@ -104,3 +106,65 @@ async def _run_func_at_interval_async(func: Callable, interval_secs: float) -> N await res except asyncio.CancelledError: pass + + +class ListPage: + """A single page of items returned from a list() method.""" + + #: list: List of returned 
objects on this page + items: List + #: int: Count of the returned objects on this page + count: int + #: int: The limit on the number of returned objects offset specified in the API call + offset: int + #: int: The offset of the first object specified in the API call + limit: int + #: int: Total number of objects matching the API call criteria + total: int + #: bool: Whether the listing is descending or not + desc: bool + + def __init__(self, data: Dict) -> None: + """Initialize a ListPage instance from the API response data.""" + self.items = data['items'] if 'items' in data else [] + self.offset = data['offset'] if 'offset' in data else 0 + self.limit = data['limit'] if 'limit' in data else 0 + self.count = data['count'] if 'count' in data else len(self.items) + self.total = data['total'] if 'total' in data else self.offset + self.count + self.desc = data['desc'] if 'desc' in data else False + +# TODO: Compare to https://stackoverflow.com/a/59185523 + + +async def _force_remove(filename: str) -> None: + """JS-like rm(filename, { force: true })""" + try: + await remove(filename) + except OSError as e: + if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory + raise # re-raise exception if a different error occurred + + +def json_serializer(obj: Any) -> str: # TODO: Improve and check this!!! + if isinstance(obj, (datetime)): + return obj.isoformat(timespec='milliseconds') + 'Z' + else: + return str(obj) + + +def _filter_out_none_values_recursively(dictionary: Dict) -> Dict: + """Return copy of the dictionary, recursively omitting all keys for which values are None.""" + return cast(dict, _filter_out_none_values_recursively_internal(dictionary)) + + +# Unfortunately, it's necessary to have an internal function for the correct result typing, without having to create complicated overloads +def _filter_out_none_values_recursively_internal(dictionary: Dict, remove_empty_dicts: Optional[bool] = None) -> Optional[Dict]: + result = {} + for k, v in dictionary.items(): + if isinstance(v, dict): + v = _filter_out_none_values_recursively_internal(v, remove_empty_dicts is True or remove_empty_dicts is None) + if v is not None: + result[k] = v + if not result and remove_empty_dicts: + return None + return result diff --git a/src/apify/actor.py b/src/apify/actor.py index 08fb244c..61cd3fdf 100644 --- a/src/apify/actor.py +++ b/src/apify/actor.py @@ -1,7 +1,7 @@ import asyncio -import datetime import functools import inspect +from datetime import datetime from types import TracebackType from typing import Any, Callable, Coroutine, Dict, List, Optional, Type, TypeVar, cast @@ -186,7 +186,7 @@ def _get_system_info(self) -> Dict: memory_usage_bytes = _get_memory_usage_bytes() # This is in camel case to be compatible with the events from the platform result = { - 'createdAt': datetime.datetime.now().isoformat(timespec='milliseconds') + 'Z', + 'createdAt': datetime.utcnow().isoformat(timespec='milliseconds') + 'Z', 'cpuCurrentUsage': cpu_usage_percent, 'memCurrentBytes': memory_usage_bytes, } diff --git a/src/apify/consts.py b/src/apify/consts.py index a09150e8..51db9c73 100644 --- a/src/apify/consts.py +++ b/src/apify/consts.py @@ -86,3 +86,14 @@ class ApifyEnvVars(str, Enum): ApifyEnvVars.STARTED_AT.value, ApifyEnvVars.TIMEOUT_AT.value, ] + + +class StorageTypes(str, Enum): + DATASET = 'Dataset' + KEY_VALUE_STORE = 'Key-value store' + REQUEST_QUEUE = 'Request queue' + + +DEFAULT_API_PARAM_LIMIT = 1000 + +REQUEST_ID_LENGTH = 15 diff --git a/src/apify/memory_storage/file_storage_utils.py 
b/src/apify/memory_storage/file_storage_utils.py new file mode 100644 index 00000000..c1e87905 --- /dev/null +++ b/src/apify/memory_storage/file_storage_utils.py @@ -0,0 +1,112 @@ +import json +import os +from datetime import datetime +from enum import Enum +from typing import Any, Dict, List, Tuple + +import aiofiles +from aiofiles.os import makedirs, remove + +from .._utils import _force_remove, json_serializer + + +class StorageEntityType(Enum): + DATASET = 1 + KEY_VALUE_STORE = 2 + REQUEST_QUEUE = 3 + + +async def update_metadata(*, data: Dict, entity_directory: str, write_metadata: bool) -> None: + # Skip writing the actual metadata file. This is done after ensuring the directory exists so we have the directory present + if not write_metadata: + return + + # Ensure the directory for the entity exists + await makedirs(entity_directory, exist_ok=True) + + # Write the metadata to the file + file_path = os.path.join(entity_directory, '__metadata__.json') + async with aiofiles.open(file_path, mode='wb') as f: + # TODO: Check how to dump to JSON properly with aiofiles... + await f.write(json.dumps(data, ensure_ascii=False, indent=2, default=json_serializer).encode('utf-8')) + # json.dump(data, f) + + +async def _check_conditions(entity_directory: str, persist_storage: bool) -> None: + # Skip writing files to the disk if the client has the option set to false + if not persist_storage: + return + + # Ensure the directory for the entity exists + await makedirs(entity_directory, exist_ok=True) + + +async def update_dataset_items( + *, + data: List[Tuple[str, Dict]], + entity_directory: str, + persist_storage: bool, +) -> None: + await _check_conditions(entity_directory, persist_storage) + # Save all the new items to the disk + for idx, item in data: + file_path = os.path.join(entity_directory, f'{idx}.json') + async with aiofiles.open(file_path, mode='wb') as f: + await f.write(json.dumps(item, ensure_ascii=False, indent=2, default=json_serializer).encode('utf-8')) + + +async def set_or_delete_key_value_store_record( + *, + entity_directory: str, + persist_storage: bool, + record: Dict, + should_set: bool, + write_metadata: bool, +) -> None: + await _check_conditions(entity_directory, persist_storage) + + # Create files for the record + record_path = os.path.join(entity_directory, f"""{record['key']}.{record['extension']}""") + record_metadata_path = os.path.join(entity_directory, f"""{record['key']}.__metadata__.json""") + + await _force_remove(record_path) + await _force_remove(record_metadata_path) + + if should_set: + if write_metadata: + async with aiofiles.open(record_metadata_path, mode='wb') as f: + await f.write(json.dumps({ + 'key': record['key'], + 'contentType': record.get('content_type') or 'unknown/no content type', + 'extension': record['extension'], + }, ensure_ascii=False, indent=2, default=json_serializer).encode('utf-8')) + + # Convert to bytes if string + if isinstance(record['value'], str): + record['value'] = record['value'].encode('utf-8') + + async with aiofiles.open(record_path, mode='wb') as f: + await f.write(record['value']) + + +async def update_request_queue_item( + *, + request_id: str, + request: Dict, + entity_directory: str, + persist_storage: bool, +) -> None: + await _check_conditions(entity_directory, persist_storage) + + # Write the request to the file + file_path = os.path.join(entity_directory, f'{request_id}.json') + async with aiofiles.open(file_path, mode='wb') as f: + await f.write(json.dumps(request, ensure_ascii=False, indent=2, 
default=json_serializer).encode('utf-8')) + + +async def delete_request(*, request_id: str, entity_directory: str) -> None: + # Ensure the directory for the entity exists + await makedirs(entity_directory, exist_ok=True) + + file_path = os.path.join(entity_directory, f'{request_id}.json') + await _force_remove(file_path) diff --git a/src/apify/memory_storage/memory_storage.py b/src/apify/memory_storage/memory_storage.py new file mode 100644 index 00000000..220411b9 --- /dev/null +++ b/src/apify/memory_storage/memory_storage.py @@ -0,0 +1,129 @@ +import os +from pathlib import Path +from typing import List, Optional + +import aioshutil +from aiofiles import ospath +from aiofiles.os import rename + +from .resource_clients.dataset import DatasetClient +from .resource_clients.dataset_collection import DatasetCollectionClient +from .resource_clients.key_value_store import KeyValueStoreClient +from .resource_clients.key_value_store_collection import KeyValueStoreCollectionClient +from .resource_clients.request_queue import RequestQueueClient +from .resource_clients.request_queue_collection import RequestQueueCollectionClient + + +class MemoryStorage: + datasets_handled: List[DatasetClient] = [] + key_value_stores_handled: List[KeyValueStoreClient] = [] + request_queues_handled: List[RequestQueueClient] = [] + + def __init__(self, *, local_data_directory: str = './storage', write_metadata: Optional[bool] = False, persist_storage: Optional[bool] = True) -> None: + self.local_data_directory = local_data_directory + self.datasets_directory = os.path.join(self.local_data_directory, 'datasets') + self.key_value_stores_directory = os.path.join(self.local_data_directory, 'key_value_stores') + self.request_queues_directory = os.path.join(self.local_data_directory, 'request_queues') + self.write_metadata = write_metadata or '*' in os.getenv('DEBUG', '') + self.persist_storage = persist_storage or not any(s in os.getenv('APIFY_PERSIST_STORAGE', 'true') for s in ['false', '0', '']) + + def datasets(self) -> DatasetCollectionClient: + return DatasetCollectionClient(base_storage_directory=self.datasets_directory, client=self) + + def dataset(self, *, id: str) -> DatasetClient: + return DatasetClient(base_storage_directory=self.datasets_directory, client=self, id=id) + + def key_value_stores(self) -> KeyValueStoreCollectionClient: + return KeyValueStoreCollectionClient(base_storage_directory=self.key_value_stores_directory, client=self) + + def key_value_store(self, *, id: str) -> KeyValueStoreClient: + return KeyValueStoreClient(base_storage_directory=self.key_value_stores_directory, client=self, id=id) + + def request_queues(self) -> RequestQueueCollectionClient: + return RequestQueueCollectionClient(base_storage_directory=self.request_queues_directory, client=self) + + def request_queue(self, *, id: str, client_key: Optional[str] = None, timeout_secs: Optional[int] = None) -> RequestQueueClient: + return RequestQueueClient(base_storage_directory=self.request_queues_directory, client=self, id=id) + + async def purge(self) -> None: + # Key-value stores + key_value_store_folders = os.listdir(self.key_value_stores_directory) + for key_value_store_folder in key_value_store_folders: + if key_value_store_folder.startswith('__APIFY_TEMPORARY') or key_value_store_folder.startswith('__OLD'): + await self._batch_remove_files(os.path.join(self.key_value_stores_directory, key_value_store_folder)) + elif key_value_store_folder == 'default': + await 
self._handle_default_key_value_store(os.path.join(self.key_value_stores_directory, key_value_store_folder)) + + # Datasets + dataset_folders = os.listdir(self.datasets_directory) + for dataset_folder in dataset_folders: + if dataset_folder == 'default' or dataset_folder.startswith('__APIFY_TEMPORARY'): + await self._batch_remove_files(os.path.join(self.datasets_directory, dataset_folder)) + # Request queues + request_queue_folders = os.listdir(self.request_queues_directory) + for request_queue_folder in request_queue_folders: + if request_queue_folder == 'default' or request_queue_folder.startswith('__APIFY_TEMPORARY'): + await self._batch_remove_files(os.path.join(self.request_queues_directory, request_queue_folder)) + + def teardown(self) -> None: + # We don't need to wait for anything here since we don't have worker threads for fs operations + pass + + async def _handle_default_key_value_store(self, folder: str) -> None: + folder_exists = await ospath.exists(folder) + temporary_path = os.path.join(folder, '../__APIFY_MIGRATING_KEY_VALUE_STORE__') + + # For optimization, we want to only attempt to copy a few files from the default key-value store + possible_input_keys = [ + 'INPUT', + 'INPUT.json', + 'INPUT.bin', + 'INPUT.txt', + ] + + if folder_exists: + # Create a temporary folder to save important files in + Path(temporary_path).mkdir(parents=True, exist_ok=True) + + # Go through each file and save the ones that are important + for entity in possible_input_keys: + original_file_path = os.path.join(folder, entity) + temp_file_path = os.path.join(temporary_path, entity) + try: + await rename(original_file_path, temp_file_path) + except: + # Ignore + pass + + # Remove the original folder and all its content + counter = 0 + temp_path_for_old_folder = os.path.join(folder, f'../__OLD_DEFAULT_{counter}__') + done = False + while not done: + try: + await rename(folder, temp_path_for_old_folder) + done = True + except: + counter += 1 + temp_path_for_old_folder = os.path.join(folder, f'../__OLD_DEFAULT_{counter}__') + + # Replace the temporary folder with the original folder + await rename(temporary_path, folder) + + # Remove the old folder + await self._batch_remove_files(temp_path_for_old_folder) + + async def _batch_remove_files(self, folder: str, counter: int = 0) -> None: + folder_exists = await ospath.exists(folder) + + if folder_exists: + temporary_folder = folder if folder.startswith('__APIFY_TEMPORARY_') else os.path.join(folder, f'../__APIFY_TEMPORARY_{counter}__') + + try: + # Rename the old folder to the new one to allow background deletions + await rename(folder, temporary_folder) + except: + # Folder exists already, try again with an incremented counter + return await self._batch_remove_files(folder, counter + 1) + + await aioshutil.rmtree(temporary_folder, ignore_errors=True) diff --git a/src/apify/memory_storage/resource_clients/_utils.py b/src/apify/memory_storage/resource_clients/_utils.py new file mode 100644 index 00000000..50146d37 --- /dev/null +++ b/src/apify/memory_storage/resource_clients/_utils.py @@ -0,0 +1,63 @@ +import base64 +import hashlib +import io +import json +import mimetypes +import re +from typing import Any, NoReturn, Optional + +from ...consts import REQUEST_ID_LENGTH, StorageTypes + +uuid_regex = re.compile('[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', re.I) + + +def _raise_on_non_existing(client_type: StorageTypes, id: str) -> NoReturn: + raise ValueError(f'{client_type} with id: {id} does not exist.') + + +def 
_raise_on_duplicate_entry(client_type: StorageTypes, key_name: str, value: str) -> NoReturn: + raise ValueError(f'{client_type} with {key_name}: {value} already exists.') + + +def _guess_file_extension(content_type: str) -> Optional[str]: + # e.g. mimetypes.guess_extension('application/json ') does not work... + actual_content_type = content_type.split(';')[0].strip() + ext = mimetypes.guess_extension(actual_content_type) + # Remove the leading dot if extension successfully parsed + return ext[1:] if ext is not None else ext + + +def _is_content_type_json(content_type: str) -> bool: + return bool(re.search(r'^application/json', content_type, flags=re.IGNORECASE)) + + +def _is_content_type_xml(content_type: str) -> bool: + return bool(re.search(r'^application/.*xml$', content_type, flags=re.IGNORECASE)) + + +def _is_content_type_text(content_type: str) -> bool: + return bool(re.search(r'^text/', content_type, flags=re.IGNORECASE)) + + +def _is_file_or_bytes(value: Any) -> bool: + # The check for IOBase is not ideal, it would be better to use duck typing, + # but then the check would be super complex, judging from how the 'requests' library does it. + # This way should be good enough for the vast majority of use cases, if it causes issues, we can improve it later. + return isinstance(value, (bytes, bytearray, io.IOBase)) + + +def _maybe_parse_body(body: bytes, content_type: str) -> Any: + try: + if _is_content_type_json(content_type): + return json.loads(body) # Returns any + elif _is_content_type_xml(content_type) or _is_content_type_text(content_type): + return body.decode('utf-8') # TODO: Check if utf-8 can be assumed + except ValueError as err: + print('_maybe_parse_body error', err) + return body + + +def _unique_key_to_request_id(unique_key: str) -> str: + id = re.sub('(\+|\/|=)', '', base64.b64encode(hashlib.sha256(unique_key.encode('utf-8')).digest()).decode('utf-8')) + + return id[:REQUEST_ID_LENGTH] if len(id) > REQUEST_ID_LENGTH else id diff --git a/src/apify/memory_storage/resource_clients/dataset.py b/src/apify/memory_storage/resource_clients/dataset.py new file mode 100644 index 00000000..5ee25b93 --- /dev/null +++ b/src/apify/memory_storage/resource_clients/dataset.py @@ -0,0 +1,386 @@ +import json +import os +import uuid +from contextlib import asynccontextmanager +from datetime import datetime +from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Tuple, Union + +import aioshutil + +from ..._types import JSONSerializable +from ..._utils import ListPage +from ..file_storage_utils import update_dataset_items, update_metadata +from ._utils import StorageTypes, _raise_on_duplicate_entry, _raise_on_non_existing, uuid_regex + +if TYPE_CHECKING: + from ..memory_storage import MemoryStorage + +""" + Number of characters of the dataset item file names. 
+ E.g.: 000000019.json - 9 digits +""" +LOCAL_ENTRY_NAME_DIGITS = 9 + + +class DatasetClient: + created_at = datetime.utcnow() + accessed_at = datetime.utcnow() + modified_at = datetime.utcnow() + item_count = 0 + dataset_entries: Dict[str, Dict] = {} + + def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: + self.id = str(uuid.uuid4()) if id is None else id + self.dataset_directory = os.path.join(base_storage_directory, name or self.id) + self.client = client + self.name = name + + async def get(self) -> Optional[Dict]: + found = find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + + if found: + await found.update_timestamps(False) + return found.to_dataset_info() + + return None + + async def update(self, *, name: Optional[str] = None) -> Dict: + # Check by id + existing_store_by_id = find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.DATASET, self.id) + + # Skip if no changes + if name is None: + return existing_store_by_id.to_dataset_info() + + # Check that name is not in use already + existing_store_by_name = next( + (store for store in self.client.datasets_handled if store.name and store.name.lower() == name.lower()), None) + + if existing_store_by_name is not None: + _raise_on_duplicate_entry(StorageTypes.DATASET, 'name', name) + + existing_store_by_id.name = name + + previous_dir = existing_store_by_id.dataset_directory + + existing_store_by_id.dataset_directory = os.path.join(self.client.datasets_directory, name) + + # Remove new directory if it exists + # TODO: compare to using os.renames, which has problems when target dir exists + # TODO: check if ignore errors needed... 
+ await aioshutil.rmtree(existing_store_by_id.dataset_directory, ignore_errors=True) + # Copy the previous directory to the new one + await aioshutil.copytree(previous_dir, existing_store_by_id.dataset_directory) + # Remove the previous directory + await aioshutil.rmtree(previous_dir) + + # Update timestamps + await existing_store_by_id.update_timestamps(True) + + return existing_store_by_id.to_dataset_info() + + async def delete(self) -> None: + store = next((store for store in self.client.datasets_handled if store.id == self.id), None) + + if store is not None: + self.client.datasets_handled.remove(store) + store.item_count = 0 + store.dataset_entries.clear() + + await aioshutil.rmtree(store.dataset_directory) + + async def list_items( + self, + *, + offset: Optional[int] = None, + limit: Optional[int] = None, + clean: Optional[bool] = None, + desc: Optional[bool] = None, + fields: Optional[List[str]] = None, + omit: Optional[List[str]] = None, + unwind: Optional[str] = None, + skip_empty: Optional[bool] = None, + skip_hidden: Optional[bool] = None, + flatten: Optional[List[str]] = None, + view: Optional[str] = None, + ) -> ListPage: + # Check by id + existing_store_by_id = find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.DATASET, self.id) + + start, end = existing_store_by_id.get_start_and_end_indexes( + max(existing_store_by_id.item_count - (offset or 0) - (limit or 0), 0) if desc else offset or 0, + limit + ) + + items = [] + + for idx in range(start, end): + entry_number = self._generate_local_entry_name(idx) + items.append(existing_store_by_id.dataset_entries[entry_number]) + + await existing_store_by_id.update_timestamps(False) + + if desc: + items.reverse() + + return ListPage({ + 'count': len(items), + 'desc': desc or False, + 'items': items, + 'limit': limit, + 'offset': offset, + 'total': existing_store_by_id.item_count, + }) + + async def iterate_items( + self, + *, + offset: int = 0, + limit: Optional[int] = None, + clean: Optional[bool] = None, + desc: Optional[bool] = None, + fields: Optional[List[str]] = None, + omit: Optional[List[str]] = None, + unwind: Optional[str] = None, + skip_empty: Optional[bool] = None, + skip_hidden: Optional[bool] = None, + ) -> AsyncGenerator: # TODO: Copy-pasted from client + cache_size = 1000 + first_item = offset + + # If there is no limit, set last_item to None until we get the total from the first API response + if limit is None: + last_item = None + else: + last_item = offset + limit + + current_offset = first_item + while last_item is None or current_offset < last_item: + if last_item is None: + current_limit = cache_size + else: + current_limit = min(cache_size, last_item - current_offset) + + current_items_page = await self.list_items( + offset=current_offset, + limit=current_limit, + clean=clean, + desc=desc, + fields=fields, + omit=omit, + unwind=unwind, + skip_empty=skip_empty, + skip_hidden=skip_hidden, + ) + + current_offset += current_items_page.count + if last_item is None or current_items_page.total < last_item: + last_item = current_items_page.total + + for item in current_items_page.items: + yield item + + async def get_items_as_bytes( + self, + *, + item_format: str = 'json', + offset: Optional[int] = None, + limit: Optional[int] = None, + desc: Optional[bool] = None, + clean: Optional[bool] = None, + bom: Optional[bool] = None, + delimiter: Optional[str] = None, + fields: Optional[List[str]] = None, + 
omit: Optional[List[str]] = None, + unwind: Optional[str] = None, + skip_empty: Optional[bool] = None, + skip_header_row: Optional[bool] = None, + skip_hidden: Optional[bool] = None, + xml_root: Optional[str] = None, + xml_row: Optional[str] = None, + flatten: Optional[List[str]] = None, + ) -> bytes: + raise NotImplementedError("This method is not supported in local memory storage") + + @asynccontextmanager + async def stream_items( + self, + *, + item_format: str = 'json', + offset: Optional[int] = None, + limit: Optional[int] = None, + desc: Optional[bool] = None, + clean: Optional[bool] = None, + bom: Optional[bool] = None, + delimiter: Optional[str] = None, + fields: Optional[List[str]] = None, + omit: Optional[List[str]] = None, + unwind: Optional[str] = None, + skip_empty: Optional[bool] = None, + skip_header_row: Optional[bool] = None, + skip_hidden: Optional[bool] = None, + xml_root: Optional[str] = None, + xml_row: Optional[str] = None, + ) -> AsyncIterator: + yield { # TODO: figure out how to do streaming + + } + + async def push_items(self, items: JSONSerializable) -> None: + # Check by id + existing_store_by_id = find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.DATASET, self.id) + + normalized = self._normalize_items(items) + + added_ids: List[str] = [] + for entry in normalized: + existing_store_by_id.item_count += 1 + idx = self._generate_local_entry_name(existing_store_by_id.item_count) + + existing_store_by_id.dataset_entries[idx] = entry + added_ids.append(idx) + + data_entries: List[Tuple[str, Dict]] = [] + for id in added_ids: + data_entries.append((id, existing_store_by_id.dataset_entries[id])) + + await existing_store_by_id.update_timestamps(True) + print(self.dataset_directory) + await update_dataset_items( + data=data_entries, + entity_directory=existing_store_by_id.dataset_directory, + persist_storage=self.client.persist_storage, + ) + + def to_dataset_info(self) -> Dict: + return { + 'id': self.id, + 'name': self.name, + 'itemCount': self.item_count, + 'accessedAt': self.accessed_at, + 'createdAt': self.created_at, + 'modifiedAt': self.modified_at, + } + + async def update_timestamps(self, has_been_modified: bool) -> None: + self.accessed_at = datetime.utcnow() + + if has_been_modified: + self.modified_at = datetime.utcnow() + + dataset_info = self.to_dataset_info() + await update_metadata(data=dataset_info, entity_directory=self.dataset_directory, write_metadata=self.client.write_metadata) + + def get_start_and_end_indexes(self, offset: int, limit: Optional[int] = None) -> Tuple[int, int]: + actual_limit = limit or self.item_count + start = offset + 1 + end = min(offset + actual_limit, self.item_count) + 1 + return (start, end) + + def _generate_local_entry_name(self, idx: int) -> str: + return str(idx).zfill(LOCAL_ENTRY_NAME_DIGITS) + + def _normalize_items(self, items: JSONSerializable) -> List[Dict]: + def normalize_item(item: Any) -> Optional[Dict]: + if type(item) is str: + item = json.loads(item) + + if type(item) is list: + received = ',\n'.join(item) + raise ValueError(f'Each dataset item can only be a single JSON object, not an array. Received: [{received}]') + + if type(item) is not dict and item is not None: + raise ValueError(f'Each dataset item must be a JSON object. 
Received: {item}') + + return item + + if type(items) is str: + items = json.loads(items) + + result = list(map(normalize_item, items)) if type(items) is list else [normalize_item(items)] + # filter(None, ..) returns items that are True + return list(filter(None, result)) + + +def find_or_cache_dataset_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['DatasetClient']: + # First check memory cache + found = next((store for store in client.datasets_handled if store.id == + entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) + + if found is not None: + return found + + datasets_dir = os.path.join(client.datasets_directory, entry_name_or_id) + # Check if directory exists + if not os.access(datasets_dir, os.F_OK): + return None + + id: Union[str, None] = None + name: Union[str, None] = None + item_count = 0 + created_at = datetime.utcnow() + accessed_at = datetime.utcnow() + modified_at = datetime.utcnow() + entries: Dict[str, Dict] = {} + + has_seen_metadata_file = False + + # Access the dataset folder + for entry in os.scandir(datasets_dir): + if entry.is_file(): + if entry.name == '__metadata__.json': + has_seen_metadata_file = True + + # We have found the store metadata file, build out information based on it + with open(os.path.join(datasets_dir, entry.name)) as f: + metadata = json.load(f) + id = metadata['id'] + name = metadata['name'] + item_count = metadata['itemCount'] + created_at = datetime.strptime(metadata['createdAt'], '%Y-%m-%dT%H:%M:%S.%fZ') + accessed_at = datetime.strptime(metadata['accessedAt'], '%Y-%m-%dT%H:%M:%S.%fZ') + modified_at = datetime.strptime(metadata['modifiedAt'], '%Y-%m-%dT%H:%M:%S.%fZ') + + continue + + with open(os.path.join(datasets_dir, entry.name)) as f: + entry_content = json.load(f) + entry_name = entry.name.split('.')[0] + + entries[entry_name] = entry_content + + if not has_seen_metadata_file: + item_count += 1 + + if id is None and name is None: + is_uuid = uuid_regex.match(entry_name_or_id) + + if is_uuid is not None: + id = entry_name_or_id + else: + name = entry_name_or_id + + new_client = DatasetClient(base_storage_directory=client.datasets_directory, client=client, id=id, name=name) + + # Overwrite properties + new_client.accessed_at = accessed_at + new_client.created_at = created_at + new_client.modified_at = modified_at + new_client.item_count = item_count + + for entry_id, content in entries.items(): + # TODO: possibly do a copy/deepcopy of content? 
+ new_client.dataset_entries[entry_id] = content + + client.datasets_handled.append(new_client) + + return new_client diff --git a/src/apify/memory_storage/resource_clients/dataset_collection.py b/src/apify/memory_storage/resource_clients/dataset_collection.py new file mode 100644 index 00000000..77f546b1 --- /dev/null +++ b/src/apify/memory_storage/resource_clients/dataset_collection.py @@ -0,0 +1,45 @@ +from operator import itemgetter +from typing import TYPE_CHECKING, Dict, Optional + +from ..._utils import ListPage +from ..file_storage_utils import update_metadata +from .dataset import DatasetClient, find_or_cache_dataset_by_possible_id + +if TYPE_CHECKING: + from ..memory_storage import MemoryStorage + + +class DatasetCollectionClient: + + def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage') -> None: + self.datasets_directory = base_storage_directory + self.client = client + + def list(self) -> ListPage: + def map_store(store: DatasetClient) -> Dict: + return store.to_dataset_info() + return ListPage({ + 'total': len(self.client.datasets_handled), + 'count': len(self.client.datasets_handled), + 'offset': 0, + 'limit': len(self.client.datasets_handled), + 'desc': False, + 'items': sorted(map(map_store, self.client.datasets_handled), key=itemgetter('createdAt')), + }) + + async def get_or_create(self, *, name: Optional[str] = None, schema: Optional[Dict] = None) -> Dict: + if name: + found = find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=name) + + if found: + return found.to_dataset_info() + + new_store = DatasetClient(name=name, base_storage_directory=self.datasets_directory, client=self.client) + self.client.datasets_handled.append(new_store) + + dataset_info = new_store.to_dataset_info() + + # Write to the disk + await update_metadata(data=dataset_info, entity_directory=new_store.dataset_directory, write_metadata=self.client.write_metadata) + + return dataset_info diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py new file mode 100644 index 00000000..db41dd8d --- /dev/null +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -0,0 +1,406 @@ +import json +import mimetypes +import os +import pathlib +import uuid +import warnings +from contextlib import asynccontextmanager +from datetime import datetime +from operator import itemgetter +from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Optional, Union + +import aioshutil + +from ..._utils import json_serializer +from ...consts import DEFAULT_API_PARAM_LIMIT, StorageTypes +from ..file_storage_utils import set_or_delete_key_value_store_record, update_metadata +from ._utils import _guess_file_extension, _is_file_or_bytes, _maybe_parse_body, _raise_on_duplicate_entry, _raise_on_non_existing, uuid_regex + +if TYPE_CHECKING: + from ..memory_storage import MemoryStorage + +DEFAULT_LOCAL_FILE_EXTENSION = 'bin' + + +class KeyValueStoreClient: + created_at = datetime.utcnow() + accessed_at = datetime.utcnow() + modified_at = datetime.utcnow() + key_value_entries: Dict[str, Dict] = {} + + def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: + self.id = str(uuid.uuid4()) if id is None else id + self.key_value_store_directory = os.path.join(base_storage_directory, name or self.id) + self.client = client + self.name = name + + async def get(self) -> Optional[Dict]: + found = 
find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + + if found: + await found.update_timestamps(False) + return found.to_key_value_store_info() + + return None + + async def update(self, *, name: Optional[str] = None) -> Dict: + # Check by id + existing_store_by_id = find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) + + # Skip if no changes + if name is None: + return existing_store_by_id.to_key_value_store_info() + + # Check that name is not in use already + existing_store_by_name = next( + (store for store in self.client.key_value_stores_handled if store.name and store.name.lower() == name.lower()), None) + + if existing_store_by_name is not None: + _raise_on_duplicate_entry(StorageTypes.KEY_VALUE_STORE, 'name', name) + + existing_store_by_id.name = name + + previous_dir = existing_store_by_id.key_value_store_directory + + existing_store_by_id.key_value_store_directory = os.path.join(self.client.key_value_stores_directory, name) + + # Remove new directory if it exists + # TODO: compare to using os.renames, which has problems when target dir exists + # TODO: check if ignore errors needed... + await aioshutil.rmtree(existing_store_by_id.key_value_store_directory, ignore_errors=True) + # Copy the previous directory to the new one + await aioshutil.copytree(previous_dir, existing_store_by_id.key_value_store_directory) + # Remove the previous directory + await aioshutil.rmtree(previous_dir) + + # Update timestamps + await existing_store_by_id.update_timestamps(True) + + return existing_store_by_id.to_key_value_store_info() + + async def delete(self) -> None: + store = next((store for store in self.client.key_value_stores_handled if store.id == self.id), None) + + if store is not None: + self.client.key_value_stores_handled.remove(store) + store.key_value_entries.clear() + + await aioshutil.rmtree(store.key_value_store_directory) + + async def list_keys(self, *, limit: int = DEFAULT_API_PARAM_LIMIT, exclusive_start_key: Optional[str] = None) -> Dict: + # Check by id + existing_store_by_id = find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) + + items = [] + + for record in existing_store_by_id.key_value_entries.values(): + size = len(record['value']) # TODO: Check if this works for all cases + items.append({ + 'key': record['key'], + 'size': size, + }) + + # Lexically sort to emulate the API + items = sorted(items, key=itemgetter('key')) + + truncated_items = items + if exclusive_start_key is not None: + key_pos = next((idx for idx, i in enumerate(items) if i['key'] == exclusive_start_key), None) + if key_pos is not None: + truncated_items = items[key_pos + 1:] + + limited_items = truncated_items[:limit] + + last_item_in_store = items[-1] + last_selected_item = limited_items[-1] + is_last_selected_item_absolutely_last = last_item_in_store == last_selected_item + next_exclusive_start_key = None if is_last_selected_item_absolutely_last else last_selected_item['key'] + + await existing_store_by_id.update_timestamps(False) + + return { + 'count': len(items), + 'limit': limit, + 'exclusiveStartKey': exclusive_start_key, + 'isTruncated': not is_last_selected_item_absolutely_last, + 'nextExclusiveStartKey': next_exclusive_start_key, + 'items': limited_items, + } + + 
async def get_record(self, key: str) -> Optional[Dict]: + # Check by id + existing_store_by_id = find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) + + entry = existing_store_by_id.key_value_entries.get(key) + + if entry is None: + return None + + record = { + 'key': entry['key'], + 'value': entry['value'], + # To guess the type, we need a real file name, not just the extension. e.g. 'file.json' instead of 'json' + 'contentType': entry.get('content_type') or mimetypes.guess_type(f"file.{entry['extension']}")[0], # TODO: Default value? + } + + record['value'] = _maybe_parse_body(record['value'], record['contentType']) + + await existing_store_by_id.update_timestamps(False) + + return record + + async def get_record_as_bytes(self, key: str) -> Optional[Dict]: + # TODO: make a private method that reuses code instead of copy pasting get_record and removing one line with parsing ;) + # Check by id + existing_store_by_id = find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) + + entry = existing_store_by_id.key_value_entries.get(key) + + if entry is None: + return None + + record = { + 'key': entry['key'], + 'value': entry['value'], + # To guess the type, we need a real file name, not just the extension. e.g. 'file.json' instead of 'json' + 'contentType': entry.get('content_type') or mimetypes.guess_type(f"file.{entry['extension']}")[0], # TODO: Default value? + } + + await existing_store_by_id.update_timestamps(False) + + return record + + @asynccontextmanager + async def stream_record(self, key: str) -> AsyncIterator[Optional[Dict]]: + # TODO: implement - no idea how atm + yield None + + async def set_record(self, key: str, value: Any, content_type: Optional[str] = None) -> None: + # Check by id + existing_store_by_id = find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) + + if content_type is None: + if _is_file_or_bytes(value): + content_type = 'application/octet-stream' + elif isinstance(value, str): + content_type = 'text/plain; charset=utf-8' + else: + content_type = 'application/json; charset=utf-8' + + extension = _guess_file_extension(content_type or '') or DEFAULT_LOCAL_FILE_EXTENSION + + if 'application/json' in content_type and not _is_file_or_bytes(value) and not isinstance(value, str): + value = json.dumps(value, ensure_ascii=False, indent=2, default=json_serializer).encode('utf-8') + + # TODO: Add stream support for this method... 
+ # if (valueIsStream) { + # const chunks = []; + # for await (const chunk of value) { + # chunks.push(chunk); + # } + # value = Buffer.concat(chunks); + # } + + record = { + 'extension': extension, + 'key': key, + 'value': value, + 'content_type': content_type, + } + + existing_store_by_id.key_value_entries[key] = record + + await existing_store_by_id.update_timestamps(True) + await set_or_delete_key_value_store_record( + entity_directory=existing_store_by_id.key_value_store_directory, + persist_storage=self.client.persist_storage, + record=record, + should_set=True, + write_metadata=self.client.write_metadata, + ) + + async def delete_record(self, key: str) -> None: + # Check by id + existing_store_by_id = find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) + + entry = existing_store_by_id.key_value_entries.get(key) + + if entry is not None: + del existing_store_by_id.key_value_entries[key] + await existing_store_by_id.update_timestamps(True) + await set_or_delete_key_value_store_record( + entity_directory=existing_store_by_id.key_value_store_directory, + persist_storage=self.client.persist_storage, + record=entry, + should_set=False, + write_metadata=self.client.write_metadata, + ) + + def to_key_value_store_info(self) -> Dict: + return { + 'id': self.id, + 'name': self.name, + 'accessedAt': self.accessed_at, + 'createdAt': self.created_at, + 'modifiedAt': self.modified_at, + 'userId': '1', + } + + async def update_timestamps(self, has_been_modified: bool) -> None: + self.accessed_at = datetime.utcnow() + + if has_been_modified: + self.modified_at = datetime.utcnow() + + kv_store_info = self.to_key_value_store_info() + await update_metadata(data=kv_store_info, entity_directory=self.key_value_store_directory, write_metadata=self.client.write_metadata) + + +def find_or_cache_key_value_store_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['KeyValueStoreClient']: + # First check memory cache + found = next((store for store in client.key_value_stores_handled if store.id == + entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) + + if found is not None: + return found + + key_value_store_dir = os.path.join(client.key_value_stores_directory, entry_name_or_id) + # Check if directory exists + if not os.access(key_value_store_dir, os.F_OK): + return None + + id: Union[str, None] = None + name: Union[str, None] = None + created_at = datetime.utcnow() + accessed_at = datetime.utcnow() + modified_at = datetime.utcnow() + internal_records: Dict[str, Dict] = {} + + # Access the key value store folder + for entry in os.scandir(key_value_store_dir): + if entry.is_file(): + if entry.name == '__metadata__.json': + # We have found the store metadata file, build out information based on it + with open(os.path.join(key_value_store_dir, entry.name), encoding='utf8') as f: + metadata = json.load(f) + id = metadata['id'] + name = metadata['name'] + created_at = datetime.strptime(metadata['createdAt'], '%Y-%m-%dT%H:%M:%S.%fZ') + accessed_at = datetime.strptime(metadata['accessedAt'], '%Y-%m-%dT%H:%M:%S.%fZ') + modified_at = datetime.strptime(metadata['modifiedAt'], '%Y-%m-%dT%H:%M:%S.%fZ') + + continue + + if '.__metadata__.' 
in entry.name: + # This is an entry's metadata file, we can use it to create/extend the record + with open(os.path.join(key_value_store_dir, entry.name), encoding='utf8') as f: + metadata = json.load(f) + + new_record = { + **internal_records.get(metadata['key'], {}), + **metadata, + } + + internal_records[metadata['key']] = new_record + + continue + + with open(os.path.join(key_value_store_dir, entry.name), 'rb') as f: + file_content = f.read() + file_extension = pathlib.Path(entry.name).suffix + content_type, _ = mimetypes.guess_type(entry.name) + if content_type is None: + content_type = 'text/plain' + extension = _guess_file_extension(content_type) + + # TODO: Check necessity of final_file_content in Python + final_file_content = file_content + + if file_extension == '': + # We need to override and then restore the warnings filter so that the warning gets printed out, + # Otherwise it would be silently swallowed + with warnings.catch_warnings(): + warnings.simplefilter('always') + warnings.warn( + f"""Key-value entry "{entry.name}" for store {entry_name_or_id} does not have a file extension, assuming it as text. + If you want to have correct interpretation of the file, you should add a file extension to the entry.""", + Warning, + stacklevel=2, + ) + # final_file_content = file_content + elif 'application/json' in content_type: + try: + # Try parsing the JSON ahead of time (not ideal but solves invalid files being loaded into stores) + json.loads(file_content) + # final_file_content = file_content + except json.JSONDecodeError: + # We need to override and then restore the warnings filter so that the warning gets printed out, + # Otherwise it would be silently swallowed + with warnings.catch_warnings(): + warnings.simplefilter('always') + warnings.warn( + f'Key-value entry "{entry.name}" for store {entry_name_or_id} has invalid JSON content and will be ignored from the store.', + Warning, + stacklevel=2, + ) + continue + # elif 'text/plain' in content_type: + # final_file_content = file_content + + name_split = entry.name.split('.') + + if file_extension != '': + name_split.pop() + + key = '.'.join(name_split) + + new_record = { + 'key': key, + 'extension': extension, + 'value': final_file_content, + 'content_type': content_type, + **internal_records.get(key, {}), + } + + internal_records[key] = new_record + + if id is None and name is None: + is_uuid = uuid_regex.match(entry_name_or_id) + + if is_uuid is not None: + id = entry_name_or_id + else: + name = entry_name_or_id + + new_client = KeyValueStoreClient(base_storage_directory=client.key_value_stores_directory, client=client, id=id, name=name) + + # Overwrite properties + new_client.accessed_at = accessed_at + new_client.created_at = created_at + new_client.modified_at = modified_at + + for key, record in internal_records.items(): + # TODO: possibly do a copy/deepcopy of record? 
+ new_client.key_value_entries[key] = record + + client.key_value_stores_handled.append(new_client) + + return new_client diff --git a/src/apify/memory_storage/resource_clients/key_value_store_collection.py b/src/apify/memory_storage/resource_clients/key_value_store_collection.py new file mode 100644 index 00000000..eb0f360e --- /dev/null +++ b/src/apify/memory_storage/resource_clients/key_value_store_collection.py @@ -0,0 +1,45 @@ +from operator import itemgetter +from typing import TYPE_CHECKING, Any, Dict, Optional + +from ..._utils import ListPage +from ..file_storage_utils import update_metadata +from .key_value_store import KeyValueStoreClient, find_or_cache_key_value_store_by_possible_id + +if TYPE_CHECKING: + from ..memory_storage import MemoryStorage + + +class KeyValueStoreCollectionClient: + + def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage') -> None: + self.key_value_stores_directory = base_storage_directory + self.client = client + + def list(self) -> ListPage: + def map_store(store: KeyValueStoreClient) -> Dict: + return store.to_key_value_store_info() + return ListPage({ + 'total': len(self.client.key_value_stores_handled), + 'count': len(self.client.key_value_stores_handled), + 'offset': 0, + 'limit': len(self.client.key_value_stores_handled), + 'desc': False, + 'items': sorted(map(map_store, self.client.key_value_stores_handled), key=itemgetter('createdAt')), + }) + + async def get_or_create(self, *, name: Optional[str] = None, schema: Optional[Dict] = None) -> Dict: + if name: + found = find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=name) + + if found: + return found.to_key_value_store_info() + + new_store = KeyValueStoreClient(name=name, base_storage_directory=self.key_value_stores_directory, client=self.client) + self.client.key_value_stores_handled.append(new_store) + + kv_store_info = new_store.to_key_value_store_info() + + # Write to the disk + await update_metadata(data=kv_store_info, entity_directory=new_store.key_value_store_directory, write_metadata=self.client.write_metadata) + + return kv_store_info diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py new file mode 100644 index 00000000..8f8144b8 --- /dev/null +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -0,0 +1,344 @@ +import json +import os +import uuid +from datetime import datetime +from typing import TYPE_CHECKING, Dict, List, Optional, Union + +import aioshutil + +from ..._utils import _filter_out_none_values_recursively, json_serializer +from ..file_storage_utils import update_metadata, update_request_queue_item, delete_request +from ._utils import StorageTypes, _raise_on_duplicate_entry, _raise_on_non_existing, _unique_key_to_request_id, uuid_regex + +if TYPE_CHECKING: + from ..memory_storage import MemoryStorage + + +class RequestQueueClient: + created_at = datetime.utcnow() + accessed_at = datetime.utcnow() + modified_at = datetime.utcnow() + handled_request_count = 0 + pending_request_count = 0 + requests: Dict[str, Dict] = {} + + def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: + self.id = str(uuid.uuid4()) if id is None else id + self.request_queue_directory = os.path.join(base_storage_directory, name or self.id) + self.client = client + self.name = name + + async def get(self) -> Optional[Dict]: + found = 
find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + + if found: + await found.update_timestamps(False) + return found.to_request_queue_info() + + return None + + async def update(self, *, name: Optional[str] = None) -> Dict: + # Check by id + existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) + + # Skip if no changes + if name is None: + return existing_store_by_id.to_request_queue_info() + + # Check that name is not in use already + existing_store_by_name = next( + (store for store in self.client.request_queues_handled if store.name and store.name.lower() == name.lower()), None) + + if existing_store_by_name is not None: + _raise_on_duplicate_entry(StorageTypes.REQUEST_QUEUE, 'name', name) + + existing_store_by_id.name = name + + previous_dir = existing_store_by_id.request_queue_directory + + existing_store_by_id.request_queue_directory = os.path.join(self.client.request_queues_directory, name) + + # Remove new directory if it exists + # TODO: compare to using os.renames, which has problems when target dir exists + # TODO: check if ignore errors needed... + await aioshutil.rmtree(existing_store_by_id.request_queue_directory, ignore_errors=True) + # Copy the previous directory to the new one + await aioshutil.copytree(previous_dir, existing_store_by_id.request_queue_directory) + # Remove the previous directory + await aioshutil.rmtree(previous_dir) + + # Update timestamps + await existing_store_by_id.update_timestamps(True) + + return existing_store_by_id.to_request_queue_info() + + async def delete(self) -> None: + store = next((store for store in self.client.request_queues_handled if store.id == self.id), None) + + if store is not None: + self.client.request_queues_handled.remove(store) + store.pending_request_count = 0 + store.requests.clear() + + await aioshutil.rmtree(store.request_queue_directory) + + async def list_head(self, *, limit: Optional[int] = None) -> Dict: + existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) + + await existing_store_by_id.update_timestamps(False) + + items: List[Dict] = [] + + for request in existing_store_by_id.requests.values(): + if len(items) == limit: + break + + if request['orderNo']: + items.append(request) + + return { + 'limit': limit, + 'hadMultipleClients': False, + 'queueModifiedAt': existing_store_by_id.modified_at, + 'items': list(map(lambda item: self._json_to_request(item['json']), items)) + } + + async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) -> Dict: + existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) + + request_model = self._create_internal_request(request, forefront) + + existing_request_with_id = existing_store_by_id.requests.get(request_model['id']) + + # We already have the request present, so we return information about it + if existing_request_with_id is not None: + await existing_store_by_id.update_timestamps(False) + + return { + 'requestId': existing_request_with_id['id'], + 'wasAlreadyHandled': existing_request_with_id['orderNo'] is None, + 'wasAlreadyPresent': True, + } + + existing_store_by_id.requests[request_model['id']] = 
request_model + existing_store_by_id.pending_request_count += 1 if request_model['orderNo'] is None else 0 + await existing_store_by_id.update_timestamps(True) + await update_request_queue_item( + request=request_model, + request_id=request_model['id'], + entity_directory=existing_store_by_id.request_queue_directory, + persist_storage=self.client.persist_storage, + ) + + return { + 'requestId': request_model['id'], + # We return wasAlreadyHandled: false even though the request may + # have been added as handled, because that's how API behaves. + 'wasAlreadyHandled': False, + 'wasAlreadyPresent': False, + } + + async def get_request(self, request_id: str) -> Optional[Dict]: + existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) + + await existing_store_by_id.update_timestamps(False) + + request = existing_store_by_id.requests.get(request_id) + return self._json_to_request(request['json'] if request is not None else None) + + async def update_request(self, request: Dict, *, forefront: Optional[bool] = None) -> Dict: + existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) + + request_model = self._create_internal_request(request, forefront) + + # First we need to check the existing request to be + # able to return information about its handled state. + + existing_request = existing_store_by_id.requests.get(request_model['id']) + + # Undefined means that the request is not present in the queue. + # We need to insert it, to behave the same as API. + if existing_request is None: + return await self.add_request(request, forefront=forefront) + + # When updating the request, we need to make sure that + # the handled counts are updated correctly in all cases. 
+ existing_store_by_id.requests[request_model['id']] = request_model + + handled_count_adjustment = 0 + is_request_handled_state_changing = type(existing_request['orderNo']) != type(request_model['orderNo']) + request_was_handled_before_update = existing_request['orderNo'] is None + + if is_request_handled_state_changing: + handled_count_adjustment += 1 + if request_was_handled_before_update: + handled_count_adjustment = -handled_count_adjustment + + existing_store_by_id.pending_request_count += handled_count_adjustment + await existing_store_by_id.update_timestamps(True) + await update_request_queue_item( + request=request_model, + request_id=request_model['id'], + entity_directory=existing_store_by_id.request_queue_directory, + persist_storage=self.client.persist_storage, + ) + + return { + 'requestId': request_model['id'], + 'wasAlreadyHandled': request_was_handled_before_update, + 'wasAlreadyPresent': True, + } + + async def delete_request(self, request_id: str) -> None: + existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + + if existing_store_by_id is None: + _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) + + request = existing_store_by_id.requests.get(request_id) + + if request: + del existing_store_by_id.requests[request_id] + existing_store_by_id.pending_request_count -= 0 if request['orderNo'] is None else 1 + await existing_store_by_id.update_timestamps(True) + await delete_request(entity_directory=existing_store_by_id.request_queue_directory, request_id=request_id) + + def to_request_queue_info(self) -> Dict: + return { + 'accessedAt': self.accessed_at, + 'createdAt': self.created_at, + 'hadMultipleClients': False, + 'handledRequestCount': self.handled_request_count, + 'id': self.id, + 'modifiedAt': self.modified_at, + 'name': self.name, + 'pendingRequestCount': self.pending_request_count, + 'stats': {}, + 'totalRequestCount': len(self.requests), + 'userId': '1', + } + + async def update_timestamps(self, has_been_modified: bool) -> None: + self.accessed_at = datetime.utcnow() + + if has_been_modified: + self.modified_at = datetime.utcnow() + + request_queue_info = self.to_request_queue_info() + await update_metadata(data=request_queue_info, entity_directory=self.request_queue_directory, write_metadata=self.client.write_metadata) + + def _json_to_request(self, request_json: Optional[str]) -> Optional[dict]: + if request_json is None: + return None + request = json.loads(request_json) + return _filter_out_none_values_recursively(request) + + def _create_internal_request(self, request: Dict, forefront: Optional[bool]) -> Dict: + order_no = self._calculate_order_no(request, forefront) + id = _unique_key_to_request_id(request['uniqueKey']) + + if request.get('id') is not None and request['id'] != id: + raise ValueError('Request ID does not match its unique_key.') + + json_request = json.dumps({**request, 'id': id}, ensure_ascii=False, indent=2, default=json_serializer) + return { + 'id': id, + 'json': json_request, + 'method': request.get('method'), + 'orderNo': order_no, + 'retryCount': request.get('retryCount', 0), + 'uniqueKey': request['uniqueKey'], + 'url': request['url'], + } + + def _calculate_order_no(self, request: Dict, forefront: Optional[bool]) -> Optional[int]: + if request.get('handledAt') is not None: + return None + + timestamp = int(round(datetime.utcnow().timestamp())) + + return -timestamp if forefront else timestamp + + +def find_or_cache_request_queue_by_possible_id(client: 'MemoryStorage', 
entry_name_or_id: str) -> Optional['RequestQueueClient']: + # First check memory cache + found = next((store for store in client.request_queues_handled if store.id == + entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) + + if found is not None: + return found + + request_queues_dir = os.path.join(client.request_queues_directory, entry_name_or_id) + # Check if directory exists + if not os.access(request_queues_dir, os.F_OK): + return None + + id: Union[str, None] = None + name: Union[str, None] = None + created_at = datetime.utcnow() + accessed_at = datetime.utcnow() + modified_at = datetime.utcnow() + handled_request_count = 0 + pending_request_count = 0 + entries: List[Dict] = [] + + # Access the request queue folder + for entry in os.scandir(request_queues_dir): + if entry.is_file(): + if entry.name == '__metadata__.json': + # We have found the store metadata file, build out information based on it + with open(os.path.join(request_queues_dir, entry.name)) as f: + metadata = json.load(f) + id = metadata['id'] + name = metadata['name'] + created_at = datetime.strptime(metadata['createdAt'], '%Y-%m-%dT%H:%M:%S.%fZ') + accessed_at = datetime.strptime(metadata['accessedAt'], '%Y-%m-%dT%H:%M:%S.%fZ') + modified_at = datetime.strptime(metadata['modifiedAt'], '%Y-%m-%dT%H:%M:%S.%fZ') + handled_request_count = metadata['handledRequestCount'] + pending_request_count = metadata['pendingRequestCount'] + + continue + + with open(os.path.join(request_queues_dir, entry.name)) as f: + request = json.load(f) + entries.append(request) + + if id is None and name is None: + is_uuid = uuid_regex.match(entry_name_or_id) + + if is_uuid is not None: + id = entry_name_or_id + else: + name = entry_name_or_id + + new_client = RequestQueueClient(base_storage_directory=client.request_queues_directory, client=client, id=id, name=name) + + # Overwrite properties + new_client.accessed_at = accessed_at + new_client.created_at = created_at + new_client.modified_at = modified_at + new_client.handled_request_count = handled_request_count + new_client.pending_request_count = pending_request_count + + for request in entries: + # TODO: possibly do a copy/deepcopy of request? 
+ new_client.requests[request['id']] = request + + client.request_queues_handled.append(new_client) + + return new_client diff --git a/src/apify/memory_storage/resource_clients/request_queue_collection.py b/src/apify/memory_storage/resource_clients/request_queue_collection.py new file mode 100644 index 00000000..5ae2bd42 --- /dev/null +++ b/src/apify/memory_storage/resource_clients/request_queue_collection.py @@ -0,0 +1,45 @@ +from operator import itemgetter +from typing import TYPE_CHECKING, Dict, Optional + +from ..._utils import ListPage +from ..file_storage_utils import update_metadata +from .request_queue import RequestQueueClient, find_or_cache_request_queue_by_possible_id + +if TYPE_CHECKING: + from ..memory_storage import MemoryStorage + + +class RequestQueueCollectionClient: + + def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage') -> None: + self.request_queues_directory = base_storage_directory + self.client = client + + def list(self) -> ListPage: + def map_store(store: RequestQueueClient) -> Dict: + return store.to_request_queue_info() + return ListPage({ + 'total': len(self.client.request_queues_handled), + 'count': len(self.client.request_queues_handled), + 'offset': 0, + 'limit': len(self.client.request_queues_handled), + 'desc': False, + 'items': sorted(map(map_store, self.client.request_queues_handled), key=itemgetter('createdAt')), + }) + + async def get_or_create(self, *, name: Optional[str] = None) -> Dict: + if name: + found = find_or_cache_request_queue_by_possible_id(self.client, name) + + if found: + return found.to_request_queue_info() + + new_store = RequestQueueClient(name=name, base_storage_directory=self.request_queues_directory, client=self.client) + self.client.request_queues_handled.append(new_store) + + request_queue_info = new_store.to_request_queue_info() + + # Write to the disk + await update_metadata(data=request_queue_info, entity_directory=new_store.request_queue_directory, write_metadata=self.client.write_metadata) + + return request_queue_info From 9db2a30aaa89687b49505d8b67fe3f533cc76247 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Mon, 19 Dec 2022 18:48:43 +0100 Subject: [PATCH 02/23] fix lint 1 --- src/apify/_utils.py | 4 ++-- src/apify/consts.py | 2 ++ src/apify/memory_storage/file_storage_utils.py | 10 +++++----- .../memory_storage/resource_clients/key_value_store.py | 4 ++-- .../memory_storage/resource_clients/request_queue.py | 6 +++--- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/apify/_utils.py b/src/apify/_utils.py index e3bf9a9e..9d2b37b6 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -137,7 +137,7 @@ def __init__(self, data: Dict) -> None: async def _force_remove(filename: str) -> None: - """JS-like rm(filename, { force: true })""" + """JS-like rm(filename, { force: true }).""" try: await remove(filename) except OSError as e: @@ -145,7 +145,7 @@ async def _force_remove(filename: str) -> None: raise # re-raise exception if a different error occurred -def json_serializer(obj: Any) -> str: # TODO: Improve and check this!!! +def _json_serializer(obj: Any) -> str: # TODO: Improve and check this!!! 
if isinstance(obj, (datetime)): return obj.isoformat(timespec='milliseconds') + 'Z' else: diff --git a/src/apify/consts.py b/src/apify/consts.py index 51db9c73..41b4312f 100644 --- a/src/apify/consts.py +++ b/src/apify/consts.py @@ -89,6 +89,8 @@ class ApifyEnvVars(str, Enum): class StorageTypes(str, Enum): + """Possible Apify storage types.""" + DATASET = 'Dataset' KEY_VALUE_STORE = 'Key-value store' REQUEST_QUEUE = 'Request queue' diff --git a/src/apify/memory_storage/file_storage_utils.py b/src/apify/memory_storage/file_storage_utils.py index c1e87905..141e695e 100644 --- a/src/apify/memory_storage/file_storage_utils.py +++ b/src/apify/memory_storage/file_storage_utils.py @@ -7,7 +7,7 @@ import aiofiles from aiofiles.os import makedirs, remove -from .._utils import _force_remove, json_serializer +from .._utils import _force_remove, _json_serializer class StorageEntityType(Enum): @@ -28,7 +28,7 @@ async def update_metadata(*, data: Dict, entity_directory: str, write_metadata: file_path = os.path.join(entity_directory, '__metadata__.json') async with aiofiles.open(file_path, mode='wb') as f: # TODO: Check how to dump to JSON properly with aiofiles... - await f.write(json.dumps(data, ensure_ascii=False, indent=2, default=json_serializer).encode('utf-8')) + await f.write(json.dumps(data, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8')) # json.dump(data, f) @@ -52,7 +52,7 @@ async def update_dataset_items( for idx, item in data: file_path = os.path.join(entity_directory, f'{idx}.json') async with aiofiles.open(file_path, mode='wb') as f: - await f.write(json.dumps(item, ensure_ascii=False, indent=2, default=json_serializer).encode('utf-8')) + await f.write(json.dumps(item, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8')) async def set_or_delete_key_value_store_record( @@ -79,7 +79,7 @@ async def set_or_delete_key_value_store_record( 'key': record['key'], 'contentType': record.get('content_type') or 'unknown/no content type', 'extension': record['extension'], - }, ensure_ascii=False, indent=2, default=json_serializer).encode('utf-8')) + }, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8')) # Convert to bytes if string if isinstance(record['value'], str): @@ -101,7 +101,7 @@ async def update_request_queue_item( # Write the request to the file file_path = os.path.join(entity_directory, f'{request_id}.json') async with aiofiles.open(file_path, mode='wb') as f: - await f.write(json.dumps(request, ensure_ascii=False, indent=2, default=json_serializer).encode('utf-8')) + await f.write(json.dumps(request, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8')) async def delete_request(*, request_id: str, entity_directory: str) -> None: diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index db41dd8d..a97fae47 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -11,7 +11,7 @@ import aioshutil -from ..._utils import json_serializer +from ..._utils import _json_serializer from ...consts import DEFAULT_API_PARAM_LIMIT, StorageTypes from ..file_storage_utils import set_or_delete_key_value_store_record, update_metadata from ._utils import _guess_file_extension, _is_file_or_bytes, _maybe_parse_body, _raise_on_duplicate_entry, _raise_on_non_existing, uuid_regex @@ -205,7 +205,7 @@ async def set_record(self, key: str, value: Any, content_type: 
Optional[str] = N extension = _guess_file_extension(content_type or '') or DEFAULT_LOCAL_FILE_EXTENSION if 'application/json' in content_type and not _is_file_or_bytes(value) and not isinstance(value, str): - value = json.dumps(value, ensure_ascii=False, indent=2, default=json_serializer).encode('utf-8') + value = json.dumps(value, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8') # TODO: Add stream support for this method... # if (valueIsStream) { diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index 8f8144b8..66f43bf9 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -6,8 +6,8 @@ import aioshutil -from ..._utils import _filter_out_none_values_recursively, json_serializer -from ..file_storage_utils import update_metadata, update_request_queue_item, delete_request +from ..._utils import _filter_out_none_values_recursively, _json_serializer +from ..file_storage_utils import delete_request, update_metadata, update_request_queue_item from ._utils import StorageTypes, _raise_on_duplicate_entry, _raise_on_non_existing, _unique_key_to_request_id, uuid_regex if TYPE_CHECKING: @@ -255,7 +255,7 @@ def _create_internal_request(self, request: Dict, forefront: Optional[bool]) -> if request.get('id') is not None and request['id'] != id: raise ValueError('Request ID does not match its unique_key.') - json_request = json.dumps({**request, 'id': id}, ensure_ascii=False, indent=2, default=json_serializer) + json_request = json.dumps({**request, 'id': id}, ensure_ascii=False, indent=2, default=_json_serializer) return { 'id': id, 'json': json_request, From daa5d103302726e127880f50d7c1c37737fa2955 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Mon, 19 Dec 2022 19:25:25 +0100 Subject: [PATCH 03/23] lint fix 2 --- .flake8 | 1 + .../memory_storage/file_storage_utils.py | 22 +++------ src/apify/memory_storage/memory_storage.py | 21 +++++++-- .../memory_storage/resource_clients/_utils.py | 2 +- .../resource_clients/dataset.py | 43 +++++++++++------ .../resource_clients/dataset_collection.py | 12 +++-- .../resource_clients/key_value_store.py | 45 ++++++++++++------ .../key_value_store_collection.py | 14 ++++-- .../resource_clients/request_queue.py | 47 ++++++++++++------- .../request_queue_collection.py | 12 +++-- 10 files changed, 139 insertions(+), 80 deletions(-) diff --git a/.flake8 b/.flake8 index 0088e87b..32a6a43e 100644 --- a/.flake8 +++ b/.flake8 @@ -9,6 +9,7 @@ max_line_length = 150 # Google docstring convention + D204 & D401 docstring-convention = all ignore = + U100 # TODO: Remove this after we decide how to handle unused args D100 D104 D203 diff --git a/src/apify/memory_storage/file_storage_utils.py b/src/apify/memory_storage/file_storage_utils.py index 141e695e..035ac9d1 100644 --- a/src/apify/memory_storage/file_storage_utils.py +++ b/src/apify/memory_storage/file_storage_utils.py @@ -1,22 +1,14 @@ import json import os -from datetime import datetime -from enum import Enum -from typing import Any, Dict, List, Tuple +from typing import Dict, List, Tuple import aiofiles -from aiofiles.os import makedirs, remove +from aiofiles.os import makedirs from .._utils import _force_remove, _json_serializer -class StorageEntityType(Enum): - DATASET = 1 - KEY_VALUE_STORE = 2 - REQUEST_QUEUE = 3 - - -async def update_metadata(*, data: Dict, entity_directory: str, write_metadata: 
bool) -> None: +async def _update_metadata(*, data: Dict, entity_directory: str, write_metadata: bool) -> None: # Skip writing the actual metadata file. This is done after ensuring the directory exists so we have the directory present if not write_metadata: return @@ -41,7 +33,7 @@ async def _check_conditions(entity_directory: str, persist_storage: bool) -> Non await makedirs(entity_directory, exist_ok=True) -async def update_dataset_items( +async def _update_dataset_items( *, data: List[Tuple[str, Dict]], entity_directory: str, @@ -55,7 +47,7 @@ async def update_dataset_items( await f.write(json.dumps(item, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8')) -async def set_or_delete_key_value_store_record( +async def _set_or_delete_key_value_store_record( *, entity_directory: str, persist_storage: bool, @@ -89,7 +81,7 @@ async def set_or_delete_key_value_store_record( await f.write(record['value']) -async def update_request_queue_item( +async def _update_request_queue_item( *, request_id: str, request: Dict, @@ -104,7 +96,7 @@ async def update_request_queue_item( await f.write(json.dumps(request, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8')) -async def delete_request(*, request_id: str, entity_directory: str) -> None: +async def _delete_request(*, request_id: str, entity_directory: str) -> None: # Ensure the directory for the entity exists await makedirs(entity_directory, exist_ok=True) diff --git a/src/apify/memory_storage/memory_storage.py b/src/apify/memory_storage/memory_storage.py index 220411b9..71919727 100644 --- a/src/apify/memory_storage/memory_storage.py +++ b/src/apify/memory_storage/memory_storage.py @@ -15,11 +15,16 @@ class MemoryStorage: + """Class representing an in-memory storage.""" + datasets_handled: List[DatasetClient] = [] key_value_stores_handled: List[KeyValueStoreClient] = [] request_queues_handled: List[RequestQueueClient] = [] - def __init__(self, *, local_data_directory: str = './storage', write_metadata: Optional[bool] = False, persist_storage: Optional[bool] = True) -> None: + def __init__( + self, *, local_data_directory: str = './storage', write_metadata: Optional[bool] = False, persist_storage: Optional[bool] = True, + ) -> None: + """TODO: docs.""" self.local_data_directory = local_data_directory self.datasets_directory = os.path.join(self.local_data_directory, 'datasets') self.key_value_stores_directory = os.path.join(self.local_data_directory, 'key_value_stores') @@ -28,24 +33,31 @@ def __init__(self, *, local_data_directory: str = './storage', write_metadata: O self.persist_storage = persist_storage or not any(s in os.getenv('APIFY_PERSIST_STORAGE', 'true') for s in ['false', '0', '']) def datasets(self) -> DatasetCollectionClient: + """TODO: docs.""" return DatasetCollectionClient(base_storage_directory=self.datasets_directory, client=self) def dataset(self, *, id: str) -> DatasetClient: + """TODO: docs.""" return DatasetClient(base_storage_directory=self.datasets_directory, client=self, id=id) def key_value_stores(self) -> KeyValueStoreCollectionClient: + """TODO: docs.""" return KeyValueStoreCollectionClient(base_storage_directory=self.key_value_stores_directory, client=self) def key_value_store(self, *, id: str) -> KeyValueStoreClient: + """TODO: docs.""" return KeyValueStoreClient(base_storage_directory=self.key_value_stores_directory, client=self, id=id) def request_queues(self) -> RequestQueueCollectionClient: + """TODO: docs.""" return 
RequestQueueCollectionClient(base_storage_directory=self.request_queues_directory, client=self) def request_queue(self, *, id: str, client_key: Optional[str] = None, timeout_secs: Optional[int] = None) -> RequestQueueClient: + """TODO: docs.""" return RequestQueueClient(base_storage_directory=self.request_queues_directory, client=self, id=id) async def purge(self) -> None: + """TODO: docs.""" # Key-value stores key_value_store_folders = os.listdir(self.key_value_stores_directory) for key_value_store_folder in key_value_store_folders: @@ -66,6 +78,7 @@ async def purge(self) -> None: await self._batch_remove_files(os.path.join(self.request_queues_directory, request_queue_folder)) def teardown(self) -> None: + """TODO: docs.""" # We don't need to wait for anything here since we don't have worker threads for fs operations pass @@ -91,7 +104,7 @@ async def _handle_default_key_value_store(self, folder: str) -> None: temp_file_path = os.path.join(temporary_path, entity) try: await rename(original_file_path, temp_file_path) - except: + except Exception: # Ignore pass @@ -103,7 +116,7 @@ async def _handle_default_key_value_store(self, folder: str) -> None: try: await rename(folder, temp_path_for_old_folder) done = True - except: + except Exception: counter += 1 temp_path_for_old_folder = os.path.join(folder, f'../__OLD_DEFAULT_{counter}__') @@ -122,7 +135,7 @@ async def _batch_remove_files(self, folder: str, counter: int = 0) -> None: try: # Rename the old folder to the new one to allow background deletions await rename(folder, temporary_folder) - except: + except Exception: # Folder exists already, try again with an incremented counter return await self._batch_remove_files(folder, counter + 1) diff --git a/src/apify/memory_storage/resource_clients/_utils.py b/src/apify/memory_storage/resource_clients/_utils.py index 50146d37..ffe358c5 100644 --- a/src/apify/memory_storage/resource_clients/_utils.py +++ b/src/apify/memory_storage/resource_clients/_utils.py @@ -58,6 +58,6 @@ def _maybe_parse_body(body: bytes, content_type: str) -> Any: def _unique_key_to_request_id(unique_key: str) -> str: - id = re.sub('(\+|\/|=)', '', base64.b64encode(hashlib.sha256(unique_key.encode('utf-8')).digest()).decode('utf-8')) + id = re.sub(r'(\+|\/|=)', '', base64.b64encode(hashlib.sha256(unique_key.encode('utf-8')).digest()).decode('utf-8')) return id[:REQUEST_ID_LENGTH] if len(id) > REQUEST_ID_LENGTH else id diff --git a/src/apify/memory_storage/resource_clients/dataset.py b/src/apify/memory_storage/resource_clients/dataset.py index 5ee25b93..2317ecdd 100644 --- a/src/apify/memory_storage/resource_clients/dataset.py +++ b/src/apify/memory_storage/resource_clients/dataset.py @@ -9,7 +9,7 @@ from ..._types import JSONSerializable from ..._utils import ListPage -from ..file_storage_utils import update_dataset_items, update_metadata +from ..file_storage_utils import _update_dataset_items, _update_metadata from ._utils import StorageTypes, _raise_on_duplicate_entry, _raise_on_non_existing, uuid_regex if TYPE_CHECKING: @@ -23,6 +23,8 @@ class DatasetClient: + """TODO: docs.""" + created_at = datetime.utcnow() accessed_at = datetime.utcnow() modified_at = datetime.utcnow() @@ -30,13 +32,15 @@ class DatasetClient: dataset_entries: Dict[str, Dict] = {} def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: + """TODO: docs.""" self.id = str(uuid.uuid4()) if id is None else id self.dataset_directory = os.path.join(base_storage_directory, name or 
self.id) self.client = client self.name = name async def get(self) -> Optional[Dict]: - found = find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + """TODO: docs.""" + found = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if found: await found.update_timestamps(False) @@ -45,8 +49,9 @@ async def get(self) -> Optional[Dict]: return None async def update(self, *, name: Optional[str] = None) -> Dict: + """TODO: docs.""" # Check by id - existing_store_by_id = find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + existing_store_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.DATASET, self.id) @@ -83,6 +88,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: return existing_store_by_id.to_dataset_info() async def delete(self) -> None: + """TODO: docs.""" store = next((store for store in self.client.datasets_handled if store.id == self.id), None) if store is not None: @@ -107,15 +113,16 @@ async def list_items( flatten: Optional[List[str]] = None, view: Optional[str] = None, ) -> ListPage: + """TODO: docs.""" # Check by id - existing_store_by_id = find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + existing_store_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.DATASET, self.id) - start, end = existing_store_by_id.get_start_and_end_indexes( + start, end = existing_store_by_id._get_start_and_end_indexes( max(existing_store_by_id.item_count - (offset or 0) - (limit or 0), 0) if desc else offset or 0, - limit + limit, ) items = [] @@ -151,6 +158,7 @@ async def iterate_items( skip_empty: Optional[bool] = None, skip_hidden: Optional[bool] = None, ) -> AsyncGenerator: # TODO: Copy-pasted from client + """TODO: docs.""" cache_size = 1000 first_item = offset @@ -206,7 +214,8 @@ async def get_items_as_bytes( xml_row: Optional[str] = None, flatten: Optional[List[str]] = None, ) -> bytes: - raise NotImplementedError("This method is not supported in local memory storage") + """TODO: docs.""" + raise NotImplementedError('This method is not supported in local memory storage') @asynccontextmanager async def stream_items( @@ -228,13 +237,15 @@ async def stream_items( xml_root: Optional[str] = None, xml_row: Optional[str] = None, ) -> AsyncIterator: + """TODO: docs.""" yield { # TODO: figure out how to do streaming } async def push_items(self, items: JSONSerializable) -> None: + """TODO: docs.""" # Check by id - existing_store_by_id = find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + existing_store_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.DATASET, self.id) @@ -254,14 +265,15 @@ async def push_items(self, items: JSONSerializable) -> None: data_entries.append((id, existing_store_by_id.dataset_entries[id])) await existing_store_by_id.update_timestamps(True) - print(self.dataset_directory) - await update_dataset_items( + + await _update_dataset_items( data=data_entries, entity_directory=existing_store_by_id.dataset_directory, persist_storage=self.client.persist_storage, ) def 
to_dataset_info(self) -> Dict: + """TODO: docs.""" return { 'id': self.id, 'name': self.name, @@ -272,15 +284,16 @@ def to_dataset_info(self) -> Dict: } async def update_timestamps(self, has_been_modified: bool) -> None: + """TODO: docs.""" self.accessed_at = datetime.utcnow() if has_been_modified: self.modified_at = datetime.utcnow() dataset_info = self.to_dataset_info() - await update_metadata(data=dataset_info, entity_directory=self.dataset_directory, write_metadata=self.client.write_metadata) + await _update_metadata(data=dataset_info, entity_directory=self.dataset_directory, write_metadata=self.client.write_metadata) - def get_start_and_end_indexes(self, offset: int, limit: Optional[int] = None) -> Tuple[int, int]: + def _get_start_and_end_indexes(self, offset: int, limit: Optional[int] = None) -> Tuple[int, int]: actual_limit = limit or self.item_count start = offset + 1 end = min(offset + actual_limit, self.item_count) + 1 @@ -311,10 +324,10 @@ def normalize_item(item: Any) -> Optional[Dict]: return list(filter(None, result)) -def find_or_cache_dataset_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['DatasetClient']: +def _find_or_cache_dataset_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['DatasetClient']: # First check memory cache - found = next((store for store in client.datasets_handled if store.id == - entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) + found = next((store for store in client.datasets_handled + if store.id == entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) if found is not None: return found diff --git a/src/apify/memory_storage/resource_clients/dataset_collection.py b/src/apify/memory_storage/resource_clients/dataset_collection.py index 77f546b1..8aa99b81 100644 --- a/src/apify/memory_storage/resource_clients/dataset_collection.py +++ b/src/apify/memory_storage/resource_clients/dataset_collection.py @@ -2,20 +2,23 @@ from typing import TYPE_CHECKING, Dict, Optional from ..._utils import ListPage -from ..file_storage_utils import update_metadata -from .dataset import DatasetClient, find_or_cache_dataset_by_possible_id +from ..file_storage_utils import _update_metadata +from .dataset import DatasetClient, _find_or_cache_dataset_by_possible_id if TYPE_CHECKING: from ..memory_storage import MemoryStorage class DatasetCollectionClient: + """TODO: docs.""" def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage') -> None: + """TODO: docs.""" self.datasets_directory = base_storage_directory self.client = client def list(self) -> ListPage: + """TODO: docs.""" def map_store(store: DatasetClient) -> Dict: return store.to_dataset_info() return ListPage({ @@ -28,8 +31,9 @@ def map_store(store: DatasetClient) -> Dict: }) async def get_or_create(self, *, name: Optional[str] = None, schema: Optional[Dict] = None) -> Dict: + """TODO: docs.""" if name: - found = find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=name) + found = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=name) if found: return found.to_dataset_info() @@ -40,6 +44,6 @@ async def get_or_create(self, *, name: Optional[str] = None, schema: Optional[Di dataset_info = new_store.to_dataset_info() # Write to the disk - await update_metadata(data=dataset_info, entity_directory=new_store.dataset_directory, write_metadata=self.client.write_metadata) + await _update_metadata(data=dataset_info, 
entity_directory=new_store.dataset_directory, write_metadata=self.client.write_metadata) return dataset_info diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index a97fae47..843a4725 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -13,7 +13,7 @@ from ..._utils import _json_serializer from ...consts import DEFAULT_API_PARAM_LIMIT, StorageTypes -from ..file_storage_utils import set_or_delete_key_value_store_record, update_metadata +from ..file_storage_utils import _set_or_delete_key_value_store_record, _update_metadata from ._utils import _guess_file_extension, _is_file_or_bytes, _maybe_parse_body, _raise_on_duplicate_entry, _raise_on_non_existing, uuid_regex if TYPE_CHECKING: @@ -23,19 +23,23 @@ class KeyValueStoreClient: + """TODO: docs.""" + created_at = datetime.utcnow() accessed_at = datetime.utcnow() modified_at = datetime.utcnow() key_value_entries: Dict[str, Dict] = {} def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: + """TODO: docs.""" self.id = str(uuid.uuid4()) if id is None else id self.key_value_store_directory = os.path.join(base_storage_directory, name or self.id) self.client = client self.name = name async def get(self) -> Optional[Dict]: - found = find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + """TODO: docs.""" + found = _find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if found: await found.update_timestamps(False) @@ -44,8 +48,9 @@ async def get(self) -> Optional[Dict]: return None async def update(self, *, name: Optional[str] = None) -> Dict: + """TODO: docs.""" # Check by id - existing_store_by_id = find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) @@ -82,6 +87,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: return existing_store_by_id.to_key_value_store_info() async def delete(self) -> None: + """TODO: docs.""" store = next((store for store in self.client.key_value_stores_handled if store.id == self.id), None) if store is not None: @@ -91,8 +97,9 @@ async def delete(self) -> None: await aioshutil.rmtree(store.key_value_store_directory) async def list_keys(self, *, limit: int = DEFAULT_API_PARAM_LIMIT, exclusive_start_key: Optional[str] = None) -> Dict: + """TODO: docs.""" # Check by id - existing_store_by_id = find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) @@ -134,8 +141,9 @@ async def list_keys(self, *, limit: int = DEFAULT_API_PARAM_LIMIT, exclusive_sta } async def get_record(self, key: str) -> Optional[Dict]: + """TODO: docs.""" # Check by id - existing_store_by_id = find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) if 
existing_store_by_id is None: _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) @@ -159,9 +167,10 @@ async def get_record(self, key: str) -> Optional[Dict]: return record async def get_record_as_bytes(self, key: str) -> Optional[Dict]: + """TODO: docs.""" # TODO: make a private method that reuses code instead of copy pasting get_record and removing one line with parsing ;) # Check by id - existing_store_by_id = find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) @@ -184,12 +193,14 @@ async def get_record_as_bytes(self, key: str) -> Optional[Dict]: @asynccontextmanager async def stream_record(self, key: str) -> AsyncIterator[Optional[Dict]]: + """TODO: docs.""" # TODO: implement - no idea how atm yield None async def set_record(self, key: str, value: Any, content_type: Optional[str] = None) -> None: + """TODO: docs.""" # Check by id - existing_store_by_id = find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) @@ -226,7 +237,7 @@ async def set_record(self, key: str, value: Any, content_type: Optional[str] = N existing_store_by_id.key_value_entries[key] = record await existing_store_by_id.update_timestamps(True) - await set_or_delete_key_value_store_record( + await _set_or_delete_key_value_store_record( entity_directory=existing_store_by_id.key_value_store_directory, persist_storage=self.client.persist_storage, record=record, @@ -235,8 +246,9 @@ async def set_record(self, key: str, value: Any, content_type: Optional[str] = N ) async def delete_record(self, key: str) -> None: + """TODO: docs.""" # Check by id - existing_store_by_id = find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) @@ -246,7 +258,7 @@ async def delete_record(self, key: str) -> None: if entry is not None: del existing_store_by_id.key_value_entries[key] await existing_store_by_id.update_timestamps(True) - await set_or_delete_key_value_store_record( + await _set_or_delete_key_value_store_record( entity_directory=existing_store_by_id.key_value_store_directory, persist_storage=self.client.persist_storage, record=entry, @@ -255,6 +267,7 @@ async def delete_record(self, key: str) -> None: ) def to_key_value_store_info(self) -> Dict: + """TODO: docs.""" return { 'id': self.id, 'name': self.name, @@ -265,19 +278,20 @@ def to_key_value_store_info(self) -> Dict: } async def update_timestamps(self, has_been_modified: bool) -> None: + """TODO: docs.""" self.accessed_at = datetime.utcnow() if has_been_modified: self.modified_at = datetime.utcnow() kv_store_info = self.to_key_value_store_info() - await update_metadata(data=kv_store_info, entity_directory=self.key_value_store_directory, write_metadata=self.client.write_metadata) + await _update_metadata(data=kv_store_info, entity_directory=self.key_value_store_directory, write_metadata=self.client.write_metadata) -def find_or_cache_key_value_store_by_possible_id(client: 'MemoryStorage', entry_name_or_id: 
str) -> Optional['KeyValueStoreClient']: +def _find_or_cache_key_value_store_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['KeyValueStoreClient']: # First check memory cache - found = next((store for store in client.key_value_stores_handled if store.id == - entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) + found = next((store for store in client.key_value_stores_handled + if store.id == entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) if found is not None: return found @@ -357,7 +371,8 @@ def find_or_cache_key_value_store_by_possible_id(client: 'MemoryStorage', entry_ with warnings.catch_warnings(): warnings.simplefilter('always') warnings.warn( - f'Key-value entry "{entry.name}" for store {entry_name_or_id} has invalid JSON content and will be ignored from the store.', + (f'Key-value entry "{entry.name}" for store {entry_name_or_id} has invalid JSON content' + 'and will be ignored from the store.'), Warning, stacklevel=2, ) diff --git a/src/apify/memory_storage/resource_clients/key_value_store_collection.py b/src/apify/memory_storage/resource_clients/key_value_store_collection.py index eb0f360e..dac02bb5 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store_collection.py +++ b/src/apify/memory_storage/resource_clients/key_value_store_collection.py @@ -1,21 +1,24 @@ from operator import itemgetter -from typing import TYPE_CHECKING, Any, Dict, Optional +from typing import TYPE_CHECKING, Dict, Optional from ..._utils import ListPage -from ..file_storage_utils import update_metadata -from .key_value_store import KeyValueStoreClient, find_or_cache_key_value_store_by_possible_id +from ..file_storage_utils import _update_metadata +from .key_value_store import KeyValueStoreClient, _find_or_cache_key_value_store_by_possible_id if TYPE_CHECKING: from ..memory_storage import MemoryStorage class KeyValueStoreCollectionClient: + """TODO: docs.""" def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage') -> None: + """TODO: docs.""" self.key_value_stores_directory = base_storage_directory self.client = client def list(self) -> ListPage: + """TODO: docs.""" def map_store(store: KeyValueStoreClient) -> Dict: return store.to_key_value_store_info() return ListPage({ @@ -28,8 +31,9 @@ def map_store(store: KeyValueStoreClient) -> Dict: }) async def get_or_create(self, *, name: Optional[str] = None, schema: Optional[Dict] = None) -> Dict: + """TODO: docs.""" if name: - found = find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=name) + found = _find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=name) if found: return found.to_key_value_store_info() @@ -40,6 +44,6 @@ async def get_or_create(self, *, name: Optional[str] = None, schema: Optional[Di kv_store_info = new_store.to_key_value_store_info() # Write to the disk - await update_metadata(data=kv_store_info, entity_directory=new_store.key_value_store_directory, write_metadata=self.client.write_metadata) + await _update_metadata(data=kv_store_info, entity_directory=new_store.key_value_store_directory, write_metadata=self.client.write_metadata) return kv_store_info diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index 66f43bf9..c75f967a 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ 
b/src/apify/memory_storage/resource_clients/request_queue.py @@ -7,7 +7,7 @@ import aioshutil from ..._utils import _filter_out_none_values_recursively, _json_serializer -from ..file_storage_utils import delete_request, update_metadata, update_request_queue_item +from ..file_storage_utils import _delete_request, _update_metadata, _update_request_queue_item from ._utils import StorageTypes, _raise_on_duplicate_entry, _raise_on_non_existing, _unique_key_to_request_id, uuid_regex if TYPE_CHECKING: @@ -15,6 +15,8 @@ class RequestQueueClient: + """TODO: docs.""" + created_at = datetime.utcnow() accessed_at = datetime.utcnow() modified_at = datetime.utcnow() @@ -23,13 +25,15 @@ class RequestQueueClient: requests: Dict[str, Dict] = {} def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: + """TODO: docs.""" self.id = str(uuid.uuid4()) if id is None else id self.request_queue_directory = os.path.join(base_storage_directory, name or self.id) self.client = client self.name = name async def get(self) -> Optional[Dict]: - found = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + """TODO: docs.""" + found = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if found: await found.update_timestamps(False) @@ -38,8 +42,9 @@ async def get(self) -> Optional[Dict]: return None async def update(self, *, name: Optional[str] = None) -> Dict: + """TODO: docs.""" # Check by id - existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) @@ -76,6 +81,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: return existing_store_by_id.to_request_queue_info() async def delete(self) -> None: + """TODO: docs.""" store = next((store for store in self.client.request_queues_handled if store.id == self.id), None) if store is not None: @@ -86,7 +92,8 @@ async def delete(self) -> None: await aioshutil.rmtree(store.request_queue_directory) async def list_head(self, *, limit: Optional[int] = None) -> Dict: - existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + """TODO: docs.""" + existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) @@ -106,11 +113,12 @@ async def list_head(self, *, limit: Optional[int] = None) -> Dict: 'limit': limit, 'hadMultipleClients': False, 'queueModifiedAt': existing_store_by_id.modified_at, - 'items': list(map(lambda item: self._json_to_request(item['json']), items)) + 'items': list(map(lambda item: self._json_to_request(item['json']), items)), } async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) -> Dict: - existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + """TODO: docs.""" + existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) @@ -132,7 +140,7 @@ async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) existing_store_by_id.requests[request_model['id']] = request_model 
existing_store_by_id.pending_request_count += 1 if request_model['orderNo'] is None else 0 await existing_store_by_id.update_timestamps(True) - await update_request_queue_item( + await _update_request_queue_item( request=request_model, request_id=request_model['id'], entity_directory=existing_store_by_id.request_queue_directory, @@ -148,7 +156,8 @@ async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) } async def get_request(self, request_id: str) -> Optional[Dict]: - existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + """TODO: docs.""" + existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) @@ -159,7 +168,8 @@ async def get_request(self, request_id: str) -> Optional[Dict]: return self._json_to_request(request['json'] if request is not None else None) async def update_request(self, request: Dict, *, forefront: Optional[bool] = None) -> Dict: - existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + """TODO: docs.""" + existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) @@ -181,7 +191,7 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non existing_store_by_id.requests[request_model['id']] = request_model handled_count_adjustment = 0 - is_request_handled_state_changing = type(existing_request['orderNo']) != type(request_model['orderNo']) + is_request_handled_state_changing = type(existing_request['orderNo']) != type(request_model['orderNo']) # noqa request_was_handled_before_update = existing_request['orderNo'] is None if is_request_handled_state_changing: @@ -191,7 +201,7 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non existing_store_by_id.pending_request_count += handled_count_adjustment await existing_store_by_id.update_timestamps(True) - await update_request_queue_item( + await _update_request_queue_item( request=request_model, request_id=request_model['id'], entity_directory=existing_store_by_id.request_queue_directory, @@ -205,7 +215,8 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non } async def delete_request(self, request_id: str) -> None: - existing_store_by_id = find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + """TODO: docs.""" + existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) @@ -216,9 +227,10 @@ async def delete_request(self, request_id: str) -> None: del existing_store_by_id.requests[request_id] existing_store_by_id.pending_request_count -= 0 if request['orderNo'] is None else 1 await existing_store_by_id.update_timestamps(True) - await delete_request(entity_directory=existing_store_by_id.request_queue_directory, request_id=request_id) + await _delete_request(entity_directory=existing_store_by_id.request_queue_directory, request_id=request_id) def to_request_queue_info(self) -> Dict: + """TODO: docs.""" return { 'accessedAt': self.accessed_at, 'createdAt': self.created_at, @@ -234,13 +246,14 @@ def to_request_queue_info(self) -> Dict: } async def update_timestamps(self, has_been_modified: bool) -> None: 
+ """TODO: docs.""" self.accessed_at = datetime.utcnow() if has_been_modified: self.modified_at = datetime.utcnow() request_queue_info = self.to_request_queue_info() - await update_metadata(data=request_queue_info, entity_directory=self.request_queue_directory, write_metadata=self.client.write_metadata) + await _update_metadata(data=request_queue_info, entity_directory=self.request_queue_directory, write_metadata=self.client.write_metadata) def _json_to_request(self, request_json: Optional[str]) -> Optional[dict]: if request_json is None: @@ -275,10 +288,10 @@ def _calculate_order_no(self, request: Dict, forefront: Optional[bool]) -> Optio return -timestamp if forefront else timestamp -def find_or_cache_request_queue_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['RequestQueueClient']: +def _find_or_cache_request_queue_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['RequestQueueClient']: # First check memory cache - found = next((store for store in client.request_queues_handled if store.id == - entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) + found = next((store for store in client.request_queues_handled + if store.id == entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) if found is not None: return found diff --git a/src/apify/memory_storage/resource_clients/request_queue_collection.py b/src/apify/memory_storage/resource_clients/request_queue_collection.py index 5ae2bd42..6927f5d2 100644 --- a/src/apify/memory_storage/resource_clients/request_queue_collection.py +++ b/src/apify/memory_storage/resource_clients/request_queue_collection.py @@ -2,20 +2,23 @@ from typing import TYPE_CHECKING, Dict, Optional from ..._utils import ListPage -from ..file_storage_utils import update_metadata -from .request_queue import RequestQueueClient, find_or_cache_request_queue_by_possible_id +from ..file_storage_utils import _update_metadata +from .request_queue import RequestQueueClient, _find_or_cache_request_queue_by_possible_id if TYPE_CHECKING: from ..memory_storage import MemoryStorage class RequestQueueCollectionClient: + """TODO: docs.""" def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage') -> None: + """TODO: docs.""" self.request_queues_directory = base_storage_directory self.client = client def list(self) -> ListPage: + """TODO: docs.""" def map_store(store: RequestQueueClient) -> Dict: return store.to_request_queue_info() return ListPage({ @@ -28,8 +31,9 @@ def map_store(store: RequestQueueClient) -> Dict: }) async def get_or_create(self, *, name: Optional[str] = None) -> Dict: + """TODO: docs.""" if name: - found = find_or_cache_request_queue_by_possible_id(self.client, name) + found = _find_or_cache_request_queue_by_possible_id(self.client, name) if found: return found.to_request_queue_info() @@ -40,6 +44,6 @@ async def get_or_create(self, *, name: Optional[str] = None) -> Dict: request_queue_info = new_store.to_request_queue_info() # Write to the disk - await update_metadata(data=request_queue_info, entity_directory=new_store.request_queue_directory, write_metadata=self.client.write_metadata) + await _update_metadata(data=request_queue_info, entity_directory=new_store.request_queue_directory, write_metadata=self.client.write_metadata) return request_queue_info From cb955ee8671278f74b94689c95186d27bb791877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Wed, 21 Dec 2022 12:28:56 +0100 Subject: 
[PATCH 04/23] json dumps utility --- src/apify/_utils.py | 5 ++++- src/apify/memory_storage/file_storage_utils.py | 15 ++++++--------- .../resource_clients/key_value_store.py | 4 ++-- .../resource_clients/request_queue.py | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/apify/_utils.py b/src/apify/_utils.py index 9d2b37b6..36b0bc94 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -3,7 +3,7 @@ import inspect import os import sys -import time +import time, json from datetime import datetime, timezone from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union, cast @@ -168,3 +168,6 @@ def _filter_out_none_values_recursively_internal(dictionary: Dict, remove_empty_ if not result and remove_empty_dicts: return None return result + +def _json_dumps(obj: Any) -> str: + return json.dumps(obj, ensure_ascii=False, indent=2, default=_json_serializer) diff --git a/src/apify/memory_storage/file_storage_utils.py b/src/apify/memory_storage/file_storage_utils.py index 035ac9d1..40ba50d5 100644 --- a/src/apify/memory_storage/file_storage_utils.py +++ b/src/apify/memory_storage/file_storage_utils.py @@ -1,11 +1,10 @@ -import json import os from typing import Dict, List, Tuple import aiofiles from aiofiles.os import makedirs -from .._utils import _force_remove, _json_serializer +from .._utils import _force_remove, _json_dumps async def _update_metadata(*, data: Dict, entity_directory: str, write_metadata: bool) -> None: @@ -19,9 +18,7 @@ async def _update_metadata(*, data: Dict, entity_directory: str, write_metadata: # Write the metadata to the file file_path = os.path.join(entity_directory, '__metadata__.json') async with aiofiles.open(file_path, mode='wb') as f: - # TODO: Check how to dump to JSON properly with aiofiles... 
- await f.write(json.dumps(data, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8')) - # json.dump(data, f) + await f.write(_json_dumps(data).encode('utf-8')) async def _check_conditions(entity_directory: str, persist_storage: bool) -> None: @@ -44,7 +41,7 @@ async def _update_dataset_items( for idx, item in data: file_path = os.path.join(entity_directory, f'{idx}.json') async with aiofiles.open(file_path, mode='wb') as f: - await f.write(json.dumps(item, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8')) + await f.write(_json_dumps(item).encode('utf-8')) async def _set_or_delete_key_value_store_record( @@ -67,11 +64,11 @@ async def _set_or_delete_key_value_store_record( if should_set: if write_metadata: async with aiofiles.open(record_metadata_path, mode='wb') as f: - await f.write(json.dumps({ + await f.write(_json_dumps({ 'key': record['key'], 'contentType': record.get('content_type') or 'unknown/no content type', 'extension': record['extension'], - }, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8')) + }).encode('utf-8')) # Convert to bytes if string if isinstance(record['value'], str): @@ -93,7 +90,7 @@ async def _update_request_queue_item( # Write the request to the file file_path = os.path.join(entity_directory, f'{request_id}.json') async with aiofiles.open(file_path, mode='wb') as f: - await f.write(json.dumps(request, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8')) + await f.write(_json_dumps(request).encode('utf-8')) async def _delete_request(*, request_id: str, entity_directory: str) -> None: diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index 843a4725..1425de4d 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -11,7 +11,7 @@ import aioshutil -from ..._utils import _json_serializer +from ..._utils import _json_dumps from ...consts import DEFAULT_API_PARAM_LIMIT, StorageTypes from ..file_storage_utils import _set_or_delete_key_value_store_record, _update_metadata from ._utils import _guess_file_extension, _is_file_or_bytes, _maybe_parse_body, _raise_on_duplicate_entry, _raise_on_non_existing, uuid_regex @@ -216,7 +216,7 @@ async def set_record(self, key: str, value: Any, content_type: Optional[str] = N extension = _guess_file_extension(content_type or '') or DEFAULT_LOCAL_FILE_EXTENSION if 'application/json' in content_type and not _is_file_or_bytes(value) and not isinstance(value, str): - value = json.dumps(value, ensure_ascii=False, indent=2, default=_json_serializer).encode('utf-8') + value = _json_dumps(value).encode('utf-8') # TODO: Add stream support for this method... 
# if (valueIsStream) { diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index c75f967a..62064065 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -6,7 +6,7 @@ import aioshutil -from ..._utils import _filter_out_none_values_recursively, _json_serializer +from ..._utils import _filter_out_none_values_recursively, _json_dumps from ..file_storage_utils import _delete_request, _update_metadata, _update_request_queue_item from ._utils import StorageTypes, _raise_on_duplicate_entry, _raise_on_non_existing, _unique_key_to_request_id, uuid_regex @@ -268,7 +268,7 @@ def _create_internal_request(self, request: Dict, forefront: Optional[bool]) -> if request.get('id') is not None and request['id'] != id: raise ValueError('Request ID does not match its unique_key.') - json_request = json.dumps({**request, 'id': id}, ensure_ascii=False, indent=2, default=_json_serializer) + json_request = _json_dumps({**request, 'id': id}) return { 'id': id, 'json': json_request, From d15490c2545bd2a71b075fbe6b8bbfecf4ae23de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Wed, 21 Dec 2022 12:31:33 +0100 Subject: [PATCH 05/23] fix lint --- src/apify/_utils.py | 4 +++- src/apify/memory_storage/resource_clients/request_queue.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/apify/_utils.py b/src/apify/_utils.py index 36b0bc94..1bbcde22 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -1,9 +1,10 @@ import asyncio import errno import inspect +import json import os import sys -import time, json +import time from datetime import datetime, timezone from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union, cast @@ -169,5 +170,6 @@ def _filter_out_none_values_recursively_internal(dictionary: Dict, remove_empty_ return None return result + def _json_dumps(obj: Any) -> str: return json.dumps(obj, ensure_ascii=False, indent=2, default=_json_serializer) diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index 62064065..5bd54f53 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -191,7 +191,7 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non existing_store_by_id.requests[request_model['id']] = request_model handled_count_adjustment = 0 - is_request_handled_state_changing = type(existing_request['orderNo']) != type(request_model['orderNo']) # noqa + is_request_handled_state_changing = type(existing_request['orderNo']) != type(request_model['orderNo']) # noqa request_was_handled_before_update = existing_request['orderNo'] is None if is_request_handled_state_changing: From 2a3cc719d2bc3a3d070d90d12079349a68994890 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Fri, 23 Dec 2022 00:11:01 +0100 Subject: [PATCH 06/23] resolve todos, refactoring --- src/apify/memory_storage/memory_storage.py | 36 +++++----- .../memory_storage/resource_clients/_utils.py | 13 ++++ .../resource_clients/dataset.py | 18 +---- .../resource_clients/key_value_store.py | 70 ++++++------------- .../resource_clients/request_queue.py | 12 +--- 5 files changed, 59 insertions(+), 90 deletions(-) diff --git a/src/apify/memory_storage/memory_storage.py 
b/src/apify/memory_storage/memory_storage.py index 71919727..c495bcda 100644 --- a/src/apify/memory_storage/memory_storage.py +++ b/src/apify/memory_storage/memory_storage.py @@ -4,7 +4,7 @@ import aioshutil from aiofiles import ospath -from aiofiles.os import rename +from aiofiles.os import rename, scandir from .resource_clients.dataset import DatasetClient from .resource_clients.dataset_collection import DatasetCollectionClient @@ -59,23 +59,27 @@ def request_queue(self, *, id: str, client_key: Optional[str] = None, timeout_se async def purge(self) -> None: """TODO: docs.""" # Key-value stores - key_value_store_folders = os.listdir(self.key_value_stores_directory) - for key_value_store_folder in key_value_store_folders: - if key_value_store_folder.startswith('__APIFY_TEMPORARY') or key_value_store_folder.startswith('__OLD'): - await self._batch_remove_files(os.path.join(self.key_value_stores_directory, key_value_store_folder)) - elif key_value_store_folder == 'default': - await self._handle_default_key_value_store(os.path.join(self.key_value_stores_directory, key_value_store_folder)) + if await ospath.exists(self.key_value_stores_directory): + key_value_store_folders = await scandir(self.key_value_stores_directory) + for key_value_store_folder in key_value_store_folders: + if key_value_store_folder.name.startswith('__APIFY_TEMPORARY') or key_value_store_folder.name.startswith('__OLD'): + await self._batch_remove_files(os.path.join(self.key_value_stores_directory, key_value_store_folder.name)) + elif key_value_store_folder.name == 'default': + await self._handle_default_key_value_store(os.path.join(self.key_value_stores_directory, key_value_store_folder.name)) # Datasets - dataset_folders = os.listdir(self.datasets_directory) - for dataset_folder in dataset_folders: - if dataset_folder == 'default' or dataset_folder.startswith('__APIFY_TEMPORARY'): - await self._batch_remove_files(os.path.join(self.datasets_directory, dataset_folder)) + if await ospath.exists(self.datasets_directory): + dataset_folders = await scandir(self.datasets_directory) + for dataset_folder in dataset_folders: + if dataset_folder.name == 'default' or dataset_folder.name.startswith('__APIFY_TEMPORARY'): + print(dataset_folder.name) + await self._batch_remove_files(os.path.join(self.datasets_directory, dataset_folder.name)) # Request queues - request_queue_folders = os.listdir(self.request_queues_directory) - for request_queue_folder in request_queue_folders: - if request_queue_folder == 'default' or request_queue_folder.startswith('__APIFY_TEMPORARY'): - await self._batch_remove_files(os.path.join(self.request_queues_directory, request_queue_folder)) + if await ospath.exists(self.request_queues_directory): + request_queue_folders = await scandir(self.request_queues_directory) + for request_queue_folder in request_queue_folders: + if request_queue_folder.name == 'default' or request_queue_folder.name.startswith('__APIFY_TEMPORARY'): + await self._batch_remove_files(os.path.join(self.request_queues_directory, request_queue_folder.name)) def teardown(self) -> None: """TODO: docs.""" @@ -128,7 +132,7 @@ async def _handle_default_key_value_store(self, folder: str) -> None: async def _batch_remove_files(self, folder: str, counter: int = 0) -> None: folder_exists = await ospath.exists(folder) - + print(f'batch remove {folder}') if folder_exists: temporary_folder = folder if folder.startswith('__APIFY_TEMPORARY_') else os.path.join(folder, f'../__APIFY_TEMPORARY_{counter}__') diff --git 
a/src/apify/memory_storage/resource_clients/_utils.py b/src/apify/memory_storage/resource_clients/_utils.py index ffe358c5..895f33f0 100644 --- a/src/apify/memory_storage/resource_clients/_utils.py +++ b/src/apify/memory_storage/resource_clients/_utils.py @@ -6,6 +6,10 @@ import re from typing import Any, NoReturn, Optional +import aioshutil +from aiofiles import ospath +from aiofiles.os import rename + from ...consts import REQUEST_ID_LENGTH, StorageTypes uuid_regex = re.compile('[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', re.I) @@ -61,3 +65,12 @@ def _unique_key_to_request_id(unique_key: str) -> str: id = re.sub(r'(\+|\/|=)', '', base64.b64encode(hashlib.sha256(unique_key.encode('utf-8')).digest()).decode('utf-8')) return id[:REQUEST_ID_LENGTH] if len(id) > REQUEST_ID_LENGTH else id + + +async def _force_rename(src_dir: str, dst_dir: str) -> None: + # Make sure source directory exists + if await ospath.exists(src_dir): + # Remove destination directory if it exists + if await ospath.exists(dst_dir): + await aioshutil.rmtree(dst_dir, ignore_errors=True) + await rename(src_dir, dst_dir) diff --git a/src/apify/memory_storage/resource_clients/dataset.py b/src/apify/memory_storage/resource_clients/dataset.py index 2317ecdd..58a75b4c 100644 --- a/src/apify/memory_storage/resource_clients/dataset.py +++ b/src/apify/memory_storage/resource_clients/dataset.py @@ -1,7 +1,6 @@ import json import os import uuid -from contextlib import asynccontextmanager from datetime import datetime from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Dict, List, Optional, Tuple, Union @@ -10,7 +9,7 @@ from ..._types import JSONSerializable from ..._utils import ListPage from ..file_storage_utils import _update_dataset_items, _update_metadata -from ._utils import StorageTypes, _raise_on_duplicate_entry, _raise_on_non_existing, uuid_regex +from ._utils import StorageTypes, _force_rename, _raise_on_duplicate_entry, _raise_on_non_existing, uuid_regex if TYPE_CHECKING: from ..memory_storage import MemoryStorage @@ -73,14 +72,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: existing_store_by_id.dataset_directory = os.path.join(self.client.datasets_directory, name) - # Remove new directory if it exists - # TODO: compare to using os.renames, which has problems when target dir exists - # TODO: check if ignore errors needed... 
- await aioshutil.rmtree(existing_store_by_id.dataset_directory, ignore_errors=True) - # Copy the previous directory to the new one - await aioshutil.copytree(previous_dir, existing_store_by_id.dataset_directory) - # Remove the previous directory - await aioshutil.rmtree(previous_dir) + await _force_rename(previous_dir, existing_store_by_id.dataset_directory) # Update timestamps await existing_store_by_id.update_timestamps(True) @@ -217,7 +209,6 @@ async def get_items_as_bytes( """TODO: docs.""" raise NotImplementedError('This method is not supported in local memory storage') - @asynccontextmanager async def stream_items( self, *, @@ -238,9 +229,7 @@ async def stream_items( xml_row: Optional[str] = None, ) -> AsyncIterator: """TODO: docs.""" - yield { # TODO: figure out how to do streaming - - } + raise NotImplementedError('This method is not supported in local memory storage') async def push_items(self, items: JSONSerializable) -> None: """TODO: docs.""" @@ -391,7 +380,6 @@ def _find_or_cache_dataset_by_possible_id(client: 'MemoryStorage', entry_name_or new_client.item_count = item_count for entry_id, content in entries.items(): - # TODO: possibly do a copy/deepcopy of content? new_client.dataset_entries[entry_id] = content client.datasets_handled.append(new_client) diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index 1425de4d..0abccfa4 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -4,7 +4,6 @@ import pathlib import uuid import warnings -from contextlib import asynccontextmanager from datetime import datetime from operator import itemgetter from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Optional, Union @@ -14,7 +13,15 @@ from ..._utils import _json_dumps from ...consts import DEFAULT_API_PARAM_LIMIT, StorageTypes from ..file_storage_utils import _set_or_delete_key_value_store_record, _update_metadata -from ._utils import _guess_file_extension, _is_file_or_bytes, _maybe_parse_body, _raise_on_duplicate_entry, _raise_on_non_existing, uuid_regex +from ._utils import ( + _force_rename, + _guess_file_extension, + _is_file_or_bytes, + _maybe_parse_body, + _raise_on_duplicate_entry, + _raise_on_non_existing, + uuid_regex, +) if TYPE_CHECKING: from ..memory_storage import MemoryStorage @@ -72,14 +79,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: existing_store_by_id.key_value_store_directory = os.path.join(self.client.key_value_stores_directory, name) - # Remove new directory if it exists - # TODO: compare to using os.renames, which has problems when target dir exists - # TODO: check if ignore errors needed... 
- await aioshutil.rmtree(existing_store_by_id.key_value_store_directory, ignore_errors=True) - # Copy the previous directory to the new one - await aioshutil.copytree(previous_dir, existing_store_by_id.key_value_store_directory) - # Remove the previous directory - await aioshutil.rmtree(previous_dir) + await _force_rename(previous_dir, existing_store_by_id.key_value_store_directory) # Update timestamps await existing_store_by_id.update_timestamps(True) @@ -107,7 +107,7 @@ async def list_keys(self, *, limit: int = DEFAULT_API_PARAM_LIMIT, exclusive_sta items = [] for record in existing_store_by_id.key_value_entries.values(): - size = len(record['value']) # TODO: Check if this works for all cases + size = len(record['value']) items.append({ 'key': record['key'], 'size': size, @@ -140,8 +140,7 @@ async def list_keys(self, *, limit: int = DEFAULT_API_PARAM_LIMIT, exclusive_sta 'items': limited_items, } - async def get_record(self, key: str) -> Optional[Dict]: - """TODO: docs.""" + async def _get_record_internal(self, key: str, as_bytes: bool = False) -> Optional[Dict]: # Check by id existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) @@ -160,42 +159,24 @@ async def get_record(self, key: str) -> Optional[Dict]: 'contentType': entry.get('content_type') or mimetypes.guess_type(f"file.{entry['extension']}")[0], # TODO: Default value? } - record['value'] = _maybe_parse_body(record['value'], record['contentType']) + if not as_bytes: + record['value'] = _maybe_parse_body(record['value'], record['contentType']) await existing_store_by_id.update_timestamps(False) return record - async def get_record_as_bytes(self, key: str) -> Optional[Dict]: + async def get_record(self, key: str) -> Optional[Dict]: """TODO: docs.""" - # TODO: make a private method that reuses code instead of copy pasting get_record and removing one line with parsing ;) - # Check by id - existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) - - if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) - - entry = existing_store_by_id.key_value_entries.get(key) - - if entry is None: - return None + return await self._get_record_internal(key) - record = { - 'key': entry['key'], - 'value': entry['value'], - # To guess the type, we need a real file name, not just the extension. e.g. 'file.json' instead of 'json' - 'contentType': entry.get('content_type') or mimetypes.guess_type(f"file.{entry['extension']}")[0], # TODO: Default value? - } - - await existing_store_by_id.update_timestamps(False) - - return record + async def get_record_as_bytes(self, key: str) -> Optional[Dict]: + """TODO: docs.""" + return await self._get_record_internal(key, as_bytes=True) - @asynccontextmanager async def stream_record(self, key: str) -> AsyncIterator[Optional[Dict]]: """TODO: docs.""" - # TODO: implement - no idea how atm - yield None + raise NotImplementedError('This method is not supported in local memory storage') async def set_record(self, key: str, value: Any, content_type: Optional[str] = None) -> None: """TODO: docs.""" @@ -206,6 +187,7 @@ async def set_record(self, key: str, value: Any, content_type: Optional[str] = N _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) if content_type is None: + # TODO: Add streaming support for this method... 
if _is_file_or_bytes(value): content_type = 'application/octet-stream' elif isinstance(value, str): @@ -218,15 +200,6 @@ async def set_record(self, key: str, value: Any, content_type: Optional[str] = N if 'application/json' in content_type and not _is_file_or_bytes(value) and not isinstance(value, str): value = _json_dumps(value).encode('utf-8') - # TODO: Add stream support for this method... - # if (valueIsStream) { - # const chunks = []; - # for await (const chunk of value) { - # chunks.push(chunk); - # } - # value = Buffer.concat(chunks); - # } - record = { 'extension': extension, 'key': key, @@ -413,7 +386,6 @@ def _find_or_cache_key_value_store_by_possible_id(client: 'MemoryStorage', entry new_client.modified_at = modified_at for key, record in internal_records.items(): - # TODO: possibly do a copy/deepcopy of record? new_client.key_value_entries[key] = record client.key_value_stores_handled.append(new_client) diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index 5bd54f53..9fd6fe5a 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -8,7 +8,7 @@ from ..._utils import _filter_out_none_values_recursively, _json_dumps from ..file_storage_utils import _delete_request, _update_metadata, _update_request_queue_item -from ._utils import StorageTypes, _raise_on_duplicate_entry, _raise_on_non_existing, _unique_key_to_request_id, uuid_regex +from ._utils import StorageTypes, _force_rename, _raise_on_duplicate_entry, _raise_on_non_existing, _unique_key_to_request_id, uuid_regex if TYPE_CHECKING: from ..memory_storage import MemoryStorage @@ -66,14 +66,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: existing_store_by_id.request_queue_directory = os.path.join(self.client.request_queues_directory, name) - # Remove new directory if it exists - # TODO: compare to using os.renames, which has problems when target dir exists - # TODO: check if ignore errors needed... - await aioshutil.rmtree(existing_store_by_id.request_queue_directory, ignore_errors=True) - # Copy the previous directory to the new one - await aioshutil.copytree(previous_dir, existing_store_by_id.request_queue_directory) - # Remove the previous directory - await aioshutil.rmtree(previous_dir) + await _force_rename(previous_dir, existing_store_by_id.request_queue_directory) # Update timestamps await existing_store_by_id.update_timestamps(True) @@ -349,7 +342,6 @@ def _find_or_cache_request_queue_by_possible_id(client: 'MemoryStorage', entry_n new_client.pending_request_count = pending_request_count for request in entries: - # TODO: possibly do a copy/deepcopy of request? 
new_client.requests[request['id']] = request client.request_queues_handled.append(new_client) From 98d5c86cfba65c5bb5e49e54df10374275452263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Fri, 23 Dec 2022 23:05:48 +0100 Subject: [PATCH 07/23] refactoring, fixes --- src/apify/_utils.py | 11 +++------- src/apify/memory_storage/memory_storage.py | 21 ++++++++++--------- .../memory_storage/resource_clients/_utils.py | 4 ++-- .../resource_clients/dataset.py | 10 ++++----- .../resource_clients/key_value_store.py | 16 +++++++------- .../resource_clients/request_queue.py | 16 +++++++------- 6 files changed, 37 insertions(+), 41 deletions(-) diff --git a/src/apify/_utils.py b/src/apify/_utils.py index 1bbcde22..90fa38bf 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -1,5 +1,5 @@ import asyncio -import errno +import contextlib import inspect import json import os @@ -134,19 +134,14 @@ def __init__(self, data: Dict) -> None: self.total = data['total'] if 'total' in data else self.offset + self.count self.desc = data['desc'] if 'desc' in data else False -# TODO: Compare to https://stackoverflow.com/a/59185523 - async def _force_remove(filename: str) -> None: """JS-like rm(filename, { force: true }).""" - try: + with contextlib.suppress(FileNotFoundError): await remove(filename) - except OSError as e: - if e.errno != errno.ENOENT: # errno.ENOENT = no such file or directory - raise # re-raise exception if a different error occurred -def _json_serializer(obj: Any) -> str: # TODO: Improve and check this!!! +def _json_serializer(obj: Any) -> str: # TODO: Decide how to parse/dump/handle datetimes! if isinstance(obj, (datetime)): return obj.isoformat(timespec='milliseconds') + 'Z' else: diff --git a/src/apify/memory_storage/memory_storage.py b/src/apify/memory_storage/memory_storage.py index c495bcda..bd041129 100644 --- a/src/apify/memory_storage/memory_storage.py +++ b/src/apify/memory_storage/memory_storage.py @@ -63,23 +63,22 @@ async def purge(self) -> None: key_value_store_folders = await scandir(self.key_value_stores_directory) for key_value_store_folder in key_value_store_folders: if key_value_store_folder.name.startswith('__APIFY_TEMPORARY') or key_value_store_folder.name.startswith('__OLD'): - await self._batch_remove_files(os.path.join(self.key_value_stores_directory, key_value_store_folder.name)) + await self._batch_remove_files(key_value_store_folder.path) elif key_value_store_folder.name == 'default': - await self._handle_default_key_value_store(os.path.join(self.key_value_stores_directory, key_value_store_folder.name)) + await self._handle_default_key_value_store(key_value_store_folder.path) # Datasets if await ospath.exists(self.datasets_directory): dataset_folders = await scandir(self.datasets_directory) for dataset_folder in dataset_folders: if dataset_folder.name == 'default' or dataset_folder.name.startswith('__APIFY_TEMPORARY'): - print(dataset_folder.name) - await self._batch_remove_files(os.path.join(self.datasets_directory, dataset_folder.name)) + await self._batch_remove_files(dataset_folder.path) # Request queues if await ospath.exists(self.request_queues_directory): request_queue_folders = await scandir(self.request_queues_directory) for request_queue_folder in request_queue_folders: if request_queue_folder.name == 'default' or request_queue_folder.name.startswith('__APIFY_TEMPORARY'): - await self._batch_remove_files(os.path.join(self.request_queues_directory, request_queue_folder.name)) + await 
self._batch_remove_files(request_queue_folder.path) def teardown(self) -> None: """TODO: docs.""" @@ -88,7 +87,7 @@ def teardown(self) -> None: async def _handle_default_key_value_store(self, folder: str) -> None: folder_exists = await ospath.exists(folder) - temporary_path = os.path.join(folder, '../__APIFY_MIGRATING_KEY_VALUE_STORE__') + temporary_path = os.path.normpath(os.path.join(folder, '../__APIFY_MIGRATING_KEY_VALUE_STORE__')) # For optimization, we want to only attempt to copy a few files from the default key-value store possible_input_keys = [ @@ -114,7 +113,7 @@ async def _handle_default_key_value_store(self, folder: str) -> None: # Remove the original folder and all its content counter = 0 - temp_path_for_old_folder = os.path.join(folder, f'../__OLD_DEFAULT_{counter}__') + temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__')) done = False while not done: try: @@ -122,7 +121,7 @@ async def _handle_default_key_value_store(self, folder: str) -> None: done = True except Exception: counter += 1 - temp_path_for_old_folder = os.path.join(folder, f'../__OLD_DEFAULT_{counter}__') + temp_path_for_old_folder = os.path.normpath(os.path.join(folder, f'../__OLD_DEFAULT_{counter}__')) # Replace the temporary folder with the original folder await rename(temporary_path, folder) @@ -132,9 +131,11 @@ async def _handle_default_key_value_store(self, folder: str) -> None: async def _batch_remove_files(self, folder: str, counter: int = 0) -> None: folder_exists = await ospath.exists(folder) - print(f'batch remove {folder}') + if folder_exists: - temporary_folder = folder if folder.startswith('__APIFY_TEMPORARY_') else os.path.join(folder, f'../__APIFY_TEMPORARY_{counter}__') + # TODO: the startswith condition is always False, it's also broken in crawlee... 
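+            # Note: `folder` is always a full path here (e.g. '<storage_dir>/datasets/default'),
+            # so it never starts with '__APIFY_TEMPORARY_'; the check would have to inspect
+            # os.path.basename(folder) to ever match.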
+ temporary_folder = folder if folder.startswith('__APIFY_TEMPORARY_') else os.path.normpath( + os.path.join(folder, f'../__APIFY_TEMPORARY_{counter}__')) try: # Rename the old folder to the new one to allow background deletions diff --git a/src/apify/memory_storage/resource_clients/_utils.py b/src/apify/memory_storage/resource_clients/_utils.py index 895f33f0..c79a923d 100644 --- a/src/apify/memory_storage/resource_clients/_utils.py +++ b/src/apify/memory_storage/resource_clients/_utils.py @@ -15,11 +15,11 @@ uuid_regex = re.compile('[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', re.I) -def _raise_on_non_existing(client_type: StorageTypes, id: str) -> NoReturn: +def _raise_on_non_existing_storage(client_type: StorageTypes, id: str) -> NoReturn: raise ValueError(f'{client_type} with id: {id} does not exist.') -def _raise_on_duplicate_entry(client_type: StorageTypes, key_name: str, value: str) -> NoReturn: +def _raise_on_duplicate_storage(client_type: StorageTypes, key_name: str, value: str) -> NoReturn: raise ValueError(f'{client_type} with {key_name}: {value} already exists.') diff --git a/src/apify/memory_storage/resource_clients/dataset.py b/src/apify/memory_storage/resource_clients/dataset.py index 58a75b4c..c52b56b3 100644 --- a/src/apify/memory_storage/resource_clients/dataset.py +++ b/src/apify/memory_storage/resource_clients/dataset.py @@ -9,7 +9,7 @@ from ..._types import JSONSerializable from ..._utils import ListPage from ..file_storage_utils import _update_dataset_items, _update_metadata -from ._utils import StorageTypes, _force_rename, _raise_on_duplicate_entry, _raise_on_non_existing, uuid_regex +from ._utils import StorageTypes, _force_rename, _raise_on_duplicate_storage, _raise_on_non_existing_storage, uuid_regex if TYPE_CHECKING: from ..memory_storage import MemoryStorage @@ -53,7 +53,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: existing_store_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.DATASET, self.id) + _raise_on_non_existing_storage(StorageTypes.DATASET, self.id) # Skip if no changes if name is None: @@ -64,7 +64,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: (store for store in self.client.datasets_handled if store.name and store.name.lower() == name.lower()), None) if existing_store_by_name is not None: - _raise_on_duplicate_entry(StorageTypes.DATASET, 'name', name) + _raise_on_duplicate_storage(StorageTypes.DATASET, 'name', name) existing_store_by_id.name = name @@ -110,7 +110,7 @@ async def list_items( existing_store_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.DATASET, self.id) + _raise_on_non_existing_storage(StorageTypes.DATASET, self.id) start, end = existing_store_by_id._get_start_and_end_indexes( max(existing_store_by_id.item_count - (offset or 0) - (limit or 0), 0) if desc else offset or 0, @@ -237,7 +237,7 @@ async def push_items(self, items: JSONSerializable) -> None: existing_store_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.DATASET, self.id) + _raise_on_non_existing_storage(StorageTypes.DATASET, self.id) normalized = self._normalize_items(items) diff --git 
a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index 0abccfa4..10d69d1e 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -18,8 +18,8 @@ _guess_file_extension, _is_file_or_bytes, _maybe_parse_body, - _raise_on_duplicate_entry, - _raise_on_non_existing, + _raise_on_duplicate_storage, + _raise_on_non_existing_storage, uuid_regex, ) @@ -60,7 +60,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) + _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) # Skip if no changes if name is None: @@ -71,7 +71,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: (store for store in self.client.key_value_stores_handled if store.name and store.name.lower() == name.lower()), None) if existing_store_by_name is not None: - _raise_on_duplicate_entry(StorageTypes.KEY_VALUE_STORE, 'name', name) + _raise_on_duplicate_storage(StorageTypes.KEY_VALUE_STORE, 'name', name) existing_store_by_id.name = name @@ -102,7 +102,7 @@ async def list_keys(self, *, limit: int = DEFAULT_API_PARAM_LIMIT, exclusive_sta existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) + _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) items = [] @@ -145,7 +145,7 @@ async def _get_record_internal(self, key: str, as_bytes: bool = False) -> Option existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) + _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) entry = existing_store_by_id.key_value_entries.get(key) @@ -184,7 +184,7 @@ async def set_record(self, key: str, value: Any, content_type: Optional[str] = N existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) + _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) if content_type is None: # TODO: Add streaming support for this method... 
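# A minimal, self-contained sketch of the content-type inference that `set_record` performs
# when the caller passes no explicit `content_type`. The stand-ins for `_is_file_or_bytes`
# and `_json_dumps` below are simplified copies of the helpers from `apify._utils`, and the
# function name `_infer_value_and_content_type` is invented purely for this example.
import io
import json
from typing import Any, Tuple


def _is_file_or_bytes(value: Any) -> bool:
    return isinstance(value, (bytes, bytearray, io.IOBase))


def _json_dumps(obj: Any) -> str:
    return json.dumps(obj, ensure_ascii=False, indent=2)


def _infer_value_and_content_type(value: Any) -> Tuple[Any, str]:
    """Guess the content type from the value and encode JSON-like values to UTF-8 bytes."""
    if _is_file_or_bytes(value):
        content_type = 'application/octet-stream'
    elif isinstance(value, str):
        content_type = 'text/plain; charset=utf-8'
    else:
        content_type = 'application/json; charset=utf-8'
        value = _json_dumps(value).encode('utf-8')
    return value, content_type


assert _infer_value_and_content_type(b'raw')[1] == 'application/octet-stream'
assert _infer_value_and_content_type('hello')[1] == 'text/plain; charset=utf-8'
assert _infer_value_and_content_type({'a': 1})[1] == 'application/json; charset=utf-8'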
@@ -224,7 +224,7 @@ async def delete_record(self, key: str) -> None: existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.KEY_VALUE_STORE, self.id) + _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) entry = existing_store_by_id.key_value_entries.get(key) diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index 9fd6fe5a..cfcf9646 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -8,7 +8,7 @@ from ..._utils import _filter_out_none_values_recursively, _json_dumps from ..file_storage_utils import _delete_request, _update_metadata, _update_request_queue_item -from ._utils import StorageTypes, _force_rename, _raise_on_duplicate_entry, _raise_on_non_existing, _unique_key_to_request_id, uuid_regex +from ._utils import StorageTypes, _force_rename, _raise_on_duplicate_storage, _raise_on_non_existing_storage, _unique_key_to_request_id, uuid_regex if TYPE_CHECKING: from ..memory_storage import MemoryStorage @@ -47,7 +47,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) # Skip if no changes if name is None: @@ -58,7 +58,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: (store for store in self.client.request_queues_handled if store.name and store.name.lower() == name.lower()), None) if existing_store_by_name is not None: - _raise_on_duplicate_entry(StorageTypes.REQUEST_QUEUE, 'name', name) + _raise_on_duplicate_storage(StorageTypes.REQUEST_QUEUE, 'name', name) existing_store_by_id.name = name @@ -89,7 +89,7 @@ async def list_head(self, *, limit: Optional[int] = None) -> Dict: existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) await existing_store_by_id.update_timestamps(False) @@ -114,7 +114,7 @@ async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) request_model = self._create_internal_request(request, forefront) @@ -153,7 +153,7 @@ async def get_request(self, request_id: str) -> Optional[Dict]: existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) await existing_store_by_id.update_timestamps(False) @@ -165,7 +165,7 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, 
self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) request_model = self._create_internal_request(request, forefront) @@ -212,7 +212,7 @@ async def delete_request(self, request_id: str) -> None: existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if existing_store_by_id is None: - _raise_on_non_existing(StorageTypes.REQUEST_QUEUE, self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) request = existing_store_by_id.requests.get(request_id) From 550b1184c257fa7a0f72ffb437be57735be73b8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Fri, 30 Dec 2022 07:08:31 +0100 Subject: [PATCH 08/23] refactoring stuff, merging master --- src/apify/__init__.py | 3 +- src/apify/_utils.py | 77 ++++++++++++++++++- src/apify/consts.py | 1 + .../memory_storage/file_storage_utils.py | 8 +- .../resource_clients/dataset.py | 4 +- .../resource_clients/key_value_store.py | 8 +- .../resource_clients/request_queue.py | 12 ++- 7 files changed, 98 insertions(+), 15 deletions(-) diff --git a/src/apify/__init__.py b/src/apify/__init__.py index 1032fc27..a14461d0 100644 --- a/src/apify/__init__.py +++ b/src/apify/__init__.py @@ -1,4 +1,5 @@ from ._version import __version__ from .actor import Actor +from .memory_storage.memory_storage import MemoryStorage -__all__ = ['Actor', '__version__'] +__all__ = ['Actor', 'MemoryStorage', '__version__'] diff --git a/src/apify/_utils.py b/src/apify/_utils.py index d75c5ba3..73828b06 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -1,16 +1,23 @@ import asyncio +import base64 import contextlib +import hashlib import inspect +import io import json +import mimetypes import os +import re import sys import time from datetime import datetime, timezone from enum import Enum -from typing import Any, Callable, Dict, Generic, List, Optional, TypeVar, Union, cast, overload +from typing import Any, Callable, Dict, Generic, List, NoReturn, Optional, TypeVar, Union, cast, overload +import aioshutil import psutil -from aiofiles.os import remove +from aiofiles import ospath +from aiofiles.os import remove, rename from apify_client import __version__ as client_version @@ -23,7 +30,9 @@ BOOL_ENV_VARS, DATETIME_ENV_VARS, INTEGER_ENV_VARS, + REQUEST_ID_LENGTH, ApifyEnvVars, + StorageTypes, ) @@ -231,3 +240,67 @@ def _filter_out_none_values_recursively_internal(dictionary: Dict, remove_empty_ def _json_dumps(obj: Any) -> str: return json.dumps(obj, ensure_ascii=False, indent=2, default=_json_serializer) + + +uuid_regex = re.compile('[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', re.I) + + +def _raise_on_non_existing_storage(client_type: StorageTypes, id: str) -> NoReturn: + raise ValueError(f'{client_type} with id: {id} does not exist.') + + +def _raise_on_duplicate_storage(client_type: StorageTypes, key_name: str, value: str) -> NoReturn: + raise ValueError(f'{client_type} with {key_name}: {value} already exists.') + + +def _guess_file_extension(content_type: str) -> Optional[str]: + # e.g. mimetypes.guess_extension('application/json ') does not work... 
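+    # mimetypes.guess_extension() only understands a bare 'type/subtype' value, so any
+    # parameters (e.g. '; charset=utf-8') and surrounding whitespace are stripped first.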
+ actual_content_type = content_type.split(';')[0].strip() + ext = mimetypes.guess_extension(actual_content_type) + # Remove the leading dot if extension successfully parsed + return ext[1:] if ext is not None else ext + + +def _is_content_type_json(content_type: str) -> bool: + return bool(re.search(r'^application/json', content_type, flags=re.IGNORECASE)) + + +def _is_content_type_xml(content_type: str) -> bool: + return bool(re.search(r'^application/.*xml$', content_type, flags=re.IGNORECASE)) + + +def _is_content_type_text(content_type: str) -> bool: + return bool(re.search(r'^text/', content_type, flags=re.IGNORECASE)) + + +def _is_file_or_bytes(value: Any) -> bool: + # The check for IOBase is not ideal, it would be better to use duck typing, + # but then the check would be super complex, judging from how the 'requests' library does it. + # This way should be good enough for the vast majority of use cases, if it causes issues, we can improve it later. + return isinstance(value, (bytes, bytearray, io.IOBase)) + + +def _maybe_parse_body(body: bytes, content_type: str) -> Any: + try: + if _is_content_type_json(content_type): + return json.loads(body) # Returns any + elif _is_content_type_xml(content_type) or _is_content_type_text(content_type): + return body.decode('utf-8') # TODO: Check if utf-8 can be assumed + except ValueError as err: + print('_maybe_parse_body error', err) + return body + + +def _unique_key_to_request_id(unique_key: str) -> str: + id = re.sub(r'(\+|\/|=)', '', base64.b64encode(hashlib.sha256(unique_key.encode('utf-8')).digest()).decode('utf-8')) + + return id[:REQUEST_ID_LENGTH] if len(id) > REQUEST_ID_LENGTH else id + + +async def _force_rename(src_dir: str, dst_dir: str) -> None: + # Make sure source directory exists + if await ospath.exists(src_dir): + # Remove destination directory if it exists + if await ospath.exists(dst_dir): + await aioshutil.rmtree(dst_dir, ignore_errors=True) + await rename(src_dir, dst_dir) diff --git a/src/apify/consts.py b/src/apify/consts.py index 8baad03e..265bef99 100644 --- a/src/apify/consts.py +++ b/src/apify/consts.py @@ -134,6 +134,7 @@ class ApifyEnvVars(str, Enum): STRING_ENV_VARS: List[_STRING_ENV_VARS_TYPE] = list(get_args(_STRING_ENV_VARS_TYPE)) + class StorageTypes(str, Enum): """Possible Apify storage types.""" diff --git a/src/apify/memory_storage/file_storage_utils.py b/src/apify/memory_storage/file_storage_utils.py index 40ba50d5..a2cc74e4 100644 --- a/src/apify/memory_storage/file_storage_utils.py +++ b/src/apify/memory_storage/file_storage_utils.py @@ -21,7 +21,7 @@ async def _update_metadata(*, data: Dict, entity_directory: str, write_metadata: await f.write(_json_dumps(data).encode('utf-8')) -async def _check_conditions(entity_directory: str, persist_storage: bool) -> None: +async def _check_and_ensure_dir(entity_directory: str, persist_storage: bool) -> None: # Skip writing files to the disk if the client has the option set to false if not persist_storage: return @@ -36,7 +36,7 @@ async def _update_dataset_items( entity_directory: str, persist_storage: bool, ) -> None: - await _check_conditions(entity_directory, persist_storage) + await _check_and_ensure_dir(entity_directory, persist_storage) # Save all the new items to the disk for idx, item in data: file_path = os.path.join(entity_directory, f'{idx}.json') @@ -52,7 +52,7 @@ async def _set_or_delete_key_value_store_record( should_set: bool, write_metadata: bool, ) -> None: - await _check_conditions(entity_directory, persist_storage) + await 
_check_and_ensure_dir(entity_directory, persist_storage) # Create files for the record record_path = os.path.join(entity_directory, f"""{record['key']}.{record['extension']}""") @@ -85,7 +85,7 @@ async def _update_request_queue_item( entity_directory: str, persist_storage: bool, ) -> None: - await _check_conditions(entity_directory, persist_storage) + await _check_and_ensure_dir(entity_directory, persist_storage) # Write the request to the file file_path = os.path.join(entity_directory, f'{request_id}.json') diff --git a/src/apify/memory_storage/resource_clients/dataset.py b/src/apify/memory_storage/resource_clients/dataset.py index c52b56b3..9a2c16f7 100644 --- a/src/apify/memory_storage/resource_clients/dataset.py +++ b/src/apify/memory_storage/resource_clients/dataset.py @@ -7,9 +7,9 @@ import aioshutil from ..._types import JSONSerializable -from ..._utils import ListPage +from ..._utils import ListPage, _force_rename, _raise_on_duplicate_storage, _raise_on_non_existing_storage, uuid_regex +from ...consts import StorageTypes from ..file_storage_utils import _update_dataset_items, _update_metadata -from ._utils import StorageTypes, _force_rename, _raise_on_duplicate_storage, _raise_on_non_existing_storage, uuid_regex if TYPE_CHECKING: from ..memory_storage import MemoryStorage diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index 10d69d1e..601d8f1d 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -10,18 +10,18 @@ import aioshutil -from ..._utils import _json_dumps -from ...consts import DEFAULT_API_PARAM_LIMIT, StorageTypes -from ..file_storage_utils import _set_or_delete_key_value_store_record, _update_metadata -from ._utils import ( +from ..._utils import ( _force_rename, _guess_file_extension, _is_file_or_bytes, + _json_dumps, _maybe_parse_body, _raise_on_duplicate_storage, _raise_on_non_existing_storage, uuid_regex, ) +from ...consts import DEFAULT_API_PARAM_LIMIT, StorageTypes +from ..file_storage_utils import _set_or_delete_key_value_store_record, _update_metadata if TYPE_CHECKING: from ..memory_storage import MemoryStorage diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index cfcf9646..9aebe2ea 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -6,9 +6,17 @@ import aioshutil -from ..._utils import _filter_out_none_values_recursively, _json_dumps +from ..._utils import ( + _filter_out_none_values_recursively, + _force_rename, + _json_dumps, + _raise_on_duplicate_storage, + _raise_on_non_existing_storage, + _unique_key_to_request_id, + uuid_regex, +) +from ...consts import StorageTypes from ..file_storage_utils import _delete_request, _update_metadata, _update_request_queue_item -from ._utils import StorageTypes, _force_rename, _raise_on_duplicate_storage, _raise_on_non_existing_storage, _unique_key_to_request_id, uuid_regex if TYPE_CHECKING: from ..memory_storage import MemoryStorage From cdeeea0acc381452e834f738281fe9ff5cc26d07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Fri, 30 Dec 2022 08:10:15 +0100 Subject: [PATCH 09/23] refactoring, docs --- src/apify/_utils.py | 9 +++ .../memory_storage/resource_clients/_utils.py | 76 ------------------- .../resource_clients/key_value_store.py | 6 
+- .../resource_clients/request_queue.py | 6 +- 4 files changed, 15 insertions(+), 82 deletions(-) delete mode 100644 src/apify/memory_storage/resource_clients/_utils.py diff --git a/src/apify/_utils.py b/src/apify/_utils.py index 73828b06..c9939a19 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -239,12 +239,18 @@ def _filter_out_none_values_recursively_internal(dictionary: Dict, remove_empty_ def _json_dumps(obj: Any) -> str: + """Dump JSON to a string with the correct settings and serializer.""" return json.dumps(obj, ensure_ascii=False, indent=2, default=_json_serializer) uuid_regex = re.compile('[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', re.I) +def _is_uuid(string: str) -> bool: + """Test whether the given string matches UUID format.""" + return bool(uuid_regex.match(string)) + + def _raise_on_non_existing_storage(client_type: StorageTypes, id: str) -> NoReturn: raise ValueError(f'{client_type} with id: {id} does not exist.') @@ -254,6 +260,7 @@ def _raise_on_duplicate_storage(client_type: StorageTypes, key_name: str, value: def _guess_file_extension(content_type: str) -> Optional[str]: + """Guess the file extension based on content type.""" # e.g. mimetypes.guess_extension('application/json ') does not work... actual_content_type = content_type.split(';')[0].strip() ext = mimetypes.guess_extension(actual_content_type) @@ -292,12 +299,14 @@ def _maybe_parse_body(body: bytes, content_type: str) -> Any: def _unique_key_to_request_id(unique_key: str) -> str: + """Generate request ID based on unique key in a deterministic way.""" id = re.sub(r'(\+|\/|=)', '', base64.b64encode(hashlib.sha256(unique_key.encode('utf-8')).digest()).decode('utf-8')) return id[:REQUEST_ID_LENGTH] if len(id) > REQUEST_ID_LENGTH else id async def _force_rename(src_dir: str, dst_dir: str) -> None: + """Rename a directory. Checks for existence of soruce directory and removes destination directory if it exists.""" # Make sure source directory exists if await ospath.exists(src_dir): # Remove destination directory if it exists diff --git a/src/apify/memory_storage/resource_clients/_utils.py b/src/apify/memory_storage/resource_clients/_utils.py deleted file mode 100644 index c79a923d..00000000 --- a/src/apify/memory_storage/resource_clients/_utils.py +++ /dev/null @@ -1,76 +0,0 @@ -import base64 -import hashlib -import io -import json -import mimetypes -import re -from typing import Any, NoReturn, Optional - -import aioshutil -from aiofiles import ospath -from aiofiles.os import rename - -from ...consts import REQUEST_ID_LENGTH, StorageTypes - -uuid_regex = re.compile('[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}', re.I) - - -def _raise_on_non_existing_storage(client_type: StorageTypes, id: str) -> NoReturn: - raise ValueError(f'{client_type} with id: {id} does not exist.') - - -def _raise_on_duplicate_storage(client_type: StorageTypes, key_name: str, value: str) -> NoReturn: - raise ValueError(f'{client_type} with {key_name}: {value} already exists.') - - -def _guess_file_extension(content_type: str) -> Optional[str]: - # e.g. mimetypes.guess_extension('application/json ') does not work... 
- actual_content_type = content_type.split(';')[0].strip() - ext = mimetypes.guess_extension(actual_content_type) - # Remove the leading dot if extension successfully parsed - return ext[1:] if ext is not None else ext - - -def _is_content_type_json(content_type: str) -> bool: - return bool(re.search(r'^application/json', content_type, flags=re.IGNORECASE)) - - -def _is_content_type_xml(content_type: str) -> bool: - return bool(re.search(r'^application/.*xml$', content_type, flags=re.IGNORECASE)) - - -def _is_content_type_text(content_type: str) -> bool: - return bool(re.search(r'^text/', content_type, flags=re.IGNORECASE)) - - -def _is_file_or_bytes(value: Any) -> bool: - # The check for IOBase is not ideal, it would be better to use duck typing, - # but then the check would be super complex, judging from how the 'requests' library does it. - # This way should be good enough for the vast majority of use cases, if it causes issues, we can improve it later. - return isinstance(value, (bytes, bytearray, io.IOBase)) - - -def _maybe_parse_body(body: bytes, content_type: str) -> Any: - try: - if _is_content_type_json(content_type): - return json.loads(body) # Returns any - elif _is_content_type_xml(content_type) or _is_content_type_text(content_type): - return body.decode('utf-8') # TODO: Check if utf-8 can be assumed - except ValueError as err: - print('_maybe_parse_body error', err) - return body - - -def _unique_key_to_request_id(unique_key: str) -> str: - id = re.sub(r'(\+|\/|=)', '', base64.b64encode(hashlib.sha256(unique_key.encode('utf-8')).digest()).decode('utf-8')) - - return id[:REQUEST_ID_LENGTH] if len(id) > REQUEST_ID_LENGTH else id - - -async def _force_rename(src_dir: str, dst_dir: str) -> None: - # Make sure source directory exists - if await ospath.exists(src_dir): - # Remove destination directory if it exists - if await ospath.exists(dst_dir): - await aioshutil.rmtree(dst_dir, ignore_errors=True) - await rename(src_dir, dst_dir) diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index 601d8f1d..6fe9c7c5 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -14,11 +14,11 @@ _force_rename, _guess_file_extension, _is_file_or_bytes, + _is_uuid, _json_dumps, _maybe_parse_body, _raise_on_duplicate_storage, _raise_on_non_existing_storage, - uuid_regex, ) from ...consts import DEFAULT_API_PARAM_LIMIT, StorageTypes from ..file_storage_utils import _set_or_delete_key_value_store_record, _update_metadata @@ -371,9 +371,9 @@ def _find_or_cache_key_value_store_by_possible_id(client: 'MemoryStorage', entry internal_records[key] = new_record if id is None and name is None: - is_uuid = uuid_regex.match(entry_name_or_id) + is_uuid = _is_uuid(entry_name_or_id) - if is_uuid is not None: + if is_uuid: id = entry_name_or_id else: name = entry_name_or_id diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index 9aebe2ea..0e3cdbe0 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -9,11 +9,11 @@ from ..._utils import ( _filter_out_none_values_recursively, _force_rename, + _is_uuid, _json_dumps, _raise_on_duplicate_storage, _raise_on_non_existing_storage, _unique_key_to_request_id, - uuid_regex, ) from ...consts import StorageTypes from ..file_storage_utils 
import _delete_request, _update_metadata, _update_request_queue_item @@ -333,9 +333,9 @@ def _find_or_cache_request_queue_by_possible_id(client: 'MemoryStorage', entry_n entries.append(request) if id is None and name is None: - is_uuid = uuid_regex.match(entry_name_or_id) + is_uuid = _is_uuid(entry_name_or_id) - if is_uuid is not None: + if is_uuid: id = entry_name_or_id else: name = entry_name_or_id From bf744ba981b368ed229f01bc9ae9e3cf1c522cc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Fri, 30 Dec 2022 09:00:09 +0100 Subject: [PATCH 10/23] Add unit tests for new utility methods --- src/apify/_utils.py | 2 +- tests/unit/test_utils.py | 173 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 173 insertions(+), 2 deletions(-) diff --git a/src/apify/_utils.py b/src/apify/_utils.py index c9939a19..bdcaa640 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -287,7 +287,7 @@ def _is_file_or_bytes(value: Any) -> bool: return isinstance(value, (bytes, bytearray, io.IOBase)) -def _maybe_parse_body(body: bytes, content_type: str) -> Any: +def _maybe_parse_body(body: bytes, content_type: str) -> Any: # TODO: Improve return type try: if _is_content_type_json(content_type): return json.loads(body) # Returns any diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index ebd20f05..ef062f56 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,20 +1,38 @@ import asyncio import datetime +import io +import os +import uuid +from aiofiles.os import mkdir from enum import Enum import pytest from apify._utils import ( _fetch_and_parse_env_var, + _filter_out_none_values_recursively, + _filter_out_none_values_recursively_internal, + _force_remove, _get_cpu_usage_percent, _get_memory_usage_bytes, + _guess_file_extension, + _is_content_type_json, + _is_content_type_text, + _is_content_type_xml, + _is_file_or_bytes, + _is_uuid, + _json_dumps, _maybe_extract_enum_member_value, _maybe_parse_bool, _maybe_parse_datetime, _maybe_parse_int, + _raise_on_duplicate_storage, + _raise_on_non_existing_storage, _run_func_at_interval_async, + _unique_key_to_request_id, + _force_rename, ) -from apify.consts import ApifyEnvVars +from apify.consts import ApifyEnvVars, StorageTypes def test__fetch_and_parse_env_var(monkeypatch: pytest.MonkeyPatch) -> None: @@ -138,3 +156,156 @@ async def async_increment() -> None: await asyncio.sleep(1) assert test_var == 3 + + +def test__filter_out_none_values_recursively() -> None: # Copypasted from client + assert _filter_out_none_values_recursively({'k1': 'v1'}) == {'k1': 'v1'} + assert _filter_out_none_values_recursively({'k1': None}) == {} + assert _filter_out_none_values_recursively({'k1': 'v1', 'k2': None, 'k3': {'k4': 'v4', 'k5': None}, 'k6': {'k7': None}}) \ + == {'k1': 'v1', 'k3': {'k4': 'v4'}} + + +def test__filter_out_none_values_recursively_internal() -> None: # Copypasted from client + assert _filter_out_none_values_recursively_internal({}) == {} + assert _filter_out_none_values_recursively_internal({'k1': {}}) == {} + assert _filter_out_none_values_recursively_internal({}, False) == {} + assert _filter_out_none_values_recursively_internal({'k1': {}}, False) == {'k1': {}} + assert _filter_out_none_values_recursively_internal({}, True) is None + assert _filter_out_none_values_recursively_internal({'k1': {}}, True) is None + + +def test__is_content_type_json() -> None: # Copypasted from client + # returns True for the right content types + assert _is_content_type_json('application/json') is True + 
assert _is_content_type_json('application/jsonc') is True + # returns False for bad content types + assert _is_content_type_json('application/xml') is False + assert _is_content_type_json('application/ld+json') is False + + +def test__is_content_type_xml() -> None: # Copypasted from client + # returns True for the right content types + assert _is_content_type_xml('application/xml') is True + assert _is_content_type_xml('application/xhtml+xml') is True + # returns False for bad content types + assert _is_content_type_xml('application/json') is False + assert _is_content_type_xml('text/html') is False + + +def test__is_content_type_text() -> None: # Copypasted from client + # returns True for the right content types + assert _is_content_type_text('text/html') is True + assert _is_content_type_text('text/plain') is True + # returns False for bad content types + assert _is_content_type_text('application/json') is False + assert _is_content_type_text('application/text') is False + + +def test__is_file_or_bytes() -> None: # Copypasted from client + # returns True for the right value types + assert _is_file_or_bytes(b'abc') is True + assert _is_file_or_bytes(bytearray.fromhex('F0F1F2')) is True + assert _is_file_or_bytes(io.BytesIO(b'\x00\x01\x02')) is True + + # returns False for bad value types + assert _is_file_or_bytes('abc') is False + assert _is_file_or_bytes(['a', 'b', 'c']) is False + assert _is_file_or_bytes({'a': 'b'}) is False + assert _is_file_or_bytes(None) is False + + +@pytest.mark.asyncio +async def test__force_remove(tmp_path: str) -> None: + test_file_path = os.path.join(tmp_path, 'test.txt') + # Does not crash/raise when the file does not exist + assert os.path.exists(test_file_path) is False + await _force_remove(test_file_path) + assert os.path.exists(test_file_path) is False + + # Removes the file if it exists + open(test_file_path, 'a').close() + assert os.path.exists(test_file_path) is True + await _force_remove(test_file_path) + assert os.path.exists(test_file_path) is False + + +def test__is_uuid() -> None: + assert _is_uuid(str(uuid.uuid4())) is True + assert _is_uuid('clearly not a UUID') is False + assert _is_uuid('') is False + + +def test__raise_on_non_existing_storage() -> None: + with pytest.raises(ValueError): + _raise_on_non_existing_storage(StorageTypes.DATASET, str(uuid.uuid4())) + + +def test__raise_on_duplicate_storage() -> None: + with pytest.raises(ValueError): + _raise_on_duplicate_storage(StorageTypes.DATASET, 'name', 'test') + + +def test__guess_file_extension() -> None: + # Can guess common types properly + assert _guess_file_extension('application/json') == 'json' + # assert _guess_file_extension('application/xml') == 'xml' # TODO: This shit library returns xsl for no apparent reason + assert _guess_file_extension('text/plain') == 'txt' + + # Can handle unusual formats + assert _guess_file_extension(' application/json ') == 'json' + assert _guess_file_extension('APPLICATION/JSON') == 'json' + assert _guess_file_extension('application/json;charset=utf-8') == 'json' + + # Returns None for non-existent content types + assert _guess_file_extension('clearly not a content type') is None + assert _guess_file_extension('') is None + + +def test__json_dumps() -> None: + expected = """{ + "string": "123", + "number": 456, + "nested": { + "abc": "def" + } +}""" + actual = _json_dumps({ # TODO: add a date into the object after datetime serialization format is finalized + 'string': '123', + 'number': 456, + 'nested': { + 'abc': 'def', + }, + }) + print(actual) + 
assert actual == expected + + +def test__unique_key_to_request_id() -> None: + # Right side from `uniqueKeyToRequestId` in Crawlee + assert _unique_key_to_request_id('abc') == 'ungWv48BzpBQUDe' + assert _unique_key_to_request_id('test') == 'n4bQgYhMfWWaLqg' + +@pytest.mark.asyncio +async def test__force_rename(tmp_path: str) -> None: + src_dir = os.path.join(tmp_path, 'src') + dst_dir = os.path.join(tmp_path, 'dst') + src_file = os.path.join(src_dir, 'src_dir.txt') + dst_file = os.path.join(dst_dir, 'dst_dir.txt') + # Won't crash if source directory does not exist + assert os.path.exists(src_dir) is False + await _force_rename(src_dir, dst_dir) + + # Will remove dst_dir if it exists (also covers normal case) + # Create the src_dir with a file in it + await mkdir(src_dir) + open(src_file, 'a').close() + # Create the dst_dir with a file in it + await mkdir(dst_dir) + open(dst_file, 'a').close() + assert os.path.exists(src_file) is True + assert os.path.exists(dst_file) is True + await _force_rename(src_dir, dst_dir) + assert os.path.exists(src_dir) is False + assert os.path.exists(dst_file) is False + # src_dir.txt should exist in dst_dir + assert os.path.exists(os.path.join(dst_dir, 'src_dir.txt')) is True From f798e2b45e1932430c2ede1d98d03105f4cbdc40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Fri, 30 Dec 2022 09:01:23 +0100 Subject: [PATCH 11/23] fix format --- tests/unit/test_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index ef062f56..428d7632 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -3,16 +3,17 @@ import io import os import uuid -from aiofiles.os import mkdir from enum import Enum import pytest +from aiofiles.os import mkdir from apify._utils import ( _fetch_and_parse_env_var, _filter_out_none_values_recursively, _filter_out_none_values_recursively_internal, _force_remove, + _force_rename, _get_cpu_usage_percent, _get_memory_usage_bytes, _guess_file_extension, @@ -30,7 +31,6 @@ _raise_on_non_existing_storage, _run_func_at_interval_async, _unique_key_to_request_id, - _force_rename, ) from apify.consts import ApifyEnvVars, StorageTypes @@ -285,6 +285,7 @@ def test__unique_key_to_request_id() -> None: assert _unique_key_to_request_id('abc') == 'ungWv48BzpBQUDe' assert _unique_key_to_request_id('test') == 'n4bQgYhMfWWaLqg' + @pytest.mark.asyncio async def test__force_rename(tmp_path: str) -> None: src_dir = os.path.join(tmp_path, 'src') From eb91c7f6ba03555fc842e02662f1f6c7d3f8d3ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Fri, 30 Dec 2022 09:38:32 +0100 Subject: [PATCH 12/23] revert MemoryStorage export --- src/apify/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/apify/__init__.py b/src/apify/__init__.py index a14461d0..1032fc27 100644 --- a/src/apify/__init__.py +++ b/src/apify/__init__.py @@ -1,5 +1,4 @@ from ._version import __version__ from .actor import Actor -from .memory_storage.memory_storage import MemoryStorage -__all__ = ['Actor', 'MemoryStorage', '__version__'] +__all__ = ['Actor', '__version__'] From ef91fd08c50e4fe3f28f173573d92255c0b1a75e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Sun, 1 Jan 2023 20:18:45 +0100 Subject: [PATCH 13/23] fixes and refactoring --- .../memory_storage/file_storage_utils.py | 29 ++++++++++++------- src/apify/memory_storage/memory_storage.py | 9 ++++-- 
.../resource_clients/dataset.py | 9 +++--- .../resource_clients/key_value_store.py | 3 +- .../resource_clients/request_queue.py | 3 +- 5 files changed, 33 insertions(+), 20 deletions(-) diff --git a/src/apify/memory_storage/file_storage_utils.py b/src/apify/memory_storage/file_storage_utils.py index a2cc74e4..4a75993f 100644 --- a/src/apify/memory_storage/file_storage_utils.py +++ b/src/apify/memory_storage/file_storage_utils.py @@ -21,7 +21,12 @@ async def _update_metadata(*, data: Dict, entity_directory: str, write_metadata: await f.write(_json_dumps(data).encode('utf-8')) -async def _check_and_ensure_dir(entity_directory: str, persist_storage: bool) -> None: +async def _update_dataset_items( + *, + data: List[Tuple[str, Dict]], + entity_directory: str, + persist_storage: bool, +) -> None: # Skip writing files to the disk if the client has the option set to false if not persist_storage: return @@ -29,14 +34,6 @@ async def _check_and_ensure_dir(entity_directory: str, persist_storage: bool) -> # Ensure the directory for the entity exists await makedirs(entity_directory, exist_ok=True) - -async def _update_dataset_items( - *, - data: List[Tuple[str, Dict]], - entity_directory: str, - persist_storage: bool, -) -> None: - await _check_and_ensure_dir(entity_directory, persist_storage) # Save all the new items to the disk for idx, item in data: file_path = os.path.join(entity_directory, f'{idx}.json') @@ -52,7 +49,12 @@ async def _set_or_delete_key_value_store_record( should_set: bool, write_metadata: bool, ) -> None: - await _check_and_ensure_dir(entity_directory, persist_storage) + # Skip writing files to the disk if the client has the option set to false + if not persist_storage: + return + + # Ensure the directory for the entity exists + await makedirs(entity_directory, exist_ok=True) # Create files for the record record_path = os.path.join(entity_directory, f"""{record['key']}.{record['extension']}""") @@ -85,7 +87,12 @@ async def _update_request_queue_item( entity_directory: str, persist_storage: bool, ) -> None: - await _check_and_ensure_dir(entity_directory, persist_storage) + # Skip writing files to the disk if the client has the option set to false + if not persist_storage: + return + + # Ensure the directory for the entity exists + await makedirs(entity_directory, exist_ok=True) # Write the request to the file file_path = os.path.join(entity_directory, f'{request_id}.json') diff --git a/src/apify/memory_storage/memory_storage.py b/src/apify/memory_storage/memory_storage.py index bd041129..34ad7804 100644 --- a/src/apify/memory_storage/memory_storage.py +++ b/src/apify/memory_storage/memory_storage.py @@ -17,9 +17,9 @@ class MemoryStorage: """Class representing an in-memory storage.""" - datasets_handled: List[DatasetClient] = [] - key_value_stores_handled: List[KeyValueStoreClient] = [] - request_queues_handled: List[RequestQueueClient] = [] + datasets_handled: List[DatasetClient] + key_value_stores_handled: List[KeyValueStoreClient] + request_queues_handled: List[RequestQueueClient] def __init__( self, *, local_data_directory: str = './storage', write_metadata: Optional[bool] = False, persist_storage: Optional[bool] = True, @@ -31,6 +31,9 @@ def __init__( self.request_queues_directory = os.path.join(self.local_data_directory, 'request_queues') self.write_metadata = write_metadata or '*' in os.getenv('DEBUG', '') self.persist_storage = persist_storage or not any(s in os.getenv('APIFY_PERSIST_STORAGE', 'true') for s in ['false', '0', '']) + self.datasets_handled = [] + 
self.key_value_stores_handled = [] + self.request_queues_handled = [] def datasets(self) -> DatasetCollectionClient: """TODO: docs.""" diff --git a/src/apify/memory_storage/resource_clients/dataset.py b/src/apify/memory_storage/resource_clients/dataset.py index 9a2c16f7..8267f0e6 100644 --- a/src/apify/memory_storage/resource_clients/dataset.py +++ b/src/apify/memory_storage/resource_clients/dataset.py @@ -7,7 +7,7 @@ import aioshutil from ..._types import JSONSerializable -from ..._utils import ListPage, _force_rename, _raise_on_duplicate_storage, _raise_on_non_existing_storage, uuid_regex +from ..._utils import ListPage, _force_rename, _is_uuid, _raise_on_duplicate_storage, _raise_on_non_existing_storage from ...consts import StorageTypes from ..file_storage_utils import _update_dataset_items, _update_metadata @@ -28,7 +28,7 @@ class DatasetClient: accessed_at = datetime.utcnow() modified_at = datetime.utcnow() item_count = 0 - dataset_entries: Dict[str, Dict] = {} + dataset_entries: Dict[str, Dict] def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: """TODO: docs.""" @@ -36,6 +36,7 @@ def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: self.dataset_directory = os.path.join(base_storage_directory, name or self.id) self.client = client self.name = name + self.dataset_entries = {} async def get(self) -> Optional[Dict]: """TODO: docs.""" @@ -364,9 +365,9 @@ def _find_or_cache_dataset_by_possible_id(client: 'MemoryStorage', entry_name_or item_count += 1 if id is None and name is None: - is_uuid = uuid_regex.match(entry_name_or_id) + is_uuid = _is_uuid(entry_name_or_id) - if is_uuid is not None: + if is_uuid: id = entry_name_or_id else: name = entry_name_or_id diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index 6fe9c7c5..5e3be909 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -35,7 +35,7 @@ class KeyValueStoreClient: created_at = datetime.utcnow() accessed_at = datetime.utcnow() modified_at = datetime.utcnow() - key_value_entries: Dict[str, Dict] = {} + key_value_entries: Dict[str, Dict] def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: """TODO: docs.""" @@ -43,6 +43,7 @@ def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: self.key_value_store_directory = os.path.join(base_storage_directory, name or self.id) self.client = client self.name = name + self.key_value_entries = {} async def get(self) -> Optional[Dict]: """TODO: docs.""" diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index 0e3cdbe0..c59a3fd7 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -30,7 +30,7 @@ class RequestQueueClient: modified_at = datetime.utcnow() handled_request_count = 0 pending_request_count = 0 - requests: Dict[str, Dict] = {} + requests: Dict[str, Dict] def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: """TODO: docs.""" @@ -38,6 +38,7 @@ def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: 
self.request_queue_directory = os.path.join(base_storage_directory, name or self.id) self.client = client self.name = name + self.requests = {} async def get(self) -> Optional[Dict]: """TODO: docs.""" From 117a2d07e3f975370cb9d7ebfcdd013c97af2efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Sun, 1 Jan 2023 20:19:18 +0100 Subject: [PATCH 14/23] unit tests for memory storage --- tests/unit/memory_storage/__init__.py | 0 .../memory_storage/test_memory_storage.py | 99 +++++++++++++++++++ tests/unit/test_utils.py | 1 - 3 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 tests/unit/memory_storage/__init__.py create mode 100644 tests/unit/memory_storage/test_memory_storage.py diff --git a/tests/unit/memory_storage/__init__.py b/tests/unit/memory_storage/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/memory_storage/test_memory_storage.py b/tests/unit/memory_storage/test_memory_storage.py new file mode 100644 index 00000000..03f6a72a --- /dev/null +++ b/tests/unit/memory_storage/test_memory_storage.py @@ -0,0 +1,99 @@ +import os + +import pytest + +from apify.memory_storage.memory_storage import MemoryStorage + + +@pytest.mark.asyncio +async def test_write_metadata(tmp_path: str) -> None: + DATASET_NAME = 'test' + DATASET_NO_METADATA_NAME = 'test-no-metadata' + ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=True) + ms_no_metadata = MemoryStorage(local_data_directory=tmp_path, write_metadata=False) + datasets_client = ms.datasets() + datasets_no_metadata_client = ms_no_metadata.datasets() + await datasets_client.get_or_create(name=DATASET_NAME) + await datasets_no_metadata_client.get_or_create(name=DATASET_NO_METADATA_NAME) + assert os.path.exists(os.path.join(ms.datasets_directory, DATASET_NAME, '__metadata__.json')) is True + assert os.path.exists(os.path.join(ms_no_metadata.datasets_directory, DATASET_NO_METADATA_NAME, '__metadata__.json')) is False + + +@pytest.mark.asyncio +async def test_persist_storage(tmp_path: str) -> None: + ms = MemoryStorage(local_data_directory=tmp_path, persist_storage=True) + ms_no_persist = MemoryStorage(local_data_directory=tmp_path, persist_storage=False) + kvs_client = ms.key_value_stores() + kvs_no_metadata_client = ms_no_persist.key_value_stores() + kvs_info = await kvs_client.get_or_create(name='kvs') + kvs_no_metadata_info = await kvs_no_metadata_client.get_or_create(name='kvs-no-persist') + await ms.key_value_store(id=kvs_info['id']).set_record('test', {'x': 1}, 'application/json') + await ms_no_persist.key_value_store(id=kvs_no_metadata_info['id']).set_record('test', {'x': 1}, 'application/json') + assert os.path.exists(os.path.join(ms.key_value_stores_directory, kvs_info['name'], 'test.json')) is True + assert os.path.exists(os.path.join(ms_no_persist.key_value_stores_directory, kvs_no_metadata_info['name'], 'test.json')) is False + + +@pytest.mark.asyncio +async def test_purge_datasets(tmp_path: str) -> None: + ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=True) + # Create default and non-default datasets + datasets_client = ms.datasets() + default_dataset_info = await datasets_client.get_or_create(name='default') + non_default_dataset_info = await datasets_client.get_or_create(name='non-default') + + # Check all folders inside datasets directory before and after purge + folders_before_purge = os.listdir(ms.datasets_directory) + assert default_dataset_info['name'] in folders_before_purge + assert non_default_dataset_info['name'] 
in folders_before_purge + await ms.purge() + folders_after_purge = os.listdir(ms.datasets_directory) + assert default_dataset_info['name'] not in folders_after_purge + assert non_default_dataset_info['name'] in folders_after_purge + + +@pytest.mark.asyncio +async def test_purge_key_value_stores(tmp_path: str) -> None: + ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=True) + + # Create default and non-default key-value stores + kvs_client = ms.key_value_stores() + default_kvs_info = await kvs_client.get_or_create(name='default') + non_default_kvs_info = await kvs_client.get_or_create(name='non-default') + default_kvs_client = ms.key_value_store(id=default_kvs_info['id']) + # INPUT.json should be kept + await default_kvs_client.set_record('INPUT', {'abc': 123}, 'application/json') + # test.json should not be kept + await default_kvs_client.set_record('test', {'abc': 123}, 'application/json') + + # Check all folders and files inside kvs directory before and after purge + folders_before_purge = os.listdir(ms.key_value_stores_directory) + assert default_kvs_info['name'] in folders_before_purge + assert non_default_kvs_info['name'] in folders_before_purge + default_folder_files_before_purge = os.listdir(os.path.join(ms.key_value_stores_directory, 'default')) + assert 'INPUT.json' in default_folder_files_before_purge + assert 'test.json' in default_folder_files_before_purge + await ms.purge() + folders_after_purge = os.listdir(ms.key_value_stores_directory) + assert default_kvs_info['name'] in folders_after_purge + assert non_default_kvs_info['name'] in folders_after_purge + default_folder_files_after_purge = os.listdir(os.path.join(ms.key_value_stores_directory, 'default')) + assert 'INPUT.json' in default_folder_files_after_purge + assert 'test.json' not in default_folder_files_after_purge + + +@pytest.mark.asyncio +async def test_purge_request_queues(tmp_path: str) -> None: + ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=True) + # Create default and non-default request queues + rq_client = ms.request_queues() + default_rq_info = await rq_client.get_or_create(name='default') + non_default_rq_info = await rq_client.get_or_create(name='non-default') + + # Check all folders inside rq directory before and after purge + folders_before_purge = os.listdir(ms.request_queues_directory) + assert default_rq_info['name'] in folders_before_purge + assert non_default_rq_info['name'] in folders_before_purge + await ms.purge() + folders_after_purge = os.listdir(ms.request_queues_directory) + assert default_rq_info['name'] not in folders_after_purge + assert non_default_rq_info['name'] in folders_after_purge diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 428d7632..078c8512 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -276,7 +276,6 @@ def test__json_dumps() -> None: 'abc': 'def', }, }) - print(actual) assert actual == expected From db4fdc05b4b623a7d67f324a518686fc0c6864e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Sun, 1 Jan 2023 20:21:08 +0100 Subject: [PATCH 15/23] unit tests for collection clients --- .../resource_clients/__init__.py | 0 .../test_dataset_collection.py | 41 +++++++++++++++++++ .../test_key_value_store_collection.py | 41 +++++++++++++++++++ .../test_request_queue_collection.py | 41 +++++++++++++++++++ .../memory_storage/test_memory_storage.py | 12 +++--- 5 files changed, 129 insertions(+), 6 deletions(-) create mode 100644 
tests/unit/memory_storage/resource_clients/__init__.py create mode 100644 tests/unit/memory_storage/resource_clients/test_dataset_collection.py create mode 100644 tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py create mode 100644 tests/unit/memory_storage/resource_clients/test_request_queue_collection.py diff --git a/tests/unit/memory_storage/resource_clients/__init__.py b/tests/unit/memory_storage/resource_clients/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py new file mode 100644 index 00000000..3fe6f4d1 --- /dev/null +++ b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py @@ -0,0 +1,41 @@ +import os + +import pytest + +from apify.memory_storage.memory_storage import MemoryStorage +from apify.memory_storage.resource_clients.dataset_collection import DatasetCollectionClient + + +@pytest.fixture() +def datasets_client(tmp_path: str) -> DatasetCollectionClient: + return MemoryStorage(local_data_directory=tmp_path, write_metadata=True).datasets() + + +@pytest.mark.asyncio +async def test_get_or_create(datasets_client: DatasetCollectionClient) -> None: + dataset_name = 'test' + # A new dataset gets created + assert os.path.exists(os.path.join(datasets_client.datasets_directory, dataset_name, '__metadata__.json')) is False + dataset_info = await datasets_client.get_or_create(name=dataset_name) + assert dataset_info['name'] == dataset_name + assert os.path.exists(os.path.join(datasets_client.datasets_directory, dataset_name, '__metadata__.json')) is True + # Another get_or_create call returns the same dataset + dataset_info_existing = await datasets_client.get_or_create(name=dataset_name) + assert dataset_info['id'] == dataset_info_existing['id'] + assert dataset_info['name'] == dataset_info_existing['name'] + assert dataset_info['createdAt'] == dataset_info_existing['createdAt'] + + +@pytest.mark.asyncio +async def test_list(datasets_client: DatasetCollectionClient) -> None: + assert datasets_client.list().count == 0 + dataset_info = await datasets_client.get_or_create(name='dataset') + dataset_list = datasets_client.list() + assert dataset_list.count == 1 + assert dataset_list.items[0]['name'] == dataset_info['name'] + # Test sorting behavior + newer_dataset_info = await datasets_client.get_or_create(name='newer-dataset') + dataset_list_sorting = datasets_client.list() + assert dataset_list_sorting.count == 2 + assert dataset_list_sorting.items[0]['name'] == dataset_info['name'] + assert dataset_list_sorting.items[1]['name'] == newer_dataset_info['name'] diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py new file mode 100644 index 00000000..57ff9b36 --- /dev/null +++ b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py @@ -0,0 +1,41 @@ +import os + +import pytest + +from apify.memory_storage.memory_storage import MemoryStorage +from apify.memory_storage.resource_clients.key_value_store_collection import KeyValueStoreCollectionClient + + +@pytest.fixture() +def key_value_stores_client(tmp_path: str) -> KeyValueStoreCollectionClient: + return MemoryStorage(local_data_directory=tmp_path, write_metadata=True).key_value_stores() + + +@pytest.mark.asyncio +async def test_get_or_create(key_value_stores_client: KeyValueStoreCollectionClient) -> 
None: + kvs_name = 'test' + # A new kvs gets created + assert os.path.exists(os.path.join(key_value_stores_client.key_value_stores_directory, kvs_name, '__metadata__.json')) is False + kvs_info = await key_value_stores_client.get_or_create(name=kvs_name) + assert kvs_info['name'] == kvs_name + assert os.path.exists(os.path.join(key_value_stores_client.key_value_stores_directory, kvs_name, '__metadata__.json')) is True + # Another get_or_create call returns the same kvs + kvs_info_existing = await key_value_stores_client.get_or_create(name=kvs_name) + assert kvs_info['id'] == kvs_info_existing['id'] + assert kvs_info['name'] == kvs_info_existing['name'] + assert kvs_info['createdAt'] == kvs_info_existing['createdAt'] + + +@pytest.mark.asyncio +async def test_list(key_value_stores_client: KeyValueStoreCollectionClient) -> None: + assert key_value_stores_client.list().count == 0 + kvs_info = await key_value_stores_client.get_or_create(name='kvs') + kvs_list = key_value_stores_client.list() + assert kvs_list.count == 1 + assert kvs_list.items[0]['name'] == kvs_info['name'] + # Test sorting behavior + newer_kvs_info = await key_value_stores_client.get_or_create(name='newer-kvs') + kvs_list_sorting = key_value_stores_client.list() + assert kvs_list_sorting.count == 2 + assert kvs_list_sorting.items[0]['name'] == kvs_info['name'] + assert kvs_list_sorting.items[1]['name'] == newer_kvs_info['name'] diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py new file mode 100644 index 00000000..3b550008 --- /dev/null +++ b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py @@ -0,0 +1,41 @@ +import os + +import pytest + +from apify.memory_storage.memory_storage import MemoryStorage +from apify.memory_storage.resource_clients.request_queue_collection import RequestQueueCollectionClient + + +@pytest.fixture() +def request_queues_client(tmp_path: str) -> RequestQueueCollectionClient: + return MemoryStorage(local_data_directory=tmp_path, write_metadata=True).request_queues() + + +@pytest.mark.asyncio +async def test_get_or_create(request_queues_client: RequestQueueCollectionClient) -> None: + rq_name = 'test' + # A new request queue gets created + assert os.path.exists(os.path.join(request_queues_client.request_queues_directory, rq_name, '__metadata__.json')) is False + rq_info = await request_queues_client.get_or_create(name=rq_name) + assert rq_info['name'] == rq_name + assert os.path.exists(os.path.join(request_queues_client.request_queues_directory, rq_name, '__metadata__.json')) is True + # Another get_or_create call returns the same request queue + rq_existing = await request_queues_client.get_or_create(name=rq_name) + assert rq_info['id'] == rq_existing['id'] + assert rq_info['name'] == rq_existing['name'] + assert rq_info['createdAt'] == rq_existing['createdAt'] + + +@pytest.mark.asyncio +async def test_list(request_queues_client: RequestQueueCollectionClient) -> None: + assert request_queues_client.list().count == 0 + rq_info = await request_queues_client.get_or_create(name='dataset') + rq_list = request_queues_client.list() + assert rq_list.count == 1 + assert rq_list.items[0]['name'] == rq_info['name'] + # Test sorting behavior + newer_rq_info = await request_queues_client.get_or_create(name='newer-dataset') + rq_list_sorting = request_queues_client.list() + assert rq_list_sorting.count == 2 + assert rq_list_sorting.items[0]['name'] == rq_info['name'] + assert 
rq_list_sorting.items[1]['name'] == newer_rq_info['name'] diff --git a/tests/unit/memory_storage/test_memory_storage.py b/tests/unit/memory_storage/test_memory_storage.py index 03f6a72a..11e19836 100644 --- a/tests/unit/memory_storage/test_memory_storage.py +++ b/tests/unit/memory_storage/test_memory_storage.py @@ -7,16 +7,16 @@ @pytest.mark.asyncio async def test_write_metadata(tmp_path: str) -> None: - DATASET_NAME = 'test' - DATASET_NO_METADATA_NAME = 'test-no-metadata' + dataset_name = 'test' + dataset_no_metadata_name = 'test-no-metadata' ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=True) ms_no_metadata = MemoryStorage(local_data_directory=tmp_path, write_metadata=False) datasets_client = ms.datasets() datasets_no_metadata_client = ms_no_metadata.datasets() - await datasets_client.get_or_create(name=DATASET_NAME) - await datasets_no_metadata_client.get_or_create(name=DATASET_NO_METADATA_NAME) - assert os.path.exists(os.path.join(ms.datasets_directory, DATASET_NAME, '__metadata__.json')) is True - assert os.path.exists(os.path.join(ms_no_metadata.datasets_directory, DATASET_NO_METADATA_NAME, '__metadata__.json')) is False + await datasets_client.get_or_create(name=dataset_name) + await datasets_no_metadata_client.get_or_create(name=dataset_no_metadata_name) + assert os.path.exists(os.path.join(ms.datasets_directory, dataset_name, '__metadata__.json')) is True + assert os.path.exists(os.path.join(ms_no_metadata.datasets_directory, dataset_no_metadata_name, '__metadata__.json')) is False @pytest.mark.asyncio From a370f646cc89ca767a81bb6a99741ecd4cc8f36b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Sun, 1 Jan 2023 21:01:03 +0100 Subject: [PATCH 16/23] memory storage via an async fixture --- .../unit/memory_storage/resource_clients/_common.py | 12 ++++++++++++ .../resource_clients/test_dataset_collection.py | 6 ++++-- .../test_key_value_store_collection.py | 6 ++++-- .../test_request_queue_collection.py | 6 ++++-- 4 files changed, 24 insertions(+), 6 deletions(-) create mode 100644 tests/unit/memory_storage/resource_clients/_common.py diff --git a/tests/unit/memory_storage/resource_clients/_common.py b/tests/unit/memory_storage/resource_clients/_common.py new file mode 100644 index 00000000..e3f8d029 --- /dev/null +++ b/tests/unit/memory_storage/resource_clients/_common.py @@ -0,0 +1,12 @@ +from typing import AsyncIterator + +import pytest_asyncio + +from apify.memory_storage.memory_storage import MemoryStorage + + +@pytest_asyncio.fixture() +async def memory_storage(tmp_path: str) -> AsyncIterator[MemoryStorage]: + ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=True) + yield ms + await ms.purge() diff --git a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py index 3fe6f4d1..259dfcc7 100644 --- a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py @@ -5,10 +5,12 @@ from apify.memory_storage.memory_storage import MemoryStorage from apify.memory_storage.resource_clients.dataset_collection import DatasetCollectionClient +from ._common import memory_storage + @pytest.fixture() -def datasets_client(tmp_path: str) -> DatasetCollectionClient: - return MemoryStorage(local_data_directory=tmp_path, write_metadata=True).datasets() +def datasets_client(memory_storage: MemoryStorage) -> DatasetCollectionClient: + return 
memory_storage.datasets() @pytest.mark.asyncio diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py index 57ff9b36..99ee9c2c 100644 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py @@ -5,10 +5,12 @@ from apify.memory_storage.memory_storage import MemoryStorage from apify.memory_storage.resource_clients.key_value_store_collection import KeyValueStoreCollectionClient +from ._common import memory_storage + @pytest.fixture() -def key_value_stores_client(tmp_path: str) -> KeyValueStoreCollectionClient: - return MemoryStorage(local_data_directory=tmp_path, write_metadata=True).key_value_stores() +def key_value_stores_client(memory_storage: MemoryStorage) -> KeyValueStoreCollectionClient: + return memory_storage.key_value_stores() @pytest.mark.asyncio diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py index 3b550008..1d4520e2 100644 --- a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py @@ -5,10 +5,12 @@ from apify.memory_storage.memory_storage import MemoryStorage from apify.memory_storage.resource_clients.request_queue_collection import RequestQueueCollectionClient +from ._common import memory_storage + @pytest.fixture() -def request_queues_client(tmp_path: str) -> RequestQueueCollectionClient: - return MemoryStorage(local_data_directory=tmp_path, write_metadata=True).request_queues() +def request_queues_client(memory_storage: MemoryStorage) -> RequestQueueCollectionClient: + return memory_storage.request_queues() @pytest.mark.asyncio From 4cdfbd2cc2302e12385e04d77d2cd0c58363c1fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Sun, 1 Jan 2023 21:42:12 +0100 Subject: [PATCH 17/23] update timestamps private --- .../memory_storage/resource_clients/dataset.py | 10 +++++----- .../resource_clients/key_value_store.py | 14 +++++++------- .../resource_clients/request_queue.py | 18 +++++++++--------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/apify/memory_storage/resource_clients/dataset.py b/src/apify/memory_storage/resource_clients/dataset.py index 8267f0e6..66aca8a4 100644 --- a/src/apify/memory_storage/resource_clients/dataset.py +++ b/src/apify/memory_storage/resource_clients/dataset.py @@ -43,7 +43,7 @@ async def get(self) -> Optional[Dict]: found = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if found: - await found.update_timestamps(False) + await found._update_timestamps(False) return found.to_dataset_info() return None @@ -76,7 +76,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: await _force_rename(previous_dir, existing_store_by_id.dataset_directory) # Update timestamps - await existing_store_by_id.update_timestamps(True) + await existing_store_by_id._update_timestamps(True) return existing_store_by_id.to_dataset_info() @@ -124,7 +124,7 @@ async def list_items( entry_number = self._generate_local_entry_name(idx) items.append(existing_store_by_id.dataset_entries[entry_number]) - await existing_store_by_id.update_timestamps(False) + await existing_store_by_id._update_timestamps(False) if desc: items.reverse() 
@@ -254,7 +254,7 @@ async def push_items(self, items: JSONSerializable) -> None: for id in added_ids: data_entries.append((id, existing_store_by_id.dataset_entries[id])) - await existing_store_by_id.update_timestamps(True) + await existing_store_by_id._update_timestamps(True) await _update_dataset_items( data=data_entries, @@ -273,7 +273,7 @@ def to_dataset_info(self) -> Dict: 'modifiedAt': self.modified_at, } - async def update_timestamps(self, has_been_modified: bool) -> None: + async def _update_timestamps(self, has_been_modified: bool) -> None: """TODO: docs.""" self.accessed_at = datetime.utcnow() diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index 5e3be909..d5ee24ec 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -50,7 +50,7 @@ async def get(self) -> Optional[Dict]: found = _find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) if found: - await found.update_timestamps(False) + await found._update_timestamps(False) return found.to_key_value_store_info() return None @@ -83,7 +83,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: await _force_rename(previous_dir, existing_store_by_id.key_value_store_directory) # Update timestamps - await existing_store_by_id.update_timestamps(True) + await existing_store_by_id._update_timestamps(True) return existing_store_by_id.to_key_value_store_info() @@ -130,7 +130,7 @@ async def list_keys(self, *, limit: int = DEFAULT_API_PARAM_LIMIT, exclusive_sta is_last_selected_item_absolutely_last = last_item_in_store == last_selected_item next_exclusive_start_key = None if is_last_selected_item_absolutely_last else last_selected_item['key'] - await existing_store_by_id.update_timestamps(False) + await existing_store_by_id._update_timestamps(False) return { 'count': len(items), @@ -163,7 +163,7 @@ async def _get_record_internal(self, key: str, as_bytes: bool = False) -> Option if not as_bytes: record['value'] = _maybe_parse_body(record['value'], record['contentType']) - await existing_store_by_id.update_timestamps(False) + await existing_store_by_id._update_timestamps(False) return record @@ -210,7 +210,7 @@ async def set_record(self, key: str, value: Any, content_type: Optional[str] = N existing_store_by_id.key_value_entries[key] = record - await existing_store_by_id.update_timestamps(True) + await existing_store_by_id._update_timestamps(True) await _set_or_delete_key_value_store_record( entity_directory=existing_store_by_id.key_value_store_directory, persist_storage=self.client.persist_storage, @@ -231,7 +231,7 @@ async def delete_record(self, key: str) -> None: if entry is not None: del existing_store_by_id.key_value_entries[key] - await existing_store_by_id.update_timestamps(True) + await existing_store_by_id._update_timestamps(True) await _set_or_delete_key_value_store_record( entity_directory=existing_store_by_id.key_value_store_directory, persist_storage=self.client.persist_storage, @@ -251,7 +251,7 @@ def to_key_value_store_info(self) -> Dict: 'userId': '1', } - async def update_timestamps(self, has_been_modified: bool) -> None: + async def _update_timestamps(self, has_been_modified: bool) -> None: """TODO: docs.""" self.accessed_at = datetime.utcnow() diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index 
c59a3fd7..8cd47105 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -45,7 +45,7 @@ async def get(self) -> Optional[Dict]: found = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) if found: - await found.update_timestamps(False) + await found._update_timestamps(False) return found.to_request_queue_info() return None @@ -78,7 +78,7 @@ async def update(self, *, name: Optional[str] = None) -> Dict: await _force_rename(previous_dir, existing_store_by_id.request_queue_directory) # Update timestamps - await existing_store_by_id.update_timestamps(True) + await existing_store_by_id._update_timestamps(True) return existing_store_by_id.to_request_queue_info() @@ -100,7 +100,7 @@ async def list_head(self, *, limit: Optional[int] = None) -> Dict: if existing_store_by_id is None: _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) - await existing_store_by_id.update_timestamps(False) + await existing_store_by_id._update_timestamps(False) items: List[Dict] = [] @@ -131,7 +131,7 @@ async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) # We already have the request present, so we return information about it if existing_request_with_id is not None: - await existing_store_by_id.update_timestamps(False) + await existing_store_by_id._update_timestamps(False) return { 'requestId': existing_request_with_id['id'], @@ -141,7 +141,7 @@ async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) existing_store_by_id.requests[request_model['id']] = request_model existing_store_by_id.pending_request_count += 1 if request_model['orderNo'] is None else 0 - await existing_store_by_id.update_timestamps(True) + await existing_store_by_id._update_timestamps(True) await _update_request_queue_item( request=request_model, request_id=request_model['id'], @@ -164,7 +164,7 @@ async def get_request(self, request_id: str) -> Optional[Dict]: if existing_store_by_id is None: _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) - await existing_store_by_id.update_timestamps(False) + await existing_store_by_id._update_timestamps(False) request = existing_store_by_id.requests.get(request_id) return self._json_to_request(request['json'] if request is not None else None) @@ -202,7 +202,7 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non handled_count_adjustment = -handled_count_adjustment existing_store_by_id.pending_request_count += handled_count_adjustment - await existing_store_by_id.update_timestamps(True) + await existing_store_by_id._update_timestamps(True) await _update_request_queue_item( request=request_model, request_id=request_model['id'], @@ -228,7 +228,7 @@ async def delete_request(self, request_id: str) -> None: if request: del existing_store_by_id.requests[request_id] existing_store_by_id.pending_request_count -= 0 if request['orderNo'] is None else 1 - await existing_store_by_id.update_timestamps(True) + await existing_store_by_id._update_timestamps(True) await _delete_request(entity_directory=existing_store_by_id.request_queue_directory, request_id=request_id) def to_request_queue_info(self) -> Dict: @@ -247,7 +247,7 @@ def to_request_queue_info(self) -> Dict: 'userId': '1', } - async def update_timestamps(self, has_been_modified: bool) -> None: + async def _update_timestamps(self, has_been_modified: bool) -> None: """TODO: docs.""" self.accessed_at = datetime.utcnow() From 
79e114064dce1e3ae456423b7f046db4610261bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Mon, 2 Jan 2023 01:01:22 +0100 Subject: [PATCH 18/23] Add unit tests for non-collection clients, add new configs with TODO, fixes --- mypy.ini | 2 + pytest.ini | 3 + .../resource_clients/dataset.py | 12 +- .../resource_clients/key_value_store.py | 3 +- .../resource_clients/request_queue.py | 16 +- .../resource_clients/test_dataset.py | 131 ++++++++++++++ .../test_dataset_collection.py | 4 +- .../resource_clients/test_key_value_store.py | 153 +++++++++++++++++ .../test_key_value_store_collection.py | 4 +- .../resource_clients/test_request_queue.py | 160 ++++++++++++++++++ .../test_request_queue_collection.py | 4 +- 11 files changed, 475 insertions(+), 17 deletions(-) create mode 100644 pytest.ini create mode 100644 tests/unit/memory_storage/resource_clients/test_dataset.py create mode 100644 tests/unit/memory_storage/resource_clients/test_key_value_store.py create mode 100644 tests/unit/memory_storage/resource_clients/test_request_queue.py diff --git a/mypy.ini b/mypy.ini index 5119fba6..2dce60a2 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,4 +1,6 @@ [mypy] +; TODO: Decide if this makes sense/helps +; python_version=3.8 files = docs, scripts, diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..11fc0642 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +; TODO: Decide whether to use this or not +; asyncio_mode=auto diff --git a/src/apify/memory_storage/resource_clients/dataset.py b/src/apify/memory_storage/resource_clients/dataset.py index 66aca8a4..69d39dfd 100644 --- a/src/apify/memory_storage/resource_clients/dataset.py +++ b/src/apify/memory_storage/resource_clients/dataset.py @@ -14,6 +14,12 @@ if TYPE_CHECKING: from ..memory_storage import MemoryStorage +""" + This is what API returns in the x-apify-pagination-limit + header when no limit query parameter is used. + """ +LIST_ITEMS_LIMIT = 999_999_999_999 + """ Number of characters of the dataset item file names. E.g.: 000000019.json - 9 digits @@ -94,8 +100,8 @@ async def delete(self) -> None: async def list_items( self, *, - offset: Optional[int] = None, - limit: Optional[int] = None, + offset: int = 0, + limit: int = LIST_ITEMS_LIMIT, clean: Optional[bool] = None, desc: Optional[bool] = None, fields: Optional[List[str]] = None, @@ -114,7 +120,7 @@ async def list_items( _raise_on_non_existing_storage(StorageTypes.DATASET, self.id) start, end = existing_store_by_id._get_start_and_end_indexes( - max(existing_store_by_id.item_count - (offset or 0) - (limit or 0), 0) if desc else offset or 0, + max(existing_store_by_id.item_count - offset - limit, 0) if desc else offset or 0, limit, ) diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index d5ee24ec..935051c3 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -190,7 +190,8 @@ async def set_record(self, key: str, value: Any, content_type: Optional[str] = N if content_type is None: # TODO: Add streaming support for this method... 
if _is_file_or_bytes(value):
-                content_type = 'application/octet-stream'
+                raise NotImplementedError('Such a value for set_record is not supported in local memory storage')
+                # content_type = 'application/octet-stream'
             elif isinstance(value, str):
                 content_type = 'text/plain; charset=utf-8'
             else:
diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py
index 8cd47105..0547902b 100644
--- a/src/apify/memory_storage/resource_clients/request_queue.py
+++ b/src/apify/memory_storage/resource_clients/request_queue.py
@@ -28,7 +28,7 @@ class RequestQueueClient:
     created_at = datetime.utcnow()
     accessed_at = datetime.utcnow()
     modified_at = datetime.utcnow()
-    handled_request_count = 0
+    handled_request_count = 0  # TODO: Does not seem to be implemented in crawlee, always 0
     pending_request_count = 0
     requests: Dict[str, Dict]
 
@@ -120,6 +120,7 @@ async def list_head(self, *, limit: Optional[int] = None) -> Dict:
 
     async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) -> Dict:
         """TODO: docs."""
+        # TODO: Throw if uniqueKey or url missing from request dict, also do for update_request...
         existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id)
 
         if existing_store_by_id is None:
@@ -140,7 +141,8 @@ async def add_request(self, request: Dict, *, forefront: Optional[bool] = None)
         }
 
         existing_store_by_id.requests[request_model['id']] = request_model
-        existing_store_by_id.pending_request_count += 1 if request_model['orderNo'] is None else 0
+        # TODO: Validate the next line logic, seems wrong in crawlee
+        existing_store_by_id.pending_request_count += 0 if request_model['orderNo'] is None else 1
         await existing_store_by_id._update_timestamps(True)
         await _update_request_queue_item(
             request=request_model,
@@ -192,16 +194,16 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non
         # the handled counts are updated correctly in all cases.
existing_store_by_id.requests[request_model['id']] = request_model - handled_count_adjustment = 0 + pending_count_adjustment = 0 is_request_handled_state_changing = type(existing_request['orderNo']) != type(request_model['orderNo']) # noqa request_was_handled_before_update = existing_request['orderNo'] is None + # We add 1 pending request if previous state was handled + # TODO: Validate the next 2 lines logic, seems wrong in crawlee if is_request_handled_state_changing: - handled_count_adjustment += 1 - if request_was_handled_before_update: - handled_count_adjustment = -handled_count_adjustment + pending_count_adjustment = 1 if request_was_handled_before_update else -1 - existing_store_by_id.pending_request_count += handled_count_adjustment + existing_store_by_id.pending_request_count += pending_count_adjustment await existing_store_by_id._update_timestamps(True) await _update_request_queue_item( request=request_model, diff --git a/tests/unit/memory_storage/resource_clients/test_dataset.py b/tests/unit/memory_storage/resource_clients/test_dataset.py new file mode 100644 index 00000000..6860725d --- /dev/null +++ b/tests/unit/memory_storage/resource_clients/test_dataset.py @@ -0,0 +1,131 @@ +import os + +import pytest +import pytest_asyncio + +from apify.memory_storage.memory_storage import MemoryStorage +from apify.memory_storage.resource_clients.dataset import DatasetClient + +from ._common import memory_storage # noqa: F401 + + +@pytest_asyncio.fixture() +async def dataset_client(memory_storage: MemoryStorage) -> DatasetClient: # noqa: F811 + datasets_client = memory_storage.datasets() + dataset_info = await datasets_client.get_or_create(name='test') + return memory_storage.dataset(id=dataset_info['id']) + + +@pytest.mark.asyncio +async def test_nonexistent(memory_storage: MemoryStorage) -> None: # noqa: F811 + dataset_client = memory_storage.dataset(id='clearly not a uuid') + assert await dataset_client.get() is None + with pytest.raises(ValueError): + await dataset_client.update(name='test-update') + await dataset_client.list_items() + await dataset_client.push_items([{'abc': 123}]) + await dataset_client.delete() + + +@pytest.mark.asyncio +async def test_not_implemented(dataset_client: DatasetClient) -> None: + with pytest.raises(NotImplementedError): + await dataset_client.stream_items() + await dataset_client.get_items_as_bytes() + + +@pytest.mark.asyncio +async def test_get(dataset_client: DatasetClient) -> None: + info = await dataset_client.get() + assert info is not None + assert info['id'] == dataset_client.id + assert info['accessedAt'] != info['createdAt'] + + +@pytest.mark.asyncio +async def test_update(dataset_client: DatasetClient) -> None: + new_dataset_name = 'test-update' + old_dataset_info = await dataset_client.get() + assert old_dataset_info is not None + old_dataset_directory = os.path.join(dataset_client.client.datasets_directory, old_dataset_info['name']) + new_dataset_directory = os.path.join(dataset_client.client.datasets_directory, new_dataset_name) + assert os.path.exists(os.path.join(old_dataset_directory, '__metadata__.json')) is True + assert os.path.exists(os.path.join(new_dataset_directory, '__metadata__.json')) is False + updated_dataset_info = await dataset_client.update(name=new_dataset_name) + assert os.path.exists(os.path.join(old_dataset_directory, '__metadata__.json')) is False + assert os.path.exists(os.path.join(new_dataset_directory, '__metadata__.json')) is True + # Only modifiedAt and accessedAt should be different + assert 
old_dataset_info['createdAt'] == updated_dataset_info['createdAt'] + assert old_dataset_info['modifiedAt'] != updated_dataset_info['modifiedAt'] + assert old_dataset_info['accessedAt'] != updated_dataset_info['accessedAt'] + # Should fail with the same name + with pytest.raises(ValueError): + await dataset_client.update(name=new_dataset_name) + + +@pytest.mark.asyncio +async def test_delete(dataset_client: DatasetClient) -> None: + dataset_info = await dataset_client.get() + assert dataset_info is not None + dataset_directory = os.path.join(dataset_client.client.datasets_directory, dataset_info['name']) + assert os.path.exists(os.path.join(dataset_directory, '__metadata__.json')) is True + await dataset_client.delete() + assert os.path.exists(os.path.join(dataset_directory, '__metadata__.json')) is False + # Does not crash when called again + await dataset_client.delete() + + +@pytest.mark.asyncio +async def test_push_items(dataset_client: DatasetClient) -> None: + await dataset_client.push_items('{"test": "JSON from a string"}') + await dataset_client.push_items({'abc': {'def': {'ghi': '123'}}}) + await dataset_client.push_items(['{"test-json-parse": "JSON from a string"}' for _ in range(10)]) + await dataset_client.push_items([{'test-dict': i} for i in range(10)]) + list_page = await dataset_client.list_items() + assert list_page.items[0]['test'] == 'JSON from a string' + assert list_page.items[1]['abc']['def']['ghi'] == '123' + assert list_page.items[11]['test-json-parse'] == 'JSON from a string' + assert list_page.items[21]['test-dict'] == 9 + assert list_page.count == 22 + + +@pytest.mark.asyncio +async def test_list_items(dataset_client: DatasetClient) -> None: + item_count = 100 + used_offset = 10 + used_limit = 50 + await dataset_client.push_items([{'id': i} for i in range(item_count)]) + # Test without any parameters + list_default = await dataset_client.list_items() + assert list_default.count == item_count + assert list_default.offset == 0 + assert list_default.items[0]['id'] == 0 + assert list_default.desc is False + # Test offset + list_offset_10 = await dataset_client.list_items(offset=used_offset) + assert list_offset_10.count == item_count - used_offset + assert list_offset_10.offset == used_offset + assert list_offset_10.total == item_count + assert list_offset_10.items[0]['id'] == used_offset + # Test limit + list_limit_50 = await dataset_client.list_items(limit=used_limit) + assert list_limit_50.count == used_limit + assert list_limit_50.limit == used_limit + assert list_limit_50.total == item_count + # Test desc + list_desc_true = await dataset_client.list_items(desc=True) + assert list_desc_true.items[0]['id'] == 99 + assert list_desc_true.desc is True + + +@pytest.mark.asyncio +async def test_iterate_items(dataset_client: DatasetClient) -> None: + item_count = 100 + await dataset_client.push_items([{'id': i} for i in range(item_count)]) + actual_items = [] + async for item in dataset_client.iterate_items(): + assert 'id' in item.keys() + actual_items.append(item) + assert len(actual_items) == item_count + assert actual_items[0]['id'] == 0 + assert actual_items[99]['id'] == 99 diff --git a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py index 259dfcc7..b78ea739 100644 --- a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py @@ -5,11 +5,11 @@ from apify.memory_storage.memory_storage 
import MemoryStorage from apify.memory_storage.resource_clients.dataset_collection import DatasetCollectionClient -from ._common import memory_storage +from ._common import memory_storage # noqa: F401 @pytest.fixture() -def datasets_client(memory_storage: MemoryStorage) -> DatasetCollectionClient: +def datasets_client(memory_storage: MemoryStorage) -> DatasetCollectionClient: # noqa: F811 return memory_storage.datasets() diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store.py b/tests/unit/memory_storage/resource_clients/test_key_value_store.py new file mode 100644 index 00000000..38da28b8 --- /dev/null +++ b/tests/unit/memory_storage/resource_clients/test_key_value_store.py @@ -0,0 +1,153 @@ +import os + +import pytest +import pytest_asyncio + +from apify.memory_storage.memory_storage import MemoryStorage +from apify.memory_storage.resource_clients.key_value_store import KeyValueStoreClient + +from ._common import memory_storage # noqa: F401 + + +@pytest_asyncio.fixture() +async def key_value_store_client(memory_storage: MemoryStorage) -> KeyValueStoreClient: # noqa: F811 + key_value_stores_client = memory_storage.key_value_stores() + kvs_info = await key_value_stores_client.get_or_create(name='test') + return memory_storage.key_value_store(id=kvs_info['id']) + + +@pytest.mark.asyncio +async def test_nonexistent(memory_storage: MemoryStorage) -> None: # noqa: F811 + kvs_client = memory_storage.key_value_store(id='clearly not a uuid') + assert await kvs_client.get() is None + with pytest.raises(ValueError): + await kvs_client.update(name='test-update') + await kvs_client.list_keys() + await kvs_client.set_record('test', {'abc': 123}) + await kvs_client.get_record('test') + await kvs_client.get_record_as_bytes('test') + await kvs_client.delete_record('test') + await kvs_client.delete() + + +@pytest.mark.asyncio +async def test_not_implemented(key_value_store_client: KeyValueStoreClient) -> None: + with pytest.raises(NotImplementedError): + await key_value_store_client.stream_record('test') + + +@pytest.mark.asyncio +async def test_get(key_value_store_client: KeyValueStoreClient) -> None: + info = await key_value_store_client.get() + assert info is not None + assert info['id'] == key_value_store_client.id + assert info['accessedAt'] != info['createdAt'] + + +@pytest.mark.asyncio +async def test_update(key_value_store_client: KeyValueStoreClient) -> None: + new_kvs_name = 'test-update' + old_kvs_info = await key_value_store_client.get() + assert old_kvs_info is not None + old_kvs_directory = os.path.join(key_value_store_client.client.key_value_stores_directory, old_kvs_info['name']) + new_kvs_directory = os.path.join(key_value_store_client.client.key_value_stores_directory, new_kvs_name) + assert os.path.exists(os.path.join(old_kvs_directory, '__metadata__.json')) is True + assert os.path.exists(os.path.join(new_kvs_directory, '__metadata__.json')) is False + updated_kvs_info = await key_value_store_client.update(name=new_kvs_name) + assert os.path.exists(os.path.join(old_kvs_directory, '__metadata__.json')) is False + assert os.path.exists(os.path.join(new_kvs_directory, '__metadata__.json')) is True + # Only modifiedAt and accessedAt should be different + assert old_kvs_info['createdAt'] == updated_kvs_info['createdAt'] + assert old_kvs_info['modifiedAt'] != updated_kvs_info['modifiedAt'] + assert old_kvs_info['accessedAt'] != updated_kvs_info['accessedAt'] + # Should fail with the same name + with pytest.raises(ValueError): + await 
key_value_store_client.update(name=new_kvs_name) + + +@pytest.mark.asyncio +async def test_delete(key_value_store_client: KeyValueStoreClient) -> None: + kvs_info = await key_value_store_client.get() + assert kvs_info is not None + kvs_directory = os.path.join(key_value_store_client.client.key_value_stores_directory, kvs_info['name']) + assert os.path.exists(os.path.join(kvs_directory, '__metadata__.json')) is True + await key_value_store_client.delete() + assert os.path.exists(os.path.join(kvs_directory, '__metadata__.json')) is False + # Does not crash when called again + await key_value_store_client.delete() + + +@pytest.mark.asyncio +async def test_list_keys(key_value_store_client: KeyValueStoreClient) -> None: + record_count = 4 + used_limit = 2 + used_exclusive_start_key = 'a' + await key_value_store_client.set_record('b', 'test') + await key_value_store_client.set_record('a', 'test') + await key_value_store_client.set_record('d', 'test') + await key_value_store_client.set_record('c', 'test') + # Default settings + keys = await key_value_store_client.list_keys() + assert keys['items'][0]['key'] == 'a' + assert keys['items'][3]['key'] == 'd' + assert keys['count'] == record_count + assert keys['isTruncated'] is False + # Test limit + keys_limit_2 = await key_value_store_client.list_keys(limit=used_limit) + assert keys_limit_2['count'] == record_count + assert keys_limit_2['limit'] == used_limit + assert keys_limit_2['items'][1]['key'] == 'b' + # Test exclusive start key + keys_exclusive_start = await key_value_store_client.list_keys(exclusive_start_key=used_exclusive_start_key, limit=2) + assert keys_exclusive_start['exclusiveStartKey'] == used_exclusive_start_key + assert keys_exclusive_start['isTruncated'] is True + assert keys_exclusive_start['nextExclusiveStartKey'] == 'c' + assert keys_exclusive_start['items'][0]['key'] == 'b' + assert keys_exclusive_start['items'][-1]['key'] == keys_exclusive_start['nextExclusiveStartKey'] + + +@pytest.mark.asyncio +async def test_get_and_set_record(key_value_store_client: KeyValueStoreClient) -> None: + # Test setting dict record + dict_record_key = 'test-dict' + await key_value_store_client.set_record(dict_record_key, {'test': 123}) + dict_record_info = await key_value_store_client.get_record(dict_record_key) + assert dict_record_info is not None + assert 'application/json' in dict_record_info['contentType'] + assert dict_record_info['value']['test'] == 123 + # Test setting str record + str_record_key = 'test-str' + await key_value_store_client.set_record(str_record_key, 'test') + str_record_info = await key_value_store_client.get_record(str_record_key) + assert str_record_info is not None + assert 'text/plain' in str_record_info['contentType'] + assert str_record_info['value'] == 'test' + # Test setting explicit json record but use str as value, i.e. 
json dumps is skipped + explicit_json_key = 'test-json' + await key_value_store_client.set_record(explicit_json_key, '{"test": "explicit string"}', 'application/json') + explicit_json_record_info = await key_value_store_client.get_record(explicit_json_key) + assert explicit_json_record_info is not None + assert 'application/json' in explicit_json_record_info['contentType'] + assert explicit_json_record_info['value']['test'] == 'explicit string' + # Test using bytes + with pytest.raises(NotImplementedError): + await key_value_store_client.set_record('bytes', 'test'.encode('utf-8')) + + +@pytest.mark.asyncio +async def test_get_record_as_bytes(key_value_store_client: KeyValueStoreClient) -> None: + record_key = 'test' + record_value = 'testing' + await key_value_store_client.set_record(record_key, record_value) + record_info = await key_value_store_client.get_record_as_bytes(record_key) + assert record_info is not None + assert record_info['value'] == record_value.encode('utf-8') + + +@pytest.mark.asyncio +async def test_delete_record(key_value_store_client: KeyValueStoreClient) -> None: + record_key = 'test' + await key_value_store_client.set_record(record_key, 'test') + await key_value_store_client.delete_record(record_key) + # Does not crash when called again + await key_value_store_client.delete_record(record_key) diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py index 99ee9c2c..cf356d57 100644 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py @@ -5,11 +5,11 @@ from apify.memory_storage.memory_storage import MemoryStorage from apify.memory_storage.resource_clients.key_value_store_collection import KeyValueStoreCollectionClient -from ._common import memory_storage +from ._common import memory_storage # noqa: F401 @pytest.fixture() -def key_value_stores_client(memory_storage: MemoryStorage) -> KeyValueStoreCollectionClient: +def key_value_stores_client(memory_storage: MemoryStorage) -> KeyValueStoreCollectionClient: # noqa: F811 return memory_storage.key_value_stores() diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue.py b/tests/unit/memory_storage/resource_clients/test_request_queue.py new file mode 100644 index 00000000..b0e8e0f8 --- /dev/null +++ b/tests/unit/memory_storage/resource_clients/test_request_queue.py @@ -0,0 +1,160 @@ +import os +from datetime import datetime + +import pytest +import pytest_asyncio + +from apify.memory_storage.memory_storage import MemoryStorage +from apify.memory_storage.resource_clients.request_queue import RequestQueueClient + +from ._common import memory_storage # noqa: F401 + + +@pytest_asyncio.fixture() +async def request_queue_client(memory_storage: MemoryStorage) -> RequestQueueClient: # noqa: F811 + request_queues_client = memory_storage.request_queues() + rq_info = await request_queues_client.get_or_create(name='test') + return memory_storage.request_queue(id=rq_info['id']) + + +@pytest.mark.asyncio +async def test_nonexistent(memory_storage: MemoryStorage) -> None: # noqa: F811 + request_queue_client = memory_storage.request_queue(id='clearly not a uuid') + assert await request_queue_client.get() is None + with pytest.raises(ValueError): + await request_queue_client.update(name='test-update') + await request_queue_client.delete() + + +@pytest.mark.asyncio +async def 
test_get(request_queue_client: RequestQueueClient) -> None: + info = await request_queue_client.get() + assert info is not None + assert info['id'] == request_queue_client.id + assert info['accessedAt'] != info['createdAt'] + + +@pytest.mark.asyncio +async def test_update(request_queue_client: RequestQueueClient) -> None: + new_rq_name = 'test-update' + old_rq_info = await request_queue_client.get() + assert old_rq_info is not None + old_rq_directory = os.path.join(request_queue_client.client.request_queues_directory, old_rq_info['name']) + new_rq_directory = os.path.join(request_queue_client.client.request_queues_directory, new_rq_name) + assert os.path.exists(os.path.join(old_rq_directory, '__metadata__.json')) is True + assert os.path.exists(os.path.join(new_rq_directory, '__metadata__.json')) is False + updated_rq_info = await request_queue_client.update(name=new_rq_name) + assert os.path.exists(os.path.join(old_rq_directory, '__metadata__.json')) is False + assert os.path.exists(os.path.join(new_rq_directory, '__metadata__.json')) is True + # Only modifiedAt and accessedAt should be different + assert old_rq_info['createdAt'] == updated_rq_info['createdAt'] + assert old_rq_info['modifiedAt'] != updated_rq_info['modifiedAt'] + assert old_rq_info['accessedAt'] != updated_rq_info['accessedAt'] + # Should fail with the same name + with pytest.raises(ValueError): + await request_queue_client.update(name=new_rq_name) + + +@pytest.mark.asyncio +async def test_delete(request_queue_client: RequestQueueClient) -> None: + rq_info = await request_queue_client.get() + assert rq_info is not None + rq_directory = os.path.join(request_queue_client.client.request_queues_directory, rq_info['name']) + assert os.path.exists(os.path.join(rq_directory, '__metadata__.json')) is True + await request_queue_client.delete() + assert os.path.exists(os.path.join(rq_directory, '__metadata__.json')) is False + # Does not crash when called again + await request_queue_client.delete() + + +@pytest.mark.asyncio +async def test_list_head(request_queue_client: RequestQueueClient) -> None: + request_1_url = 'https://apify.com' + request_2_url = 'https://example.com' + await request_queue_client.add_request({ + 'uniqueKey': request_1_url, + 'url': request_1_url, + }) + await request_queue_client.add_request({ + 'uniqueKey': request_2_url, + 'url': request_2_url, + }) + list_head = await request_queue_client.list_head() + assert len(list_head['items']) == 2 + for item in list_head['items']: + assert 'id' in item.keys() + + +@pytest.mark.asyncio +async def test_add_record(request_queue_client: RequestQueueClient) -> None: + # TODO: How can we test the forefront parameter? 
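+    # A hypothetical way to verify it (not asserted below): if list_head() returned requests
+    # ordered by their internal order number, the forefront request added below should be
+    # returned first, e.g.:
+    #     head = await request_queue_client.list_head()
+    #     assert head['items'][0]['url'] == request_forefront_url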
+ request_forefront_url = 'https://apify.com' + request_not_forefront_url = 'https://example.com' + request_forefront_info = await request_queue_client.add_request({ + 'uniqueKey': request_forefront_url, + 'url': request_forefront_url, + }, forefront=True) + request_not_forefront_info = await request_queue_client.add_request({ + 'uniqueKey': request_not_forefront_url, + 'url': request_not_forefront_url, + }, forefront=False) + assert request_forefront_info.get('requestId') is not None + assert request_not_forefront_info.get('requestId') is not None + assert request_forefront_info['wasAlreadyHandled'] is False + assert request_not_forefront_info['wasAlreadyHandled'] is False + rq_info = await request_queue_client.get() + assert rq_info is not None + assert rq_info['pendingRequestCount'] == rq_info['totalRequestCount'] == 2 + + +@pytest.mark.asyncio +async def test_get_record(request_queue_client: RequestQueueClient) -> None: + request_url = 'https://apify.com' + request_info = await request_queue_client.add_request({ + 'uniqueKey': request_url, + 'url': request_url, + }) + request = await request_queue_client.get_request(request_info['requestId']) + assert request is not None + assert 'id' in request.keys() + assert request['url'] == request['uniqueKey'] == request_url + # Non-existent id + assert (await request_queue_client.get_request('non-existent id')) is None + + +@pytest.mark.asyncio +async def test_update_record(request_queue_client: RequestQueueClient) -> None: + # TODO: How can we test the forefront parameter? + request_url = 'https://apify.com' + request_info = await request_queue_client.add_request({ + 'uniqueKey': request_url, + 'url': request_url, + }) + request = await request_queue_client.get_request(request_info['requestId']) + assert request is not None + rq_info_before_update = await request_queue_client.get() + assert rq_info_before_update is not None + assert rq_info_before_update['pendingRequestCount'] == 1 + request_update_info = await request_queue_client.update_request({**request, 'handledAt': datetime.utcnow()}) + assert request_update_info['wasAlreadyHandled'] is False + rq_info_after_update = await request_queue_client.get() + assert rq_info_after_update is not None + assert rq_info_after_update['pendingRequestCount'] == 0 + + +@pytest.mark.asyncio +async def test_delete_record(request_queue_client: RequestQueueClient) -> None: + request_url = 'https://apify.com' + request_info = await request_queue_client.add_request({ + 'uniqueKey': request_url, + 'url': request_url, + }) + rq_info_before_update = await request_queue_client.get() + assert rq_info_before_update is not None + assert rq_info_before_update['pendingRequestCount'] == 1 + await request_queue_client.delete_request(request_info['requestId']) + rq_info_after_update = await request_queue_client.get() + assert rq_info_after_update is not None + assert rq_info_after_update['pendingRequestCount'] == 0 + # Does not crash when called again + await request_queue_client.delete_request(request_info['requestId']) diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py index 1d4520e2..6c9f0c08 100644 --- a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py @@ -5,11 +5,11 @@ from apify.memory_storage.memory_storage import MemoryStorage from apify.memory_storage.resource_clients.request_queue_collection 
import RequestQueueCollectionClient -from ._common import memory_storage +from ._common import memory_storage # noqa: F401 @pytest.fixture() -def request_queues_client(memory_storage: MemoryStorage) -> RequestQueueCollectionClient: +def request_queues_client(memory_storage: MemoryStorage) -> RequestQueueCollectionClient: # noqa: F811 return memory_storage.request_queues() From a97504800257a7027caa1aadcb53102384a53a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Tue, 3 Jan 2023 09:18:05 +0100 Subject: [PATCH 19/23] address some PR comments --- .flake8 | 2 +- mypy.ini | 3 +- pytest.ini | 3 +- src/apify/_utils.py | 2 +- src/apify/memory_storage/memory_storage.py | 2 +- .../resource_clients/dataset.py | 168 +++++++++--------- .../resource_clients/dataset_collection.py | 2 +- .../resource_clients/key_value_store.py | 2 +- .../key_value_store_collection.py | 2 +- .../resource_clients/request_queue.py | 104 +++++------ .../test_actor_create_proxy_configuration.py | 3 +- tests/unit/conftest.py | 10 +- .../resource_clients/_common.py | 12 -- .../resource_clients/test_dataset.py | 17 +- .../test_dataset_collection.py | 6 +- .../resource_clients/test_key_value_store.py | 18 +- .../test_key_value_store_collection.py | 6 +- .../resource_clients/test_request_queue.py | 18 +- .../test_request_queue_collection.py | 6 +- .../memory_storage/test_memory_storage.py | 7 - tests/unit/test_proxy_configuration.py | 21 +-- tests/unit/test_utils.py | 3 - 22 files changed, 166 insertions(+), 251 deletions(-) delete mode 100644 tests/unit/memory_storage/resource_clients/_common.py diff --git a/.flake8 b/.flake8 index 2fd7e665..3bf0b531 100644 --- a/.flake8 +++ b/.flake8 @@ -9,7 +9,7 @@ max_line_length = 150 # Google docstring convention + D204 & D401 docstring-convention = all ignore = - U100 # TODO: Remove this after we decide how to handle unused args + U101 D100 D104 D203 diff --git a/mypy.ini b/mypy.ini index b669e700..2d5b2ca6 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,6 +1,5 @@ [mypy] -; TODO: Decide if this makes sense/helps -; python_version=3.8 +python_version=3.8 files = docs, scripts, diff --git a/pytest.ini b/pytest.ini index 11fc0642..40880458 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,2 @@ [pytest] -; TODO: Decide whether to use this or not -; asyncio_mode=auto +asyncio_mode=auto diff --git a/src/apify/_utils.py b/src/apify/_utils.py index c147a82a..2c9f1bd0 100644 --- a/src/apify/_utils.py +++ b/src/apify/_utils.py @@ -178,7 +178,7 @@ async def _run_func_at_interval_async(func: Callable, interval_secs: float) -> N await res -class ListPage: +class ListPage: # TODO: Rather use exported version from Apify client """A single page of items returned from a list() method.""" #: list: List of returned objects on this page diff --git a/src/apify/memory_storage/memory_storage.py b/src/apify/memory_storage/memory_storage.py index 34ad7804..0d3b747d 100644 --- a/src/apify/memory_storage/memory_storage.py +++ b/src/apify/memory_storage/memory_storage.py @@ -55,7 +55,7 @@ def request_queues(self) -> RequestQueueCollectionClient: """TODO: docs.""" return RequestQueueCollectionClient(base_storage_directory=self.request_queues_directory, client=self) - def request_queue(self, *, id: str, client_key: Optional[str] = None, timeout_secs: Optional[int] = None) -> RequestQueueClient: + def request_queue(self, *, id: str, _client_key: Optional[str] = None, _timeout_secs: Optional[int] = None) -> RequestQueueClient: """TODO: docs.""" return 
RequestQueueClient(base_storage_directory=self.request_queues_directory, client=self, id=id) diff --git a/src/apify/memory_storage/resource_clients/dataset.py b/src/apify/memory_storage/resource_clients/dataset.py index 69d39dfd..064bf578 100644 --- a/src/apify/memory_storage/resource_clients/dataset.py +++ b/src/apify/memory_storage/resource_clients/dataset.py @@ -57,70 +57,70 @@ async def get(self) -> Optional[Dict]: async def update(self, *, name: Optional[str] = None) -> Dict: """TODO: docs.""" # Check by id - existing_store_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + existing_dataset_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) - if existing_store_by_id is None: + if existing_dataset_by_id is None: _raise_on_non_existing_storage(StorageTypes.DATASET, self.id) # Skip if no changes if name is None: - return existing_store_by_id.to_dataset_info() + return existing_dataset_by_id.to_dataset_info() # Check that name is not in use already - existing_store_by_name = next( - (store for store in self.client.datasets_handled if store.name and store.name.lower() == name.lower()), None) + existing_dataset_by_name = next( + (dataset for dataset in self.client.datasets_handled if dataset.name and dataset.name.lower() == name.lower()), None) - if existing_store_by_name is not None: + if existing_dataset_by_name is not None: _raise_on_duplicate_storage(StorageTypes.DATASET, 'name', name) - existing_store_by_id.name = name + existing_dataset_by_id.name = name - previous_dir = existing_store_by_id.dataset_directory + previous_dir = existing_dataset_by_id.dataset_directory - existing_store_by_id.dataset_directory = os.path.join(self.client.datasets_directory, name) + existing_dataset_by_id.dataset_directory = os.path.join(self.client.datasets_directory, name) - await _force_rename(previous_dir, existing_store_by_id.dataset_directory) + await _force_rename(previous_dir, existing_dataset_by_id.dataset_directory) # Update timestamps - await existing_store_by_id._update_timestamps(True) + await existing_dataset_by_id._update_timestamps(True) - return existing_store_by_id.to_dataset_info() + return existing_dataset_by_id.to_dataset_info() async def delete(self) -> None: """TODO: docs.""" - store = next((store for store in self.client.datasets_handled if store.id == self.id), None) + dataset = next((dataset for dataset in self.client.datasets_handled if dataset.id == self.id), None) - if store is not None: - self.client.datasets_handled.remove(store) - store.item_count = 0 - store.dataset_entries.clear() + if dataset is not None: + self.client.datasets_handled.remove(dataset) + dataset.item_count = 0 + dataset.dataset_entries.clear() - await aioshutil.rmtree(store.dataset_directory) + await aioshutil.rmtree(dataset.dataset_directory) async def list_items( self, *, offset: int = 0, limit: int = LIST_ITEMS_LIMIT, - clean: Optional[bool] = None, + _clean: Optional[bool] = None, desc: Optional[bool] = None, - fields: Optional[List[str]] = None, - omit: Optional[List[str]] = None, - unwind: Optional[str] = None, - skip_empty: Optional[bool] = None, - skip_hidden: Optional[bool] = None, - flatten: Optional[List[str]] = None, - view: Optional[str] = None, + _fields: Optional[List[str]] = None, + _omit: Optional[List[str]] = None, + _unwind: Optional[str] = None, + _skip_empty: Optional[bool] = None, + _skip_hidden: Optional[bool] = None, + _flatten: Optional[List[str]] = None, + _view: 
Optional[str] = None, ) -> ListPage: """TODO: docs.""" # Check by id - existing_store_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + existing_dataset_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) - if existing_store_by_id is None: + if existing_dataset_by_id is None: _raise_on_non_existing_storage(StorageTypes.DATASET, self.id) - start, end = existing_store_by_id._get_start_and_end_indexes( - max(existing_store_by_id.item_count - offset - limit, 0) if desc else offset or 0, + start, end = existing_dataset_by_id._get_start_and_end_indexes( + max(existing_dataset_by_id.item_count - offset - limit, 0) if desc else offset or 0, limit, ) @@ -128,9 +128,9 @@ async def list_items( for idx in range(start, end): entry_number = self._generate_local_entry_name(idx) - items.append(existing_store_by_id.dataset_entries[entry_number]) + items.append(existing_dataset_by_id.dataset_entries[entry_number]) - await existing_store_by_id._update_timestamps(False) + await existing_dataset_by_id._update_timestamps(False) if desc: items.reverse() @@ -141,7 +141,7 @@ async def list_items( 'items': items, 'limit': limit, 'offset': offset, - 'total': existing_store_by_id.item_count, + 'total': existing_dataset_by_id.item_count, }) async def iterate_items( @@ -149,13 +149,13 @@ async def iterate_items( *, offset: int = 0, limit: Optional[int] = None, - clean: Optional[bool] = None, + _clean: Optional[bool] = None, desc: Optional[bool] = None, - fields: Optional[List[str]] = None, - omit: Optional[List[str]] = None, - unwind: Optional[str] = None, - skip_empty: Optional[bool] = None, - skip_hidden: Optional[bool] = None, + _fields: Optional[List[str]] = None, + _omit: Optional[List[str]] = None, + _unwind: Optional[str] = None, + _skip_empty: Optional[bool] = None, + _skip_hidden: Optional[bool] = None, ) -> AsyncGenerator: # TODO: Copy-pasted from client """TODO: docs.""" cache_size = 1000 @@ -177,13 +177,7 @@ async def iterate_items( current_items_page = await self.list_items( offset=current_offset, limit=current_limit, - clean=clean, desc=desc, - fields=fields, - omit=omit, - unwind=unwind, - skip_empty=skip_empty, - skip_hidden=skip_hidden, ) current_offset += current_items_page.count @@ -196,22 +190,22 @@ async def iterate_items( async def get_items_as_bytes( self, *, - item_format: str = 'json', - offset: Optional[int] = None, - limit: Optional[int] = None, - desc: Optional[bool] = None, - clean: Optional[bool] = None, - bom: Optional[bool] = None, - delimiter: Optional[str] = None, - fields: Optional[List[str]] = None, - omit: Optional[List[str]] = None, - unwind: Optional[str] = None, - skip_empty: Optional[bool] = None, - skip_header_row: Optional[bool] = None, - skip_hidden: Optional[bool] = None, - xml_root: Optional[str] = None, - xml_row: Optional[str] = None, - flatten: Optional[List[str]] = None, + _item_format: str = 'json', + _offset: Optional[int] = None, + _limit: Optional[int] = None, + _desc: Optional[bool] = None, + _clean: Optional[bool] = None, + _bom: Optional[bool] = None, + _delimiter: Optional[str] = None, + _fields: Optional[List[str]] = None, + _omit: Optional[List[str]] = None, + _unwind: Optional[str] = None, + _skip_empty: Optional[bool] = None, + _skip_header_row: Optional[bool] = None, + _skip_hidden: Optional[bool] = None, + _xml_root: Optional[str] = None, + _xml_row: Optional[str] = None, + _flatten: Optional[List[str]] = None, ) -> bytes: """TODO: docs.""" raise 
NotImplementedError('This method is not supported in local memory storage') @@ -219,21 +213,21 @@ async def get_items_as_bytes( async def stream_items( self, *, - item_format: str = 'json', - offset: Optional[int] = None, - limit: Optional[int] = None, - desc: Optional[bool] = None, - clean: Optional[bool] = None, - bom: Optional[bool] = None, - delimiter: Optional[str] = None, - fields: Optional[List[str]] = None, - omit: Optional[List[str]] = None, - unwind: Optional[str] = None, - skip_empty: Optional[bool] = None, - skip_header_row: Optional[bool] = None, - skip_hidden: Optional[bool] = None, - xml_root: Optional[str] = None, - xml_row: Optional[str] = None, + _item_format: str = 'json', + _offset: Optional[int] = None, + _limit: Optional[int] = None, + _desc: Optional[bool] = None, + _clean: Optional[bool] = None, + _bom: Optional[bool] = None, + _delimiter: Optional[str] = None, + _fields: Optional[List[str]] = None, + _omit: Optional[List[str]] = None, + _unwind: Optional[str] = None, + _skip_empty: Optional[bool] = None, + _skip_header_row: Optional[bool] = None, + _skip_hidden: Optional[bool] = None, + _xml_root: Optional[str] = None, + _xml_row: Optional[str] = None, ) -> AsyncIterator: """TODO: docs.""" raise NotImplementedError('This method is not supported in local memory storage') @@ -241,30 +235,30 @@ async def stream_items( async def push_items(self, items: JSONSerializable) -> None: """TODO: docs.""" # Check by id - existing_store_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + existing_dataset_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) - if existing_store_by_id is None: + if existing_dataset_by_id is None: _raise_on_non_existing_storage(StorageTypes.DATASET, self.id) normalized = self._normalize_items(items) added_ids: List[str] = [] for entry in normalized: - existing_store_by_id.item_count += 1 - idx = self._generate_local_entry_name(existing_store_by_id.item_count) + existing_dataset_by_id.item_count += 1 + idx = self._generate_local_entry_name(existing_dataset_by_id.item_count) - existing_store_by_id.dataset_entries[idx] = entry + existing_dataset_by_id.dataset_entries[idx] = entry added_ids.append(idx) data_entries: List[Tuple[str, Dict]] = [] for id in added_ids: - data_entries.append((id, existing_store_by_id.dataset_entries[id])) + data_entries.append((id, existing_dataset_by_id.dataset_entries[id])) - await existing_store_by_id._update_timestamps(True) + await existing_dataset_by_id._update_timestamps(True) await _update_dataset_items( data=data_entries, - entity_directory=existing_store_by_id.dataset_directory, + entity_directory=existing_dataset_by_id.dataset_directory, persist_storage=self.client.persist_storage, ) @@ -322,8 +316,8 @@ def normalize_item(item: Any) -> Optional[Dict]: def _find_or_cache_dataset_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['DatasetClient']: # First check memory cache - found = next((store for store in client.datasets_handled - if store.id == entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) + found = next((dataset for dataset in client.datasets_handled + if dataset.id == entry_name_or_id or (dataset.name and dataset.name.lower() == entry_name_or_id.lower())), None) if found is not None: return found @@ -349,7 +343,7 @@ def _find_or_cache_dataset_by_possible_id(client: 'MemoryStorage', entry_name_or if entry.name == '__metadata__.json': 
has_seen_metadata_file = True - # We have found the store metadata file, build out information based on it + # We have found the dataset's metadata file, build out information based on it with open(os.path.join(datasets_dir, entry.name)) as f: metadata = json.load(f) id = metadata['id'] diff --git a/src/apify/memory_storage/resource_clients/dataset_collection.py b/src/apify/memory_storage/resource_clients/dataset_collection.py index 8aa99b81..1542db5b 100644 --- a/src/apify/memory_storage/resource_clients/dataset_collection.py +++ b/src/apify/memory_storage/resource_clients/dataset_collection.py @@ -30,7 +30,7 @@ def map_store(store: DatasetClient) -> Dict: 'items': sorted(map(map_store, self.client.datasets_handled), key=itemgetter('createdAt')), }) - async def get_or_create(self, *, name: Optional[str] = None, schema: Optional[Dict] = None) -> Dict: + async def get_or_create(self, *, name: Optional[str] = None, _schema: Optional[Dict] = None) -> Dict: """TODO: docs.""" if name: found = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=name) diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index 935051c3..640445d2 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -175,7 +175,7 @@ async def get_record_as_bytes(self, key: str) -> Optional[Dict]: """TODO: docs.""" return await self._get_record_internal(key, as_bytes=True) - async def stream_record(self, key: str) -> AsyncIterator[Optional[Dict]]: + async def stream_record(self, _key: str) -> AsyncIterator[Optional[Dict]]: """TODO: docs.""" raise NotImplementedError('This method is not supported in local memory storage') diff --git a/src/apify/memory_storage/resource_clients/key_value_store_collection.py b/src/apify/memory_storage/resource_clients/key_value_store_collection.py index dac02bb5..e170d3a6 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store_collection.py +++ b/src/apify/memory_storage/resource_clients/key_value_store_collection.py @@ -30,7 +30,7 @@ def map_store(store: KeyValueStoreClient) -> Dict: 'items': sorted(map(map_store, self.client.key_value_stores_handled), key=itemgetter('createdAt')), }) - async def get_or_create(self, *, name: Optional[str] = None, schema: Optional[Dict] = None) -> Dict: + async def get_or_create(self, *, name: Optional[str] = None, _schema: Optional[Dict] = None) -> Dict: """TODO: docs.""" if name: found = _find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=name) diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index 0547902b..e26b4855 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -53,58 +53,58 @@ async def get(self) -> Optional[Dict]: async def update(self, *, name: Optional[str] = None) -> Dict: """TODO: docs.""" # Check by id - existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) - if existing_store_by_id is None: + if existing_queue_by_id is None: _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) # Skip if no changes if name is None: - return existing_store_by_id.to_request_queue_info() + return 
existing_queue_by_id.to_request_queue_info() # Check that name is not in use already - existing_store_by_name = next( - (store for store in self.client.request_queues_handled if store.name and store.name.lower() == name.lower()), None) + existing_queue_by_name = next( + (queue for queue in self.client.request_queues_handled if queue.name and queue.name.lower() == name.lower()), None) - if existing_store_by_name is not None: + if existing_queue_by_name is not None: _raise_on_duplicate_storage(StorageTypes.REQUEST_QUEUE, 'name', name) - existing_store_by_id.name = name + existing_queue_by_id.name = name - previous_dir = existing_store_by_id.request_queue_directory + previous_dir = existing_queue_by_id.request_queue_directory - existing_store_by_id.request_queue_directory = os.path.join(self.client.request_queues_directory, name) + existing_queue_by_id.request_queue_directory = os.path.join(self.client.request_queues_directory, name) - await _force_rename(previous_dir, existing_store_by_id.request_queue_directory) + await _force_rename(previous_dir, existing_queue_by_id.request_queue_directory) # Update timestamps - await existing_store_by_id._update_timestamps(True) + await existing_queue_by_id._update_timestamps(True) - return existing_store_by_id.to_request_queue_info() + return existing_queue_by_id.to_request_queue_info() async def delete(self) -> None: """TODO: docs.""" - store = next((store for store in self.client.request_queues_handled if store.id == self.id), None) + queue = next((queue for queue in self.client.request_queues_handled if queue.id == self.id), None) - if store is not None: - self.client.request_queues_handled.remove(store) - store.pending_request_count = 0 - store.requests.clear() + if queue is not None: + self.client.request_queues_handled.remove(queue) + queue.pending_request_count = 0 + queue.requests.clear() - await aioshutil.rmtree(store.request_queue_directory) + await aioshutil.rmtree(queue.request_queue_directory) async def list_head(self, *, limit: Optional[int] = None) -> Dict: """TODO: docs.""" - existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) - if existing_store_by_id is None: + if existing_queue_by_id is None: _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) - await existing_store_by_id._update_timestamps(False) + await existing_queue_by_id._update_timestamps(False) items: List[Dict] = [] - for request in existing_store_by_id.requests.values(): + for request in existing_queue_by_id.requests.values(): if len(items) == limit: break @@ -114,25 +114,25 @@ async def list_head(self, *, limit: Optional[int] = None) -> Dict: return { 'limit': limit, 'hadMultipleClients': False, - 'queueModifiedAt': existing_store_by_id.modified_at, + 'queueModifiedAt': existing_queue_by_id.modified_at, 'items': list(map(lambda item: self._json_to_request(item['json']), items)), } async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) -> Dict: """TODO: docs.""" # TODO: Throw if uniqueKey or url missing from request dict, also do for update_request... 
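+        # A possible shape for that validation (hypothetical sketch only):
+        #     if 'uniqueKey' not in request or 'url' not in request:
+        #         raise ValueError('The request must have the "uniqueKey" and "url" properties')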
- existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) - if existing_store_by_id is None: + if existing_queue_by_id is None: _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) request_model = self._create_internal_request(request, forefront) - existing_request_with_id = existing_store_by_id.requests.get(request_model['id']) + existing_request_with_id = existing_queue_by_id.requests.get(request_model['id']) # We already have the request present, so we return information about it if existing_request_with_id is not None: - await existing_store_by_id._update_timestamps(False) + await existing_queue_by_id._update_timestamps(False) return { 'requestId': existing_request_with_id['id'], @@ -140,14 +140,14 @@ async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) 'wasAlreadyPresent': True, } - existing_store_by_id.requests[request_model['id']] = request_model + existing_queue_by_id.requests[request_model['id']] = request_model # TODO: Validate the next line logic, seems wrong in crawlee - existing_store_by_id.pending_request_count += 0 if request_model['orderNo'] is None else 1 - await existing_store_by_id._update_timestamps(True) + existing_queue_by_id.pending_request_count += 0 if request_model['orderNo'] is None else 1 + await existing_queue_by_id._update_timestamps(True) await _update_request_queue_item( request=request_model, request_id=request_model['id'], - entity_directory=existing_store_by_id.request_queue_directory, + entity_directory=existing_queue_by_id.request_queue_directory, persist_storage=self.client.persist_storage, ) @@ -161,21 +161,21 @@ async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) async def get_request(self, request_id: str) -> Optional[Dict]: """TODO: docs.""" - existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) - if existing_store_by_id is None: + if existing_queue_by_id is None: _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) - await existing_store_by_id._update_timestamps(False) + await existing_queue_by_id._update_timestamps(False) - request = existing_store_by_id.requests.get(request_id) + request = existing_queue_by_id.requests.get(request_id) return self._json_to_request(request['json'] if request is not None else None) async def update_request(self, request: Dict, *, forefront: Optional[bool] = None) -> Dict: """TODO: docs.""" - existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) - if existing_store_by_id is None: + if existing_queue_by_id is None: _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) request_model = self._create_internal_request(request, forefront) @@ -183,7 +183,7 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non # First we need to check the existing request to be # able to return information about its handled state. - existing_request = existing_store_by_id.requests.get(request_model['id']) + existing_request = existing_queue_by_id.requests.get(request_model['id']) # Undefined means that the request is not present in the queue. 
# We need to insert it, to behave the same as API. @@ -192,7 +192,7 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non # When updating the request, we need to make sure that # the handled counts are updated correctly in all cases. - existing_store_by_id.requests[request_model['id']] = request_model + existing_queue_by_id.requests[request_model['id']] = request_model pending_count_adjustment = 0 is_request_handled_state_changing = type(existing_request['orderNo']) != type(request_model['orderNo']) # noqa @@ -203,12 +203,12 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non if is_request_handled_state_changing: pending_count_adjustment = 1 if request_was_handled_before_update else -1 - existing_store_by_id.pending_request_count += pending_count_adjustment - await existing_store_by_id._update_timestamps(True) + existing_queue_by_id.pending_request_count += pending_count_adjustment + await existing_queue_by_id._update_timestamps(True) await _update_request_queue_item( request=request_model, request_id=request_model['id'], - entity_directory=existing_store_by_id.request_queue_directory, + entity_directory=existing_queue_by_id.request_queue_directory, persist_storage=self.client.persist_storage, ) @@ -220,18 +220,18 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non async def delete_request(self, request_id: str) -> None: """TODO: docs.""" - existing_store_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) - if existing_store_by_id is None: + if existing_queue_by_id is None: _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) - request = existing_store_by_id.requests.get(request_id) + request = existing_queue_by_id.requests.get(request_id) if request: - del existing_store_by_id.requests[request_id] - existing_store_by_id.pending_request_count -= 0 if request['orderNo'] is None else 1 - await existing_store_by_id._update_timestamps(True) - await _delete_request(entity_directory=existing_store_by_id.request_queue_directory, request_id=request_id) + del existing_queue_by_id.requests[request_id] + existing_queue_by_id.pending_request_count -= 0 if request['orderNo'] is None else 1 + await existing_queue_by_id._update_timestamps(True) + await _delete_request(entity_directory=existing_queue_by_id.request_queue_directory, request_id=request_id) def to_request_queue_info(self) -> Dict: """TODO: docs.""" @@ -294,8 +294,8 @@ def _calculate_order_no(self, request: Dict, forefront: Optional[bool]) -> Optio def _find_or_cache_request_queue_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['RequestQueueClient']: # First check memory cache - found = next((store for store in client.request_queues_handled - if store.id == entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) + found = next((queue for queue in client.request_queues_handled + if queue.id == entry_name_or_id or (queue.name and queue.name.lower() == entry_name_or_id.lower())), None) if found is not None: return found @@ -318,7 +318,7 @@ def _find_or_cache_request_queue_by_possible_id(client: 'MemoryStorage', entry_n for entry in os.scandir(request_queues_dir): if entry.is_file(): if entry.name == '__metadata__.json': - # We have found the store metadata file, build out information based on it + # We have found the queue's metadata file, 
build out information based on it with open(os.path.join(request_queues_dir, entry.name)) as f: metadata = json.load(f) id = metadata['id'] diff --git a/tests/unit/actor/test_actor_create_proxy_configuration.py b/tests/unit/actor/test_actor_create_proxy_configuration.py index 4677c9fc..98bcb486 100644 --- a/tests/unit/actor/test_actor_create_proxy_configuration.py +++ b/tests/unit/actor/test_actor_create_proxy_configuration.py @@ -23,7 +23,7 @@ def patched_apify_client(apify_client_async_patcher: ApifyClientAsyncPatcher) -> class TestActorCreateProxyConfiguration: - @pytest.mark.asyncio + async def test_create_proxy_configuration_basic( self, monkeypatch: pytest.MonkeyPatch, @@ -61,7 +61,6 @@ async def test_create_proxy_configuration_basic( await Actor.exit() - @pytest.mark.asyncio async def test_create_proxy_configuration_actor_proxy_input( self, monkeypatch: pytest.MonkeyPatch, diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 2faca33f..eb1a3ae8 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -1,12 +1,13 @@ import asyncio import inspect from collections import defaultdict -from typing import Any, Callable, Dict, List, Optional, Tuple, get_type_hints +from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Tuple, get_type_hints import pytest from apify import Actor from apify.config import Configuration +from apify.memory_storage.memory_storage import MemoryStorage from apify_client.client import ApifyClientAsync @@ -107,3 +108,10 @@ def getattr_override(apify_client_instance: Any, attr_name: str) -> Any: @pytest.fixture def apify_client_async_patcher(monkeypatch: pytest.MonkeyPatch) -> ApifyClientAsyncPatcher: return ApifyClientAsyncPatcher(monkeypatch) + + +@pytest.fixture() +async def memory_storage(tmp_path: str) -> AsyncIterator[MemoryStorage]: + ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=True) + yield ms + await ms.purge() diff --git a/tests/unit/memory_storage/resource_clients/_common.py b/tests/unit/memory_storage/resource_clients/_common.py deleted file mode 100644 index e3f8d029..00000000 --- a/tests/unit/memory_storage/resource_clients/_common.py +++ /dev/null @@ -1,12 +0,0 @@ -from typing import AsyncIterator - -import pytest_asyncio - -from apify.memory_storage.memory_storage import MemoryStorage - - -@pytest_asyncio.fixture() -async def memory_storage(tmp_path: str) -> AsyncIterator[MemoryStorage]: - ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=True) - yield ms - await ms.purge() diff --git a/tests/unit/memory_storage/resource_clients/test_dataset.py b/tests/unit/memory_storage/resource_clients/test_dataset.py index 6860725d..cafbab51 100644 --- a/tests/unit/memory_storage/resource_clients/test_dataset.py +++ b/tests/unit/memory_storage/resource_clients/test_dataset.py @@ -1,23 +1,19 @@ import os import pytest -import pytest_asyncio from apify.memory_storage.memory_storage import MemoryStorage from apify.memory_storage.resource_clients.dataset import DatasetClient -from ._common import memory_storage # noqa: F401 - -@pytest_asyncio.fixture() -async def dataset_client(memory_storage: MemoryStorage) -> DatasetClient: # noqa: F811 +@pytest.fixture() +async def dataset_client(memory_storage: MemoryStorage) -> DatasetClient: datasets_client = memory_storage.datasets() dataset_info = await datasets_client.get_or_create(name='test') return memory_storage.dataset(id=dataset_info['id']) -@pytest.mark.asyncio -async def test_nonexistent(memory_storage: MemoryStorage) -> None: # noqa: F811 
+async def test_nonexistent(memory_storage: MemoryStorage) -> None: dataset_client = memory_storage.dataset(id='clearly not a uuid') assert await dataset_client.get() is None with pytest.raises(ValueError): @@ -27,14 +23,12 @@ async def test_nonexistent(memory_storage: MemoryStorage) -> None: # noqa: F811 await dataset_client.delete() -@pytest.mark.asyncio async def test_not_implemented(dataset_client: DatasetClient) -> None: with pytest.raises(NotImplementedError): await dataset_client.stream_items() await dataset_client.get_items_as_bytes() -@pytest.mark.asyncio async def test_get(dataset_client: DatasetClient) -> None: info = await dataset_client.get() assert info is not None @@ -42,7 +36,6 @@ async def test_get(dataset_client: DatasetClient) -> None: assert info['accessedAt'] != info['createdAt'] -@pytest.mark.asyncio async def test_update(dataset_client: DatasetClient) -> None: new_dataset_name = 'test-update' old_dataset_info = await dataset_client.get() @@ -63,7 +56,6 @@ async def test_update(dataset_client: DatasetClient) -> None: await dataset_client.update(name=new_dataset_name) -@pytest.mark.asyncio async def test_delete(dataset_client: DatasetClient) -> None: dataset_info = await dataset_client.get() assert dataset_info is not None @@ -75,7 +67,6 @@ async def test_delete(dataset_client: DatasetClient) -> None: await dataset_client.delete() -@pytest.mark.asyncio async def test_push_items(dataset_client: DatasetClient) -> None: await dataset_client.push_items('{"test": "JSON from a string"}') await dataset_client.push_items({'abc': {'def': {'ghi': '123'}}}) @@ -89,7 +80,6 @@ async def test_push_items(dataset_client: DatasetClient) -> None: assert list_page.count == 22 -@pytest.mark.asyncio async def test_list_items(dataset_client: DatasetClient) -> None: item_count = 100 used_offset = 10 @@ -118,7 +108,6 @@ async def test_list_items(dataset_client: DatasetClient) -> None: assert list_desc_true.desc is True -@pytest.mark.asyncio async def test_iterate_items(dataset_client: DatasetClient) -> None: item_count = 100 await dataset_client.push_items([{'id': i} for i in range(item_count)]) diff --git a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py index b78ea739..a9fcbcc2 100644 --- a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py @@ -5,15 +5,12 @@ from apify.memory_storage.memory_storage import MemoryStorage from apify.memory_storage.resource_clients.dataset_collection import DatasetCollectionClient -from ._common import memory_storage # noqa: F401 - @pytest.fixture() -def datasets_client(memory_storage: MemoryStorage) -> DatasetCollectionClient: # noqa: F811 +def datasets_client(memory_storage: MemoryStorage) -> DatasetCollectionClient: return memory_storage.datasets() -@pytest.mark.asyncio async def test_get_or_create(datasets_client: DatasetCollectionClient) -> None: dataset_name = 'test' # A new dataset gets created @@ -28,7 +25,6 @@ async def test_get_or_create(datasets_client: DatasetCollectionClient) -> None: assert dataset_info['createdAt'] == dataset_info_existing['createdAt'] -@pytest.mark.asyncio async def test_list(datasets_client: DatasetCollectionClient) -> None: assert datasets_client.list().count == 0 dataset_info = await datasets_client.get_or_create(name='dataset') diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store.py 
b/tests/unit/memory_storage/resource_clients/test_key_value_store.py index 38da28b8..fd02f03b 100644 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store.py +++ b/tests/unit/memory_storage/resource_clients/test_key_value_store.py @@ -1,23 +1,19 @@ import os import pytest -import pytest_asyncio from apify.memory_storage.memory_storage import MemoryStorage from apify.memory_storage.resource_clients.key_value_store import KeyValueStoreClient -from ._common import memory_storage # noqa: F401 - -@pytest_asyncio.fixture() -async def key_value_store_client(memory_storage: MemoryStorage) -> KeyValueStoreClient: # noqa: F811 +@pytest.fixture() +async def key_value_store_client(memory_storage: MemoryStorage) -> KeyValueStoreClient: key_value_stores_client = memory_storage.key_value_stores() kvs_info = await key_value_stores_client.get_or_create(name='test') return memory_storage.key_value_store(id=kvs_info['id']) -@pytest.mark.asyncio -async def test_nonexistent(memory_storage: MemoryStorage) -> None: # noqa: F811 +async def test_nonexistent(memory_storage: MemoryStorage) -> None: kvs_client = memory_storage.key_value_store(id='clearly not a uuid') assert await kvs_client.get() is None with pytest.raises(ValueError): @@ -30,13 +26,11 @@ async def test_nonexistent(memory_storage: MemoryStorage) -> None: # noqa: F811 await kvs_client.delete() -@pytest.mark.asyncio async def test_not_implemented(key_value_store_client: KeyValueStoreClient) -> None: with pytest.raises(NotImplementedError): await key_value_store_client.stream_record('test') -@pytest.mark.asyncio async def test_get(key_value_store_client: KeyValueStoreClient) -> None: info = await key_value_store_client.get() assert info is not None @@ -44,7 +38,6 @@ async def test_get(key_value_store_client: KeyValueStoreClient) -> None: assert info['accessedAt'] != info['createdAt'] -@pytest.mark.asyncio async def test_update(key_value_store_client: KeyValueStoreClient) -> None: new_kvs_name = 'test-update' old_kvs_info = await key_value_store_client.get() @@ -65,7 +58,6 @@ async def test_update(key_value_store_client: KeyValueStoreClient) -> None: await key_value_store_client.update(name=new_kvs_name) -@pytest.mark.asyncio async def test_delete(key_value_store_client: KeyValueStoreClient) -> None: kvs_info = await key_value_store_client.get() assert kvs_info is not None @@ -77,7 +69,6 @@ async def test_delete(key_value_store_client: KeyValueStoreClient) -> None: await key_value_store_client.delete() -@pytest.mark.asyncio async def test_list_keys(key_value_store_client: KeyValueStoreClient) -> None: record_count = 4 used_limit = 2 @@ -106,7 +97,6 @@ async def test_list_keys(key_value_store_client: KeyValueStoreClient) -> None: assert keys_exclusive_start['items'][-1]['key'] == keys_exclusive_start['nextExclusiveStartKey'] -@pytest.mark.asyncio async def test_get_and_set_record(key_value_store_client: KeyValueStoreClient) -> None: # Test setting dict record dict_record_key = 'test-dict' @@ -134,7 +124,6 @@ async def test_get_and_set_record(key_value_store_client: KeyValueStoreClient) - await key_value_store_client.set_record('bytes', 'test'.encode('utf-8')) -@pytest.mark.asyncio async def test_get_record_as_bytes(key_value_store_client: KeyValueStoreClient) -> None: record_key = 'test' record_value = 'testing' @@ -144,7 +133,6 @@ async def test_get_record_as_bytes(key_value_store_client: KeyValueStoreClient) assert record_info['value'] == record_value.encode('utf-8') -@pytest.mark.asyncio async def 
test_delete_record(key_value_store_client: KeyValueStoreClient) -> None: record_key = 'test' await key_value_store_client.set_record(record_key, 'test') diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py index cf356d57..f5edf10e 100644 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py @@ -5,15 +5,12 @@ from apify.memory_storage.memory_storage import MemoryStorage from apify.memory_storage.resource_clients.key_value_store_collection import KeyValueStoreCollectionClient -from ._common import memory_storage # noqa: F401 - @pytest.fixture() -def key_value_stores_client(memory_storage: MemoryStorage) -> KeyValueStoreCollectionClient: # noqa: F811 +def key_value_stores_client(memory_storage: MemoryStorage) -> KeyValueStoreCollectionClient: return memory_storage.key_value_stores() -@pytest.mark.asyncio async def test_get_or_create(key_value_stores_client: KeyValueStoreCollectionClient) -> None: kvs_name = 'test' # A new kvs gets created @@ -28,7 +25,6 @@ async def test_get_or_create(key_value_stores_client: KeyValueStoreCollectionCli assert kvs_info['createdAt'] == kvs_info_existing['createdAt'] -@pytest.mark.asyncio async def test_list(key_value_stores_client: KeyValueStoreCollectionClient) -> None: assert key_value_stores_client.list().count == 0 kvs_info = await key_value_stores_client.get_or_create(name='kvs') diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue.py b/tests/unit/memory_storage/resource_clients/test_request_queue.py index b0e8e0f8..d5b60bb2 100644 --- a/tests/unit/memory_storage/resource_clients/test_request_queue.py +++ b/tests/unit/memory_storage/resource_clients/test_request_queue.py @@ -2,23 +2,19 @@ from datetime import datetime import pytest -import pytest_asyncio from apify.memory_storage.memory_storage import MemoryStorage from apify.memory_storage.resource_clients.request_queue import RequestQueueClient -from ._common import memory_storage # noqa: F401 - -@pytest_asyncio.fixture() -async def request_queue_client(memory_storage: MemoryStorage) -> RequestQueueClient: # noqa: F811 +@pytest.fixture() +async def request_queue_client(memory_storage: MemoryStorage) -> RequestQueueClient: request_queues_client = memory_storage.request_queues() rq_info = await request_queues_client.get_or_create(name='test') return memory_storage.request_queue(id=rq_info['id']) -@pytest.mark.asyncio -async def test_nonexistent(memory_storage: MemoryStorage) -> None: # noqa: F811 +async def test_nonexistent(memory_storage: MemoryStorage) -> None: request_queue_client = memory_storage.request_queue(id='clearly not a uuid') assert await request_queue_client.get() is None with pytest.raises(ValueError): @@ -26,7 +22,6 @@ async def test_nonexistent(memory_storage: MemoryStorage) -> None: # noqa: F811 await request_queue_client.delete() -@pytest.mark.asyncio async def test_get(request_queue_client: RequestQueueClient) -> None: info = await request_queue_client.get() assert info is not None @@ -34,7 +29,6 @@ async def test_get(request_queue_client: RequestQueueClient) -> None: assert info['accessedAt'] != info['createdAt'] -@pytest.mark.asyncio async def test_update(request_queue_client: RequestQueueClient) -> None: new_rq_name = 'test-update' old_rq_info = await request_queue_client.get() @@ -55,7 +49,6 @@ async def 
test_update(request_queue_client: RequestQueueClient) -> None: await request_queue_client.update(name=new_rq_name) -@pytest.mark.asyncio async def test_delete(request_queue_client: RequestQueueClient) -> None: rq_info = await request_queue_client.get() assert rq_info is not None @@ -67,7 +60,6 @@ async def test_delete(request_queue_client: RequestQueueClient) -> None: await request_queue_client.delete() -@pytest.mark.asyncio async def test_list_head(request_queue_client: RequestQueueClient) -> None: request_1_url = 'https://apify.com' request_2_url = 'https://example.com' @@ -85,7 +77,6 @@ async def test_list_head(request_queue_client: RequestQueueClient) -> None: assert 'id' in item.keys() -@pytest.mark.asyncio async def test_add_record(request_queue_client: RequestQueueClient) -> None: # TODO: How can we test the forefront parameter? request_forefront_url = 'https://apify.com' @@ -107,7 +98,6 @@ async def test_add_record(request_queue_client: RequestQueueClient) -> None: assert rq_info['pendingRequestCount'] == rq_info['totalRequestCount'] == 2 -@pytest.mark.asyncio async def test_get_record(request_queue_client: RequestQueueClient) -> None: request_url = 'https://apify.com' request_info = await request_queue_client.add_request({ @@ -122,7 +112,6 @@ async def test_get_record(request_queue_client: RequestQueueClient) -> None: assert (await request_queue_client.get_request('non-existent id')) is None -@pytest.mark.asyncio async def test_update_record(request_queue_client: RequestQueueClient) -> None: # TODO: How can we test the forefront parameter? request_url = 'https://apify.com' @@ -142,7 +131,6 @@ async def test_update_record(request_queue_client: RequestQueueClient) -> None: assert rq_info_after_update['pendingRequestCount'] == 0 -@pytest.mark.asyncio async def test_delete_record(request_queue_client: RequestQueueClient) -> None: request_url = 'https://apify.com' request_info = await request_queue_client.add_request({ diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py index 6c9f0c08..d87e15d0 100644 --- a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py @@ -5,15 +5,12 @@ from apify.memory_storage.memory_storage import MemoryStorage from apify.memory_storage.resource_clients.request_queue_collection import RequestQueueCollectionClient -from ._common import memory_storage # noqa: F401 - @pytest.fixture() -def request_queues_client(memory_storage: MemoryStorage) -> RequestQueueCollectionClient: # noqa: F811 +def request_queues_client(memory_storage: MemoryStorage) -> RequestQueueCollectionClient: return memory_storage.request_queues() -@pytest.mark.asyncio async def test_get_or_create(request_queues_client: RequestQueueCollectionClient) -> None: rq_name = 'test' # A new request queue gets created @@ -28,7 +25,6 @@ async def test_get_or_create(request_queues_client: RequestQueueCollectionClient assert rq_info['createdAt'] == rq_existing['createdAt'] -@pytest.mark.asyncio async def test_list(request_queues_client: RequestQueueCollectionClient) -> None: assert request_queues_client.list().count == 0 rq_info = await request_queues_client.get_or_create(name='dataset') diff --git a/tests/unit/memory_storage/test_memory_storage.py b/tests/unit/memory_storage/test_memory_storage.py index 11e19836..842b5c04 100644 --- a/tests/unit/memory_storage/test_memory_storage.py +++ 
b/tests/unit/memory_storage/test_memory_storage.py @@ -1,11 +1,8 @@ import os -import pytest - from apify.memory_storage.memory_storage import MemoryStorage -@pytest.mark.asyncio async def test_write_metadata(tmp_path: str) -> None: dataset_name = 'test' dataset_no_metadata_name = 'test-no-metadata' @@ -19,7 +16,6 @@ async def test_write_metadata(tmp_path: str) -> None: assert os.path.exists(os.path.join(ms_no_metadata.datasets_directory, dataset_no_metadata_name, '__metadata__.json')) is False -@pytest.mark.asyncio async def test_persist_storage(tmp_path: str) -> None: ms = MemoryStorage(local_data_directory=tmp_path, persist_storage=True) ms_no_persist = MemoryStorage(local_data_directory=tmp_path, persist_storage=False) @@ -33,7 +29,6 @@ async def test_persist_storage(tmp_path: str) -> None: assert os.path.exists(os.path.join(ms_no_persist.key_value_stores_directory, kvs_no_metadata_info['name'], 'test.json')) is False -@pytest.mark.asyncio async def test_purge_datasets(tmp_path: str) -> None: ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=True) # Create default and non-default datasets @@ -51,7 +46,6 @@ async def test_purge_datasets(tmp_path: str) -> None: assert non_default_dataset_info['name'] in folders_after_purge -@pytest.mark.asyncio async def test_purge_key_value_stores(tmp_path: str) -> None: ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=True) @@ -81,7 +75,6 @@ async def test_purge_key_value_stores(tmp_path: str) -> None: assert 'test.json' not in default_folder_files_after_purge -@pytest.mark.asyncio async def test_purge_request_queues(tmp_path: str) -> None: ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=True) # Create default and non-default request queues diff --git a/tests/unit/test_proxy_configuration.py b/tests/unit/test_proxy_configuration.py index fb336f58..3510fddc 100644 --- a/tests/unit/test_proxy_configuration.py +++ b/tests/unit/test_proxy_configuration.py @@ -69,7 +69,7 @@ def test__fails_with_invalid_arguments(self) -> None: class TestProxyConfigurationNewUrl: - @pytest.mark.asyncio + async def test_new_url_basic(self) -> None: groups = ['GROUP1', 'GROUP2'] password = 'abcd1234' @@ -87,7 +87,6 @@ async def test_new_url_basic(self) -> None: assert proxy_url == f'http://{expected_username}:{password}@{expected_hostname}:{expected_port}' - @pytest.mark.asyncio async def test_new_url_session_id(self) -> None: groups = ['GROUP1', 'GROUP2'] password = 'abcd1234' @@ -115,7 +114,6 @@ async def test_new_url_session_id(self) -> None: with pytest.raises(ValueError, match=re.escape(str(invalid_session_id))): await proxy_configuration.new_url(invalid_session_id) - @pytest.mark.asyncio async def test_rotating_custom_urls(self) -> None: proxy_urls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] proxy_configuration = ProxyConfiguration(proxy_urls=proxy_urls) @@ -127,7 +125,6 @@ async def test_rotating_custom_urls(self) -> None: assert await proxy_configuration.new_url() == proxy_urls[1] assert await proxy_configuration.new_url() == proxy_urls[2] - @pytest.mark.asyncio async def test_rotating_custom_urls_with_sessions(self) -> None: sessions = ['sesssion_01', 'sesssion_02', 'sesssion_03', 'sesssion_04', 'sesssion_05', 'sesssion_06'] proxy_urls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] @@ -150,7 +147,6 @@ async def test_rotating_custom_urls_with_sessions(self) -> None: assert await proxy_configuration.new_url(sessions[1]) == proxy_urls[1] assert await 
proxy_configuration.new_url(sessions[3]) == proxy_urls[0] - @pytest.mark.asyncio async def test_custom_new_url_function(self) -> None: custom_urls = [ 'http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333', @@ -166,7 +162,6 @@ def custom_new_url_function(_session_id: Optional[str]) -> str: for custom_url in reversed(custom_urls): assert await proxy_configuration.new_url() == custom_url - @pytest.mark.asyncio async def test_custom_new_url_function_async(self) -> None: custom_urls = [ 'http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333', @@ -183,7 +178,6 @@ async def custom_new_url_function(_session_id: Optional[str]) -> str: for custom_url in reversed(custom_urls): assert await proxy_configuration.new_url() == custom_url - @pytest.mark.asyncio async def test_invalid_custom_new_url_function(self) -> None: def custom_new_url_function(_session_id: Optional[str]) -> str: raise ValueError() @@ -193,7 +187,6 @@ def custom_new_url_function(_session_id: Optional[str]) -> str: with pytest.raises(ValueError, match='The provided "new_url_function" did not return a valid URL'): await proxy_configuration.new_url() - @pytest.mark.asyncio async def test_proxy_configuration_not_sharing_references(self) -> None: urls = [ 'http://proxy-example-1.com:8000', @@ -221,7 +214,7 @@ async def test_proxy_configuration_not_sharing_references(self) -> None: class TestProxyConfigurationNewProxyInfo: - @pytest.mark.asyncio + async def test_new_proxy_info_basic(self) -> None: groups = ['GROUP1', 'GROUP2'] password = 'abcd1234' @@ -247,7 +240,6 @@ async def test_new_proxy_info_basic(self) -> None: 'password': password, } - @pytest.mark.asyncio async def test_new_proxy_info_rotates_urls(self) -> None: proxy_urls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] proxy_configuration = ProxyConfiguration(proxy_urls=proxy_urls) @@ -259,7 +251,6 @@ async def test_new_proxy_info_rotates_urls(self) -> None: assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[1] assert (await proxy_configuration.new_proxy_info())['url'] == proxy_urls[2] - @pytest.mark.asyncio async def test_new_proxy_info_rotates_urls_with_sessions(self) -> None: sessions = ['sesssion_01', 'sesssion_02', 'sesssion_03', 'sesssion_04', 'sesssion_05', 'sesssion_06'] proxy_urls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333'] @@ -295,7 +286,7 @@ def patched_apify_client(apify_client_async_patcher: ApifyClientAsyncPatcher) -> class TestProxyConfigurationInitialize: - @pytest.mark.asyncio + async def test_initialize_basic( self, monkeypatch: pytest.MonkeyPatch, @@ -323,14 +314,12 @@ async def test_initialize_basic( assert len(patched_apify_client.calls['user']['get']) == 1 # type: ignore assert len(route.calls) == 1 - @pytest.mark.asyncio async def test_initialize_no_password_no_token(self) -> None: proxy_configuration = ProxyConfiguration() with pytest.raises(ValueError, match='Apify Proxy password must be provided'): await proxy_configuration.initialize() - @pytest.mark.asyncio async def test_initialize_manual_password( self, monkeypatch: pytest.MonkeyPatch, @@ -352,7 +341,6 @@ async def test_initialize_manual_password( assert proxy_configuration._password == DUMMY_PASSWORD assert proxy_configuration.is_man_in_the_middle is False - @pytest.mark.asyncio async def test_initialize_manual_password_different_than_user_one( self, monkeypatch: pytest.MonkeyPatch, @@ -382,7 +370,6 @@ async def test_initialize_manual_password_different_than_user_one( out, _ = 
capsys.readouterr() assert 'The Apify Proxy password you provided belongs to a different user' in out - @pytest.mark.asyncio async def test_initialize_not_connected( self, monkeypatch: pytest.MonkeyPatch, @@ -402,7 +389,6 @@ async def test_initialize_not_connected( with pytest.raises(ConnectionError, match=dummy_connection_error): await proxy_configuration.initialize() - @pytest.mark.asyncio async def test_initialize_status_page_unavailable( self, monkeypatch: pytest.MonkeyPatch, @@ -421,7 +407,6 @@ async def test_initialize_status_page_unavailable( out, _ = capsys.readouterr() assert 'Apify Proxy access check timed out' in out - @pytest.mark.asyncio async def test_initialize_not_called_non_apify_proxy( self, monkeypatch: pytest.MonkeyPatch, diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 29938403..07a776a2 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -108,7 +108,6 @@ def test__maybe_parse_int() -> None: assert _maybe_parse_int('abcd') is None -@pytest.mark.asyncio async def test__run_func_at_interval_async() -> None: # Test that it works with a synchronous functions test_var = 0 @@ -222,7 +221,6 @@ def test__is_file_or_bytes() -> None: # Copypasted from client assert _is_file_or_bytes(None) is False -@pytest.mark.asyncio async def test__force_remove(tmp_path: str) -> None: test_file_path = os.path.join(tmp_path, 'test.txt') # Does not crash/raise when the file does not exist @@ -293,7 +291,6 @@ def test__unique_key_to_request_id() -> None: assert _unique_key_to_request_id('test') == 'n4bQgYhMfWWaLqg' -@pytest.mark.asyncio async def test__force_rename(tmp_path: str) -> None: src_dir = os.path.join(tmp_path, 'src') dst_dir = os.path.join(tmp_path, 'dst') From b291a07935bddcd4742347c5a73a7332e94b9862 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Wed, 4 Jan 2023 13:27:27 +0100 Subject: [PATCH 20/23] address PR comments --- src/apify/memory_storage/memory_storage.py | 105 ++++++++---- .../resource_clients/dataset.py | 118 +++++++------- .../resource_clients/dataset_collection.py | 25 +-- .../resource_clients/key_value_store.py | 126 ++++++++------- .../key_value_store_collection.py | 23 +-- .../resource_clients/request_queue.py | 150 +++++++++--------- .../request_queue_collection.py | 29 ++-- .../resource_clients/test_dataset.py | 8 +- .../test_dataset_collection.py | 4 +- .../resource_clients/test_key_value_store.py | 31 ++-- .../test_key_value_store_collection.py | 4 +- .../resource_clients/test_request_queue.py | 8 +- .../test_request_queue_collection.py | 4 +- .../memory_storage/test_memory_storage.py | 52 ++++-- 14 files changed, 396 insertions(+), 291 deletions(-) diff --git a/src/apify/memory_storage/memory_storage.py b/src/apify/memory_storage/memory_storage.py index 0d3b747d..d3d45b4e 100644 --- a/src/apify/memory_storage/memory_storage.py +++ b/src/apify/memory_storage/memory_storage.py @@ -13,57 +13,95 @@ from .resource_clients.request_queue import RequestQueueClient from .resource_clients.request_queue_collection import RequestQueueCollectionClient +""" +Memory storage emulates data storages that are available on the Apify platform. +Specifically, it emulates clients for datasets, key-value stores and request queues. +The data are held in-memory and persisted locally if `persist_storage` is True. +The metadata of the storages is also persisted if `write_metadata` is True. 
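+
+Illustrative usage sketch (not part of the formal docs of this patch; the storage name and URL
+below are made-up examples):
+
+    storage = MemoryStorage(local_data_directory='./storage', write_metadata=True)
+    dataset_info = await storage.datasets().get_or_create(name='my-results')
+    await storage.dataset(id=dataset_info['id']).push_items([{'url': 'https://example.com'}])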
+""" + class MemoryStorage: """Class representing an in-memory storage.""" - datasets_handled: List[DatasetClient] - key_value_stores_handled: List[KeyValueStoreClient] - request_queues_handled: List[RequestQueueClient] + _local_data_directory: str + _datasets_directory: str + _key_value_stores_directory: str + _request_queues_directory: str + _write_metadata: bool + _persist_storage: bool + _datasets_handled: List[DatasetClient] + _key_value_stores_handled: List[KeyValueStoreClient] + _request_queues_handled: List[RequestQueueClient] def __init__( - self, *, local_data_directory: str = './storage', write_metadata: Optional[bool] = False, persist_storage: Optional[bool] = True, + self, *, local_data_directory: str = './storage', write_metadata: Optional[bool] = None, persist_storage: Optional[bool] = None, ) -> None: - """TODO: docs.""" - self.local_data_directory = local_data_directory - self.datasets_directory = os.path.join(self.local_data_directory, 'datasets') - self.key_value_stores_directory = os.path.join(self.local_data_directory, 'key_value_stores') - self.request_queues_directory = os.path.join(self.local_data_directory, 'request_queues') - self.write_metadata = write_metadata or '*' in os.getenv('DEBUG', '') - self.persist_storage = persist_storage or not any(s in os.getenv('APIFY_PERSIST_STORAGE', 'true') for s in ['false', '0', '']) - self.datasets_handled = [] - self.key_value_stores_handled = [] - self.request_queues_handled = [] + """Initialize the MemoryStorage. + + Args: + local_data_directory (str, optional): A local directory where all data will be persisted + persist_storage (bool, optional): Whether to persist the data to the `local_data_directory` or just keep them in memory + write_metadata (bool, optional): Whether to persist metadata of the storages as well + """ + self._local_data_directory = local_data_directory + self._datasets_directory = os.path.join(self._local_data_directory, 'datasets') + self._key_value_stores_directory = os.path.join(self._local_data_directory, 'key_value_stores') + self._request_queues_directory = os.path.join(self._local_data_directory, 'request_queues') + self._write_metadata = write_metadata if write_metadata is not None else '*' in os.getenv('DEBUG', '') + self._persist_storage = persist_storage if persist_storage is not None else not any( + os.getenv('APIFY_PERSIST_STORAGE', 'true') == s for s in ['false', '0', '']) + self._datasets_handled = [] + self._key_value_stores_handled = [] + self._request_queues_handled = [] def datasets(self) -> DatasetCollectionClient: - """TODO: docs.""" - return DatasetCollectionClient(base_storage_directory=self.datasets_directory, client=self) + """Retrieve the sub-client for manipulating datasets.""" + return DatasetCollectionClient(base_storage_directory=self._datasets_directory, client=self) def dataset(self, *, id: str) -> DatasetClient: - """TODO: docs.""" - return DatasetClient(base_storage_directory=self.datasets_directory, client=self, id=id) + """Retrieve the sub-client for manipulating a single dataset. 
+ + Args: + dataset_id (str): ID of the dataset to be manipulated + """ + return DatasetClient(base_storage_directory=self._datasets_directory, client=self, id=id) def key_value_stores(self) -> KeyValueStoreCollectionClient: - """TODO: docs.""" - return KeyValueStoreCollectionClient(base_storage_directory=self.key_value_stores_directory, client=self) + """Retrieve the sub-client for manipulating key-value stores.""" + return KeyValueStoreCollectionClient(base_storage_directory=self._key_value_stores_directory, client=self) def key_value_store(self, *, id: str) -> KeyValueStoreClient: - """TODO: docs.""" - return KeyValueStoreClient(base_storage_directory=self.key_value_stores_directory, client=self, id=id) + """Retrieve the sub-client for manipulating a single key-value store. + + Args: + key_value_store_id (str): ID of the key-value store to be manipulated + """ + return KeyValueStoreClient(base_storage_directory=self._key_value_stores_directory, client=self, id=id) def request_queues(self) -> RequestQueueCollectionClient: - """TODO: docs.""" - return RequestQueueCollectionClient(base_storage_directory=self.request_queues_directory, client=self) + """Retrieve the sub-client for manipulating request queues.""" + return RequestQueueCollectionClient(base_storage_directory=self._request_queues_directory, client=self) def request_queue(self, *, id: str, _client_key: Optional[str] = None, _timeout_secs: Optional[int] = None) -> RequestQueueClient: - """TODO: docs.""" - return RequestQueueClient(base_storage_directory=self.request_queues_directory, client=self, id=id) + """Retrieve the sub-client for manipulating a single request queue. + + Args: + request_queue_id (str): ID of the request queue to be manipulated + client_key (str): A unique identifier of the client accessing the request queue + """ + return RequestQueueClient(base_storage_directory=self._request_queues_directory, client=self, id=id) async def purge(self) -> None: - """TODO: docs.""" + """ + Cleans up the default storage directories before the run starts: + - local directory containing the default dataset; + - all records from the default key-value store in the local directory, except for the "INPUT" key; + - local directory containing the default request queue. 
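+
+        A minimal usage sketch (illustrative only, assuming the default directory layout):
+
+            storage = MemoryStorage(local_data_directory='./storage')
+            await storage.purge()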
+ """ # Key-value stores - if await ospath.exists(self.key_value_stores_directory): - key_value_store_folders = await scandir(self.key_value_stores_directory) + if await ospath.exists(self._key_value_stores_directory): + key_value_store_folders = await scandir(self._key_value_stores_directory) for key_value_store_folder in key_value_store_folders: if key_value_store_folder.name.startswith('__APIFY_TEMPORARY') or key_value_store_folder.name.startswith('__OLD'): await self._batch_remove_files(key_value_store_folder.path) @@ -71,14 +109,14 @@ async def purge(self) -> None: await self._handle_default_key_value_store(key_value_store_folder.path) # Datasets - if await ospath.exists(self.datasets_directory): - dataset_folders = await scandir(self.datasets_directory) + if await ospath.exists(self._datasets_directory): + dataset_folders = await scandir(self._datasets_directory) for dataset_folder in dataset_folders: if dataset_folder.name == 'default' or dataset_folder.name.startswith('__APIFY_TEMPORARY'): await self._batch_remove_files(dataset_folder.path) # Request queues - if await ospath.exists(self.request_queues_directory): - request_queue_folders = await scandir(self.request_queues_directory) + if await ospath.exists(self._request_queues_directory): + request_queue_folders = await scandir(self._request_queues_directory) for request_queue_folder in request_queue_folders: if request_queue_folder.name == 'default' or request_queue_folder.name.startswith('__APIFY_TEMPORARY'): await self._batch_remove_files(request_queue_folder.path) @@ -89,6 +127,7 @@ def teardown(self) -> None: pass async def _handle_default_key_value_store(self, folder: str) -> None: + """Remove everything from the default key-value store folder except `possible_input_keys`.""" folder_exists = await ospath.exists(folder) temporary_path = os.path.normpath(os.path.join(folder, '../__APIFY_MIGRATING_KEY_VALUE_STORE__')) diff --git a/src/apify/memory_storage/resource_clients/dataset.py b/src/apify/memory_storage/resource_clients/dataset.py index 064bf578..320590f8 100644 --- a/src/apify/memory_storage/resource_clients/dataset.py +++ b/src/apify/memory_storage/resource_clients/dataset.py @@ -30,23 +30,27 @@ class DatasetClient: """TODO: docs.""" - created_at = datetime.utcnow() - accessed_at = datetime.utcnow() - modified_at = datetime.utcnow() - item_count = 0 - dataset_entries: Dict[str, Dict] + _id: str + _dataset_directory: str + _client: 'MemoryStorage' + _name: str + _dataset_entries: Dict[str, Dict] + _created_at = datetime.utcnow() + _accessed_at = datetime.utcnow() + _modified_at = datetime.utcnow() + _item_count = 0 def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: """TODO: docs.""" - self.id = str(uuid.uuid4()) if id is None else id - self.dataset_directory = os.path.join(base_storage_directory, name or self.id) - self.client = client - self.name = name - self.dataset_entries = {} + self._id = str(uuid.uuid4()) if id is None else id + self._dataset_directory = os.path.join(base_storage_directory, name or self._id) + self._client = client + self._name = name + self._dataset_entries = {} async def get(self) -> Optional[Dict]: """TODO: docs.""" - found = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + found = _find_or_cache_dataset_by_possible_id(client=self._client, entry_name_or_id=self._name or self._id) if found: await found._update_timestamps(False) @@ -57,10 +61,10 @@ async def 
get(self) -> Optional[Dict]: async def update(self, *, name: Optional[str] = None) -> Dict: """TODO: docs.""" # Check by id - existing_dataset_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + existing_dataset_by_id = _find_or_cache_dataset_by_possible_id(client=self._client, entry_name_or_id=self._name or self._id) if existing_dataset_by_id is None: - _raise_on_non_existing_storage(StorageTypes.DATASET, self.id) + _raise_on_non_existing_storage(StorageTypes.DATASET, self._id) # Skip if no changes if name is None: @@ -68,18 +72,18 @@ async def update(self, *, name: Optional[str] = None) -> Dict: # Check that name is not in use already existing_dataset_by_name = next( - (dataset for dataset in self.client.datasets_handled if dataset.name and dataset.name.lower() == name.lower()), None) + (dataset for dataset in self._client._datasets_handled if dataset._name and dataset._name.lower() == name.lower()), None) if existing_dataset_by_name is not None: _raise_on_duplicate_storage(StorageTypes.DATASET, 'name', name) - existing_dataset_by_id.name = name + existing_dataset_by_id._name = name - previous_dir = existing_dataset_by_id.dataset_directory + previous_dir = existing_dataset_by_id._dataset_directory - existing_dataset_by_id.dataset_directory = os.path.join(self.client.datasets_directory, name) + existing_dataset_by_id._dataset_directory = os.path.join(self._client._datasets_directory, name) - await _force_rename(previous_dir, existing_dataset_by_id.dataset_directory) + await _force_rename(previous_dir, existing_dataset_by_id._dataset_directory) # Update timestamps await existing_dataset_by_id._update_timestamps(True) @@ -88,14 +92,14 @@ async def update(self, *, name: Optional[str] = None) -> Dict: async def delete(self) -> None: """TODO: docs.""" - dataset = next((dataset for dataset in self.client.datasets_handled if dataset.id == self.id), None) + dataset = next((dataset for dataset in self._client._datasets_handled if dataset._id == self._id), None) if dataset is not None: - self.client.datasets_handled.remove(dataset) - dataset.item_count = 0 - dataset.dataset_entries.clear() + self._client._datasets_handled.remove(dataset) + dataset._item_count = 0 + dataset._dataset_entries.clear() - await aioshutil.rmtree(dataset.dataset_directory) + await aioshutil.rmtree(dataset._dataset_directory) async def list_items( self, @@ -114,13 +118,13 @@ async def list_items( ) -> ListPage: """TODO: docs.""" # Check by id - existing_dataset_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + existing_dataset_by_id = _find_or_cache_dataset_by_possible_id(client=self._client, entry_name_or_id=self._name or self._id) if existing_dataset_by_id is None: - _raise_on_non_existing_storage(StorageTypes.DATASET, self.id) + _raise_on_non_existing_storage(StorageTypes.DATASET, self._id) start, end = existing_dataset_by_id._get_start_and_end_indexes( - max(existing_dataset_by_id.item_count - offset - limit, 0) if desc else offset or 0, + max(existing_dataset_by_id._item_count - offset - limit, 0) if desc else offset or 0, limit, ) @@ -128,7 +132,7 @@ async def list_items( for idx in range(start, end): entry_number = self._generate_local_entry_name(idx) - items.append(existing_dataset_by_id.dataset_entries[entry_number]) + items.append(existing_dataset_by_id._dataset_entries[entry_number]) await existing_dataset_by_id._update_timestamps(False) @@ -141,7 +145,7 @@ async def list_items( 'items': items, 'limit': 
limit, 'offset': offset, - 'total': existing_dataset_by_id.item_count, + 'total': existing_dataset_by_id._item_count, }) async def iterate_items( @@ -235,58 +239,58 @@ async def stream_items( async def push_items(self, items: JSONSerializable) -> None: """TODO: docs.""" # Check by id - existing_dataset_by_id = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + existing_dataset_by_id = _find_or_cache_dataset_by_possible_id(client=self._client, entry_name_or_id=self._name or self._id) if existing_dataset_by_id is None: - _raise_on_non_existing_storage(StorageTypes.DATASET, self.id) + _raise_on_non_existing_storage(StorageTypes.DATASET, self._id) normalized = self._normalize_items(items) added_ids: List[str] = [] for entry in normalized: - existing_dataset_by_id.item_count += 1 - idx = self._generate_local_entry_name(existing_dataset_by_id.item_count) + existing_dataset_by_id._item_count += 1 + idx = self._generate_local_entry_name(existing_dataset_by_id._item_count) - existing_dataset_by_id.dataset_entries[idx] = entry + existing_dataset_by_id._dataset_entries[idx] = entry added_ids.append(idx) data_entries: List[Tuple[str, Dict]] = [] for id in added_ids: - data_entries.append((id, existing_dataset_by_id.dataset_entries[id])) + data_entries.append((id, existing_dataset_by_id._dataset_entries[id])) await existing_dataset_by_id._update_timestamps(True) await _update_dataset_items( data=data_entries, - entity_directory=existing_dataset_by_id.dataset_directory, - persist_storage=self.client.persist_storage, + entity_directory=existing_dataset_by_id._dataset_directory, + persist_storage=self._client._persist_storage, ) def to_dataset_info(self) -> Dict: """TODO: docs.""" return { - 'id': self.id, - 'name': self.name, - 'itemCount': self.item_count, - 'accessedAt': self.accessed_at, - 'createdAt': self.created_at, - 'modifiedAt': self.modified_at, + 'id': self._id, + 'name': self._name, + 'itemCount': self._item_count, + 'accessedAt': self._accessed_at, + 'createdAt': self._created_at, + 'modifiedAt': self._modified_at, } async def _update_timestamps(self, has_been_modified: bool) -> None: """TODO: docs.""" - self.accessed_at = datetime.utcnow() + self._accessed_at = datetime.utcnow() if has_been_modified: - self.modified_at = datetime.utcnow() + self._modified_at = datetime.utcnow() dataset_info = self.to_dataset_info() - await _update_metadata(data=dataset_info, entity_directory=self.dataset_directory, write_metadata=self.client.write_metadata) + await _update_metadata(data=dataset_info, entity_directory=self._dataset_directory, write_metadata=self._client._write_metadata) def _get_start_and_end_indexes(self, offset: int, limit: Optional[int] = None) -> Tuple[int, int]: - actual_limit = limit or self.item_count + actual_limit = limit or self._item_count start = offset + 1 - end = min(offset + actual_limit, self.item_count) + 1 + end = min(offset + actual_limit, self._item_count) + 1 return (start, end) def _generate_local_entry_name(self, idx: int) -> str: @@ -316,13 +320,13 @@ def normalize_item(item: Any) -> Optional[Dict]: def _find_or_cache_dataset_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['DatasetClient']: # First check memory cache - found = next((dataset for dataset in client.datasets_handled - if dataset.id == entry_name_or_id or (dataset.name and dataset.name.lower() == entry_name_or_id.lower())), None) + found = next((dataset for dataset in client._datasets_handled + if dataset._id == entry_name_or_id or 
(dataset._name and dataset._name.lower() == entry_name_or_id.lower())), None) if found is not None: return found - datasets_dir = os.path.join(client.datasets_directory, entry_name_or_id) + datasets_dir = os.path.join(client._datasets_directory, entry_name_or_id) # Check if directory exists if not os.access(datasets_dir, os.F_OK): return None @@ -372,17 +376,17 @@ def _find_or_cache_dataset_by_possible_id(client: 'MemoryStorage', entry_name_or else: name = entry_name_or_id - new_client = DatasetClient(base_storage_directory=client.datasets_directory, client=client, id=id, name=name) + new_client = DatasetClient(base_storage_directory=client._datasets_directory, client=client, id=id, name=name) # Overwrite properties - new_client.accessed_at = accessed_at - new_client.created_at = created_at - new_client.modified_at = modified_at - new_client.item_count = item_count + new_client._accessed_at = accessed_at + new_client._created_at = created_at + new_client._modified_at = modified_at + new_client._item_count = item_count for entry_id, content in entries.items(): - new_client.dataset_entries[entry_id] = content + new_client._dataset_entries[entry_id] = content - client.datasets_handled.append(new_client) + client._datasets_handled.append(new_client) return new_client diff --git a/src/apify/memory_storage/resource_clients/dataset_collection.py b/src/apify/memory_storage/resource_clients/dataset_collection.py index 1542db5b..9dfca819 100644 --- a/src/apify/memory_storage/resource_clients/dataset_collection.py +++ b/src/apify/memory_storage/resource_clients/dataset_collection.py @@ -12,38 +12,41 @@ class DatasetCollectionClient: """TODO: docs.""" + _datasets_directory: str + _client: 'MemoryStorage' + def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage') -> None: """TODO: docs.""" - self.datasets_directory = base_storage_directory - self.client = client + self._datasets_directory = base_storage_directory + self._client = client def list(self) -> ListPage: """TODO: docs.""" def map_store(store: DatasetClient) -> Dict: return store.to_dataset_info() return ListPage({ - 'total': len(self.client.datasets_handled), - 'count': len(self.client.datasets_handled), + 'total': len(self._client._datasets_handled), + 'count': len(self._client._datasets_handled), 'offset': 0, - 'limit': len(self.client.datasets_handled), + 'limit': len(self._client._datasets_handled), 'desc': False, - 'items': sorted(map(map_store, self.client.datasets_handled), key=itemgetter('createdAt')), + 'items': sorted(map(map_store, self._client._datasets_handled), key=itemgetter('createdAt')), }) async def get_or_create(self, *, name: Optional[str] = None, _schema: Optional[Dict] = None) -> Dict: """TODO: docs.""" if name: - found = _find_or_cache_dataset_by_possible_id(client=self.client, entry_name_or_id=name) + found = _find_or_cache_dataset_by_possible_id(client=self._client, entry_name_or_id=name) if found: return found.to_dataset_info() - new_store = DatasetClient(name=name, base_storage_directory=self.datasets_directory, client=self.client) - self.client.datasets_handled.append(new_store) + new_dataset = DatasetClient(name=name, base_storage_directory=self._datasets_directory, client=self._client) + self._client._datasets_handled.append(new_dataset) - dataset_info = new_store.to_dataset_info() + dataset_info = new_dataset.to_dataset_info() # Write to the disk - await _update_metadata(data=dataset_info, entity_directory=new_store.dataset_directory, write_metadata=self.client.write_metadata) + await 
_update_metadata(data=dataset_info, entity_directory=new_dataset._dataset_directory, write_metadata=self._client._write_metadata) return dataset_info diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index 640445d2..bc6acc94 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -1,3 +1,4 @@ +import io import json import mimetypes import os @@ -32,22 +33,26 @@ class KeyValueStoreClient: """TODO: docs.""" - created_at = datetime.utcnow() - accessed_at = datetime.utcnow() - modified_at = datetime.utcnow() - key_value_entries: Dict[str, Dict] + _id: str + _key_value_store_directory: str + _client: 'MemoryStorage' + _name: str + _key_value_entries: Dict[str, Dict] + _created_at = datetime.utcnow() + _accessed_at = datetime.utcnow() + _modified_at = datetime.utcnow() def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: """TODO: docs.""" - self.id = str(uuid.uuid4()) if id is None else id - self.key_value_store_directory = os.path.join(base_storage_directory, name or self.id) - self.client = client - self.name = name - self.key_value_entries = {} + self._id = str(uuid.uuid4()) if id is None else id + self._key_value_store_directory = os.path.join(base_storage_directory, name or self._id) + self._client = client + self._name = name + self._key_value_entries = {} async def get(self) -> Optional[Dict]: """TODO: docs.""" - found = _find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + found = _find_or_cache_key_value_store_by_possible_id(client=self._client, entry_name_or_id=self._name or self._id) if found: await found._update_timestamps(False) @@ -58,10 +63,10 @@ async def get(self) -> Optional[Dict]: async def update(self, *, name: Optional[str] = None) -> Dict: """TODO: docs.""" # Check by id - existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=self.name or self.id) + existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(client=self._client, entry_name_or_id=self._name or self._id) if existing_store_by_id is None: - _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) + _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) # Skip if no changes if name is None: @@ -69,18 +74,18 @@ async def update(self, *, name: Optional[str] = None) -> Dict: # Check that name is not in use already existing_store_by_name = next( - (store for store in self.client.key_value_stores_handled if store.name and store.name.lower() == name.lower()), None) + (store for store in self._client._key_value_stores_handled if store._name and store._name.lower() == name.lower()), None) if existing_store_by_name is not None: _raise_on_duplicate_storage(StorageTypes.KEY_VALUE_STORE, 'name', name) - existing_store_by_id.name = name + existing_store_by_id._name = name - previous_dir = existing_store_by_id.key_value_store_directory + previous_dir = existing_store_by_id._key_value_store_directory - existing_store_by_id.key_value_store_directory = os.path.join(self.client.key_value_stores_directory, name) + existing_store_by_id._key_value_store_directory = os.path.join(self._client._key_value_stores_directory, name) - await _force_rename(previous_dir, existing_store_by_id.key_value_store_directory) + await 
_force_rename(previous_dir, existing_store_by_id._key_value_store_directory) # Update timestamps await existing_store_by_id._update_timestamps(True) @@ -89,25 +94,25 @@ async def update(self, *, name: Optional[str] = None) -> Dict: async def delete(self) -> None: """TODO: docs.""" - store = next((store for store in self.client.key_value_stores_handled if store.id == self.id), None) + store = next((store for store in self._client._key_value_stores_handled if store._id == self._id), None) if store is not None: - self.client.key_value_stores_handled.remove(store) - store.key_value_entries.clear() + self._client._key_value_stores_handled.remove(store) + store._key_value_entries.clear() - await aioshutil.rmtree(store.key_value_store_directory) + await aioshutil.rmtree(store._key_value_store_directory) async def list_keys(self, *, limit: int = DEFAULT_API_PARAM_LIMIT, exclusive_start_key: Optional[str] = None) -> Dict: """TODO: docs.""" # Check by id - existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self._client, self._name or self._id) if existing_store_by_id is None: - _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) + _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) items = [] - for record in existing_store_by_id.key_value_entries.values(): + for record in existing_store_by_id._key_value_entries.values(): size = len(record['value']) items.append({ 'key': record['key'], @@ -143,12 +148,12 @@ async def list_keys(self, *, limit: int = DEFAULT_API_PARAM_LIMIT, exclusive_sta async def _get_record_internal(self, key: str, as_bytes: bool = False) -> Optional[Dict]: # Check by id - existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self._client, self._name or self._id) if existing_store_by_id is None: - _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) + _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - entry = existing_store_by_id.key_value_entries.get(key) + entry = existing_store_by_id._key_value_entries.get(key) if entry is None: return None @@ -182,16 +187,17 @@ async def stream_record(self, _key: str) -> AsyncIterator[Optional[Dict]]: async def set_record(self, key: str, value: Any, content_type: Optional[str] = None) -> None: """TODO: docs.""" # Check by id - existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self._client, self._name or self._id) if existing_store_by_id is None: - _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) + _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) + + if isinstance(value, io.IOBase): + raise NotImplementedError('File-like values are not supported in local memory storage') if content_type is None: - # TODO: Add streaming support for this method... 
if _is_file_or_bytes(value): - raise NotImplementedError('Such value for set_record is not supported in local memory storage') - # content_type = 'application/octet-stream' + content_type = 'application/octet-stream' elif isinstance(value, str): content_type = 'text/plain; charset=utf-8' else: @@ -209,69 +215,69 @@ async def set_record(self, key: str, value: Any, content_type: Optional[str] = N 'content_type': content_type, } - existing_store_by_id.key_value_entries[key] = record + existing_store_by_id._key_value_entries[key] = record await existing_store_by_id._update_timestamps(True) await _set_or_delete_key_value_store_record( - entity_directory=existing_store_by_id.key_value_store_directory, - persist_storage=self.client.persist_storage, + entity_directory=existing_store_by_id._key_value_store_directory, + persist_storage=self._client._persist_storage, record=record, should_set=True, - write_metadata=self.client.write_metadata, + write_metadata=self._client._write_metadata, ) async def delete_record(self, key: str) -> None: """TODO: docs.""" # Check by id - existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self.client, self.name or self.id) + existing_store_by_id = _find_or_cache_key_value_store_by_possible_id(self._client, self._name or self._id) if existing_store_by_id is None: - _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self.id) + _raise_on_non_existing_storage(StorageTypes.KEY_VALUE_STORE, self._id) - entry = existing_store_by_id.key_value_entries.get(key) + entry = existing_store_by_id._key_value_entries.get(key) if entry is not None: - del existing_store_by_id.key_value_entries[key] + del existing_store_by_id._key_value_entries[key] await existing_store_by_id._update_timestamps(True) await _set_or_delete_key_value_store_record( - entity_directory=existing_store_by_id.key_value_store_directory, - persist_storage=self.client.persist_storage, + entity_directory=existing_store_by_id._key_value_store_directory, + persist_storage=self._client._persist_storage, record=entry, should_set=False, - write_metadata=self.client.write_metadata, + write_metadata=self._client._write_metadata, ) def to_key_value_store_info(self) -> Dict: """TODO: docs.""" return { - 'id': self.id, - 'name': self.name, - 'accessedAt': self.accessed_at, - 'createdAt': self.created_at, - 'modifiedAt': self.modified_at, + 'id': self._id, + 'name': self._name, + 'accessedAt': self._accessed_at, + 'createdAt': self._created_at, + 'modifiedAt': self._modified_at, 'userId': '1', } async def _update_timestamps(self, has_been_modified: bool) -> None: """TODO: docs.""" - self.accessed_at = datetime.utcnow() + self._accessed_at = datetime.utcnow() if has_been_modified: - self.modified_at = datetime.utcnow() + self._modified_at = datetime.utcnow() kv_store_info = self.to_key_value_store_info() - await _update_metadata(data=kv_store_info, entity_directory=self.key_value_store_directory, write_metadata=self.client.write_metadata) + await _update_metadata(data=kv_store_info, entity_directory=self._key_value_store_directory, write_metadata=self._client._write_metadata) def _find_or_cache_key_value_store_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['KeyValueStoreClient']: # First check memory cache - found = next((store for store in client.key_value_stores_handled - if store.id == entry_name_or_id or (store.name and store.name.lower() == entry_name_or_id.lower())), None) + found = next((store for store in client._key_value_stores_handled + if store._id == 
entry_name_or_id or (store._name and store._name.lower() == entry_name_or_id.lower())), None) if found is not None: return found - key_value_store_dir = os.path.join(client.key_value_stores_directory, entry_name_or_id) + key_value_store_dir = os.path.join(client._key_value_stores_directory, entry_name_or_id) # Check if directory exists if not os.access(key_value_store_dir, os.F_OK): return None @@ -380,16 +386,16 @@ def _find_or_cache_key_value_store_by_possible_id(client: 'MemoryStorage', entry else: name = entry_name_or_id - new_client = KeyValueStoreClient(base_storage_directory=client.key_value_stores_directory, client=client, id=id, name=name) + new_client = KeyValueStoreClient(base_storage_directory=client._key_value_stores_directory, client=client, id=id, name=name) # Overwrite properties - new_client.accessed_at = accessed_at - new_client.created_at = created_at - new_client.modified_at = modified_at + new_client._accessed_at = accessed_at + new_client._created_at = created_at + new_client._modified_at = modified_at for key, record in internal_records.items(): - new_client.key_value_entries[key] = record + new_client._key_value_entries[key] = record - client.key_value_stores_handled.append(new_client) + client._key_value_stores_handled.append(new_client) return new_client diff --git a/src/apify/memory_storage/resource_clients/key_value_store_collection.py b/src/apify/memory_storage/resource_clients/key_value_store_collection.py index e170d3a6..33271afe 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store_collection.py +++ b/src/apify/memory_storage/resource_clients/key_value_store_collection.py @@ -12,38 +12,41 @@ class KeyValueStoreCollectionClient: """TODO: docs.""" + _key_value_stores_directory: str + _client: 'MemoryStorage' + def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage') -> None: """TODO: docs.""" - self.key_value_stores_directory = base_storage_directory - self.client = client + self._key_value_stores_directory = base_storage_directory + self._client = client def list(self) -> ListPage: """TODO: docs.""" def map_store(store: KeyValueStoreClient) -> Dict: return store.to_key_value_store_info() return ListPage({ - 'total': len(self.client.key_value_stores_handled), - 'count': len(self.client.key_value_stores_handled), + 'total': len(self._client._key_value_stores_handled), + 'count': len(self._client._key_value_stores_handled), 'offset': 0, - 'limit': len(self.client.key_value_stores_handled), + 'limit': len(self._client._key_value_stores_handled), 'desc': False, - 'items': sorted(map(map_store, self.client.key_value_stores_handled), key=itemgetter('createdAt')), + 'items': sorted(map(map_store, self._client._key_value_stores_handled), key=itemgetter('createdAt')), }) async def get_or_create(self, *, name: Optional[str] = None, _schema: Optional[Dict] = None) -> Dict: """TODO: docs.""" if name: - found = _find_or_cache_key_value_store_by_possible_id(client=self.client, entry_name_or_id=name) + found = _find_or_cache_key_value_store_by_possible_id(client=self._client, entry_name_or_id=name) if found: return found.to_key_value_store_info() - new_store = KeyValueStoreClient(name=name, base_storage_directory=self.key_value_stores_directory, client=self.client) - self.client.key_value_stores_handled.append(new_store) + new_store = KeyValueStoreClient(name=name, base_storage_directory=self._key_value_stores_directory, client=self._client) + self._client._key_value_stores_handled.append(new_store) kv_store_info = 
new_store.to_key_value_store_info() # Write to the disk - await _update_metadata(data=kv_store_info, entity_directory=new_store.key_value_store_directory, write_metadata=self.client.write_metadata) + await _update_metadata(data=kv_store_info, entity_directory=new_store._key_value_store_directory, write_metadata=self._client._write_metadata) return kv_store_info diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index e26b4855..a5ea9626 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -25,24 +25,28 @@ class RequestQueueClient: """TODO: docs.""" - created_at = datetime.utcnow() - accessed_at = datetime.utcnow() - modified_at = datetime.utcnow() - handled_request_count = 0 # TODO: Does not seem to be implemented in crawelee, always 0 - pending_request_count = 0 - requests: Dict[str, Dict] + _id: str + _request_queue_directory: str + _client: 'MemoryStorage' + _name: str + _requests: Dict[str, Dict] + _created_at = datetime.utcnow() + _accessed_at = datetime.utcnow() + _modified_at = datetime.utcnow() + _handled_request_count = 0 # TODO: Does not seem to be implemented in crawelee, always 0 + _pending_request_count = 0 def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage', id: Optional[str] = None, name: Optional[str] = None) -> None: """TODO: docs.""" - self.id = str(uuid.uuid4()) if id is None else id - self.request_queue_directory = os.path.join(base_storage_directory, name or self.id) - self.client = client - self.name = name - self.requests = {} + self._id = str(uuid.uuid4()) if id is None else id + self._request_queue_directory = os.path.join(base_storage_directory, name or self._id) + self._client = client + self._name = name + self._requests = {} async def get(self) -> Optional[Dict]: """TODO: docs.""" - found = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + found = _find_or_cache_request_queue_by_possible_id(self._client, self._name or self._id) if found: await found._update_timestamps(False) @@ -53,10 +57,10 @@ async def get(self) -> Optional[Dict]: async def update(self, *, name: Optional[str] = None) -> Dict: """TODO: docs.""" # Check by id - existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self._client, self._name or self._id) if existing_queue_by_id is None: - _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) # Skip if no changes if name is None: @@ -64,18 +68,18 @@ async def update(self, *, name: Optional[str] = None) -> Dict: # Check that name is not in use already existing_queue_by_name = next( - (queue for queue in self.client.request_queues_handled if queue.name and queue.name.lower() == name.lower()), None) + (queue for queue in self._client._request_queues_handled if queue._name and queue._name.lower() == name.lower()), None) if existing_queue_by_name is not None: _raise_on_duplicate_storage(StorageTypes.REQUEST_QUEUE, 'name', name) - existing_queue_by_id.name = name + existing_queue_by_id._name = name - previous_dir = existing_queue_by_id.request_queue_directory + previous_dir = existing_queue_by_id._request_queue_directory - existing_queue_by_id.request_queue_directory = os.path.join(self.client.request_queues_directory, name) + 
existing_queue_by_id._request_queue_directory = os.path.join(self._client._request_queues_directory, name) - await _force_rename(previous_dir, existing_queue_by_id.request_queue_directory) + await _force_rename(previous_dir, existing_queue_by_id._request_queue_directory) # Update timestamps await existing_queue_by_id._update_timestamps(True) @@ -84,27 +88,27 @@ async def update(self, *, name: Optional[str] = None) -> Dict: async def delete(self) -> None: """TODO: docs.""" - queue = next((queue for queue in self.client.request_queues_handled if queue.id == self.id), None) + queue = next((queue for queue in self._client._request_queues_handled if queue._id == self._id), None) if queue is not None: - self.client.request_queues_handled.remove(queue) - queue.pending_request_count = 0 - queue.requests.clear() + self._client._request_queues_handled.remove(queue) + queue._pending_request_count = 0 + queue._requests.clear() - await aioshutil.rmtree(queue.request_queue_directory) + await aioshutil.rmtree(queue._request_queue_directory) async def list_head(self, *, limit: Optional[int] = None) -> Dict: """TODO: docs.""" - existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self._client, self._name or self._id) if existing_queue_by_id is None: - _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) await existing_queue_by_id._update_timestamps(False) items: List[Dict] = [] - for request in existing_queue_by_id.requests.values(): + for request in existing_queue_by_id._requests.values(): if len(items) == limit: break @@ -114,21 +118,21 @@ async def list_head(self, *, limit: Optional[int] = None) -> Dict: return { 'limit': limit, 'hadMultipleClients': False, - 'queueModifiedAt': existing_queue_by_id.modified_at, + 'queueModifiedAt': existing_queue_by_id._modified_at, 'items': list(map(lambda item: self._json_to_request(item['json']), items)), } async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) -> Dict: """TODO: docs.""" # TODO: Throw if uniqueKey or url missing from request dict, also do for update_request... 
- existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self._client, self._name or self._id) if existing_queue_by_id is None: - _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) request_model = self._create_internal_request(request, forefront) - existing_request_with_id = existing_queue_by_id.requests.get(request_model['id']) + existing_request_with_id = existing_queue_by_id._requests.get(request_model['id']) # We already have the request present, so we return information about it if existing_request_with_id is not None: @@ -140,15 +144,15 @@ async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) 'wasAlreadyPresent': True, } - existing_queue_by_id.requests[request_model['id']] = request_model + existing_queue_by_id._requests[request_model['id']] = request_model # TODO: Validate the next line logic, seems wrong in crawlee - existing_queue_by_id.pending_request_count += 0 if request_model['orderNo'] is None else 1 + existing_queue_by_id._pending_request_count += 0 if request_model['orderNo'] is None else 1 await existing_queue_by_id._update_timestamps(True) await _update_request_queue_item( request=request_model, request_id=request_model['id'], - entity_directory=existing_queue_by_id.request_queue_directory, - persist_storage=self.client.persist_storage, + entity_directory=existing_queue_by_id._request_queue_directory, + persist_storage=self._client._persist_storage, ) return { @@ -161,29 +165,29 @@ async def add_request(self, request: Dict, *, forefront: Optional[bool] = None) async def get_request(self, request_id: str) -> Optional[Dict]: """TODO: docs.""" - existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self._client, self._name or self._id) if existing_queue_by_id is None: - _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) await existing_queue_by_id._update_timestamps(False) - request = existing_queue_by_id.requests.get(request_id) + request = existing_queue_by_id._requests.get(request_id) return self._json_to_request(request['json'] if request is not None else None) async def update_request(self, request: Dict, *, forefront: Optional[bool] = None) -> Dict: """TODO: docs.""" - existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self._client, self._name or self._id) if existing_queue_by_id is None: - _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) request_model = self._create_internal_request(request, forefront) # First we need to check the existing request to be # able to return information about its handled state. - existing_request = existing_queue_by_id.requests.get(request_model['id']) + existing_request = existing_queue_by_id._requests.get(request_model['id']) # Undefined means that the request is not present in the queue. # We need to insert it, to behave the same as API. 
@@ -192,7 +196,7 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non # When updating the request, we need to make sure that # the handled counts are updated correctly in all cases. - existing_queue_by_id.requests[request_model['id']] = request_model + existing_queue_by_id._requests[request_model['id']] = request_model pending_count_adjustment = 0 is_request_handled_state_changing = type(existing_request['orderNo']) != type(request_model['orderNo']) # noqa @@ -203,13 +207,13 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non if is_request_handled_state_changing: pending_count_adjustment = 1 if request_was_handled_before_update else -1 - existing_queue_by_id.pending_request_count += pending_count_adjustment + existing_queue_by_id._pending_request_count += pending_count_adjustment await existing_queue_by_id._update_timestamps(True) await _update_request_queue_item( request=request_model, request_id=request_model['id'], - entity_directory=existing_queue_by_id.request_queue_directory, - persist_storage=self.client.persist_storage, + entity_directory=existing_queue_by_id._request_queue_directory, + persist_storage=self._client._persist_storage, ) return { @@ -220,44 +224,44 @@ async def update_request(self, request: Dict, *, forefront: Optional[bool] = Non async def delete_request(self, request_id: str) -> None: """TODO: docs.""" - existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self.client, self.name or self.id) + existing_queue_by_id = _find_or_cache_request_queue_by_possible_id(self._client, self._name or self._id) if existing_queue_by_id is None: - _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self.id) + _raise_on_non_existing_storage(StorageTypes.REQUEST_QUEUE, self._id) - request = existing_queue_by_id.requests.get(request_id) + request = existing_queue_by_id._requests.get(request_id) if request: - del existing_queue_by_id.requests[request_id] - existing_queue_by_id.pending_request_count -= 0 if request['orderNo'] is None else 1 + del existing_queue_by_id._requests[request_id] + existing_queue_by_id._pending_request_count -= 0 if request['orderNo'] is None else 1 await existing_queue_by_id._update_timestamps(True) - await _delete_request(entity_directory=existing_queue_by_id.request_queue_directory, request_id=request_id) + await _delete_request(entity_directory=existing_queue_by_id._request_queue_directory, request_id=request_id) def to_request_queue_info(self) -> Dict: """TODO: docs.""" return { - 'accessedAt': self.accessed_at, - 'createdAt': self.created_at, + 'accessedAt': self._accessed_at, + 'createdAt': self._created_at, 'hadMultipleClients': False, - 'handledRequestCount': self.handled_request_count, - 'id': self.id, - 'modifiedAt': self.modified_at, - 'name': self.name, - 'pendingRequestCount': self.pending_request_count, + 'handledRequestCount': self._handled_request_count, + 'id': self._id, + 'modifiedAt': self._modified_at, + 'name': self._name, + 'pendingRequestCount': self._pending_request_count, 'stats': {}, - 'totalRequestCount': len(self.requests), + 'totalRequestCount': len(self._requests), 'userId': '1', } async def _update_timestamps(self, has_been_modified: bool) -> None: """TODO: docs.""" - self.accessed_at = datetime.utcnow() + self._accessed_at = datetime.utcnow() if has_been_modified: - self.modified_at = datetime.utcnow() + self._modified_at = datetime.utcnow() request_queue_info = self.to_request_queue_info() - await _update_metadata(data=request_queue_info, 
entity_directory=self.request_queue_directory, write_metadata=self.client.write_metadata) + await _update_metadata(data=request_queue_info, entity_directory=self._request_queue_directory, write_metadata=self._client._write_metadata) def _json_to_request(self, request_json: Optional[str]) -> Optional[dict]: if request_json is None: @@ -294,13 +298,13 @@ def _calculate_order_no(self, request: Dict, forefront: Optional[bool]) -> Optio def _find_or_cache_request_queue_by_possible_id(client: 'MemoryStorage', entry_name_or_id: str) -> Optional['RequestQueueClient']: # First check memory cache - found = next((queue for queue in client.request_queues_handled - if queue.id == entry_name_or_id or (queue.name and queue.name.lower() == entry_name_or_id.lower())), None) + found = next((queue for queue in client._request_queues_handled + if queue._id == entry_name_or_id or (queue._name and queue._name.lower() == entry_name_or_id.lower())), None) if found is not None: return found - request_queues_dir = os.path.join(client.request_queues_directory, entry_name_or_id) + request_queues_dir = os.path.join(client._request_queues_directory, entry_name_or_id) # Check if directory exists if not os.access(request_queues_dir, os.F_OK): return None @@ -343,18 +347,18 @@ def _find_or_cache_request_queue_by_possible_id(client: 'MemoryStorage', entry_n else: name = entry_name_or_id - new_client = RequestQueueClient(base_storage_directory=client.request_queues_directory, client=client, id=id, name=name) + new_client = RequestQueueClient(base_storage_directory=client._request_queues_directory, client=client, id=id, name=name) # Overwrite properties - new_client.accessed_at = accessed_at - new_client.created_at = created_at - new_client.modified_at = modified_at - new_client.handled_request_count = handled_request_count - new_client.pending_request_count = pending_request_count + new_client._accessed_at = accessed_at + new_client._created_at = created_at + new_client._modified_at = modified_at + new_client._handled_request_count = handled_request_count + new_client._pending_request_count = pending_request_count for request in entries: - new_client.requests[request['id']] = request + new_client._requests[request['id']] = request - client.request_queues_handled.append(new_client) + client._request_queues_handled.append(new_client) return new_client diff --git a/src/apify/memory_storage/resource_clients/request_queue_collection.py b/src/apify/memory_storage/resource_clients/request_queue_collection.py index 6927f5d2..4393b60b 100644 --- a/src/apify/memory_storage/resource_clients/request_queue_collection.py +++ b/src/apify/memory_storage/resource_clients/request_queue_collection.py @@ -12,38 +12,45 @@ class RequestQueueCollectionClient: """TODO: docs.""" + _request_queues_directory: str + _client: 'MemoryStorage' + def __init__(self, *, base_storage_directory: str, client: 'MemoryStorage') -> None: """TODO: docs.""" - self.request_queues_directory = base_storage_directory - self.client = client + self._request_queues_directory = base_storage_directory + self._client = client def list(self) -> ListPage: """TODO: docs.""" def map_store(store: RequestQueueClient) -> Dict: return store.to_request_queue_info() return ListPage({ - 'total': len(self.client.request_queues_handled), - 'count': len(self.client.request_queues_handled), + 'total': len(self._client._request_queues_handled), + 'count': len(self._client._request_queues_handled), 'offset': 0, - 'limit': len(self.client.request_queues_handled), + 'limit': 
len(self._client._request_queues_handled), 'desc': False, - 'items': sorted(map(map_store, self.client.request_queues_handled), key=itemgetter('createdAt')), + 'items': sorted(map(map_store, self._client._request_queues_handled), key=itemgetter('createdAt')), }) async def get_or_create(self, *, name: Optional[str] = None) -> Dict: """TODO: docs.""" if name: - found = _find_or_cache_request_queue_by_possible_id(self.client, name) + found = _find_or_cache_request_queue_by_possible_id(self._client, name) if found: return found.to_request_queue_info() - new_store = RequestQueueClient(name=name, base_storage_directory=self.request_queues_directory, client=self.client) - self.client.request_queues_handled.append(new_store) + new_queue = RequestQueueClient(name=name, base_storage_directory=self._request_queues_directory, client=self._client) + self._client._request_queues_handled.append(new_queue) - request_queue_info = new_store.to_request_queue_info() + request_queue_info = new_queue.to_request_queue_info() # Write to the disk - await _update_metadata(data=request_queue_info, entity_directory=new_store.request_queue_directory, write_metadata=self.client.write_metadata) + await _update_metadata( + data=request_queue_info, + entity_directory=new_queue._request_queue_directory, + write_metadata=self._client._write_metadata, + ) return request_queue_info diff --git a/tests/unit/memory_storage/resource_clients/test_dataset.py b/tests/unit/memory_storage/resource_clients/test_dataset.py index cafbab51..071e5c60 100644 --- a/tests/unit/memory_storage/resource_clients/test_dataset.py +++ b/tests/unit/memory_storage/resource_clients/test_dataset.py @@ -32,7 +32,7 @@ async def test_not_implemented(dataset_client: DatasetClient) -> None: async def test_get(dataset_client: DatasetClient) -> None: info = await dataset_client.get() assert info is not None - assert info['id'] == dataset_client.id + assert info['id'] == dataset_client._id assert info['accessedAt'] != info['createdAt'] @@ -40,8 +40,8 @@ async def test_update(dataset_client: DatasetClient) -> None: new_dataset_name = 'test-update' old_dataset_info = await dataset_client.get() assert old_dataset_info is not None - old_dataset_directory = os.path.join(dataset_client.client.datasets_directory, old_dataset_info['name']) - new_dataset_directory = os.path.join(dataset_client.client.datasets_directory, new_dataset_name) + old_dataset_directory = os.path.join(dataset_client._client._datasets_directory, old_dataset_info['name']) + new_dataset_directory = os.path.join(dataset_client._client._datasets_directory, new_dataset_name) assert os.path.exists(os.path.join(old_dataset_directory, '__metadata__.json')) is True assert os.path.exists(os.path.join(new_dataset_directory, '__metadata__.json')) is False updated_dataset_info = await dataset_client.update(name=new_dataset_name) @@ -59,7 +59,7 @@ async def test_update(dataset_client: DatasetClient) -> None: async def test_delete(dataset_client: DatasetClient) -> None: dataset_info = await dataset_client.get() assert dataset_info is not None - dataset_directory = os.path.join(dataset_client.client.datasets_directory, dataset_info['name']) + dataset_directory = os.path.join(dataset_client._client._datasets_directory, dataset_info['name']) assert os.path.exists(os.path.join(dataset_directory, '__metadata__.json')) is True await dataset_client.delete() assert os.path.exists(os.path.join(dataset_directory, '__metadata__.json')) is False diff --git 
a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py index a9fcbcc2..6d3e7e68 100644 --- a/tests/unit/memory_storage/resource_clients/test_dataset_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_dataset_collection.py @@ -14,10 +14,10 @@ def datasets_client(memory_storage: MemoryStorage) -> DatasetCollectionClient: async def test_get_or_create(datasets_client: DatasetCollectionClient) -> None: dataset_name = 'test' # A new dataset gets created - assert os.path.exists(os.path.join(datasets_client.datasets_directory, dataset_name, '__metadata__.json')) is False + assert os.path.exists(os.path.join(datasets_client._datasets_directory, dataset_name, '__metadata__.json')) is False dataset_info = await datasets_client.get_or_create(name=dataset_name) assert dataset_info['name'] == dataset_name - assert os.path.exists(os.path.join(datasets_client.datasets_directory, dataset_name, '__metadata__.json')) is True + assert os.path.exists(os.path.join(datasets_client._datasets_directory, dataset_name, '__metadata__.json')) is True # Another get_or_create call returns the same dataset dataset_info_existing = await datasets_client.get_or_create(name=dataset_name) assert dataset_info['id'] == dataset_info_existing['id'] diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store.py b/tests/unit/memory_storage/resource_clients/test_key_value_store.py index fd02f03b..40386e1b 100644 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store.py +++ b/tests/unit/memory_storage/resource_clients/test_key_value_store.py @@ -34,7 +34,7 @@ async def test_not_implemented(key_value_store_client: KeyValueStoreClient) -> N async def test_get(key_value_store_client: KeyValueStoreClient) -> None: info = await key_value_store_client.get() assert info is not None - assert info['id'] == key_value_store_client.id + assert info['id'] == key_value_store_client._id assert info['accessedAt'] != info['createdAt'] @@ -42,8 +42,8 @@ async def test_update(key_value_store_client: KeyValueStoreClient) -> None: new_kvs_name = 'test-update' old_kvs_info = await key_value_store_client.get() assert old_kvs_info is not None - old_kvs_directory = os.path.join(key_value_store_client.client.key_value_stores_directory, old_kvs_info['name']) - new_kvs_directory = os.path.join(key_value_store_client.client.key_value_stores_directory, new_kvs_name) + old_kvs_directory = os.path.join(key_value_store_client._client._key_value_stores_directory, old_kvs_info['name']) + new_kvs_directory = os.path.join(key_value_store_client._client._key_value_stores_directory, new_kvs_name) assert os.path.exists(os.path.join(old_kvs_directory, '__metadata__.json')) is True assert os.path.exists(os.path.join(new_kvs_directory, '__metadata__.json')) is False updated_kvs_info = await key_value_store_client.update(name=new_kvs_name) @@ -61,7 +61,7 @@ async def test_update(key_value_store_client: KeyValueStoreClient) -> None: async def test_delete(key_value_store_client: KeyValueStoreClient) -> None: kvs_info = await key_value_store_client.get() assert kvs_info is not None - kvs_directory = os.path.join(key_value_store_client.client.key_value_stores_directory, kvs_info['name']) + kvs_directory = os.path.join(key_value_store_client._client._key_value_stores_directory, kvs_info['name']) assert os.path.exists(os.path.join(kvs_directory, '__metadata__.json')) is True await key_value_store_client.delete() assert 
os.path.exists(os.path.join(kvs_directory, '__metadata__.json')) is False @@ -97,7 +97,7 @@ async def test_list_keys(key_value_store_client: KeyValueStoreClient) -> None: assert keys_exclusive_start['items'][-1]['key'] == keys_exclusive_start['nextExclusiveStartKey'] -async def test_get_and_set_record(key_value_store_client: KeyValueStoreClient) -> None: +async def test_get_and_set_record(tmp_path: str, key_value_store_client: KeyValueStoreClient) -> None: # Test setting dict record dict_record_key = 'test-dict' await key_value_store_client.set_record(dict_record_key, {'test': 123}) @@ -115,13 +115,24 @@ async def test_get_and_set_record(key_value_store_client: KeyValueStoreClient) - # Test setting explicit json record but use str as value, i.e. json dumps is skipped explicit_json_key = 'test-json' await key_value_store_client.set_record(explicit_json_key, '{"test": "explicit string"}', 'application/json') - explicit_json_record_info = await key_value_store_client.get_record(explicit_json_key) - assert explicit_json_record_info is not None - assert 'application/json' in explicit_json_record_info['contentType'] - assert explicit_json_record_info['value']['test'] == 'explicit string' + bytes_record_info = await key_value_store_client.get_record(explicit_json_key) + assert bytes_record_info is not None + assert 'application/json' in bytes_record_info['contentType'] + assert bytes_record_info['value']['test'] == 'explicit string' # Test using bytes + bytes_key = 'test-json' + bytes_value = 'testing bytes set_record'.encode('utf-8') + await key_value_store_client.set_record(bytes_key, bytes_value, 'unknown') + bytes_record_info = await key_value_store_client.get_record(bytes_key) + assert bytes_record_info is not None + assert 'unknown' in bytes_record_info['contentType'] + assert bytes_record_info['value'] == bytes_value + assert bytes_record_info['value'].decode('utf-8') == bytes_value.decode('utf-8') + # Test using file descriptor with pytest.raises(NotImplementedError): - await key_value_store_client.set_record('bytes', 'test'.encode('utf-8')) + with open(os.path.join(tmp_path, 'test.json'), 'w+') as f: + f.write('Test') + await key_value_store_client.set_record('file', f) async def test_get_record_as_bytes(key_value_store_client: KeyValueStoreClient) -> None: diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py index f5edf10e..f984a017 100644 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_key_value_store_collection.py @@ -14,10 +14,10 @@ def key_value_stores_client(memory_storage: MemoryStorage) -> KeyValueStoreColle async def test_get_or_create(key_value_stores_client: KeyValueStoreCollectionClient) -> None: kvs_name = 'test' # A new kvs gets created - assert os.path.exists(os.path.join(key_value_stores_client.key_value_stores_directory, kvs_name, '__metadata__.json')) is False + assert os.path.exists(os.path.join(key_value_stores_client._key_value_stores_directory, kvs_name, '__metadata__.json')) is False kvs_info = await key_value_stores_client.get_or_create(name=kvs_name) assert kvs_info['name'] == kvs_name - assert os.path.exists(os.path.join(key_value_stores_client.key_value_stores_directory, kvs_name, '__metadata__.json')) is True + assert os.path.exists(os.path.join(key_value_stores_client._key_value_stores_directory, kvs_name, '__metadata__.json')) is True # Another 
get_or_create call returns the same kvs kvs_info_existing = await key_value_stores_client.get_or_create(name=kvs_name) assert kvs_info['id'] == kvs_info_existing['id'] diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue.py b/tests/unit/memory_storage/resource_clients/test_request_queue.py index d5b60bb2..835ab9b7 100644 --- a/tests/unit/memory_storage/resource_clients/test_request_queue.py +++ b/tests/unit/memory_storage/resource_clients/test_request_queue.py @@ -25,7 +25,7 @@ async def test_nonexistent(memory_storage: MemoryStorage) -> None: async def test_get(request_queue_client: RequestQueueClient) -> None: info = await request_queue_client.get() assert info is not None - assert info['id'] == request_queue_client.id + assert info['id'] == request_queue_client._id assert info['accessedAt'] != info['createdAt'] @@ -33,8 +33,8 @@ async def test_update(request_queue_client: RequestQueueClient) -> None: new_rq_name = 'test-update' old_rq_info = await request_queue_client.get() assert old_rq_info is not None - old_rq_directory = os.path.join(request_queue_client.client.request_queues_directory, old_rq_info['name']) - new_rq_directory = os.path.join(request_queue_client.client.request_queues_directory, new_rq_name) + old_rq_directory = os.path.join(request_queue_client._client._request_queues_directory, old_rq_info['name']) + new_rq_directory = os.path.join(request_queue_client._client._request_queues_directory, new_rq_name) assert os.path.exists(os.path.join(old_rq_directory, '__metadata__.json')) is True assert os.path.exists(os.path.join(new_rq_directory, '__metadata__.json')) is False updated_rq_info = await request_queue_client.update(name=new_rq_name) @@ -52,7 +52,7 @@ async def test_update(request_queue_client: RequestQueueClient) -> None: async def test_delete(request_queue_client: RequestQueueClient) -> None: rq_info = await request_queue_client.get() assert rq_info is not None - rq_directory = os.path.join(request_queue_client.client.request_queues_directory, rq_info['name']) + rq_directory = os.path.join(request_queue_client._client._request_queues_directory, rq_info['name']) assert os.path.exists(os.path.join(rq_directory, '__metadata__.json')) is True await request_queue_client.delete() assert os.path.exists(os.path.join(rq_directory, '__metadata__.json')) is False diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py index d87e15d0..128e257b 100644 --- a/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py +++ b/tests/unit/memory_storage/resource_clients/test_request_queue_collection.py @@ -14,10 +14,10 @@ def request_queues_client(memory_storage: MemoryStorage) -> RequestQueueCollecti async def test_get_or_create(request_queues_client: RequestQueueCollectionClient) -> None: rq_name = 'test' # A new request queue gets created - assert os.path.exists(os.path.join(request_queues_client.request_queues_directory, rq_name, '__metadata__.json')) is False + assert os.path.exists(os.path.join(request_queues_client._request_queues_directory, rq_name, '__metadata__.json')) is False rq_info = await request_queues_client.get_or_create(name=rq_name) assert rq_info['name'] == rq_name - assert os.path.exists(os.path.join(request_queues_client.request_queues_directory, rq_name, '__metadata__.json')) is True + assert os.path.exists(os.path.join(request_queues_client._request_queues_directory, rq_name, '__metadata__.json')) is True # 
Another get_or_create call returns the same request queue rq_existing = await request_queues_client.get_or_create(name=rq_name) assert rq_info['id'] == rq_existing['id'] diff --git a/tests/unit/memory_storage/test_memory_storage.py b/tests/unit/memory_storage/test_memory_storage.py index 842b5c04..b01c32f5 100644 --- a/tests/unit/memory_storage/test_memory_storage.py +++ b/tests/unit/memory_storage/test_memory_storage.py @@ -1,5 +1,7 @@ import os +import pytest + from apify.memory_storage.memory_storage import MemoryStorage @@ -12,8 +14,8 @@ async def test_write_metadata(tmp_path: str) -> None: datasets_no_metadata_client = ms_no_metadata.datasets() await datasets_client.get_or_create(name=dataset_name) await datasets_no_metadata_client.get_or_create(name=dataset_no_metadata_name) - assert os.path.exists(os.path.join(ms.datasets_directory, dataset_name, '__metadata__.json')) is True - assert os.path.exists(os.path.join(ms_no_metadata.datasets_directory, dataset_no_metadata_name, '__metadata__.json')) is False + assert os.path.exists(os.path.join(ms._datasets_directory, dataset_name, '__metadata__.json')) is True + assert os.path.exists(os.path.join(ms_no_metadata._datasets_directory, dataset_no_metadata_name, '__metadata__.json')) is False async def test_persist_storage(tmp_path: str) -> None: @@ -25,8 +27,34 @@ async def test_persist_storage(tmp_path: str) -> None: kvs_no_metadata_info = await kvs_no_metadata_client.get_or_create(name='kvs-no-persist') await ms.key_value_store(id=kvs_info['id']).set_record('test', {'x': 1}, 'application/json') await ms_no_persist.key_value_store(id=kvs_no_metadata_info['id']).set_record('test', {'x': 1}, 'application/json') - assert os.path.exists(os.path.join(ms.key_value_stores_directory, kvs_info['name'], 'test.json')) is True - assert os.path.exists(os.path.join(ms_no_persist.key_value_stores_directory, kvs_no_metadata_info['name'], 'test.json')) is False + assert os.path.exists(os.path.join(ms._key_value_stores_directory, kvs_info['name'], 'test.json')) is True + assert os.path.exists(os.path.join(ms_no_persist._key_value_stores_directory, kvs_no_metadata_info['name'], 'test.json')) is False + + +def test_config_via_env_vars_persist_storage(monkeypatch: pytest.MonkeyPatch, tmp_path: str) -> None: + # Env var changes persist_storage to False + monkeypatch.setenv('APIFY_PERSIST_STORAGE', 'false') + ms = MemoryStorage(local_data_directory=tmp_path) + assert ms._persist_storage is False + monkeypatch.setenv('APIFY_PERSIST_STORAGE', '0') + ms = MemoryStorage(local_data_directory=tmp_path) + assert ms._persist_storage is False + monkeypatch.setenv('APIFY_PERSIST_STORAGE', '') + ms = MemoryStorage(local_data_directory=tmp_path) + assert ms._persist_storage is False + # Test if constructor arg takes precedence over env var value + ms = MemoryStorage(local_data_directory=tmp_path, persist_storage=True) + assert ms._persist_storage is True + + +def test_config_via_env_vars_write_metadata(monkeypatch: pytest.MonkeyPatch, tmp_path: str) -> None: + # Env var changes write_metadata to True + monkeypatch.setenv('DEBUG', '*') + ms = MemoryStorage(local_data_directory=tmp_path) + assert ms._write_metadata is True + # Test if constructor arg takes precedence over env var value + ms = MemoryStorage(local_data_directory=tmp_path, write_metadata=False) + assert ms._write_metadata is False async def test_purge_datasets(tmp_path: str) -> None: @@ -37,11 +65,11 @@ async def test_purge_datasets(tmp_path: str) -> None: non_default_dataset_info = await 
datasets_client.get_or_create(name='non-default') # Check all folders inside datasets directory before and after purge - folders_before_purge = os.listdir(ms.datasets_directory) + folders_before_purge = os.listdir(ms._datasets_directory) assert default_dataset_info['name'] in folders_before_purge assert non_default_dataset_info['name'] in folders_before_purge await ms.purge() - folders_after_purge = os.listdir(ms.datasets_directory) + folders_after_purge = os.listdir(ms._datasets_directory) assert default_dataset_info['name'] not in folders_after_purge assert non_default_dataset_info['name'] in folders_after_purge @@ -60,17 +88,17 @@ async def test_purge_key_value_stores(tmp_path: str) -> None: await default_kvs_client.set_record('test', {'abc': 123}, 'application/json') # Check all folders and files inside kvs directory before and after purge - folders_before_purge = os.listdir(ms.key_value_stores_directory) + folders_before_purge = os.listdir(ms._key_value_stores_directory) assert default_kvs_info['name'] in folders_before_purge assert non_default_kvs_info['name'] in folders_before_purge - default_folder_files_before_purge = os.listdir(os.path.join(ms.key_value_stores_directory, 'default')) + default_folder_files_before_purge = os.listdir(os.path.join(ms._key_value_stores_directory, 'default')) assert 'INPUT.json' in default_folder_files_before_purge assert 'test.json' in default_folder_files_before_purge await ms.purge() - folders_after_purge = os.listdir(ms.key_value_stores_directory) + folders_after_purge = os.listdir(ms._key_value_stores_directory) assert default_kvs_info['name'] in folders_after_purge assert non_default_kvs_info['name'] in folders_after_purge - default_folder_files_after_purge = os.listdir(os.path.join(ms.key_value_stores_directory, 'default')) + default_folder_files_after_purge = os.listdir(os.path.join(ms._key_value_stores_directory, 'default')) assert 'INPUT.json' in default_folder_files_after_purge assert 'test.json' not in default_folder_files_after_purge @@ -83,10 +111,10 @@ async def test_purge_request_queues(tmp_path: str) -> None: non_default_rq_info = await rq_client.get_or_create(name='non-default') # Check all folders inside rq directory before and after purge - folders_before_purge = os.listdir(ms.request_queues_directory) + folders_before_purge = os.listdir(ms._request_queues_directory) assert default_rq_info['name'] in folders_before_purge assert non_default_rq_info['name'] in folders_before_purge await ms.purge() - folders_after_purge = os.listdir(ms.request_queues_directory) + folders_after_purge = os.listdir(ms._request_queues_directory) assert default_rq_info['name'] not in folders_after_purge assert non_default_rq_info['name'] in folders_after_purge From b4f7a3089d123d6e59d6ca25096690d2b59d51af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Wed, 4 Jan 2023 13:40:28 +0100 Subject: [PATCH 21/23] fix lint, align interface of MemoryStorage with python client one --- src/apify/memory_storage/memory_storage.py | 17 +++++++++-------- .../resource_clients/test_dataset.py | 4 ++-- .../resource_clients/test_key_value_store.py | 4 ++-- .../resource_clients/test_request_queue.py | 4 ++-- .../unit/memory_storage/test_memory_storage.py | 6 +++--- 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/apify/memory_storage/memory_storage.py b/src/apify/memory_storage/memory_storage.py index d3d45b4e..5bd4a824 100644 --- a/src/apify/memory_storage/memory_storage.py +++ b/src/apify/memory_storage/memory_storage.py 
@@ -59,42 +59,43 @@ def datasets(self) -> DatasetCollectionClient: """Retrieve the sub-client for manipulating datasets.""" return DatasetCollectionClient(base_storage_directory=self._datasets_directory, client=self) - def dataset(self, *, id: str) -> DatasetClient: + def dataset(self, dataset_id: str) -> DatasetClient: """Retrieve the sub-client for manipulating a single dataset. Args: dataset_id (str): ID of the dataset to be manipulated """ - return DatasetClient(base_storage_directory=self._datasets_directory, client=self, id=id) + return DatasetClient(base_storage_directory=self._datasets_directory, client=self, id=dataset_id) def key_value_stores(self) -> KeyValueStoreCollectionClient: """Retrieve the sub-client for manipulating key-value stores.""" return KeyValueStoreCollectionClient(base_storage_directory=self._key_value_stores_directory, client=self) - def key_value_store(self, *, id: str) -> KeyValueStoreClient: + def key_value_store(self, key_value_store_id: str) -> KeyValueStoreClient: """Retrieve the sub-client for manipulating a single key-value store. Args: key_value_store_id (str): ID of the key-value store to be manipulated """ - return KeyValueStoreClient(base_storage_directory=self._key_value_stores_directory, client=self, id=id) + return KeyValueStoreClient(base_storage_directory=self._key_value_stores_directory, client=self, id=key_value_store_id) def request_queues(self) -> RequestQueueCollectionClient: """Retrieve the sub-client for manipulating request queues.""" return RequestQueueCollectionClient(base_storage_directory=self._request_queues_directory, client=self) - def request_queue(self, *, id: str, _client_key: Optional[str] = None, _timeout_secs: Optional[int] = None) -> RequestQueueClient: + def request_queue(self, request_queue_id: str, *, _client_key: Optional[str] = None) -> RequestQueueClient: """Retrieve the sub-client for manipulating a single request queue. Args: request_queue_id (str): ID of the request queue to be manipulated client_key (str): A unique identifier of the client accessing the request queue """ - return RequestQueueClient(base_storage_directory=self._request_queues_directory, client=self, id=id) + return RequestQueueClient(base_storage_directory=self._request_queues_directory, client=self, id=request_queue_id) async def purge(self) -> None: - """ - Cleans up the default storage directories before the run starts: + """Clean up the default storage directories before the run starts. + + Specifically, `purge` cleans up: - local directory containing the default dataset; - all records from the default key-value store in the local directory, except for the "INPUT" key; - local directory containing the default request queue. 
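A minimal usage sketch of the interface aligned by the hunk above, assuming the package state at this commit: storage IDs are passed positionally (matching the Apify API client), and `purge()` follows the semantics spelled out in the new docstring. The `./storage` directory, the `default` storage names, and the record contents are illustrative assumptions, not taken from the patch.

```python
import asyncio

from apify.memory_storage.memory_storage import MemoryStorage


async def main() -> None:
    # Illustrative local directory; persist_storage / write_metadata can also be
    # driven by the APIFY_PERSIST_STORAGE and DEBUG env vars (see the
    # test_config_via_env_vars_* tests earlier in this patch).
    ms = MemoryStorage(local_data_directory='./storage', persist_storage=True, write_metadata=True)

    # Storage IDs are now positional arguments, matching the Apify API client.
    dataset_info = await ms.datasets().get_or_create(name='default')
    dataset_client = ms.dataset(dataset_info['id'])

    kvs_info = await ms.key_value_stores().get_or_create(name='default')
    kvs_client = ms.key_value_store(kvs_info['id'])
    await kvs_client.set_record('INPUT', {'url': 'https://example.com'}, 'application/json')
    await kvs_client.set_record('test', {'x': 1}, 'application/json')

    # purge() removes the default dataset and request queue directories and all
    # default key-value store records except the "INPUT" key.
    await ms.purge()


asyncio.run(main())
```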
diff --git a/tests/unit/memory_storage/resource_clients/test_dataset.py b/tests/unit/memory_storage/resource_clients/test_dataset.py index 071e5c60..151c940f 100644 --- a/tests/unit/memory_storage/resource_clients/test_dataset.py +++ b/tests/unit/memory_storage/resource_clients/test_dataset.py @@ -10,11 +10,11 @@ async def dataset_client(memory_storage: MemoryStorage) -> DatasetClient: datasets_client = memory_storage.datasets() dataset_info = await datasets_client.get_or_create(name='test') - return memory_storage.dataset(id=dataset_info['id']) + return memory_storage.dataset(dataset_info['id']) async def test_nonexistent(memory_storage: MemoryStorage) -> None: - dataset_client = memory_storage.dataset(id='clearly not a uuid') + dataset_client = memory_storage.dataset(dataset_id='clearly not a uuid') assert await dataset_client.get() is None with pytest.raises(ValueError): await dataset_client.update(name='test-update') diff --git a/tests/unit/memory_storage/resource_clients/test_key_value_store.py b/tests/unit/memory_storage/resource_clients/test_key_value_store.py index 40386e1b..9e3cde3b 100644 --- a/tests/unit/memory_storage/resource_clients/test_key_value_store.py +++ b/tests/unit/memory_storage/resource_clients/test_key_value_store.py @@ -10,11 +10,11 @@ async def key_value_store_client(memory_storage: MemoryStorage) -> KeyValueStoreClient: key_value_stores_client = memory_storage.key_value_stores() kvs_info = await key_value_stores_client.get_or_create(name='test') - return memory_storage.key_value_store(id=kvs_info['id']) + return memory_storage.key_value_store(kvs_info['id']) async def test_nonexistent(memory_storage: MemoryStorage) -> None: - kvs_client = memory_storage.key_value_store(id='clearly not a uuid') + kvs_client = memory_storage.key_value_store(key_value_store_id='clearly not a uuid') assert await kvs_client.get() is None with pytest.raises(ValueError): await kvs_client.update(name='test-update') diff --git a/tests/unit/memory_storage/resource_clients/test_request_queue.py b/tests/unit/memory_storage/resource_clients/test_request_queue.py index 835ab9b7..9d62f087 100644 --- a/tests/unit/memory_storage/resource_clients/test_request_queue.py +++ b/tests/unit/memory_storage/resource_clients/test_request_queue.py @@ -11,11 +11,11 @@ async def request_queue_client(memory_storage: MemoryStorage) -> RequestQueueClient: request_queues_client = memory_storage.request_queues() rq_info = await request_queues_client.get_or_create(name='test') - return memory_storage.request_queue(id=rq_info['id']) + return memory_storage.request_queue(rq_info['id']) async def test_nonexistent(memory_storage: MemoryStorage) -> None: - request_queue_client = memory_storage.request_queue(id='clearly not a uuid') + request_queue_client = memory_storage.request_queue(request_queue_id='clearly not a uuid') assert await request_queue_client.get() is None with pytest.raises(ValueError): await request_queue_client.update(name='test-update') diff --git a/tests/unit/memory_storage/test_memory_storage.py b/tests/unit/memory_storage/test_memory_storage.py index b01c32f5..db090876 100644 --- a/tests/unit/memory_storage/test_memory_storage.py +++ b/tests/unit/memory_storage/test_memory_storage.py @@ -25,8 +25,8 @@ async def test_persist_storage(tmp_path: str) -> None: kvs_no_metadata_client = ms_no_persist.key_value_stores() kvs_info = await kvs_client.get_or_create(name='kvs') kvs_no_metadata_info = await kvs_no_metadata_client.get_or_create(name='kvs-no-persist') - await 
ms.key_value_store(id=kvs_info['id']).set_record('test', {'x': 1}, 'application/json') - await ms_no_persist.key_value_store(id=kvs_no_metadata_info['id']).set_record('test', {'x': 1}, 'application/json') + await ms.key_value_store(kvs_info['id']).set_record('test', {'x': 1}, 'application/json') + await ms_no_persist.key_value_store(kvs_no_metadata_info['id']).set_record('test', {'x': 1}, 'application/json') assert os.path.exists(os.path.join(ms._key_value_stores_directory, kvs_info['name'], 'test.json')) is True assert os.path.exists(os.path.join(ms_no_persist._key_value_stores_directory, kvs_no_metadata_info['name'], 'test.json')) is False @@ -81,7 +81,7 @@ async def test_purge_key_value_stores(tmp_path: str) -> None: kvs_client = ms.key_value_stores() default_kvs_info = await kvs_client.get_or_create(name='default') non_default_kvs_info = await kvs_client.get_or_create(name='non-default') - default_kvs_client = ms.key_value_store(id=default_kvs_info['id']) + default_kvs_client = ms.key_value_store(default_kvs_info['id']) # INPUT.json should be kept await default_kvs_client.set_record('INPUT', {'abc': 123}, 'application/json') # test.json should not be kept From 84fa37134bc1ba0d7f834d9febdf910abbec0353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Wed, 4 Jan 2023 13:43:10 +0100 Subject: [PATCH 22/23] fix types --- src/apify/memory_storage/resource_clients/dataset.py | 2 +- src/apify/memory_storage/resource_clients/key_value_store.py | 2 +- src/apify/memory_storage/resource_clients/request_queue.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/apify/memory_storage/resource_clients/dataset.py b/src/apify/memory_storage/resource_clients/dataset.py index 320590f8..6d37432f 100644 --- a/src/apify/memory_storage/resource_clients/dataset.py +++ b/src/apify/memory_storage/resource_clients/dataset.py @@ -33,7 +33,7 @@ class DatasetClient: _id: str _dataset_directory: str _client: 'MemoryStorage' - _name: str + _name: Optional[str] _dataset_entries: Dict[str, Dict] _created_at = datetime.utcnow() _accessed_at = datetime.utcnow() diff --git a/src/apify/memory_storage/resource_clients/key_value_store.py b/src/apify/memory_storage/resource_clients/key_value_store.py index bc6acc94..7b84de7a 100644 --- a/src/apify/memory_storage/resource_clients/key_value_store.py +++ b/src/apify/memory_storage/resource_clients/key_value_store.py @@ -36,7 +36,7 @@ class KeyValueStoreClient: _id: str _key_value_store_directory: str _client: 'MemoryStorage' - _name: str + _name: Optional[str] _key_value_entries: Dict[str, Dict] _created_at = datetime.utcnow() _accessed_at = datetime.utcnow() diff --git a/src/apify/memory_storage/resource_clients/request_queue.py b/src/apify/memory_storage/resource_clients/request_queue.py index a5ea9626..145e7085 100644 --- a/src/apify/memory_storage/resource_clients/request_queue.py +++ b/src/apify/memory_storage/resource_clients/request_queue.py @@ -28,7 +28,7 @@ class RequestQueueClient: _id: str _request_queue_directory: str _client: 'MemoryStorage' - _name: str + _name: Optional[str] _requests: Dict[str, Dict] _created_at = datetime.utcnow() _accessed_at = datetime.utcnow() From ebea14f75abedd00449384a699b0f4911692bb95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20Morav=C4=8D=C3=ADk?= Date: Wed, 4 Jan 2023 17:04:08 +0100 Subject: [PATCH 23/23] fix --- .flake8 | 1 - 1 file changed, 1 deletion(-) diff --git a/.flake8 b/.flake8 index 3bf0b531..66da48cc 100644 --- a/.flake8 +++ b/.flake8 @@ -9,7 +9,6 @@ 
max_line_length = 150 # Google docstring convention + D204 & D401 docstring-convention = all ignore = - U101 D100 D104 D203