Commit 174548e

feat: Add docs for Dataset, KeyValueStore, and RequestQueue (#37)

1 parent 01162aa · commit 174548e

5 files changed: +363 −45 lines changed

src/apify/memory_storage/memory_storage.py

1 addition & 1 deletion

@@ -47,7 +47,7 @@ def __init__(
             persist_storage (bool, optional): Whether to persist the data to the `local_data_directory` or just keep them in memory
             write_metadata (bool, optional): Whether to persist metadata of the storages as well
         """
-        self._local_data_directory = local_data_directory
+        self._local_data_directory = local_data_directory  # TODO: Make this work with `APIFY_LOCAL_STORAGE_DIR`
         self._datasets_directory = os.path.join(self._local_data_directory, 'datasets')
         self._key_value_stores_directory = os.path.join(self._local_data_directory, 'key_value_stores')
         self._request_queues_directory = os.path.join(self._local_data_directory, 'request_queues')

src/apify/storages/dataset.py

144 additions & 12 deletions

@@ -16,14 +16,44 @@


 class Dataset:
-    """TODO: docs."""
+    """The `Dataset` class represents a store for structured data where each object stored has the same attributes.
+
+    You can imagine it as a table, where each object is a row and its attributes are columns.
+    Dataset is an append-only storage - you can only add new records to it but you cannot modify or remove existing records.
+    Typically it is used to store crawling results.
+
+    Do not instantiate this class directly, use the `Actor.open_dataset()` function instead.
+
+    `Dataset` stores its data either on local disk or in the Apify cloud,
+    depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set.
+
+    If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in
+    the local directory in the following files:
+    ```
+    {APIFY_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json
+    ```
+    Note that `{DATASET_ID}` is the name or ID of the dataset. The default dataset has ID: `default`,
+    unless you override it by setting the `APIFY_DEFAULT_DATASET_ID` environment variable.
+    Each dataset item is stored as a separate JSON file, where `{INDEX}` is a zero-based index of the item in the dataset.
+
+    If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
+    [Apify Dataset](https://docs.apify.com/storage/dataset) cloud storage.
+    """

     _id: str
     _name: Optional[str]
     _client: Union[DatasetClientAsync, DatasetClient]

     def __init__(self, id: str, name: Optional[str], client: Union[ApifyClientAsync, MemoryStorage]) -> None:
-        """TODO: docs (constructor should be "internal")."""
+        """Create a `Dataset` instance.
+
+        Do not use the constructor directly, use the `Dataset.open` function instead.
+
+        Args:
+            id (str): ID of the dataset.
+            name (str, optional): Name of the dataset.
+            client (ApifyClientAsync or MemoryStorage): The storage client which should be used.
+        """
         self.get_data = _wrap_internal(self._get_data_internal, self.get_data)  # type: ignore
         self.push_data = _wrap_internal(self._push_data_internal, self.push_data)  # type: ignore
         self.export_to_json = _wrap_internal(self._export_to_json_internal, self.export_to_json)  # type: ignore

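To make the local-storage behaviour described in the new class docstring concrete, here is a minimal usage sketch (not part of this commit). It assumes the module is importable as `apify.storages.dataset` — inferred from the file path above — and that `APIFY_LOCAL_STORAGE_DIR` points at a writable directory:

```
import asyncio

from apify.storages.dataset import Dataset  # import path inferred from this diff

async def main() -> None:
    # With APIFY_LOCAL_STORAGE_DIR set, each pushed item should be persisted as
    # {APIFY_LOCAL_STORAGE_DIR}/datasets/default/{INDEX}.json, per the docstring above.
    dataset = await Dataset.open()
    await dataset.push_data({'url': 'https://example.com', 'title': 'Example Domain'})

asyncio.run(main())
```

`Dataset.open()` is the class method documented later in this diff; inside an actor, the docstring recommends `Actor.open_dataset()` instead.
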
@@ -47,7 +77,16 @@ def _get_default_name(cls, config: Configuration) -> str:

     @classmethod
     async def push_data(cls, data: JSONSerializable) -> None:
-        """TODO: docs."""
+        """Store an object or an array of objects to the dataset.
+
+        The size of the data is limited by the receiving API and therefore `push_data()` will only
+        allow objects whose JSON representation is smaller than 9MB. When an array is passed,
+        none of the included objects may be larger than 9MB, but the array itself may be of any size.
+
+        Args:
+            data (JSONSerializable): dict or array of dicts containing data to be stored in the default dataset.
+                The JSON representation of each item must be smaller than 9MB.
+        """
         dataset = await cls.open()
         return await dataset.push_data(data)
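A short sketch of the two call shapes the new `push_data()` docstring allows — a single dict or a list of dicts (illustration only; import path inferred from this diff, 9MB limit per the docstring):

```
import asyncio

from apify.storages.dataset import Dataset  # import path inferred from this diff

async def main() -> None:
    # A single object...
    await Dataset.push_data({'url': 'https://example.com', 'status': 200})
    # ...or an array of objects; each item's JSON representation must stay under 9MB.
    await Dataset.push_data([
        {'url': 'https://example.com/a', 'status': 200},
        {'url': 'https://example.com/b', 'status': 404},
    ])

asyncio.run(main())
```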

@@ -89,7 +128,35 @@ async def get_data(
         flatten: Optional[List[str]] = None,
         view: Optional[str] = None,
     ) -> ListPage:
-        """TODO: docs."""
+        """Get items from the dataset.
+
+        Args:
+            offset (int, optional): Number of items that should be skipped at the start. The default value is 0.
+            limit (int, optional): Maximum number of items to return. By default there is no limit.
+            desc (bool, optional): By default, results are returned in the same order as they were stored.
+                To reverse the order, set this parameter to True.
+            clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character).
+                The clean parameter is just a shortcut for the skip_hidden=True and skip_empty=True parameters.
+                Note that since some objects might be skipped from the output, the result might contain fewer items than the limit value.
+            fields (list of str, optional): A list of fields which should be picked from the items,
+                only these fields will remain in the resulting record objects.
+                Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter.
+                You can use this feature to effectively fix the output format.
+            omit (list of str, optional): A list of fields which should be omitted from the items.
+            unwind (str, optional): Name of a field which should be unwound.
+                If the field is an array, then every element of the array will become a separate record and will be merged with the parent object.
+                If the unwound field is an object, then it is merged with the parent object.
+                If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object,
+                then the item gets preserved as it is. Note that the unwound items ignore the desc parameter.
+            skip_empty (bool, optional): If True, then empty items are skipped from the output.
+                Note that if used, the results might contain fewer items than the limit value.
+            skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character.
+            flatten (list of str, optional): A list of fields that should be flattened.
+            view (str, optional): Name of the dataset view to be used.
+
+        Returns:
+            ListPage: A page of the list of dataset items according to the specified filters.
+        """
         dataset = await cls.open()
         return await dataset.get_data(
             offset=offset,

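For illustration, a hedged sketch of calling `get_data()` with a few of the documented filters; the `ListPage` attributes used below (`items`, `count`, `total`) follow the usual Apify client shape and are an assumption, not something this diff defines:

```
import asyncio

from apify.storages.dataset import Dataset  # import path inferred from this diff

async def main() -> None:
    # Skip the first 10 items, return at most 100, drop empty items and hidden
    # fields, and keep only the listed fields in each record.
    page = await Dataset.get_data(offset=10, limit=100, clean=True, fields=['url', 'status'])
    print(f'Got {page.count} of {page.total} items')  # assumed ListPage attributes
    for item in page.items:
        print(item)

asyncio.run(main())
```
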
@@ -151,7 +218,14 @@ async def export_to(
         to_key_value_store: Optional[str] = None,
         content_type: Optional[str] = None,
     ) -> None:
-        """TODO: docs."""
+        """Save the entirety of the dataset's contents into one file within a key-value store.
+
+        Args:
+            key (str): The key to save the data under.
+            to_key_value_store (str, optional): The name of the key-value store in which the result will be saved.
+                Uses default key-value store if omitted.
+            content_type (str, optional): Either 'text/csv' or 'application/json'. Defaults to JSON.
+        """
         key_value_store = await KeyValueStore.open(to_key_value_store)
         items: List[Dict] = []
         limit = 1000

@@ -186,7 +260,14 @@ async def export_to_json(
         from_dataset: Optional[str] = None,
         to_key_value_store: Optional[str] = None,
     ) -> None:
-        """TODO: docs."""
+        """Save the entirety of the dataset's contents into one JSON file within a key-value store.
+
+        Args:
+            key (str): The key to save the data under.
+            from_dataset (str, optional): The source dataset in case of calling the class method. Uses default dataset if omitted.
+            to_key_value_store (str, optional): The name of the key-value store in which the result will be saved.
+                Uses default key-value store if omitted.
+        """
         dataset = await cls.open(from_dataset)
         await dataset.export_to_json(key, to_key_value_store=to_key_value_store)

@@ -207,7 +288,14 @@ async def export_to_csv(
         from_dataset: Optional[str] = None,
         to_key_value_store: Optional[str] = None,
     ) -> None:
-        """TODO: docs."""
+        """Save the entirety of the dataset's contents into one CSV file within a key-value store.
+
+        Args:
+            key (str): The key to save the data under.
+            from_dataset (str, optional): The source dataset in case of calling the class method. Uses default dataset if omitted.
+            to_key_value_store (str, optional): The name of the key-value store in which the result will be saved.
+                Uses default key-value store if omitted.
+        """
         dataset = await cls.open(from_dataset)
         await dataset.export_to_csv(key, to_key_value_store=to_key_value_store)
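The export methods above share the same pattern. A minimal sketch of exporting the default dataset into the default key-value store, once as JSON and once as CSV (the record keys `OUTPUT` and `OUTPUT_CSV` are arbitrary examples, not defined by this commit):

```
import asyncio

from apify.storages.dataset import Dataset  # import path inferred from this diff

async def main() -> None:
    # Export the whole default dataset into the default key-value store,
    # once as a JSON record and once as a CSV record.
    await Dataset.export_to_json('OUTPUT')
    await Dataset.export_to_csv('OUTPUT_CSV')

asyncio.run(main())
```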

@@ -221,10 +309,14 @@ async def _export_to_csv_internal(
         await self.export_to(key, to_key_value_store=to_key_value_store, content_type='text/csv')

     async def get_info(self) -> Optional[Dict]:
-        """TODO: docs."""
+        """Get an object containing general information about the dataset.
+
+        Returns:
+            dict: Object returned by calling the GET dataset API endpoint.
+        """
         return await self._client.get()

-    def iterate_items(  # ~forEach in TS
+    def iterate_items(
         self,
         *,
         offset: int = 0,

@@ -237,7 +329,33 @@ def iterate_items(  # ~forEach in TS
         skip_empty: Optional[bool] = None,
         skip_hidden: Optional[bool] = None,
     ) -> AsyncIterator[Dict]:
-        """TODO: docs."""
+        """Iterate over the items in the dataset.
+
+        Args:
+            offset (int, optional): Number of items that should be skipped at the start. The default value is 0.
+            limit (int, optional): Maximum number of items to return. By default there is no limit.
+            desc (bool, optional): By default, results are returned in the same order as they were stored.
+                To reverse the order, set this parameter to True.
+            clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character).
+                The clean parameter is just a shortcut for the skip_hidden=True and skip_empty=True parameters.
+                Note that since some objects might be skipped from the output, the result might contain fewer items than the limit value.
+            fields (list of str, optional): A list of fields which should be picked from the items,
+                only these fields will remain in the resulting record objects.
+                Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter.
+                You can use this feature to effectively fix the output format.
+            omit (list of str, optional): A list of fields which should be omitted from the items.
+            unwind (str, optional): Name of a field which should be unwound.
+                If the field is an array, then every element of the array will become a separate record and will be merged with the parent object.
+                If the unwound field is an object, then it is merged with the parent object.
+                If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object,
+                then the item gets preserved as it is. Note that the unwound items ignore the desc parameter.
+            skip_empty (bool, optional): If True, then empty items are skipped from the output.
+                Note that if used, the results might contain fewer items than the limit value.
+            skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character.
+
+        Yields:
+            dict: An item from the dataset.
+        """
         return self._client.iterate_items(
             offset=offset,
             limit=limit,

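Since `iterate_items()` returns an `AsyncIterator[Dict]`, it is consumed with `async for` rather than awaited. A brief sketch (illustration only; import path inferred from this diff):

```
import asyncio

from apify.storages.dataset import Dataset  # import path inferred from this diff

async def main() -> None:
    dataset = await Dataset.open()
    # iterate_items() returns an async iterator, so there is no await on the call itself.
    async for item in dataset.iterate_items(limit=1000, skip_empty=True):
        print(item)

asyncio.run(main())
```
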
@@ -251,11 +369,25 @@ def iterate_items(  # ~forEach in TS
         )

     async def drop(self) -> None:
-        """TODO: docs."""
+        """Remove the dataset either from the Apify cloud storage or from the local directory."""
         await self._client.delete()
         await StorageManager.close_storage(self.__class__, self._id, self._name)

     @classmethod
     async def open(cls, dataset_id_or_name: Optional[str] = None, config: Optional[Configuration] = None) -> 'Dataset':
-        """TODO: docs."""
+        """Open a dataset.
+
+        Datasets are used to store structured data where each object stored has the same attributes,
+        such as online store products or real estate offers.
+        The actual data is stored either on the local filesystem or in the Apify cloud.
+
+        Args:
+            dataset_id_or_name (str, optional): ID or name of the dataset to be opened.
+                If not provided, the method returns the default dataset associated with the actor run.
+            config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted.
+
+        Returns:
+            Dataset: An instance of the `Dataset` class for the given ID or name.
+
+        """
         return await StorageManager.open_storage(cls, dataset_id_or_name, None, config)

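Finally, a small end-to-end sketch tying together `open()`, `get_info()`, and `drop()` as documented above; the dataset name `my-crawl-results` is just an example value:

```
import asyncio

from apify.storages.dataset import Dataset  # import path inferred from this diff

async def main() -> None:
    # Open (or create) a named dataset, inspect its metadata, then remove it.
    dataset = await Dataset.open('my-crawl-results')
    info = await dataset.get_info()
    print(info)
    await dataset.drop()

asyncio.run(main())
```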

0 commit comments
