16
16
17
17
18
18
class Dataset :
19
- """TODO: docs."""
19
+ """The `Dataset` class represents a store for structured data where each object stored has the same attributes.
20
+
21
+ You can imagine it as a table, where each object is a row and its attributes are columns.
22
+ Dataset is an append-only storage - you can only add new records to it but you cannot modify or remove existing records.
23
+ Typically it is used to store crawling results.
24
+
25
+ Do not instantiate this class directly, use the `Actor.open_dataset()` function instead.
26
+
27
+ `Dataset` stores its data either on local disk or in the Apify cloud,
28
+ depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set.
29
+
30
+ If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in
31
+ the local directory in the following files:
32
+ ```
33
+ {APIFY_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json
34
+ ```
35
+ Note that `{DATASET_ID}` is the name or ID of the dataset. The default dataset has ID: `default`,
36
+ unless you override it by setting the `APIFY_DEFAULT_DATASET_ID` environment variable.
37
+ Each dataset item is stored as a separate JSON file, where `{INDEX}` is a zero-based index of the item in the dataset.
38
+
39
+ If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
40
+ [Apify Dataset](https://docs.apify.com/storage/dataset) cloud storage.
41
+ """
20
42
21
43
_id : str
22
44
_name : Optional [str ]
23
45
_client : Union [DatasetClientAsync , DatasetClient ]
24
46
25
47
def __init__ (self , id : str , name : Optional [str ], client : Union [ApifyClientAsync , MemoryStorage ]) -> None :
26
- """TODO: docs (constructor should be "internal")."""
48
+ """Create a `Dataset` instance.
49
+
50
+ Do not use the constructor directly, use the `Dataset.open` function instead.
51
+
52
+ Args:
53
+ id (str): ID of the dataset.
54
+ name (str, optional): Name of the dataset.
55
+ client (ApifyClientAsync or MemoryStorage): The storage client which should be used.
56
+ """
27
57
self .get_data = _wrap_internal (self ._get_data_internal , self .get_data ) # type: ignore
28
58
self .push_data = _wrap_internal (self ._push_data_internal , self .push_data ) # type: ignore
29
59
self .export_to_json = _wrap_internal (self ._export_to_json_internal , self .export_to_json ) # type: ignore
@@ -47,7 +77,16 @@ def _get_default_name(cls, config: Configuration) -> str:
47
77
48
78
@classmethod
49
79
async def push_data (cls , data : JSONSerializable ) -> None :
50
- """TODO: docs."""
80
+ """Store an object or an array of objects to the dataset.
81
+
82
+ The size of the data is limited by the receiving API and therefore `push_data()` will only
83
+ allow objects whose JSON representation is smaller than 9MB. When an array is passed,
84
+ none of the included objects may be larger than 9MB, but the array itself may be of any size.
85
+
86
+ Args:
87
+ data (JSONSerializable): dict or array of dicts containing data to be stored in the default dataset.
88
+ The JSON representation of each item must be smaller than 9MB.
89
+ """
51
90
dataset = await cls .open ()
52
91
return await dataset .push_data (data )
53
92
@@ -89,7 +128,35 @@ async def get_data(
89
128
flatten : Optional [List [str ]] = None ,
90
129
view : Optional [str ] = None ,
91
130
) -> ListPage :
92
- """TODO: docs."""
131
+ """Get items from the dataset.
132
+
133
+ Args:
134
+ offset (int, optional): Number of items that should be skipped at the start. The default value is 0.
135
+ limit (int, optional): Maximum number of items to return. By default there is no limit.
136
+ desc (bool, optional): By default, results are returned in the same order as they were stored.
137
+ To reverse the order, set this parameter to True.
138
+ clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character).
139
+ The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters.
140
+ Note that since some objects might be skipped from the output, the result might contain fewer items than the limit value.
141
+ fields (list of str, optional): A list of fields which should be picked from the items,
142
+ only these fields will remain in the resulting record objects.
143
+ Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter.
144
+ You can use this feature to effectively fix the output format.
145
+ omit (list of str, optional): A list of fields which should be omitted from the items.
146
+ unwind (str, optional): Name of a field which should be unwound.
147
+ If the field is an array, then every element of the array will become a separate record and will be merged with the parent object.
148
+ If the unwound field is an object then it is merged with the parent object.
149
+ If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object,
150
+ then the item gets preserved as it is. Note that the unwound items ignore the desc parameter.
151
+ skip_empty (bool, optional): If True, then empty items are skipped from the output.
152
+ Note that if used, the results might contain fewer items than the limit value.
153
+ skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character.
154
+ flatten (list of str, optional): A list of fields that should be flattened.
155
+ view (str, optional): Name of the dataset view to be used.
156
+
157
+ Returns:
158
+ ListPage: A page of the list of dataset items according to the specified filters.
159
+ """
93
160
dataset = await cls .open ()
94
161
return await dataset .get_data (
95
162
offset = offset ,
@@ -151,7 +218,14 @@ async def export_to(
151
218
to_key_value_store : Optional [str ] = None ,
152
219
content_type : Optional [str ] = None ,
153
220
) -> None :
154
- """TODO: docs."""
221
+ """Save the entirety of the dataset's contents into one file within a key-value store.
222
+
223
+ Args:
224
+ key (str): The key to save the data under.
225
+ to_key_value_store (str, optional): The name of the key-value store in which the result will be saved.
226
+ Uses default key-value store if omitted.
227
+ content_type (str, optional): Either 'text/csv' or 'application/json'. Defaults to JSON.
228
+ """
155
229
key_value_store = await KeyValueStore .open (to_key_value_store )
156
230
items : List [Dict ] = []
157
231
limit = 1000
@@ -186,7 +260,14 @@ async def export_to_json(
186
260
from_dataset : Optional [str ] = None ,
187
261
to_key_value_store : Optional [str ] = None ,
188
262
) -> None :
189
- """TODO: docs."""
263
+ """Save the entirety of the dataset's contents into one JSON file within a key-value store.
264
+
265
+ Args:
266
+ key (str): The key to save the data under.
267
+ from_dataset (str, optional): The source dataset in case of calling the class method. Uses default dataset if omitted.
268
+ to_key_value_store (str, optional): The name of the key-value store in which the result will be saved.
269
+ Uses default key-value store if omitted.
270
+ """
190
271
dataset = await cls .open (from_dataset )
191
272
await dataset .export_to_json (key , to_key_value_store = to_key_value_store )
192
273
@@ -207,7 +288,14 @@ async def export_to_csv(
207
288
from_dataset : Optional [str ] = None ,
208
289
to_key_value_store : Optional [str ] = None ,
209
290
) -> None :
210
- """TODO: docs."""
291
+ """Save the entirety of the dataset's contents into one CSV file within a key-value store.
292
+
293
+ Args:
294
+ key (str): The key to save the data under.
295
+ from_dataset (str, optional): The source dataset in case of calling the class method. Uses default dataset if omitted.
296
+ to_key_value_store (str, optional): The name of the key-value store in which the result will be saved.
297
+ Uses default key-value store if omitted.
298
+ """
211
299
dataset = await cls .open (from_dataset )
212
300
await dataset .export_to_csv (key , to_key_value_store = to_key_value_store )
213
301
@@ -221,10 +309,14 @@ async def _export_to_csv_internal(
221
309
await self .export_to (key , to_key_value_store = to_key_value_store , content_type = 'text/csv' )
222
310
223
311
async def get_info (self ) -> Optional [Dict ]:
224
- """TODO: docs."""
312
+ """Get an object containing general information about the dataset.
313
+
314
+ Returns:
315
+ dict: Object returned by calling the GET dataset API endpoint.
316
+ """
225
317
return await self ._client .get ()
226
318
227
- def iterate_items ( # ~forEach in TS
319
+ def iterate_items (
228
320
self ,
229
321
* ,
230
322
offset : int = 0 ,
@@ -237,7 +329,33 @@ def iterate_items( # ~forEach in TS
237
329
skip_empty : Optional [bool ] = None ,
238
330
skip_hidden : Optional [bool ] = None ,
239
331
) -> AsyncIterator [Dict ]:
240
- """TODO: docs."""
332
+ """Iterate over the items in the dataset.
333
+
334
+ Args:
335
+ offset (int, optional): Number of items that should be skipped at the start. The default value is 0.
336
+ limit (int, optional): Maximum number of items to return. By default there is no limit.
337
+ desc (bool, optional): By default, results are returned in the same order as they were stored.
338
+ To reverse the order, set this parameter to True.
339
+ clean (bool, optional): If True, returns only non-empty items and skips hidden fields (i.e. fields starting with the # character).
340
+ The clean parameter is just a shortcut for skip_hidden=True and skip_empty=True parameters.
341
+ Note that since some objects might be skipped from the output, the result might contain fewer items than the limit value.
342
+ fields (list of str, optional): A list of fields which should be picked from the items,
343
+ only these fields will remain in the resulting record objects.
344
+ Note that the fields in the outputted items are sorted the same way as they are specified in the fields parameter.
345
+ You can use this feature to effectively fix the output format.
346
+ omit (list of str, optional): A list of fields which should be omitted from the items.
347
+ unwind (str, optional): Name of a field which should be unwound.
348
+ If the field is an array, then every element of the array will become a separate record and will be merged with the parent object.
349
+ If the unwound field is an object then it is merged with the parent object.
350
+ If the unwound field is missing or its value is neither an array nor an object and therefore cannot be merged with a parent object,
351
+ then the item gets preserved as it is. Note that the unwound items ignore the desc parameter.
352
+ skip_empty (bool, optional): If True, then empty items are skipped from the output.
353
+ Note that if used, the results might contain fewer items than the limit value.
354
+ skip_hidden (bool, optional): If True, then hidden fields are skipped from the output, i.e. fields starting with the # character.
355
+
356
+ Yields:
357
+ dict: An item from the dataset.
358
+ """
241
359
return self ._client .iterate_items (
242
360
offset = offset ,
243
361
limit = limit ,
@@ -251,11 +369,25 @@ def iterate_items( # ~forEach in TS
251
369
)
252
370
253
371
async def drop (self ) -> None :
254
- """TODO: docs ."""
372
+ """Remove the dataset either from the Apify cloud storage or from the local directory."""
255
373
await self ._client .delete ()
256
374
await StorageManager .close_storage (self .__class__ , self ._id , self ._name )
257
375
258
376
@classmethod
259
377
async def open (cls , dataset_id_or_name : Optional [str ] = None , config : Optional [Configuration ] = None ) -> 'Dataset' :
260
- """TODO: docs."""
378
+ """Open a dataset.
379
+
380
+ Datasets are used to store structured data where each object stored has the same attributes,
381
+ such as online store products or real estate offers.
382
+ The actual data is stored either on the local filesystem or in the Apify cloud.
383
+
384
+ Args:
385
+ dataset_id_or_name (str, optional): ID or name of the dataset to be opened.
386
+ If not provided, the method returns the default dataset associated with the actor run.
387
+ config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted.
388
+
389
+ Returns:
390
+ Dataset: An instance of the `Dataset` class for the given ID or name.
391
+
392
+ """
261
393
return await StorageManager .open_storage (cls , dataset_id_or_name , None , config )
0 commit comments