forked from openml/openml-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdatasets_tutorial.py
More file actions
110 lines (96 loc) · 3.63 KB
/
datasets_tutorial.py
File metadata and controls
110 lines (96 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
========
Datasets
========
How to list and download datasets.
"""
############################################################################
import openml
import pandas as pd
############################################################################
# Exercise 0
# **********
#
# * List datasets
# * Use the output_format parameter to select output type
# * Default gives 'dict' (other option: 'dataframe')
openml_list = openml.datasets.list_datasets()  # returns a dict

# Build a table with one row per dict entry, keeping only a handful of
# key data properties as columns.
datalist = pd.DataFrame.from_dict(openml_list, orient='index')[[
    'did', 'name', 'NumberOfInstances',
    'NumberOfFeatures', 'NumberOfClasses',
]]
print("First 10 of %s datasets..." % len(datalist.index))
datalist.head(10)

# The same can be done with fewer lines of code by requesting a dataframe
# directly through the output_format parameter.
openml_df = openml.datasets.list_datasets(output_format='dataframe')
openml_df.head(10)
############################################################################
# Exercise 1
# **********
#
# * Find datasets with more than 10000 examples.
# * Find a dataset called 'eeg-eye-state'.
# * Find all datasets with more than 50 classes.
datalist.loc[datalist['NumberOfInstances'] > 10000].sort_values(
    by=['NumberOfInstances']
).head(20)  # rows with more than 10000 instances, smallest first
############################################################################
datalist.query('name == "eeg-eye-state"')  # exact match on the name column
############################################################################
datalist.query('NumberOfClasses > 50')  # rows with more than 50 classes
############################################################################
# Download datasets
# =================
# This is done based on the dataset ID.
dataset = openml.datasets.get_dataset(1471)

# Summarise the dataset: its name and target feature first, then its URL,
# and finally the first 500 characters of its description.
print(
    "This is dataset '%s', the target feature is '%s'"
    % (dataset.name, dataset.default_target_attribute)
)
print("URL: %s" % dataset.url)
print(dataset.description[0:500])
############################################################################
# Get the actual data.
#
# The dataset can be returned in 2 possible formats: as an array (a NumPy
# array or a SciPy sparse matrix) or as a Pandas DataFrame (or
# SparseDataFrame). The format is controlled with the parameter
# ``dataset_format`` which can be either 'array' (default) or 'dataframe'.
# Let's first build our dataset from a NumPy array and manually create a
# dataframe.
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute,
    dataset_format='array',
)
# Wrap the returned features in a dataframe labelled with the attribute
# names, then attach the target values as an extra 'class' column.
eeg = pd.DataFrame(data=X, columns=attribute_names)
eeg['class'] = y
print(eeg.head(10))
############################################################################
# Instead of manually creating the dataframe, you can already request a
# dataframe with the correct dtypes.
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute,
    dataset_format='dataframe'
)
print(X.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
X.info()
############################################################################
# Sometimes you only need access to a dataset's metadata.
# In those cases, you can download the dataset without downloading the
# data file. The dataset object can be used as normal.
# Whenever you use any functionality that requires the data,
# such as `get_data`, the data will be downloaded.
dataset = openml.datasets.get_dataset(
    1471,
    download_data=False,  # request the dataset without its data file
)
############################################################################
# Exercise 2
# **********
# * Explore the data visually.
eegs = eeg.sample(n=1000)
# Only the first 100 sampled rows and the first four columns are plotted;
# each point is coloured by the 'class' value of its row.
_ = pd.plotting.scatter_matrix(
    eegs.iloc[:100, :4],
    figsize=(10, 10),
    alpha=0.8,
    marker='o',
    hist_kwds={'bins': 20},
    c=eegs['class'].iloc[:100],
    cmap='plasma',
)