forked from openml/openml-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_upload_tutorial.py
More file actions
318 lines (277 loc) · 10.5 KB
/
create_upload_tutorial.py
File metadata and controls
318 lines (277 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
"""
Dataset upload tutorial
=======================
A tutorial on how to create and upload a dataset to OpenML.
"""
import numpy as np
import pandas as pd
import sklearn.datasets
from scipy.sparse import coo_matrix
import openml
from openml.datasets.functions import create_dataset
############################################################################
# .. warning:: This example uploads data. For that reason, this example
# connects to the test server at test.openml.org. This prevents the main
# server from crowding with example datasets, tasks, runs, and so on.
#
# Point the openml-python client at the test server; the original
# configuration is restored by ``stop_using_configuration_for_example()``
# at the end of this script.
openml.config.start_using_configuration_for_example()
############################################################################
############################################################################
# Below we will cover the following cases of the dataset object:
#
# * A numpy array
# * A list
# * A pandas dataframe
# * A sparse matrix
# * A pandas sparse dataframe
############################################################################
# Dataset is a numpy array
# ========================
# A numpy array can contain lists in the case of dense data or it can contain
# OrderedDicts in the case of sparse data.
#
# Prepare dataset
# ^^^^^^^^^^^^^^^
# Fetch the diabetes example dataset from scikit-learn; we will upload it to
# OpenML.org via the API.
diabetes = sklearn.datasets.load_diabetes()
name = 'Diabetes(scikit-learn)'
X = diabetes.data
y = diabetes.target
description = diabetes.DESCR
############################################################################
# OpenML does not distinguish between the attributes and targets on the data
# level and stores all data in a single matrix.
#
# The target feature is indicated as meta-data of the dataset (and tasks on
# that data). Append the target as the final column of the feature matrix;
# ``column_stack`` turns the 1-D target into a column automatically.
data = np.column_stack((X, y))
attribute_names = list(diabetes.feature_names)
# Every input feature is real-valued; the appended target column is integer.
attributes = [(feature, 'REAL') for feature in attribute_names]
attributes.append(('class', 'INTEGER'))
citation = (
    "Bradley Efron, Trevor Hastie, Iain Johnstone and "
    "Robert Tibshirani (2004) (Least Angle Regression) "
    "Annals of Statistics (with discussion), 407-499"
)
paper_url = (
    'http://web.stanford.edu/~hastie/Papers/'
    'LARS/LeastAngle_2002.pdf'
)
############################################################################
# Create the dataset object
# ^^^^^^^^^^^^^^^^^^^^^^^^^
# The definition of all fields can be found in the XSD files describing the
# expected format:
#
# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd
#
# Collect all meta-data in one mapping first, then pass it to
# :func:`create_dataset` in a single call.
diabetes_fields = {
    # Unique name, at most 128 characters, restricted to
    # a-z, A-Z, 0-9 and the special characters _\-\.(),
    'name': name,
    # Textual description of the dataset.
    'description': description,
    # The person who created the dataset.
    'creator': 'Bradley Efron, Trevor Hastie, '
               'Iain Johnstone and Robert Tibshirani',
    # People who contributed to the current version of the dataset.
    'contributor': None,
    # The date the data was originally collected, given by the uploader.
    'collection_date': '09-01-2012',
    # Starts with 1 upper case letter, rest lower case, e.g. 'English'.
    'language': 'English',
    # License under which the data is/will be distributed.
    'licence': 'BSD (from scikit-learn)',
    # Name of the target; can also have multiple comma-separated values.
    'default_target_attribute': 'class',
    # The attribute that represents the row-id column, if present.
    'row_id_attribute': None,
    # Attributes to be excluded in modelling (identifiers, indexes, ...).
    'ignore_attribute': None,
    # How to cite the paper.
    'citation': citation,
    # Attribute specification and the data itself.
    'attributes': attributes,
    'data': data,
    # A version label which is provided by the user.
    'version_label': 'test',
    'original_data_url': (
        'http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html'
    ),
    'paper_url': paper_url,
}
diabetes_dataset = create_dataset(**diabetes_fields)
############################################################################
upload_did = diabetes_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
############################################################################
# Dataset is a list
# =================
# A list can contain lists in the case of dense data or it can contain
# OrderedDicts in the case of sparse data.
#
# Weather dataset:
# http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html
# The 14 weather observations; each row is
# [outlook, temperature, humidity, windy, play].
data = [
    ['sunny', 85, 85, 'FALSE', 'no'],
    ['sunny', 80, 90, 'TRUE', 'no'],
    ['overcast', 83, 86, 'FALSE', 'yes'],
    ['rainy', 70, 96, 'FALSE', 'yes'],
    ['rainy', 68, 80, 'FALSE', 'yes'],
    ['rainy', 65, 70, 'TRUE', 'no'],
    ['overcast', 64, 65, 'TRUE', 'yes'],
    ['sunny', 72, 95, 'FALSE', 'no'],
    ['sunny', 69, 70, 'FALSE', 'yes'],
    ['rainy', 75, 80, 'FALSE', 'yes'],
    ['sunny', 75, 70, 'TRUE', 'yes'],
    ['overcast', 72, 90, 'TRUE', 'yes'],
    ['overcast', 81, 75, 'FALSE', 'yes'],
    ['rainy', 71, 91, 'TRUE', 'no'],
]
# Attribute specification: nominal attributes list their allowed values,
# numeric attributes are tagged 'REAL'.
attribute_names = [
    ('outlook', ['sunny', 'overcast', 'rainy']),
    ('temperature', 'REAL'),
    ('humidity', 'REAL'),
    ('windy', ['TRUE', 'FALSE']),
    ('play', ['yes', 'no']),
]
description = (
    'The weather problem is a tiny dataset that we will use repeatedly'
    ' to illustrate machine learning methods. Entirely fictitious, it '
    'supposedly concerns the conditions that are suitable for playing '
    'some unspecified game. In general, instances in a dataset are '
    'characterized by the values of features, or attributes, that measure '
    'different aspects of the instance. In this case there are four '
    'attributes: outlook, temperature, humidity, and windy. '
    'The outcome is whether to play or not.'
)
# Fix: the original string concatenation was missing the space after
# "ITPro," which produced "ITPro,Data mining" in the published citation.
citation = (
    'I. H. Witten, E. Frank, M. A. Hall, and ITPro, '
    'Data mining practical machine learning tools and techniques, '
    'third edition. Burlington, Mass.: Morgan Kaufmann Publishers, 2011'
)
weather_dataset = create_dataset(
    # Who made the data, and when.
    name="Weather",
    creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro',
    contributor=None,
    collection_date='01-01-2011',
    language='English',
    licence=None,
    description=description,
    citation=citation,
    # 'play' is the classification target; there is no row-id column and
    # no attribute needs to be ignored.
    default_target_attribute='play',
    row_id_attribute=None,
    ignore_attribute=None,
    # The payload: explicit attribute specification plus list-of-lists data.
    attributes=attribute_names,
    data=data,
    version_label='example',
)
############################################################################
upload_did = weather_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
############################################################################
# Dataset is a pandas DataFrame
# =============================
# It might happen that your dataset is made of heterogeneous data which can
# usually be stored as a Pandas DataFrame. A DataFrame offers the advantage
# of storing the type of data for each column as well as the attribute names.
# Therefore, when providing a Pandas DataFrame, OpenML can infer this
# information without the need to specifically provide it when calling the
# function :func:`create_dataset`. In this regard, you only need to pass
# ``'auto'`` to the ``attributes`` parameter.
# Column labels are the first element of each (name, type) pair.
column_labels = [label for label, _ in attribute_names]
df = pd.DataFrame(data, columns=column_labels)
# Give the non-numeric columns explicit dtypes in one pass so OpenML can
# infer the attribute types from the DataFrame.
df = df.astype({'outlook': 'category', 'windy': 'bool', 'play': 'category'})
print(df.info())
############################################################################
# We enforce the columns 'outlook' and 'play' to be a categorical dtype and
# the column 'windy' to be a boolean dtype. Then, we can call
# :func:`create_dataset` by passing the dataframe and fixing the parameter
# ``attributes`` to ``'auto'``.
# Collect the keyword arguments once; ``attributes='auto'`` tells OpenML to
# derive the attribute specification from the DataFrame's dtypes.
weather_df_fields = dict(
    name="Weather",
    description=description,
    creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro',
    contributor=None,
    collection_date='01-01-2011',
    language='English',
    licence=None,
    default_target_attribute='play',
    row_id_attribute=None,
    ignore_attribute=None,
    citation=citation,
    attributes='auto',
    data=df,
    version_label='example',
)
weather_dataset = create_dataset(**weather_df_fields)
############################################################################
upload_did = weather_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
############################################################################
# Dataset is a sparse matrix
# ==========================
# The XOR truth table in COOrdinate format. The value 0.0 at (0, 0) is
# stored explicitly, matching the original example's stored entries.
_values = [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
_rows = [0, 1, 1, 2, 2, 3, 3]
_cols = [0, 1, 2, 0, 2, 0, 1]
sparse_data = coo_matrix((_values, (_rows, _cols)))
# Two real-valued inputs plus the real-valued XOR output 'y'.
column_names = [
    ('input1', 'REAL'),
    ('input2', 'REAL'),
    ('y', 'REAL'),
]
xor_dataset = create_dataset(
    name="XOR",
    description='Dataset representing the XOR operation',
    # This synthetic example has no authorship or citation meta-data.
    creator=None,
    contributor=None,
    collection_date=None,
    citation=None,
    language='English',
    licence=None,
    # The third column 'y' holds the XOR output.
    default_target_attribute='y',
    row_id_attribute=None,
    ignore_attribute=None,
    attributes=column_names,
    data=sparse_data,
    version_label='example',
)
############################################################################
upload_did = xor_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
############################################################################
# Dataset is a pandas sparse dataframe
# ====================================
# Rebuild the XOR matrix and wrap it in a DataFrame with sparse columns.
sparse_data = coo_matrix((
    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))
column_names = ['input1', 'input2', 'y']
# ``pd.SparseDataFrame`` was removed in pandas 1.0. The supported way to
# build a DataFrame with sparse columns from a scipy sparse matrix is the
# ``.sparse`` accessor's ``from_spmatrix`` constructor.
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
print(df.info())
# With a DataFrame the attribute specification is again inferred
# automatically by passing ``attributes='auto'``.
xor_df_fields = dict(
    name="XOR",
    description='Dataset representing the XOR operation',
    creator=None,
    contributor=None,
    collection_date=None,
    language='English',
    licence=None,
    default_target_attribute='y',
    row_id_attribute=None,
    ignore_attribute=None,
    citation=None,
    attributes='auto',
    data=df,
    version_label='example',
)
xor_dataset = create_dataset(**xor_df_fields)
############################################################################
upload_did = xor_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
############################################################################
# Revert the client back to the configuration that was active before
# ``start_using_configuration_for_example()`` was called above.
openml.config.stop_using_configuration_for_example()