forked from openml/openml-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_upload_tutorial.py
More file actions
318 lines (277 loc) · 10.5 KB
/
create_upload_tutorial.py
File metadata and controls
318 lines (277 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
"""
Dataset upload tutorial
=======================
A tutorial on how to create and upload a dataset to OpenML.
"""
import numpy as np
import pandas as pd
import sklearn.datasets
from scipy.sparse import coo_matrix
import openml
from openml.datasets.functions import create_dataset
############################################################################
# .. warning:: This example uploads data. For that reason, this example
# connects to the test server at test.openml.org. This prevents the main
# server from crowding with example datasets, tasks, runs, and so on.
#
# Point the openml-python client at the test server; the original
# configuration is restored by ``stop_using_configuration_for_example()``
# at the end of this script.
openml.config.start_using_configuration_for_example()
############################################################################
############################################################################
# Below we will cover the following cases of the dataset object:
#
# * A numpy array
# * A list
# * A pandas dataframe
# * A sparse matrix
# * A pandas sparse dataframe
############################################################################
# Dataset is a numpy array
# ========================
# A numpy array can contain lists in the case of dense data or it can contain
# OrderedDicts in the case of sparse data.
#
# Prepare dataset
# ^^^^^^^^^^^^^^^
# Fetch the diabetes example dataset from scikit-learn; we will upload it to
# OpenML.org via the API.
diabetes = sklearn.datasets.load_diabetes()
name = 'Diabetes(scikit-learn)'
X = diabetes.data
y = diabetes.target
description = diabetes.DESCR
############################################################################
# OpenML does not distinguish between the attributes and targets on the data
# level and stores all data in a single matrix.
#
# The target feature is indicated as meta-data of the dataset (and tasks on
# that data). Append the target as the final column of the feature matrix;
# ``column_stack`` turns the 1-D target into a column automatically.
data = np.column_stack((X, y))
attribute_names = list(diabetes.feature_names)
# Every input feature is real-valued; the appended target column is integer.
attributes = [(feature, 'REAL') for feature in attribute_names]
attributes.append(('class', 'INTEGER'))
citation = (
    "Bradley Efron, Trevor Hastie, Iain Johnstone and "
    "Robert Tibshirani (2004) (Least Angle Regression) "
    "Annals of Statistics (with discussion), 407-499"
)
paper_url = (
    'http://web.stanford.edu/~hastie/Papers/'
    'LARS/LeastAngle_2002.pdf'
)
############################################################################
# Create the dataset object
# ^^^^^^^^^^^^^^^^^^^^^^^^^
# The definition of all fields can be found in the XSD files describing the
# expected format:
#
# https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd
#
# Collect all meta-data in one mapping first, then pass it to
# :func:`create_dataset` in a single call.
diabetes_fields = {
    # Unique name, at most 128 characters, restricted to
    # a-z, A-Z, 0-9 and the special characters _\-\.(),
    'name': name,
    # Textual description of the dataset.
    'description': description,
    # The person who created the dataset.
    'creator': 'Bradley Efron, Trevor Hastie, '
               'Iain Johnstone and Robert Tibshirani',
    # People who contributed to the current version of the dataset.
    'contributor': None,
    # The date the data was originally collected, given by the uploader.
    'collection_date': '09-01-2012',
    # Starts with 1 upper case letter, rest lower case, e.g. 'English'.
    'language': 'English',
    # License under which the data is/will be distributed.
    'licence': 'BSD (from scikit-learn)',
    # Name of the target; can also have multiple comma-separated values.
    'default_target_attribute': 'class',
    # The attribute that represents the row-id column, if present.
    'row_id_attribute': None,
    # Attributes to be excluded in modelling (identifiers, indexes, ...).
    'ignore_attribute': None,
    # How to cite the paper.
    'citation': citation,
    # Attribute specification and the data itself.
    'attributes': attributes,
    'data': data,
    # A version label which is provided by the user.
    'version_label': 'test',
    'original_data_url': (
        'http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html'
    ),
    'paper_url': paper_url,
}
diabetes_dataset = create_dataset(**diabetes_fields)
############################################################################
upload_did = diabetes_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
############################################################################
# Dataset is a list
# =================
# A list can contain lists in the case of dense data or it can contain
# OrderedDicts in the case of sparse data.
#
# Weather dataset:
# http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html
# The 14 weather observations; each row is
# [outlook, temperature, humidity, windy, play].
data = [
    ['sunny', 85, 85, 'FALSE', 'no'],
    ['sunny', 80, 90, 'TRUE', 'no'],
    ['overcast', 83, 86, 'FALSE', 'yes'],
    ['rainy', 70, 96, 'FALSE', 'yes'],
    ['rainy', 68, 80, 'FALSE', 'yes'],
    ['rainy', 65, 70, 'TRUE', 'no'],
    ['overcast', 64, 65, 'TRUE', 'yes'],
    ['sunny', 72, 95, 'FALSE', 'no'],
    ['sunny', 69, 70, 'FALSE', 'yes'],
    ['rainy', 75, 80, 'FALSE', 'yes'],
    ['sunny', 75, 70, 'TRUE', 'yes'],
    ['overcast', 72, 90, 'TRUE', 'yes'],
    ['overcast', 81, 75, 'FALSE', 'yes'],
    ['rainy', 71, 91, 'TRUE', 'no'],
]
# Attribute specification: nominal attributes list their allowed values,
# numeric attributes are tagged 'REAL'.
attribute_names = [
    ('outlook', ['sunny', 'overcast', 'rainy']),
    ('temperature', 'REAL'),
    ('humidity', 'REAL'),
    ('windy', ['TRUE', 'FALSE']),
    ('play', ['yes', 'no']),
]
description = (
    'The weather problem is a tiny dataset that we will use repeatedly'
    ' to illustrate machine learning methods. Entirely fictitious, it '
    'supposedly concerns the conditions that are suitable for playing '
    'some unspecified game. In general, instances in a dataset are '
    'characterized by the values of features, or attributes, that measure '
    'different aspects of the instance. In this case there are four '
    'attributes: outlook, temperature, humidity, and windy. '
    'The outcome is whether to play or not.'
)
# Fix: the original string concatenation was missing the space after
# "ITPro," which produced "ITPro,Data mining" in the published citation.
citation = (
    'I. H. Witten, E. Frank, M. A. Hall, and ITPro, '
    'Data mining practical machine learning tools and techniques, '
    'third edition. Burlington, Mass.: Morgan Kaufmann Publishers, 2011'
)
weather_dataset = create_dataset(
    # Who made the data, and when.
    name="Weather",
    creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro',
    contributor=None,
    collection_date='01-01-2011',
    language='English',
    licence=None,
    description=description,
    citation=citation,
    # 'play' is the classification target; there is no row-id column and
    # no attribute needs to be ignored.
    default_target_attribute='play',
    row_id_attribute=None,
    ignore_attribute=None,
    # The payload: explicit attribute specification plus list-of-lists data.
    attributes=attribute_names,
    data=data,
    version_label='example',
)
############################################################################
upload_did = weather_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
############################################################################
# Dataset is a pandas DataFrame
# =============================
# It might happen that your dataset is made of heterogeneous data which can
# usually be stored as a Pandas DataFrame. A DataFrame offers the advantage
# of storing the type of data for each column as well as the attribute names.
# Therefore, when providing a Pandas DataFrame, OpenML can infer this
# information without the need to specifically provide it when calling the
# function :func:`create_dataset`. In this regard, you only need to pass
# ``'auto'`` to the ``attributes`` parameter.
# Column labels are the first element of each (name, type) pair.
column_labels = [label for label, _ in attribute_names]
df = pd.DataFrame(data, columns=column_labels)
# Give the non-numeric columns explicit dtypes in one pass so OpenML can
# infer the attribute types from the DataFrame.
df = df.astype({'outlook': 'category', 'windy': 'bool', 'play': 'category'})
print(df.info())
############################################################################
# We enforce the columns 'outlook' and 'play' to be a categorical dtype and
# the column 'windy' to be a boolean dtype. Then, we can call
# :func:`create_dataset` by passing the dataframe and fixing the parameter
# ``attributes`` to ``'auto'``.
# Collect the keyword arguments once; ``attributes='auto'`` tells OpenML to
# derive the attribute specification from the DataFrame's dtypes.
weather_df_fields = dict(
    name="Weather",
    description=description,
    creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro',
    contributor=None,
    collection_date='01-01-2011',
    language='English',
    licence=None,
    default_target_attribute='play',
    row_id_attribute=None,
    ignore_attribute=None,
    citation=citation,
    attributes='auto',
    data=df,
    version_label='example',
)
weather_dataset = create_dataset(**weather_df_fields)
############################################################################
upload_did = weather_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
############################################################################
# Dataset is a sparse matrix
# ==========================
# The XOR truth table in COOrdinate format. The value 0.0 at (0, 0) is
# stored explicitly, matching the original example's stored entries.
_values = [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
_rows = [0, 1, 1, 2, 2, 3, 3]
_cols = [0, 1, 2, 0, 2, 0, 1]
sparse_data = coo_matrix((_values, (_rows, _cols)))
# Two real-valued inputs plus the real-valued XOR output 'y'.
column_names = [
    ('input1', 'REAL'),
    ('input2', 'REAL'),
    ('y', 'REAL'),
]
xor_dataset = create_dataset(
    name="XOR",
    description='Dataset representing the XOR operation',
    # This synthetic example has no authorship or citation meta-data.
    creator=None,
    contributor=None,
    collection_date=None,
    citation=None,
    language='English',
    licence=None,
    # The third column 'y' holds the XOR output.
    default_target_attribute='y',
    row_id_attribute=None,
    ignore_attribute=None,
    attributes=column_names,
    data=sparse_data,
    version_label='example',
)
############################################################################
upload_did = xor_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
############################################################################
# Dataset is a pandas sparse dataframe
# ====================================
# Rebuild the XOR matrix and wrap it in a DataFrame with sparse columns.
sparse_data = coo_matrix((
    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
))
column_names = ['input1', 'input2', 'y']
# ``pd.SparseDataFrame`` was removed in pandas 1.0. The supported way to
# build a DataFrame with sparse columns from a scipy sparse matrix is the
# ``.sparse`` accessor's ``from_spmatrix`` constructor.
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
print(df.info())
# With a DataFrame the attribute specification is again inferred
# automatically by passing ``attributes='auto'``.
xor_df_fields = dict(
    name="XOR",
    description='Dataset representing the XOR operation',
    creator=None,
    contributor=None,
    collection_date=None,
    language='English',
    licence=None,
    default_target_attribute='y',
    row_id_attribute=None,
    ignore_attribute=None,
    citation=None,
    attributes='auto',
    data=df,
    version_label='example',
)
xor_dataset = create_dataset(**xor_df_fields)
############################################################################
upload_did = xor_dataset.publish()
print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
############################################################################
# Revert the client back to the configuration that was active before
# ``start_using_configuration_for_example()`` was called above.
openml.config.stop_using_configuration_for_example()