From c9f616593fec5be29d03c759eb2936923fd2735f Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Tue, 20 Jul 2021 15:52:39 +0300 Subject: [PATCH 1/4] minor fixes --- configs/blogs/skl_2021_3.json | 2 +- sklearn_bench/kmeans.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/configs/blogs/skl_2021_3.json b/configs/blogs/skl_2021_3.json index 30ced64e4..c3e2f409b 100644 --- a/configs/blogs/skl_2021_3.json +++ b/configs/blogs/skl_2021_3.json @@ -307,7 +307,7 @@ } ], "nu": [0.25], - "kernel": ["sigmoid"] + "kernel": ["poly"] }, { "algorithm": "svr", diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 4df5ba03f..a7da397b0 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -48,7 +48,8 @@ def main(): def fit_kmeans(X, X_init): alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, - max_iter=params.maxiter, init=X_init, n_init=1) + max_iter=params.maxiter, init=X_init, n_init=params.n_init, + algorithm=params.algorithm) alg.fit(X) return alg @@ -83,5 +84,9 @@ def fit_kmeans(X, X_init): parser.add_argument('--maxiter', type=int, default=100, help='Maximum number of iterations') parser.add_argument('--n-clusters', type=int, help='Number of clusters') + parser.add_argument('--algorithm', type=str, default='full', + help='K-means algorithm to use') + parser.add_argument('--n_init', type=int, default=10, + help='Number of time the k-means algorithm will be run with different centroid seeds') params = bench.parse_args(parser) bench.run_with_context(params, main) From 7c33dcebfd10a98e318d985e5b309f453e320101 Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Tue, 20 Jul 2021 15:56:22 +0300 Subject: [PATCH 2/4] pep8 --- sklearn_bench/kmeans.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index a7da397b0..076af45e2 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -87,6 +87,7 @@ def fit_kmeans(X, X_init): 
parser.add_argument('--algorithm', type=str, default='full', help='K-means algorithm to use') parser.add_argument('--n_init', type=int, default=10, - help='Number of time the k-means algorithm will be run with different centroid seeds') + help='Number of time the k-means algorithm ' + 'will be run with different centroid seeds') params = bench.parse_args(parser) bench.run_with_context(params, main) From 46bf47841ecad96ace1da719636872739a7a507f Mon Sep 17 00:00:00 2001 From: OnlyDeniko Date: Tue, 20 Jul 2021 16:03:11 +0300 Subject: [PATCH 3/4] random state --- sklearn_bench/kmeans.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py index 076af45e2..6d3afa0e3 100644 --- a/sklearn_bench/kmeans.py +++ b/sklearn_bench/kmeans.py @@ -49,7 +49,7 @@ def main(): def fit_kmeans(X, X_init): alg = KMeans(n_clusters=params.n_clusters, tol=params.tol, max_iter=params.maxiter, init=X_init, n_init=params.n_init, - algorithm=params.algorithm) + algorithm=params.algorithm, random_state=params.random_state) alg.fit(X) return alg @@ -89,5 +89,7 @@ def fit_kmeans(X, X_init): parser.add_argument('--n_init', type=int, default=10, help='Number of time the k-means algorithm ' 'will be run with different centroid seeds') + parser.add_argument('--random_state', type=int, default=777, + help='Random state') params = bench.parse_args(parser) bench.run_with_context(params, main) From e739ebafed97c98a0d751a04659f1aae2bfe4c72 Mon Sep 17 00:00:00 2001 From: "denis.kulandin" Date: Thu, 29 Jul 2021 11:01:46 +0300 Subject: [PATCH 4/4] size of datasets --- datasets/loader_classification.py | 41 ++++++++++++++++++++----------- datasets/loader_multiclass.py | 24 +++++++++++------- datasets/loader_regression.py | 14 ++++++----- 3 files changed, 49 insertions(+), 30 deletions(-) diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index 5c8a91121..e4f0417b9 100644 --- a/datasets/loader_classification.py +++ 
b/datasets/loader_classification.py @@ -68,9 +68,11 @@ def airline(dataset_dir: Path) -> bool: Airline dataset http://kt.ijs.si/elena_ikonomovska/data.html - TaskType:binclass - NumberOfFeatures:13 - NumberOfInstances:115M + Classification task. n_classes = 2. + airline X train dataset (92055213, 13) + airline y train dataset (92055213, 1) + airline X test dataset (23013804, 13) + airline y test dataset (23013804, 1) """ dataset_name = 'airline' os.makedirs(dataset_dir, exist_ok=True) @@ -126,9 +128,12 @@ def airline(dataset_dir: Path) -> bool: def airline_ohe(dataset_dir: Path) -> bool: """ Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - TaskType:binclass - NumberOfFeatures:700 - NumberOfInstances:10100000 + + Classification task. n_classes = 2. + airline-ohe X train dataset (1000000, 692) + airline-ohe y train dataset (1000000, 1) + airline-ohe X test dataset (100000, 692) + airline-ohe y test dataset (100000, 1) """ dataset_name = 'airline-ohe' os.makedirs(dataset_dir, exist_ok=True) @@ -289,9 +294,11 @@ def epsilon(dataset_dir: Path) -> bool: Epsilon dataset https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - TaskType:binclass - NumberOfFeatures:2000 - NumberOfInstances:500K + Classification task. n_classes = 2. + epsilon X train dataset (400000, 2000) + epsilon y train dataset (400000, 1) + epsilon X test dataset (100000, 2000) + epsilon y test dataset (100000, 1) """ dataset_name = 'epsilon' os.makedirs(dataset_dir, exist_ok=True) @@ -444,9 +451,11 @@ def higgs(dataset_dir: Path) -> bool: Higgs dataset from UCI machine learning repository https://archive.ics.uci.edu/ml/datasets/HIGGS - TaskType:binclass - NumberOfFeatures:28 - NumberOfInstances:11M + Classification task. n_classes = 2. 
+ higgs X train dataset (8799999, 28) + higgs y train dataset (8799999, 1) + higgs X test dataset (2200000, 28) + higgs y test dataset (2200000, 1) """ dataset_name = 'higgs' os.makedirs(dataset_dir, exist_ok=True) @@ -479,9 +488,11 @@ def higgs_one_m(dataset_dir: Path) -> bool: Only first 1.5M samples is taken - TaskType:binclass - NumberOfFeatures:28 - NumberOfInstances:1.5M + Classification task. n_classes = 2. + higgs1m X train dataset (1000000, 28) + higgs1m y train dataset (1000000, 1) + higgs1m X test dataset (500000, 28) + higgs1m y test dataset (500000, 1) """ dataset_name = 'higgs1m' os.makedirs(dataset_dir, exist_ok=True) diff --git a/datasets/loader_multiclass.py b/datasets/loader_multiclass.py index 0c2013a1f..cd58b21b0 100644 --- a/datasets/loader_multiclass.py +++ b/datasets/loader_multiclass.py @@ -99,9 +99,11 @@ def covtype(dataset_dir: Path) -> bool: https://archive.ics.uci.edu/ml/datasets/covertype y contains 7 unique class labels from 1 to 7 inclusive. - TaskType:multiclass - NumberOfFeatures:54 - NumberOfInstances:581012 + Classification task. n_classes = 7. + covtype X train dataset (464809, 54) + covtype y train dataset (464809, 1) + covtype X test dataset (116203, 54) + covtype y test dataset (116203, 1) """ dataset_name = 'covtype' os.makedirs(dataset_dir, exist_ok=True) @@ -125,9 +127,11 @@ def letters(dataset_dir: Path) -> bool: """ http://archive.ics.uci.edu/ml/datasets/Letter+Recognition - TaskType:multiclass - NumberOfFeatures:16 - NumberOfInstances:20.000 + Classification task. n_classes = 26. 
+ letters X train dataset (16000, 16) + letters y train dataset (16000, 1) + letters X test dataset (4000, 16) + letters y test dataset (4000, 1) """ dataset_name = 'letters' os.makedirs(dataset_dir, exist_ok=True) @@ -204,9 +208,11 @@ def msrank(dataset_dir: Path) -> bool: """ Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - TaskType:multiclass - NumberOfFeatures:137 - NumberOfInstances:1.2M + Multiclass classification task + msrank X train dataset (958671, 137) + msrank y train dataset (958671, 1) + msrank X test dataset (241521, 137) + msrank y test dataset (241521, 1) """ dataset_name = 'msrank' os.makedirs(dataset_dir, exist_ok=True) diff --git a/datasets/loader_regression.py b/datasets/loader_regression.py index 2f330f799..4d90da2c6 100644 --- a/datasets/loader_regression.py +++ b/datasets/loader_regression.py @@ -32,9 +32,10 @@ def abalone(dataset_dir: Path) -> bool: """ https://archive.ics.uci.edu/ml/machine-learning-databases/abalone - TaskType:regression - NumberOfFeatures:8 - NumberOfInstances:4177 + abalone x train dataset (3341, 8) + abalone y train dataset (3341, 1) + abalone x test dataset (836, 8) + abalone y test dataset (836, 1) """ dataset_name = 'abalone' os.makedirs(dataset_dir, exist_ok=True) @@ -196,9 +197,10 @@ def year_prediction_msd(dataset_dir: Path) -> bool: """ YearPredictionMSD dataset from UCI repository https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd - TaskType:regression - NumberOfFeatures:90 - NumberOfInstances:515345 + year_prediction_msd x train dataset (463715, 11) + year_prediction_msd y train dataset (463715, 1) + year_prediction_msd x test dataset (51630, 11) + year_prediction_msd y test dataset (51630, 1) """ dataset_name = 'year_prediction_msd' os.makedirs(dataset_dir, exist_ok=True)