diff --git a/configs/blogs/skl_2021_3.json b/configs/blogs/skl_2021_3.json index 30ced64e4..c3e2f409b 100644 --- a/configs/blogs/skl_2021_3.json +++ b/configs/blogs/skl_2021_3.json @@ -307,7 +307,7 @@ } ], "nu": [0.25], - "kernel": ["sigmoid"] + "kernel": ["poly"] }, { "algorithm": "svr", diff --git a/datasets/loader_classification.py b/datasets/loader_classification.py index 5c8a91121..e4f0417b9 100644 --- a/datasets/loader_classification.py +++ b/datasets/loader_classification.py @@ -68,9 +68,11 @@ def airline(dataset_dir: Path) -> bool: Airline dataset http://kt.ijs.si/elena_ikonomovska/data.html - TaskType:binclass - NumberOfFeatures:13 - NumberOfInstances:115M + Classification task. n_classes = 2. + airline X train dataset (92055213, 13) + airline y train dataset (92055213, 1) + airline X test dataset (23013804, 13) + airline y test dataset (23013804, 1) """ dataset_name = 'airline' os.makedirs(dataset_dir, exist_ok=True) @@ -126,9 +128,12 @@ def airline(dataset_dir: Path) -> bool: def airline_ohe(dataset_dir: Path) -> bool: """ Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf - TaskType:binclass - NumberOfFeatures:700 - NumberOfInstances:10100000 + + Classification task. n_classes = 2. + airline-ohe X train dataset (1000000, 692) + airline-ohe y train dataset (1000000, 1) + airline-ohe X test dataset (100000, 692) + airline-ohe y test dataset (100000, 1) """ dataset_name = 'airline-ohe' os.makedirs(dataset_dir, exist_ok=True) @@ -289,9 +294,11 @@ def epsilon(dataset_dir: Path) -> bool: Epsilon dataset https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html - TaskType:binclass - NumberOfFeatures:2000 - NumberOfInstances:500K + Classification task. n_classes = 2. 
+ epsilon X train dataset (400000, 2000) + epsilon y train dataset (400000, 1) + epsilon X test dataset (100000, 2000) + epsilon y test dataset (100000, 1) """ dataset_name = 'epsilon' os.makedirs(dataset_dir, exist_ok=True) @@ -444,9 +451,11 @@ def higgs(dataset_dir: Path) -> bool: Higgs dataset from UCI machine learning repository https://archive.ics.uci.edu/ml/datasets/HIGGS - TaskType:binclass - NumberOfFeatures:28 - NumberOfInstances:11M + Classification task. n_classes = 2. + higgs X train dataset (8799999, 28) + higgs y train dataset (8799999, 1) + higgs X test dataset (2200000, 28) + higgs y test dataset (2200000, 1) """ dataset_name = 'higgs' os.makedirs(dataset_dir, exist_ok=True) @@ -479,9 +488,11 @@ def higgs_one_m(dataset_dir: Path) -> bool: Only first 1.5M samples is taken - TaskType:binclass - NumberOfFeatures:28 - NumberOfInstances:1.5M + Classification task. n_classes = 2. + higgs1m X train dataset (1000000, 28) + higgs1m y train dataset (1000000, 1) + higgs1m X test dataset (500000, 28) + higgs1m y test dataset (500000, 1) """ dataset_name = 'higgs1m' os.makedirs(dataset_dir, exist_ok=True) diff --git a/datasets/loader_multiclass.py b/datasets/loader_multiclass.py index 0c2013a1f..cd58b21b0 100644 --- a/datasets/loader_multiclass.py +++ b/datasets/loader_multiclass.py @@ -99,9 +99,11 @@ def covtype(dataset_dir: Path) -> bool: https://archive.ics.uci.edu/ml/datasets/covertype y contains 7 unique class labels from 1 to 7 inclusive. - TaskType:multiclass - NumberOfFeatures:54 - NumberOfInstances:581012 + Classification task. n_classes = 7. 
+ covtype X train dataset (464809, 54)
+ covtype y train dataset (464809, 1)
+ covtype X test dataset (116203, 54)
+ covtype y test dataset (116203, 1)
 """
 dataset_name = 'covtype'
 os.makedirs(dataset_dir, exist_ok=True)
@@ -125,9 +127,11 @@ def letters(dataset_dir: Path) -> bool:
 """
 http://archive.ics.uci.edu/ml/datasets/Letter+Recognition
- TaskType:multiclass
- NumberOfFeatures:16
- NumberOfInstances:20.000
+ Classification task. n_classes = 26.
+ letters X train dataset (16000, 16)
+ letters y train dataset (16000, 1)
+ letters X test dataset (4000, 16)
+ letters y test dataset (4000, 1)
 """
 dataset_name = 'letters'
 os.makedirs(dataset_dir, exist_ok=True)
@@ -204,9 +208,11 @@ def msrank(dataset_dir: Path) -> bool:
 """
 Dataset from szilard benchmarks: https://github.com/szilard/GBM-perf
- TaskType:multiclass
- NumberOfFeatures:137
- NumberOfInstances:1.2M
+ Multiclass classification task
+ msrank X train dataset (958671, 137)
+ msrank y train dataset (958671, 1)
+ msrank X test dataset (241521, 137)
+ msrank y test dataset (241521, 1)
 """
 dataset_name = 'msrank'
 os.makedirs(dataset_dir, exist_ok=True)
diff --git a/datasets/loader_regression.py b/datasets/loader_regression.py
index 2f330f799..4d90da2c6 100644
--- a/datasets/loader_regression.py
+++ b/datasets/loader_regression.py
@@ -32,9 +32,10 @@ def abalone(dataset_dir: Path) -> bool:
 """
 https://archive.ics.uci.edu/ml/machine-learning-databases/abalone
- TaskType:regression
- NumberOfFeatures:8
- NumberOfInstances:4177
+ abalone x train dataset (3341, 8)
+ abalone y train dataset (3341, 1)
+ abalone x test dataset (836, 8)
+ abalone y test dataset (836, 1)
 """
 dataset_name = 'abalone'
 os.makedirs(dataset_dir, exist_ok=True)
@@ -196,9 +197,10 @@ def year_prediction_msd(dataset_dir: Path) -> bool:
 """
 YearPredictionMSD dataset from UCI repository
 https://archive.ics.uci.edu/ml/datasets/yearpredictionmsd
- TaskType:regression
- NumberOfFeatures:90
- NumberOfInstances:515345
+ year_prediction_msd x train dataset 
(463715, 11)
+ year_prediction_msd y train dataset (463715, 1)
+ year_prediction_msd x test dataset (51630, 11)
+ year_prediction_msd y test dataset (51630, 1)
 """
 dataset_name = 'year_prediction_msd'
 os.makedirs(dataset_dir, exist_ok=True)
diff --git a/sklearn_bench/kmeans.py b/sklearn_bench/kmeans.py
index 4df5ba03f..6d3afa0e3 100644
--- a/sklearn_bench/kmeans.py
+++ b/sklearn_bench/kmeans.py
@@ -48,7 +48,8 @@ def main():
 def fit_kmeans(X, X_init):
 alg = KMeans(n_clusters=params.n_clusters, tol=params.tol,
- max_iter=params.maxiter, init=X_init, n_init=1)
+ max_iter=params.maxiter, init=X_init, n_init=params.n_init,
+ algorithm=params.algorithm, random_state=params.random_state)
 alg.fit(X)
 return alg
@@ -83,5 +84,12 @@ def fit_kmeans(X, X_init):
 parser.add_argument('--maxiter', type=int, default=100,
 help='Maximum number of iterations')
 parser.add_argument('--n-clusters', type=int, help='Number of clusters')
+ parser.add_argument('--algorithm', type=str, default='full',
+ help='K-means algorithm to use')
+ parser.add_argument('--n_init', type=int, default=10,
+ help='Number of time the k-means algorithm '
+ 'will be run with different centroid seeds')
+ parser.add_argument('--random_state', type=int, default=777,
+ help='Random state')
 params = bench.parse_args(parser)
 bench.run_with_context(params, main)