Describe the bug
Hi, I previously reported a bug where RandomForestClassifier consumes more memory on newer versions, and I've now noticed that the SVM API does as well. The detailed test results are below (see also the minimal sketch after the table).
| Memory | Version |
|---|---|
| 274MB | 1.0.1 |
| 276MB | 0.20.3 |
| 180MB | 0.19.2 |
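For reference, here is a smaller, self-contained sketch of the same measurement that does not depend on the train.csv/test.csv files used in the full reproduction below. The synthetic data shape and the SVR/Bagging hyperparameters are placeholders chosen only so the script runs on its own; they are not the exact workload from the full script.

```python
# Minimal, self-contained sketch (assumed synthetic data, placeholder sizes):
# measure tracemalloc usage of a bagged linear SVR, once per scikit-learn version.
import tracemalloc

import numpy as np
import sklearn
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR

rng = np.random.RandomState(0)
X = rng.rand(20000, 64)   # placeholder feature matrix; shrink if fitting is too slow
y = rng.rand(20000)

tracemalloc.start()
model = BaggingRegressor(
    SVR(kernel='linear', C=1.0, epsilon=0.2),
    n_estimators=3,
    max_samples=20000,
    max_features=50,
)
model.fit(X, y)
model.predict(X[:1000])
current, peak = tracemalloc.get_traced_memory()
print(f"scikit-learn {sklearn.__version__}: "
      f"current={current / 1024 / 1024:.1f}MB, peak={peak / 1024 / 1024:.1f}MB")
```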
Steps/Code to Reproduce
```python
import math
import random

import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR


def random_init(**kwargs):
    # Seed every RNG we rely on so runs are reproducible
    random.seed(kwargs['seed'])
    np.random.seed(kwargs['seed'])
    torch.manual_seed(kwargs['seed'])
    torch.cuda.manual_seed(kwargs['seed'])
    torch.backends.cudnn.deterministic = True


def load_data(df, cv=False, target=False, **kwargs):
    # One-hot encode the categorical features and append the numerical ones
    num_samples = len(df)
    sample_size = sum([len(args['categories'][c]) for c in args['categories']]) + len(args['num_feats'])
    dataset = torch.zeros((num_samples, sample_size), dtype=torch.float)
    idx = 0
    for c in args['cat_feats']:
        for i in range(len(args['categories'][c])):
            dataset[np.array(df[c]) == args['categories'][c][i], idx] = 1.0
            idx += 1
    for n in args['num_feats']:
        dataset[:, idx] = torch.from_numpy(np.array(df[n]))
        idx += 1
    if target:
        targets = torch.from_numpy(np.array(df['target']))
    else:
        targets = None
    if cv == False:
        return dataset, targets
    # Shuffle and split into train / validation subsets
    idx = [i for i in range(num_samples)]
    random.shuffle(idx)
    trainset = dataset[idx[0:int(num_samples * (1 - kwargs['cv_percentage']))]]
    traintargets = targets[idx[0:int(num_samples * (1 - kwargs['cv_percentage']))]]
    validset = dataset[idx[int(num_samples * (1 - kwargs['cv_percentage'])):]]
    validtargets = targets[idx[int(num_samples * (1 - kwargs['cv_percentage'])):]]
    return trainset, validset, traintargets, validtargets


def get_stats(trainset):
    # Per-feature mean/std; one-hot columns get fixed stats so they are not rescaled
    mean = torch.mean(trainset, dim=0)
    std = torch.std(trainset, dim=0)
    for i in range(trainset.shape[1]):
        if ((trainset[:, i] == 0) | (trainset[:, i] == 1)).all():
            mean[i] = 0.5
            std[i] = 1.0
    return mean, std
# Global arguments
args = {
    'cv_percentage': 0.1,
    'seed': 0,
}

# Load data
random_init(**args)
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
args['cat_feats'] = [c for c in np.sort(train_data.columns) if 'cat' in c]
args['num_feats'] = [c for c in np.sort(train_data.columns) if 'cont' in c]
args['categories'] = {c: np.unique(train_data[c]) for c in args['cat_feats']}
testset, _ = load_data(test_data, cv=False, target=False, **args)
trainset, validset, traintargets, validtargets = load_data(train_data, cv=True, target=True, **args)
args['mean'], args['std'] = get_stats(trainset)
val_pred = {}
test_pred = {}

# Bagged SVRs
import tracemalloc
tracemalloc.start()
print('Bagged SVRs...')
# Observed: 0.19.2 -> 180MB, 0.20.3 -> 276MB

# SVR arguments
args = {**args, **{
    'kernel': 'linear',
    'n_estimators': 3,
    'max_samples': 20000,
    'max_features': 50,
}}
random_init(**args)
svrs = BaggingRegressor(SVR(kernel=args['kernel'], C=1.0, epsilon=0.2),
                        n_estimators=args['n_estimators'],
                        max_samples=args['max_samples'],
                        max_features=args['max_features'])
# Fit and predict on standardized features
svrs.fit(((trainset - args['mean']) / args['std']).numpy(), traintargets.numpy())
val_pred['svrs'] = svrs.predict(((validset - args['mean']) / args['std']).numpy())
test_pred['svrs'] = svrs.predict(((testset - args['mean']) / args['std']).numpy())
currentSVR, peakSVR = tracemalloc.get_traced_memory()
print(f"SVR current memory usage is {currentSVR / 1024 / 1024}MB; "
      f"SVR peak memory was {peakSVR / 1024 / 1024}MB")
```
Expected Results
The same memory usage across versions.
Actual Results
Newer versions use more memory.
Versions
scikit-learn 1.0.1, 0.20.3, 0.19.2
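If the full environment details are useful, they can be collected with scikit-learn's built-in helper, run once per environment being compared (the helper is available from 0.20 onward, so it will not work on the 0.19.2 environment):

```python
# Prints Python, scikit-learn, and dependency versions plus build info
# for the environment being tested (scikit-learn >= 0.20).
import sklearn
sklearn.show_versions()
```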