import pandas as pd
import re
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
# Locations of the raw sales tables, relative to the working directory.
path1 = './data/train_sales.csv'
# NOTE: path2 is defined here but is not read anywhere in the visible script.
path2 = './data/test_sales.csv'

# Load the training table; the file is decoded as GBK text.
df1 = pd.read_csv(
    filepath_or_buffer=path1,
    encoding='gbk',
)
def func1(x):
    """Return the mean and the spread of the integers embedded in a string.

    Every maximal run of decimal digits in ``x`` is parsed as an integer.
    For example ``'5-8W'`` yields the numbers 5 and 8, giving
    ``(6.5, 3)``.

    Args:
        x: Text to scan for digit runs. Non-string values (such as a NaN
            placeholder for a missing cell) are accepted and treated as
            containing no numbers.

    Returns:
        A 2-tuple ``(mean, max - min)`` of the extracted integers, or
        ``(nan, nan)`` when ``x`` is not a string or contains no digits.
    """
    nan = float('nan')
    # Missing cells arrive as float NaN; re.findall would raise TypeError.
    if not isinstance(x, str):
        return nan, nan
    numbers = [int(token) for token in re.findall(r'\d+', x)]
    # No digits at all: avoid dividing by zero / calling max() on nothing.
    if not numbers:
        return nan, nan
    return sum(numbers) / len(numbers), max(numbers) - min(numbers)
# --- Column cleaning ---------------------------------------------------------
# Keep only the first character of each TR entry and parse it as an integer.
# NOTE(review): a value of 10 or more would be truncated to its first digit,
# and this assumes TR was read as strings -- confirm against the CSV.
df1['TR'] = df1.TR.apply(lambda x: int(x[0]))

# A cell that is exactly '-' is treated as a missing value.
df1 = df1.replace('-', np.nan)

# Entries ending in '日' are discarded as missing (presumably date-like text
# introduced by a spreadsheet -- TODO confirm). The isinstance guard lets NaN
# cells pass through unchanged instead of failing on indexing.
df1['rated_passenger'] = df1.rated_passenger.apply(
    lambda x: np.nan if isinstance(x, str) and x.endswith('日') else x
)

# For 'a/b' style entries keep the part before the first '/' as a float.
# str() turns NaN into 'nan', which float() parses back to NaN.
df1['engine_torque'] = df1.engine_torque.apply(
    lambda x: float(str(x).split('/')[0])
)
df1['power'] = df1.power.apply(lambda x: float(str(x).split('/')[0]))

# Binary flag: 1 where if_charging equals 'L', 0 otherwise (including NaN).
df1['if_charging'] = (df1.if_charging == 'L').astype(int)

# These columns are cast to float explicitly.
cols = ['level_id', 'price', 'fuel_type_id', 'rated_passenger']
df1 = df1.astype({c: float for c in cols})

# One-hot encode the given column; the dummy columns are appended at the end.
df1 = pd.get_dummies(df1, columns=['gearbox_type'])

# Derive two numeric features from the price_level text, then drop the text.
# Building a frame on df1's own index keeps rows aligned for any index.
price_features = pd.DataFrame(
    df1.price_level.apply(func1).tolist(),
    columns=['mean-price', 'price-diff'],
    index=df1.index,
)
df1['mean-price'] = price_features['mean-price']
df1['price-diff'] = price_features['price-diff']
del df1['price_level']

# Impute remaining gaps with the column mean. numeric_only=True keeps this
# working on pandas versions where mean() raises on non-numeric columns.
df1 = df1.fillna(df1.mean(numeric_only=True))

# --- Model fitting -----------------------------------------------------------
Y = df1.sale_quantity
# Features are every column from position 3 onwards. The target is dropped
# explicitly so it can never leak into X, whatever the column order is.
X = df1[df1.columns[3:]].drop(columns=['sale_quantity'], errors='ignore')

# Fixed seeds make the split, the forest and the reported errors repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
model = RandomForestRegressor(random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
y_train_p = model.predict(x_train)

# --- Reporting ---------------------------------------------------------------
print(df1.head())
print(df1.shape)
print('train mae: %.4f' % mean_absolute_error(y_train, y_train_p))
print('test mae: %.4f' % mean_absolute_error(y_test, y_pred))