import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers,Model
import warnings
warnings.filterwarnings('ignore')
def data_processed(train_path, properties_path):
    """Load one year's sales and property tables and align them by parcel.

    Parameters
    ----------
    train_path
        Path of the transactions CSV. It must contain the columns
        ``parcelid``, ``transactiondate`` and ``logerror``.
    properties_path
        Path of the properties CSV. It must contain ``parcelid`` and the
        three text columns that are dropped below.

    Returns
    -------
    tuple
        ``(features, list_date, logerror)``:

        * ``features`` - DataFrame with one property row per sold parcel
          (the ``parcelid`` column is kept), sorted by ascending parcelid.
        * ``list_date`` - list of ``int`` sale dates encoded as ``YYYYMMDD``.
        * ``logerror`` - Series holding the ``logerror`` of each sale.

        All three have the same length and the same parcel order.
    """
    sales = pd.read_csv(train_path)
    # A parcel may appear several times; keep only its last row in the file.
    sales = sales.drop_duplicates(subset=['parcelid'], keep='last')
    sales = sales.sort_values('parcelid', ascending=True)

    props = pd.read_csv(properties_path)
    # Drop the text-valued columns.
    props = props.drop(
        columns=['propertycountylandusecode', 'propertyzoningdesc', 'taxdelinquencyflag']
    )
    # One property row per parcel, otherwise the rows below would no longer
    # line up one-to-one with the sales.
    props = props.drop_duplicates(subset=['parcelid'], keep='last')
    features = props[props.parcelid.isin(sales.parcelid)].sort_values(
        'parcelid', ascending=True
    )

    # Discard sales that have no property row so that features, dates and
    # targets stay row-aligned.
    sales = sales[sales.parcelid.isin(features.parcelid)]

    # Encode each sale date as the integer YYYYMMDD in one vectorized pass.
    sale_dates = pd.to_datetime(sales['transactiondate'])
    list_date = sale_dates.dt.strftime('%Y%m%d').astype(int).tolist()

    return features, list_date, sales['logerror']
def data_std(a):
    """Standardize ``a`` to zero mean and unit standard deviation.

    ``a`` is anything that provides ``.mean()`` and ``.std()`` and supports
    arithmetic with their results (for example a NumPy array or a pandas
    Series). The statistics are whatever those methods return, so a NumPy
    array is standardized with a single global mean/std over all elements.

    When the standard deviation is a scalar equal to zero (constant input)
    the centered values, i.e. all zeros, are returned instead of the NaN
    that 0/0 would produce. Missing values are not treated specially.

    Returns a new object; ``a`` itself is not modified.
    """
    mean = a.mean()
    std = a.std()
    centered = a - mean
    # Constant input: dividing by a zero spread would turn everything into NaN.
    if np.ndim(std) == 0 and std == 0:
        return centered
    return centered / std
# Load both years: property features, sale dates (YYYYMMDD ints) and targets.
features1, list_date1, logerror1 = data_processed("data\\train_2016.csv", "data\\properties_2016.csv")
features2, list_date2, logerror2 = data_processed("data\\train_2017.csv", "data\\properties_2017.csv")

# Stack the 2016 and 2017 samples.
features = pd.concat([features1, features2]).values
list_date = np.array(list_date1 + list_date2)
logerror = pd.concat([logerror1, logerror2]).values

# Standardize the inputs.
# NOTE(review): data_std uses one mean/std for the whole matrix. If the
# property table contains missing values they would turn that mean, and
# therefore every feature, into NaN - confirm and impute if needed.
features = data_std(features).astype("float32")

# Keep the date statistics so that prediction dates can later be transformed
# exactly like the training dates.
date_mean = list_date.mean()
date_scale = list_date.std()
list_date = ((list_date - date_mean) / date_scale).astype("float32")

# The model takes one date value per sample and emits one value per sample,
# so feed both as explicit column vectors.
list_date = list_date.reshape(-1, 1)
logerror = logerror.astype("float32").reshape(-1, 1)

# Build the date inputs for the months to predict.
n_samples = features.shape[0]
print(features.shape, n_samples)
# Standardize with the TRAINING statistics: standardizing a constant array
# against itself gives 0/0.
# NOTE(review): training dates are encoded YYYYMMDD while these values are
# YYYYMM - confirm the intended encoding (e.g. 20161001) before predicting.
test_date201610 = ((np.full(n_samples, 201610) - date_mean) / date_scale).astype("float32")
test_date201610 = test_date201610.reshape(-1, 1)
test_date201611 = np.full(n_samples, 201611)
test_date201612 = np.full(n_samples, 201612)
test_date201710 = np.full(n_samples, 201710)
test_date201711 = np.full(n_samples, 201711)
test_date201712 = np.full(n_samples, 201712)

# The network has two inputs: the property attributes and the sale date.
num_dates = 1
num_feas = features.shape[1]  # derived from the data instead of hard-coded
date_input = keras.Input(shape=(num_dates,), dtype="float32")
fea_input = keras.Input(shape=(num_feas,), dtype="float32")
hidden = layers.Concatenate()([fea_input, date_input])
hidden = layers.Dense(128, activation="relu")(hidden)
hidden = layers.Dense(64, activation="relu")(hidden)
outputs = layers.Dense(1)(hidden)
model = keras.Model(inputs=[fea_input, date_input], outputs=outputs)
model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
model.summary()  # prints the table itself; wrapping it in print() adds "None"

print(features.shape, list_date.shape, logerror.shape)
print(features.dtype, list_date.dtype, logerror.dtype, test_date201610.dtype)

# Train on the NumPy arrays. The keras.Input objects above are symbolic
# placeholders that only describe the graph; passing them to fit() is an error.
model.fit([features, list_date], logerror, epochs=20, batch_size=64)

# NOTE: this evaluates on the training data, not on a held-out set.
# The second value is the compiled "mae" metric, not an accuracy.
loss, mae = model.evaluate([features, list_date], logerror)
print('\ntest loss', loss)
print('mae', mae)
# x = model.predict([features, test_date201610])
# print(x)
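# NOTE: the dtypes were changed and the shapes were aligned, but an error is still raised; the cause is not yet understood.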