import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# --- Configuration -----------------------------------------------------
inputfile = 'consumption_data.xls'  # source workbook; rows are indexed by its 'Id' column
outputfile = 'data_type.xlsx'  # workbook receiving the records plus their cluster label
k = 3  # number of clusters
iteration = 500  # upper bound on K-Means iterations

# --- Load and standardize ----------------------------------------------
data = pd.read_excel(inputfile, index_col='Id')
# Z-score each column: subtract the column mean, divide by the column
# (sample) standard deviation.
data_zs = 1.0 * (data - data.mean()) / data.std()

# --- Cluster -------------------------------------------------------------
# NOTE: ``n_jobs`` is deliberately not passed. scikit-learn deprecated the
# argument in 0.23 and removed it in 1.0, where passing it raises TypeError.
model = KMeans(n_clusters=k, max_iter=iteration, random_state=1234)
model.fit(data_zs)

# --- Per-cluster summary -------------------------------------------------
r1 = pd.Series(model.labels_).value_counts()  # number of records per cluster label
r2 = pd.DataFrame(model.cluster_centers_)  # centers, in standardized units
# Both frames are indexed by cluster label, so the column-wise concat
# aligns each center with its record count.
r = pd.concat([r2, r1], axis=1)
r.columns = list(data.columns) + ['number']
print(r)

# --- Per-record output ---------------------------------------------------
# Attach each record's cluster label (column 'sorts') to the original,
# unstandardized data and write the result out.
r = pd.concat([data, pd.Series(model.labels_, index=data.index)], axis=1)
r.columns = list(data.columns) + ['sorts']
r.to_excel(outputfile)
def density_plot(data):
    """Draw a kernel-density estimate of every column of ``data``.

    Each column gets its own subplot (``subplots=True``) with an
    independent x axis, and every subplot's y axis is labelled 'density'.

    Parameters
    ----------
    data : pandas.DataFrame
        The columns to plot.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module itself; its current figure is the
        one just drawn, so callers can chain ``.savefig(...)``.
    """
    # To render Chinese labels, a CJK-capable font can be configured here:
    # plt.rcParams['font.sans-serif'] = ['DejaVuSans.ttc']
    # Make the minus sign render correctly.
    plt.rcParams['axes.unicode_minus'] = False
    # Plot exactly once: a second identical call would only create an extra,
    # unused figure.
    axes = data.plot(kind='kde', linewidth=2, subplots=True, sharex=False)
    # Label every subplot actually produced (one per column) instead of
    # relying on an unrelated global count.
    for ax in axes:
        ax.set_ylabel(u'density')
    plt.legend()
    return plt
# Save one density figure per cluster, named '<pic_output><label>.png'.
pic_output = 'pd'  # file-name prefix of the saved figures
for i in range(k):
    # Plot only the records whose cluster label ('sorts' column of r) is i.
    density_plot(data[r[u'sorts'] == i]).savefig(u'%s%s.png' % (pic_output, i))
    # pyplot keeps every figure alive until it is closed explicitly;
    # release them after saving so they do not pile up across iterations.
    plt.close('all')