python在跑k均值聚类时候的时候,每次运行结果都不一样,求帮忙瞅瞅是哪出问题了?
import numpy as np
from sklearn.datasets import load_iris
iris = load_iris()
x,y =iris.data,iris.target
#x=np.array([[1,1,1,1],[10,10,10,10],[20,20,20,20],[5,5,5,5],[3,3,3,3],[6,6,6,6],[25,25,25,25]])
centroids_a=[]
list1=[]
#聚类中心
def init_random_centroids(k,x):
data1=x.shape[0]
suiji_a =np.random.choice(data1,k,replace=False)
for i in suiji_a:
xlist = x[i]
centroids_a.append(xlist)
centroids =np.array(centroids_a)
return centroids
#计算距离
def euclidean_distance(one_sample, x):
#print("中心点",x)
#print("样本",one_sample)
#print("单个样本",one_sample)
#print("中心",x)
distances= np.sum((x-one_sample)**2)
#print("距离",distances)
return distances
#返回离样本最近的中心索引
def _closest_centroid(sample, centroids):
k = centroids.shape[0]
#print("k",k)
#print("聚类中心",centroids)
list_distance =[]
for i in range(k):
a=euclidean_distance(one_sample=sample, x=centroids[i])
list_distance.append(a)
# print("list",list_distance)
distance_list=np.array(list_distance)
id =np.argmin(distance_list)
#print("单个样本与聚类中心的距离:",distance_list)
#print("距离最近的中心索引",id)
return id
#将所有样本进行归类,归类规则就是将该样本归类到与其最近的中心
def create_clusters(k,centroids, x):
clusters = [[] for _ in range(k)]
#print("cen",centroids)
for sample in x:
ys=_closest_centroid(sample, centroids)
#print(ys)
clusters[ys].append(sample)
#print("0",clusters[0])
#print("1",clusters[1])
#print("2",clusters[2])
return clusters
#中心点更新
def update_centroids(k,clusters):
#print(k)
# print("类型0",clusters[0])
# print("类型1",clusters[1])
# print("类型2",clusters[2])
for i in range(k):
centroid = np.mean(clusters[i], axis=0)
#print("平均聚点",centroid)
centroids[i] = centroid
print("新聚类中心",centroids)
return centroids
# 将所有样本进行归类,其所在的类别的索引就是其类别标签
def get_cluster_labels(clusters, x):
#print(clusters)
y_pred = []
for sample in x:
ys=_closest_centroid(sample=sample, centroids=centroids)
y_pred.append(ys)
print(y_pred)
return y_pred
#随机选取k个聚类中心
centroids = init_random_centroids(3, x)
for number in range(20000000):
#样本归类
cluster = create_clusters(k=3,centroids=centroids,x=x)
former_centroids = centroids
#更新新的聚类中心
clusters = update_centroids(k=3,clusters=cluster)
diff = centroids - former_centroids
if diff.any() <0.000001:
get_cluster_labels(clusters=cluster,x=x)
break