以下是我的代码,想问一下为什么运行后会出现Traceback (most recent call last):
File "\test1.py", line 167, in
labels = density_based_clustering(data)
File "\test1.py", line 146, in density_based_clustering
cluster_centers = get_cluster_centers(partition, decision_values)
File "\test1.py", line 77, in get_cluster_centers
cluster_centers = data[decision_values > mean_decision_value]
IndexError: boolean index did not match indexed array along dimension 0; dimension is 1 but corresponding boolean dimension is 1278的错误,该如何修改?
import math
import copy
import numpy as np
from sklearn.cluster import DBSCAN
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics.pairwise as pairwise
def density_based_clustering(data):
# 步骤 1: 计算距离矩阵
distance_matrix = pairwise.euclidean_distances(data)
n = len(data)
dc = np.percentile(distance_matrix, 2)
def calculate_local_density(distance_matrix, dc):
"""
计算局部密度
参数:
distance_matrix: 距离矩阵
dc: 截断距离
返回值:
local_density: 每个样本点的局部密度
"""
local_density = np.sum(distance_matrix - dc, axis=1)
return local_density
def calculate_min_distance_to_high_density(local_density, distance_matrix):
"""
计算与高密度点的最小距离
参数:
local_density: 每个样本点的局部密度
distance_matrix: 距离矩阵
返回值:
min_distances: 每个样本点到高密度点的最小距离
"""
min_distances = np.zeros_like(local_density)
for i in range(len(local_density)):
higher_density_indices = np.where(local_density > local_density[i])[0]
if len(higher_density_indices) > 0:
min_distances[i] = np.min(distance_matrix[i, higher_density_indices])
else:
min_distances[i] = np.max(distance_matrix[i])
return min_distances
def calculate_decision_values(local_density, min_distances):
"""
计算决策值
参数:
local_density: 每个样本点的局部密度
min_distances: 每个样本点到高密度点的最小距离
返回值:
decision_values: 每个样本点的决策值
"""
decision_values = local_density * min_distances
return decision_values
def get_cluster_centers(data, decision_values):
"""
获取簇中心
参数:
data: 数据集
decision_values: 每个样本点的决策值
返回值:
cluster_centers: 簇中心
"""
# 计算决策值的均值
mean_decision_value = np.mean(decision_values)
# 标记决策值大于均值的样本为簇中心
cluster_centers = data[decision_values > mean_decision_value]
return cluster_centers
def ems(dataset, k):
# 深度拷贝距离矩阵依此计算出eps
DistMatrix = pairwise.euclidean_distances(dataset)
tmp_matrix = copy.deepcopy(DistMatrix)
for i in range(len(dataset)):
tmp_matrix[i].sort()
Dk = tmp_matrix[:, k - 1]
eps = np.mean(Dk)
# 确保 eps 大于 0
eps = max(eps, 0.001) # 设置一个最小的 eps 值,比如 0.001
# 下面这部分计算 min_samples 的逻辑也存在问题,我们需要重新设计它
tmp_count = 0
for i in range(len(DistMatrix)):
for j in range(len(DistMatrix[i])):
if DistMatrix[i][j] <= eps:
tmp_count = tmp_count + 1
min_samples = round(tmp_count / len(dataset))
clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(dataset)
num_clustering = max(clustering.labels_) + 1
return eps, min_samples, num_clustering
# 步骤 2: 计算局部密度
local_density = calculate_local_density(distance_matrix, dc)
# 步骤 3: 计算与高密度点的最小距离
min_distances = calculate_min_distance_to_high_density(local_density, distance_matrix)
# 步骤 4: 计算决策值
decision_values = calculate_decision_values(local_density, min_distances)
# 步骤 5: 将剩余上车位置数据点按其局部密度降序排列
sorted_indices = np.argsort(-local_density)
sorted_data = data[sorted_indices]
# 定义存储每个密度分区Di的数据点的列表
density_partitions = []
current_partition = []
# 遍历按局部密度降序排列的数据点,划分不同密度的数据集Di
for i in range(n):
# 如果当前数据点为第一个或者局部密度与前一个数据点不相同,则新建一个密度分区
if i == 0 or local_density[sorted_indices[i]] != local_density[sorted_indices[i - 1]]:
if current_partition:
density_partitions.append(np.array(current_partition))
current_partition = []
current_partition.append(sorted_data[i])
# 将最后一个密度分区加入列表
if current_partition:
density_partitions.append(np.array(current_partition))
# 步骤 6: 计算不同密度上车点密度数据集Di对应的参数Epsi和MinPtsi
E = []
M = []
for partition in density_partitions:
e, m, num_clustering = ems(partition, len(partition))
E.append(e)
M.append(m)
# 步骤 7-10: 对每个密度上车点密度数据集Di进行聚类
labels = np.zeros(n, dtype=int) - 1
current_label = 0
for i, partition in enumerate(density_partitions):
if labels[sorted_indices[i]] == -1:
clustering = DBSCAN(eps=E[i], min_samples=M[i]).fit(partition)
cluster_centers = get_cluster_centers(partition, decision_values)
for center in cluster_centers:
center_index = sorted_indices[np.argmin(np.linalg.norm(data - center, axis=1))]
if labels[center_index] == -1:
labels[center_index] = current_label
current_label += 1
queue = [center_index]
while queue:
current_index = queue.pop(0)
neighbors = np.where(distance_matrix[current_index] <= E[i])[0]
for neighbor in neighbors:
if labels[neighbor] == -1:
labels[neighbor] = current_label
queue.append(neighbor)
return labels
# 读取数据集
df = pd.read_csv('C:/Users/早高峰.csv', encoding='gbk')
data = df[['开始纬度', '开始经度']].values
# 进行密度分区的 DBSCAN 聚类
labels = density_based_clustering(data)
# 输出聚类结果标签
print(labels)