使用python读取本地csv文件,筛选大类为“旅游景点”的POI数据,进行DBSCAN聚类算法进行聚类,最后将聚类结果绘制到pyecharts的地图map上,但是最终只生成了map.html,聚类结果并没有显示到地图上。
代码如下:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn import metrics
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Map
UNCLASSIFIED = 0
NOISE = -1
# 计算数据点两两之间的距离
def getDistanceMatrix(datas):
N, D = np.shape(datas)
dists = np.zeros([N, N])
for i in range(N):
for j in range(N):
vi = datas[i, :]
vj = datas[j, :]
dists[i, j] = np.sqrt(np.dot((vi - vj), (vi - vj)))
return dists
# 寻找以点cluster_id为中心,eps为半径的圆内的所有点的id
def find_points_in_eps(point_id, eps, dists):
index = (dists[point_id] <= eps)
return np.where(index == True)[0].tolist()
# 聚类扩展
# dists:所有数据两两之间的距离 N x N
# labs: 所有数据的标签 labs N,
# cluster_id:一个簇的标号
# eps:密度评估半径
# seeds:用来进行簇扩展的点
# min_points:半径内最少的点数
def expand_cluster(dists, labs, cluster_id, seeds, eps, min_points):
i = 0
while i < len(seeds):
# 获取一个临近点
Pn = seeds[i]
# 如果该点被标记为NOISE 则重新标记
if labs[Pn] == NOISE:
labs[Pn] = cluster_id
# 如果该点没有被标记过
elif labs[Pn] == UNCLASSIFIED:
# 进行标记,并计算它的临近点 new_seeds
labs[Pn] = cluster_id
new_seeds = find_points_in_eps(Pn, eps, dists)
# 如果new_seeds足够长则把它加入到seed队列中
if len(new_seeds) >= min_points:
seeds = seeds + new_seeds
i = i + 1
def dbscan(datas, eps, min_points):
# 计算所有点之间的距离
dists = getDistanceMatrix(datas)
# 将所有点的标签初始化为UNCLASSIFIED
n_points = datas.shape[0]
labs = [UNCLASSIFIED] * n_points
cluster_id = 0
# 遍历所有点
for point_id in range(0, n_points):
# 如果当前点已经处理过了
if not (labs[point_id] == UNCLASSIFIED):
continue
# 没有处理过则计算临近点
seeds = find_points_in_eps(point_id, eps, dists)
# 如果临近点数量过少则标记为NOISE
if len(seeds) < min_points:
labs[point_id] = NOISE
else:
# 否则就开启一轮簇的扩张
cluster_id = cluster_id + 1
# 标记当前点
labs[point_id] = cluster_id
expand_cluster(dists, labs, cluster_id, seeds, eps, min_points)
return labs, cluster_id
# 绘制散点图
def draw_cluster(datas, labs, n_cluster):
plt.cla()
colors = [plt.cm.Spectral(each)
for each in np.linspace(0, 1, n_cluster)]
for i, lab in enumerate(labs):
if lab == NOISE:
plt.scatter(datas[i, 0], datas[i, 1], s=16., color=(0, 0, 0))
else:
plt.scatter(datas[i, 0], datas[i, 1], s=16., color=colors[lab - 1])
plt.show()
if __name__ == "__main__":
# 从CSV文件中读取数据
data = pd.read_csv("C:/Users/86136/Desktop/大数据/昆明市POI数据.csv") # 替换成你的CSV文件路径
filtered_data = data[data['大类'] == '旅游景点']
selected_columns = filtered_data[['经度', '纬度']].values
# 数据正则化
scaler = StandardScaler()
selected_columns = scaler.fit_transform(selected_columns)
# 设置DBSCAN参数
eps = 0.1
min_points = 8
# 执行DBSCAN聚类
#labels, n_clusters = dbscan(selected_columns, eps, min_points)
# 绘制散点图
#draw_cluster(selected_columns, labels, n_clusters)
# 构造地图数据
map_data = [(str(selected_columns[i][0]), str(selected_columns[i][1])) for i in range(len(selected_columns))]
# 绘制地图
map_chart = (
Map()
.add("聚类结果", selected_columns, "昆明")
.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
.set_global_opts(
title_opts=opts.TitleOpts(title="DBSCAN聚类结果地图"),
visualmap_opts=opts.VisualMapOpts(),
)
)
# 保存地图为HTML文件
map_chart.render("dbscan_map.html")
csv文件如下:
[](链接: https://pan.baidu.com/s/1bHt2ajRLpJv3bxEPKR_4CA 提取码: yfzu 复制这段内容后打开百度网盘手机App,操作更方便哦)