KNN 电影推荐算法评估:准确率、召回率和 F1
```python
import pandas as pd
# Load only the columns of interest and pin their dtypes at read time
# through `usecols` / `dtype`.
movie_columns = ['movieId', 'title']
movie_dtypes = {'movieId': 'int32', 'title': 'str'}
movies_df = pd.read_csv(
    'D:/Datamovies/ml-latest-small/movies.csv',
    usecols=movie_columns,
    dtype=movie_dtypes,
)
movies_df.head()

rating_columns = ['userId', 'movieId', 'rating', 'timestamp']
rating_dtypes = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}
ratings_df = pd.read_csv(
    'D:/Datamovies/ml-latest-small/ratings.csv',
    usecols=rating_columns,
    dtype=rating_dtypes,
)
ratings_df.head()

# Number of missing values per column in each table.
movies_df.isnull().sum()
ratings_df.isnull().sum()

# Report the (rows, columns) shape of both tables.
print("Movies:", movies_df.shape)
print("Ratings:", ratings_df.shape)

# Join every rating to its movie on the shared 'movieId' column.
movies_merged_df = pd.merge(movies_df, ratings_df, on='movieId')
movies_merged_df.head()
# Derived features: group the user ratings by movie title to build the
# 'Average Rating' and 'Rating Count' columns.
# NOTE(review): grouping by 'title' pools any distinct movieIds that share a
# title -- confirm titles are unique in this data set.
ratings_by_title = movies_merged_df.groupby('title')['rating']

# Mean rating per title, best-rated first.
mean_per_title = ratings_by_title.mean().sort_values(ascending=False)
movies_average_rating = mean_per_title.reset_index().rename(
    columns={'rating': 'Average Rating'})
movies_average_rating.head()

# Number of ratings per title, least-rated first.
count_per_title = ratings_by_title.count().sort_values(ascending=True)
movies_rating_count = count_per_title.reset_index().rename(
    columns={'rating': 'Rating Count'})

# One row per title carrying both derived features.
movies_rating_count_avg = pd.merge(
    movies_rating_count, movies_average_rating, on='title')
movies_rating_count_avg.head()
# 导入可视化库
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(font_scale = 1)
plt.rcParams["axes.grid"] = False
plt.style.use('dark_background')
%matplotlib inline
# 绘制图形
plt.figure(figsize=(12,4))
plt.hist(movies_rating_count_avg['Rating Count'],bins=80,color='tab:purple')
plt.ylabel('Ratings Count(Scaled)',fontsize=16)
plt.savefig('D:/Datamovies/ml-latest-small/ratingcounthist.jpg')
plt.figure(figsize=(12,4))
plt.hist(movies_rating_count_avg['Average Rating'],bins=80,color='tab:purple')
plt.ylabel('Average Rating',fontsize=16)
plt.savefig('D:/Datamovies/ml-latest-small/avgratinghist.jpg')
plot=sns.jointplot(x='Average Rating',
y='Rating Count',
data=movies_rating_count_avg,
alpha=0.5,
color='tab:pink')
plot.savefig('D:/Datamovies/ml-latest-small/joinplot.jpg')
def _three_decimals(value):
    """Format a float with three digits after the decimal point."""
    return '%.3f' % value


# Make pandas display every float with three decimals.
pd.set_option('display.float_format', _three_decimals)

# Attach the per-title 'Rating Count' / 'Average Rating' columns to every
# rating row by merging on 'title'.
rating_with_RatingCount = pd.merge(
    movies_merged_df, movies_rating_count_avg, on='title')
print(rating_with_RatingCount['Rating Count'].describe())

# Keep only the rows whose movie has at least `popularity_threshold` ratings.
popularity_threshold = 50
is_popular = rating_with_RatingCount['Rating Count'] >= popularity_threshold
popular_movies = rating_with_RatingCount[is_popular]
popular_movies.head()
print(popular_movies['Rating Count'].describe())
import os
from scipy.sparse import csr_matrix
# NearestNeighbors looks up the K nearest neighbours of a point and returns
# their indices together with the distances to them.
from sklearn.neighbors import NearestNeighbors

# Title x user rating table; combinations with no rating are filled with 0.
ratings_by_title_and_user = popular_movies.pivot_table(
    index='title', columns='userId', values='rating')
movie_features_df = ratings_by_title_and_user.fillna(0)
movie_features_df.to_excel('D:/Datamovies/ml-latest-small/output.xlsx')
movie_features_df.head()

# Compressed sparse row form of the rating table, used to fit the model.
movie_features_df_matrix = csr_matrix(movie_features_df.values)

# Brute-force nearest-neighbour search under cosine distance.
model_knn = NearestNeighbors(
    metric='cosine',
    algorithm='brute',
).fit(movie_features_df_matrix)
import numpy as np

# Number of similar movies to list for the query movie.
n_recommendations = 5

# Pick a random row (movie) of the title x user table as the query.
query_index = np.random.choice(movie_features_df.shape[0])
query_vector = movie_features_df.iloc[query_index, :].values.reshape(1, -1)

# Ask for one extra neighbour: the query movie is part of the fitted data, so
# it is normally returned as its own nearest neighbour.
distances, indices = model_knn.kneighbors(
    query_vector, n_neighbors=n_recommendations + 1)

# Drop the query movie by index instead of assuming it sits at position 0:
# another movie that ties with it at cosine distance 0 may be ordered first,
# in which case skipping position 0 would hide a real neighbour and list the
# query movie as its own recommendation.
neighbors = [
    (movie_idx, distance)
    for movie_idx, distance in zip(indices.ravel(), distances.ravel())
    if movie_idx != query_index
][:n_recommendations]

print('Recommendations for {0}:\n'
      .format(movie_features_df.index[query_index]))
for rank, (movie_idx, distance) in enumerate(neighbors, start=1):
    print('{0}: {1}, with distance of {2}:'
          .format(rank, movie_features_df.index[movie_idx], distance))
```