基于统计分析的电影推荐算法数据形状的问题,ValueError: Shape of passed values is (1, 1), indices imply (1, 1614)
电影推荐算法代码
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
# 加载movielens 100K数据集
def load_movielens_data(path='D:/Datamovies/ml-100k/u1.base', test_size=0.2):
# 读取用户和电影的元数据
users_df = pd.read_csv(path, sep='\t', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])
# 分割数据集为训练集和测试集
train, test = train_test_split(users_df, test_size=test_size, random_state=42)
# 计算电影之间的相似度
movie_similarity = cosine_similarity(train[['rating']].values.T)
# 将相似度矩阵转化为DataFrame
movie_similarity_df = pd.DataFrame(movie_similarity, columns=train['movie_id'].unique())
return train, test, movie_similarity_df
# 推荐电影给用户
def recommend_movies(movie_similarity_df, user_id, num_recommendations=10):
# 为当前用户找到最相似的用户
similar_users = movie_similarity_df.apply(lambda x: x.corr(movie_similarity_df.iloc[user_id]))
# 找到这些相似用户评分最高的电影
recommended_movies = similar_users.nlargest(num_recommendations).index
return recommended_movies
# 评估推荐系统的性能
def evaluate_performance(train, test, movie_similarity_df):
# 标准化评分
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train[['rating']])
test_scaled = scaler.transform(test[['rating']])
# 训练推荐模型
model = NearestNeighbors(n_neighbors=10)
model.fit(train_scaled)
# 预测测试集
predictions = model.kneighbors(test_scaled, return_distance=False)
# 计算均方根误差
rmse = mean_squared_error(test['rating'], predictions, squared=False)
# 计算准确率和召回率
# 这里我们假设我们推荐的列表中只有1个电影是被评分过的,这是简化的情况
num_test_users = len(test)
accurate_recalls = [1 if len(set(predictions[i]).intersection(set(train['movie_id'][train['user_id'] == test.iloc[i]['user_id']]))) > 0 else 0 for i in range(num_test_users)]
total_recalls = [1 if len(set(predictions[i]).intersection(set(train['movie_id'][train['user_id'] == test.iloc[i]['user_id']]))) > 0 else 0 for i in range(num_test_users)]
accuracy = sum(accurate_recalls) / num_test_users
recall = sum(total_recalls) / num_test_users
return rmse,accuracy,recall
# 执行推荐系统
def run_recommender_system(path='D:/Datamovies/ml-100k/u1.base', test_size=0.2):
train, test, movie_similarity_df = load_movielens_data(path, test_size)
# 输出电影推荐结果
print(recommend_movies(movie_similarity_df, user_id=1))
# 评估推荐系统性能
print(evaluate_performance(train, test, movie_similarity_df))
run_recommender_system()
错误代码
ValueError Traceback (most recent call last)
~\AppData\Roaming\Python\Python36\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
1670 blocks = [
-> 1671 make_block(values=blocks[0], placement=slice(0, len(axes[0])))
1672 ]
~\AppData\Roaming\Python\Python36\site-packages\pandas\core\internals\blocks.py in make_block(values, placement, klass, ndim, dtype)
2743
-> 2744 return klass(values, ndim=ndim, placement=placement)
2745
~\AppData\Roaming\Python\Python36\site-packages\pandas\core\internals\blocks.py in __init__(self, values, placement, ndim)
130 raise ValueError(
--> 131 f"Wrong number of items passed {len(self.values)}, "
132 f"placement implies {len(self.mgr_locs)}"
ValueError: Wrong number of items passed 1, placement implies 1614
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-9-a454d8d78675> in <module>()
66 # 评估推荐系统性能
67 print(evaluate_performance(train, test, movie_similarity_df))
---> 68 run_recommender_system()
69
70
<ipython-input-9-a454d8d78675> in run_recommender_system(path, test_size)
59 # 执行推荐系统
60 def run_recommender_system(path='D:/Datamovies/ml-100k/u1.base', test_size=0.2):
---> 61 train, test, movie_similarity_df = load_movielens_data(path, test_size)
62
63 # 输出电影推荐结果
<ipython-input-9-a454d8d78675> in load_movielens_data(path, test_size)
19
20 # 将相似度矩阵转化为DataFrame
---> 21 movie_similarity_df = pd.DataFrame(movie_similarity, columns=train['movie_id'].unique())
22
23 return train, test, movie_similarity_df
~\AppData\Roaming\Python\Python36\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
495 mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
496 else:
--> 497 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
498
499 # For data is list-like, or Iterable (will consume into list)
~\AppData\Roaming\Python\Python36\site-packages\pandas\core\internals\construction.py in init_ndarray(values, index, columns, dtype, copy)
232 block_values = [values]
233
--> 234 return create_block_manager_from_blocks(block_values, [columns, index])
235
236
~\AppData\Roaming\Python\Python36\site-packages\pandas\core\internals\managers.py in create_block_manager_from_blocks(blocks, axes)
1679 blocks = [getattr(b, "values", b) for b in blocks]
1680 tot_items = sum(b.shape[0] for b in blocks)
-> 1681 raise construction_error(tot_items, blocks[0].shape[1:], axes, e)
1682
1683
ValueError: Shape of passed values is (1, 1), indices imply (1, 1614)