在 Kaggle 上选择了 GPU P100,但是 GPU 使用率一直为 0,应该怎么处理?以下是我的代码(使用 TensorFlow),需要修改吗?
```python
#!/usr/bin/env python
# coding: utf-8
import sys
sys.path.append('/kaggle/input')
from item0322.itemdata.utils import *
from item0322.itemdata.encrypt import *
from item0322.itemdata.model import *
from item0322.itemdata.preprocess import *
from item0322.itemdata.expansion import *
from item0322.itemdata.generator import *
from item0322.itemdata.const import *
import numpy as np
import random
import os
from scipy.io import loadmat
import tensorflow as tf
import matplotlib.pyplot as plt # 导入绘图库
# Detect the GPUs TensorFlow can see and restrict the process to the first one.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("未检测到 GPU,将使用 CPU")
else:
    primary_gpu = gpus[0]
    try:
        # Expose only the first GPU to TensorFlow.
        tf.config.set_visible_devices(primary_gpu, 'GPU')
        # Enable memory growth on that GPU.
        tf.config.experimental.set_memory_growth(primary_gpu, True)
        print(f"使用 GPU: {gpus[0].name}")
    except RuntimeError as e:
        # Configuration failed; report the error and continue.
        print(e)

# Location of the .mat file that holds the training/test matrices.
path_dataset = '/kaggle/input/item0322/itemdata/training_test_dataset.mat'
def evaluate(model, testu, testi, testlabel, user_neighbor_emb, batch_size=4, k=10):
    """Evaluate the model and return MSE plus per-user top-k ranking metrics.

    The model is run over every batch yielded by ``generate_batch_data``;
    predictions and labels are flattened and multiplied by ``LABEL_SCALE``
    before the metrics are computed. Ranking metrics are computed per user
    (grouping by the values in ``testu``) and then averaged over users.

    NOTE: the batch generator is given the module-level ``usernei``; it is
    not a parameter of this function, so that global must be defined before
    this function is called.

    Args:
        model: Callable invoked as ``model(inputs, training=False)``; the
            result must expose ``.numpy()``.
        testu: Per-sample user ids of the test set.
        testi: Per-sample item ids of the test set.
        testlabel: Test labels, forwarded to the batch generator.
        user_neighbor_emb: Neighbor embeddings, forwarded to the batch
            generator.
        batch_size: Batch size forwarded to the batch generator.
        k: Length of the top-k recommendation list.

    Returns:
        dict with the keys ``"MSE"``, ``"HR"``, ``"NDCG"``, ``"Precision"``
        and ``"Recall"``.
    """
    all_predictions = []
    all_labels = []
    print("开始评估...")
    with tf.device('/GPU:0'):  # request GPU placement for the forward passes
        for inputs, targets in generate_batch_data(batch_size, testu, testi, usernei, testlabel, user_neighbor_emb):
            print(" 生成评估批次数据...")
            # training=False so the model is called in inference mode.
            predictions = model(inputs, training=False)
            all_predictions.append(predictions.numpy())
            all_labels.append(targets)

    print(" 连接预测结果和标签...")
    # Assumes the generator yields samples in the same order as testu/testi,
    # because the flattened arrays are indexed by position below -- TODO confirm.
    all_predictions = np.concatenate(all_predictions).flatten() * LABEL_SCALE
    all_labels = np.concatenate(all_labels).flatten() * LABEL_SCALE

    print(" 计算 MSE...")
    mse = np.mean(np.square(all_predictions - all_labels))

    # Ranking metrics are computed user by user.
    hr_list = []
    ndcg_list = []
    precision_list = []
    recall_list = []
    print(" 开始计算 HR, NDCG, Precision, Recall...")
    for user_id in np.unique(testu):
        user_indices = np.where(testu == user_id)[0]
        user_predictions = all_predictions[user_indices]
        user_labels = all_labels[user_indices]
        user_items = testi[user_indices]

        # A label greater than zero is treated as "relevant".
        relevant_mask = user_labels > 0
        relevant_items = np.sum(relevant_mask)

        # Top-k items by descending predicted score.
        top_k_indices = np.argsort(user_predictions)[::-1][:k]
        top_k_items = user_items[top_k_indices]
        hits = np.isin(top_k_items, user_items[relevant_mask])
        true_positives = np.sum(hits)

        # NOTE(review): HR is computed as hits / k, i.e. the same value as
        # Precision below; confirm this is the intended definition.
        hr = true_positives / k if k > 0 else 0
        hr_list.append(hr)

        # NDCG with binary gains. The ideal ranking can place at most
        # min(#relevant, list length) relevant items at the top, so IDCG is
        # capped accordingly; otherwise a perfect ranking for a user with
        # fewer than k relevant items could never score 1.
        discounts = np.log2(np.arange(2, len(hits) + 2))
        dcg = np.sum(hits / discounts)
        ideal_hits = min(int(relevant_items), len(hits))
        idcg = np.sum(1.0 / discounts[:ideal_hits])
        ndcg = dcg / idcg if idcg > 0 else 0
        ndcg_list.append(ndcg)

        precision = true_positives / k if k > 0 else 0
        precision_list.append(precision)

        recall = true_positives / relevant_items if relevant_items > 0 else 0
        recall_list.append(recall)

    print(" 计算指标的平均值...")
    metrics = {
        "MSE": mse,
        "HR": np.mean(hr_list),
        "NDCG": np.mean(ndcg_list),
        "Precision": np.mean(precision_list),
        "Recall": np.mean(recall_list),
    }
    print("评估完成。")
    return metrics
def plot_metrics(history):
    """Draw the loss curves and the validation metric curves side by side.

    Args:
        history: dict of per-epoch lists. Reads the keys ``'loss'``,
            ``'val_loss'``, ``'hr'``, ``'ndcg'``, ``'precision'`` and
            ``'recall'``.
    """
    def _decorate(title, y_label):
        # Shared axis cosmetics for both panels.
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(history['loss']) + 1)
    plt.figure(figsize=(12, 5))

    # Left panel: training vs. validation loss.
    plt.subplot(1, 2, 1)
    for key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
        plt.plot(epochs, history[key], label=curve_label)
    _decorate('Training and Validation Loss', 'Loss')

    # Right panel: ranking metrics; history stores them under lower-case keys.
    plt.subplot(1, 2, 2)
    for curve_label in ('HR', 'NDCG', 'Precision', 'Recall'):
        plt.plot(epochs, history[curve_label.lower()], label=curve_label)
    _decorate('Validation Metrics', 'Metric Value')

    plt.tight_layout()
    plt.show()
def train(model, userembedding_layer, trainu, traini, trainlabel, train_user_index, Otraining, testu, testi, testlabel, usernei):
    """Train the model with a custom loop, evaluating after every epoch.

    Each epoch iterates over ``generate_batch_data_random``, minimises the
    MSE between targets and predictions with gradients clipped to a global
    norm of 5.0, then calls ``evaluate`` on the test data. The weights with
    the best validation NDCG are written to disk, and training stops early
    once NDCG has not improved for ``patience`` consecutive epochs
    (``patience`` is not defined in this function; presumably it comes from
    one of the star imports -- verify). After the loop the collected history
    is plotted with ``plot_metrics``.

    Args:
        model: Model to train; must expose ``optimizer``,
            ``trainable_variables``, ``save_weights`` and ``load_weights``.
        userembedding_layer: Layer whose first weight tensor is used as the
            user embedding matrix for the graph embedding expansion.
        trainu, traini, trainlabel, train_user_index: Training data,
            forwarded to the batch generator.
        Otraining: Training matrix, used to build the interaction history
            and the neighbor embeddings.
        testu, testi, testlabel: Test data, forwarded to ``evaluate``.
        usernei: Forwarded to ``graph_embedding_expansion``.

    Returns:
        The ``user_neighbor_emb`` value computed before the first epoch.
    """
    EPOCHS = 2
    # Keras 3 only accepts weight files ending in ".weights.h5"; the name
    # still ends in ".h5", so older Keras versions write HDF5 as before.
    best_weights_path = "best_model.weights.h5"

    # NOTE(review): this decay schedule is built but never attached to
    # ``model.optimizer``, so it has no effect on the updates below.
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=float(LR),
        decay_steps=1000,
        decay_rate=0.96,
        staircase=True)

    # Start from -inf so the first epoch always writes a checkpoint, even
    # when its NDCG is 0; otherwise the early-stopping reload could target a
    # file that was never saved.
    best_val_ndcg = float("-inf")
    epochs_no_improve = 0
    history = {'loss': [], 'val_loss': [], 'hr': [], 'ndcg': [], 'precision': [], 'recall': []}

    alluserembs = userembedding_layer.get_weights()[0]
    print("开始图嵌入扩展...")
    with tf.device('/GPU:0'):
        # Computed once from the current user embeddings and reused for
        # every epoch and for evaluation.
        user_neighbor_emb = graph_embedding_expansion(Otraining, usernei, alluserembs)
    print("图嵌入扩展完成。")

    for epoch in range(EPOCHS):
        batch_losses = []
        history_ = generate_history(Otraining)
        print(f"开始第 {epoch + 1} 轮训练...")
        with tf.device('/GPU:0'):
            # NOTE: the progress prints below run on every batch and add
            # Python/stdout overhead to each training step.
            for inputs, targets in generate_batch_data_random(BATCH_SIZE, train_user_index, trainu, traini, history_, trainlabel, user_neighbor_emb):
                print(" 生成训练批次数据...")
                with tf.GradientTape() as tape:
                    predictions = model(inputs, training=True)
                    print(" 模型预测完成...")
                    loss = tf.keras.losses.MSE(targets, predictions)
                    print(" 损失计算完成...")
                gradients = tape.gradient(loss, model.trainable_variables)
                print(" 梯度计算完成...")
                gradients, _ = tf.clip_by_global_norm(gradients, clip_norm=5.0)
                print(" 梯度裁剪完成...")
                model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
                print(" 模型参数更新完成...")
                # The MSE helper returns one value per sample; flatten so
                # batches of different sizes can be averaged together.
                batch_losses.append(np.ravel(loss.numpy()))

        # Mean loss over every sample seen this epoch (NaN if no batch ran).
        if batch_losses:
            avg_train_loss = float(np.mean(np.concatenate(batch_losses)))
        else:
            avg_train_loss = float("nan")
        history['loss'].append(avg_train_loss)
        print(f" 第 {epoch + 1} 轮训练损失: {avg_train_loss:.4f}")

        # Validation pass on the test data.
        val_metrics = evaluate(model, testu, testi, testlabel, user_neighbor_emb, batch_size=BATCH_SIZE)
        history['val_loss'].append(val_metrics['MSE'])
        history['hr'].append(val_metrics['HR'])
        history['ndcg'].append(val_metrics['NDCG'])
        history['precision'].append(val_metrics['Precision'])
        history['recall'].append(val_metrics['Recall'])
        print(f" 第 {epoch + 1} 轮验证指标: "
              f"Val MSE: {val_metrics['MSE']:.4f}, HR: {val_metrics['HR']:.4f}, "
              f"NDCG: {val_metrics['NDCG']:.4f}, Precision: {val_metrics['Precision']:.4f}, Recall: {val_metrics['Recall']:.4f}")

        # Early stopping on validation NDCG.
        if val_metrics['NDCG'] > best_val_ndcg:
            best_val_ndcg = val_metrics['NDCG']
            epochs_no_improve = 0
            model.save_weights(best_weights_path)
            print(" 保存了新的最佳模型权重。")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Early stopping triggered after epoch {epoch+1}")
                # Roll back to the best checkpoint before leaving the loop.
                model.load_weights(best_weights_path)
                break

    plot_metrics(history)
    return user_neighbor_emb
if __name__ == "__main__":
    # Everything here is kept at module level on purpose: evaluate() reads
    # the global `usernei` that is defined below.
    M, Otraining, Otest = (
        load_matlab_file(path_dataset, key) for key in ('M', 'Otraining', 'Otest')
    )

    # Interaction history built from the training matrix.
    usernei = generate_history(Otraining)

    # Training and test samples.
    trainu, traini, trainlabel, train_user_index = generate_training_data(Otraining, M)
    testu, testi, testlabel = generate_test_data(Otest, M)

    generate_key()

    print("开始获取模型...")
    with tf.device('/GPU:0'):  # request GPU placement while building the model
        model, userembedding_layer, itemembedding_layer = get_model(Otraining)
    print("模型获取完成。")

    print("开始训练模型...")
    user_neighbor_emb = train(
        model, userembedding_layer,
        trainu, traini, trainlabel, train_user_index,
        Otraining,
        testu, testi, testlabel,
        usernei,
    )
    print("模型训练完成。")
    # test(model, user_neighbor_emb)  # uncomment to run the test step after training
```