kaggle中的gpu使用问题

在 Kaggle 中选择了 GPU P100，但 GPU 使用率一直为 0，应该怎么处理？以下是我的代码（使用 TensorFlow），需要修改吗？


```python
#!/usr/bin/env python
# coding: utf-8
import sys
sys.path.append('/kaggle/input')
from item0322.itemdata.utils import *
from item0322.itemdata.encrypt import *
from item0322.itemdata.model import *
from item0322.itemdata.preprocess import *
from item0322.itemdata.expansion import *
from item0322.itemdata.generator import *
from item0322.itemdata.const import *
import numpy as np
import random
import os
from scipy.io import loadmat
import tensorflow as tf
import matplotlib.pyplot as plt  # 导入绘图库

# Detect the available GPUs; when at least one exists, restrict TensorFlow
# to the first one and enable on-demand memory growth for it.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("未检测到 GPU，将使用 CPU")
else:
    try:
        # Expose only the first GPU to TensorFlow.
        tf.config.set_visible_devices(gpus[0], 'GPU')
        # Enable memory growth on that GPU.
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print(f"使用 GPU: {gpus[0].name}")
    except RuntimeError as err:
        # The configuration calls above can raise RuntimeError; it is only
        # reported here, and the script continues.
        print(err)

# Location of the .mat file that the entry point loads 'M', 'Otraining'
# and 'Otest' from.
path_dataset = '/kaggle/input/item0322/itemdata/training_test_dataset.mat'

def _ranking_metrics(scores, labels, items, k):
    """Compute (HR, NDCG, Precision, Recall) at cut-off ``k`` for one user.

    Args:
        scores: 1-D array of predicted scores for the user's candidate items.
        labels: 1-D array of ground-truth labels; ``label > 0`` is treated
            as relevant.
        items: 1-D array of item ids aligned with ``scores`` and ``labels``.
        k: Length of the top-K list.

    Returns:
        Tuple ``(hr, ndcg, precision, recall)``.
    """
    # Rank candidates by descending score and keep the first k.
    order = np.argsort(scores)[::-1][:k]
    relevant_mask = labels > 0
    # NOTE(review): membership is tested on item ids, which assumes each item
    # appears at most once per user in the test data — TODO confirm.
    hits = np.isin(items[order], items[relevant_mask])
    n_hits = int(np.sum(hits))
    n_relevant = int(np.sum(relevant_mask))

    # Hit ratio: 1 when at least one relevant item made it into the top-k.
    # (Previously this was sum(hits) / k, i.e. identical to Precision.)
    hr = 1.0 if n_hits > 0 else 0.0

    # Binary-relevance DCG: gain is 1 for a hit, discounted by log2(rank + 1).
    discounts = 1.0 / np.log2(np.arange(2, len(hits) + 2))
    dcg = float(np.sum(hits * discounts))
    # The ideal ranking puts all relevant items first, so the ideal DCG only
    # covers as many positions as there are relevant items (capped by the
    # list length). Using every position would under-report NDCG for users
    # with fewer than k relevant items.
    ideal_len = min(n_relevant, len(hits))
    idcg = float(np.sum(discounts[:ideal_len]))
    ndcg = dcg / idcg if idcg > 0 else 0.0

    precision = n_hits / k if k > 0 else 0
    recall = n_hits / n_relevant if n_relevant > 0 else 0
    return hr, ndcg, precision, recall


def evaluate(model, testu, testi, testlabel, user_neighbor_emb, batch_size=4, k=10):
    """Evaluate the model: MSE plus per-user HR, NDCG, Precision and Recall.

    Predictions are collected batch by batch from ``generate_batch_data``,
    rescaled by ``LABEL_SCALE``, and then grouped by user id to compute the
    top-K ranking metrics, which are averaged over users.

    Args:
        model: Callable model; invoked as ``model(inputs, training=False)``.
        testu: Array of user ids, one per test sample.
        testi: Array of item ids, one per test sample.
        testlabel: Test labels, passed through to ``generate_batch_data``.
        user_neighbor_emb: User-neighbor embeddings, passed through to
            ``generate_batch_data``.
        batch_size: Batch size passed to ``generate_batch_data``.
        k: Length of the top-K recommendation list.

    Returns:
        Dict with the keys "MSE", "HR", "NDCG", "Precision" and "Recall".
    """
    all_predictions = []
    all_labels = []
    print("开始评估...")

    # NOTE(review): `usernei` is not a parameter of this function; it is read
    # from module scope, so it must be defined there before this is called
    # (the __main__ section of this script does that).
    with tf.device('/GPU:0'):  # request GPU placement for the forward passes
        for inputs, targets in generate_batch_data(batch_size, testu, testi, usernei, testlabel, user_neighbor_emb):
            print("  生成评估批次数据...")
            predictions = model(inputs, training=False)  # inference mode
            all_predictions.append(predictions.numpy())
            all_labels.append(targets)

    print("  连接预测结果和标签...")
    # NOTE(review): the per-user grouping below indexes these flat arrays with
    # positions taken from `testu`, which presumes the generator yields the
    # samples in the same order as testu/testi — TODO confirm.
    all_predictions = np.concatenate(all_predictions).flatten() * LABEL_SCALE
    all_labels = np.concatenate(all_labels).flatten() * LABEL_SCALE

    print("  计算 MSE...")
    mse = np.mean(np.square(all_predictions - all_labels))

    # Ranking metrics are computed per user and averaged afterwards.
    hr_list = []
    ndcg_list = []
    precision_list = []
    recall_list = []

    print("  开始计算 HR, NDCG, Precision, Recall...")
    for user_id in np.unique(testu):
        user_indices = np.where(testu == user_id)[0]
        hr, ndcg, precision, recall = _ranking_metrics(
            all_predictions[user_indices],
            all_labels[user_indices],
            testi[user_indices],
            k,
        )
        hr_list.append(hr)
        ndcg_list.append(ndcg)
        precision_list.append(precision)
        recall_list.append(recall)

    print("  计算指标的平均值...")
    metrics = {
        "MSE": mse,
        "HR": np.mean(hr_list),
        "NDCG": np.mean(ndcg_list),
        "Precision": np.mean(precision_list),
        "Recall": np.mean(recall_list),
    }
    print("评估完成。")
    return metrics

def plot_metrics(history):
    """Draw the loss curves and the validation-metric curves side by side.

    Args:
        history: Dict of per-epoch lists. The keys 'loss', 'val_loss', 'hr',
            'ndcg', 'precision' and 'recall' are read.
    """
    x_axis = range(1, len(history['loss']) + 1)

    def _finish_panel(title, y_label):
        # Shared decoration for both subplots.
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.figure(figsize=(12, 5))

    # Left panel: training vs. validation loss.
    plt.subplot(1, 2, 1)
    for key, legend in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
        plt.plot(x_axis, history[key], label=legend)
    _finish_panel('Training and Validation Loss', 'Loss')

    # Right panel: ranking metrics; history stores them under lower-case keys.
    plt.subplot(1, 2, 2)
    for legend in ('HR', 'NDCG', 'Precision', 'Recall'):
        plt.plot(x_axis, history[legend.lower()], label=legend)
    _finish_panel('Validation Metrics', 'Metric Value')

    plt.tight_layout()
    plt.show()

def train(model, userembedding_layer, trainu, traini, trainlabel, train_user_index, Otraining, testu, testi, testlabel, usernei):
    """Run the custom training loop with per-epoch validation and early stopping.

    For each epoch the function iterates over the batches produced by
    ``generate_batch_data_random``, applies one clipped gradient step per
    batch through ``model.optimizer``, then calls ``evaluate`` on the test
    split. Weights are saved to ``best_model_weights.h5`` whenever the
    validation NDCG improves, and the per-epoch curves are plotted at the end.

    Args:
        model: Model to train; must expose ``optimizer``,
            ``trainable_variables``, ``save_weights`` and ``load_weights``.
        userembedding_layer: Layer whose first weight array is used as the
            user embedding table for the graph-embedding expansion.
        trainu, traini, trainlabel, train_user_index: Training data, passed
            through to ``generate_batch_data_random``.
        Otraining: Training data passed to ``generate_history`` and
            ``graph_embedding_expansion``.
        testu, testi, testlabel: Test data, passed through to ``evaluate``.
        usernei: Passed to ``graph_embedding_expansion``.

    Returns:
        The ``user_neighbor_emb`` produced by ``graph_embedding_expansion``.
    """
    EPOCHS = 2
    checkpoint_path = "best_model_weights.h5"

    # NOTE(review): an ExponentialDecay schedule (initial rate LR, decay 0.96
    # every 1000 steps) used to be built here but was never attached to
    # model.optimizer, so it had no effect and was removed as dead code. If
    # learning-rate decay is wanted, pass the schedule to the optimizer where
    # the model is compiled.

    best_val_ndcg = 0.0
    # True once a checkpoint has been written during THIS run; guards the
    # restore on early stop against a missing or stale weights file.
    has_checkpoint = False
    epochs_no_improve = 0
    history = {'loss': [], 'val_loss': [], 'hr': [], 'ndcg': [], 'precision': [], 'recall': []}

    alluserembs = userembedding_layer.get_weights()[0]
    print("开始图嵌入扩展...")
    with tf.device('/GPU:0'):  # request GPU placement for the expansion
        user_neighbor_emb = graph_embedding_expansion(Otraining, usernei, alluserembs)
    print("图嵌入扩展完成。")

    for epoch in range(EPOCHS):
        batch_losses = []
        history_ = generate_history(Otraining)
        print(f"开始第 {epoch + 1} 轮训练...")

        with tf.device('/GPU:0'):  # request GPU placement for the training steps
            for inputs, targets in generate_batch_data_random(BATCH_SIZE, train_user_index, trainu, traini, history_, trainlabel, user_neighbor_emb):
                print("  生成训练批次数据...")
                with tf.GradientTape() as tape:
                    predictions = model(inputs, training=True)
                    print("    模型预测完成...")
                    loss = tf.keras.losses.MSE(targets, predictions)
                    print("    损失计算完成...")
                gradients = tape.gradient(loss, model.trainable_variables)
                print("    梯度计算完成...")
                gradients, _ = tf.clip_by_global_norm(gradients, clip_norm=5.0)
                print("    梯度裁剪完成...")
                model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
                print("    模型参数更新完成...")
                # Record one scalar per batch. Storing the raw loss tensor
                # would yield arrays whose length follows the batch size, and
                # np.mean cannot average a ragged list when the final batch
                # is smaller than the others.
                batch_losses.append(float(tf.reduce_mean(loss).numpy()))

        # Average training loss over the epoch's batches.
        avg_train_loss = np.mean(batch_losses)
        history['loss'].append(avg_train_loss)
        print(f"  第 {epoch + 1} 轮训练损失: {avg_train_loss:.4f}")

        # Validation on the test split.
        val_metrics = evaluate(model, testu, testi, testlabel, user_neighbor_emb, batch_size=BATCH_SIZE)
        history['val_loss'].append(val_metrics['MSE'])
        history['hr'].append(val_metrics['HR'])
        history['ndcg'].append(val_metrics['NDCG'])
        history['precision'].append(val_metrics['Precision'])
        history['recall'].append(val_metrics['Recall'])

        print(f"  第 {epoch + 1} 轮验证指标: "
              f"Val MSE: {val_metrics['MSE']:.4f}, HR: {val_metrics['HR']:.4f}, "
              f"NDCG: {val_metrics['NDCG']:.4f}, Precision: {val_metrics['Precision']:.4f}, Recall: {val_metrics['Recall']:.4f}")

        # Early stopping on validation NDCG.
        if val_metrics['NDCG'] > best_val_ndcg:
            best_val_ndcg = val_metrics['NDCG']
            epochs_no_improve = 0
            # Keep the best weights seen so far.
            model.save_weights(checkpoint_path)
            has_checkpoint = True
            print("    保存了新的最佳模型权重。")
        else:
            epochs_no_improve += 1
            # NOTE(review): `patience` is not defined in this function; it is
            # presumably provided by one of the star-imported project
            # modules — TODO confirm.
            if epochs_no_improve >= patience:
                print(f"Early stopping triggered after epoch {epoch+1}")
                # Restore only a checkpoint written by this run; if NDCG never
                # improved there is none, and the current weights are kept.
                if has_checkpoint:
                    model.load_weights(checkpoint_path)
                break

    plot_metrics(history)  # draw the curves once training has finished
    return user_neighbor_emb

if __name__ == "__main__":
    # Load the 'M', 'Otraining' and 'Otest' variables from the dataset file,
    # in that order.
    M, Otraining, Otest = (
        load_matlab_file(path_dataset, var_name)
        for var_name in ('M', 'Otraining', 'Otest')
    )

    # `usernei` has to stay a module-level name: evaluate() reads it from
    # module scope instead of receiving it as an argument.
    usernei = generate_history(Otraining)
    trainu, traini, trainlabel, train_user_index = generate_training_data(Otraining, M)
    testu, testi, testlabel = generate_test_data(Otest, M)

    generate_key()

    print("开始获取模型...")
    with tf.device('/GPU:0'):  # request GPU placement for model construction
        model, userembedding_layer, itemembedding_layer = get_model(Otraining)
    print("模型获取完成。")

    print("开始训练模型...")
    user_neighbor_emb = train(
        model=model,
        userembedding_layer=userembedding_layer,
        trainu=trainu,
        traini=traini,
        trainlabel=trainlabel,
        train_user_index=train_user_index,
        Otraining=Otraining,
        testu=testu,
        testi=testi,
        testlabel=testlabel,
        usernei=usernei,
    )
    print("模型训练完成。")
    # test(model, user_neighbor_emb)  # uncomment to run a test pass after training

```

写回答
好问题 0 提建议
关注问题
分享
邀请回答
编辑收藏删除结题
收藏举报

4条回答默认最新

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
道友老李 JWE233286一种基于机器视觉的水表指针读数识别及修正的方法专利发明者 2025-03-23 09:32
关注
让【道友老李】来帮你解答，本回答参考gpt编写，并整理提供，如果还有疑问可以点击头像关注私信或评论。
如果答案让您满意，请采纳、关注，非常感谢！
如果GPU使用率为0，可以尝试以下几种方法来处理：
检查代码中是否有错误导致无法正确使用GPU，比如可能没有正确设置GPU运行环境。
确保GPU驱动和TensorFlow版本兼容并且正确安装。
尝试在代码中设置使用GPU的设备，确保代码正确指定使用GPU。
检查GPU是否被其他进程占用，导致无法正常使用。
尝试在Kaggle平台上重新启动环境，也许可以解决GPU使用率为0的问题。以下是一个示例代码，可以尝试在代码中加入这部分来强制在GPU上运行评估：

```python
import tensorflow as tf
...
with tf.device('/GPU:0'):  # 强制在 GPU 上运行评估
    for inputs, targets in generate_batch_data(batch_size, testu, testi, usernei, testlabel, user_neighbor_emb):
        print("  生成评估批次数据...")
        predictions = model(inputs, training=False)  # 预测时要设置 training=False
        all_predictions.append(predictions.numpy())
        all_labels.append(targets)
```

通过这样的设置，可以尝试解决GPU使用率为0的问题。如果问题仍然存在，可能需要进一步排查其他原因。
解决无用
评论打赏
分享
举报

评论

按下Enter换行，Ctrl+Enter发表内容

报告相同问题？

关注问题

如何用Kaggle免费GPU
2025-07-17 22:40

waterHBO的博客事情是这样的，我最近对AI绘画和图像识别产生了浓厚的兴趣，想训练一个能区分“好”与“坏”画风的模型。然而，当我运行脚本的那一刻，现实给了我沉重一击：我的 12GB 显存瞬间被占满，命令行无情地抛出了。它不仅...
Kaggle Notebook GPU使用教程（保姆版详细教程）
2025-03-21 09:44

Guynl:)的博客 Kaggle平台在线GPU使用教程，详细版
深度学习——使用kaggle中的GPU资源
2023-10-15 14:45

星石传说的博客之前都是使用CPU来进行模型训练，对于一些...本章将介绍一下如何使用Kaggle中免费的GPU。（每周只有30h)网络上还有许多免费使用GPU的平台，Kaggle只是其一。既以为人己愈有，既以与人己愈多。 –2023-10-14 进阶篇。
Kaggle Notebook免费GPU资源使用技巧汇总
2025-12-30 02:31

魔法小药丸的博客无需本地配置，Kaggle Notebook提供预装PyTorch与CUDA的云端GPU环境，适合深度学习快速开发。掌握设备确认、Jupyter交互式训练、SSH连接与资源管理技巧，能高效利用其9小时算力限额。配合Kaggle Datasets和版本控制...
在kaggle中用GPU训练模型
2023-07-23 14:50

云梦之上的博客在kaggle中用其中notebook加上其自带的gpu来训练模型
树叶分类竞赛(Baseline)以及kaggle的GPU使用
2024-11-01 09:44

风走茶未凉的博客树叶分类竞赛(Baseline)-kaggle的GPU使用文章目录树叶分类竞赛(Baseline)-kaggle的GPU使用竞赛的步骤代码实现创建自定义dataset 定义data_loader 模型定义超参数训练模型预测和保存结果 kaggle使用竞赛的...
利用kaggle的GPU训练自己的模型（项目）
2023-05-25 16:31

干就完事！的博客不管你使用哪种方法，都建议在你的代码里面使用的文件路径最好用相对路径，因为你使用Add Data添加数据时，你的数据的路径其实是/kaggle/input/数据集的名字（就是上传数据时命名那个），如果使用绝对路径的话，只能...
薅羊毛tips：kaggle平台免费GPU使用教程
2023-02-07 17:41

深度之眼的博客一些常见的问题和实用tips
kaggle解决不出现GPU选项
2024-10-21 10:11

二月红08的博客打开kaggle,找到setting,将里面的Phone verification认证了，认证成功后，重新进入网页就出现GPU选项了。
【人工智能概论】使用kaggle提供的GPU训练神经网络
2023-03-06 20:24

小白的努力探索的博客使用kaggle提供的GPU训练神经网络
没有解决我的问题, 去提问

问题事件

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
创建了问题 3月23日

kaggle中的gpu使用问题

4条回答 默认 最新

问题事件

4条回答默认最新