First of all, here is my yunfuzai_main.py:
from dqn_agent import Agent
from model import QNetwork
import matplotlib.pyplot as plt
import numpy as np
import torch
import xlrd
from openpyxl import load_workbook
import gc
STATE_SIZE = 10
EPISODE_COUNT = 1000
# def dqn(n_episodes=EPISODE_COUNT,eps_start=2.0,eps_end=0.03,eps_decay=0.990):
# scores = []
# for i_episode in range(1, n_episodes + 1):
# print("Episode" + str(i_episode))
# state = getState(stockData, 0, STATE_SIZE + 1)
# total_profit = 0
# agent.inventory = []
# eps = eps_start
#
# for t in range(l):
# action = agent.act(state, eps)
# next_state = getState(stockData, t + 1, STATE_SIZE + 1)
# reward = 0
#
# if action == 1: # buy
# agent.inventory.append(stockData[t])
# # print("buy" + str(stockData[t]))
# elif action == 2 and len(agent.inventory) > 0: # sell
# bought_price = agent.inventory.pop(0)
# total_profit += stockData[t] - bought_price
# # reward = max(stockData[t] - bought_price, 0)
# reward = stockData[t] - bought_price
# # print("Sell: " + str(stockData[t]) + " | Profit: " + str(stockData[t] - bought_price))
# done = 1 if t == l - 1 else 0
# agent.step(state, action, reward, next_state, done)
# eps = max(eps_end, eps * eps_decay)
# state = next_state
#
# # if done:
# # print("------------------------------")
# # print("total_profit = " + str(total_profit))
# # print("------------------------------")
# scores.append(total_profit)
# return scores
def dqn1(n_episodes1=EPISODE_COUNT, eps_start1=1.0, eps_end1=0.01, eps_decay1=0.9995):
scores1 = []
for i_episode in range(1,n_episodes1+1):
print("Episode" + str(i_episode))
state = getState(stockData, 0, STATE_SIZE + 1)
agent.inventory = []
eps = eps_start1
#global loss
print(1)
for t in range(l):
print(2)
action = agent.act(state,eps)
print(1)
next_state = getState(stockData, t + 1, STATE_SIZE + 1)
reward = 0
done = 1 if t == l - 1 else 0
global loss
loss = agent.step(state, action, reward, next_state, done)
print("loss", + str(loss))
scores1.append(loss)
gc.collect()
gc.collect()
return scores1
# if action == 1:  # overload
# agent.inventory.append(stockData[t])
# elif action == 2 and len(agent.inventory) > 0:
def getState(data, t, n):
d = t - n + 1
# block = data[d:t + 1] if d>= 0 else -d * [data[0]]+ data[0:t+1]
block = data[d:t + 1]
#res = [0 for x in range(0, n)]
#res = []
buffer = []
for i in range(len(block) - 1):
print("res=",buffer[i])
buffer.append(block[i + 1]-block[i])
#print("res=",res[i])
return np.array([buffer])
#return np.array([res])
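# (For reference: getState is meant to return a 1 x STATE_SIZE array of the successive
#  differences of the last STATE_SIZE + 1 data points, e.g. for a block of [1.0, 2.0, 4.0]
#  the differences would be [1.0, 2.0].)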
if __name__== '__main__':
print(1)
#stockData = []
#stockData = []
stockData = [None]*801
datas1 =xlrd.open_workbook(r'C:\Users\86138\Desktop\zi_ding_yi.xlsx',{'constant_memory':True})
#datas1 = xlrd.open_workbook(r'C:\Users\86138\Desktop\zi_ding_yi.xlsx')
print(2)
table = datas1.sheets()[0]
row_num = 0
print(3)
# for item in table[1:]:
# stockData.append(float(table.col_values(6)))
col = table.col_values(5)
while row_num <= 800 :
# stockData[row_num] = table.col_values(5,0,row_num)
#stockData[row_num] = table.cell_value(row_num,5)\
stockData.append(table.cell_value(row_num,5))
row_num += 1
# stockData{row_num} = table.col_values(5,0,row_num)
# row_num =+1
agent = Agent(state_size=STATE_SIZE, action_size=3)
print(4)
l = len(stockData) - 1
# scores = dqn()
scores1 =dqn1()
The error it reports is:
C:\Users\86138\anaconda3\python.exe C:/Users/86138/Desktop/stockPrediction-master/yunfuzai_main.py
1
2
3
4
Episode1
1
2
Traceback (most recent call last):
File "C:/Users/86138/Desktop/stockPrediction-master/yunfuzai_main.py", line 123, in <module>
scores1 =dqn1()
File "C:/Users/86138/Desktop/stockPrediction-master/yunfuzai_main.py", line 61, in dqn1
action = agent.act(state,eps)
File "C:\Users\86138\Desktop\stockPrediction-master\dqn_agent.py", line 136, in act
state = torch.tensor(state).float.unsqueeze(0).to(device)
RuntimeError: Could not infer dtype of NoneType
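Since the error message says a NoneType reached torch.tensor, and my stockData list starts out pre-filled with None before I append the Excel values, I suspect the state I pass to agent.act might still contain None, but I'm not sure. This is just a temporary debug check I was planning to add inside dqn1(), right before agent.act(state, eps) (stockData and getState are the names from my script above):

# temporary debug check before calling agent.act(state, eps)
print("state dtype:", np.array(state).dtype)                       # object dtype would mean None / mixed values
print("any None in stockData:", any(v is None for v in stockData))
print("state:", state)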
And here is my dqn_agent.py:
import numpy as np
import random
from collections import namedtuple, deque
from model import QNetwork
import torch
import torch.nn.functional as F
import torch.optim as optim
# hyperparameters
BUFFER_SIZE = int(1e5)
# replay buffer size
BATCH_SIZE = 64
# minibatch size
GAMMA = 0.99
# discount factor
TAU = 1e-3
# for soft update of the target network parameters
LR = 5e-4
# learning rate
UPDATE_EVERY = 4
# how often to update the network
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# check whether the machine has a CUDA device available, otherwise fall back to the CPU
class ReplayBuffer:
def __init__(self, action_size, buffer_size, batch_size):  # initialize the replay memory
self.action_size = action_size
self.memory = deque(maxlen=buffer_size)
# deque(maxlen=N) creates a fixed-size queue: once it is full, adding a new element automatically drops the oldest one
self.batch_size = batch_size
# the batch size is the number of samples used for one loss/gradient computation
self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
# The experience pool records every transition (episode, step) of the run so it can be stored and reused.
# During training, a random batch of transitions is sampled from the pool to optimize the Q-network, while overly old records are dropped as new ones arrive.
# First, a namedtuple called Experience is defined; besides the (s, a, r, s') from the algorithm it also holds the terminal flag 'done'.
def add(self, state, action, reward, next_state, done):  # add one transition to the memory
e = self.experience(state, action, reward, next_state, done)
self.memory.append(e)
def sample(self):  # randomly sample a minibatch
experiences = random.sample(self.memory, k=self.batch_size)
states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None])).float().to(device)
return (states, actions, rewards, next_states, dones)
def __len__(self):
return len(self.memory)
class Agent:
def __init__(self, state_size, action_size):
self.state_size = state_size
self.action_size = action_size
# Q-Network
# During learning we use two separate Q-networks (qnetwork_local and qnetwork_target) to compute the predicted values (weights θ) and the target values (weights θ').
# After a number of steps, the target network is frozen and the local Q-network's weights are copied into it.
# Freezing the target Q-network for a while before updating it with the local network's weights stabilizes training.
self.qnetwork_local = QNetwork(state_size, action_size).to(device)
self.qnetwork_target = QNetwork(state_size, action_size).to(device)  # target network: the policy the agent is learning
self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
# As in linear regression or supervised learning, we measure the gap between the predicted and the true values, i.e. the loss.
# After the loss is computed, an optimizer is used to tune the parameters of the model/network,
# usually with the final goal of driving the loss towards a minimum.
# Replay buffer: we keep a replay_buffer that takes in new interaction data and discards old data,
# and every update draws a random batch from it to train the system.
self.memory = ReplayBuffer(action_size, buffer_size=BUFFER_SIZE,batch_size=BATCH_SIZE)
# initialize the step counter
self.t_step = 0
# initialize the inventory (current holdings)
self.inventory = []
# Experience replay is exactly such a technique: while game data is being collected, every experience tuple (s, a, r, s')
# is stored in a replay memory. When training the network, a small random batch is drawn from the replay memory
# instead of the most recent consecutive transitions.
def step(self, state, action, reward, next_state, done):
# store the transition in the replay memory at every step
self.memory.add(state, action, reward, next_state, done)
# learn every UPDATE_EVERY steps
self.t_step = (self.t_step + 1) % UPDATE_EVERY
if self.t_step == 0:
if len(self.memory) > BATCH_SIZE:
experience = self.memory.sample()
self.learn(experience, GAMMA)
def learn(self, experience, gamma):
# update step: unpack the sampled batch
states, actions, rewards, next_states, dones = experience
# target network: compute and minimize the loss
# Get max predicted Q values (for next states) from the target model
Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
# Compute the Q targets for the current states
Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
Q_expected = self.qnetwork_local(states).gather(1, actions.long())  # gather: for each row, pick the column of the action that was taken
# Compute loss
loss = F.mse_loss(Q_expected, Q_targets)
# Minimize the loss
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
# update target network
self.soft_update(self.qnetwork_local, self.qnetwork_target, tau=TAU)
return loss
def soft_update(self, local_model, target_model, tau):
for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
# soft update: makes the target network Q' track the local network Q (uses .data and .data.copy_)
def act(self, state, eps = 0.):
# Returns actions for the given state as per the current policy.
# Params
#     state (array_like): current state
#     eps (float): epsilon, for epsilon-greedy action selection
# torch.unsqueeze() is used here to add an extra (batch) dimension to the tensor
#state = torch.from_numpy(state).float().unsqueeze(0).to(device)
state = torch.tensor(state).float.unsqueeze(0).to(device)
self.qnetwork_local.eval()
with torch.no_grad():
action_values = self.qnetwork_local(state)
self.qnetwork_local.train()
# # Epsilon-greedy action selection
if random.random() > eps:
return np.argmax(action_values.cpu().data.numpy())
else:
return random.choice(np.arange(self.action_size))
# (eps: epsilon-greedy choice of an action for the given state under the current policy)
I'm really out of ideas at this point, so I want to ask what exactly is going on, because what I really want is to print out that loss, but it doesn't seem to work. I'm hoping someone here can take a look for me; since this code is for my graduation thesis, I thought I'd ask. Thanks a lot!
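In case it makes my goal clearer, the rough idea of what I want dqn1() to do with the loss is sketched below. This is only my guess, not working code: in my current dqn_agent.py, step() calls self.learn() but does not return its value, and it only learns every UPDATE_EVERY steps once the buffer holds BATCH_SIZE samples, so I assume I would also have to make step() return the loss and skip the steps where it is None:

# what I want inside the t-loop of dqn1(), assuming Agent.step were changed to
# `return self.learn(experience, GAMMA)` so it hands the loss back (and None when it did not learn)
loss = agent.step(state, action, reward, next_state, done)
if loss is not None:
    print("loss:", loss.item())        # .item() turns the loss tensor into a plain float
    scores1.append(loss.item())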