新手学Python,我遇到一个写在一条语句里面的for循环,不能理解这个for循环是什么意思。
如果改成常见的那种for循环,应该怎么写?
我看不懂的代码是这一句
Q[s, a] = np.sum([T[s, a, sp] * (R[s, a, sp] + discount_rate * np.max(Q_prev[sp])) for sp in range(3)])
这是完整的代码,运行环境是Jupyter,Python版本是3.6.5。
import numpy as np
# Q-value iteration on a small Markov decision process.
#
# T and R are both indexed as [state, action, next_state]. Rows filled with
# nan belong to (state, action) pairs that are not listed in
# possible_actions; they are never read by the update loop below.
nan = np.nan
T = np.array([  # transition probabilities
    [[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]],
    [[0.0, 1.0, 0.0], [nan, nan, nan], [0.0, 0.0, 1.0]],
    [[nan, nan, nan], [0.8, 0.1, 0.1], [nan, nan, nan]],
])
R = np.array([  # rewards
    [[10., 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
    [[10., 0.0, 0.0], [nan, nan, nan], [0.0, 0.0, -50.0]],
    [[nan, nan, nan], [40.0, 0.0, 0.0], [nan, nan, nan]],
])
possible_actions = [[0, 1, 2], [0, 2], [1]]  # actions available in each state

# Derive the number of states from the data instead of hard-coding 3.
n_states = T.shape[0]

Q = np.full(T.shape[:2], -np.inf)  # -inf marks impossible actions
for state, actions in enumerate(possible_actions):
    Q[state, actions] = 0.0  # every possible action starts at 0.0

learning_rate = 0.01  # not referenced by the iteration below
discount_rate = 0.95
n_iterations = 100

for iteration in range(n_iterations):
    # Freeze the previous estimates so that every update in this sweep reads
    # the same values, regardless of the order in which Q is overwritten.
    Q_prev = Q.copy()
    # Best previous Q-value of each state. It does not depend on (s, a), so
    # it is computed once per sweep. The -inf entries never win the max
    # because every state has at least one possible action.
    best_next = Q_prev.max(axis=1)
    for s in range(n_states):
        for a in possible_actions[s]:
            # Vectorised form of the sum over all next states sp, i.e. the
            # same as the plain nested loop:
            #     total = 0.0
            #     for sp in range(n_states):
            #         total += T[s, a, sp] * (
            #             R[s, a, sp] + discount_rate * np.max(Q_prev[sp]))
            #     Q[s, a] = total
            Q[s, a] = np.sum(T[s, a] * (R[s, a] + discount_rate * best_next))