weixin_38946936
weixin_38946936
2018-11-16 12:59

tensorflow-gpu Failed to get convolution algorithm.

40
  • tensorflow
  • 神经网络
  • gpu

成功安装了GPU版的TensorFlow之后,尝试跑两个神经网络。
第一个:全连接的DNN
关键代码如下:

# Fully connected network: 10 input features -> three hidden layers of
# 5000 units with ReLU -> 7 sigmoid outputs.
# Input placeholder: float32, any batch size, 10 features per row.
xs=tf.placeholder(tf.float32,[None,10])
# Target placeholder: float32, any batch size, 7 values per row.
ys=tf.placeholder(tf.float32,[None,7])


# Hidden layer 1: 10 -> 5000, ReLU activation.
'layer1:ful connect'
W_fc1=weight_variable([10,5000],name_data=None) 
b_fc1=bias_variable([5000],name_data=None)

h_fc1=tf.nn.relu(tf.matmul(xs,W_fc1)+b_fc1)

# Hidden layer 2: 5000 -> 5000, ReLU activation.
'layer2:ful connect'
W_fc2=weight_variable([5000,5000],name_data=None) 
b_fc2=bias_variable([5000],name_data=None)

h_fc2=tf.nn.relu(tf.matmul(h_fc1,W_fc2)+b_fc2)

# Hidden layer 3: 5000 -> 5000, ReLU activation.
'layer3:ful connect'
W_fc3=weight_variable([5000,5000],name_data=None) 
b_fc3=bias_variable([5000],name_data=None)

h_fc3=tf.nn.relu(tf.matmul(h_fc2,W_fc3)+b_fc3)

# Output layer: 5000 -> 7.
# NOTE(review): the label string below mentions "maxsoft", but the activation
# actually applied is tf.nn.sigmoid, not softmax.
'output layer::ful connect,maxsoft'
W_fc4=weight_variable([5000,7],name_data=None) 
b_fc4=bias_variable([7],name_data=None)


output=tf.nn.sigmoid(tf.matmul(h_fc3,W_fc4)+b_fc4)

能够顺利地利用GPU加速,确实比CPU的计算速度快不少。
然而,在跑CNN的时候(部分代码如下):

'def weights'
def weight_variable(shape, name_data):
    """Build a float32 tf.Variable initialised from a truncated normal.

    Args:
        shape: shape of the variable to create.
        name_data: value passed as the variable's ``name`` (the callers in
            this file pass either a string or None).

    Returns:
        The tf.Variable wrapping the randomly initialised tensor.
    """
    # Initial values are sampled with a standard deviation of 0.1.
    start_values = tf.truncated_normal(shape=shape, stddev=0.1)
    weights = tf.Variable(start_values, name=name_data, dtype=tf.float32)
    return weights

'def biases'
def bias_variable(shape, name_data):
    """Build a float32 tf.Variable whose entries all start at 0.1.

    Args:
        shape: shape of the variable to create.
        name_data: value passed as the variable's ``name`` (the callers in
            this file pass either a string or None).

    Returns:
        The tf.Variable wrapping the constant-initialised tensor.
    """
    start_values = tf.constant(0.1, shape=shape)
    biases = tf.Variable(start_values, name=name_data, dtype=tf.float32)
    return biases

'def conv2d layer'
def conv2d(x, W):
    """Apply tf.nn.conv2d to ``x`` with filter ``W``.

    The convolution uses a stride of 1 along every axis and 'SAME' padding.

    Args:
        x: input tensor handed to tf.nn.conv2d.
        W: filter tensor handed to tf.nn.conv2d.

    Returns:
        The tensor produced by tf.nn.conv2d.
    """
    unit_strides = [1, 1, 1, 1]
    convolved = tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')
    return convolved

'def pooling layer as max_pool'
def max_pool_2x2_v(x):
    """Max-pool ``x`` with a [1,2,2,1] window, [1,2,2,1] strides, 'VALID' padding.

    Args:
        x: input tensor handed to tf.nn.max_pool.

    Returns:
        The pooled tensor.
    """
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    pooled = tf.nn.max_pool(x, ksize=window, strides=step, padding='VALID')
    return pooled

'def pooling layer as max_pool'
def max_pool_2x2_s(x):
    """Max-pool ``x`` with a [1,2,2,1] window, unit strides, 'SAME' padding.

    Unlike ``max_pool_2x2_v`` this variant moves the window one step at a
    time (strides of [1,1,1,1]) and pads with 'SAME'.

    Args:
        x: input tensor handed to tf.nn.max_pool.

    Returns:
        The pooled tensor.
    """
    window = [1, 2, 2, 1]
    step = [1, 1, 1, 1]
    pooled = tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')
    return pooled


# Input layer: flat 64-value rows are fed in and reshaped to 8x8x1 tensors.
'placeholder xs & ys'
# xs: float32, any batch size, 64 values per row; ys: one target value per row.
xs=tf.placeholder(tf.float32,[None,64])
ys=tf.placeholder(tf.float32,[None,1])
# NOTE(review): the label string below says 10*10, but the reshape on the
# next line produces tensors of shape [-1, 8, 8, 1].
'reshape the xs as x_image,which shape is 10*10'
x_image=tf.reshape(xs,[-1,8,8,1])
print('red input::',x_image)


# Layer 2: two parallel 3x3 convolutions over x_image, each followed by relu6,
# concatenated along axis 3.
'patch1'
# 3x3 kernel, 1 input channel -> 20 output channels.
W_conv_r_1_1=weight_variable([3,3,1,20],name_data='W_conv_r_1_1')
b_conv_r_1_1=bias_variable([20],name_data='b_conv_r_1_1')
h_conv_r_1_1=tf.nn.relu6(conv2d(x_image,W_conv_r_1_1)+b_conv_r_1_1)
'patch2'
# 3x3 kernel, 1 input channel -> 10 output channels.
W_conv_r_1_2=weight_variable([3,3,1,10],name_data='W_conv_r_1_2')
b_conv_r_1_2=bias_variable([10],name_data='b_conv_r_1_2')
h_conv_r_1_2=tf.nn.relu6(conv2d(x_image,W_conv_r_1_2)+b_conv_r_1_2)
'concat to layer2'
# Joins the 20- and 10-channel outputs on axis 3 (30 channels, matching the
# input-channel dimension of the layer-3 kernels below).
h_conv_r_1=tf.concat([h_conv_r_1_1,h_conv_r_1_2],3)
print("red layer2::",h_conv_r_1)

# Layer 3: two parallel 5x5 convolutions over h_conv_r_1, each followed by elu;
# the first is concatenated with the layer-2 patch h_conv_r_1_2.
'patch1'
# 5x5 kernel, 30 input channels -> 30 output channels.
W_conv_r_2_1=weight_variable([5,5,30,30],name_data='W_conv_r_2_1')
b_conv_r_2_1=bias_variable([30],name_data='b_conv_r_2_1')
h_conv_r_2_1=tf.nn.elu(conv2d(h_conv_r_1,W_conv_r_2_1)+b_conv_r_2_1)
'patch for next layer'
# 5x5 kernel, 30 input channels -> 15 output channels.
# NOTE(review): h_conv_r_2_2 is not used anywhere in the code shown here;
# presumably it feeds a later layer - confirm against the full script.
W_conv_r_2_2=weight_variable([5,5,30,15],name_data='W_conv_r_2_2')
b_conv_r_2_2=bias_variable([15],name_data='b_conv_r_2_2')
h_conv_r_2_2=tf.nn.elu(conv2d(h_conv_r_1,W_conv_r_2_2)+b_conv_r_2_2)
'concat for layer3'
# Concatenates the layer-3 patch1 output with the layer-2 patch2 output
# (h_conv_r_1_2, not h_conv_r_2_2) along axis 3.
h_conv_r_2=tf.concat([h_conv_r_2_1,h_conv_r_1_2],3)
print('red layer3;:',h_conv_r_2) 

上述代码是一个利用CNN训练黑白棋的程序,可以在CPU环境下顺利地运行,但是在GPU环境下运行时会报错:Failed to get convolution algorithm(无法获得卷积算法)。
完整的报错信息如下:

Traceback (most recent call last):
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1334, in _do_call
    return fn(*args)
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1319, in _run_fn
    options, feed_dict, fetch_list, target_list, run_metadata)
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1407, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
     [[{{node Conv2D}} = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Reshape, W_conv_r_1_1/read)]]
     [[{{node Sigmoid/_75}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_105_Sigmoid", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\fengg\Desktop\Othello with ResNet  3\Othello with ResNet-large\Othello with ResNet-large\train_ResNet.py", line 326, in <module>
    try_point=sess.run(prediction_r, feed_dict={xs:board_try,ys:[[0.0001]]})
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 929, in run
    run_metadata_ptr)
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1152, in _run
    feed_dict_tensor, options, run_metadata)
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1328, in _do_run
    run_metadata)
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\client\session.py", line 1348, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
     [[node Conv2D (defined at C:\Users\fengg\Desktop\Othello with ResNet  3\Othello with ResNet-large\Othello with ResNet-large\train_ResNet.py:31)  = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Reshape, W_conv_r_1_1/read)]]
     [[{{node Sigmoid/_75}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_105_Sigmoid", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'Conv2D', defined at:
  File "<string>", line 1, in <module>
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\idlelib\run.py", line 130, in main
    ret = method(*args, **kwargs)
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\idlelib\run.py", line 357, in runcode
    exec(code, self.locals)
  File "C:\Users\fengg\Desktop\Othello with ResNet  3\Othello with ResNet-large\Othello with ResNet-large\train_ResNet.py", line 57, in <module>
    h_conv_r_1_1=tf.nn.relu6(conv2d(x_image,W_conv_r_1_1)+b_conv_r_1_1)
  File "C:\Users\fengg\Desktop\Othello with ResNet  3\Othello with ResNet-large\Othello with ResNet-large\train_ResNet.py", line 31, in conv2d
    return tf.nn.conv2d(x,W,strides=[1,1,1,1],padding='SAME')
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\ops\gen_nn_ops.py", line 1044, in conv2d
    data_format=data_format, dilations=dilations, name=name)
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\util\deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 3274, in create_op
    op_def=op_def)
  File "C:\Users\fengg\AppData\Local\Programs\Python\Python35\lib\site-packages\tensorflow\python\framework\ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

UnknownError (see above for traceback): Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
     [[node Conv2D (defined at C:\Users\fengg\Desktop\Othello with ResNet  3\Othello with ResNet-large\Othello with ResNet-large\train_ResNet.py:31)  = Conv2D[T=DT_FLOAT, data_format="NHWC", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Reshape, W_conv_r_1_1/read)]]
     [[{{node Sigmoid/_75}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_105_Sigmoid", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]] 

请问这个问题该如何解决,谢谢了!

  • 点赞
  • 回答
  • 收藏
  • 复制链接分享

1条回答

为你推荐