在测试官方StyleGAN。
运行官方预训练模型的 pretrained_example.py 和 generate_figures.py 没有问题,GPU 工作正常。
运行train.py时报错
尝试只用单个GPU训练时没有报错。
NcclAllReduce应该跟多GPU通信有关,不太了解。
InvalidArgumentError (see above for traceback): No OpKernel was registered to support Op 'NcclAllReduce' with these attrs. Registered devices: [CPU,GPU], Registered kernels:
[[Node: TrainD/SumAcrossGPUs/NcclAllReduce = NcclAllReduce[T=DT_FLOAT, num_devices=2, reduction="sum", shared_name="c112", _device="/device:GPU:0"](GPU0/TrainD_grad/gradients/AddN_160)]]
经过多番 Google 搜索,
已经尝试过以下方法:
重启
conda install keras-gpu
重新安装tensorflow-gpu==1.10.0(跟官方版本保持一致)
……
Building TensorFlow graph...
Setting up snapshot image grid...
Setting up run dir...
Training...
Traceback (most recent call last):
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\python\client\session.py", line 1278, in _do_call
return fn(*args)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\python\client\session.py", line 1263, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\python\client\session.py", line 1350, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: No OpKernel was registered to support Op 'NcclAllReduce' with these attrs. Registered devices: [CPU,GPU], Registered kernels:
<no registered kernels>
[[Node: TrainD/SumAcrossGPUs/NcclAllReduce = NcclAllReduce[T=DT_FLOAT, num_devices=2, reduction="sum", shared_name="c112", _device="/device:GPU:0"](GPU0/TrainD_grad/gradients/AddN_160)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "train.py", line 191, in <module>
main()
File "train.py", line 186, in main
dnnlib.submit_run(**kwargs)
File "E:\MachineLearning\stylegan-master\dnnlib\submission\submit.py", line 290, in submit_run
run_wrapper(submit_config)
File "E:\MachineLearning\stylegan-master\dnnlib\submission\submit.py", line 242, in run_wrapper
util.call_func_by_name(func_name=submit_config.run_func_name, submit_config=submit_config, **submit_config.run_func_kwargs)
File "E:\MachineLearning\stylegan-master\dnnlib\util.py", line 257, in call_func_by_name
return func_obj(*args, **kwargs)
File "E:\MachineLearning\stylegan-master\training\training_loop.py", line 230, in training_loop
tflib.run([D_train_op, Gs_update_op], {lod_in: sched.lod, lrate_in: sched.D_lrate, minibatch_in: sched.minibatch})
File "E:\MachineLearning\stylegan-master\dnnlib\tflib\tfutil.py", line 26, in run
return tf.get_default_session().run(*args, **kwargs)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\python\client\session.py", line 877, in run
run_metadata_ptr)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\python\client\session.py", line 1100, in _run
feed_dict_tensor, options, run_metadata)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\python\client\session.py", line 1272, in _do_run
run_metadata)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\python\client\session.py", line 1291, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: No OpKernel was registered to support Op 'NcclAllReduce' with these attrs. Registered devices: [CPU,GPU], Registered kernels:
<no registered kernels>
[[Node: TrainD/SumAcrossGPUs/NcclAllReduce = NcclAllReduce[T=DT_FLOAT, num_devices=2, reduction="sum", shared_name="c112", _device="/device:GPU:0"](GPU0/TrainD_grad/gradients/AddN_160)]]
Caused by op 'TrainD/SumAcrossGPUs/NcclAllReduce', defined at:
File "train.py", line 191, in <module>
main()
File "train.py", line 186, in main
dnnlib.submit_run(**kwargs)
File "E:\MachineLearning\stylegan-master\dnnlib\submission\submit.py", line 290, in submit_run
run_wrapper(submit_config)
File "E:\MachineLearning\stylegan-master\dnnlib\submission\submit.py", line 242, in run_wrapper
util.call_func_by_name(func_name=submit_config.run_func_name, submit_config=submit_config, **submit_config.run_func_kwargs)
File "E:\MachineLearning\stylegan-master\dnnlib\util.py", line 257, in call_func_by_name
return func_obj(*args, **kwargs)
File "E:\MachineLearning\stylegan-master\training\training_loop.py", line 185, in training_loop
D_train_op = D_opt.apply_updates()
File "E:\MachineLearning\stylegan-master\dnnlib\tflib\optimizer.py", line 135, in apply_updates
g = nccl_ops.all_sum(g)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\contrib\nccl\python\ops\nccl_ops.py", line 49, in all_sum
return _apply_all_reduce('sum', tensors)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\contrib\nccl\python\ops\nccl_ops.py", line 230, in _apply_all_reduce
shared_name=shared_name))
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\contrib\nccl\ops\gen_nccl_ops.py", line 59, in nccl_all_reduce
num_devices=num_devices, shared_name=shared_name, name=name)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\python\util\deprecation.py", line 454, in new_func
return func(*args, **kwargs)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\python\framework\ops.py", line 3156, in create_op
op_def=op_def)
File "d:\Users\admin\Anaconda3\envs\tfenv\lib\site-packages\tensorflow\python\framework\ops.py", line 1718, in __init__
self._traceback = tf_stack.extract_stack()
InvalidArgumentError (see above for traceback): No OpKernel was registered to support Op 'NcclAllReduce' with these attrs. Registered devices: [CPU,GPU], Registered kernels:
<no registered kernels>
[[Node: TrainD/SumAcrossGPUs/NcclAllReduce = NcclAllReduce[T=DT_FLOAT, num_devices=2, reduction="sum", shared_name="c112", _device="/device:GPU:0"](GPU0/TrainD_grad/gradients/AddN_160)]]
#conda list:
# Name Version Build Channel
_tflow_select 2.1.0 gpu
absl-py 0.8.1 pypi_0 pypi
alabaster 0.7.12 py36_0
asn1crypto 1.2.0 py36_0
astor 0.8.0 pypi_0 pypi
astroid 2.3.2 py36_0
attrs 19.3.0 py_0
babel 2.7.0 py_0
backcall 0.1.0 py36_0
blas 1.0 mkl
bleach 3.1.0 py36_0
ca-certificates 2019.10.16 0
certifi 2019.9.11 py36_0
cffi 1.13.1 py36h7a1dbc1_0
chardet 3.0.4 py36_1003
cloudpickle 1.2.2 py_0
colorama 0.4.1 py36_0
cryptography 2.8 py36h7a1dbc1_0
cudatoolkit 9.0 1
cudnn 7.6.4 cuda9.0_0
decorator 4.4.1 py_0
defusedxml 0.6.0 py_0
django 2.2.7 pypi_0 pypi
docutils 0.15.2 py36_0
entrypoints 0.3 py36_0
gast 0.3.2 py_0
grpcio 1.25.0 pypi_0 pypi
h5py 2.9.0 py36h5e291fa_0
hdf5 1.10.4 h7ebc959_0
icc_rt 2019.0.0 h0cc432a_1
icu 58.2 ha66f8fd_1
idna 2.8 pypi_0 pypi
image 1.5.27 pypi_0 pypi
imagesize 1.1.0 py36_0
importlib_metadata 0.23 py36_0
intel-openmp 2019.4 245
ipykernel 5.1.3 py36h39e3cac_0
ipython 7.9.0 py36h39e3cac_0
ipython_genutils 0.2.0 py36h3c5d0ee_0
isort 4.3.21 py36_0
jedi 0.15.1 py36_0
jinja2 2.10.3 py_0
jpeg 9b hb83a4c4_2
jsonschema 3.1.1 py36_0
jupyter_client 5.3.4 py36_0
jupyter_core 4.6.1 py36_0
keras-applications 1.0.8 py_0
keras-base 2.2.4 py36_0
keras-gpu 2.2.4 0
keras-preprocessing 1.1.0 py_1
keyring 18.0.0 py36_0
lazy-object-proxy 1.4.3 py36he774522_0
libpng 1.6.37 h2a8f88b_0
libprotobuf 3.9.2 h7bd577a_0
libsodium 1.0.16 h9d3ae62_0
markdown 3.1.1 py36_0
markupsafe 1.1.1 py36he774522_0
mccabe 0.6.1 py36_1
mistune 0.8.4 py36he774522_0
mkl 2019.4 245
mkl-service 2.3.0 py36hb782905_0
mkl_fft 1.0.15 py36h14836fe_0
mkl_random 1.1.0 py36h675688f_0
more-itertools 7.2.0 py36_0
nbconvert 5.6.1 py36_0
nbformat 4.4.0 py36h3a5bc1b_0
numpy 1.17.3 py36h4ceb530_0
numpy-base 1.17.3 py36hc3f5095_0
numpydoc 0.9.1 py_0
openssl 1.1.1d he774522_3
packaging 19.2 py_0
pandoc 2.2.3.2 0
pandocfilters 1.4.2 py36_1
parso 0.5.1 py_0
pickleshare 0.7.5 py36_0
pillow 6.2.1 pypi_0 pypi
pip 19.3.1 py36_0
prompt_toolkit 2.0.10 py_0
protobuf 3.10.0 pypi_0 pypi
psutil 5.6.3 py36he774522_0
pycodestyle 2.5.0 py36_0
pycparser 2.19 py36_0
pyflakes 2.1.1 py36_0
pygments 2.4.2 py_0
pylint 2.4.3 py36_0
pyopenssl 19.0.0 py36_0
pyparsing 2.4.2 py_0
pyqt 5.9.2 py36h6538335_2
pyreadline 2.1 py36_1
pyrsistent 0.15.4 py36he774522_0
pysocks 1.7.1 py36_0
python 3.6.9 h5500b2f_0
python-dateutil 2.8.1 py_0
pytz 2019.3 py_0
pywin32 223 py36hfa6e2cd_1
pyyaml 5.1.2 py36he774522_0
pyzmq 18.1.0 py36ha925a31_0
qt 5.9.7 vc14h73c81de_0
qtawesome 0.6.0 py_0
qtconsole 4.5.5 py_0
qtpy 1.9.0 py_0
requests 2.22.0 py36_0
rope 0.14.0 py_0
scipy 1.3.1 py36h29ff71c_0
setuptools 39.1.0 pypi_0 pypi
sip 4.19.8 py36h6538335_0
six 1.13.0 pypi_0 pypi
snowballstemmer 2.0.0 py_0
sphinx 2.2.1 py_0
sphinxcontrib-applehelp 1.0.1 py_0
sphinxcontrib-devhelp 1.0.1 py_0
sphinxcontrib-htmlhelp 1.0.2 py_0
sphinxcontrib-jsmath 1.0.1 py_0
sphinxcontrib-qthelp 1.0.2 py_0
sphinxcontrib-serializinghtml 1.1.3 py_0
spyder 3.3.6 py36_0
spyder-kernels 0.5.2 py36_0
sqlite 3.30.1 he774522_0
sqlparse 0.3.0 pypi_0 pypi
tensorboard 1.10.0 py36he025d50_0
tensorflow 1.10.0 gpu_py36h3514669_0
tensorflow-base 1.10.0 gpu_py36h6e53903_0
tensorflow-gpu 1.10.0 pypi_0 pypi
termcolor 1.1.0 pypi_0 pypi
testpath 0.4.2 py36_0
tornado 6.0.3 py36he774522_0
traitlets 4.3.3 py36_0
typed-ast 1.4.0 py36he774522_0
urllib3 1.25.6 pypi_0 pypi
vc 14.1 h0510ff6_4
vs2015_runtime 14.16.27012 hf0eaf9b_0
wcwidth 0.1.7 py36h3d5aa90_0
webencodings 0.5.1 py36_1
werkzeug 0.16.0 py_0
wheel 0.33.6 py36_0
win_inet_pton 1.1.0 py36_0
wincertstore 0.2 py36h7fe50ca_0
wrapt 1.11.2 py36he774522_0
yaml 0.1.7 hc54c509_2
zeromq 4.3.1 h33f27b4_3
zipp 0.6.0 py_0
zlib 1.2.11 h62dcd97_3
硬件:2 × RTX 2080 Ti,显卡驱动版本 4.19.67