报错信息如下（4 个进程的输出交错在一起，因此各行有重复）：
Traceback (most recent call last):
File "apex_sst.py", line 16, in <module>
File "apex_sst.py", line 16, in <module>
File "apex_sst.py", line 16, in <module>
Traceback (most recent call last):
File "apex_sst.py", line 16, in <module>
from apex import amp
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/__init__.py", line 18, in <module>
from apex import amp
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/__init__.py", line 18, in <module>
from apex import amp
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/__init__.py", line 18, in <module>
from apex import amp
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/__init__.py", line 18, in <module>
from apex.interfaces import (ApexImplementation,
from apex.interfaces import (ApexImplementation,
from apex.interfaces import (ApexImplementation,
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/interfaces.py", line 10, in <module>
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/interfaces.py", line 10, in <module>
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/interfaces.py", line 10, in <module>
from apex.interfaces import (ApexImplementation,
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/interfaces.py", line 10, in <module>
class ApexImplementation(object):
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/interfaces.py", line 14, in ApexImplementation
class ApexImplementation(object):
class ApexImplementation(object):
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/interfaces.py", line 14, in ApexImplementation
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/interfaces.py", line 14, in ApexImplementation
class ApexImplementation(object):
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/apex/interfaces.py", line 14, in ApexImplementation
implements(IApex)
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/zope/interface/declarations.py", line 706, in implements
implements(IApex)
implements(IApex)
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/zope/interface/declarations.py", line 706, in implements
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/zope/interface/declarations.py", line 706, in implements
implements(IApex)
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/zope/interface/declarations.py", line 706, in implements
raise TypeError(_ADVICE_ERROR % 'implementer')
raise TypeError(_ADVICE_ERROR % 'implementer')
raise TypeError(_ADVICE_ERROR % 'implementer')
TypeError: Class advice impossible in Python3. Use the @implementer class decorator instead.
TypeError: Class advice impossible in Python3. Use the @implementer class decorator instead.
TypeError: Class advice impossible in Python3. Use the @implementer class decorator instead.
raise TypeError(_ADVICE_ERROR % 'implementer')
TypeError: Class advice impossible in Python3. Use the @implementer class decorator instead.
Traceback (most recent call last):
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/torch/distributed/launch.py", line 263, in <module>
main()
File "/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/lib/python3.6/site-packages/torch/distributed/launch.py", line 259, in main
cmd=cmd)
subprocess.CalledProcessError: Command '['/GPUFS/app_GPU/application/anaconda3/5.3.1/envs/pytorch14/bin/python', '-u', 'apex_sst.py', '--local_rank=3']' returned non-zero exit status 1.
使用的执行命令如下：
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.launch --nproc_per_node=4 apex_sst.py
运行环境：单个节点，该节点上有 4 块 GPU。