Hardware: NVIDIA A100 80GB
Versions: NVIDIA-SMI 550.144.03, Driver Version 550.144.03, CUDA Version 12.4
Launch command:
```shell
docker run --gpus all -e CUDA_LAUNCH_BLOCKING=1 -e TORCH_USE_CUDA_DSA=1 -d \
  -v /data/workspace/models:/data/workspace/models \
  -p 38001:8000 \
  --ipc=host \
  --name Qwen2-Audio-7B-Instruct \
  docker.1ms.run/vllm/vllm-openai:latest \
  --model /data/workspace/models/Qwen2-Audio-7B-Instruct \
  --served-model-name Qwen2-Audio-7B-Instruct \
  --enforce-eager \
  --gpu-memory-utilization 0.7
```
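Since the failure below is a generic `CUDA error: unspecified launch failure` raised by the very first GPU kernels, a quick sanity check (a suggestion, not part of the original report) is to confirm the driver and GPU are actually usable inside the same image before suspecting the model or vLLM. This assumes the NVIDIA container runtime injects `nvidia-smi` into the container, as is standard with `--gpus all`:

```shell
# Diagnostic sketch: override the image's entrypoint and run nvidia-smi
# inside the same container image used above. If this fails or hangs,
# the NVIDIA driver / container runtime is broken, not the launch command.
docker run --rm --gpus all \
  --entrypoint nvidia-smi \
  docker.1ms.run/vllm/vllm-openai:latest
```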
The following error appears in the container logs:
```shell
INFO 04-24 19:49:24 __init__.py:207] Automatically detected platform cuda.
INFO 04-24 19:49:24 api_server.py:912] vLLM API server version 0.7.3
INFO 04-24 19:49:24 api_server.py:913] args: Namespace(host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, enable_reasoning=False, reasoning_parser=None, tool_call_parser=None, tool_parser_plugin='', model='/data/workspace/models/Qwen2-Audio-7B-Instruct', task='auto', tokenizer=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, allowed_local_media_path=None, download_dir=None, load_format='auto', config_format=<ConfigFormat.AUTO: 'auto'>, dtype='auto', kv_cache_dtype='auto', max_model_len=None, guided_decoding_backend='xgrammar', logits_processor_pattern=None, model_impl='auto', distributed_executor_backend=None, pipeline_parallel_size=1, tensor_parallel_size=1, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=None, enable_prefix_caching=None, disable_sliding_window=False, use_v2_block_manager=True, num_lookahead_slots=0, seed=0, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.7, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_partial_prefills=1, max_long_partial_prefills=1, long_prefill_token_threshold=0, max_num_seqs=None, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, hf_overrides=None, enforce_eager=True, max_seq_len_to_capture=8192, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, enable_lora=False, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, multi_step_stream_outputs=True, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_model=None, speculative_model_quantization=None, num_speculative_tokens=None, speculative_disable_mqa_scorer=False, speculative_draft_tensor_parallel_size=None, speculative_max_model_len=None, speculative_disable_by_batch_size=None, ngram_prompt_lookup_max=None, ngram_prompt_lookup_min=None, spec_decoding_acceptance_method='rejection_sampler', typical_acceptance_sampler_posterior_threshold=None, typical_acceptance_sampler_posterior_alpha=None, disable_logprobs_during_spec_decoding=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=['Qwen2-Audio-7B-Instruct'], qlora_adapter_name_or_path=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, scheduling_policy='fcfs', scheduler_cls='vllm.core.scheduler.Scheduler', override_neuron_config=None, override_pooler_config=None, compilation_config=None, kv_transfer_config=None, worker_cls='auto', generation_config=None, override_generation_config=None, enable_sleep_mode=False, calculate_kv_scales=False, additional_config=None, disable_log_requests=False, 
max_log_len=None, disable_fastapi_docs=False, enable_prompt_tokens_details=False)
INFO 04-24 19:49:24 api_server.py:209] Started engine process with PID 28
INFO 04-24 19:49:29 __init__.py:207] Automatically detected platform cuda.
INFO 04-24 19:49:31 config.py:549] This model supports multiple tasks: {'generate', 'classify', 'embed', 'score', 'reward'}. Defaulting to 'generate'.
WARNING 04-24 19:49:31 cuda.py:95] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used
WARNING 04-24 19:49:31 config.py:685] Async output processing is not supported on the current platform type cuda.
INFO 04-24 19:49:35 config.py:549] This model supports multiple tasks: {'classify', 'embed', 'score', 'reward', 'generate'}. Defaulting to 'generate'.
WARNING 04-24 19:49:35 cuda.py:95] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used
WARNING 04-24 19:49:35 config.py:685] Async output processing is not supported on the current platform type cuda.
INFO 04-24 19:49:35 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='/data/workspace/models/Qwen2-Audio-7B-Instruct', speculative_config=None, tokenizer='/data/workspace/models/Qwen2-Audio-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=Qwen2-Audio-7B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=False, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[],"max_capture_size":0}, use_cached_outputs=True,
INFO 04-24 19:49:37 cuda.py:229] Using Flash Attention backend.
INFO 04-24 19:49:37 model_runner.py:1110] Starting to load model /data/workspace/models/Qwen2-Audio-7B-Instruct...
INFO 04-24 19:49:37 config.py:3054] cudagraph sizes specified by model runner [] is overridden by config []
ERROR 04-24 19:49:39 engine.py:400] CUDA error: unspecified launch failure
ERROR 04-24 19:49:39 engine.py:400] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
ERROR 04-24 19:49:39 engine.py:400] Traceback (most recent call last):
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 391, in run_mp_engine
ERROR 04-24 19:49:39 engine.py:400] engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 124, in from_engine_args
ERROR 04-24 19:49:39 engine.py:400] return cls(ipc_path=ipc_path,
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 76, in __init__
ERROR 04-24 19:49:39 engine.py:400] self.engine = LLMEngine(*args, **kwargs)
Process SpawnProcess-1:
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 273, in __init__
ERROR 04-24 19:49:39 engine.py:400] self.model_executor = executor_class(vllm_config=vllm_config, )
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 52, in __init__
ERROR 04-24 19:49:39 engine.py:400] self._init_executor()
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 47, in _init_executor
ERROR 04-24 19:49:39 engine.py:400] self.collective_rpc("load_model")
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
ERROR 04-24 19:49:39 engine.py:400] answer = run_method(self.driver_worker, method, args, kwargs)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/utils.py", line 2196, in run_method
ERROR 04-24 19:49:39 engine.py:400] return func(*args, **kwargs)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 183, in load_model
ERROR 04-24 19:49:39 engine.py:400] self.model_runner.load_model()
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1112, in load_model
ERROR 04-24 19:49:39 engine.py:400] self.model = get_model(vllm_config=self.vllm_config)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/__init__.py", line 14, in get_model
ERROR 04-24 19:49:39 engine.py:400] return loader.load_model(vllm_config=vllm_config)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 406, in load_model
ERROR 04-24 19:49:39 engine.py:400] model = _initialize_model(vllm_config=vllm_config)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 125, in _initialize_model
ERROR 04-24 19:49:39 engine.py:400] return model_class(vllm_config=vllm_config, prefix=prefix)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2_audio.py", line 269, in __init__
ERROR 04-24 19:49:39 engine.py:400] self.language_model = init_vllm_registered_model(
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 260, in init_vllm_registered_model
ERROR 04-24 19:49:39 engine.py:400] return _initialize_model(vllm_config=vllm_config, prefix=prefix)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 125, in _initialize_model
ERROR 04-24 19:49:39 engine.py:400] return model_class(vllm_config=vllm_config, prefix=prefix)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 453, in __init__
ERROR 04-24 19:49:39 engine.py:400] self.model = Qwen2Model(vllm_config=vllm_config,
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 151, in __init__
ERROR 04-24 19:49:39 engine.py:400] old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 307, in __init__
ERROR 04-24 19:49:39 engine.py:400] self.start_layer, self.end_layer, self.layers = make_layers(
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 558, in make_layers
ERROR 04-24 19:49:39 engine.py:400] maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 309, in <lambda>
ERROR 04-24 19:49:39 engine.py:400] lambda prefix: Qwen2DecoderLayer(config=config,
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 208, in __init__
ERROR 04-24 19:49:39 engine.py:400] self.self_attn = Qwen2Attention(
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 153, in __init__
ERROR 04-24 19:49:39 engine.py:400] self.rotary_emb = get_rope(
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding.py", line 1025, in get_rope
ERROR 04-24 19:49:39 engine.py:400] rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base,
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding.py", line 98, in __init__
ERROR 04-24 19:49:39 engine.py:400] cache = self._compute_cos_sin_cache()
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding.py", line 118, in _compute_cos_sin_cache
ERROR 04-24 19:49:39 engine.py:400] freqs = torch.einsum("i,j -> ij", t, inv_freq)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/torch/functional.py", line 390, in einsum
ERROR 04-24 19:49:39 engine.py:400] return handle_torch_function(einsum, operands, equation, *operands)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/torch/overrides.py", line 1717, in handle_torch_function
ERROR 04-24 19:49:39 engine.py:400] result = mode.__torch_function__(public_api, types, args, kwargs)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_device.py", line 106, in __torch_function__
ERROR 04-24 19:49:39 engine.py:400] return func(*args, **kwargs)
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] File "/usr/local/lib/python3.12/dist-packages/torch/functional.py", line 402, in einsum
ERROR 04-24 19:49:39 engine.py:400] return _VF.einsum(equation, operands) # type: ignore[attr-defined]
ERROR 04-24 19:49:39 engine.py:400] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 19:49:39 engine.py:400] RuntimeError: CUDA error: unspecified launch failure
ERROR 04-24 19:49:39 engine.py:400] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
ERROR 04-24 19:49:39 engine.py:400]
Traceback (most recent call last):
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 402, in run_mp_engine
raise e
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 391, in run_mp_engine
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 124, in from_engine_args
return cls(ipc_path=ipc_path,
^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 76, in __init__
self.engine = LLMEngine(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 273, in __init__
self.model_executor = executor_class(vllm_config=vllm_config, )
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 52, in __init__
self._init_executor()
File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 47, in _init_executor
self.collective_rpc("load_model")
File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/utils.py", line 2196, in run_method
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 183, in load_model
self.model_runner.load_model()
File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1112, in load_model
self.model = get_model(vllm_config=self.vllm_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/__init__.py", line 14, in get_model
return loader.load_model(vllm_config=vllm_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 406, in load_model
model = _initialize_model(vllm_config=vllm_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 125, in _initialize_model
return model_class(vllm_config=vllm_config, prefix=prefix)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2_audio.py", line 269, in __init__
self.language_model = init_vllm_registered_model(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 260, in init_vllm_registered_model
return _initialize_model(vllm_config=vllm_config, prefix=prefix)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 125, in _initialize_model
return model_class(vllm_config=vllm_config, prefix=prefix)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 453, in __init__
self.model = Qwen2Model(vllm_config=vllm_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 151, in __init__
old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 307, in __init__
self.start_layer, self.end_layer, self.layers = make_layers(
^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 558, in make_layers
maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 309, in <lambda>
lambda prefix: Qwen2DecoderLayer(config=config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 208, in __init__
self.self_attn = Qwen2Attention(
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 153, in __init__
self.rotary_emb = get_rope(
^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding.py", line 1025, in get_rope
rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding.py", line 98, in __init__
cache = self._compute_cos_sin_cache()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding.py", line 118, in _compute_cos_sin_cache
freqs = torch.einsum("i,j -> ij", t, inv_freq)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/functional.py", line 390, in einsum
return handle_torch_function(einsum, operands, equation, *operands)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/overrides.py", line 1717, in handle_torch_function
result = mode.__torch_function__(public_api, types, args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/utils/_device.py", line 106, in __torch_function__
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/functional.py", line 402, in einsum
return _VF.einsum(equation, operands) # type: ignore[attr-defined]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: CUDA error: unspecified launch failure
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
[rank0]:[W424 19:49:39.047951708 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 991, in <module>
uvloop.run(run_server(args))
File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 109, in run
return __asyncio.run(
^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 61, in wrapper
return await main
^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 947, in run_server
async with build_async_engine_client(args) as engine_client:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 139, in build_async_engine_client
async with build_async_engine_client_from_engine_args(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 233, in build_async_engine_client_from_engine_args
raise RuntimeError(
RuntimeError: Engine process failed to start. See stack trace for the root cause.
```
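The traceback bottoms out in `torch.einsum` while `rotary_embedding.py` precomputes the rotary cos/sin cache, i.e. in an ordinary dense PyTorch op during model construction, before any vLLM-specific kernel runs. A minimal reproduction in the same image (a diagnostic sketch, not from the original report; the shapes loosely mirror `max_seq_len=8192` and a 64-entry inverse-frequency vector, and are illustrative) isolates the environment from the model:

```shell
# Diagnostic sketch: run roughly the same einsum that fails in
# vllm/model_executor/layers/rotary_embedding.py, on the GPU, inside
# the same container image. A clean run prints torch.Size([8192, 64]).
docker run --rm --gpus all \
  --entrypoint python3 \
  docker.1ms.run/vllm/vllm-openai:latest \
  -c "import torch; t = torch.arange(8192, dtype=torch.float32, device='cuda'); inv = torch.rand(64, device='cuda'); print(torch.einsum('i,j->ij', t, inv).shape)"
```

If this one-liner also dies with `unspecified launch failure`, the problem lies in the GPU, driver, or container runtime (for example a driver/image mismatch or a GPU that needs a reset) rather than in Qwen2-Audio-7B-Instruct or vLLM. Note also that setting `TORCH_USE_CUDA_DSA=1` at runtime has no effect on a prebuilt PyTorch wheel: device-side assertions must be compiled into PyTorch, which is why the log still suggests "Compile with `TORCH_USE_CUDA_DSA`".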