<추론 사진>

<추론 오디오>
import soundfile as sf
import torch
import time
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
# Checkpoint shared by the model weights and the processor.
MODEL_ID = "Qwen/Qwen2.5-Omni-3B"

# Load the weights in float16 onto the first CUDA device.
# Alternatives that were tried here: torch_dtype=torch.bfloat16 or "auto" with
# device_map="auto" to load on the available device(s), and
# attn_implementation="flash_attention_2", which is recommended for better
# acceleration and memory saving.
load_options = {
    "torch_dtype": torch.float16,
    "device_map": "cuda:0",
}
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(MODEL_ID, **load_options)
processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)
# Prompt for the model: a system persona turn followed by a single user turn
# that carries one image and one audio clip (no text from the user).
system_prompt = "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
user_content = [
    {"type": "image", "image": "dog.png"},
    {"type": "audio", "audio": "dog_audio.mp3"},
]
conversation = [
    {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
    {"role": "user", "content": user_content},
]
# Single switch for "use the audio track of video inputs"; it is passed to
# every call below that accepts it so the stages cannot disagree.
USE_AUDIO_IN_VIDEO = False

# Preparation for inference: render the conversation into the prompt string
# (tokenize=False, so tokenization is left to the processor call below).
text = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=False,
)

# Collect the audio / image / video inputs referenced by the conversation.
audios, images, videos = process_mm_info(
    conversation,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

# Debug output: show what the multimodal loader returned.
print("audios:", type(audios))
print("images:", type(images))
print("videos:", type(videos))
# Guard against both "no videos" (None) and an empty sequence before indexing.
if videos is not None and len(videos) > 0:
    print("video count:", len(videos))
    print("video shape:", videos[0].shape)

# Build the model inputs from the prompt text and the media.
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
# Move the inputs to the model's device, then to the model's dtype.
inputs = inputs.to(model.device).to(model.dtype)
# ─── Inference timing: start ─────────────────────────────────────
print("=" * 50)
print("추론 시작...")

# Synchronize the GPU before recording the start time.
if torch.cuda.is_available():
    torch.cuda.synchronize()
total_start = time.perf_counter()
# Start of the generation interval.
text_gen_start = time.perf_counter()

# Inference: a single call returns both the output token ids and the audio.
text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Synchronize again so the end timestamps are taken after the GPU work is done.
if torch.cuda.is_available():
    torch.cuda.synchronize()
text_gen_end = time.perf_counter()
total_end = time.perf_counter()
# NOTE: both intervals bracket the same generate() call, so the "generation"
# interval also contains whatever time generate() spends producing `audio`.
# ─── Inference timing: end ───────────────────────────────────────
# Token counts: only the generated tokens are counted, i.e. the returned
# sequence length minus the prompt length.
input_token_count = inputs["input_ids"].shape[-1]
output_token_count = text_ids.shape[-1] - input_token_count

# Elapsed intervals taken from the timestamps recorded around generate().
gen_elapsed = text_gen_end - text_gen_start
total_elapsed = total_end - total_start
if gen_elapsed > 0:
    tokens_per_sec = output_token_count / gen_elapsed
else:
    tokens_per_sec = 0

# Audio info: total sample count and its duration at the output sample rate.
sample_rate_hz = 24000
audio_samples = audio.numel()
audio_duration_sec = audio_samples / sample_rate_hz

# Print the summary one line at a time; empty entries are blank separator lines.
separator = "=" * 50
report_lines = [
    separator,
    "[토큰 생성]",
    f" 입력 토큰 수 : {input_token_count}",
    f" 생성된 토큰 수 : {output_token_count}",
    f" 생성 소요 시간 : {gen_elapsed:.3f}초",
    f" 토큰 생성 속도 : {tokens_per_sec:.2f} tokens/sec",
    f" (1초당 약 {tokens_per_sec:.1f}개 토큰 생성)",
    "",
    "[오디오 출력]",
    f" 오디오 샘플 수 : {audio_samples}",
    f" 오디오 길이 : {audio_duration_sec:.2f}초",
    "",
    "[전체]",
    f" 총 추론 시간 : {total_elapsed:.3f}초",
    separator,
]
for report_line in report_lines:
    print(report_line)
# Decode the returned token ids to text (special tokens dropped) and show it.
text = processor.batch_decode(
    text_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(text)

# Flatten the audio to 1-D, bring it to the CPU as a NumPy array, and write it
# out as a 24000 Hz WAV file.
waveform = audio.reshape(-1).detach().cpu().numpy()
sf.write("output.wav", waveform, samplerate=24000)

import soundfile as sf
import torch
import time
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
# Checkpoint shared by the model weights and the processor.
MODEL_ID = "Qwen/Qwen2.5-Omni-3B"

# Default setup: load the model in bfloat16 on the available device(s).
load_options = {
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(MODEL_ID, **load_options)
processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)
# Prompt for the model: a system persona turn followed by a single user turn
# that carries one image and one audio clip (no text from the user).
system_prompt = "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
user_content = [
    {"type": "image", "image": "dog3.jpg"},
    {"type": "audio", "audio": "dog_audio.mp3"},
]
conversation = [
    {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
    {"role": "user", "content": user_content},
]
# Single switch for "use the audio track of video inputs"; it is passed to
# every call below that accepts it so the stages cannot disagree.
USE_AUDIO_IN_VIDEO = False

# Preparation for inference: render the conversation into the prompt string
# (tokenize=False, so tokenization is left to the processor call below).
text = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=False,
)

# Collect the audio / image / video inputs referenced by the conversation.
audios, images, videos = process_mm_info(
    conversation,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

# Debug output: show what the multimodal loader returned.
print("audios:", type(audios))
print("images:", type(images))
print("videos:", type(videos))
# Guard against both "no videos" (None) and an empty sequence before indexing.
if videos is not None and len(videos) > 0:
    print("video count:", len(videos))
    print("video shape:", videos[0].shape)

# Build the model inputs from the prompt text and the media.
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
# Move the inputs to the model's device, then to the model's dtype.
inputs = inputs.to(model.device).to(model.dtype)
# ─── Inference timing: start ─────────────────────────────────────
print("=" * 50)
print("추론 시작...")

# Synchronize the GPU before recording the start time.
if torch.cuda.is_available():
    torch.cuda.synchronize()
total_start = time.perf_counter()
# Start of the generation interval.
text_gen_start = time.perf_counter()

# Inference: a single call returns both the output token ids and the audio.
text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Synchronize again so the end timestamps are taken after the GPU work is done.
if torch.cuda.is_available():
    torch.cuda.synchronize()
text_gen_end = time.perf_counter()
total_end = time.perf_counter()
# NOTE: both intervals bracket the same generate() call, so the "generation"
# interval also contains whatever time generate() spends producing `audio`.
# ─── Inference timing: end ───────────────────────────────────────
# Token counts: only the generated tokens are counted, i.e. the returned
# sequence length minus the prompt length.
input_token_count = inputs["input_ids"].shape[-1]
output_token_count = text_ids.shape[-1] - input_token_count

# Elapsed intervals taken from the timestamps recorded around generate().
gen_elapsed = text_gen_end - text_gen_start
total_elapsed = total_end - total_start
if gen_elapsed > 0:
    tokens_per_sec = output_token_count / gen_elapsed
else:
    tokens_per_sec = 0

# Audio info: total sample count and its duration at the output sample rate.
sample_rate_hz = 24000
audio_samples = audio.numel()
audio_duration_sec = audio_samples / sample_rate_hz

# Print the summary one line at a time; empty entries are blank separator lines.
separator = "=" * 50
report_lines = [
    separator,
    "[토큰 생성]",
    f" 입력 토큰 수 : {input_token_count}",
    f" 생성된 토큰 수 : {output_token_count}",
    f" 생성 소요 시간 : {gen_elapsed:.3f}초",
    f" 토큰 생성 속도 : {tokens_per_sec:.2f} tokens/sec",
    f" (1초당 약 {tokens_per_sec:.1f}개 토큰 생성)",
    "",
    "[오디오 출력]",
    f" 오디오 샘플 수 : {audio_samples}",
    f" 오디오 길이 : {audio_duration_sec:.2f}초",
    "",
    "[전체]",
    f" 총 추론 시간 : {total_elapsed:.3f}초",
    separator,
]
for report_line in report_lines:
    print(report_line)
# Decode the returned token ids to text (special tokens dropped) and show it.
text = processor.batch_decode(
    text_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(text)

# Flatten the audio to 1-D, bring it to the CPU as a NumPy array, and write it
# out as a 24000 Hz WAV file.
waveform = audio.reshape(-1).detach().cpu().numpy()
sf.write("output.wav", waveform, samplerate=24000)

import soundfile as sf
import os
import torch
import time
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
import subprocess

print("녹음 시작")
# Capture the microphone into voice.wav with ALSA's arecord:
#   -D plughw:2,0  capture device
#   -f S16_LE      signed 16-bit little-endian samples
#   -r 16000       16 kHz sample rate
#   -c 1           mono
#   -d 10          stop after 10 seconds
# The command is passed as an argument list (no shell), and check=True raises
# CalledProcessError when arecord fails, so the script does not carry on with
# a missing or stale voice.wav.
subprocess.run(
    [
        "arecord",
        "-D", "plughw:2,0",
        "-f", "S16_LE",
        "-r", "16000",
        "-c", "1",
        "-d", "10",
        "voice.wav",
    ],
    check=True,
)
print("녹음 종료")
# Checkpoint shared by the model weights and the processor.
MODEL_ID = "Qwen/Qwen2.5-Omni-3B"

# Default setup: load the model in float16 on the available device(s).
load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(MODEL_ID, **load_options)
processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)
# Prompt for the model: a system persona turn followed by a single user turn
# that carries one image and the recorded voice clip (no text from the user).
system_prompt = "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
user_content = [
    {"type": "image", "image": "dog.jpg"},
    {"type": "audio", "audio": "voice.wav"},
]
conversation = [
    {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
    {"role": "user", "content": user_content},
]
# Single switch for "use the audio track of video inputs"; it is passed to
# every call below that accepts it so the stages cannot disagree.
USE_AUDIO_IN_VIDEO = False

# Preparation for inference: render the conversation into the prompt string
# (tokenize=False, so tokenization is left to the processor call below).
text = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=False,
)

# Collect the audio / image / video inputs referenced by the conversation.
audios, images, videos = process_mm_info(
    conversation,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

# Debug output: show what the multimodal loader returned.
print("audios:", type(audios))
print("images:", type(images))
print("videos:", type(videos))
# Guard against both "no videos" (None) and an empty sequence before indexing.
if videos is not None and len(videos) > 0:
    print("video count:", len(videos))
    print("video shape:", videos[0].shape)

# Build the model inputs from the prompt text and the media.
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
# Move the inputs to the model's device, then to the model's dtype.
inputs = inputs.to(model.device).to(model.dtype)
# ─── Inference timing: start ─────────────────────────────────────
print("=" * 50)
print("추론 시작...")

# Synchronize the GPU before recording the start time.
if torch.cuda.is_available():
    torch.cuda.synchronize()
total_start = time.perf_counter()
# Start of the generation interval.
text_gen_start = time.perf_counter()

# Inference: a single call returns both the output token ids and the audio.
text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Synchronize again so the end timestamps are taken after the GPU work is done.
if torch.cuda.is_available():
    torch.cuda.synchronize()
text_gen_end = time.perf_counter()
total_end = time.perf_counter()
# NOTE: both intervals bracket the same generate() call, so the "generation"
# interval also contains whatever time generate() spends producing `audio`.
# ─── Inference timing: end ───────────────────────────────────────
# Token counts: only the generated tokens are counted, i.e. the returned
# sequence length minus the prompt length.
input_token_count = inputs["input_ids"].shape[-1]
output_token_count = text_ids.shape[-1] - input_token_count

# Elapsed intervals taken from the timestamps recorded around generate().
gen_elapsed = text_gen_end - text_gen_start
total_elapsed = total_end - total_start
if gen_elapsed > 0:
    tokens_per_sec = output_token_count / gen_elapsed
else:
    tokens_per_sec = 0

# Audio info: total sample count and its duration at the output sample rate.
sample_rate_hz = 24000
audio_samples = audio.numel()
audio_duration_sec = audio_samples / sample_rate_hz

# Print the summary one line at a time; empty entries are blank separator lines.
separator = "=" * 50
report_lines = [
    separator,
    "[토큰 생성]",
    f" 입력 토큰 수 : {input_token_count}",
    f" 생성된 토큰 수 : {output_token_count}",
    f" 생성 소요 시간 : {gen_elapsed:.3f}초",
    f" 토큰 생성 속도 : {tokens_per_sec:.2f} tokens/sec",
    f" (1초당 약 {tokens_per_sec:.1f}개 토큰 생성)",
    "",
    "[오디오 출력]",
    f" 오디오 샘플 수 : {audio_samples}",
    f" 오디오 길이 : {audio_duration_sec:.2f}초",
    "",
    "[전체]",
    f" 총 추론 시간 : {total_elapsed:.3f}초",
    separator,
]
for report_line in report_lines:
    print(report_line)
# Decode the returned token ids to text (special tokens dropped) and show it.
text = processor.batch_decode(
    text_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(text)

# Flatten the audio to 1-D, bring it to the CPU as a NumPy array, and write it
# out as a 24000 Hz WAV file.
waveform = audio.reshape(-1).detach().cpu().numpy()
sf.write("output.wav", waveform, samplerate=24000)