<추론사진>

image.png

<추론 오디오>

dog_audio.mp3

Windows (RTX 4070 Super)

import soundfile as sf
import torch
import time
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

# Checkpoint id shared by the model and its processor.
MODEL_ID = "Qwen/Qwen2.5-Omni-3B"

# Load the model in fp16 on the first CUDA device.
# Disabled alternatives kept for reference:
#   * torch_dtype=torch.bfloat16 with device_map="auto" (the default recipe)
#   * torch_dtype="auto", device_map="auto" and
#     attn_implementation="flash_attention_2", which the original note
#     recommends for better acceleration and memory saving.
load_options = {
    "torch_dtype": torch.float16,
    "device_map": "cuda:0",
}
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(MODEL_ID, **load_options)

processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)

# System prompt sent as the first turn of the conversation.
SYSTEM_PROMPT = "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."

system_turn = {
    "role": "system",
    "content": [{"type": "text", "text": SYSTEM_PROMPT}],
}

# The user turn carries one image and one audio clip, and no text.
user_turn = {
    "role": "user",
    "content": [
        {"type": "image", "image": "dog.png"},
        {"type": "audio", "audio": "dog_audio.mp3"},
    ],
}

conversation = [system_turn, user_turn]

# Whether audio embedded in video inputs is used. The same flag is passed to
# process_mm_info, to the processor and to model.generate.
USE_AUDIO_IN_VIDEO = False

# Render the conversation into a prompt string (left untokenized here; the
# processor call below receives it as `text`).
text = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=False,
)

# Collect the audio / image / video payloads referenced by the conversation.
# Uses the shared flag instead of a hard-coded False so this call cannot
# drift out of sync with the processor call below.
audios, images, videos = process_mm_info(
    conversation,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

# Debug output: show what kind of object each modality came back as.
print("audios:", type(audios))
print("images:", type(images))
print("videos:", type(videos))

if videos is not None:
    print("video count:", len(videos))
    print("video shape:", videos[0].shape)

# Build the model inputs from the prompt and the multimodal payloads.
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

# Move the inputs to the model's device, then cast them to the model's dtype.
inputs = inputs.to(model.device).to(model.dtype)

# Sample rate used for both the duration estimate and the written WAV file.
OUTPUT_SAMPLE_RATE = 24000

# --- Inference timing: start ---------------------------------------------
print("=" * 50)
print("추론 시작...")

# Synchronize the GPU before reading the clock so the start timestamp is
# taken after previously queued CUDA work has finished.
if torch.cuda.is_available():
    torch.cuda.synchronize()
gen_start = time.perf_counter()

# Inference: a single call returns both the text token ids and the audio.
text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Synchronize again so queued GPU work is finished before the clock stops.
if torch.cuda.is_available():
    torch.cuda.synchronize()
gen_end = time.perf_counter()
# --- Inference timing: end -----------------------------------------------

# The timed region is exactly the generate() call above, so the "generation"
# time and the "total inference" time are the same interval.
gen_elapsed = gen_end - gen_start
total_elapsed = gen_elapsed

# Count only newly generated tokens.
# NOTE(review): assumes text_ids still contains the prompt tokens, which is
# what the subtraction relies on - confirm against the generate() docs.
input_token_count = inputs["input_ids"].shape[-1]
output_token_count = text_ids.shape[-1] - input_token_count

# NOTE: gen_elapsed covers the whole generate() call, which also returns the
# audio, so this rate includes whatever time was spent producing the audio
# and is a lower bound on the text-only token rate.
tokens_per_sec = output_token_count / gen_elapsed if gen_elapsed > 0 else 0

# Audio statistics, computed on the flattened waveform.
flat_audio = audio.reshape(-1)
audio_samples = flat_audio.shape[0]
audio_duration_sec = audio_samples / OUTPUT_SAMPLE_RATE

print("=" * 50)
print("[토큰 생성]")
print(f"  입력 토큰 수     : {input_token_count}")
print(f"  생성된 토큰 수   : {output_token_count}")
print(f"  생성 소요 시간   : {gen_elapsed:.3f}초")
print(f"  토큰 생성 속도   : {tokens_per_sec:.2f} tokens/sec")
print(f"  (1초당 약 {tokens_per_sec:.1f}개 토큰 생성)")
print()
print("[오디오 출력]")
print(f"  오디오 샘플 수   : {audio_samples}")
print(f"  오디오 길이      : {audio_duration_sec:.2f}초")
print()
print("[전체]")
print(f"  총 추론 시간     : {total_elapsed:.3f}초")
print("=" * 50)

# Decode the generated ids to text and write the waveform to disk.
decoded_text = processor.batch_decode(
    text_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(decoded_text)
sf.write(
    "output.wav",
    flat_audio.detach().cpu().numpy(),
    samplerate=OUTPUT_SAMPLE_RATE,
)

image.png

output.wav

Jetson AGX Orin 64GB

import soundfile as sf
import torch
import time
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

# Checkpoint id shared by the model and its processor.
MODEL_ID = "Qwen/Qwen2.5-Omni-3B"

# Default recipe: bf16 weights, placed on the available device(s).
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)

# System prompt sent as the first turn of the conversation.
SYSTEM_PROMPT = "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."

system_turn = {
    "role": "system",
    "content": [{"type": "text", "text": SYSTEM_PROMPT}],
}

# The user turn carries one image and one audio clip, and no text.
user_turn = {
    "role": "user",
    "content": [
        {"type": "image", "image": "dog3.jpg"},
        {"type": "audio", "audio": "dog_audio.mp3"},
    ],
}

conversation = [system_turn, user_turn]

# Whether audio embedded in video inputs is used. The same flag is passed to
# process_mm_info, to the processor and to model.generate.
USE_AUDIO_IN_VIDEO = False

# Render the conversation into a prompt string (left untokenized here; the
# processor call below receives it as `text`).
text = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=False,
)

# Collect the audio / image / video payloads referenced by the conversation.
# Uses the shared flag instead of a hard-coded False so this call cannot
# drift out of sync with the processor call below.
audios, images, videos = process_mm_info(
    conversation,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

# Debug output: show what kind of object each modality came back as.
print("audios:", type(audios))
print("images:", type(images))
print("videos:", type(videos))

if videos is not None:
    print("video count:", len(videos))
    print("video shape:", videos[0].shape)

# Build the model inputs from the prompt and the multimodal payloads.
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

# Move the inputs to the model's device, then cast them to the model's dtype.
inputs = inputs.to(model.device).to(model.dtype)

# Sample rate used for both the duration estimate and the written WAV file.
OUTPUT_SAMPLE_RATE = 24000

# --- Inference timing: start ---------------------------------------------
print("=" * 50)
print("추론 시작...")

# Synchronize the GPU before reading the clock so the start timestamp is
# taken after previously queued CUDA work has finished.
if torch.cuda.is_available():
    torch.cuda.synchronize()
gen_start = time.perf_counter()

# Inference: a single call returns both the text token ids and the audio.
text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Synchronize again so queued GPU work is finished before the clock stops.
if torch.cuda.is_available():
    torch.cuda.synchronize()
gen_end = time.perf_counter()
# --- Inference timing: end -----------------------------------------------

# The timed region is exactly the generate() call above, so the "generation"
# time and the "total inference" time are the same interval.
gen_elapsed = gen_end - gen_start
total_elapsed = gen_elapsed

# Count only newly generated tokens.
# NOTE(review): assumes text_ids still contains the prompt tokens, which is
# what the subtraction relies on - confirm against the generate() docs.
input_token_count = inputs["input_ids"].shape[-1]
output_token_count = text_ids.shape[-1] - input_token_count

# NOTE: gen_elapsed covers the whole generate() call, which also returns the
# audio, so this rate includes whatever time was spent producing the audio
# and is a lower bound on the text-only token rate.
tokens_per_sec = output_token_count / gen_elapsed if gen_elapsed > 0 else 0

# Audio statistics, computed on the flattened waveform.
flat_audio = audio.reshape(-1)
audio_samples = flat_audio.shape[0]
audio_duration_sec = audio_samples / OUTPUT_SAMPLE_RATE

print("=" * 50)
print("[토큰 생성]")
print(f"  입력 토큰 수     : {input_token_count}")
print(f"  생성된 토큰 수   : {output_token_count}")
print(f"  생성 소요 시간   : {gen_elapsed:.3f}초")
print(f"  토큰 생성 속도   : {tokens_per_sec:.2f} tokens/sec")
print(f"  (1초당 약 {tokens_per_sec:.1f}개 토큰 생성)")
print()
print("[오디오 출력]")
print(f"  오디오 샘플 수   : {audio_samples}")
print(f"  오디오 길이      : {audio_duration_sec:.2f}초")
print()
print("[전체]")
print(f"  총 추론 시간     : {total_elapsed:.3f}초")
print("=" * 50)

# Decode the generated ids to text and write the waveform to disk.
decoded_text = processor.batch_decode(
    text_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(decoded_text)
sf.write(
    "output.wav",
    flat_audio.detach().cpu().numpy(),
    samplerate=OUTPUT_SAMPLE_RATE,
)

추론 결과

image.png

output.wav

오디오 추론

import soundfile as sf
import os
import torch
import time
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

import subprocess

# File the microphone recording is written to and later fed to the model.
VOICE_PATH = "voice.wav"

# Checkpoint id shared by the model and its processor.
MODEL_ID = "Qwen/Qwen2.5-Omni-3B"

# Record from ALSA device plughw:2,0 with arecord: S16_LE format, 16000 Hz,
# 1 channel, 10 seconds. check=True aborts the script when arecord exits
# with a non-zero status, so a failed recording cannot silently fall through
# to inference on a missing or stale voice.wav.
print("녹음 시작")
subprocess.run(
    [
        "arecord",
        "-D", "plughw:2,0",
        "-f", "S16_LE",
        "-r", "16000",
        "-c", "1",
        "-d", "10",
        VOICE_PATH,
    ],
    check=True,
)
print("녹음 종료")

# Load the model in fp16 on the available device(s).
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)

processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)

# The system turn sets the assistant persona; the user turn carries one
# image and the clip that was just recorded.
conversation = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "dog.jpg"},
            {"type": "audio", "audio": VOICE_PATH},
        ],
    },
]

# Whether audio embedded in video inputs is used. The same flag is passed to
# process_mm_info, to the processor and to model.generate.
USE_AUDIO_IN_VIDEO = False

# Render the conversation into a prompt string (left untokenized here; the
# processor call below receives it as `text`).
text = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=False,
)

# Collect the audio / image / video payloads referenced by the conversation.
# Uses the shared flag instead of a hard-coded False so this call cannot
# drift out of sync with the processor call below.
audios, images, videos = process_mm_info(
    conversation,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

# Debug output: show what kind of object each modality came back as.
print("audios:", type(audios))
print("images:", type(images))
print("videos:", type(videos))

if videos is not None:
    print("video count:", len(videos))
    print("video shape:", videos[0].shape)

# Build the model inputs from the prompt and the multimodal payloads.
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)

# Move the inputs to the model's device, then cast them to the model's dtype.
inputs = inputs.to(model.device).to(model.dtype)

# Sample rate used for both the duration estimate and the written WAV file.
OUTPUT_SAMPLE_RATE = 24000

# --- Inference timing: start ---------------------------------------------
print("=" * 50)
print("추론 시작...")

# Synchronize the GPU before reading the clock so the start timestamp is
# taken after previously queued CUDA work has finished.
if torch.cuda.is_available():
    torch.cuda.synchronize()
gen_start = time.perf_counter()

# Inference: a single call returns both the text token ids and the audio.
text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Synchronize again so queued GPU work is finished before the clock stops.
if torch.cuda.is_available():
    torch.cuda.synchronize()
gen_end = time.perf_counter()
# --- Inference timing: end -----------------------------------------------

# The timed region is exactly the generate() call above, so the "generation"
# time and the "total inference" time are the same interval.
gen_elapsed = gen_end - gen_start
total_elapsed = gen_elapsed

# Count only newly generated tokens.
# NOTE(review): assumes text_ids still contains the prompt tokens, which is
# what the subtraction relies on - confirm against the generate() docs.
input_token_count = inputs["input_ids"].shape[-1]
output_token_count = text_ids.shape[-1] - input_token_count

# NOTE: gen_elapsed covers the whole generate() call, which also returns the
# audio, so this rate includes whatever time was spent producing the audio
# and is a lower bound on the text-only token rate.
tokens_per_sec = output_token_count / gen_elapsed if gen_elapsed > 0 else 0

# Audio statistics, computed on the flattened waveform.
flat_audio = audio.reshape(-1)
audio_samples = flat_audio.shape[0]
audio_duration_sec = audio_samples / OUTPUT_SAMPLE_RATE

print("=" * 50)
print("[토큰 생성]")
print(f"  입력 토큰 수     : {input_token_count}")
print(f"  생성된 토큰 수   : {output_token_count}")
print(f"  생성 소요 시간   : {gen_elapsed:.3f}초")
print(f"  토큰 생성 속도   : {tokens_per_sec:.2f} tokens/sec")
print(f"  (1초당 약 {tokens_per_sec:.1f}개 토큰 생성)")
print()
print("[오디오 출력]")
print(f"  오디오 샘플 수   : {audio_samples}")
print(f"  오디오 길이      : {audio_duration_sec:.2f}초")
print()
print("[전체]")
print(f"  총 추론 시간     : {total_elapsed:.3f}초")
print("=" * 50)

# Decode the generated ids to text and write the waveform to disk.
decoded_text = processor.batch_decode(
    text_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(decoded_text)
sf.write(
    "output.wav",
    flat_audio.detach().cpu().numpy(),
    samplerate=OUTPUT_SAMPLE_RATE,
)