deepcam/python/test_vl_img_demo.py

import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from PIL import Image
import os, sys

# 加载模型和处理器（替换为你自己的模型路径）
model_path = "./qwen2-vl-2b"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    device_map="auto",  # 自动分配设备（CPU/GPU）
    torch_dtype=torch.float16  # 使用半精度加快推理
)
processor = AutoProcessor.from_pretrained(model_path)

# 加载本地图片（替换成你自己的图片路径）
image_path = "./example.jpg"
if not os.path.exists(image_path):
    raise FileNotFoundError(f"Image file {image_path} not found!")

image = Image.open(image_path).convert("RGB")

# 示例任务1：图像描述生成
prompt = "Describe the content of this image in detail."

# 构建输入
inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

# 推理
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=100)

# 解码输出
description = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("【图像描述】:", description)

# 示例任务2：视觉问答（VQA）
question = "What is the main object in the image?"

# 构建输入
inputs = processor(text=question, images=image, return_tensors="pt").to(model.device)

# 推理
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=50)

# 解码输出
answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("【问题回答】:", answer)