Spaces:

TheEighthDay
/

SeekWorld_APP

Sleeping

App Files Files Community

TheEighthDay commited on Apr 16

Commit

d5cd46b

verified ·

1 Parent(s): 83ba4e4

Upload simple_inference.py

Browse files

Files changed (1) hide show

simple_inference.py +233 -0

simple_inference.py ADDED Viewed

	@@ -0,0 +1,233 @@

+import argparse
+import json
+import os
+import torch
+import base64
+from io import BytesIO
+from PIL import Image
+# 条件导入，根据选择的推理引擎
+try:
+    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+    transformers_available = True
+except ImportError:
+    transformers_available = False
+    print("警告: transformers相关库未安装，无法使用transformers引擎")
+try:
+    from vllm import LLM, SamplingParams
+    vllm_available = True
+except ImportError:
+    vllm_available = False
+    print("警告: vllm相关库未安装，无法使用vllm引擎")
+# 合并 qwen_vl_utils 的代码
+def process_vision_info(messages):
+    """处理多模态消息中的图像和视频信息
+    Args:
+        messages: 包含图像或视频的消息列表
+    Returns:
+        images_data: 处理后的图像数据
+        videos_data: 处理后的视频数据
+    """
+    images_list, videos_list = [], []
+    for message in messages:
+        content = message.get("content", None)
+        if isinstance(content, str):
+            # 纯文本消息，不处理
+            continue
+        elif isinstance(content, list):
+            # 混合消息，可能包含图像或视频
+            for item in content:
+                if not isinstance(item, dict):
+                    continue
+                # 处理图像
+                if item.get("type") == "image" and "image" in item:
+                    image = item["image"]
+                    if isinstance(image, str):
+                        # 图像URL或路径，尝试加载
+                        try:
+                            image = Image.open(image)
+                        except Exception as e:
+                            print(f"图像加载失败: {e}")
+                            continue
+                    # 转换PIL图像为base64编码
+                    if isinstance(image, Image.Image):
+                        buffered = BytesIO()
+                        image.save(buffered, format="PNG")
+                        image_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                        images_list.append(image_str)
+                # 处理视频（如有需要）
+                elif item.get("type") == "video" and "video" in item:
+                    # 暂不支持视频
+                    pass
+    return images_list or None, videos_list or None
+def predict_location(
+    image_path,
+    model_name="Qwen/Qwen2.5-VL-7B-Instruct",
+    inference_engine="transformers"
+):
+    """
+    对单个图片进行位置识别预测
+    参数:
+        image_path: 图片文件路径
+        model_name: 模型名称或路径
+        inference_engine: 推理引擎，"vllm" 或 "transformers"
+    返回:
+        预测结果文本
+    """
+    # 检查图片是否存在
+    if not os.path.exists(image_path):
+        return f"错误: 图片文件不存在: {image_path}"
+    # 加载图片
+    try:
+        image = Image.open(image_path)
+        print(f"成功加载图片: {image_path}")
+    except Exception as e:
+        return f"错误: 无法加载图片: {str(e)}"
+    # 加载处理器
+    print(f"加载处理器: {model_name}")
+    processor = AutoProcessor.from_pretrained(model_name, padding_side='left')
+    # 构建提示消息 - 简化版本，没有SFT和COT
+    question_text = "In which country and within which first-level administrative region of that country was this picture taken?Please answer in the format of <answer>$country,administrative_area_level_1$</answer>?"
+    system_message = "You are a helpful assistant good at solving problems with step-by-step reasoning. You should first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags."
+    # 构建简化后的提示消息
+    prompt_messages = [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": system_message}
+            ]
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": question_text}
+            ]
+        }
+    ]
+    # 根据选定的引擎进行推理
+    if inference_engine == "vllm":
+        if not vllm_available:
+            return "错误: vLLM库不可用，请安装vllm或选择transformers引擎"
+        # 使用vLLM进行推理
+        print(f"使用vLLM加载模型: {model_name}")
+        llm = LLM(
+            model=model_name,
+            limit_mm_per_prompt={"image": 10, "video": 10},
+            dtype="auto",
+            gpu_memory_utilization=0.95,
+        )
+        # 设置采样参数
+        sampling_params = SamplingParams(
+            temperature=0.7,
+            top_p=0.8,
+            repetition_penalty=1.05,
+            max_tokens=2048,
+            stop_token_ids=[],
+        )
+        # 处理消息为vLLM格式
+        prompt = processor.apply_chat_template(
+            prompt_messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        # 处理图像数据
+        image_inputs, video_inputs = process_vision_info(prompt_messages)
+        mm_data = {}
+        if image_inputs is not None:
+            mm_data["image"] = image_inputs
+        # 构建vLLM输入
+        llm_input = {
+            "prompt": prompt,
+            "multi_modal_data": mm_data,
+        }
+        # 生成回答
+        outputs = llm.generate([llm_input], sampling_params=sampling_params)
+        response = outputs[0].outputs[0].text
+    else:  # transformers
+        if not transformers_available:
+            return "错误: Transformers相关库不可用，请安装必要的包"
+        # 使用transformers加载模型
+        print(f"使用transformers加载模型: {model_name}")
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_name, torch_dtype="auto", device_map="auto"
+        )
+        # 准备输入
+        text = processor.apply_chat_template(
+            prompt_messages, tokenize=False, add_generation_prompt=True
+        )
+        # 处理输入
+        inputs = processor(
+            text=text,
+            images=prompt_messages[1]['content'][0]['image'],
+            return_tensors="pt",
+        )
+        inputs = inputs.to(model.device)
+        # 生成回答
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=2048)
+        # 处理输出
+        generated_ids_trimmed = generated_ids[0][len(inputs['input_ids'][0]):]
+        response = processor.decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        # 清理GPU缓存
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    print("\n=== 推理结果 ===")
+    print(response)
+    print("=================\n")
+    return response
+if __name__ == "__main__":
+    # 命令行参数设置
+    parser = argparse.ArgumentParser(description='对单个图片进行位置识别预测')
+    parser.add_argument('--image_path', type=str, required=True,
+                        help='图片文件路径')
+    parser.add_argument('--model_name', type=str, default="Qwen/Qwen2.5-VL-7B-Instruct",
+                        help='模型名称或路径')
+    parser.add_argument('--inference_engine', type=str, default="transformers", choices=["vllm", "transformers"],
+                        help='推理引擎: vllm 或 transformers')
+    args = parser.parse_args()
+    # 单个图片推理
+    result = predict_location(
+        image_path=args.image_path,
+        model_name=args.model_name,
+        inference_engine=args.inference_engine
+    )
+    print(f"最终预测结果: {result}")