aiqtech committed
Commit 36163a8
1 Parent(s): e228d7c

Update app.py

Files changed (1)
  1. app.py +57 -52
app.py CHANGED
@@ -5,34 +5,16 @@ import torch
 import gradio as gr
 from threading import Thread
 from PIL import Image
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
 
-# Install required packages
-import subprocess
-subprocess.run('pip install --upgrade transformers', shell=True)
-subprocess.run('pip install accelerate', shell=True)
-
-from transformers import AutoConfig, PreTrainedModel, AutoTokenizer
-
-# Model and tokenizer initialization
-model_name = "Qwen/QVQ-72B-Preview"
-
-config = AutoConfig.from_pretrained(
-    model_name,
-    trust_remote_code=True
-)
-
-tokenizer = AutoTokenizer.from_pretrained(
-    model_name,
-    trust_remote_code=True
-)
-
-model = PreTrainedModel.from_pretrained(
-    model_name,
-    config=config,
-    trust_remote_code=True,
-    device_map="auto",
-    torch_dtype=torch.float16
+# Model and processor initialization
+model = Qwen2VLForConditionalGeneration.from_pretrained(
+    "Qwen/QVQ-72B-Preview",
+    torch_dtype="auto",
+    device_map="auto"
 )
+processor = AutoProcessor.from_pretrained("Qwen/QVQ-72B-Preview")
 
 # Footer
 footer = """
@@ -45,35 +27,58 @@ footer = """
 @spaces.GPU()
 def process_image(image, text_input=None):
     try:
-        # Convert image to PIL format
-        image = Image.fromarray(image).convert("RGB")
-
-        # Prepare inputs
-        if text_input:
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"image": image},
-                        {"text": text_input}
-                    ]
-                }
-            ]
-        else:
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"image": image},
-                        {"text": "Please describe this image in detail."}
-                    ]
-                }
-            ]
+        # Convert image to PIL format if needed
+        if not isinstance(image, Image.Image):
+            image = Image.fromarray(image).convert("RGB")
+
+        # Prepare messages
+        if not text_input:
+            text_input = "Please describe this image in detail."
+
+        messages = [
+            {
+                "role": "system",
+                "content": [
+                    {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": text_input}
+                ],
+            }
+        ]
+
+        # Process inputs
+        text = processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cuda")
 
         # Generate response
-        response = model.chat(tokenizer, messages=messages)
+        generated_ids = model.generate(**inputs, max_new_tokens=8192)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )[0]
 
-        return response
+        return output_text
     except Exception as e:
         return f"Error processing image: {str(e)}"
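The commit drops the runtime pip installs, so dependencies have to come from the Space's requirements.txt, which is not part of this diff. A plausible sketch of that file, assuming only what the new imports require (a transformers release with Qwen2-VL support, accelerate for device_map="auto", and the qwen-vl-utils package that provides process_vision_info):

    transformers
    accelerate
    qwen-vl-utils
    torch
    gradio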
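Neither hunk shows how process_image is exposed in the UI. A minimal sketch of that wiring, assuming the rest of app.py follows the usual Gradio pattern for Spaces; the interface arguments here are illustrative, not taken from the commit:

    # Hypothetical wiring; the real interface code sits below the shown hunks.
    demo = gr.Interface(
        fn=process_image,                             # handler updated above
        inputs=[
            gr.Image(type="pil", label="Image"),      # PIL input matches the new isinstance check
            gr.Textbox(label="Question (optional)"),  # empty input falls back to the default prompt
        ],
        outputs=gr.Textbox(label="Response"),
        article=footer,                               # footer string defined earlier in app.py
    )
    demo.launch()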