# showui-2b-awq / infer.py
import argparse
import ast
import time

import torch
from PIL import Image, ImageDraw
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
# Parse command-line arguments
parser = argparse.ArgumentParser(description="ShowUI (Qwen2-VL) inference")
parser.add_argument("--image_path", type=str, required=False, default="./test_screenshots/amazon.png", help="Path to the input image")
parser.add_argument("--query", type=str, required=False, default="Click on the 'Chairs'.", help="Text query or instruction")
parser.add_argument("--model_dir", type=str, default="./", help="Path to the local ShowUI model directory")
args = parser.parse_args()
DEVICE = "cuda:0"
# Constants for the processor (adjust as per your model requirements)
MIN_PIXELS = 256 * 28 * 28
# MAX_PIXELS = 1024 * 28 * 28
# MAX_PIXELS = 1280 * 28 * 28
MAX_PIXELS = 1344 * 28 * 28
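# For Qwen2-VL-style processors, min_pixels/max_pixels bound the area the image
# is resized to; each 28x28 block of pixels corresponds to roughly one visual
# token, so these limits effectively cap the number of image tokens.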
def draw_point_on_image(image_path, position, output_path="output_image.png", radius=2, color="red"):
    """Draw a small dot at a normalized [x, y] position on the image and save it."""
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)
    width, height = image.size
    x = int(position[0] * width)
    y = int(position[1] * height)
    draw.ellipse([(x - radius, y - radius), (x + radius, y + radius)], fill=color, outline=color)
    image.save(output_path)
    # print(f"Point drawn at ({x}, {y}) and saved to {output_path}")
# Load model and processor
model = Qwen2VLForConditionalGeneration.from_pretrained(
    args.model_dir,
    torch_dtype=torch.float16,
    # torch_dtype=torch.bfloat16,
    device_map="cpu",
)
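# Note: the directory name suggests an AWQ-quantized ShowUI-2B checkpoint; with
# the usual transformers AWQ integration that typically requires the autoawq
# package and float16 activations. The model stays on CPU here and is moved to
# the GPU only after the inputs are prepared below.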
print("Model dtype:", model.dtype)
# print("Model weights dtype:", model.model.layers[0].self_attn.q_proj.weight)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)
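# The processor (tokenizer + image preprocessor) is pulled from the base
# Qwen/Qwen2-VL-2B-Instruct repo, presumably because the local quantized
# checkpoint reuses it; the min/max pixel bounds here should match the ones
# attached to the image entry in the chat message below.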
# Clear any cached GPU memory before preparing inputs
torch.cuda.empty_cache()
_NAV_SYSTEM = """You are an assistant trained to navigate the {_APP} screen.
Given a task instruction, a screen observation, and an action history sequence,
output the next action and wait for the next observation.
Here is the action space:
{_ACTION_SPACE}
"""
_ACTION_MAP = """
1. CLICK: Click on an element, value is not applicable and the position [x,y] is required.
2. INPUT: Type a string into an element, value is a string to type and the position [x,y] is required.
3. HOVER: Hover on an element, value is not applicable and the position [x,y] is required.
4. ENTER: Enter operation, value and position are not applicable.
5. SCROLL: Scroll the screen, value is the direction to scroll and the position is not applicable.
6. ESC: ESCAPE operation, value and position are not applicable.
7. PRESS: Long click on an element, value is not applicable and the position [x,y] is required.
"""
_SYSTEM = _NAV_SYSTEM.format(
    _APP="web",
    _ACTION_SPACE=_ACTION_MAP,
)
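# With this prompt, the model is expected to reply with a dict-like string such
# as "{'action': 'CLICK', 'value': None, 'position': [0.28, 0.29]}", where the
# position is normalized to [0, 1]; the parsing and drawing code below assumes
# that format.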
if args.query:
    _QUERY = args.query
else:
    _QUERY = "Click on the 'Chairs'."
# Construct the input message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": _SYSTEM},
            {"type": "image", "image": args.image_path, "min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS},
            {"type": "text", "text": _QUERY},
        ],
    }
]
# Process the message through the processor
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# The image is loaded explicitly here and passed to the processor via the `images` argument
image = Image.open(args.image_path).convert("RGB")
inputs = processor(
    text=[text],
    images=[image],
    padding=True,
    return_tensors="pt",
)
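# `inputs` now holds input_ids and attention_mask plus the vision tensors
# (for Qwen2-VL, typically pixel_values and image_grid_thw).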
# Move inputs and model to GPU
model = model.to(DEVICE)
inputs = inputs.to(DEVICE)
# print("Model dtype after to(DEVICE):", model.model.layers[0].self_attn.q_proj.weight)
print(f"Max CUDA memory after model load: {torch.cuda.max_memory_allocated(device=DEVICE)/1024**2:.2f} MB")
# Reset memory stats before inference
torch.cuda.reset_peak_memory_stats(device=DEVICE)
N_RUNS = 10
times = []
model.eval()
with torch.no_grad():
    for i in range(N_RUNS):
        start_time = time.time()
        generated_ids = model.generate(**inputs, max_new_tokens=128)
        # Keep only the newly generated tokens (drop the prompt portion)
        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        # Parse the output string into a Python object if it is a dict-like action
        try:
            result = ast.literal_eval(output_text)
        except (ValueError, SyntaxError):
            result = output_text
        end_time = time.time()
        times.append(end_time - start_time)
        print(f"Run {i+1}/{N_RUNS} - Time: {end_time - start_time:.4f} s, Output: {result}")
        # Expected output format: "{'action': 'CLICK', 'value': None, 'position': [0.28, 0.29]}"
        if isinstance(result, dict) and result.get('action', '').upper() == 'CLICK':
            x, y = result['position'][0], result['position'][1]
            draw_point_on_image(args.image_path, [x, y], output_path="./output_image.png")
avg_time = sum(times) / len(times)
print(f"Average per inference time: {avg_time:.4f} seconds")
# Print CUDA memory usage after inference
print(f"Max CUDA memory after inference: {torch.cuda.max_memory_allocated(device=DEVICE)/1024**2:.2f} MB")
# Print image size information
print(f"Input image size: {Image.open(args.image_path).size}")