import argparse
import ast
import time

import torch
from PIL import Image, ImageDraw
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

parser = argparse.ArgumentParser(description="Qwen-VL Inference")
parser.add_argument("--image_path", type=str, default="./test_screenshots/amazon.png", help="Path to the input image")
parser.add_argument("--query", type=str, default="Click on the 'Chairs'.", help="Text query or instruction")
parser.add_argument("--model_dir", type=str, default="./", help="Path to the local ShowUI model directory")
args = parser.parse_args()

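# Example invocation (the script filename here is illustrative):
#   python showui_inference.py --image_path ./test_screenshots/amazon.png \
#       --query "Click on the 'Chairs'." --model_dir ./
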
DEVICE = "cuda:0"

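# Resolution bounds passed to the Qwen2-VL processor, expressed in multiples of
# 28x28 visual patches, following the Qwen2-VL preprocessing convention.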
MIN_PIXELS = 256 * 28 * 28
MAX_PIXELS = 1344 * 28 * 28

def draw_point_on_image(image_path, position, output_path="output_image.png", radius=2, color="red"):
    """Draw a small dot at a normalized [x, y] position on the image and save the result."""
    image = Image.open(image_path).convert("RGB")
    draw = ImageDraw.Draw(image)
    width, height = image.size
    x = int(position[0] * width)
    y = int(position[1] * height)
    draw.ellipse([(x - radius, y - radius), (x + radius, y + radius)], fill=color, outline=color)
    image.save(output_path)

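# Load the ShowUI checkpoint in float16 on the CPU; it is moved to the GPU once
# the inputs have been prepared.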
model = Qwen2VLForConditionalGeneration.from_pretrained(
    args.model_dir,
    torch_dtype=torch.float16,
    device_map="cpu",
)
print("Model dtype:", model.dtype) |
|
|
|
|
|
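# The processor is taken from the base Qwen2-VL-2B-Instruct repo on the Hub;
# ShowUI is built on that model, so the same preprocessing applies.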
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)

torch.cuda.empty_cache()

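# System prompt and action space for GUI navigation. The model is expected to reply
# with a Python-literal dict, e.g. {'action': 'CLICK', 'value': None, 'position': [0.49, 0.42]}
# (values illustrative).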
_NAV_SYSTEM = """You are an assistant trained to navigate the {_APP} screen.
Given a task instruction, a screen observation, and an action history sequence,
output the next action and wait for the next observation.
Here is the action space:
{_ACTION_SPACE}
"""

_ACTION_MAP = """
1. CLICK: Click on an element, value is not applicable and the position [x,y] is required.
2. INPUT: Type a string into an element, value is a string to type and the position [x,y] is required.
3. HOVER: Hover on an element, value is not applicable and the position [x,y] is required.
4. ENTER: Enter operation, value and position are not applicable.
5. SCROLL: Scroll the screen, value is the direction to scroll and the position is not applicable.
6. ESC: ESCAPE operation, value and position are not applicable.
7. PRESS: Long click on an element, value is not applicable and the position [x,y] is required.
"""

_SYSTEM = _NAV_SYSTEM.format(
    _APP="web",
    _ACTION_SPACE=_ACTION_MAP
)

if args.query:
    _QUERY = args.query
else:
    _QUERY = "Click on the 'Chairs'."

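# A single user turn interleaves the system prompt, the screenshot, and the query,
# matching the multimodal content format expected by the Qwen2-VL processor.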
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": _SYSTEM},
            {"type": "image", "image": args.image_path, "min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS},
            {"type": "text", "text": _QUERY}
        ],
    }
]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

image = Image.open(args.image_path).convert("RGB")
inputs = processor(
    text=[text],
    images=[image],
    padding=True,
    return_tensors="pt"
)

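# Move the model and the tokenized inputs to the GPU only after preprocessing is done.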
model = model.to(DEVICE)
inputs = inputs.to(DEVICE)

print(f"Max CUDA memory after model load: {torch.cuda.max_memory_allocated(device=DEVICE)/1024**2:.2f} MB")

# Reset the peak-memory counter so the next reading reflects inference only.
torch.cuda.reset_peak_memory_stats(device=DEVICE)

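# Benchmark: repeat the same generation N_RUNS times and record per-run latency.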
N_RUNS = 10
times = []

model.eval()
with torch.no_grad():
    for i in range(N_RUNS):
        start_time = time.time()

        generated_ids = model.generate(**inputs, max_new_tokens=128)

        # Drop the prompt tokens so only the newly generated tokens are decoded.
        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

        # The model should emit a Python-literal dict; fall back to the raw string if parsing fails.
        try:
            result = ast.literal_eval(output_text)
        except (ValueError, SyntaxError):
            result = output_text

        end_time = time.time()
        times.append(end_time - start_time)
        print(f"Run {i+1}/{N_RUNS} - Time: {end_time - start_time:.4f} s, Output: {result}")

# Annotate the screenshot with the click position predicted in the last run.
if isinstance(result, dict) and result.get('action', '').upper() == 'CLICK':
    x, y = result['position'][0], result['position'][1]
    draw_point_on_image(args.image_path, [x, y], output_path="./output_image.png")

avg_time = sum(times) / len(times)
print(f"Average time per inference: {avg_time:.4f} seconds")

print(f"Max CUDA memory after inference: {torch.cuda.max_memory_allocated(device=DEVICE)/1024**2:.2f} MB") |
|
|
|
|
|
print(f"Input image size: {Image.open(args.image_path).size}") |