PHI35VISION / app.py
aiqtech's picture
Update app.py
bf957c3 verified
# NOTE(review): `spaces` is kept as the very first import, as in the original;
# the ZeroGPU package is expected to be loaded before anything initialises
# CUDA -- confirm before reordering.
import spaces

import os
import time
from threading import Thread

import gradio as gr
import torch
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    Qwen2VLForConditionalGeneration,
)
# Model and processor initialization
#
# QVQ-72B-Preview is a Qwen2-VL (vision-language) checkpoint. It has to be
# loaded through the Qwen2-VL conditional-generation class: the generic
# AutoModelForCausalLM factory has no mapping for the Qwen2-VL config and
# refuses to instantiate it.
model_name = "Qwen/QVQ-72B-Preview"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    trust_remote_code=True,
    # Shard the weights over whatever accelerators are visible.
    device_map="auto",
    # "auto" keeps the dtype recorded in the checkpoint config instead of
    # forcing float16.
    # NOTE(review): the published weights are expected to be bfloat16 --
    # confirm against the model card.
    torch_dtype="auto",
)
# Text-only tokenizer, kept available at module level for any code that
# references it.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# The processor bundles the tokenizer with the image preprocessor and owns
# the chat template used to build multimodal prompts.
processor = AutoProcessor.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Footer
# Small centered HTML snippet crediting the model; shown on the page through
# a gr.HTML component.
footer = (
    "\n"
    '<div style="text-align: center; margin-top: 20px;">\n'
    "<p>Powered by QVQ-72B Model</p>\n"
    "</div>\n"
)
# Vision model function
@spaces.GPU()
def process_image(image, text_input=None, max_new_tokens=2048):
    """Answer a question about an image with the module-level model.

    Args:
        image: The picture to analyse: a ``PIL.Image.Image`` or an array
            accepted by ``Image.fromarray``. ``None`` (nothing uploaded) is
            reported as an error message.
        text_input: Optional question about the image. When missing or
            blank, the model is asked to describe the image in detail.
        max_new_tokens: Upper bound on the number of generated tokens.
            NOTE(review): the default is a guess; tune it to the GPU time
            budget of the deployment.

    Returns:
        The model's reply as a string. Failures are not raised: they are
        returned as a string starting with ``"Error processing image: "``
        so the UI always has something to display.
    """
    if image is None:
        return "Error processing image: no image was provided."

    try:
        # Normalise every input to an RGB PIL image, including PIL images
        # that arrive in another mode (RGBA, palette, grayscale, ...).
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        image = image.convert("RGB")

        question = text_input.strip() if text_input else ""
        if not question:
            question = "Please describe this image in detail."

        # The image entry is only a placeholder for the chat template; the
        # pixels themselves are handed to the processor call below.
        messages = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": "You are a helpful and harmless assistant.",
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": question},
                ],
            },
        ]

        prompt = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        inputs = processor(
            text=[prompt],
            images=[image],
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        with torch.inference_mode():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
            )

        # generate() returns prompt + completion; drop the prompt tokens so
        # only the newly generated answer is decoded.
        prompt_length = inputs.input_ids.shape[1]
        answer_ids = generated_ids[:, prompt_length:]
        answers = processor.batch_decode(
            answer_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return answers[0].strip()
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the
        # request handler.
        return f"Error processing image: {e}"
# CSS styling
# Custom stylesheet handed to gr.Blocks: hides every element matched by the
# `footer` selector (presumably Gradio's stock page footer -- verify).
css = (
    "\n"
    "footer {\n"
    "visibility: hidden;\n"
    "}\n"
)
# Gradio interface
# One component per row, top to bottom: image upload, optional question,
# submit button, response box. The custom footer is appended last.
with gr.Blocks(css=css, theme="Yntec/HaleyCH_Theme_Orange") as demo:
    with gr.Row():
        input_img = gr.Image(label="Input Image")

    with gr.Row():
        text_input = gr.Textbox(label="Question (Optional)")

    with gr.Row():
        submit_btn = gr.Button(value="Submit")

    with gr.Row():
        output_text = gr.Textbox(label="Response")

    # Clicking the button sends (image, question) to process_image and
    # writes the returned string into the response box.
    submit_btn.click(
        fn=process_image,
        inputs=[input_img, text_input],
        outputs=[output_text],
    )

    gr.HTML(footer)

# Launch the app
demo.launch(debug=True)