prithivMLmods committed on
Commit
6f09ee6
1 Parent(s): 652cdc4

Upload 38 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ font/calibri.ttf filter=lfs diff=lfs merge=lfs -text
37
+ font/youyuan.TTF filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
4
+ from qwen_vl_utils import process_vision_info
5
+ import torch
6
+ from PIL import Image
7
+ import os
8
+ import uuid
9
+ import io
10
+ from threading import Thread
11
+ from reportlab.lib.pagesizes import A4
12
+ from reportlab.lib.styles import getSampleStyleSheet
13
+ from reportlab.lib import colors
14
+ from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
15
+ from reportlab.lib.units import inch
16
+ from reportlab.pdfbase import pdfmetrics
17
+ from reportlab.pdfbase.ttfonts import TTFont
18
+ import docx
19
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
20
+
21
+ # Define model options
22
# Display label shown in the UI -> Hugging Face Hub repository id.
MODEL_OPTIONS = {
    "Qwen2VL Base": "Qwen/Qwen2-VL-2B-Instruct",
    "Latex OCR": "prithivMLmods/Qwen2-VL-OCR-2B-Instruct",
    "Math Prase": "prithivMLmods/Qwen2-VL-Math-Prase-2B-Instruct",
    "Text Analogy Ocrtest": "prithivMLmods/Qwen2-VL-Ocrtest-2B-Instruct",
}

# Every model (fp16, moved to CUDA, eval mode) and its processor is loaded
# once at import time and stored under the same label as in MODEL_OPTIONS.
models = {}
processors = {}
for name, model_id in MODEL_OPTIONS.items():
    print(f"Loading {name}...")
    models[name] = (
        Qwen2VLForConditionalGeneration
        .from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16)
        .to("cuda")
        .eval()
    )
    processors[name] = AutoProcessor.from_pretrained(
        model_id,
        trust_remote_code=True,
    )

# Extension -> format mapping registered with Pillow; its keys are used to
# recognise image uploads by file name.
image_extensions = Image.registered_extensions()
42
+
43
def identify_and_save_blob(blob_path):
    """Check that a file holds a valid image and copy its bytes to a new file.

    Args:
        blob_path: Path of the uploaded file to inspect.

    Returns:
        A ``(filename, media_type)`` tuple. ``filename`` is the path of the
        copy written to the current working directory (always named with a
        ``.png`` suffix, whatever the actual image format is) and
        ``media_type`` is the string ``"image"``.

    Raises:
        ValueError: If the file is missing, is not an image Pillow can
            verify, or any other error occurs while reading or writing.
            The original exception is attached as ``__cause__``.
    """
    try:
        with open(blob_path, "rb") as file:
            blob_content = file.read()

        try:
            # verify() checks file integrity without decoding pixel data.
            with Image.open(io.BytesIO(blob_content)) as candidate:
                candidate.verify()
        except (OSError, SyntaxError) as err:
            raise ValueError("Unsupported media type. Please upload a valid image.") from err

        extension = ".png"  # Default to PNG for saving
        media_type = "image"

        filename = f"temp_{uuid.uuid4()}_media{extension}"
        with open(filename, "wb") as f:
            f.write(blob_content)

        return filename, media_type

    except FileNotFoundError as err:
        raise ValueError(f"The file {blob_path} was not found.") from err
    except ValueError:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as err:
        raise ValueError(f"An error occurred while processing the file: {err}") from err
65
+
66
@spaces.GPU
def qwen_inference(model_name, media_input, text_input=None):
    """Stream the selected model's answer about an uploaded image.

    Args:
        model_name: Key into the module-level ``models`` and ``processors``
            dictionaries.
        media_input: Filesystem path of the uploaded image.
        text_input: Optional question or instruction about the image; a
            missing value is sent to the model as an empty string.

    Yields:
        The text generated so far, re-emitted after every streamed chunk.

    Raises:
        KeyError: If ``model_name`` is not a loaded model.
        ValueError: If no file path is given or the file is not a valid image.
    """
    model = models[model_name]
    processor = processors[model_name]

    # Without this guard a missing upload would surface later as an
    # unbound-variable error instead of a readable message.
    if not isinstance(media_input, str) or not media_input:
        raise ValueError("Unsupported media type. Please upload a valid image.")

    # Compare case-insensitively so names such as "scan.PNG" are recognised
    # directly instead of being routed through the blob-sniffing fallback.
    if media_input.lower().endswith(tuple(image_extensions)):
        media_path, media_type = media_input, "image"
    else:
        try:
            media_path, media_type = identify_and_save_blob(media_input)
        except Exception as err:
            raise ValueError("Unsupported media type. Please upload a valid image.") from err

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": media_type,
                    media_type: media_path,
                },
                {"type": "text", "text": text_input if text_input is not None else ""},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, _ = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)

    # generate() runs in a worker thread and feeds the streamer, which is
    # consumed here so partial output can be yielded as it arrives.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        # Remove <|im_end|> or similar tokens from the output
        buffer = buffer.replace("<|im_end|>", "")
        yield buffer

    # The streamer is exhausted, so generation is finishing; reap the thread.
    thread.join()
120
+
121
def format_plain_text(output_text):
    r"""Return ``output_text`` with the LaTeX delimiters \( \) \[ \] removed.

    The four delimiters are stripped one after another, in that order; all
    other characters are left untouched.
    """
    stripped = output_text
    for delimiter in ("\\(", "\\)", "\\[", "\\]"):
        stripped = stripped.replace(delimiter, "")
    return stripped
126
+
127
def generate_document(media_path, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size):
    """Build a downloadable document from the input image and the model output.

    The output text is first passed through ``format_plain_text``; the
    result and the layout options are then handed to ``generate_pdf`` when
    ``file_format`` is ``"pdf"`` or to ``generate_docx`` when it is
    ``"docx"``.

    Returns:
        The path returned by the chosen generator, or ``None`` for any other
        ``file_format``.
    """
    plain_text = format_plain_text(output_text)
    layout_args = (media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size)

    if file_format == "pdf":
        return generate_pdf(*layout_args)
    if file_format == "docx":
        return generate_docx(*layout_args)
    return None
134
+
135
def generate_pdf(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size):
    """Write an A4 PDF containing the input image followed by the text.

    Args:
        media_path: Path of the image to embed at the top of the document.
        plain_text: Text placed below the image. It is treated as literal
            text: markup characters are escaped and line breaks are kept.
        font_choice: File name of a TrueType font inside the ``font/``
            directory; also used as the registered font name.
        font_size: Font size in points (number or numeric string).
        line_spacing: Multiplier applied to the font size to get the leading.
        alignment: One of ``"Left"``, ``"Center"``, ``"Right"``, ``"Justified"``.
        image_size: One of ``"Small"``, ``"Medium"``, ``"Large"``; selects the
            bounding box the image is scaled into, keeping its aspect ratio.

    Returns:
        The file name of the generated PDF (created in the working directory).

    Raises:
        KeyError: If ``alignment`` or ``image_size`` is not a known option.
    """
    from xml.sax.saxutils import escape

    filename = f"output_{uuid.uuid4()}.pdf"
    doc = SimpleDocTemplate(
        filename,
        pagesize=A4,
        rightMargin=inch,
        leftMargin=inch,
        topMargin=inch,
        bottomMargin=inch,
    )

    # Register font
    font_path = f"font/{font_choice}"
    pdfmetrics.registerFont(TTFont(font_choice, font_path))

    point_size = int(font_size)
    styles = getSampleStyleSheet()
    body_style = styles["Normal"]
    body_style.fontName = font_choice
    body_style.fontSize = point_size
    # float() guards against the spacing arriving as a numeric string, which
    # would otherwise turn the multiplication into string repetition.
    body_style.leading = point_size * float(line_spacing)
    body_style.alignment = {
        "Left": 0,
        "Center": 1,
        "Right": 2,
        "Justified": 4,
    }[alignment]

    story = []

    # Add image with size adjustment
    image_sizes = {
        "Small": (200, 200),
        "Medium": (400, 400),
        "Large": (600, 600),
    }
    box_width, box_height = image_sizes[image_size]
    # The requested box is clamped to the printable frame, because a flowable
    # larger than the frame cannot be laid out. 12pt allows for reportlab's
    # default frame padding of 6pt on each side.
    frame_padding = 12
    box_width = min(box_width, doc.width - frame_padding)
    box_height = min(box_height, doc.height - frame_padding)
    # "proportional" scales the picture to fit the box without distorting it.
    img = RLImage(media_path, width=box_width, height=box_height, kind="proportional")
    story.append(img)
    story.append(Spacer(1, 12))

    # Add plain text output. Paragraph parses its input as XML-like markup,
    # so literal "&", "<" and ">" must be escaped and newlines turned into
    # explicit breaks.
    markup = escape(plain_text).replace("\n", "<br/>")
    story.append(Paragraph(markup, body_style))

    doc.build(story)
    return filename
179
+
180
def generate_docx(media_path, plain_text, font_choice, font_size, line_spacing, alignment, image_size):
    """Write a DOCX file containing the input image followed by the text.

    Args:
        media_path: Path of the image to embed at the top of the document.
        plain_text: Text placed in a paragraph below the image.
        font_choice: Font file name as offered in the UI (for example
            ``"arial.ttf"``); the extension is dropped to form the font name
            stored in the document.
        font_size: Font size in points (number or numeric string).
        line_spacing: Line-spacing multiplier for the text paragraph.
        alignment: One of ``"Left"``, ``"Center"``, ``"Right"``, ``"Justified"``.
        image_size: One of ``"Small"``, ``"Medium"``, ``"Large"``; selects the
            picture width (2, 4 or 6 inches).

    Returns:
        The file name of the generated DOCX (created in the working directory).

    Raises:
        KeyError: If ``alignment`` or ``image_size`` is not a known option.
    """
    filename = f"output_{uuid.uuid4()}.docx"
    doc = docx.Document()

    # Add image with size adjustment
    image_sizes = {
        "Small": docx.shared.Inches(2),
        "Medium": docx.shared.Inches(4),
        "Large": docx.shared.Inches(6),
    }
    doc.add_picture(media_path, width=image_sizes[image_size])
    doc.add_paragraph()

    # Add plain text output
    paragraph = doc.add_paragraph()
    paragraph.paragraph_format.line_spacing = float(line_spacing)
    paragraph.paragraph_format.alignment = {
        "Left": WD_ALIGN_PARAGRAPH.LEFT,
        "Center": WD_ALIGN_PARAGRAPH.CENTER,
        "Right": WD_ALIGN_PARAGRAPH.RIGHT,
        "Justified": WD_ALIGN_PARAGRAPH.JUSTIFY,
    }[alignment]
    run = paragraph.add_run(plain_text)
    # A DOCX stores a typeface name, not a font file name, so the ".ttf"
    # suffix is dropped. NOTE(review): some file stems (e.g. "CONSOLA") may
    # still differ from the installed family name - confirm per font.
    run.font.name = os.path.splitext(font_choice)[0]
    run.font.size = docx.shared.Pt(int(font_size))

    doc.save(filename)
    return filename
209
+
210
+ # CSS for output styling
211
css = """
#output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
.submit-btn {
    background-color: #cf3434 !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #ff2323 !important;
}
.download-btn {
    background-color: #35a6d6 !important;
    color: white !important;
}
.download-btn:hover {
    background-color: #22bcff !important;
}
"""

# Gradio app setup: one tab with the inference controls, clickable examples
# and the document-export options.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Qwen2VL Models: Vision and Language Processing")

    with gr.Tab(label="Image Input"):

        with gr.Row():
            with gr.Column():
                model_choice = gr.Dropdown(
                    label="Model Selection",
                    choices=list(MODEL_OPTIONS.keys()),
                    value="Latex OCR"
                )
                input_media = gr.File(
                    label="Upload Image", type="filepath"
                )
                text_input = gr.Textbox(label="Question", placeholder="Ask a question about the image...")
                submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")

            with gr.Column():
                output_text = gr.Textbox(label="Output Text", lines=10)
                plain_text_output = gr.Textbox(label="Standardized Plain Text", lines=10)

        # Stream the model answer first, then derive the delimiter-free
        # version from the final text.
        submit_btn.click(
            qwen_inference, [model_choice, input_media, text_input], [output_text]
        ).then(
            format_plain_text, [output_text], [plain_text_output]
        )

        # Add examples directly usable by clicking
        with gr.Row():
            gr.Examples(
                examples=[
                    ["examples/1.png", "summarize the letter", "Text Analogy Ocrtest"],
                    ["examples/2.jpg", "Summarize the full image in detail", "Latex OCR"],
                    ["examples/3.png", "Describe the photo", "Qwen2VL Base"],
                    ["examples/4.png", "summarize and solve the problem", "Math Prase"],
                ],
                inputs=[input_media, text_input, model_choice],
                # qwen_inference yields a single text value, so only one
                # output component is listed.
                outputs=[output_text],
                fn=lambda img, question, model: qwen_inference(model, img, question),
                cache_examples=False,
            )

        with gr.Row():
            with gr.Column():
                line_spacing = gr.Dropdown(
                    choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0],
                    value=1.5,
                    label="Line Spacing"
                )
                font_size = gr.Dropdown(
                    choices=["8", "10", "12", "14", "16", "18", "20", "22", "24"],
                    value="18",
                    label="Font Size"
                )
                # Each choice is a file name that generate_pdf resolves
                # inside the "font/" directory.
                font_choice = gr.Dropdown(
                    choices=[
                        "DejaVuMathTeXGyre.ttf",
                        "FiraCode-Medium.ttf",
                        "InputMono-Light.ttf",
                        "JetBrainsMono-Thin.ttf",
                        "ProggyCrossed Regular Mac.ttf",
                        "SourceCodePro-Black.ttf",
                        "arial.ttf",
                        "calibri.ttf",
                        "mukta-malar-extralight.ttf",
                        "noto-sans-arabic-medium.ttf",
                        "times new roman.ttf",
                        "ANGSA.ttf",
                        "Book-Antiqua.ttf",
                        "CONSOLA.TTF",
                        "COOPBL.TTF",
                        "Rockwell-Bold.ttf",
                        "Candara Light.TTF",
                        "Carlito-Regular.ttf",
                        "Castellar.ttf",
                        "Courier New.ttf",
                        "LSANS.TTF",
                        "Lucida Bright Regular.ttf",
                        "TRTempusSansITC.ttf",
                        "Verdana.ttf",
                        "bell-mt.ttf",
                        "eras-itc-light.ttf",
                        "fonnts.com-aptos-light.ttf",
                        "georgia.ttf",
                        "segoeuithis.ttf",
                        "youyuan.TTF",
                        "TfPonetoneExpanded-7BJZA.ttf",
                    ],
                    value="times new roman.ttf",
                    label="Font Choice"
                )
                alignment = gr.Dropdown(
                    choices=["Left", "Center", "Right", "Justified"],
                    value="Justified",
                    label="Text Alignment"
                )
                image_size = gr.Dropdown(
                    choices=["Small", "Medium", "Large"],
                    value="Small",
                    label="Image Size"
                )
                file_format = gr.Radio(["pdf", "docx"], label="File Format", value="pdf")
                get_document_btn = gr.Button(value="Get Document", elem_classes="download-btn")

        get_document_btn.click(
            generate_document, [input_media, output_text, file_format, font_choice, font_size, line_spacing, alignment, image_size], gr.File(label="Download Document")
        )

demo.launch(debug=True)
examples/1.png ADDED
examples/2.jpg ADDED
examples/3.png ADDED
examples/4.png ADDED
font/ANGSA.ttf ADDED
Binary file (110 kB). View file
 
font/Book-Antiqua.ttf ADDED
Binary file (174 kB). View file
 
font/CONSOLA.TTF ADDED
Binary file (358 kB). View file
 
font/COOPBL.TTF ADDED
Binary file (80.4 kB). View file
 
font/Candara Light.TTF ADDED
Binary file (122 kB). View file
 
font/Carlito-Regular.ttf ADDED
Binary file (636 kB). View file
 
font/Castellar.ttf ADDED
Binary file (48.3 kB). View file
 
font/Courier New.ttf ADDED
Binary file (710 kB). View file
 
font/DejaVuMathTeXGyre.ttf ADDED
Binary file (578 kB). View file
 
font/FiraCode-Medium.ttf ADDED
Binary file (284 kB). View file
 
font/InputMono-Light.ttf ADDED
Binary file (109 kB). View file
 
font/JetBrainsMono-Thin.ttf ADDED
Binary file (270 kB). View file
 
font/LSANS.TTF ADDED
Binary file (64.6 kB). View file
 
font/Lucida Bright Regular.ttf ADDED
Binary file (70.7 kB). View file
 
font/ProggyCrossed Regular Mac.ttf ADDED
Binary file (592 kB). View file
 
font/Rockwell-Bold.ttf ADDED
Binary file (63 kB). View file
 
font/SourceCodePro-Black.ttf ADDED
Binary file (207 kB). View file
 
font/TRTempusSansITC.ttf ADDED
Binary file (76 kB). View file
 
font/TfPonetoneExpanded-7BJZA.ttf ADDED
Binary file (45.3 kB). View file
 
font/Verdana.ttf ADDED
Binary file (242 kB). View file
 
font/arial.ttf ADDED
Binary file (915 kB). View file
 
font/bell-mt.ttf ADDED
Binary file (84.8 kB). View file
 
font/calibri.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7114f0913256fa13f757eb2db8669c5f6dfd2fe2afa4e161e15d9e3574e6dc1
3
+ size 1329860
font/demo.txt ADDED
File without changes
font/eras-itc-light.ttf ADDED
Binary file (68.7 kB). View file
 
font/fonnts.com-aptos-light.ttf ADDED
Binary file (234 kB). View file
 
font/georgia.ttf ADDED
Binary file (220 kB). View file
 
font/mukta-malar-extralight.ttf ADDED
Binary file (226 kB). View file
 
font/noto-sans-arabic-medium.ttf ADDED
Binary file (178 kB). View file
 
font/segoeuithis.ttf ADDED
Binary file (833 kB). View file
 
font/times new roman.ttf ADDED
Binary file (68.4 kB). View file
 
font/youyuan.TTF ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16747b04038770d7ceeade94e943d7d79247cc1c95b550403dd0a0286d46c6c4
3
+ size 6794984
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ accelerate
3
+ numpy
4
+ Requests
5
+ torch
6
+ torchvision
7
+ qwen-vl-utils
8
+ av
9
+ ipython
10
+ reportlab
11
+ fpdf
12
+ python-docx
13
+ pillow
14
+ huggingface_hub