# Spaces: Running on Zero
import gradio as gr | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
import spaces | |
import re | |
from markdownify import markdownify | |
# Hub ids of the Reader LM checkpoints this app serves; the order here is the
# order of the keys in both dicts below.
_READER_LM_IDS = ("jinaai/reader-lm-0.5b", "jinaai/reader-lm-1.5b")

# Every checkpoint is loaded once at import time, put in eval mode and moved
# to CUDA. Keys are the Hub ids, so a model and its tokenizer share a key.
models = {
    repo_id: AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True).eval().to("cuda")
    for repo_id in _READER_LM_IDS
}
tokenizers = {
    repo_id: AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    for repo_id in _READER_LM_IDS
}
# On ZeroGPU hardware a GPU is only attached while a `spaces.GPU`-decorated
# function is running, so the CUDA work below has to live inside one.
@spaces.GPU
def run_example(html_content, model_id="jinaai/reader-lm-1.5b"):
    """Convert HTML to Markdown with Reader LM and, for comparison, markdownify.

    Args:
        html_content: HTML text to convert.
        model_id: Key into the module-level ``models`` / ``tokenizers`` dicts.

    Returns:
        A ``(reader_lm_output, markdownify_output)`` tuple. The first item is
        the text of the assistant turn in the decoded generation; the second
        is ``markdownify(html_content)``.

    Raises:
        KeyError: If ``model_id`` is not a key of ``models`` / ``tokenizers``.
    """
    print("Start Model Processing")
    model = models[model_id]
    tokenizer = tokenizers[model_id]

    messages = [{"role": "user", "content": html_content}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        temperature=0,
        do_sample=False,
        repetition_penalty=1.08,
    )
    decoded = tokenizer.decode(outputs[0])

    # The assistant turn normally ends with <|im_end|>, but when generation is
    # cut off by max_new_tokens the closing tag is never produced. Accept the
    # end of the text as a terminator too, so a long conversion returns its
    # partial Markdown instead of raising IndexError.
    found = re.search(
        r"<\|im_start\|>assistant(.*?)(?:<\|im_end\|>|\Z)", decoded, re.DOTALL
    )
    if found:
        assistant_response = found.group(1)
    else:
        # No assistant header at all: fall back to the tokens that follow the
        # prompt (assumes generate() returns prompt + continuation, as it does
        # for decoder-only models — TODO confirm for these checkpoints).
        assistant_response = tokenizer.decode(outputs[0][inputs.shape[-1]:])

    print("Start Markdownify Processing")
    markdownify_output = markdownify(html_content)
    return assistant_response, markdownify_output
# Stylesheet passed to gr.Blocks(css=...): gives the element with id "output"
# a fixed height, scrolling and a thin border.
# NOTE(review): no component in the visible code sets elem_id="output", so
# this rule may currently match nothing — confirm.
css = (
    "\n"
    "#output {\n"
    "height: 500px;\n"
    "overflow: auto;\n"
    "border: 1px solid #ccc;\n"
    "}\n"
)

# Sample to-do-list markup offered as the single gr.Examples entry.
example_html = "\n".join(
    (
        '<div id="myDIV" class="header">',
        "<h2>My To Do List</h2>",
        '<input type="text" id="myInput" placeholder="Title...">',
        '<span onclick="newElement()" class="addBtn">Add</span>',
        "</div>",
        '<ul id="myUL">',
        "<li>Hit the gym</li>",
        '<li class="checked">Pay bills</li>',
        "<li>Meet George</li>",
        "<li>Buy eggs</li>",
        "<li>Read a book</li>",
        "<li>Organize office</li>",
        "</ul>",
    )
)
# Page layout: an intro blurb, then a two-column row (model picker, HTML input
# and submit button on the left; the two conversion results on the right),
# followed by a cached example and the submit wiring.
# NOTE(review): the nesting below is reconstructed from syntax — confirm that
# the examples panel belongs under the row rather than inside it.
with gr.Blocks(css=css) as demo:
    gr.Markdown(
        "\n"
        "# HTML-to-Markdown\n"
        "Try out model based HTML-to-Markdown with [Reader LM](https://huggingface.co/jinaai/reader-lm-1.5b) and rule based with [Markdownify](https://github.com/matthewwithanm/python-markdownify).\n"
    )

    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=list(models),
                label="Model",
                value="jinaai/reader-lm-1.5b",
            )
            html_content = gr.Textbox(label="HTML")
            submit_btn = gr.Button(value="Submit")
        with gr.Column():
            model_output_text = gr.Textbox(label="Reader LM Output")
            markdownify_output = gr.Textbox(label="Markdownify Output")

    # Both entry points fill the same pair of result boxes.
    result_boxes = [model_output_text, markdownify_output]

    # The example supplies only the HTML, so run_example runs with its
    # default model_id when the example is cached or clicked.
    gr.Examples(
        examples=[[example_html]],
        inputs=[html_content],
        outputs=result_boxes,
        fn=run_example,
        cache_examples=True,
        label="Try examples",
    )

    # The button passes the HTML and the model chosen in the dropdown.
    submit_btn.click(
        fn=run_example,
        inputs=[html_content, model_selector],
        outputs=result_boxes,
    )

demo.launch(debug=True)