# shisa / app.py
# leonardlin's picture
# Update app.py
# 1a6b000
# raw
# history blame
# 2.77 kB
# https://www.gradio.app/guides/using-hugging-face-integrations
import gradio as gr
import logging
import html
import time
import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# --- Model ---------------------------------------------------------------
# Hub identifier used for both the tokenizer and the model weights.
model_name = "augmxnt/shisa-7b-v1"

# --- UI settings ---------------------------------------------------------
title = "Shisa 7B"
description = "Test out Shisa 7B in either English or Japanese."
# Hint text for the input box (English / Japanese: "please type here").
placeholder = "Type Here / ここに入力してください"
# Sample prompts offered in the UI.
examples = [
    "What's the best ramen in Tokyo?",
    "あなたは熱狂的なポケモンファンです。",
    "東京でおすすめのラーメン屋ってどこ?",
]

# --- LLM settings --------------------------------------------------------
# System message (Japanese: "You are a helpful assistant.").
system_prompt = "あなたは役に立つアシスタントです。"
# Conversation seed: a single system-role message.
chat_history = [dict(role="system", content=system_prompt)]

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # Swap for load_in_4bit=True to request 4-bit loading instead.
    load_in_8bit=True,
)

# Iterator-style streamer: omits the prompt and special tokens from the
# emitted text, with a 10-second timeout on its text queue.
streamer = TextIteratorStreamer(
    tokenizer,
    timeout=10.0,
    skip_prompt=True,
    skip_special_tokens=True,
)
def _build_messages(message, history):
    """Return the chat-template message list for one request.

    The list starts with the system prompt, replays the completed turns from
    ``history``, and ends with the new user ``message``.

    ``history`` is assumed to be Gradio's default ChatInterface format: a
    list of ``[user_text, assistant_text]`` pairs (TODO confirm if the
    interface is ever switched to another history format). Pairs with a
    missing side are skipped so user/assistant roles keep alternating.
    """
    messages = [{"role": "system", "content": system_prompt}]
    for user_turn, assistant_turn in history or []:
        if not user_turn or not assistant_turn:
            continue
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})
    return messages


def chat(message, history):
    """Stream the model's reply to ``message`` for a Gradio ChatInterface.

    Args:
        message: The text the user just submitted.
        history: Earlier turns of this conversation as supplied by Gradio
            (see ``_build_messages`` for the expected shape).

    Yields:
        The reply accumulated so far, growing by one decoded chunk per
        iteration.

    The prompt is rebuilt from ``history`` on every call rather than being
    appended to a module-level list. Module-level state would be shared by
    every visitor, would never contain the assistant's replies, and would
    ignore the UI's clear/undo actions.
    """
    messages = _build_messages(message, history)
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    # For multi-GPU setups, send the inputs to the device that holds the
    # model's first parameter.
    first_param_device = next(model.parameters()).device
    input_ids = input_ids.to(first_param_device)

    # One streamer per request, so concurrent requests never read each
    # other's tokens from a shared queue.
    request_streamer = TextIteratorStreamer(
        tokenizer,
        timeout=10.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generate_kwargs = dict(
        inputs=input_ids,
        streamer=request_streamer,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        repetition_penalty=1.15,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Generation runs in a background thread while this generator consumes
    # the streamer and forwards partial text to the UI.
    # https://www.gradio.app/main/guides/creating-a-chatbot-fast#example-using-a-local-open-source-llm-with-hugging-face
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    partial_message = ""
    for new_token in request_streamer:
        partial_message += new_token
        yield partial_message

    # The streamer is exhausted, so generation is finished; reap the thread.
    worker.join()
# Chat UI wired to the streaming `chat` callback.
chat_interface = gr.ChatInterface(
    chat,
    title=title,
    description=description,
    theme="soft",
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder=placeholder, container=False, scale=7),
    examples=examples,
    cache_examples=False,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)

# The interface is rendered inside an explicit `with gr.Blocks()` context
# because Gradio fails on auto-reload otherwise; pattern taken from
# https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI/blob/main/app.py#L219
with gr.Blocks() as demo:
    chat_interface.render()
    gr.Markdown("You can try asking this question in Japanese or English. We limit output to 200 tokens.")

# Turn on request queuing, then start the app.
demo.queue()
demo.launch()