import gradio as gr
from transformers import pipeline, AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
import torch
import spaces

MODEL_PATH = "benhaotang/phi4-qwq-sky-t1"
MODEL_URL = f"https://huggingface.co/{MODEL_PATH}"

def load_model():
    # Allow fp32 CPU offload so layers that do not fit on the GPU can spill to CPU memory
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=False,
        llm_int8_enable_fp32_cpu_offload=True
    )

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="auto",
        torch_dtype=torch.float16,
        offload_folder="offload_folder",
        quantization_config=bnb_config
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
    )
    return pipe

pipe = load_model()

@spaces.GPU(duration=110)
def generate_response(prompt, max_length=1024):
    # Build a chat-style input with a system prompt
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant. You always think step by step."},
        {"role": "user", "content": prompt}
    ]

    outputs = pipe(messages, max_new_tokens=max_length)
    # print("Raw output:", outputs)  # debug print, removed once the output-format parsing was fixed

    # Extract just the assistant's response
    try:
        # outputs[0]["generated_text"] is already a list of chat messages
        message_list = outputs[0]["generated_text"]
        # The last message is the assistant's reply
        assistant_message = message_list[-1]
        if assistant_message["role"] == "assistant":
            return assistant_message["content"]
    except Exception as e:
        # print(f"Error extracting response: {e}")
        # If extraction fails, return the raw output
        return str(outputs[0]["generated_text"])

    return outputs[0]["generated_text"]

# Example prompt with proper line breaks (raw string so the LaTeX backslashes are preserved)
example_prompt = r"""For a scalar field theory with interaction Lagrangian $\mathcal{L}_{int} = g\phi^3 + \lambda\phi^4$:
1. Enumerate all possible 1-loop Feynman diagrams contributing to the scalar propagator
2. For each diagram, write down its loop contribution
3. Provide Mathematica code to calculate these loop amplitudes with dimensional regularization at $d=4-\epsilon$
Please explain your reasoning step by step."""

demo = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(
            label="Enter your question",
            placeholder="Ask me anything...",
            lines=5
        ),
    ],
    outputs=gr.Textbox(label="Response", lines=10),
    title="benhaotang/phi4-qwq-sky-t1",
    description=f"""
    Aims to achieve CoT and science reasoning at a small scale with a merge of CoT-finetuned Phi-4 models.
    Model: [benhaotang/phi4-qwq-sky-t1]({MODEL_URL})""",
    examples=[
        [example_prompt]  # Uses the formatted example above
    ]
)

demo.launch()
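
# --- Optional: querying the running app programmatically (a minimal sketch) ---
# This assumes the app is already running locally on Gradio's default port and that
# `gradio_client` is installed; "/predict" is the default endpoint name Gradio assigns
# to a single-function Interface. Kept as comments so this file stays a valid Spaces app.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")
#   result = client.predict(
#       "Explain dimensional regularization in one short paragraph.",
#       api_name="/predict",
#   )
#   print(result)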