Robin Genolet committed on
Commit
9730359
·
1 Parent(s): 90d439d

test: auto gptq

Browse files
Files changed (2) hide show
  1. requirements.txt +0 -0
  2. utils/epfl_meditron_utils.py +43 -34
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
utils/epfl_meditron_utils.py CHANGED
@@ -1,38 +1,47 @@
1
- from ctransformers import AutoModelForCausalLM, AutoTokenizer
2
- from transformers import pipeline
3
- import streamlit as st
4
- from langchain.chains import LLMChain
5
- from langchain.prompts import PromptTemplate
6
-
7
- # Simple inference example
8
- # output = llm(
9
- # "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant", # Prompt
10
- # max_tokens=512, # Generate up to 512 tokens
11
- # stop=["</s>"], # Example stop token - not necessarily correct for this specific model! Please check before using.
12
- # echo=True # Whether to echo the prompt
13
- #)
14
-
15
- prompt_format = "<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant"
16
-
17
-
18
- template = """Question: {question}
19
-
20
- Answer:"""
21
-
22
 
23
 
24
 
25
  def get_llm_response(repo, filename, model_type, gpu_layers, prompt):
26
- print("Loading model")
27
- llm = AutoModelForCausalLM.from_pretrained(repo, model_file=filename, model_type=model_type, gpu_layers=gpu_layers)
28
- print("Model loaded")
29
-
30
- #llm_prompt = prompt_format.format(system_message=system_prompt, prompt=prompt)
31
- print(f"LLM prompt: {prompt}")
32
-
33
- prompt = PromptTemplate(template=template, input_variables=["question"])
34
-
35
- llm_chain = LLMChain(prompt=prompt, llm=llm)
36
- response = llm_chain.run(prompt)
37
-
38
- return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
 
3
 
4
  def get_llm_response(repo, filename, model_type, gpu_layers, prompt):
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
6
+
7
+ model_name_or_path = "TheBloke/meditron-7B-GPTQ"
8
+ # To use a different branch, change revision
9
+ # For example: revision="gptq-4bit-128g-actorder_True"
10
+ model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
11
+ device_map="auto",
12
+ trust_remote_code=False,
13
+ revision="main")
14
+
15
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
16
+
17
+ print("\n\n*** Generate:")
18
+
19
+ #input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
20
+ #output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
21
+ #print(tokenizer.decode(output[0]))
22
+
23
+ # Inference can also be done using transformers' pipeline
24
+
25
+ print("*** Pipeline:")
26
+ pipe = pipeline(
27
+ "text-generation",
28
+ model=model,
29
+ tokenizer=tokenizer,
30
+ max_new_tokens=512,
31
+ do_sample=True,
32
+ temperature=0.7,
33
+ top_p=0.95,
34
+ top_k=40,
35
+ repetition_penalty=1.1
36
+ )
37
+
38
+ prompt_template=f'''<|im_start|>system
39
+ {system_message}<|im_end|>
40
+ <|im_start|>user
41
+ {prompt}<|im_end|>
42
+ <|im_start|>assistant
43
+ '''.format(system_message="You are an assistant", prompt=prompt)
44
+
45
+ response = pipe(prompt_template)[0]['generated_text']
46
+ print(response)
47
+ return response