Luke Stanley committed
Commit ddb0d91 · 1 Parent(s): 5c4f1cd

Default to in-memory LLM interface

Files changed (2):
  1. .gitignore +1 -0
  2. utils.py +43 -4
.gitignore ADDED
@@ -0,0 +1 @@
+.aider*
utils.py CHANGED
@@ -2,7 +2,7 @@ import json
 from typing import Any, Dict, Union
 import requests
 
-from llama_cpp import json_schema_to_gbnf
+from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 
 # The llama_cpp Python HTTP server communicates with the AI model, similar
 # to the OpenAI API but adds a unique "grammar" parameter.
@@ -10,6 +10,9 @@ from llama_cpp import json_schema_to_gbnf
 # It's possible to switch to another LLM API by changing the llm_streaming function.
 
 URL = "http://localhost:5834/v1/chat/completions"
+in_memory_llm = None
+IN_MEMORY_LLM_PATH = "/fast/mistral-7b-instruct-v0.1.Q4_K_M.gguf"
+# TODO: Have a good way to set the model path
 
 def llm_streaming(
     prompt: str, pydantic_model_class, return_pydantic_object=False
@@ -69,12 +72,48 @@ def replace_text(template: str, replacements: dict) -> str:
     return template
 
 
-def query_ai_prompt(prompt, replacements, model_class):
-    prompt = replace_text(prompt, replacements)
-    return llm_streaming(prompt, model_class)
 
 
 def calculate_overall_score(faithfulness, spiciness):
     baseline_weight = 0.8
     overall = faithfulness + (1 - baseline_weight) * spiciness * faithfulness
     return overall
+
+
+def llm_stream_sans_network(
+    prompt: str, pydantic_model_class, return_pydantic_object=False
+) -> Union[str, Dict[str, Any]]:
+    global in_memory_llm
+    if in_memory_llm is None:
+        in_memory_llm = Llama(model_path=IN_MEMORY_LLM_PATH)
+    schema = pydantic_model_class.model_json_schema()
+
+    # The optional "example" field from the schema is not needed for grammar generation
+    if "example" in schema:
+        del schema["example"]
+
+    json_schema = json.dumps(schema)
+    grammar = LlamaGrammar.from_json_schema(json_schema)
+
+    output_text = in_memory_llm(
+        prompt,
+        max_tokens=1000,
+        temperature=0.7,
+        grammar=grammar,
+    )["choices"][0]["text"]
+
+    print(output_text)
+
+    if return_pydantic_object:
+        model_object = pydantic_model_class.model_validate_json(output_text)
+        return model_object
+    else:
+        json_output = json.loads(output_text)
+        return json_output
+
+def query_ai_prompt(prompt, replacements, model_class, in_memory=True):
+    prompt = replace_text(prompt, replacements)
+    if in_memory:
+        return llm_stream_sans_network(prompt, model_class)
+    else:
+        return llm_streaming(prompt, model_class)
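
Note on the networked path this commit demotes to a fallback: llm_streaming talks to the llama_cpp HTTP server at URL, which accepts an OpenAI-style chat payload plus the non-standard "grammar" field mentioned in the comments above. The body of llm_streaming is not shown in this diff, so the payload below is a sketch under that assumption; SomeModel is a stand-in pydantic class, not part of the commit.

import json
import requests
from pydantic import BaseModel
from llama_cpp import json_schema_to_gbnf

URL = "http://localhost:5834/v1/chat/completions"

class SomeModel(BaseModel):  # stand-in schema, not part of this commit
    answer: str

# "grammar" carries a GBNF grammar derived from the model's JSON schema,
# constraining the server's output to JSON matching that schema.
payload = {
    "messages": [{"role": "user", "content": "Reply with JSON."}],
    "grammar": json_schema_to_gbnf(json.dumps(SomeModel.model_json_schema())),
    "stream": True,
}
response = requests.post(URL, json=payload, stream=True)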
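
After this change, query_ai_prompt runs the model in-process by default and only uses the HTTP server when called with in_memory=False. A minimal usage sketch follows; the Critique class, prompt text, and replacement keys are hypothetical, since replace_text's key convention lies outside this diff.

from pydantic import BaseModel
from utils import query_ai_prompt

class Critique(BaseModel):  # hypothetical response schema
    faithfulness: float
    spiciness: float

replacements = {"original_text": "..."}  # keys assumed; see replace_text in utils.py
result = query_ai_prompt(
    "Rate this rewrite of original_text.",
    replacements,
    Critique,
)  # first call loads the GGUF weights from IN_MEMORY_LLM_PATH
print(result)  # a dict parsed from the grammar-constrained JSON output

# Opt back into the llama_cpp HTTP server:
result = query_ai_prompt("...", {}, Critique, in_memory=False)

Keeping in_memory_llm as a module-level global means the GGUF weights load once per process rather than once per query.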