Bhushan4829 committed · Commit c374fa9 · Parent(s): 7c15e5a

Updated Code

Files changed:
- requirements.txt +1 -1
- semantic_search.py +41 -17
requirements.txt CHANGED
@@ -2,4 +2,4 @@ fastapi
 sentence_transformers
 pandas
 uvicorn
-
+numpy
semantic_search.py CHANGED
@@ -2,6 +2,7 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from sentence_transformers import SentenceTransformer, util
 import pandas as pd
+import numpy as np
 
 # Initialize FastAPI app
 app = FastAPI()
@@ -12,15 +13,14 @@ class QueryRequest(BaseModel):
     results: dict
 
 class EnhancedSemanticSearchEvaluator:
-    def __init__(self,
+    def __init__(self, top_k=300, relevance_threshold=3):
         self.models = {
             "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
             "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
             "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
         }
-        self.relevance_threshold = relevance_threshold
         self.top_k = top_k
-        self.
+        self.relevance_threshold = relevance_threshold
 
     def compute_similarity(self, model, query, matches):
         query_embedding = model.encode(query, convert_to_tensor=True)
@@ -30,10 +30,44 @@ class EnhancedSemanticSearchEvaluator:
         scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
         return scores
 
+    def normalize_scores(self, similarity_scores):
+        """
+        Normalize similarity scores to a 0-1 range for consistent scaling.
+        """
+        max_score = max(similarity_scores) if similarity_scores else 1
+        normalized_scores = [score / max_score for score in similarity_scores]
+        return normalized_scores
+
+    def compute_dynamic_thresholds(self, normalized_scores):
+        """
+        Compute dynamic thresholds based on the score distribution (percentiles).
+        """
+        high_threshold = np.percentile(normalized_scores, 90)
+        medium_threshold = np.percentile(normalized_scores, 70)
+        low_threshold = np.percentile(normalized_scores, 50)
+        return high_threshold, medium_threshold, low_threshold
+
     def rank_results(self, model, query, matches):
         similarity_scores = self.compute_similarity(model, query, matches)
-
-
+        normalized_scores = self.normalize_scores(similarity_scores)
+        high_threshold, medium_threshold, low_threshold = self.compute_dynamic_thresholds(normalized_scores)
+
+        for match, normalized_score in zip(matches, normalized_scores):
+            match['similarity_score'] = normalized_score
+
+            # Dynamically assign LLM scores based on thresholds
+            if normalized_score >= high_threshold:
+                match['llm_score'] = 5
+            elif normalized_score >= medium_threshold:
+                match['llm_score'] = 4
+            elif normalized_score >= low_threshold:
+                match['llm_score'] = 3
+            elif normalized_score >= 0.1:  # Lowest tier
+                match['llm_score'] = 2
+            else:
+                match['llm_score'] = 1
+
+        # Rank results by similarity score
         ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
         return ranked_matches
 
@@ -47,18 +81,8 @@ class EnhancedSemanticSearchEvaluator:
         results_with_scores = []
         for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
             doc_id = match['id']
-            similarity_score = match['similarity_score']
-
-            if similarity_score >= 0.7:
-                llm_score = 5
-            elif similarity_score >= 0.5:
-                llm_score = 4
-            elif similarity_score >= 0.3:
-                llm_score = 3
-            elif similarity_score >= 0.1:
-                llm_score = 2
-            else:
-                llm_score = 1
+            similarity_score = match['similarity_score']
+            llm_score = match['llm_score']
 
             results_with_scores.append({
                 "Rank": rank,