Bhushan4829 committed on
Commit
c374fa9
·
1 Parent(s): 7c15e5a

Updated Code

Browse files
Files changed (2) hide show
  1. requirements.txt +1 -1
  2. semantic_search.py +41 -17
requirements.txt CHANGED
@@ -2,4 +2,4 @@ fastapi
2
  sentence_transformers
3
  pandas
4
  uvicorn
5
-
 
2
  sentence_transformers
3
  pandas
4
  uvicorn
5
+ numpy
semantic_search.py CHANGED
@@ -2,6 +2,7 @@ from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
  from sentence_transformers import SentenceTransformer, util
4
  import pandas as pd
 
5
 
6
  # Initialize FastAPI app
7
  app = FastAPI()
@@ -12,15 +13,14 @@ class QueryRequest(BaseModel):
12
  results: dict
13
 
14
  class EnhancedSemanticSearchEvaluator:
15
- def __init__(self, relevance_threshold=3, top_k=300, similarity_threshold=0.5):
16
  self.models = {
17
  "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
18
  "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
19
  "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
20
  }
21
- self.relevance_threshold = relevance_threshold
22
  self.top_k = top_k
23
- self.similarity_threshold = similarity_threshold
24
 
25
  def compute_similarity(self, model, query, matches):
26
  query_embedding = model.encode(query, convert_to_tensor=True)
@@ -30,10 +30,44 @@ class EnhancedSemanticSearchEvaluator:
30
  scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
31
  return scores
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def rank_results(self, model, query, matches):
34
  similarity_scores = self.compute_similarity(model, query, matches)
35
- for match, score in zip(matches, similarity_scores):
36
- match['similarity_score'] = score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
38
  return ranked_matches
39
 
@@ -47,18 +81,8 @@ class EnhancedSemanticSearchEvaluator:
47
  results_with_scores = []
48
  for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
49
  doc_id = match['id']
50
- similarity_score = match['score']
51
-
52
- if similarity_score >= 0.7:
53
- llm_score = 5
54
- elif similarity_score >= 0.5:
55
- llm_score = 4
56
- elif similarity_score >= 0.3:
57
- llm_score = 3
58
- elif similarity_score >= 0.1:
59
- llm_score = 2
60
- else:
61
- llm_score = 1
62
 
63
  results_with_scores.append({
64
  "Rank": rank,
 
2
  from pydantic import BaseModel
3
  from sentence_transformers import SentenceTransformer, util
4
  import pandas as pd
5
+ import numpy as np
6
 
7
  # Initialize FastAPI app
8
  app = FastAPI()
 
13
  results: dict
14
 
15
  class EnhancedSemanticSearchEvaluator:
16
+ def __init__(self, top_k=300, relevance_threshold=3):
17
  self.models = {
18
  "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
19
  "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
20
  "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
21
  }
 
22
  self.top_k = top_k
23
+ self.relevance_threshold = relevance_threshold
24
 
25
  def compute_similarity(self, model, query, matches):
26
  query_embedding = model.encode(query, convert_to_tensor=True)
 
30
  scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
31
  return scores
32
 
33
+ def normalize_scores(self, similarity_scores):
34
+ """
35
+ Normalize similarity scores to a 0-1 range for consistent scaling.
36
+ """
37
+ max_score = max(similarity_scores) if similarity_scores else 1
38
+ normalized_scores = [score / max_score for score in similarity_scores]
39
+ return normalized_scores
40
+
41
+ def compute_dynamic_thresholds(self, normalized_scores):
42
+ """
43
+ Compute dynamic thresholds based on the score distribution (percentiles).
44
+ """
45
+ high_threshold = np.percentile(normalized_scores, 90)
46
+ medium_threshold = np.percentile(normalized_scores, 70)
47
+ low_threshold = np.percentile(normalized_scores, 50)
48
+ return high_threshold, medium_threshold, low_threshold
49
+
50
  def rank_results(self, model, query, matches):
51
  similarity_scores = self.compute_similarity(model, query, matches)
52
+ normalized_scores = self.normalize_scores(similarity_scores)
53
+ high_threshold, medium_threshold, low_threshold = self.compute_dynamic_thresholds(normalized_scores)
54
+
55
+ for match, normalized_score in zip(matches, normalized_scores):
56
+ match['similarity_score'] = normalized_score
57
+
58
+ # Dynamically assign LLM scores based on thresholds
59
+ if normalized_score >= high_threshold:
60
+ match['llm_score'] = 5
61
+ elif normalized_score >= medium_threshold:
62
+ match['llm_score'] = 4
63
+ elif normalized_score >= low_threshold:
64
+ match['llm_score'] = 3
65
+ elif normalized_score >= 0.1: # Lowest tier
66
+ match['llm_score'] = 2
67
+ else:
68
+ match['llm_score'] = 1
69
+
70
+ # Rank results by similarity score
71
  ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
72
  return ranked_matches
73
 
 
81
  results_with_scores = []
82
  for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
83
  doc_id = match['id']
84
+ similarity_score = match['similarity_score']
85
+ llm_score = match['llm_score']
 
 
 
 
 
 
 
 
 
 
86
 
87
  results_with_scores.append({
88
  "Rank": rank,