from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
# Initialize FastAPI app
app = FastAPI()
# Define request model
class QueryRequest(BaseModel):
    query: str
    # Raw search results; expected to contain a 'matches' list of
    # {'id': ..., 'metadata': <document text>} entries.
    results: dict
class EnhancedSemanticSearchEvaluator:
    """
    Score search results against a query with several sentence-transformer
    models and report, per model, whether the result set passes a relevance check.
    """

    def __init__(self, top_k=300, relevance_threshold=3):
        self.models = {
            "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
            "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
            "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
        }
        self.top_k = top_k
        self.relevance_threshold = relevance_threshold
    def compute_similarity(self, model, query, matches):
        # Embed the query and each match's 'metadata' field (assumed to hold the
        # document text), then score every match by cosine similarity.
        query_embedding = model.encode(query, convert_to_tensor=True)
        match_embeddings = model.encode(
            [match['metadata'] for match in matches], convert_to_tensor=True
        )
        scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
        return scores
    def normalize_scores(self, similarity_scores):
        """
        Normalize similarity scores to a 0-1 range for consistent scaling.
        """
        max_score = max(similarity_scores) if similarity_scores else 1
        if max_score == 0:
            # Guard against division by zero when every similarity is exactly 0.
            return [0.0 for _ in similarity_scores]
        normalized_scores = [score / max_score for score in similarity_scores]
        return normalized_scores
    def compute_dynamic_thresholds(self, normalized_scores):
        """
        Compute dynamic thresholds based on the score distribution (percentiles).
        """
        high_threshold = np.percentile(normalized_scores, 90)
        medium_threshold = np.percentile(normalized_scores, 70)
        low_threshold = np.percentile(normalized_scores, 50)
        return high_threshold, medium_threshold, low_threshold
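    # Worked example (illustrative numbers, not from the original file): for
    # normalized scores [0.2, 0.4, 0.6, 0.8, 1.0], np.percentile's default
    # linear interpolation gives high ≈ 0.92, medium ≈ 0.76, low = 0.6, so in
    # rank_results below only the strongest matches reach the 5/4 score buckets.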
    def rank_results(self, model, query, matches):
        similarity_scores = self.compute_similarity(model, query, matches)
        normalized_scores = self.normalize_scores(similarity_scores)
        high_threshold, medium_threshold, low_threshold = self.compute_dynamic_thresholds(normalized_scores)

        for match, normalized_score in zip(matches, normalized_scores):
            match['similarity_score'] = normalized_score
            # Dynamically assign LLM scores based on thresholds
            if normalized_score >= high_threshold:
                match['llm_score'] = 5
            elif normalized_score >= medium_threshold:
                match['llm_score'] = 4
            elif normalized_score >= low_threshold:
                match['llm_score'] = 3
            elif normalized_score >= 0.1:  # Lowest tier
                match['llm_score'] = 2
            else:
                match['llm_score'] = 1

        # Rank results by similarity score
        ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
        return ranked_matches
    def evaluate_results(self, query, results):
        all_metrics = {}
        results_status = {}
        for model_name, model in self.models.items():
            ranked_matches = self.rank_results(model, query, results['matches'])

            results_with_scores = []
            for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
                results_with_scores.append({
                    "Rank": rank,
                    "Document ID": match['id'],
                    "Similarity Score": match['similarity_score'],
                    "LLM Score": match['llm_score']
                })

            results_df = pd.DataFrame(results_with_scores)
            results_df['Pass'] = results_df['LLM Score'] >= self.relevance_threshold
            pass_rate = results_df['Pass'].mean()

            # Precision, recall, and F1 over the top-k results.
            precision_at_k = results_df.head(self.top_k)['Pass'].mean()
            recall_at_k = results_df.head(self.top_k)['Pass'].sum() / max(results_df['Pass'].sum(), 1)
            f1_at_k = (
                2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)
                if (precision_at_k + recall_at_k) > 0 else 0
            )
            metrics = {
                "Pass Rate": pass_rate,
                "Precision@K": precision_at_k,
                "Recall@K": recall_at_k,
                "F1@K": f1_at_k
            }
            all_metrics[model_name] = metrics
            # Only the per-model pass/fail status is returned to the caller;
            # the detailed metrics are kept in all_metrics.
            results_status[model_name] = "Test Passed" if pass_rate > 0.5 else "Test Failed"

        return results_status
evaluator = EnhancedSemanticSearchEvaluator()
@app.post("/evaluate")
async def evaluate(request: QueryRequest):
try:
query = request.query
results = request.results
evaluation_result = evaluator.evaluate_results(query, results)
return evaluation_result
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
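# --- Usage sketch (not part of the original file; the port and the example
# payload shape are assumptions inferred from the code above) ---
# The endpoint expects a query plus raw search results whose 'matches' entries
# carry an 'id' and a 'metadata' text field, e.g.:
#
#   POST /evaluate
#   {
#     "query": "what is semantic search?",
#     "results": {
#       "matches": [
#         {"id": "doc-1", "metadata": "Semantic search ranks documents by meaning ..."},
#         {"id": "doc-2", "metadata": "Keyword search matches exact query terms ..."}
#       ]
#     }
#   }
#
# Minimal local entry point, assuming uvicorn and port 7860 (the port Hugging
# Face Spaces conventionally exposes); the Space may start the app differently.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)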