from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Initialize FastAPI app
app = FastAPI()


# Define request model
class QueryRequest(BaseModel):
    query: str
    results: dict


class EnhancedSemanticSearchEvaluator:
    def __init__(self, top_k=300, relevance_threshold=3):
        self.models = {
            "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
            "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
            "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
        }
        self.top_k = top_k
        self.relevance_threshold = relevance_threshold

    def compute_similarity(self, model, query, matches):
        """Compute cosine similarity between the query and each match's metadata text."""
        query_embedding = model.encode(query, convert_to_tensor=True)
        match_embeddings = model.encode(
            [match['metadata'] for match in matches],
            convert_to_tensor=True
        )
        scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
        return scores

    def normalize_scores(self, similarity_scores):
        """
        Normalize similarity scores to a 0-1 range for consistent scaling.
        """
        max_score = max(similarity_scores) if similarity_scores else 1
        normalized_scores = [score / max_score for score in similarity_scores]
        return normalized_scores

    def compute_dynamic_thresholds(self, normalized_scores):
        """
        Compute dynamic thresholds based on the score distribution (percentiles).
        """
        high_threshold = np.percentile(normalized_scores, 90)
        medium_threshold = np.percentile(normalized_scores, 70)
        low_threshold = np.percentile(normalized_scores, 50)
        return high_threshold, medium_threshold, low_threshold

    def rank_results(self, model, query, matches):
        similarity_scores = self.compute_similarity(model, query, matches)
        normalized_scores = self.normalize_scores(similarity_scores)
        high_threshold, medium_threshold, low_threshold = self.compute_dynamic_thresholds(normalized_scores)

        for match, normalized_score in zip(matches, normalized_scores):
            match['similarity_score'] = normalized_score

            # Dynamically assign LLM scores based on thresholds
            if normalized_score >= high_threshold:
                match['llm_score'] = 5
            elif normalized_score >= medium_threshold:
                match['llm_score'] = 4
            elif normalized_score >= low_threshold:
                match['llm_score'] = 3
            elif normalized_score >= 0.1:  # Lowest tier
                match['llm_score'] = 2
            else:
                match['llm_score'] = 1

        # Rank results by similarity score
        ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
        return ranked_matches

    def evaluate_results(self, query, results):
        all_metrics = {}
        results_status = {}

        for model_name, model in self.models.items():
            ranked_matches = self.rank_results(model, query, results['matches'])

            results_with_scores = []
            for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
                doc_id = match['id']
                similarity_score = match['similarity_score']
                llm_score = match['llm_score']

                results_with_scores.append({
                    "Rank": rank,
                    "Document ID": doc_id,
                    "Similarity Score": similarity_score,
                    "LLM Score": llm_score
                })

            results_df = pd.DataFrame(results_with_scores)
            results_df['Pass'] = results_df['LLM Score'] >= self.relevance_threshold
            pass_rate = results_df['Pass'].mean()

            # Precision@K, Recall@K, and F1@K over the ranked, truncated result set
            precision_at_k = results_df.head(self.top_k)['Pass'].mean()
            recall_at_k = results_df.head(self.top_k)['Pass'].sum() / max(results_df['Pass'].sum(), 1)
            f1_at_k = (
                2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)
                if (precision_at_k + recall_at_k) > 0 else 0
            )

            metrics = {
                "Pass Rate": pass_rate,
                "Precision@K": precision_at_k,
                "Recall@K": recall_at_k,
                "F1@K": f1_at_k
            }

            all_metrics[model_name] = metrics
            results_status[model_name] = "Test Passed" if pass_rate > 0.5 else "Test Failed"

        return results_status


evaluator = EnhancedSemanticSearchEvaluator()


@app.post("/evaluate")
async def evaluate(request: QueryRequest):
    try:
        query = request.query
        results = request.results
        evaluation_result = evaluator.evaluate_results(query, results)
        return evaluation_result
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
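

# --- Optional local run helper (a minimal sketch, not part of the service definition above) ---
# This block assumes `uvicorn` is installed and that callers send `results` in the shape the
# evaluator expects: a dict with a "matches" list whose entries carry "id" and "metadata"
# (text) keys. The query and document in the example request are hypothetical placeholders.
#
# Example request once the server is running:
#   curl -X POST http://127.0.0.1:8000/evaluate \
#     -H "Content-Type: application/json" \
#     -d '{"query": "what is semantic search?",
#          "results": {"matches": [{"id": "doc-1",
#                                   "metadata": "Semantic search ranks documents by meaning rather than keywords."}]}}'

if __name__ == "__main__":
    import uvicorn

    # Serve the app on localhost; adjust host/port for your environment.
    uvicorn.run(app, host="127.0.0.1", port=8000)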