from flask import Flask, request, jsonify
from sentence_transformers import SentenceTransformer, util
import pandas as pd

app = Flask(__name__)


class EnhancedSemanticSearchEvaluator:
    def __init__(self, relevance_threshold=3, top_k=300, similarity_threshold=0.5):
        # Three candidate embedding models are evaluated side by side.
        self.models = {
            "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
            "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
            "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
        }
        self.relevance_threshold = relevance_threshold
        self.top_k = top_k
        self.similarity_threshold = similarity_threshold

    def compute_similarity(self, model, query, matches):
        # Embed the query and each match's metadata text, then score them by cosine similarity.
        query_embedding = model.encode(query, convert_to_tensor=True)
        match_embeddings = model.encode(
            [match['metadata'] for match in matches],
            convert_to_tensor=True
        )
        scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
        return scores

    def rank_results(self, model, query, matches):
        # Attach a similarity score to each match and sort in descending order.
        similarity_scores = self.compute_similarity(model, query, matches)
        for match, score in zip(matches, similarity_scores):
            match['similarity_score'] = score
        ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
        return ranked_matches

    def evaluate_results(self, query, results):
        all_metrics = {}
        results_status = {}
        for model_name, model in self.models.items():
            ranked_matches = self.rank_results(model, query, results['matches'])

            results_with_scores = []
            for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
                doc_id = match['id']
                similarity_score = match['similarity_score']

                # Map the cosine similarity onto a 1-5 relevance score.
                if similarity_score >= 0.7:
                    llm_score = 5
                elif similarity_score >= 0.5:
                    llm_score = 4
                elif similarity_score >= 0.3:
                    llm_score = 3
                elif similarity_score >= 0.1:
                    llm_score = 2
                else:
                    llm_score = 1

                results_with_scores.append({
                    "Rank": rank,
                    "Document ID": doc_id,
                    "Similarity Score": similarity_score,
                    "LLM Score": llm_score
                })

            results_df = pd.DataFrame(results_with_scores)
            results_df['Pass'] = results_df['LLM Score'] >= self.relevance_threshold
            pass_rate = results_df['Pass'].mean()

            # Precision@K and Recall@K over the top-k ranked results; F1@K is their harmonic mean.
            top_k_df = results_df.head(self.top_k)
            precision_at_k = top_k_df['Pass'].mean()
            recall_at_k = top_k_df['Pass'].sum() / max(results_df['Pass'].sum(), 1)
            f1_at_k = (
                2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)
                if (precision_at_k + recall_at_k) > 0 else 0
            )

            all_metrics[model_name] = {
                "Pass Rate": pass_rate,
                "Precision@K": precision_at_k,
                "Recall@K": recall_at_k,
                "F1@K": f1_at_k
            }
            results_status[model_name] = "Test Passed" if pass_rate > 0.5 else "Test Failed"

        return results_status


# Load the models once at startup; re-instantiating the evaluator on every request
# would reload all three models and make each call prohibitively slow.
evaluator = EnhancedSemanticSearchEvaluator()


@app.route('/evaluate', methods=['POST'])
def evaluate():
    content = request.json
    query = content['query']
    results = content['results']
    evaluation_result = evaluator.evaluate_results(query, results)
    return jsonify(evaluation_result)


# if __name__ == '__main__':
#     app.run(debug=True, host='0.0.0.0', port=8000)
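For reference, here is a minimal sketch of how the endpoint could be exercised once the app is running. The payload shape (a `query` string and a `results` object with a `matches` list of `id`/`metadata` entries) follows what the route handler reads; the query text, document IDs, metadata strings, and the host/port (taken from the commented-out `app.run` line) are all illustrative assumptions.

import requests

# Hypothetical request body matching the fields the /evaluate handler expects.
payload = {
    "query": "how to reset a forgotten password",
    "results": {
        "matches": [
            {"id": "doc-001", "metadata": "Step-by-step guide to resetting your account password."},
            {"id": "doc-002", "metadata": "Quarterly financial report for the last fiscal year."}
        ]
    }
}

response = requests.post("http://localhost:8000/evaluate", json=payload)
print(response.json())  # e.g. {"Model_1": "Test Passed", "Model_2": "Test Failed", ...}

The response maps each model name to a pass/fail verdict based on whether more than half of its top-ranked results cleared the relevance threshold.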