from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
# Initialize FastAPI app
app = FastAPI()
# Define request model
class QueryRequest(BaseModel):
    query: str
    # Raw search results; expected to contain a 'matches' list of
    # {'id': ..., 'metadata': <document text>} entries.
    results: dict
class EnhancedSemanticSearchEvaluator:
    """
    Score search results against a query with several sentence-transformer
    models and report, per model, whether the result set passes a relevance check.
    """

    def __init__(self, top_k=300, relevance_threshold=3):
        self.models = {
            "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
            "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
            "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
        }
        self.top_k = top_k
        self.relevance_threshold = relevance_threshold
    def compute_similarity(self, model, query, matches):
        # Embed the query and each match's 'metadata' field (assumed to hold the
        # document text), then score every match by cosine similarity.
        query_embedding = model.encode(query, convert_to_tensor=True)
        match_embeddings = model.encode(
            [match['metadata'] for match in matches], convert_to_tensor=True
        )
        scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
        return scores
    def normalize_scores(self, similarity_scores):
        """
        Normalize similarity scores to a 0-1 range for consistent scaling.
        """
        max_score = max(similarity_scores) if similarity_scores else 1
        if max_score == 0:
            # Guard against division by zero when every similarity is exactly 0.
            return [0.0 for _ in similarity_scores]
        normalized_scores = [score / max_score for score in similarity_scores]
        return normalized_scores
    def compute_dynamic_thresholds(self, normalized_scores):
        """
        Compute dynamic thresholds based on the score distribution (percentiles).
        """
        high_threshold = np.percentile(normalized_scores, 90)
        medium_threshold = np.percentile(normalized_scores, 70)
        low_threshold = np.percentile(normalized_scores, 50)
        return high_threshold, medium_threshold, low_threshold
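    # Worked example (illustrative numbers, not from the original file): for
    # normalized scores [0.2, 0.4, 0.6, 0.8, 1.0], np.percentile's default
    # linear interpolation gives high ≈ 0.92, medium ≈ 0.76, low = 0.6, so in
    # rank_results below only the strongest matches reach the 5/4 score buckets.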
    def rank_results(self, model, query, matches):
        similarity_scores = self.compute_similarity(model, query, matches)
        normalized_scores = self.normalize_scores(similarity_scores)
        high_threshold, medium_threshold, low_threshold = self.compute_dynamic_thresholds(normalized_scores)

        for match, normalized_score in zip(matches, normalized_scores):
            match['similarity_score'] = normalized_score
            # Dynamically assign LLM scores based on thresholds
            if normalized_score >= high_threshold:
                match['llm_score'] = 5
            elif normalized_score >= medium_threshold:
                match['llm_score'] = 4
            elif normalized_score >= low_threshold:
                match['llm_score'] = 3
            elif normalized_score >= 0.1:  # Lowest tier
                match['llm_score'] = 2
            else:
                match['llm_score'] = 1

        # Rank results by similarity score
        ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
        return ranked_matches
    def evaluate_results(self, query, results):
        all_metrics = {}
        results_status = {}
        for model_name, model in self.models.items():
            ranked_matches = self.rank_results(model, query, results['matches'])

            results_with_scores = []
            for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
                results_with_scores.append({
                    "Rank": rank,
                    "Document ID": match['id'],
                    "Similarity Score": match['similarity_score'],
                    "LLM Score": match['llm_score']
                })

            results_df = pd.DataFrame(results_with_scores)
            results_df['Pass'] = results_df['LLM Score'] >= self.relevance_threshold
            pass_rate = results_df['Pass'].mean()

            # Precision, recall, and F1 over the top-k results.
            precision_at_k = results_df.head(self.top_k)['Pass'].mean()
            recall_at_k = results_df.head(self.top_k)['Pass'].sum() / max(results_df['Pass'].sum(), 1)
            f1_at_k = (
                2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)
                if (precision_at_k + recall_at_k) > 0 else 0
            )
            metrics = {
                "Pass Rate": pass_rate,
                "Precision@K": precision_at_k,
                "Recall@K": recall_at_k,
                "F1@K": f1_at_k
            }
            all_metrics[model_name] = metrics
            # Only the per-model pass/fail status is returned to the caller;
            # the detailed metrics are kept in all_metrics.
            results_status[model_name] = "Test Passed" if pass_rate > 0.5 else "Test Failed"

        return results_status
evaluator = EnhancedSemanticSearchEvaluator()
@app.post("/evaluate")
async def evaluate(request: QueryRequest):
try:
query = request.query
results = request.results
evaluation_result = evaluator.evaluate_results(query, results)
return evaluation_result
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
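# --- Usage sketch (not part of the original file; the port and the example
# payload shape are assumptions inferred from the code above) ---
# The endpoint expects a query plus raw search results whose 'matches' entries
# carry an 'id' and a 'metadata' text field, e.g.:
#
#   POST /evaluate
#   {
#     "query": "what is semantic search?",
#     "results": {
#       "matches": [
#         {"id": "doc-1", "metadata": "Semantic search ranks documents by meaning ..."},
#         {"id": "doc-2", "metadata": "Keyword search matches exact query terms ..."}
#       ]
#     }
#   }
#
# Minimal local entry point, assuming uvicorn and port 7860 (the port Hugging
# Face Spaces conventionally exposes); the Space may start the app differently.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)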