"""FastAPI service that scores semantic-search results with several
sentence-transformer models and reports a pass/fail verdict per model."""

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Initialize FastAPI app
app = FastAPI()

# Define request model
class QueryRequest(BaseModel):
    """Evaluation request: a search query plus the raw search results to score.

    `results` is expected to hold a "matches" list (e.g. a vector-store query
    response) where each match provides an "id" and a text "metadata" field.
    """
    query: str
    results: dict

class EnhancedSemanticSearchEvaluator:
    def __init__(self, top_k=300, relevance_threshold=3):
        # Candidate embedding models whose rankings are evaluated side by side.
        self.models = {
            "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
            "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
            "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
        }
        self.top_k = top_k  # number of ranked results to include in the evaluation
        self.relevance_threshold = relevance_threshold  # minimum LLM score counted as a "pass"

    def compute_similarity(self, model, query, matches):
        """
        Compute cosine similarity between the query and each match.
        Assumes each match carries its document text in the 'metadata' field.
        """
        query_embedding = model.encode(query, convert_to_tensor=True)
        match_embeddings = model.encode(
            [match['metadata'] for match in matches], convert_to_tensor=True
        )
        scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
        return scores

    def normalize_scores(self, similarity_scores):
        """
        Normalize similarity scores to a 0-1 range for consistent scaling.
        """
        max_score = max(similarity_scores) if similarity_scores else 1
        if max_score <= 0:
            # Guard against division by zero (and sign flips) when no score is positive.
            return [0.0 for _ in similarity_scores]
        normalized_scores = [score / max_score for score in similarity_scores]
        return normalized_scores

    def compute_dynamic_thresholds(self, normalized_scores):
        """
        Compute dynamic thresholds based on the score distribution (percentiles).
        """
        high_threshold = np.percentile(normalized_scores, 90)
        medium_threshold = np.percentile(normalized_scores, 70)
        low_threshold = np.percentile(normalized_scores, 50)
        return high_threshold, medium_threshold, low_threshold

    def rank_results(self, model, query, matches):
        similarity_scores = self.compute_similarity(model, query, matches)
        normalized_scores = self.normalize_scores(similarity_scores)
        high_threshold, medium_threshold, low_threshold = self.compute_dynamic_thresholds(normalized_scores)

        for match, normalized_score in zip(matches, normalized_scores):
            match['similarity_score'] = normalized_score

            # Dynamically assign LLM scores based on thresholds
            if normalized_score >= high_threshold:
                match['llm_score'] = 5
            elif normalized_score >= medium_threshold:
                match['llm_score'] = 4
            elif normalized_score >= low_threshold:
                match['llm_score'] = 3
            elif normalized_score >= 0.1:  # Lowest tier
                match['llm_score'] = 2
            else:
                match['llm_score'] = 1

        # Rank results by similarity score
        ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
        return ranked_matches

    def evaluate_results(self, query, results):
        all_metrics = {}
        results_status = {}

        for model_name, model in self.models.items():
            ranked_matches = self.rank_results(model, query, results['matches'])

            results_with_scores = []
            for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
                doc_id = match['id']
                similarity_score = match['similarity_score']
                llm_score = match['llm_score']

                results_with_scores.append({
                    "Rank": rank,
                    "Document ID": doc_id,
                    "Similarity Score": similarity_score,
                    "LLM Score": llm_score
                })

            results_df = pd.DataFrame(results_with_scores)
            results_df['Pass'] = results_df['LLM Score'] >= self.relevance_threshold

            pass_rate = results_df['Pass'].mean()
            precision_at_k = results_df.head(self.top_k)['Pass'].mean()
            recall_at_k = results_df.head(self.top_k)['Pass'].sum() / max(results_df['Pass'].sum(), 1)
            f1_at_k = (
                2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)
                if (precision_at_k + recall_at_k) > 0 else 0
            )
            metrics = {
                "Pass Rate": pass_rate,
                "Precision@K": precision_at_k,
                "Recall@K": recall_at_k,
                "F1@K": f1_at_k
            }

            all_metrics[model_name] = metrics
            results_status[model_name] = "Test Passed" if pass_rate > 0.5 else "Test Failed"

        # Return the per-model pass/fail verdict alongside the detailed metrics.
        return {"status": results_status, "metrics": all_metrics}

evaluator = EnhancedSemanticSearchEvaluator()

@app.post("/evaluate")
async def evaluate(request: QueryRequest):
    try:
        query = request.query
        results = request.results
        evaluation_result = evaluator.evaluate_results(query, results)
        return evaluation_result
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
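

# --- Example usage (illustrative sketch; names and values below are assumptions) ---
# The payload shape is inferred from how the evaluator reads results['matches']:
# each match must provide an "id" and a text "metadata" field, e.g.
#
#   POST /evaluate
#   {
#       "query": "how do sentence transformers encode text?",
#       "results": {
#           "matches": [
#               {"id": "doc-1", "metadata": "Sentence transformers map text to dense vectors ..."},
#               {"id": "doc-2", "metadata": "An unrelated passage about something else ..."}
#           ]
#       }
#   }
#
# Local development server (assumes uvicorn is installed):
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)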