Bhushan4829 committed
Commit 11016fe · 1 Parent(s): 52a5fbe

Initial Commit

Files changed (3)
  1. Dockerfile +47 -0
  2. requirements.txt +5 -0
  3. semantic_search.py +99 -0
Dockerfile ADDED
@@ -0,0 +1,47 @@
+ # Use the official Python 3.9 slim image (Debian-based, so the useradd and apt-get commands below work)
+ FROM python:3.9-slim
+
+ # Create a non-root user
+ RUN useradd -m appuser
+
+ # Set the working directory inside the container
+ WORKDIR /app
+
+ # Set environment variables for the model caches
+ ENV TRANSFORMERS_CACHE=/app/cache/huggingface/transformers
+ ENV HF_HOME=/app/cache/huggingface
+ ENV SENTENCE_TRANSFORMERS_HOME=/app/cache/sentence_transformers
+
+ # Ensure the cache directories exist and are writable by the non-root user
+ RUN mkdir -p /app/cache/huggingface/transformers && \
+     mkdir -p /app/cache/sentence_transformers && \
+     chown -R appuser:appuser /app/cache
+
+ # Copy the requirements file into the container
+ COPY requirements.txt ./requirements.txt
+
+ # Install system dependencies and Python packages
+ RUN apt-get update && \
+     apt-get -y install gcc libpq-dev && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Copy the entire project into the container
+ COPY . /app
+
+ # Change ownership of the /app directory to the non-root user
+ RUN chown -R appuser:appuser /app
+
+ # Switch to the non-root user
+ USER appuser
+
+ # Write a script that instantiates the three embedding models, downloading them into the cache
+ RUN echo "from sentence_transformers import SentenceTransformer; \
+ SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'); \
+ SentenceTransformer('sentence-transformers/all-mpnet-base-v2'); \
+ SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2');" > load_models.py
+
+ # Run the model loading script so the image ships with the models pre-cached
+ RUN python load_models.py
+
+ # Start the application
+ CMD ["uvicorn", "semantic_search:app", "--host", "0.0.0.0", "--port", "7860"]
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ fastapi
+ sentence_transformers
+ pandas
+ uvicorn
+
semantic_search.py ADDED
@@ -0,0 +1,99 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from sentence_transformers import SentenceTransformer, util
+ import pandas as pd
+
+ # Initialize FastAPI app
+ app = FastAPI()
+
+ # Define the request model: a query string plus the raw search results to evaluate
+ class QueryRequest(BaseModel):
+     query: str
+     results: dict
+
+ class EnhancedSemanticSearchEvaluator:
+     def __init__(self, relevance_threshold=3, top_k=300, similarity_threshold=0.5):
+         # Three embedding models, pre-downloaded at image build time
+         self.models = {
+             "Model_1": SentenceTransformer('sentence-transformers/msmarco-distilbert-base-v3'),
+             "Model_2": SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
+             "Model_3": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
+         }
+         self.relevance_threshold = relevance_threshold
+         self.top_k = top_k
+         # Note: similarity_threshold is stored but the scoring below uses fixed bands
+         self.similarity_threshold = similarity_threshold
+
+     def compute_similarity(self, model, query, matches):
+         # Embed the query and each match's text (expected under 'metadata'), then
+         # score every match by cosine similarity against the query
+         query_embedding = model.encode(query, convert_to_tensor=True)
+         match_embeddings = model.encode(
+             [match['metadata'] for match in matches], convert_to_tensor=True
+         )
+         scores = util.pytorch_cos_sim(query_embedding, match_embeddings).squeeze(0).tolist()
+         return scores
+
+     def rank_results(self, model, query, matches):
+         # Attach a similarity score to each match and sort best-first
+         similarity_scores = self.compute_similarity(model, query, matches)
+         for match, score in zip(matches, similarity_scores):
+             match['similarity_score'] = score
+         ranked_matches = sorted(matches, key=lambda x: x['similarity_score'], reverse=True)
+         return ranked_matches
+
+     def evaluate_results(self, query, results):
+         all_metrics = {}
+         results_status = {}
+
+         for model_name, model in self.models.items():
+             ranked_matches = self.rank_results(model, query, results['matches'])
+
+             # Map each cosine similarity onto a 1-5 relevance score
+             results_with_scores = []
+             for rank, match in enumerate(ranked_matches[:self.top_k], start=1):
+                 doc_id = match['id']
+                 similarity_score = match['similarity_score']
+
+                 if similarity_score >= 0.7:
+                     llm_score = 5
+                 elif similarity_score >= 0.5:
+                     llm_score = 4
+                 elif similarity_score >= 0.3:
+                     llm_score = 3
+                 elif similarity_score >= 0.1:
+                     llm_score = 2
+                 else:
+                     llm_score = 1
+
+                 results_with_scores.append({
+                     "Rank": rank,
+                     "Document ID": doc_id,
+                     "Similarity Score": similarity_score,
+                     "LLM Score": llm_score
+                 })
+
+             results_df = pd.DataFrame(results_with_scores)
+             results_df['Pass'] = results_df['LLM Score'] >= self.relevance_threshold
+
+             # Aggregate pass/fail into standard retrieval metrics
+             pass_rate = results_df['Pass'].mean()
+             precision_at_k = results_df.head(self.top_k)['Pass'].mean()
+             recall_at_k = results_df.head(self.top_k)['Pass'].sum() / max(results_df['Pass'].sum(), 1)
+             f1_at_k = (
+                 2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)
+                 if (precision_at_k + recall_at_k) > 0 else 0
+             )
+             metrics = {
+                 "Pass Rate": pass_rate,
+                 "Precision@K": precision_at_k,
+                 "Recall@K": recall_at_k,
+                 "F1@K": f1_at_k
+             }
+
+             all_metrics[model_name] = metrics
+             # A model passes when more than half of its scored results clear the threshold
+             results_status[model_name] = "Test Passed" if pass_rate > 0.5 else "Test Failed"
+
+         # Only the per-model pass/fail status is returned; all_metrics stays internal
+         return results_status
+
+ evaluator = EnhancedSemanticSearchEvaluator()
+
+ @app.post("/evaluate")
+ async def evaluate(request: QueryRequest):
+     try:
+         query = request.query
+         results = request.results
+         evaluation_result = evaluator.evaluate_results(query, results)
+         return evaluation_result
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
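
For local testing, a minimal client sketch is below. The payload shape follows what the code reads from the request: compute_similarity pulls each match's text from 'metadata' and evaluate_results takes the document id from 'id', while the host and port match the uvicorn CMD in the Dockerfile. The requests dependency and the sample documents are illustrative assumptions, not part of this commit.

import requests

# Hypothetical example payload; 'matches' entries must carry 'id' and 'metadata',
# since those are the keys evaluate_results and compute_similarity read.
payload = {
    "query": "how do sentence transformers encode text?",
    "results": {
        "matches": [
            {"id": "doc1", "metadata": "Sentence transformers encode text into dense vectors."},
            {"id": "doc2", "metadata": "Pandas is a dataframe library for Python."}
        ]
    }
}

# Port 7860 matches the uvicorn CMD in the Dockerfile
response = requests.post("http://localhost:7860/evaluate", json=payload)
print(response.json())  # e.g. {"Model_1": "Test Passed", "Model_2": "Test Failed", ...}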