Rajarshi Roy committed on
Commit
42fa84c
·
verified ·
1 Parent(s): 974d2f2

Upload 28 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Data/reft[[:space:]]paper.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ # *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # .github/workflows
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # poetry
99
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103
+ #poetry.lock
104
+
105
+ # pdm
106
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107
+ #pdm.lock
108
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109
+ # in version control.
110
+ # https://pdm.fming.dev/#use-with-ide
111
+ .pdm.toml
112
+
113
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114
+ __pypackages__/
115
+
116
+ # Celery stuff
117
+ celerybeat-schedule
118
+ celerybeat.pid
119
+
120
+ # SageMath parsed files
121
+ *.sage.py
122
+
123
+ # Environments
124
+ .env
125
+ .venv
126
+ env/
127
+ venv/
128
+ ENV/
129
+ env.bak/
130
+ venv.bak/
131
+
132
+ # Spyder project settings
133
+ .spyderproject
134
+ .spyproject
135
+
136
+ # Rope project settings
137
+ .ropeproject
138
+
139
+ # mkdocs documentation
140
+ /site
141
+
142
+ # mypy
143
+ .mypy_cache/
144
+ .dmypy.json
145
+ dmypy.json
146
+
147
+ # Pyre type checker
148
+ .pyre/
149
+
150
+ # pytype static type analyzer
151
+ .pytype/
152
+
153
+ # Cython debug symbols
154
+ cython_debug/
155
+
156
+ # PyCharm
157
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
160
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161
+ #.idea/
Data/peft.pdf ADDED
Binary file (563 kB). View file
 
Data/reft paper.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3d1dcf3e057506a4c470b7f0d4e3fa53abec21d216f2b3451b7dd736cb61e66
3
+ size 1496447
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# requirements.txt ends with `-e .`, so the whole project (setup.py, README.md,
# src/) has to be in the image before pip runs.
COPY . /app

RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    netbase \
    && rm -rf /var/lib/apt/lists/*

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir -r requirements.txt

# NOTE(review): values passed as build args and re-exported through ENV end up
# in the image configuration and history. Prefer supplying these keys at
# container start (`docker run -e ...`) or via the platform's secret store.
ARG GEMINI_API_KEY1
ENV GEMINI_API_KEY=$GEMINI_API_KEY1

ARG PINECONE_API_KEY1
ENV PINECONE_API_KEY=$PINECONE_API_KEY1

EXPOSE 8501

ENTRYPOINT ["streamlit", "run"]

CMD ["app.py"]
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+
4
+ import os
5
+
6
+ import google.generativeai as genai
7
+
8
+ from research_assistant_app.components.data_ingestion import (
9
+ get_cleaned_dir_docs,
10
+ get_cleaned_input_docs,
11
+ )
12
+
13
+ from research_assistant_app.components.data_querying import user_query
14
+ from research_assistant_app.components.data_indexing import run_indexing_pipeline
15
+
16
+
17
+ from dotenv import load_dotenv
18
+
19
load_dotenv()

# GOOGLE_API_KEY is what this script originally read; the constants module and
# the Dockerfile name the same credential GEMINI_API_KEY, so fall back to it.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY"))


st.set_page_config("Chat PDF")

st.header("Your research assistant here to help💁 (Powered by Gemini)")


# --- Question box against everything already stored in the index -------------
user_question = st.text_input(
    "Chat with existing Pdfs in Pinecone data base or Your added PDF"
)

if user_question:
    response = user_query(user_question)
    st.write(response)


# --- Optional upload: save the PDF, clean it, and index it -------------------
File = st.file_uploader(
    "Upload Your new PDF file to store in Pinecone DB", type=["pdf"], key="pdf"
)

if File:  # Save uploaded file to 'Data/' folder.
    save_folder = Path("Data")
    # The folder is not guaranteed to exist on a fresh checkout/container.
    save_folder.mkdir(parents=True, exist_ok=True)

    # Keep only the final path component of the client-supplied name so the
    # file cannot be written outside of the save folder.
    save_path = save_folder / Path(File.name).name
    save_path.write_bytes(File.getvalue())

    if save_path.exists():
        st.success(f"File {File.name} is successfully saved!")

    # Read back exactly the file that was just written.
    res = get_cleaned_input_docs(str(save_path))

    index_stats = run_indexing_pipeline(res)

    if index_stats is not None:
        st.success(f"File {File.name} is successfully upserted in Pinecone DB!")

    user_question_pdf = st.text_input("Ask a Question from the PDF File")

    if user_question_pdf:
        response = user_query(user_question_pdf)
        st.write(response)
main.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from research_assistant_app.components.data_querying import user_query
2
+
3
if __name__ == "__main__":
    # Guarded so that merely importing this module does not trigger a query.
    ans = user_query("generate a summary based on the information you have")

    print(ans)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ google-generativeai
3
+ python-dotenv
4
+ langchain
5
+ langchain_google_genai
6
+ llama-index>=0.9.31
7
+ pinecone-client>=3.0.0
8
+ regex
9
+ llama-index-llms-gemini
10
+ IPython
11
+ llama-index-embeddings-gemini
12
+ llama-index-vector-stores-pinecone
13
+ -e .
research/trials.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
setup.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import setuptools
2
+
3
+
4
# The README doubles as the long description shown on the package index.
with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()


__version__ = "0.0.0.1"

REPO_NAME = "AI_reasearch_assistant"
AUTHOR_USER_NAME = "Rajarshi12321"
SRC_REPO = "research_assistant_app"
AUTHOR_EMAIL = "[email protected]"


setuptools.setup(
    name=SRC_REPO,
    version=__version__,
    author=AUTHOR_USER_NAME,
    author_email=AUTHOR_EMAIL,
    description="A small python package for an AI research assistant app",
    long_description=long_description,
    # The keyword is `long_description_content_type`; the shorter spelling is
    # not a setuptools option and is ignored with a warning.
    long_description_content_type="text/markdown",
    url=f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}",
    project_urls={
        "Bug Tracker": f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}/issues",
    },
    # Packages live under src/ (src layout).
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
)
src/research_assistant_app.egg-info/PKG-INFO ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: research-assistant-app
3
+ Version: 0.0.0.1
4
+ Summary: A small python package for sentiment analysis app
5
+ Home-page: https://github.com/Rajarshi12321/AI_reasearch_assistant
6
+ Author: Rajarshi12321
7
+ Author-email: [email protected]
8
+ Project-URL: Bug Tracker, https://github.com/Rajarshi12321/AI_reasearch_assistant/issues
9
+
10
+ "# AI_reasearch_assistant"
src/research_assistant_app.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ setup.py
3
+ src/research_assistant_app/__init__.py
4
+ src/research_assistant_app.egg-info/PKG-INFO
5
+ src/research_assistant_app.egg-info/SOURCES.txt
6
+ src/research_assistant_app.egg-info/dependency_links.txt
7
+ src/research_assistant_app.egg-info/top_level.txt
8
+ src/research_assistant_app/components/__init__.py
9
+ src/research_assistant_app/utils/__init__.py
10
+ src/research_assistant_app/utils/exception.py
src/research_assistant_app.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
src/research_assistant_app.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ research_assistant_app
src/research_assistant_app/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import logging
4
+
5
# Record layout: timestamp, level, originating module, then the message.
logging_str = "[%(asctime)s: %(levelname)s: %(module)s: %(message)s]"

# Log file location, relative to the current working directory.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_filepath = os.path.join(log_dir, "running_logs.log")

# Every record goes both to the log file and to stdout.
_log_handlers = [
    logging.FileHandler(log_filepath),
    logging.StreamHandler(sys.stdout),
]
logging.basicConfig(level=logging.INFO, format=logging_str, handlers=_log_handlers)

# Package-wide logger handle.
logger = logging.getLogger("research_assistantLogger")
src/research_assistant_app/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (590 Bytes). View file
 
src/research_assistant_app/components/__init__.py ADDED
File without changes
src/research_assistant_app/components/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (174 Bytes). View file
 
src/research_assistant_app/components/__pycache__/data_indexing.cpython-39.pyc ADDED
Binary file (2.03 kB). View file
 
src/research_assistant_app/components/__pycache__/data_ingestion.cpython-39.pyc ADDED
Binary file (1.52 kB). View file
 
src/research_assistant_app/components/__pycache__/data_querying.cpython-39.pyc ADDED
Binary file (2.43 kB). View file
 
src/research_assistant_app/components/data_indexing.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.core import StorageContext
2
+ from llama_index.embeddings.gemini import GeminiEmbedding
3
+ import google.generativeai as genai
4
+
5
+ from llama_index.vector_stores.pinecone import PineconeVectorStore
6
+ from pinecone import Pinecone
7
+
8
+
9
+ from llama_index.core.node_parser import SemanticSplitterNodeParser
10
+ from llama_index.core.ingestion import IngestionPipeline
11
+
12
+ from research_assistant_app.constants import gemini_api_key, pinecone_api_key
13
+ from research_assistant_app.components.data_ingestion import get_cleaned_dir_docs
14
+
15
+
16
+ from research_assistant_app.constants import gemini_api_key, pinecone_api_key
17
+ from llama_index.embeddings.gemini import GeminiEmbedding
18
+ from llama_index.llms.gemini import Gemini
19
+ import google.generativeai as genai
20
+ from llama_index.core import Settings
21
+ from llama_index.core.node_parser import SentenceSplitter
22
+
23
+
24
genai.configure(api_key=gemini_api_key)  # configuring api to run the pipeline

# `model_name` is the keyword the Gemini wrapper uses to select the model
# (the previous `models=` spelling did not select anything).
model = Gemini(
    model_name="models/gemini-pro", api_key=gemini_api_key, temperature=0.3
)
gemini_embed_model = GeminiEmbedding(model_name="models/embedding-001")

embed_model = gemini_embed_model

# Global llama-index defaults used by components that are not configured explicitly.
Settings.llm = model
Settings.embed_model = gemini_embed_model
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 512
Settings.context_window = 3900


# Pinecone connection; `ai-research-assistant` is the index name.
pc = Pinecone(api_key=pinecone_api_key)
pinecone_index = pc.Index("ai-research-assistant")

vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Ingestion pipeline: split documents semantically, embed the resulting
# nodes, and write them to the Pinecone-backed vector store.
pipeline = IngestionPipeline(
    transformations=[
        SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=95,
            embed_model=embed_model,
        ),
        embed_model,
    ],
    vector_store=vector_store,
)
72
+
73
+
74
+ # Now we run our pipeline!
75
def run_indexing_pipeline(docs):
    """Run the module-level ingestion pipeline over ``docs``.

    :param docs: Documents handed to ``pipeline.run`` as ``documents``.

    :return: Whatever ``pinecone_index.describe_index_stats()`` reports after the run.
    """
    # Re-apply the Gemini API key right before the pipeline runs.
    genai.configure(api_key=gemini_api_key)

    pipeline.run(documents=docs)

    stats = pinecone_index.describe_index_stats()
    return stats
82
+
83
+
84
+ # >>> {'dimension': 1536,
85
+ # >>> 'index_fullness': 0.0,
86
+ # >>> 'namespaces': {'': {'vector_count': 46}},
87
+ # >>> 'total_vector_count': 46}
88
+
89
if __name__ == "__main__":
    # Manual run: index only the first three cleaned documents from Data/.
    sample_docs = get_cleaned_dir_docs("Data")[:3]

    print(run_indexing_pipeline(sample_docs), "pincone index")
src/research_assistant_app/components/data_ingestion.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
2
+ import re
3
+
4
+
5
def clean_up_text(content: str) -> str:
    """
    Strip extraction noise from a piece of text.

    Steps, in order: re-join words hyphenated across a line break, delete a
    fixed list of unwanted patterns (newlines, em-dash runs, escaped and
    private-use unicode markers), tighten spacing around hyphens, and
    collapse every whitespace run to a single space.

    :param content: Text input.

    :return: Cleaned version of original text input.
    """
    # "hyphen-\nated" -> "hyphenated"
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", content)

    # Applied one after another, in this order; each match is deleted.
    noise_patterns = (
        "\\n",
        " —",
        "——————————",
        "—————————",
        "—————",
        r"\\u[\dA-Fa-f]{4}",
        r"\uf075",
        r"\uf0b7",
    )
    for noise in noise_patterns:
        text = re.compile(noise).sub("", text)

    # "well - known" -> "well-known"
    text = re.sub(r"(\w)\s*-\s*(\w)", r"\1-\2", text)

    # Any whitespace run becomes one space.
    return re.sub(r"\s+", " ", text)
36
+
37
+
38
def _clean_documents(documents):
    """Replace each document's ``text`` with its cleaned form and return them as a list."""
    cleaned_docs = []
    for doc in documents:
        doc.text = clean_up_text(doc.text)
        cleaned_docs.append(doc)
    return cleaned_docs


def get_cleaned_dir_docs(pdf_file_dir):
    """
    Load every file in a directory and clean the text of each loaded document.

    :param pdf_file_dir: Directory passed to ``SimpleDirectoryReader``.

    :return: List of the loaded documents with their ``text`` cleaned.
    """
    documents = SimpleDirectoryReader(pdf_file_dir).load_data()
    return _clean_documents(documents)


def get_cleaned_input_docs(pdf_file):
    """
    Load a single file and clean the text of each loaded document.

    :param pdf_file: Path passed to ``SimpleDirectoryReader`` as its only input file.

    :return: List of the loaded documents with their ``text`` cleaned.
    """
    documents = SimpleDirectoryReader(input_files=[pdf_file]).load_data()
    return _clean_documents(documents)
64
+
65
+
66
if __name__ == "__main__":
    # Raw string: a Windows path in a normal literal contains invalid escape
    # sequences (`\p`, `\A`, `\D`), which newer Python versions warn about.
    docs = get_cleaned_dir_docs(r"E:\projects\AI research assistant\Data")
    print(docs)
src/research_assistant_app/components/data_querying.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.core import VectorStoreIndex
2
+
3
+ from llama_index.vector_stores.pinecone import PineconeVectorStore
4
+ from pinecone import Pinecone
5
+
6
+
7
+ from research_assistant_app.constants import gemini_api_key, pinecone_api_key
8
+ import google.generativeai as genai
9
+
10
+
11
# Pinecone client, the index handle, and the llama-index vector store around it.
pc = Pinecone(api_key=pinecone_api_key)
pinecone_index = pc.Index("ai-research-assistant")  # `ai-research-assistant` is the index name
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
17
+
18
+ from llama_index.core.retrievers import VectorIndexRetriever
19
+
20
+ from llama_index.core.query_engine import RetrieverQueryEngine
21
+ from llama_index.core import PromptTemplate
22
+
23
+
24
def get_vector_retriever(Pinecone_vector_store):
    """Build a query engine and its index on top of an existing vector store.

    :param Pinecone_vector_store: Vector store handed to ``VectorStoreIndex.from_vector_store``.

    :return: Tuple ``(query_engine, vector_index)``; the engine's retriever is
        configured with ``similarity_top_k=5``.
    """
    index = VectorStoreIndex.from_vector_store(vector_store=Pinecone_vector_store)

    print(index, "check indexes")

    # Retriever limited to the five most similar results.
    top5_retriever = VectorIndexRetriever(index=index, similarity_top_k=5)

    return RetrieverQueryEngine(retriever=top5_retriever), index
39
+
40
+
41
def get_full_prompt_template(cur_instr: str, prompt_tmpl):
    """Return a new ``PromptTemplate`` made of ``cur_instr``, a newline, and the
    template string obtained from ``prompt_tmpl.get_template()``."""
    return PromptTemplate(cur_instr + "\n" + prompt_tmpl.get_template())
46
+
47
+
48
def proper_prompting(my_query_enginge, my_vector_index):
    """Create a query engine whose QA prompt is prefixed with custom instructions.

    :param my_query_enginge: Query engine whose current text-QA prompt is used as the base.
    :param my_vector_index: Index from which the returned query engine is created.

    :return: ``my_vector_index.as_query_engine(...)`` configured with the combined prompt.
    """
    QA_PROMPT_KEY = "response_synthesizer:text_qa_template"

    # get the base qa prompt (without any instruction prefix)
    base_qa_prompt = my_query_enginge.get_prompts()[QA_PROMPT_KEY]

    initial_instr = """\
You are a QA assistant specifically designed to help in research work as a research assistant.
---------------------

Context information is below. Given the context information and not prior knowledge, \
"{context_str}\n"
---------------------
answer the query. \

It is very important that If the context is not relevant,
please answer the question by using your own knowledge about the topic

"""

    # Instruction prefix followed by the engine's original QA template.
    qa_prompt = get_full_prompt_template(initial_instr, base_qa_prompt)

    # Use the custom prompt when querying
    query_engine = my_vector_index.as_query_engine(text_qa_template=qa_prompt)

    return query_engine
80
+
81
+
82
+ ## This will be the main function that we would call for querying
83
def user_query(qus):
    """Answer ``qus`` against the module-level Pinecone vector store.

    :param qus: The question passed to the query engine.

    :return: The ``response`` attribute of the query engine's result.
    """
    genai.configure(api_key=gemini_api_key)

    base_engine, index = get_vector_retriever(vector_store)
    prompted_engine = proper_prompting(base_engine, index)

    answer = prompted_engine.query(qus)
    return answer.response
src/research_assistant_app/constants/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # loading secret key
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
+ from llama_index.core import VectorStoreIndex
6
+ from llama_index.core import ServiceContext
7
+ from llama_index.core import StorageContext, load_index_from_storage
8
+ from llama_index.embeddings.gemini import GeminiEmbedding
9
+ from llama_index.llms.gemini import Gemini
10
+ import google.generativeai as genai
11
+
12
# Load variables from a .env file into the process environment.
load_dotenv()

# API credentials read from the environment; each is None when its variable is unset.
gemini_api_key = os.environ.get("GEMINI_API_KEY")
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
src/research_assistant_app/constants/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (681 Bytes). View file
 
src/research_assistant_app/utils/__init__.py ADDED
File without changes
src/research_assistant_app/utils/exception.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from research_assistant_app import logging
3
+
4
+
5
def error_message_detail(error, error_detail: sys):
    """Format an error together with the file and line of the active traceback.

    :param error: The error (or message); rendered with ``str()``.
    :param error_detail: Object providing ``exc_info()`` - in practice the ``sys`` module.

    :return: A message naming the script, line number and error text. When no
        exception is currently being handled (``exc_info()`` yields no
        traceback), file name and line number are reported as ``<unknown>``
        instead of raising ``AttributeError``.
    """
    _, _, exc_tb = error_detail.exc_info()

    if exc_tb is None:
        # Called outside of an ``except`` block: there is no traceback to inspect.
        file_name = line_number = "<unknown>"
    else:
        file_name = exc_tb.tb_frame.f_code.co_filename
        line_number = exc_tb.tb_lineno

    error_message = "Error ocurred in python script name [{0}] line number [{1}] error message [{2}]".format(
        file_name, line_number, str(error)
    )

    return error_message
13
+
14
+
15
class CustomException(Exception):
    """Exception whose string form is the message built by ``error_message_detail``."""

    def __init__(self, error_message, error_detail: sys):
        super().__init__(error_message)
        detailed = error_message_detail(error_message, error_detail=error_detail)
        self.error_message = detailed

    def __str__(self):
        return self.error_message
template.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ import logging
4
+
5
+ # logging string
6
# logging string
logging.basicConfig(level=logging.INFO, format="[%(asctime)s]: %(message)s:")

project_name = "research_assistant_app"

# Skeleton of the project: every entry is created as an empty file if needed.
list_of_files = [
    ".github/workflows/.gitkeep",
    f"src/{project_name}/__init__.py",
    f"src/{project_name}/components/__init__.py",
    f"src/{project_name}/utils/__init__.py",
    "requirements.txt",
    "setup.py",
    "research/trials.ipynb",
]


for filepath in map(Path, list_of_files):
    filedir, filename = os.path.split(filepath)

    # Entries without a directory part live in the current directory.
    if filedir != "":
        os.makedirs(filedir, exist_ok=True)
        logging.info(f"Creating directory; {filedir} for the file: {filename}")

    # A file that exists and has content is left untouched.
    if os.path.exists(filepath) and os.path.getsize(filepath) != 0:
        logging.info(f"{filename} is already exists")
        continue

    with open(filepath, "w") as f:
        pass
    logging.info(f"Creating empty file: {filepath}")