jeevan commited on
Commit
416fc9c
·
1 Parent(s): 77e353e

using ft model

Browse files
.gitattributes CHANGED
@@ -32,4 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Tasks/Task 1/Task1.md CHANGED
@@ -37,8 +37,6 @@ In addition to the default strategy, I would like to test out a **Section- and T
37
 
38
 
39
 
40
-
41
-
42
  # Problem Statement
43
 
44
  People are concerned about the implications of AI, and no one seems to understand the right way to think about building ethical and useful AI applications for enterprises.
 
37
 
38
 
39
 
 
 
40
  # Problem Statement
41
 
42
  People are concerned about the implications of AI, and no one seems to understand the right way to think about building ethical and useful AI applications for enterprises.
Tasks/Task 1/pre-processing.ipynb CHANGED
@@ -24,7 +24,7 @@
24
  },
25
  {
26
  "cell_type": "code",
27
- "execution_count": 2,
28
  "metadata": {},
29
  "outputs": [],
30
  "source": [
@@ -40,14 +40,14 @@
40
  },
41
  {
42
  "cell_type": "code",
43
- "execution_count": 3,
44
  "metadata": {},
45
  "outputs": [
46
  {
47
  "name": "stderr",
48
  "output_type": "stream",
49
  "text": [
50
- "/Users/jeevan/Documents/Learnings/ai-engineering-bootcamp/AIE4/mid-term/ai-safety-chatty/.venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
51
  " from tqdm.autonotebook import tqdm, trange\n"
52
  ]
53
  }
@@ -56,24 +56,66 @@
56
  "# Embedding model - snowflake-arctic-embed-l\n",
57
  "from langchain_huggingface import HuggingFaceEmbeddings\n",
58
  "\n",
59
- "model_name = \"Snowflake/snowflake-arctic-embed-l\"\n",
60
  "embedding_model = HuggingFaceEmbeddings(model_name=model_name)"
61
  ]
62
  },
63
  {
64
  "cell_type": "code",
65
- "execution_count": 12,
66
  "metadata": {},
67
  "outputs": [],
68
  "source": [
69
- "from pdfloader import PDFLoaderWrapper\n",
70
- "from langchain_experimental.text_splitter import SemanticChunker\n",
 
 
 
 
 
 
 
71
  "\n",
 
 
 
72
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  "pdf_loader = PDFLoaderWrapper(\n",
74
  " documents_to_preload, PDFLoaderWrapper.LoaderType.PYMUPDF\n",
75
  ")\n",
76
- "documents = await pdf_loader.aload()\n",
 
 
 
 
 
 
 
 
 
 
 
77
  "\n",
78
  "text_splitter = SemanticChunker(embedding_model, buffer_size=5, breakpoint_threshold_type=\"percentile\",breakpoint_threshold_amount=90)\n",
79
  "\n",
@@ -82,7 +124,7 @@
82
  },
83
  {
84
  "cell_type": "code",
85
- "execution_count": 23,
86
  "metadata": {},
87
  "outputs": [],
88
  "source": [
@@ -98,7 +140,7 @@
98
  },
99
  {
100
  "cell_type": "code",
101
- "execution_count": 4,
102
  "metadata": {},
103
  "outputs": [],
104
  "source": [
@@ -112,437 +154,481 @@
112
  },
113
  {
114
  "cell_type": "code",
115
- "execution_count": 13,
116
  "metadata": {},
117
  "outputs": [
118
  {
119
  "data": {
120
  "text/plain": [
121
- "['8dd5b1e7fd464e2a90c28a8eea8b0cb9',\n",
122
- " '906e0c268d564dbc89c0b8398e235407',\n",
123
- " '4b81191a4cc94fbd835dc9c942e9543a',\n",
124
- " '25c3b7fffa8d4bc29790057fe2f4d025',\n",
125
- " '3ad5906a8a274b56bd05e4ac39ffe459',\n",
126
- " 'e3fa01bef57c489ca014be2e589b7ef1',\n",
127
- " 'af5fc5121c6a438a8fc5dea454b7e92f',\n",
128
- " '80500cf02d5748c39b1c62288459c306',\n",
129
- " '5db6eebee14b4aafa948e4f9aa4f7aa2',\n",
130
- " '99385298e8744643822e01525bdff89e',\n",
131
- " 'eddc9704820d4005b7c62a5085f69454',\n",
132
- " '4324a624f4054ae5baa7270d9f6aaa56',\n",
133
- " '9eb24bea31a749f1b7a86ac2b186ec14',\n",
134
- " '7e9c9763bebf40cea1833ea6ad376eeb',\n",
135
- " 'cc8846008cac472e88eb16497c560a15',\n",
136
- " '5af0886e387449fc89f1d0e82c32c590',\n",
137
- " '824ae7c1c15a43c8b62713f02d91e0b5',\n",
138
- " 'f0ef1b30251b4429ad7d902b85fafcf8',\n",
139
- " '314a75e55d1b4c1fa46f49610d745f95',\n",
140
- " '66828a5f9536480bbd08d94f087bc44b',\n",
141
- " '8230b8add982486f9ac8e120a27d3aec',\n",
142
- " 'dd1c75bb5c1441468ac8e7d4595bf0b9',\n",
143
- " 'a9b1b1b87eeb48b78ed4cf6adddee9d2',\n",
144
- " 'eeacab16c9d94d08a791c516e0a65f6b',\n",
145
- " '187badb4dc064743898f5e5218114250',\n",
146
- " '0ecc4e873fe047ce8afc33e19fe40c3f',\n",
147
- " 'be7b81185ce140229bee6d1306120528',\n",
148
- " 'd8a9a361dc8a4917aefc2e0a17efafad',\n",
149
- " '7ca3b3bf947e449e8f58cd4fca12d884',\n",
150
- " '07a8b46151e74641b97ad823ef91082f',\n",
151
- " '421004f00dbb4a47a81c424ad5f64e39',\n",
152
- " '53750f6e3cfa481d971d7e5a6b9f55c5',\n",
153
- " 'e9f9524be6884599893590ad5acdb12b',\n",
154
- " '446225894d9747e0a888b596875ac83b',\n",
155
- " '7677215528c44061bfb018e42a13e528',\n",
156
- " '183002bfeeaa4b75968dda61451c2f37',\n",
157
- " '818cd888dd6b4735a602949aea2ea900',\n",
158
- " 'c803c1690b5549a5a13578b2cd757b17',\n",
159
- " '864bc201486e42bdb4caef6a266fa1ee',\n",
160
- " '02cae44fcb9e4d2eb60a6da08b01a4e6',\n",
161
- " '384e2a4f36d14359b22b0c0cdf463cd9',\n",
162
- " '6de4fdb915164aa68e076a2d6e1913ae',\n",
163
- " '8da0e4ecc526416ca3be4043879ea17b',\n",
164
- " 'e63359693473494d922f996b57c65d3b',\n",
165
- " '76313322b20b4e368f2aaefbf911df6b',\n",
166
- " '455bced7b2544b69a5c547d998548748',\n",
167
- " 'd1abce93130d4eb49cdc1aca8b7c9c60',\n",
168
- " '966dde23cfa144b899b60a3659f32eb4',\n",
169
- " '4fe01c77f1ae4b70a18c56fc6cb9679d',\n",
170
- " '7aa14bc014fa445cbc061c47a3fe1c31',\n",
171
- " '0aa076f900614fcfb37883057c67e6c1',\n",
172
- " '2bae6b24f8234792b914a75712fb89fc',\n",
173
- " '74fb5f5dfdb8413a9afaaf472a009ac2',\n",
174
- " 'b1c95e4126e842d598eae6fbc455b82c',\n",
175
- " '2e326893157844fe88f762b96aa46b99',\n",
176
- " 'd27b5c85573d44658a4d338c39890629',\n",
177
- " '1f967a5cae05418d94f9f3d07dcce74e',\n",
178
- " '7667f8b8e5914417a68e7d41256cfc98',\n",
179
- " '7fcd90b390494d3686e532a6528bd021',\n",
180
- " '8c64aecd850b48b3bac216e73ebad1e0',\n",
181
- " 'ca845f98d6b44c02b9f9edafcd75aa2e',\n",
182
- " '571b12484be44ea6b406e6cdcd0662a5',\n",
183
- " 'fa099bf314614075bf8bfc58110f52f1',\n",
184
- " 'b98bfe7e8c234ed59b5d893929bac64e',\n",
185
- " '3bc58c54665d4a4ca394ebb13debfabc',\n",
186
- " 'a276d45eccd54056a8855ccdb5907df3',\n",
187
- " '0f9ac79d967942d0b56b9fe200dc7846',\n",
188
- " '07c5e3794b244e28b384dad31a2c63a2',\n",
189
- " 'deb29326ff1e4fec934c35710d4e0dcf',\n",
190
- " '5278dac4122044879b9dd1a7c557b7a8',\n",
191
- " '041338e04daa4482a7d65c311ab0f3f5',\n",
192
- " '1ed9bbc381b1423a95ef935cc16e277c',\n",
193
- " '20bb221a9b654bbc99edc812475adc12',\n",
194
- " '188fba73978143e8a22370774e1d31fd',\n",
195
- " '3a65a41271a947848aba4939473d0f85',\n",
196
- " 'c5cee035027048338a81b9bf0830cf57',\n",
197
- " '03325dbfeb164512a565172407ae0ec3',\n",
198
- " '4494b4f19cff440281c034ede5e675e1',\n",
199
- " '2b55dd78b0f5461d917eeaec2a75dcb7',\n",
200
- " 'a2aef7ef0741438fb643a4fb225f1ffd',\n",
201
- " '3886d787efdd4d2bb3fc702ffd911db8',\n",
202
- " '71455bc57478429e8a1269ce7332302a',\n",
203
- " '0c2896f473c749f9888b5723ee834a5d',\n",
204
- " '1a8507e2597049dc8287d21172dfe518',\n",
205
- " '943e65c033774aed99116750cfbb5f5a',\n",
206
- " '78bfa949218f48c7b365569f2c3396ff',\n",
207
- " '6812e197478a40bf86cc363d11fc0856',\n",
208
- " 'e9cd2bf8ad454aa7af9446e62d9d845d',\n",
209
- " 'ca970fdebb1e4d79853090a7c73722f5',\n",
210
- " '84400697300c4d468f5c58d09fd63d56',\n",
211
- " 'ad1bf5c566a147e6b66f9f3502a227fd',\n",
212
- " '0c620c86587a4ae8975a0d066eb80e97',\n",
213
- " '1aa1ebd384774410ac011ca3e535808f',\n",
214
- " 'f30f7dba0182402c8abce8d9b07df99b',\n",
215
- " '73dd6c906bd14d3494dc8def54680e0f',\n",
216
- " '632c7eafa1c048979b65c5e8ecacb98e',\n",
217
- " 'b3edcc98879747dbbed2e8b6e19e1baf',\n",
218
- " '4f4bef9639bb454e87cb61051e2d4c82',\n",
219
- " 'd0c059bc4e04474f9355c87964b3b470',\n",
220
- " '412fa6deb62546c4a092988f96ddb425',\n",
221
- " '2283f3ef3e7141738c8966fa4333ba05',\n",
222
- " '7ed4340667d643b193b45f7f21a238ed',\n",
223
- " '9209830e86c54dbd9974cc737bbdbe91',\n",
224
- " '89e28024e5f14377a6b2efa1997f370a',\n",
225
- " '2f4b769a22b24bb49bed8917adae1f9e',\n",
226
- " 'fcfc743d434d43b886afb80c5377e1fd',\n",
227
- " '300f3cfca0874ee2818241856f2175df',\n",
228
- " '55ea80ccf78f4c9cb622c1451951e723',\n",
229
- " '4b755a511dbd46f79eb4b3bda119e79c',\n",
230
- " '956a7f2f70854e1a82dfc542fd761492',\n",
231
- " '190c9cd3f01a4ea894877f4ab35000bf',\n",
232
- " '381a0acf443244c78f303b9f6b72535f',\n",
233
- " 'd7afcec2075343b19d1320605cc41b46',\n",
234
- " '7898f81259be4c42b44cdfd3b41aa25b',\n",
235
- " 'd86e65ae1f8140e299ccf27583735b7a',\n",
236
- " 'ee553712557545a0b0320adc4d563bca',\n",
237
- " '6fb6a0b739a64b909cee096d722a4f6e',\n",
238
- " 'be1514e798f04581af33447b86f002c8',\n",
239
- " 'c92113c2d4344ce8a10e7a6c1d089f4e',\n",
240
- " 'dc47dd1e29bb49768c2b88054f91c69f',\n",
241
- " 'd72643a36bf6415795a3694f93e5c376',\n",
242
- " '0709e83c2d974cb7aa30b17f8a5e5050',\n",
243
- " 'f4bc420a5b4c4cdd91c4441837fbcfae',\n",
244
- " '712e9898ff5f44eda54aebcf54931760',\n",
245
- " '1916383ddb404a32ab833c2add8e2511',\n",
246
- " '4a01c037505943a3be4fa183de7d5c73',\n",
247
- " 'cb29c514ccf8416491654441749f9889',\n",
248
- " '57cc735133754979862b8dd27ccf45d8',\n",
249
- " '6e28a749e0ee49d8b6ba562ab268e474',\n",
250
- " 'd915b593c4194759a4ca48304ce54b56',\n",
251
- " '83bf4923ef4847b1b3faa0a85ce85d9c',\n",
252
- " 'c7165907a5774d7a9cfe034328875f16',\n",
253
- " 'fb83c16fe2ac4a0b8a2cdd3a099b0751',\n",
254
- " 'd8d56a63f4104e1d9b06c5c8d6246d4d',\n",
255
- " 'b92d479d4fea486980ce1133ef0d9049',\n",
256
- " 'b264b04721c14738a6e018c3d089e3e2',\n",
257
- " 'bf50880e770d4e2a80415e87b8f95788',\n",
258
- " '5ca16b29007f4b919ac1f3fdf261aa10',\n",
259
- " '30f3d89d1ae042afb3b745451e0a5fdc',\n",
260
- " '41dd324a662a4e79935980dc8e53ab8f',\n",
261
- " '7590bcc7d6d540b1bd92a7ce69c0e9b2',\n",
262
- " '72973a23774d4bebb9c42dcf885ae06a',\n",
263
- " 'ce3e692e73084116ae834e72349032a7',\n",
264
- " '044469614deb404f8d3b1860907e0f75',\n",
265
- " '3805bb3205c5411daf2a64a7742e59fe',\n",
266
- " '03aa772e62b44423b75ec05c90e8687e',\n",
267
- " '740e824d876d44c7b30599b4dcb8eb44',\n",
268
- " 'ba27340cf2144d15bee2a5f5b7e00622',\n",
269
- " '4c6a7847bc554fddadd0a884c26612bc',\n",
270
- " '8a13a8d664c4453b8f71c01b28ec8dff',\n",
271
- " '113a3db0cd0d4e0f8067c5fa074967df',\n",
272
- " '28af19cb148f49049336aa1b52c14a98',\n",
273
- " '795bc0bc5a9c4ff8b472e2a9c9c59dbd',\n",
274
- " 'fb36fe1dad1c4280a7186ea5c20e64dc',\n",
275
- " 'a50e1d30a67b4144bd8ce5ab32f1cd3a',\n",
276
- " 'f1ccd9cc27b0414f96243f1c63a07fd5',\n",
277
- " '3d2b64f6ceb74744b6b8374728142334',\n",
278
- " '15f314cbb8a14f9286a814cafef76192',\n",
279
- " 'ac540651b7a34d50b70e4c44cf25b3ca',\n",
280
- " '6b87356c50d1404abe0a676b7f322a72',\n",
281
- " '67902b525d014249900e54257590f7ea',\n",
282
- " 'fc1189d79c824a74bd60dd5dc341aa2b',\n",
283
- " 'b26845eaa60246399cef48b0a13d11b5',\n",
284
- " '7b592acd329743a8a7a3b2569a048416',\n",
285
- " '189e134f601441cc9f1514a778e3c820',\n",
286
- " 'f7f1425e4c2d4e1fa9040ee85d368bf1',\n",
287
- " '0297a5233d6f4275baf0a9957b0dc586',\n",
288
- " '5afeb076d11841c890517fc92d0aa6f9',\n",
289
- " '79fa6ed91f7a47b6bd764e1c8b412fa5',\n",
290
- " 'ff4b75e4daaf4588ae69ba2f83816c15',\n",
291
- " '84b7e45334a3477f8d8a64e3504fb620',\n",
292
- " 'dbf780a26828491da830425df5a7a03c',\n",
293
- " '25a6912b64f442f99f5787bef114ebff',\n",
294
- " '00c07386007a4dc18072a431f7cf83b1',\n",
295
- " 'f84828d74c0c446389732b8eb4d6570f',\n",
296
- " 'a12ba2aaf84640a8816d9ce8e8a417e9',\n",
297
- " '02223a887b2c4ec0891d45e75836b00e',\n",
298
- " 'a62632e5379a4af5be885b4750d18650',\n",
299
- " '5ef2d149eb314d879897648027e7e8aa',\n",
300
- " '1a2c86d6906141b18700239300599566',\n",
301
- " '584530895cc74af58cdb016c0ed63bc5',\n",
302
- " 'e19a1e82a1ec4884a7c72f2996ac927f',\n",
303
- " 'dbcd348813cc4365bf65c549333e669d',\n",
304
- " '8bac146f886b4272bd40f51adb35c32b',\n",
305
- " 'cae2438601594d6fb39d99d617fe6c0e',\n",
306
- " 'e8306f6a959b4a219d096b968784c44e',\n",
307
- " 'ebbdefa7da15403294655048c6fe3624',\n",
308
- " '60579bd40852405b8345114456963981',\n",
309
- " '211f66d20b5c451d91f310594b854ea0',\n",
310
- " '6d726194ac8641a6a5f6d8ce3f192a7c',\n",
311
- " 'c0154e0f56b049048f9bcb7f718173f2',\n",
312
- " '081a6179661e41f69ab10b92027d161e',\n",
313
- " '2c149c226d504053bfa94532a850efd1',\n",
314
- " 'f833250f67bc4329922a5a7f7b7d07f7',\n",
315
- " 'a0c1e7c49351406ca3567622b6b1e38e',\n",
316
- " '72abd1e5dc824d6c8852f7331990b6f0',\n",
317
- " '35c02c07ae484045a325940fcbe098fb',\n",
318
- " '63b83f297d1d486e84ddb42c2af32900',\n",
319
- " '17fb247ed7bc4599a8de06966e744b2a',\n",
320
- " '371ea02f2a7e4e11b82ed0593a26a806',\n",
321
- " '6847a89d7a944bf2bd95430c4d63def7',\n",
322
- " 'a186a88983e64831bf42523b6522d706',\n",
323
- " 'e3538719cffb4cb59efa815b27b4bf81',\n",
324
- " '8ae7bdfed4e249dd98727cbb4f34259d',\n",
325
- " 'd4f18e23e8444ffba7b13661b22ba1a9',\n",
326
- " 'ab0aace578b2457cb10966f9a57dbfce',\n",
327
- " '641437d62c1940d7a7d0711391802aa6',\n",
328
- " '3f88fe5f258244d581a053a53b844bdc',\n",
329
- " '5022a64b46dc4989b2a919193cdec7da',\n",
330
- " 'e620e702fb604457b4724ce0f753138d',\n",
331
- " 'ff9557d7d3c446a39829de03605a5254',\n",
332
- " '5314c1de556d4b96af06fd3ac37cf1f4',\n",
333
- " '56d98670e8f74b1881bac44ccfb9267d',\n",
334
- " '2d06bf2cf37a48f98708e345e86a6114',\n",
335
- " '1d6be38e70b74ce69cf68c32fdef0b9f',\n",
336
- " '936184dab9ab4ff9905cbaccc0844e61',\n",
337
- " 'ec080c630727417fa858340935e0557d',\n",
338
- " 'f6792c2c58774d4e9ba97575ae5a9ddb',\n",
339
- " 'c1df1df5060e41899363e2a0649100e4',\n",
340
- " 'b2eb147898c64b359c951297318e6831',\n",
341
- " '216d56dfdaae4c098b826c2c6dbe8132',\n",
342
- " '7921cab4d11646168b5d186794f5db24',\n",
343
- " 'b89e6d297f064e708c4fb903c6ebf15a',\n",
344
- " '4ee471c58dbe4185b6968113228bb20e',\n",
345
- " 'd64a74ed5b57427c9c6ce98a9f945b70',\n",
346
- " '58cd9cbd849c456e85fb72a4abc5c69c',\n",
347
- " '2e1835610aa749c896c8c165e3d84470',\n",
348
- " '8008134cc1c44751bb95a3270cb89a44',\n",
349
- " 'e4f098d6b9024392adc396aad0efb94f',\n",
350
- " 'e8fcb91812d048efa5ba38a46cf40531',\n",
351
- " '987e19cc2d674e2aa0d555af45ef874f',\n",
352
- " '2816810a2eea4f0081baae4b28614796',\n",
353
- " 'ffd0647c27664a779dabf843fcf83981',\n",
354
- " '7a5e718e26b14f4daf674c901b3dde93',\n",
355
- " 'a7248347d0b743d7b5db65f3b1b87cac',\n",
356
- " '640f3399c6c340f19a11919a6402ba85',\n",
357
- " 'fe73657d7e884bd48d93afdc193b73bb',\n",
358
- " 'a9c649a44de94b5f82af06c804e3bb08',\n",
359
- " '8265df5e7847443a8c91478ae1cabf72',\n",
360
- " 'a3bb9ab12f814c4faa382b586fe13680',\n",
361
- " '775f072a48674d6b9fdd0671c4064891',\n",
362
- " '21048806b4c74f5db5b7f873c45adedb',\n",
363
- " '1525157eac174ab684089f50e6c29969',\n",
364
- " '136132cedfec4e3bacf2a8adc1fbd50f',\n",
365
- " '08099904461749ddba96b17b61226622',\n",
366
- " '1da17dd87dd448c4902f8e3a1ad1c51e',\n",
367
- " 'fa9c4880b4a34b9aaca3f5363ff1b7f0',\n",
368
- " '30eeafb17ae74a42b370e173e22abfea',\n",
369
- " '0d3ad50f8c524e90b6c440865aaf63d8',\n",
370
- " '51d52addb9df4c04afaf8f008fe89259',\n",
371
- " '2aa50c713fe241f1b9b44107c0d47945',\n",
372
- " '8615b8c9442c4031aee25316cdfc7cb5',\n",
373
- " '8f8fd8a1edc043ee82c77381bf39a83c',\n",
374
- " 'f15223b9a98445f2b7613e518e7bca83',\n",
375
- " '6a437e8d6655430aaef679f6c6a84831',\n",
376
- " 'c0a3331686754cd9929c4abd5d81dc7f',\n",
377
- " 'ff36401f33d9424cb7112033ead5f58c',\n",
378
- " '841220093a1242f0b04a4ac8d852e280',\n",
379
- " '655ec122f3d24c069eecacc8e8bc8f82',\n",
380
- " '6da4281d97ad46ed9ef6dd169c640afe',\n",
381
- " 'eeef919024d54063ae3cd6c6f8f7a73a',\n",
382
- " '9f55e4818e0c4bef82dec178dc64eaa4',\n",
383
- " '8a652387c6eb424288a0022be058d00a',\n",
384
- " '9a19d1114e674c618d23a1299f14f1ba',\n",
385
- " '9e4b5fdbbda24ed5a2fbbc3923847a44',\n",
386
- " '8b442e6de7bf401b8300c567a642a759',\n",
387
- " '20c8152b25514d018eeb8542b4450ad2',\n",
388
- " '5ae8d5d8230f46cab713cecbd97c847e',\n",
389
- " 'e159ce134b7b46308fc919b22a9e808f',\n",
390
- " '5a0d312175db4d15b85c0255b68bc027',\n",
391
- " '6f82139d091145cba88a7f0fc367063d',\n",
392
- " 'ee60e7e10d924f01b530b0291d939aae',\n",
393
- " '3b73fe9ad953458bbd3d11f44b85fce5',\n",
394
- " 'd15e56f06ba24e3cb6a1c4dd0568201d',\n",
395
- " '16f3d27489ff423dbf7d027844d957db',\n",
396
- " '6091bda6320149a1af5ccfb541e75148',\n",
397
- " 'ec527d7203164f07ae7349cfa33829b3',\n",
398
- " '26b4090286e346b4b686b13360cceea3',\n",
399
- " '856dd023a8cd41108eea38b403eadd09',\n",
400
- " '5434f50e81db44e5b80d3bc8816eb5bf',\n",
401
- " 'e9022b1d39f24ec09981e8c66478705d',\n",
402
- " 'd3f895478ed74239b4bc88e04e215f1a',\n",
403
- " 'c1fb1bae1b5e42f8a65f8260c259e133',\n",
404
- " '2a86b477a07e48afb2658742c30494e8',\n",
405
- " 'da64e968fc3047089de2ffa4b62a8c0a',\n",
406
- " '0a42ebaaedae4f73914398ad1486afb4',\n",
407
- " 'dc66a40fba5f4e348216910fce0d2428',\n",
408
- " 'e125ed2241b24a31ba40ed768a21d4f6',\n",
409
- " '850eda324b734ffdad0fe63c92c91038',\n",
410
- " 'd9e9ca7b0a634afdac1f4da62f2dcddf',\n",
411
- " '7cf441e12164420fa8b58e0aa6d244de',\n",
412
- " '0a6c2f48e5094e3399f7e1d0f38d873f',\n",
413
- " 'c8c29dbfc7f840d7a7195aa74388b30c',\n",
414
- " '07a90e50dcbb4352baa6636e9b687aab',\n",
415
- " '85d60bfe6d684c1a8578c1d6710c867b',\n",
416
- " 'ceb46b27e8994626a6d6d1c1acabff5d',\n",
417
- " '506dd325656145ebb7d976de3b4953c0',\n",
418
- " 'a83e954196874363b13c7cb3d7d8e025',\n",
419
- " '16fc61ea959d4427b3fa723d7e58f2bf',\n",
420
- " '3d4599e2ad2f47deae8c1c25d30dec68',\n",
421
- " '260482de224a4ec998459a5d2f9384bd',\n",
422
- " 'd480305e9fc34a55b8f146343fe1dd8e',\n",
423
- " '9851b805fcf54766bd482d5a0d4a8d0f',\n",
424
- " '5147fb1a9a904ff09b7c6885567fa94e',\n",
425
- " '509f1c4ef2b348af821461d751850e93',\n",
426
- " 'a045c48dd9444211a2f0087229df189d',\n",
427
- " '3b90d03add21451aa40990b1f2dad9e1',\n",
428
- " 'fa68102a0555422db1cc0f3822496a48',\n",
429
- " 'ade010ff9a2644a38c7c3de875a3ac78',\n",
430
- " '390f54300e1f41ac9224da683f00d31a',\n",
431
- " 'aacaee53fce14e9395259a0609cc1646',\n",
432
- " '00ca0c3998b64339874ad036983a0922',\n",
433
- " '2b3b851a8bb6422abab843dc2148255f',\n",
434
- " '363ed4276aa144b2baeec9dbc1fba38c',\n",
435
- " 'e9bf60fe2f184793b37f268ffa486abc',\n",
436
- " '98b185639a5d441ca60d7a5fe7620f8c',\n",
437
- " '9ae478f07ca3465a9a447b3c7eab4b26',\n",
438
- " '01fc2f0676754dc7baae898343e2bebf',\n",
439
- " 'cf3dd2f39b1a48919b90571555e4befb',\n",
440
- " 'b0ef26aca0404662b5706ccfa737a52b',\n",
441
- " '9afcf7171eb74e628f99fa44a753c131',\n",
442
- " '26f3917cc2274e998e115212273fe2ba',\n",
443
- " 'ec369428b5fe43138d049f293dcd21a4',\n",
444
- " '947d581cb3cd4555933504b8c64c54d3',\n",
445
- " '16b495459a4e4bfd96a12655defa9551',\n",
446
- " 'af15b6c943834e96a24363fbdca209f3',\n",
447
- " 'a01a9d5cb1c041889bfd1cf29cd4c08a',\n",
448
- " 'd3d0ee8f3c394ed680324fdcdb442241',\n",
449
- " 'd56148720f974dffab53a4e8917c3833',\n",
450
- " '32593dde550f45af9a36349bcd63192c',\n",
451
- " '1d43c4caf83b470897e96410f4dae5ee',\n",
452
- " 'ee1cd8b353cf45e3a88ce76faeebc9a1',\n",
453
- " '8e7135cd24764e94b8d04e15ec86b9c0',\n",
454
- " 'a7a918cf594e4a2992398acc924e6015',\n",
455
- " '13837b8571154abe83bb0b8d8e08d406',\n",
456
- " '63a4ce49d82d4da6bcb4da66db26bf35',\n",
457
- " '2214b1db8432499286a9ad49d8a2391c',\n",
458
- " 'af23e4413b7c42cc982b011d6432ec5a',\n",
459
- " 'e4b5e669227c4112aec7a7c53f568b75',\n",
460
- " '2d5f634bb7414afcac7b78ce7c0a864b',\n",
461
- " 'beda5449b3124e379f35601a33ab4651',\n",
462
- " '271af180d99846e4a0d8c57f444df81d',\n",
463
- " '4af16168d5a1432e8ca9719c9000f58c',\n",
464
- " '405113fc9e334cada56589b758cd9fd7',\n",
465
- " 'c95f295e46ba4b82b9f92fc0dcc8c1df',\n",
466
- " '475897fbe33347cf907f3cc381f40c0e',\n",
467
- " 'b6779e2220c444d38741c06cc2bb380c',\n",
468
- " '7d56c936c7d84514a67cd75e369449f7',\n",
469
- " '272a9892cdb742dcbe5f90e29eefae72',\n",
470
- " 'c28ace207c9d437da68cf599ac028bbb',\n",
471
- " '6d3c684dd6894bd9bf24486175ed834d',\n",
472
- " 'd8766ca5bb7d468399e6b864756a04e6',\n",
473
- " 'f5501aae471447fba9a4ac7ccf88c1fe',\n",
474
- " 'bb4d8f7876a141e0ba82eaebe7899c5a',\n",
475
- " '1c87be78b3fd48a093c23a54904bf8bc',\n",
476
- " '7cfe5d24a86645e1928a4700e2175e82',\n",
477
- " '79a9e904f5bb48a3830647c6afccbb85',\n",
478
- " 'deda349c16f54a9f85cd302269c22456',\n",
479
- " '5339992d8dcd464294260f5c0c857fff',\n",
480
- " 'd86ca1d2c07f4784956acb34d4d8c48c',\n",
481
- " 'dadf0561bb1c4ad9a87cc33a21424d32',\n",
482
- " '63c86b1adaed4514a75e0409a66b15c3',\n",
483
- " '61425b5443b840f2b7d28347d4002192',\n",
484
- " 'e7166ad200694bb7ae645e63495dbfa5',\n",
485
- " '9a4e61507dbd4fcd96b9c4b8eb24e74d',\n",
486
- " '5cb628ca8b8245e0ae326ccb8ae5635d',\n",
487
- " 'cbf3322896f8445ead83a6907a9aae08',\n",
488
- " '9156196800e64996891c0703499ffbb5',\n",
489
- " '47c88e62be7e4cb88b7d4935ba38cff0',\n",
490
- " '0fee655d64c34f84a07e6b889866a486',\n",
491
- " 'e4472727736f4fa59d49536d8e331f95',\n",
492
- " 'f933c36480f64d8b9600c5075a085e61',\n",
493
- " '808c6b3ce87345b391843aaa6b253bfb',\n",
494
- " 'f5d6ee781bd742fc88d5ddc2e5f0a7f4',\n",
495
- " '281ac17550864cf5827193ddd577aad1',\n",
496
- " '3d3aa0ebb1574fa7b498a13abb1b7c40',\n",
497
- " 'f43bf31cfe994208b24e363f9459a7a0',\n",
498
- " '9931894cfd004a20991a7fef40c23c86',\n",
499
- " '1b34b9f61f164993b7387a73e961bf2b',\n",
500
- " 'e2ca8df6b02d4240b7f1e4474b4765c7',\n",
501
- " '42741aaeaec7422f8ab9c59d18430455',\n",
502
- " '6a0a67f326704e11baae384eb567fa09',\n",
503
- " 'f7894024f0764978a9eca821c29d3449',\n",
504
- " '7f88bf5493764642a14a5bd8bbf04a71',\n",
505
- " '93a5412c61204d53b94fda693fb561b8',\n",
506
- " '3d265760e45a45d990240628c46fde6f',\n",
507
- " '62112a36dcab48379590ca210ca09918',\n",
508
- " '1abadc8684e64e6c8cbe1f7427d39678',\n",
509
- " 'f1f8c1a6c7534c5bb386425728cfa2b3',\n",
510
- " '81b640e8ae0747daadeda29da9f677f6',\n",
511
- " '6428b2a89f384a6985d69b0183fc71c3',\n",
512
- " '107fb02d9c7e4bffa9669509015e8af1',\n",
513
- " '73b8fd9b8aea45a6adfc02d5795bec62',\n",
514
- " '931cdf852b634abfb01b656221a8d0ae',\n",
515
- " '81e9f8177fe9430a9fb17fd20522c955',\n",
516
- " '7dbc18b381454afeb2a6041f60c2b23b',\n",
517
- " 'f621fcf8f34f4629909ca455ecaa4f55',\n",
518
- " 'c358be2860cf436d8fabb3200888c307',\n",
519
- " '81516460f65740e9aef0f4babc29b2f1',\n",
520
- " 'ecd2409c27cf47aeadeff569bd25ea85',\n",
521
- " 'a4c997a1ad7f4990b2d71cb028463610',\n",
522
- " '1ed8f8421791456db0543cd3e1ede40f',\n",
523
- " '56f5077192e74e09a58017d0c3368bc4',\n",
524
- " 'a639e85d3bee4530a53d132bfa7c58de',\n",
525
- " '4c75146c59dd4541a8500f89dd060a2c',\n",
526
- " '95c74438067f4bada1fee37942e06ed7',\n",
527
- " '0b3f010515574c48b02bebf7a451052e',\n",
528
- " '4db576ae022d42beadd921a81e977096',\n",
529
- " '3998f2de8bf44929afa7ad0e2e86eccd',\n",
530
- " '73cf1599b76d4061874e660228ca5f06',\n",
531
- " '962fb1291b984d60adb133201b7eae48',\n",
532
- " '365a19df65514c698d826e86fcdc6091',\n",
533
- " '5e86e99df25a4cc287d8ea0605f8cb08',\n",
534
- " 'f2cccab55efc43d5b098c38c31f687fb',\n",
535
- " '3388b8f7db314bf5a60cd10dbbc45f9c',\n",
536
- " '2a83e0ed7b4e4d2f906cfbc8dca7c512',\n",
537
- " '5b4f99c2acab40248de70a0e92506bc0',\n",
538
- " '278560e5a9e244e1a0a2ffa0ef7c261a',\n",
539
- " '864b65e24dea4473ad0e4a5bc32f4c69',\n",
540
- " '6b089bf4dd004ed78f1b92c50d414e47',\n",
541
- " '5bebbff6685649b99fa304d40b9b6362',\n",
542
- " 'aecb11bcf1444ad589508ea8bec77bdb']"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  ]
544
  },
545
- "execution_count": 13,
546
  "metadata": {},
547
  "output_type": "execute_result"
548
  }
@@ -554,13 +640,13 @@
554
  "from qdrant_client.http.models import Distance, VectorParams\n",
555
  "\n",
556
  "dimension = 1024\n",
557
- "collection_name = \"ai-safety-sr-arctic-embed-l-semantic\"\n",
558
  "qdrant_server = os.environ[\"QDRANT_API_URL\"]\n",
559
  "qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ[\"QDRANT_API_KEY\"])\n",
560
- "qdrant_client.create_collection(\n",
561
- " collection_name=collection_name,\n",
562
- " vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),\n",
563
- ")\n",
564
  "\n",
565
  "vector_store = QdrantVectorStore(\n",
566
  " client=qdrant_client,\n",
@@ -568,13 +654,13 @@
568
  " embedding=embedding_model,\n",
569
  ")\n",
570
  "\n",
571
- "vector_store.add_documents(chunked_docs)\n",
572
  "\n"
573
  ]
574
  },
575
  {
576
  "cell_type": "code",
577
- "execution_count": 14,
578
  "metadata": {},
579
  "outputs": [],
580
  "source": [
@@ -584,25 +670,25 @@
584
  },
585
  {
586
  "cell_type": "code",
587
- "execution_count": 15,
588
  "metadata": {},
589
  "outputs": [
590
  {
591
  "data": {
592
  "text/plain": [
593
- "[Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 44, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': 'b6779e22-20c4-44d3-8741-c06cc2bb380c', '_collection_name': 'ai-safety-sr-arctic-embed-l-semantic'}, page_content='Human-AI Configuration \\n'),\n",
594
- " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 33, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '26f3917c-c227-4e99-8e11-5212273fe2ba', '_collection_name': 'ai-safety-sr-arctic-embed-l-semantic'}, page_content='Human-AI Conguration \\n'),\n",
595
- " Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 11, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': '07a8b461-51e7-4641-b97a-d823ef91082f', '_collection_name': 'ai-safety-sr-arctic-embed-l-semantic'}, page_content=' \\n \\n \\nFROM \\nPRINCIPLES \\nTO PRACTICE \\nA TECHINCAL COMPANION TO\\nTHE Blueprint for an \\nAI BILL OF RIGHTS\\n12\\n'),\n",
596
- " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 37, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '1d43c4ca-f83b-4708-97e9-6410f4dae5ee', '_collection_name': 'ai-safety-sr-arctic-embed-l-semantic'}, page_content='Human-AI Configuration \\nAI Actor Tasks: AI Deployment, AI Impact Assessment, Domain Experts, Operation and Monitoring, TEVV \\n \\n'),\n",
597
- " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 61, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '73cf1599-b76d-4061-874e-660228ca5f06', '_collection_name': 'ai-safety-sr-arctic-embed-l-semantic'}, page_content='et al. (2023) Whose Opinions Do Language Models Reflect? arXiv.'),\n",
598
- " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 28, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '00ca0c39-98b6-4339-874a-d036983a0922', '_collection_name': 'ai-safety-sr-arctic-embed-l-semantic'}, page_content='Make sure these tests cover various scenarios, such as crisis \\nsituations or ethically sensitive contexts. Human-AI Configuration; \\nInformation Integrity; Harmful Bias \\nand Homogenization; Dangerous, \\nViolent, or Hateful Content \\nAI Actor Tasks: AI Design, AI Development, Domain Experts, End-Users, Human Factors, Operation and Monitoring \\n \\n'),\n",
599
- " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 59, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '81516460-f657-40e9-aef0-f4babc29b2f1', '_collection_name': 'ai-safety-sr-arctic-embed-l-semantic'}, page_content='https://www.rand.org/pubs/research_reports/RRA2977-2.html. Nicoletti, L. et al. (2023) Humans Are Biased. Generative Ai Is Even Worse. Bloomberg. https://www.bloomberg.com/graphics/2023-generative-ai-bias/. National Institute of Standards and Technology (2024) Adversarial Machine Learning: A Taxonomy and \\nTerminology of Attacks and Mitigations https://csrc.nist.gov/pubs/ai/100/2/e2023/final \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework. https://www.nist.gov/itl/ai-risk-management-framework \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework, Chapter 3: AI \\nRisks and Trustworthiness. https://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Foundational_Information/3-sec-characteristics \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework, Chapter 6: AI \\nRMF Profiles. https://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Core_And_Profiles/6-sec-profile \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework, Appendix A: \\nDescriptions of AI Actor Tasks. https://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Appendices/Appendix_A#:~:text=AI%20actors%\\n20in%20this%20category,data%20providers%2C%20system%20funders%2C%20product \\n'),\n",
600
- " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 57, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '1b34b9f6-1f16-4993-b738-7a73e961bf2b', '_collection_name': 'ai-safety-sr-arctic-embed-l-semantic'}, page_content='(2020) Overcoming Failures of Imagination in AI Infused System Development and \\nDeployment. arXiv.'),\n",
601
- " Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 0, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': '8dd5b1e7-fd46-4e2a-90c2-8a8eea8b0cb9', '_collection_name': 'ai-safety-sr-arctic-embed-l-semantic'}, page_content=' \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nBLUEPRINT FOR AN \\nAI BILL OF \\nRIGHTS \\nMAKING AUTOMATED \\nSYSTEMS WORK FOR \\nTHE AMERICAN PEOPLE \\nOCTOBER 2022 \\n'),\n",
602
- " Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 23, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': '8c64aecd-850b-48b3-bac2-16e73ebad1e0', '_collection_name': 'ai-safety-sr-arctic-embed-l-semantic'}, page_content='Some companies have instituted bias testing as part of their product \\nquality assessment and launch procedures, and in some cases this testing has led products to be changed or not \\nlaunched, preventing harm to the public. Federal government agencies have been developing standards and guidance \\nfor the use of automated systems in order to help prevent bias.')]"
603
  ]
604
  },
605
- "execution_count": 15,
606
  "metadata": {},
607
  "output_type": "execute_result"
608
  }
@@ -1094,14 +1180,20 @@
1094
  ],
1095
  "source": [
1096
  "# Vector Store with recursive chunked documents\n",
 
 
 
 
1097
  "\n",
1098
- "recursive_collection_name = \"ai-safety-sr-arctic-embed-l-recursive\"\n",
 
 
1099
  "\n",
1100
  "recursive_qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ[\"QDRANT_API_KEY\"])\n",
1101
- "# recursive_qdrant_client.create_collection(\n",
1102
- "# collection_name=recursive_collection_name,\n",
1103
- "# vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),\n",
1104
- "# )\n",
1105
  "\n",
1106
  "recursive_vector_store = QdrantVectorStore(\n",
1107
  " client=recursive_qdrant_client,\n",
 
24
  },
25
  {
26
  "cell_type": "code",
27
+ "execution_count": 6,
28
  "metadata": {},
29
  "outputs": [],
30
  "source": [
 
40
  },
41
  {
42
  "cell_type": "code",
43
+ "execution_count": 1,
44
  "metadata": {},
45
  "outputs": [
46
  {
47
  "name": "stderr",
48
  "output_type": "stream",
49
  "text": [
50
+ "/Users/jeevan/Documents/Learnings/ai-engineering-bootcamp/AIE4/mid-term/SafeGuardAI/.venv/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
51
  " from tqdm.autonotebook import tqdm, trange\n"
52
  ]
53
  }
 
56
  "# Embedding model - snowflake-arctic-embed-l\n",
57
  "from langchain_huggingface import HuggingFaceEmbeddings\n",
58
  "\n",
59
+ "model_name = \"jeevanions/finetuned_arctic-embedd-l\"\n",
60
  "embedding_model = HuggingFaceEmbeddings(model_name=model_name)"
61
  ]
62
  },
63
  {
64
  "cell_type": "code",
65
+ "execution_count": 4,
66
  "metadata": {},
67
  "outputs": [],
68
  "source": [
69
+ "from enum import Enum\n",
70
+ "from typing import List\n",
71
+ "from langchain_community.document_loaders import PyMuPDFLoader\n",
72
+ "from langchain_core.documents import Document\n",
73
+ "import asyncio\n",
74
+ "\n",
75
+ "class PDFLoaderWrapper():\n",
76
+ " class LoaderType(str, Enum):\n",
77
+ " PYMUPDF = \"pymupdf\"\n",
78
  "\n",
79
+ " def __init__(self, file_path: str | List[str] , loader_type: LoaderType = LoaderType.PYMUPDF):\n",
80
+ " self.file_path = file_path if isinstance(file_path, list) else [file_path]\n",
81
+ " self.loader_type = loader_type\n",
82
  "\n",
83
+ " async def aload(self) -> List[Document]:\n",
84
+ " all_docs = []\n",
85
+ " for file_path in self.file_path:\n",
86
+ " if self.loader_type == self.LoaderType.PYMUPDF:\n",
87
+ " try:\n",
88
+ " loader = PyMuPDFLoader(file_path)\n",
89
+ " docs = await loader.aload()\n",
90
+ " all_docs.extend(docs)\n",
91
+ " except Exception as e:\n",
92
+ " print(f\"Error loading file {file_path}: {e}\")\n",
93
+ " continue\n",
94
+ " return all_docs\n",
95
+ "\n"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": 7,
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
  "pdf_loader = PDFLoaderWrapper(\n",
105
  " documents_to_preload, PDFLoaderWrapper.LoaderType.PYMUPDF\n",
106
  ")\n",
107
+ "documents = await pdf_loader.aload()"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "metadata": {},
114
+ "outputs": [],
115
+ "source": [
116
+ "from langchain_experimental.text_splitter import SemanticChunker\n",
117
+ "\n",
118
+ "\n",
119
  "\n",
120
  "text_splitter = SemanticChunker(embedding_model, buffer_size=5, breakpoint_threshold_type=\"percentile\",breakpoint_threshold_amount=90)\n",
121
  "\n",
 
124
  },
125
  {
126
  "cell_type": "code",
127
+ "execution_count": 8,
128
  "metadata": {},
129
  "outputs": [],
130
  "source": [
 
140
  },
141
  {
142
  "cell_type": "code",
143
+ "execution_count": 9,
144
  "metadata": {},
145
  "outputs": [],
146
  "source": [
 
154
  },
155
  {
156
  "cell_type": "code",
157
+ "execution_count": 11,
158
  "metadata": {},
159
  "outputs": [
160
  {
161
  "data": {
162
  "text/plain": [
163
+ "['7e6a73422dd04376b5212e1c71275f5c',\n",
164
+ " '9fc355b29e534d4d9ea5d87d8c9bf77c',\n",
165
+ " '5e6a933b1bae4db7922b50a8a6bab44d',\n",
166
+ " '83ddb01ec3954f1dbdda744124a8c76d',\n",
167
+ " 'd6e795fad13242498f11b8bfd8216f7d',\n",
168
+ " 'ba26430bfb714249ac4ce151f6819ac2',\n",
169
+ " '06818b77aa704eeba5c7499d653c0433',\n",
170
+ " 'bfff4b85b5b440869b1fadcaf0ba0489',\n",
171
+ " '7810842f60234aa0bee8c81a6577fc73',\n",
172
+ " '85fbfbfdd52349dca312edcbd70af79e',\n",
173
+ " 'abddd6d787a9496c9c3f93d7386cd1c4',\n",
174
+ " 'da0779d479454c1ba82407a68d536997',\n",
175
+ " '39b0ea70a678462fbf305d1096338cc0',\n",
176
+ " 'aff239aa6f5c447d885b6f7b0346ed1f',\n",
177
+ " 'e486a65b3a414225aaee7cb21e44cd4a',\n",
178
+ " '16014bcfe609485d919b310f040ff735',\n",
179
+ " '163bee1b3971476d97bf3c92c2aafd32',\n",
180
+ " '10806d74287549288bc664e03e1528c8',\n",
181
+ " '1c8cb58feffc4bd38061f6bb47fc9974',\n",
182
+ " 'e48046904f4f44d8a5042385d618b84b',\n",
183
+ " 'adf374bab6264d75b4598e9e6b0c31f1',\n",
184
+ " 'dc30905487ec493398c89c5db30c8e41',\n",
185
+ " 'e571028cd6804e259d217f3936ab1ac5',\n",
186
+ " '5708400cf26f4f5ab83c151cab3f29b5',\n",
187
+ " '3d66be0e03ed4c6d8d6d4bd89c3885bc',\n",
188
+ " '6c1368d79ada4a87b0bcf0d5ecbb1d05',\n",
189
+ " '3364bb5cb27f4df9a84e671fc47f3299',\n",
190
+ " 'feea03d2dc68453fb545cf746ec9be62',\n",
191
+ " 'f1ba6b845006430798e4191b7bf63b7e',\n",
192
+ " '778f730ed4ce40bc872cac0b1320ee7d',\n",
193
+ " '322981b8743a4bd88c2366901593feae',\n",
194
+ " '48a6cebe5639442f87abdff4d4d7ad8f',\n",
195
+ " 'fcaab76461d04caf8cddaae701bc1ced',\n",
196
+ " '41d70fefc71a436892b6f724b4643bf8',\n",
197
+ " '01bdb6c2fa9b43879231c6aa88530bf6',\n",
198
+ " '6f4da3635bca41ecb6a3b5f6c53b842b',\n",
199
+ " 'c92f7b0ece1d4e12a116cb43ac7ab1d2',\n",
200
+ " '807f090f360c4a5b9a29387a6f73cb55',\n",
201
+ " '7f387826030348e691e24fb3a14cb470',\n",
202
+ " '8146a7c2a6784cc7bbbe698ea7c1676c',\n",
203
+ " '8f30db72c63b4b1b8cf598becd646e94',\n",
204
+ " 'ed405fa7b4f64e8db2944f5220a5c8fe',\n",
205
+ " '8f1aedce957141bea4dd5d532188567f',\n",
206
+ " '4e887209f4a64ee3b7b3f8232534a9d9',\n",
207
+ " '8ccb6b9f04614326ba3bf5f96e59a960',\n",
208
+ " '2a0f3e2b88da49efb934430eef0c447b',\n",
209
+ " '27c3630e2f4048b18454e1477885ef6d',\n",
210
+ " '522d9265216c448db3859c7ccf079bc0',\n",
211
+ " '29055cef02b14cb8adfb95db7359014d',\n",
212
+ " '458ae662b668416397a84995a6ef6e9c',\n",
213
+ " '3a2974f8cce8431ebb562d72fad43809',\n",
214
+ " '16c5eec5b4574d51bc35125af1624f57',\n",
215
+ " 'ce0060ece6724eac9999b0f5bd3e1dd9',\n",
216
+ " 'ae1d9dd5a3264480897857ec12bca123',\n",
217
+ " '69e6715305a6429db45a9d5db5ffde10',\n",
218
+ " 'f504045bcf334cc0b53f44fca8c6712d',\n",
219
+ " '3120dcf898d641ba99dfca96bb916662',\n",
220
+ " '792a483c0aa74dd48c0df8d92fcdd46d',\n",
221
+ " '192e82b275f84c60b06fc5bd18de1fbc',\n",
222
+ " '9b434b7acb85487883b924ac8191fbce',\n",
223
+ " '57e81edbe7604de4bc470a8f44feafbb',\n",
224
+ " 'a2267ad4ad2a4aad8b4cbcb36f8274ac',\n",
225
+ " '1d4aee78da4c43bdb16cccb4cc799b32',\n",
226
+ " '5b4987809648438d868140daf4d477e4',\n",
227
+ " '6218bded0a5a4d4e93cd8fca57072ff9',\n",
228
+ " '4e6d3c750f42485ea29cd176eec1cd62',\n",
229
+ " '01ca0d114b384e7fad435ac2338e0103',\n",
230
+ " '80dc9865efae432a952eec9f8c46f778',\n",
231
+ " 'e1e9da01efe54ddf8c98771b2d1135f1',\n",
232
+ " '3aae2eb905054cd7812127a4e78534dc',\n",
233
+ " '8edf2a7164614da7a283abb6ecbe9088',\n",
234
+ " 'cd82af66911c478e8d37d8cef1729de1',\n",
235
+ " 'be33bb91fb3d4208933839769fb98dda',\n",
236
+ " '0edbcc77fa6d4735a959dfe489a4204f',\n",
237
+ " '85abc246a02c4ab1aba4a39a7994c61f',\n",
238
+ " 'b9f2b7053e074e909faf2e691bf9c57e',\n",
239
+ " 'd6f1812813c547ac81c6c64d0bc28ef1',\n",
240
+ " 'c3a273cc36d7498c94255f47c2dcfa38',\n",
241
+ " '708872346a8d4731b77ac2b603c8b9a9',\n",
242
+ " '40577ac91f2f46b2be704852b3b7da73',\n",
243
+ " '012e24872e4843829d0a8c57897354f6',\n",
244
+ " '0aaeca11740c45ffab1ac96fec61d0b6',\n",
245
+ " '83c9873b978f49839a8b801c7c3456bf',\n",
246
+ " '6ea0896cf6244e9393323f48ceae351c',\n",
247
+ " 'cbbbc204feff4106ad07182a17663b65',\n",
248
+ " 'c96038d6efc6460587351b73ce06c477',\n",
249
+ " '65ff657993f7493a85bf7b8545822958',\n",
250
+ " 'a5f1605302dd446498e3d97d9a65ff5e',\n",
251
+ " 'e67ff0603d934f8daf490506e095654e',\n",
252
+ " 'fbb4ddc0fffa43c693b73f35e6b8b046',\n",
253
+ " '52b0f10d941b44ce91ab2d201c6abc39',\n",
254
+ " '8b87551f187741f3841131c45b1f98cf',\n",
255
+ " '8546fe434cce46a98183e85f9fbe508c',\n",
256
+ " '8b69c605429e4843871064329c727940',\n",
257
+ " '2c6215e1bf8148a39e1ee5044e7d25f6',\n",
258
+ " 'a800aa91c46a4ac6938205ccefbd8cc2',\n",
259
+ " '39c5beb614c9410f9db1006fe064c89d',\n",
260
+ " '61158196e2274bb6a6116040e306aa41',\n",
261
+ " '4ecdf2b619444dfeafdfa08c0d7a188b',\n",
262
+ " '2ef6c61937a346c4b7d45044d0ab9e3f',\n",
263
+ " '3aff173caadf422584f6d0d00b238250',\n",
264
+ " '199301aec5584b31bb173ef65af3c226',\n",
265
+ " '0329bbf676454d5cb83a754634a3b039',\n",
266
+ " '8892f081f8d74a519005c2aa0103343b',\n",
267
+ " 'f248adeb051a4f84b205f4d7f25c6bca',\n",
268
+ " '1e95159724754c3c827b4705357682a7',\n",
269
+ " '4578074dbac549799b4415cac01bb42a',\n",
270
+ " '6e0644408e9c44479f7a39aa2374cce7',\n",
271
+ " '5568071c77c14cbc810df23635d8a570',\n",
272
+ " '4866d8188ebd4a73a6b08c8952499880',\n",
273
+ " '317311af1cb24262a493af4dbd711502',\n",
274
+ " 'd8030df3c00141dab0a63ea4994b12ee',\n",
275
+ " '55019342d242454f81794ec6fc4ab672',\n",
276
+ " '5f6a23f645e94b8ba355cbbbc0edf86f',\n",
277
+ " '93f909dfc6884ea68be8f42f7222efeb',\n",
278
+ " '61ee9886027b46d8ac7feee9af1c135c',\n",
279
+ " '4328e2ca81584d3384e9dae389aa7a42',\n",
280
+ " 'fa9e59c2a3704dcc98bee24820a3b24e',\n",
281
+ " 'ca23ebefba4a4785b0c159b601b2684e',\n",
282
+ " '795ce21fd9ee41d2a307b17d1d2caaee',\n",
283
+ " '64d5b336913a48c5bca2b1a13950a84f',\n",
284
+ " '430128844f3a44239e1b8089cfa3cbc8',\n",
285
+ " '590ba6b4e9e347c9950c1e61eedf4a0a',\n",
286
+ " 'eb8292f67a2c491aab48a8721437801d',\n",
287
+ " '74fcf851178146beaf87dacf7e9880b3',\n",
288
+ " 'ba8b6e6dc7594131966758ee8cc1f281',\n",
289
+ " 'db5ff7f95ce84f799c44faa9174f9255',\n",
290
+ " 'f0da98b07de44206bbcce8bf8af5d03a',\n",
291
+ " '6aa11aa8b65e4c948e0e7902324e335c',\n",
292
+ " 'fa2a4dd8878a438c91bf462657c655e0',\n",
293
+ " '22b4dcec8583498b9f5c9272fa03fd03',\n",
294
+ " 'e89a54b5d0b24306a11f5ff4b34eea50',\n",
295
+ " 'f930c1ba25a1498d9964df40e039a2b6',\n",
296
+ " 'f3a5320dcd7741c7a7a7b5414a698bca',\n",
297
+ " '8aa395d5d9bc46eba9a4e6cb1424a5b0',\n",
298
+ " '315cb577037c4d468e6a109876406c2b',\n",
299
+ " '6ecdcd5051a1403da50545e71d6da209',\n",
300
+ " 'f440b367ddc64c3796d37c681e2f2611',\n",
301
+ " '22b4bcfc59f242e68724607b592a1570',\n",
302
+ " '35a5bacfa0724c47a3ef49dd707216a3',\n",
303
+ " '65dc165cf9604918870e1f8bc659edc9',\n",
304
+ " 'a39b7bb6d500424f89b0397cd58242a9',\n",
305
+ " 'b797378ce695439f9e33909cd9b59a19',\n",
306
+ " '5c807877b26447aca4cbd19a6d437706',\n",
307
+ " '7e47508998b54b01a4028350464b0651',\n",
308
+ " 'eb0ed3dcc27c413f86b97c6373a95b2d',\n",
309
+ " '36b590a2bce247b19c2493d77fc79f35',\n",
310
+ " '7eeb74d2a0a842f88a6cf54c22fdaf78',\n",
311
+ " '03aaa157c1b64b4583a22a8f68a041d5',\n",
312
+ " 'a52aa607634e42debe74aa23703ced9a',\n",
313
+ " 'fb61c2ab526a40d7973b277302c28884',\n",
314
+ " '133a7abc2b6744e1bb39936bfe4ab153',\n",
315
+ " '4819c2e044bb40baa0955d500726ae25',\n",
316
+ " '9a097681fac5463eab74d6b8d037c7f2',\n",
317
+ " 'eed0c6e24a7a4d60b50fd77560f09515',\n",
318
+ " '51c53cf71b1049108115f72111dd0628',\n",
319
+ " 'eb8eecc5375e4055b0f9dbb3ef9de2d5',\n",
320
+ " '35f84f29041243ef8c706f29c4291885',\n",
321
+ " '18b009a2f29349d3898cdf36945d8a79',\n",
322
+ " 'fc123881e9034da0822eb70738d6d32c',\n",
323
+ " 'b4f6c6f6f5a842b38d83984c338325a3',\n",
324
+ " 'f20d2030d1364928881c784f431e5bfc',\n",
325
+ " '493d5cf218ac4d4fbebfbc86cac36765',\n",
326
+ " 'b4126c1ea602422fa497d73be286c109',\n",
327
+ " 'a255ec54a9e944bf8e8ed4fb3750ea88',\n",
328
+ " '920aeb2be1694376992f641365e8bd43',\n",
329
+ " '8307d5c77e4346af954f9202f56420a3',\n",
330
+ " 'c91a1252fcd843fdaa67bf1716d37441',\n",
331
+ " '60492f40e6814fbca9475582b1f1b126',\n",
332
+ " '0a08b2e235394021a43c9e7b5ae9550d',\n",
333
+ " '8a2746def9a64df29cfc906498503baf',\n",
334
+ " 'd805a2f9efa549138d9e008beb51f92c',\n",
335
+ " 'af9273ba0f9d4124a83c4f8a1de7cf8f',\n",
336
+ " '6c58a810f69d4d7ca0d8de99fd52c776',\n",
337
+ " 'f407e14346b64292a81c666f5ab45ebe',\n",
338
+ " '9b0ba54ac74d4982b5bea390beb42fa5',\n",
339
+ " '54ea0f7eeb694034b7ca0a81b8a2439d',\n",
340
+ " '6b407ca21eea4650b52f1f06d5f50513',\n",
341
+ " '298b573bcc96424a8d20f2237dfafdf2',\n",
342
+ " 'e9ed96c3a795415e9fc7a6cfee48fdfb',\n",
343
+ " '1fe33d1cac824d38a7e7c935cf9d95e2',\n",
344
+ " '3dbf2b5152f243939ffd87fa068e469b',\n",
345
+ " 'ded73ec56ff94f5ca844b8f56a55c560',\n",
346
+ " 'cb3bf04b763249a7853cde719e0bfe37',\n",
347
+ " '7411abd2e00a46189a3341ff1a4077c7',\n",
348
+ " '49a9a48d88cb4b8ea9fb28d5300e6279',\n",
349
+ " '1aeb055eb2984cb1a67d1f8ba7b40975',\n",
350
+ " '4662c1887c504009bb5f3f6edfb8d8e0',\n",
351
+ " 'e1145b7317bc4695898567041071d13b',\n",
352
+ " '5124c1648cc1458c9747f3eea6a2a301',\n",
353
+ " 'e4a2f6f9d63047d1b768faa280c8686f',\n",
354
+ " '955259cbf75a427395cefe5bc5834d08',\n",
355
+ " 'af8c9f932d9a45d2956e32b6c2762dc9',\n",
356
+ " '8f03b01f28e6401f8779a8b98dd0e584',\n",
357
+ " '4c80fe955cce49c2866e565a7d2c1c58',\n",
358
+ " 'b35c3a592cc549bf8f37a4b470759ddb',\n",
359
+ " '81e348d53d7f47a1a8b63ce096bd5dc0',\n",
360
+ " '143d5eef78ab4561a97e1a54ad54bd6a',\n",
361
+ " 'f9e56fd50d79462bb649902940e78aa7',\n",
362
+ " '7ba07d3105ca4e30b3e292aaa315ab0e',\n",
363
+ " 'e55b0a37dfd34565a8b896fe1c30c5c0',\n",
364
+ " 'ff0bad0434074a789797a01fb93828fb',\n",
365
+ " '1263eee1196c4f52a87748bf1fc32faa',\n",
366
+ " '7e9f6b49537f4f7d86eb167c54ef53fd',\n",
367
+ " '517f9f9b9d664f05b12803bbade89cfe',\n",
368
+ " '3f7e8ca69eca44f2b02024de41a60747',\n",
369
+ " '36361f8ed6c84a1c88cbfa3c44620f94',\n",
370
+ " '3b1a1dd74fa44410a5f7b5923d194046',\n",
371
+ " '20c990f82aae4c27a65f427a61a4ff2e',\n",
372
+ " '6cf1b79772db47119861ae7368cdcfb9',\n",
373
+ " 'bd9c8cbb658540d5ba56054c0b9001e0',\n",
374
+ " 'c6d74fd0f00343f08dba101d12919539',\n",
375
+ " 'd69c356e507c4ec2a9894979c0dacec1',\n",
376
+ " 'ab981093667c4dfbb699a23631f0ece8',\n",
377
+ " '067cc55582b949b182315c7a782cb871',\n",
378
+ " '149f1899cdd5435693e03a6fe34faafd',\n",
379
+ " 'f11e46b3744d48359af1289b2c329670',\n",
380
+ " 'baacb9574ff04cdba608ab1213836623',\n",
381
+ " '91b3470ebdcf4105a0fd5b5ed8131588',\n",
382
+ " '202c5981c8874a0ba7516eb30361183c',\n",
383
+ " 'f12a70efb5534d4ba33731b2e02f3c16',\n",
384
+ " '9dd7f662df5346238845363b93374d67',\n",
385
+ " '68b06be60d17429b8bb72c3f48d9513e',\n",
386
+ " '2f4ea4436156470fa9c32c2b045100f7',\n",
387
+ " 'a3a5ffc9711b4feb9f2fa6a7b65017a1',\n",
388
+ " '461fd5b2359f4234af45e0c14dc754ee',\n",
389
+ " 'e9521091e9d54c469b426e451dcc549a',\n",
390
+ " '5f985651e7564c3abec885352e0e8c34',\n",
391
+ " '24cfff725b2b49e6aa0888777e6e6377',\n",
392
+ " 'c20f6353f47541a4bd6add6e2b49f5f7',\n",
393
+ " 'c17ad6f1db804af98021992177d05901',\n",
394
+ " '0c08d3080f1f403f8a25e9aadba4d515',\n",
395
+ " '2316394619e44d9eb3ec83fcadb03d71',\n",
396
+ " '109c1b954c114152800ab3dff35897b3',\n",
397
+ " '6f2da3f0de9e4821920927cd442df7f2',\n",
398
+ " 'cb610e59a9c54cfc8f6961cc1c84e9c6',\n",
399
+ " 'f35424710f1d401fa7934f6d5c418235',\n",
400
+ " '069ff4b21bc943bfa9dcdfa4fabc7d32',\n",
401
+ " '3567a864869841da9068c9646b6e35f3',\n",
402
+ " '4665091843c34983906f4dddb04908d8',\n",
403
+ " '94179992c73a4c4187ec6bbcefe79ce0',\n",
404
+ " '62d850b75fea48dabdf133627f8bac4c',\n",
405
+ " '3a27b1fd38d541d79cdf8834bde3e08b',\n",
406
+ " 'd37aac990eed4b4eaef154355283d51b',\n",
407
+ " 'a46cba7542234baaa29ac6349c1a8ff7',\n",
408
+ " 'dd54b984ac1048a693d12641e5d17fb0',\n",
409
+ " '967f7918604a4ee386c41ec4fdc78de4',\n",
410
+ " 'bbe13bb721054544b560c29eb101ec7a',\n",
411
+ " 'ee58b2e1ef584f73a16f947d4c2dac2b',\n",
412
+ " 'cb821f21e675460ca22a0feb58787658',\n",
413
+ " 'c87c4709fded48a6956776d3e2236040',\n",
414
+ " '981dde0ace23485e94aed52aa0ff9a7a',\n",
415
+ " '90f53d8039524f75ab132e7e7507c0c4',\n",
416
+ " 'ec166cb9fbac4947869a03b52e518401',\n",
417
+ " 'e9796c57fadf4226be635cacac670a7d',\n",
418
+ " '0d5b0f646568418ab53b0a1c7c3f649e',\n",
419
+ " 'a7477b8261624f5a9b79052649e5adde',\n",
420
+ " '1498599a5522410d9d72fd3f382d0093',\n",
421
+ " '7b32b94f550147ec93c49889c72ddab4',\n",
422
+ " 'dc38bf202eef471ea1abbdd6a44008b5',\n",
423
+ " '5789a475a4a6453eba7522b88536a56c',\n",
424
+ " 'ef17f685fb184e179587886fb3cf2fca',\n",
425
+ " '976d8b607ce941048d58376ce9cc86d3',\n",
426
+ " '07303a1defb046029ac61adc07dae7ec',\n",
427
+ " '9eb77576298742709ca790ef7114fbb7',\n",
428
+ " '618846b8c0ab4b3bb1a740e45d6c1f53',\n",
429
+ " 'dec492d1d29f4416957f1156d8c2982d',\n",
430
+ " 'd059a5a5d19c4490b17fca7d049bcabe',\n",
431
+ " 'ee4edd8f7f9b410bb60fd55bd6ac1010',\n",
432
+ " 'dedfda38a2a942558716d0c094b583a6',\n",
433
+ " '9499ddff0eef4d368e72a17a80df12b0',\n",
434
+ " '67c059163f6b49e1bd98ee30cd8913c1',\n",
435
+ " '1400b471ea774a51b67ed115ed3d2629',\n",
436
+ " 'eb168f728da042fb99b221e736cc58ba',\n",
437
+ " '2e13e8baa94c413fb36354751d6500a7',\n",
438
+ " '7a05c4e1ec244287bdc53434f385237e',\n",
439
+ " '77c3f0079d3442e59bf2a4292e3b9889',\n",
440
+ " 'b3ab4e9272914e1fa6ce549f615bb5af',\n",
441
+ " 'a219215029174816ad2ad730a419e4ab',\n",
442
+ " 'a97e6c5a132345cc9129513e8fd4f629',\n",
443
+ " '3c2f44dfb00f4a9cb83212190710d165',\n",
444
+ " '43717b33878246bd92a12a79c24eb6ad',\n",
445
+ " '00983beb4f4e4f0c85063db2412aa0a4',\n",
446
+ " '9ee6cebe64fe4c94aa8022db635a1834',\n",
447
+ " '1394c2cbe97b4561bf2871cab16ba969',\n",
448
+ " '70046c7fbe104d9cac9593156dfa3baa',\n",
449
+ " 'ccf1ade95ce24d33a915ea4723ee01e4',\n",
450
+ " 'cffd6cd2d3b34fbaa32d677294d2b811',\n",
451
+ " '8a7ebcd4521b432b803d0d6fa2c035c6',\n",
452
+ " 'a9d009255da74706825461a6c4c1aed8',\n",
453
+ " '377153d600df475f91971ea413ba1eb0',\n",
454
+ " '0f7c81ba23324798bbc25bf941f8e4b0',\n",
455
+ " 'f0a482198a1440628370e7bed1a85bc9',\n",
456
+ " '7da7c179070644f99e1b2fdab4b7fb75',\n",
457
+ " '25932ef3bd104c1aa25c7ba24c8225dd',\n",
458
+ " '62ed7a98dc964dc5b05838dc7fac652c',\n",
459
+ " '9a42744ce7ce40218341bfc98958244f',\n",
460
+ " '23b6cb8e8f1b403aa472a3b2138c75b1',\n",
461
+ " '3616f0b38d964fec980bd3d1353fbcae',\n",
462
+ " '0b23a4d006a0434d8a17a3c85675cc50',\n",
463
+ " 'c4dd0bc2654c48b88c4c645149145461',\n",
464
+ " '44bbccb8264b4831a95a7134fddb6986',\n",
465
+ " '9cef11cbe3c3455bbe304ca4b9f7a761',\n",
466
+ " '84c93c3fef4f48f1bd977dde14de25e7',\n",
467
+ " 'd410724332284b6491672429905d8841',\n",
468
+ " '58102d923dbb405fb8543001a39b2fa9',\n",
469
+ " '82a54c0a1a2d4297ab9667b1613e1a62',\n",
470
+ " '4dead17ef7144f4d97b447fda4aebe43',\n",
471
+ " '612f6cf6123e400e95e8ff7f0253d3fd',\n",
472
+ " '7a6f3cf51abc41a09254b70d01c3d7c0',\n",
473
+ " '9614ad1cda3f40b1942c6e2f1c6ac785',\n",
474
+ " 'ec99982b671b4bb799763c9b45b43af1',\n",
475
+ " '23cdc25d55214f5ab0728a9af99115a3',\n",
476
+ " '88c36979cd574d26a4abcec3d22c3dc6',\n",
477
+ " '00ddf8f6f08b415d97262cd9ce31037e',\n",
478
+ " '25eac9c21b8b497db1127b023daafac4',\n",
479
+ " 'd870e4c7e50f43e7b02258c5c74cf729',\n",
480
+ " 'ad27ad6c04864d9694ace786bc6453a7',\n",
481
+ " '568443576592490db5c3e54b196bf078',\n",
482
+ " '14db0f53ad8c471c8ea14c74f5d92f23',\n",
483
+ " 'aeacfbc1adda40ee81b7c8bf4e27ed81',\n",
484
+ " 'be506fd2d1ff4430af8fff44ab5d5dd4',\n",
485
+ " 'ad169d2df3e74117b8432cbb80b23682',\n",
486
+ " 'ca6e653042c54fbe8f66c715d0799e1b',\n",
487
+ " '34e46b2b561845b3bb58d8ae1497433c',\n",
488
+ " 'ef6470836d2f42fb96fa4d8a76029009',\n",
489
+ " 'e7f547eba08f46529116dcd17b971d44',\n",
490
+ " '0badc20385f74c7f8b21e6d8b4b77285',\n",
491
+ " '96e2d5a8b73941e299787afb22b56447',\n",
492
+ " '22bacdff64dd41899670c930d0bdc3a9',\n",
493
+ " 'fcc0f6618237436d98bdb9ee918f1689',\n",
494
+ " '840adc73a593400eb79ae99a972a8237',\n",
495
+ " '8a929883791a4584a2229f68d07d4e40',\n",
496
+ " 'f2bfce8fb0f845cfbfd0dd7c68489a0b',\n",
497
+ " '45c77fa7f00448cbad485da8fff9d59c',\n",
498
+ " '9ae29211013c49adab26fd76000de05f',\n",
499
+ " 'd9b4f6ff9eab4ab492262766d7dcc5f1',\n",
500
+ " 'edcce910e0e342f58b9aa60751b2f9e7',\n",
501
+ " '0b9ddfa4be1e466c92d6da22021263e7',\n",
502
+ " 'be8a0b13fe634a558a8f3787da07d3c6',\n",
503
+ " 'c7e8436b98534ab9992e7598d82f62a1',\n",
504
+ " 'a3d5e62deba04f64ae592da593863837',\n",
505
+ " 'a058d293c88842769184bd31cdd46157',\n",
506
+ " 'e134c211bbde4cee9c184481c861eb28',\n",
507
+ " 'f2853c4d3da9403ea054e1acbfb14c23',\n",
508
+ " '6095ecdf409e49e0894b8b409bfa61ef',\n",
509
+ " '9fcd650ff6e84d53b6c0d85cf51ee0e3',\n",
510
+ " 'eca00c45032a4b548ca0bbfca9123e52',\n",
511
+ " '20599a4f87034dba864a683a98aaa7c5',\n",
512
+ " 'af646536cdbc48f38149f5d37f768f6f',\n",
513
+ " '0849d591aace432e877b87405e8875f6',\n",
514
+ " '93aab2bc54894dad96b714936951540f',\n",
515
+ " '01f8251387af402fb3babd21db484b1f',\n",
516
+ " 'c8d5ec4643e9488284a72693ab903ee1',\n",
517
+ " 'f1ccf53b5cd34d12ab12e2bc1a54cc65',\n",
518
+ " 'ffcddc3efcf646449f996ca0c48b5dda',\n",
519
+ " '853dffcece54408db6d851745bbb1b6b',\n",
520
+ " '692f4bc7425f419ab5d716aae8553ebd',\n",
521
+ " '9260733ca60548f2a6385ab1ce923865',\n",
522
+ " '34705d1443714e7f9684d0d9bd67b2e1',\n",
523
+ " '4a9e6f60bd564c5c97f6babe95ea8fe0',\n",
524
+ " 'd5ab615c8ea64428b5dcb7214cada1fe',\n",
525
+ " '062c0c4daacf42da865484f4a30a588e',\n",
526
+ " '029dcc932f6b479aa20b3dde7e98423c',\n",
527
+ " '66542637c66842d9a0218f941ae9b5aa',\n",
528
+ " 'c2af2bc898b74a1eb9694292dd639f07',\n",
529
+ " 'a7e2168cb5644703aa95839b4ce030f5',\n",
530
+ " '18b4bb366fa84ade94295d9811a6bc03',\n",
531
+ " '4c405bded8094ac99314eca15a158636',\n",
532
+ " '555bcd57341047d49fbbf24bb643b85c',\n",
533
+ " 'fbf9a1f3639f4c6eb108b8fdc2f5f342',\n",
534
+ " '32d0da43076d40bb9d28bc852fd87d63',\n",
535
+ " '0535837a18714cba9768f823055518f5',\n",
536
+ " '0817188bfef4461f8f6d3df6d196c34f',\n",
537
+ " '6484425365594adab5a9f4bd7c36a83b',\n",
538
+ " '8ad7968e37a9408fb89fb78bb1c3005f',\n",
539
+ " 'b403a906399d45adbffcb10fecf0fecb',\n",
540
+ " 'ddf79f51d4544b989b0788987ee147c9',\n",
541
+ " 'ddaa17eb15674e29809dcf804449c645',\n",
542
+ " '0e876fb1cc00450097dfb79b2410395d',\n",
543
+ " '1397f3810eb64c69b8b049a1f10f8d13',\n",
544
+ " 'ac87d1446f1b44f28ca73d7e84145c57',\n",
545
+ " '69be5aedf644486f8a2fd88507532a56',\n",
546
+ " 'f2114973a4fe4d408ddf30b721b5ef10',\n",
547
+ " 'a2dca7cc375d4499848180bdb30c1445',\n",
548
+ " 'e09f62b7ccb24ec5b83c4c594b5d98d2',\n",
549
+ " '411b5054a2e4445dbfbca42783cff1c2',\n",
550
+ " '9c1e6c59375e4cc5b35a9cefa0cd2cbe',\n",
551
+ " '5a46f2388ff44c65aa7f64fb7c4323a0',\n",
552
+ " 'e1dbd1b6c3c3431a960fe2c21a6dde12',\n",
553
+ " '2374ce9cb0bf4e9eb0dff0e49bce6472',\n",
554
+ " '9a525105b44d4ca68e0adb98864f439c',\n",
555
+ " '050374e7aab94e7ba852d46393149296',\n",
556
+ " 'a9ee85e00ef94e1eaca35ac31a9f897a',\n",
557
+ " '72f950b08c12431f9e78310521956ee0',\n",
558
+ " '5b7fd2143f774014a47ccddc2f7b341b',\n",
559
+ " '7dad2a8ace574afe850c6ad1d94bca94',\n",
560
+ " 'd4c2c17550cb4a3f93e0baefd0736481',\n",
561
+ " 'd468ba975bd14da5ba7c12d62baea086',\n",
562
+ " '0a9b07ca34f3497a87736cedd9d3b717',\n",
563
+ " 'ee48a8101b8646048f22af19fbfcb15c',\n",
564
+ " '936aa920ab1e4639aac5adeadc4ddea8',\n",
565
+ " 'e0a43b13e02b443ab605f496d7883599',\n",
566
+ " '66ea7fb1c42e48f18cc8be7e19025fde',\n",
567
+ " 'db14e88e7a4b42b1b89c02c509436637',\n",
568
+ " '261c80c62a9d498c85e6015e8e7b6a00',\n",
569
+ " '380765573ec6450a8b9c88ca090e73dc',\n",
570
+ " '7ea2367162214d6b8568af4339fb566f',\n",
571
+ " '0b58820498a24afe9ebb0f1e732e61f6',\n",
572
+ " 'd14f20185d81474b8c693be9b25d819a',\n",
573
+ " 'b3c8a1723f35429eabca07670537dae9',\n",
574
+ " '30e9aaa2eea34b1aa721788c0936dcec',\n",
575
+ " 'edef675b0ae241bf908d9f33a2157b94',\n",
576
+ " 'bc9ffa0bef644901bd335eb6569becf5',\n",
577
+ " '710894a380aa4ab7b8a5f00998351fd7',\n",
578
+ " 'bca6ad1fc44747dcb3841ca028c933c1',\n",
579
+ " 'da0d232bd42d4143863bdc3662a3b0ba',\n",
580
+ " '3ca05f6fb84f400aa40647e909b99f86',\n",
581
+ " 'ac3d4115af5f4f9b84d9d91441476eea',\n",
582
+ " '8f95d44658fb467ebf078da25c5d3048',\n",
583
+ " '77522a3de4b64cba9cac3976ce486556',\n",
584
+ " '98b110db2b934b0092af19c5cb3d661d',\n",
585
+ " '92652663675848bd88952e6fa060510a',\n",
586
+ " '7e8b14b3189048698fa9dc991bd2e969',\n",
587
+ " 'a0f266fa20794f27b6884b84d61405df',\n",
588
+ " 'bef94726a254426ebb56b5e31cb84e6d',\n",
589
+ " '19aa6056079e43d59ec313be58b7d15c',\n",
590
+ " 'e9eaaf55533240acbe3e826243fcce5d',\n",
591
+ " '7daead857efd48d3a924121a75f1d554',\n",
592
+ " 'e5b478a30210476a80cadd87e6b08142',\n",
593
+ " 'df5d55b2532c4652be2d5931e1801b29',\n",
594
+ " 'b5835c1c3bae4a8daf54eba94e971f44',\n",
595
+ " 'a4683fd71d8e40bbabf4cef4456cd964',\n",
596
+ " 'a78e39b65ebf4c3c9b30aebd5ec79982',\n",
597
+ " 'b581cab62fb74ecfacd8f375873467d6',\n",
598
+ " 'dd441662fa93493b82bb1ee75062ab46',\n",
599
+ " '16247f37e8cf46ef801958417634dccb',\n",
600
+ " 'a58028cc6f8740849fb48649f2d962ad',\n",
601
+ " '4b38111bc5434823ac19c6b1a7f2f2d6',\n",
602
+ " '382e7f375c9443199ae54635cb98508c',\n",
603
+ " '5a249cd0ee6e467b8efe5ae1ed5a7ed3',\n",
604
+ " '3b2ec61acb7a403fa49b0fa7dd6cd781',\n",
605
+ " '768086e7a529403b9cf2ff8d2a7cbbb4',\n",
606
+ " '7ca4df9a94af468ca5ce2301fe4c6d62',\n",
607
+ " '98e495ddab774a0d979c936fe465f6e6',\n",
608
+ " '177c7f64f17c45ca9305ee781ca4262a',\n",
609
+ " '1e7ae0bf23214eefa127b74da16af4a9',\n",
610
+ " '4c21c9b290404bf8aa79158c05d14e05',\n",
611
+ " '2ef558347c014b4ca14aa4b3ec008922',\n",
612
+ " 'e8ca0578755a498a91f2d0e571315cd9',\n",
613
+ " 'a5354ab136384770bcff710559952c91',\n",
614
+ " '7843a15da9614bfab5b8ec924f06d3c4',\n",
615
+ " 'bc61624871844fa2881909e81482daec',\n",
616
+ " '9ab126657c5344e3b4a5fa8f434c6e8a',\n",
617
+ " 'cbfdf829144b42ec94c45b908e65b9cc',\n",
618
+ " '03181f2d16c649e6a71e1f34a9791c7d',\n",
619
+ " '0659af0b07884a948181548e4863ce05',\n",
620
+ " '01c1681b11ac4ed79ff841e4bbf04458',\n",
621
+ " 'bef030cd7fd54d2eaad06670fd10e5a4',\n",
622
+ " 'aa4247fba2564b339c8ad1cdad241ebf',\n",
623
+ " '96368c9f1df448298937970ab3e382dd',\n",
624
+ " 'f22b1a033e4549e7933131817d1f6dbb',\n",
625
+ " '2d7de339810a4d66ad1350385406a040',\n",
626
+ " 'b76569b1b16a45048e67630c823be3dd',\n",
627
+ " '291feba0e65240a39ab40b04898ed776',\n",
628
+ " 'dc42dc35613c4fbc8097d5bb3c60101a']"
629
  ]
630
  },
631
+ "execution_count": 11,
632
  "metadata": {},
633
  "output_type": "execute_result"
634
  }
 
640
  "from qdrant_client.http.models import Distance, VectorParams\n",
641
  "\n",
642
  "dimension = 1024\n",
643
+ "collection_name = \"ai-safety-sf-arctic-embed-l-semantic\"\n",
644
  "qdrant_server = os.environ[\"QDRANT_API_URL\"]\n",
645
  "qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ[\"QDRANT_API_KEY\"])\n",
646
+ "# qdrant_client.create_collection(\n",
647
+ "# collection_name=collection_name,\n",
648
+ "# vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),\n",
649
+ "# )\n",
650
  "\n",
651
  "vector_store = QdrantVectorStore(\n",
652
  " client=qdrant_client,\n",
 
654
  " embedding=embedding_model,\n",
655
  ")\n",
656
  "\n",
657
+ "vector_store.add_documents(recursive_chunked_docs)\n",
658
  "\n"
659
  ]
660
  },
661
  {
662
  "cell_type": "code",
663
+ "execution_count": 12,
664
  "metadata": {},
665
  "outputs": [],
666
  "source": [
 
670
  },
671
  {
672
  "cell_type": "code",
673
+ "execution_count": 13,
674
  "metadata": {},
675
  "outputs": [
676
  {
677
  "data": {
678
  "text/plain": [
679
+ "[Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 11, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': 'ed405fa7-b4f6-4e8d-b294-4f5220a5c8fe', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='FROM \\nPRINCIPLES \\nTO PRACTICE \\nA TECHINCAL COMPANION TO\\nTHE Blueprint for an \\nAI BILL OF RIGHTS\\n12'),\n",
680
+ " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 50, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '98b110db-2b93-4b00-92af-19c5cb3d661d', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='• Accessibility and reasonable \\naccommodations \\n• AI actor credentials and qualications \\n• Alignment to organizational values \\n• Auditing and assessment \\n• Change-management controls \\n• Commercial use \\n• Data provenance'),\n",
681
+ " Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 19, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': 'e1e9da01-efe5-4ddf-8c98-771b2d1135f1', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='organization’s business processes or other activities, system goals, any human-run procedures that form a \\npart of the system, and specific performance expectations; a description of any data used to train machine \\nlearning models or for other purposes, including how data sources were processed and interpreted, a \\nsummary of what data might be missing, incomplete, or erroneous, and data relevancy justifications; the \\nresults of public consultation such as concerns raised and any decisions made due to these concerns; risk \\nidentification and management assessments and any steps taken to mitigate potential harms; the results of \\nperformance testing including, but not limited to, accuracy, differential demographic impact, resulting \\nerror rates (overall and per demographic group), and comparisons to previously deployed systems; \\nongoing monitoring procedures and regular performance testing reports, including monitoring frequency,'),\n",
682
+ " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 51, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': 'bef94726-a254-426e-bb56-b5e31cb84e6d', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='lifecycle and informed by representative AI Actors (see Figure 3 of the AI RMF). Until new and rigorous'),\n",
683
+ " Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 25, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': '2c6215e1-bf81-48a3-9e1e-e5044e7d25f6', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='for any resulting algorithmic discrimination. \\n26\\nAlgorithmic \\nDiscrimination \\nProtections'),\n",
684
+ " Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 0, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': '7e6a7342-2dd0-4376-b521-2e1c71275f5c', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='BLUEPRINT FOR AN \\nAI BILL OF \\nRIGHTS \\nMAKING AUTOMATED \\nSYSTEMS WORK FOR \\nTHE AMERICAN PEOPLE \\nOCTOBER 2022'),\n",
685
+ " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 38, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '411b5054-a2e4-445d-bfbc-a42783cff1c2', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='guide the design of provenance data-tracking techniques. \\nHuman-AI Configuration; \\nInformation Integrity \\nMS-2.10-003 Verify deduplication of GAI training data samples, particularly regarding synthetic \\ndata. \\nHarmful Bias and Homogenization \\nAI Actor Tasks: AI Deployment, AI Impact Assessment, Domain Experts, End-Users, Operation and Monitoring, TEVV'),\n",
686
+ " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 59, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '9ab12665-7c53-44e3-b4a5-fa8f434c6e8a', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='https://www.bloomberg.com/graphics/2023-generative-ai-bias/. \\nNational Institute of Standards and Technology (2024) Adversarial Machine Learning: A Taxonomy and \\nTerminology of Attacks and Mitigations https://csrc.nist.gov/pubs/ai/100/2/e2023/final \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework. \\nhttps://www.nist.gov/itl/ai-risk-management-framework \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework, Chapter 3: AI \\nRisks and Trustworthiness. \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Foundational_Information/3-sec-characteristics \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework, Chapter 6: AI \\nRMF Profiles. https://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Core_And_Profiles/6-sec-profile \\nNational Institute of Standards and Technology (2023) AI Risk Management Framework, Appendix A: \\nDescriptions of AI Actor Tasks.'),\n",
687
+ " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 57, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '98e495dd-ab77-4a0d-979c-936fe465f6e6', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='54 \\nAppendix B. References \\nAcemoglu, D. (2024) The Simple Macroeconomics of AI https://www.nber.org/papers/w32487 \\nAI Incident Database. https://incidentdatabase.ai/ \\nAtherton, D. (2024) Deepfakes and Child Safety: A Survey and Analysis of 2023 Incidents and Responses. \\nAI Incident Database. https://incidentdatabase.ai/blog/deepfakes-and-child-safety/ \\nBadyal, N. et al. (2023) Intentional Biases in LLM Responses. arXiv. https://arxiv.org/pdf/2311.07611 \\nBing Chat: Data Exfiltration Exploit Explained. Embrace The Red. \\nhttps://embracethered.com/blog/posts/2023/bing-chat-data-exfiltration-poc-and-fix/ \\nBommasani, R. et al. (2022) Picking on the Same Person: Does Algorithmic Monoculture lead to Outcome \\nHomogenization? arXiv. https://arxiv.org/pdf/2211.13972 \\nBoyarskaya, M. et al. (2020) Overcoming Failures of Imagination in AI Infused System Development and \\nDeployment. arXiv. https://arxiv.org/pdf/2011.13416 \\nBrowne, D. et al. (2023) Securing the AI Pipeline. Mandiant.'),\n",
688
+ " Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 12, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '4dead17e-f714-4f4d-97b4-47fda4aebe43', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='Priorities Related to Information Integrity Research and Development.')]"
689
  ]
690
  },
691
+ "execution_count": 13,
692
  "metadata": {},
693
  "output_type": "execute_result"
694
  }
 
1180
  ],
1181
  "source": [
1182
  "# Vector Store with recursive chunked documents\n",
1183
+ "from langchain_qdrant import QdrantVectorStore\n",
1184
+ "from langchain_core.documents import Document\n",
1185
+ "from qdrant_client import QdrantClient\n",
1186
+ "from qdrant_client.http.models import Distance, VectorParams\n",
1187
  "\n",
1188
+ "dimension = 1024\n",
1189
+ "qdrant_server = os.environ[\"QDRANT_API_URL\"]\n",
1190
+ "recursive_collection_name = \"ai-safety-ft-arctic-embed-l-recursive\"\n",
1191
  "\n",
1192
  "recursive_qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ[\"QDRANT_API_KEY\"])\n",
1193
+ "recursive_qdrant_client.create_collection(\n",
1194
+ " collection_name=recursive_collection_name,\n",
1195
+ " vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),\n",
1196
+ ")\n",
1197
  "\n",
1198
  "recursive_vector_store = QdrantVectorStore(\n",
1199
  " client=recursive_qdrant_client,\n",
app.py CHANGED
@@ -46,7 +46,7 @@ Now preloading below documents:
46
  Please wait for a moment to load the documents.
47
  """
48
  chat_model_name = "gpt-4o"
49
- embedding_model_name = "Snowflake/snowflake-arctic-embed-l"
50
  chat_model = ChatOpenAI(model=chat_model_name, temperature=0)
51
 
52
  async def connect_to_qdrant():
@@ -99,7 +99,6 @@ def get_text_splitter(strategy, embedding_model):
99
  if strategy == "semantic":
100
  return SemanticChunker(
101
  embedding_model,
102
- buffer_size=3,
103
  breakpoint_threshold_type="percentile",
104
  breakpoint_threshold_amount=90,
105
  )
@@ -246,4 +245,4 @@ async def main(message: cl.Message):
246
  if __name__ == "__main__":
247
  from chainlit.cli import run_chainlit
248
 
249
- run_chainlit(__file__)
 
46
  Please wait for a moment to load the documents.
47
  """
48
  chat_model_name = "gpt-4o"
49
+ embedding_model_name = "jeevanions/finetuned_arctic-embedd-l" # Fine tuned model used
50
  chat_model = ChatOpenAI(model=chat_model_name, temperature=0)
51
 
52
  async def connect_to_qdrant():
 
99
  if strategy == "semantic":
100
  return SemanticChunker(
101
  embedding_model,
 
102
  breakpoint_threshold_type="percentile",
103
  breakpoint_threshold_amount=90,
104
  )
 
245
  if __name__ == "__main__":
246
  from chainlit.cli import run_chainlit
247
 
248
+ run_chainlit(__file__)