diff --git "a/spacy.ipynb" "b/spacy.ipynb" new file mode 100644--- /dev/null +++ "b/spacy.ipynb" @@ -0,0 +1,1415 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: spacy in ./.venv/lib/python3.12/site-packages (3.7.6)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in ./.venv/lib/python3.12/site-packages (from spacy) (3.0.12)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in ./.venv/lib/python3.12/site-packages (from spacy) (1.0.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in ./.venv/lib/python3.12/site-packages (from spacy) (1.0.10)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in ./.venv/lib/python3.12/site-packages (from spacy) (2.0.8)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in ./.venv/lib/python3.12/site-packages (from spacy) (3.0.9)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in ./.venv/lib/python3.12/site-packages (from spacy) (8.2.5)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in ./.venv/lib/python3.12/site-packages (from spacy) (1.1.3)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in ./.venv/lib/python3.12/site-packages (from spacy) (2.4.8)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in ./.venv/lib/python3.12/site-packages (from spacy) (2.0.10)\n", + "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in ./.venv/lib/python3.12/site-packages (from spacy) (0.4.1)\n", + "Requirement already satisfied: typer<1.0.0,>=0.3.0 in ./.venv/lib/python3.12/site-packages (from spacy) (0.12.5)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in ./.venv/lib/python3.12/site-packages (from spacy) (4.66.5)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in ./.venv/lib/python3.12/site-packages (from spacy) (2.32.3)\n", + "Requirement already satisfied: 
pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in ./.venv/lib/python3.12/site-packages (from spacy) (2.8.2)\n", + "Requirement already satisfied: jinja2 in ./.venv/lib/python3.12/site-packages (from spacy) (3.1.4)\n", + "Requirement already satisfied: setuptools in ./.venv/lib/python3.12/site-packages (from spacy) (74.1.2)\n", + "Requirement already satisfied: packaging>=20.0 in ./.venv/lib/python3.12/site-packages (from spacy) (24.1)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in ./.venv/lib/python3.12/site-packages (from spacy) (3.4.0)\n", + "Requirement already satisfied: numpy>=1.19.0 in ./.venv/lib/python3.12/site-packages (from spacy) (1.26.4)\n", + "Requirement already satisfied: language-data>=1.2 in ./.venv/lib/python3.12/site-packages (from langcodes<4.0.0,>=3.2.0->spacy) (1.2.0)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in ./.venv/lib/python3.12/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.20.1 in ./.venv/lib/python3.12/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (2.20.1)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in ./.venv/lib/python3.12/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (4.12.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in ./.venv/lib/python3.12/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in ./.venv/lib/python3.12/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in ./.venv/lib/python3.12/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.2.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in ./.venv/lib/python3.12/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2024.7.4)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in ./.venv/lib/python3.12/site-packages (from thinc<8.3.0,>=8.2.2->spacy) 
(0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in ./.venv/lib/python3.12/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.1.5)\n", + "Requirement already satisfied: click>=8.0.0 in ./.venv/lib/python3.12/site-packages (from typer<1.0.0,>=0.3.0->spacy) (8.1.7)\n", + "Requirement already satisfied: shellingham>=1.3.0 in ./.venv/lib/python3.12/site-packages (from typer<1.0.0,>=0.3.0->spacy) (1.5.4)\n", + "Requirement already satisfied: rich>=10.11.0 in ./.venv/lib/python3.12/site-packages (from typer<1.0.0,>=0.3.0->spacy) (13.8.0)\n", + "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in ./.venv/lib/python3.12/site-packages (from weasel<0.5.0,>=0.1.0->spacy) (0.19.0)\n", + "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in ./.venv/lib/python3.12/site-packages (from weasel<0.5.0,>=0.1.0->spacy) (7.0.4)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib/python3.12/site-packages (from jinja2->spacy) (2.1.5)\n", + "Requirement already satisfied: marisa-trie>=0.7.7 in ./.venv/lib/python3.12/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy) (1.2.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in ./.venv/lib/python3.12/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in ./.venv/lib/python3.12/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy) (2.18.0)\n", + "Requirement already satisfied: wrapt in ./.venv/lib/python3.12/site-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy) (1.16.0)\n", + "Requirement already satisfied: mdurl~=0.1 in ./.venv/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy) (0.1.2)\n", + "Collecting en-core-web-sm==3.7.1\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m22.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: spacy<3.8.0,>=3.7.2 in ./.venv/lib/python3.12/site-packages (from en-core-web-sm==3.7.1) (3.7.6)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)\n", + "Requirement already satisfied: thinc<8.3.0,>=8.2.2 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.5)\n", + "Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.3)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)\n", + "Requirement already satisfied: weasel<0.5.0,>=0.1.0 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.4.1)\n", + "Requirement already satisfied: typer<1.0.0,>=0.3.0 in 
./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.12.5)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.66.5)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.32.3)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.8.2)\n", + "Requirement already satisfied: jinja2 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.4)\n", + "Requirement already satisfied: setuptools in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (74.1.2)\n", + "Requirement already satisfied: packaging>=20.0 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (24.1)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4.0)\n", + "Requirement already satisfied: numpy>=1.19.0 in ./.venv/lib/python3.12/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.4)\n", + "Requirement already satisfied: language-data>=1.2 in ./.venv/lib/python3.12/site-packages (from langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.2.0)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in ./.venv/lib/python3.12/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.20.1 in ./.venv/lib/python3.12/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.20.1)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in 
./.venv/lib/python3.12/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.12.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in ./.venv/lib/python3.12/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in ./.venv/lib/python3.12/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in ./.venv/lib/python3.12/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.2.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in ./.venv/lib/python3.12/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2024.7.4)\n", + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in ./.venv/lib/python3.12/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in ./.venv/lib/python3.12/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.5)\n", + "Requirement already satisfied: click>=8.0.0 in ./.venv/lib/python3.12/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)\n", + "Requirement already satisfied: shellingham>=1.3.0 in ./.venv/lib/python3.12/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.5.4)\n", + "Requirement already satisfied: rich>=10.11.0 in ./.venv/lib/python3.12/site-packages (from typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (13.8.0)\n", + "Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in ./.venv/lib/python3.12/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.19.0)\n", + "Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in 
./.venv/lib/python3.12/site-packages (from weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (7.0.4)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib/python3.12/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.5)\n", + "Requirement already satisfied: marisa-trie>=0.7.7 in ./.venv/lib/python3.12/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.2.0)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in ./.venv/lib/python3.12/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in ./.venv/lib/python3.12/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.18.0)\n", + "Requirement already satisfied: wrapt in ./.venv/lib/python3.12/site-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.16.0)\n", + "Requirement already satisfied: mdurl~=0.1 in ./.venv/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.2)\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('en_core_web_sm')\n", + "Collecting ipywidgets\n", + " Using cached ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)\n", + "Requirement already satisfied: comm>=0.1.3 in ./.venv/lib/python3.12/site-packages (from ipywidgets) (0.2.2)\n", + "Requirement already satisfied: ipython>=6.1.0 in ./.venv/lib/python3.12/site-packages (from ipywidgets) (8.27.0)\n", + "Requirement already satisfied: traitlets>=4.3.1 in ./.venv/lib/python3.12/site-packages (from ipywidgets) (5.14.3)\n", + "Collecting widgetsnbextension~=4.0.12 (from ipywidgets)\n", + " Using cached widgetsnbextension-4.0.13-py3-none-any.whl.metadata 
(1.6 kB)\n", + "Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)\n", + " Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)\n", + "Requirement already satisfied: decorator in ./.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)\n", + "Requirement already satisfied: jedi>=0.16 in ./.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (0.19.1)\n", + "Requirement already satisfied: matplotlib-inline in ./.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (0.1.7)\n", + "Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in ./.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.47)\n", + "Requirement already satisfied: pygments>=2.4.0 in ./.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (2.18.0)\n", + "Requirement already satisfied: stack-data in ./.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)\n", + "Requirement already satisfied: pexpect>4.3 in ./.venv/lib/python3.12/site-packages (from ipython>=6.1.0->ipywidgets) (4.9.0)\n", + "Requirement already satisfied: parso<0.9.0,>=0.8.3 in ./.venv/lib/python3.12/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.4)\n", + "Requirement already satisfied: ptyprocess>=0.5 in ./.venv/lib/python3.12/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)\n", + "Requirement already satisfied: wcwidth in ./.venv/lib/python3.12/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.13)\n", + "Requirement already satisfied: executing>=1.2.0 in ./.venv/lib/python3.12/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.1.0)\n", + "Requirement already satisfied: asttokens>=2.1.0 in ./.venv/lib/python3.12/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.4.1)\n", + "Requirement already satisfied: pure-eval in ./.venv/lib/python3.12/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) 
(0.2.3)\n", + "Requirement already satisfied: six>=1.12.0 in ./.venv/lib/python3.12/site-packages (from asttokens>=2.1.0->stack-data->ipython>=6.1.0->ipywidgets) (1.16.0)\n", + "Using cached ipywidgets-8.1.5-py3-none-any.whl (139 kB)\n", + "Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)\n", + "Using cached widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)\n", + "Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets\n", + "Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13\n" + ] + } + ], + "source": [ + "!pip install spacy\n", + "!python -m spacy download en_core_web_sm\n", + "!pip install ipywidgets" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import spacy\n", + "import dotenv\n", + "from rich.console import Console\n", + "\n", + "import web_rag as wr\n", + "import web_crawler as wc\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "console = Console()\n", + "dotenv.load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "if not spacy.util.is_package(\"en_core_web_sm\"):\n", + " print(\"Downloading en_core_web_sm model...\")\n", + " spacy.cli.download(\"en_core_web_sm\")\n", + " print(\"Model downloaded successfully!\")\n", + "nlp = spacy.load(\"en_core_web_sm\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "chat, embedding_model = wr.get_models(\"openai\", \"gpt-4o-mini\", 0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Elizabeth Holmes trial verdict guilty\n", + "\n" + ], + "text/plain": [ + "Elizabeth Holmes trial verdict guilty\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from datetime import date\n", + "today = date.today().strftime(\"%B %d, %Y\")\n", + "query = f\"Is Elizabeth Holmes guilty?\"\n", + "optimized_search_query = wr.optimize_search_query(chat, query)\n", + "console.print(optimized_search_query)" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
retrieved 5 contents\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "retrieved \u001b[1;36m5\u001b[0m contents\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "sources = wc.get_sources(optimized_search_query, max_pages=5)\n",
+ "contents = wc.get_links_contents(sources)\n",
+ "console.print(f\"retrieved {len(contents)} contents\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 110,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/6k/6b1pw86x1yvc1_ds42gc1ms80000gq/T/ipykernel_7568/1451878299.py:13: UserWarning: [W007] The model you're using has no word vectors loaded, so the result of the Doc.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available.\n",
+ " if prev_sent.similarity(curr_sent) >= similarity_threshold:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Created 115 semantic chunks\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "Created \u001b[1;36m115\u001b[0m semantic chunks\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Sample chunk:\n", + "\n" + ], + "text/plain": [ + "Sample chunk:\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
{\n", + " 'text': \"Theranos founder Elizabeth Holmes was sentenced Friday to more than 11 years in prison for fraud after\n", + "deceiving investors about the purported efficacy of her company's blood-testing technology.\",\n", + " 'metadata': {\n", + " 'title': 'Theranos founder Elizabeth Holmes sentenced to more than 11 years in prison',\n", + " 'source': \n", + "'https://www.cnbc.com/2022/11/18/former-theranos-ceo-elizabeth-holmes-sentenced-to-more-than-11-years-in-prison.htm\n", + "l'\n", + " }\n", + "}\n", + "\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + " \u001b[32m'text'\u001b[0m: \u001b[32m\"Theranos founder Elizabeth Holmes was sentenced Friday to more than 11 years in prison for fraud after\u001b[0m\n", + "\u001b[32mdeceiving investors about the purported efficacy of her company's blood-testing technology.\"\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[32m'title'\u001b[0m: \u001b[32m'Theranos founder Elizabeth Holmes sentenced to more than 11 years in prison'\u001b[0m,\n", + " \u001b[32m'source'\u001b[0m: \n", + "\u001b[32m'https://www.cnbc.com/2022/11/18/former-theranos-ceo-elizabeth-holmes-sentenced-to-more-than-11-years-in-prison.htm\u001b[0m\n", + "\u001b[32ml'\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from itertools import groupby\n", + "from operator import itemgetter\n", + "\n", + "def group_similar_sentences(sentences, similarity_threshold=0.5):\n", + " grouped_sentences = []\n", + " for i, sentence in enumerate(sentences):\n", + " if i == 0:\n", + " grouped_sentences.append([sentence])\n", + " else:\n", + " prev_group = grouped_sentences[-1]\n", + " prev_sent = nlp(' '.join([s.text for s in prev_group]))\n", + " curr_sent = nlp(sentence.text)\n", + " if prev_sent.similarity(curr_sent) >= similarity_threshold:\n", + " prev_group.append(sentence)\n", + " else:\n", + " 
grouped_sentences.append([sentence])\n", + " return grouped_sentences\n", + "\n", + "# Initialize an empty list to store the semantically split content\n", + "semantic_chunks = []\n", + "\n", + "# Iterate through the contents\n", + "for content in contents:\n", + " # Extract the page content\n", + " page_content = content.get('page_content', '')\n", + " \n", + " if page_content:\n", + " # Process the text with spaCy\n", + " doc = nlp(page_content)\n", + " \n", + " # Split the content into sentences\n", + " sentences = list(doc.sents)\n", + " \n", + " # Group similar sentences\n", + " grouped_sentences = group_similar_sentences(sentences)\n", + " \n", + " for group in grouped_sentences:\n", + " # Create a chunk with the grouped sentences and metadata\n", + " chunk = {\n", + " 'text': ' '.join([sent.text for sent in group]),\n", + " 'metadata': {\n", + " 'title': content.get('title', ''),\n", + " 'source': content.get('link', '')\n", + " }\n", + " }\n", + " semantic_chunks.append(chunk)\n", + "\n", + "# Print the number of semantic chunks created\n", + "console.print(f\"Created {len(semantic_chunks)} semantic chunks\")\n", + "\n", + "# Optionally, print a sample chunk to verify the structure\n", + "if semantic_chunks:\n", + " console.print(\"Sample chunk:\")\n", + " console.print(semantic_chunks[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "invalid syntax (1591802571.py, line 33)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m Cell \u001b[0;32mIn[103], line 33\u001b[0;36m\u001b[0m\n\u001b[0;31m sents, vecs = process(contents[0].)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import spacy\n", + "\n", + "# Load the Spacy model\n", + "nlp = spacy.load('en_core_web_sm')\n", + "\n", + "def process(text):\n", + " doc = nlp(text)\n", + " sents = 
list(doc.sents)\n", + " vecs = np.stack([sent.vector / sent.vector_norm for sent in sents])\n", + "\n", + " return sents, vecs\n", + "\n", + "def cluster_text(sents, vecs, threshold):\n", + " clusters = [[0]]\n", + " for i in range(1, len(sents)):\n", + " if np.dot(vecs[i], vecs[i-1]) < threshold:\n", + " clusters.append([])\n", + " clusters[-1].append(i)\n", + " \n", + " return clusters\n", + "\n", + "def clean_text(text):\n", + " # Add your text cleaning process here\n", + " return text\n", + "\n", + "# Initialize the clusters lengths list and final texts list\n", + "clusters_lens = []\n", + "final_texts = []\n", + "\n", + "# Process the chunk\n", + "threshold = 0.3\n", + "sents, vecs = process(contents[0].)\n", + "\n", + "# Cluster the sentences\n", + "clusters = cluster_text(sents, vecs, threshold)\n", + "\n", + "for cluster in clusters:\n", + " cluster_txt = clean_text(' '.join([sents[i].text for i in cluster]))\n", + " cluster_len = len(cluster_txt)\n", + " \n", + " # Check if the cluster is too short\n", + " if cluster_len < 60:\n", + " continue\n", + " \n", + " # Check if the cluster is too long\n", + " elif cluster_len > 3000:\n", + " threshold = 0.6\n", + " sents_div, vecs_div = process(cluster_txt)\n", + " reclusters = cluster_text(sents_div, vecs_div, threshold)\n", + " \n", + " for subcluster in reclusters:\n", + " div_txt = clean_text(' '.join([sents_div[i].text for i in subcluster]))\n", + " div_len = len(div_txt)\n", + " \n", + " if div_len < 60 or div_len > 3000:\n", + " continue\n", + " \n", + " clusters_lens.append(div_len)\n", + " final_texts.append(div_txt)\n", + " \n", + " else:\n", + " clusters_lens.append(cluster_len)\n", + " final_texts.append(cluster_txt)" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/6k/6b1pw86x1yvc1_ds42gc1ms80000gq/T/ipykernel_7568/342810436.py:8: UserWarning: [W007] The model you're using has 
no word vectors loaded, so the result of the Doc.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available.\n", + " similarity = query_doc.similarity(chunk_doc)\n" + ] + }, + { + "data": { + "text/html": [ + "
Found 8 relevant chunks\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "Found \u001b[1;36m8\u001b[0m relevant chunks\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n", + "Top relevant chunks:\n", + "\n" + ], + "text/plain": [ + "\n", + "Top relevant chunks:\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "1. Similarity: 0.6973\n", + "\n" + ], + "text/plain": [ + "\n", + "\u001b[1;36m1\u001b[0m. Similarity: \u001b[1;36m0.6973\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Text: Related Content\n", + "Press Release\n", + "SAN JOSE – Elizabeth A. Holmes was sentenced today to 135 months (11 years, 3 months) in federal prison for \n", + "defrauding investors in Theranos, Inc. of hundreds of millions of dollars, announced United States Attorney \n", + "Stephanie M. Hinds, Federal Bureau of Investigation Special Agent in Charge Robert K. Tripp, Food and Drug \n", + "Administration (FDA) Assistant Commissioner for Criminal Investigations Catherine A. Hermsen, and U.S. Postal \n", + "Inspection Service (USPIS) San Francisco Division Acting Inspector in Charge Kevin Rho. The sentence was handed \n", + "down by United States District Judge Edward J. Davila.\n", + " “Silicon Valley has seen the rise of companies whose inventions have changed the world and, through intellectual \n", + "prowess, hard work, and sheer determination, this region continues to innovate,” said U.S. Attorney Stephanie M. \n", + "Hinds.\n", + "\n" + ], + "text/plain": [ + "Text: Related Content\n", + "Press Release\n", + "SAN JOSE – Elizabeth A. Holmes was sentenced today to \u001b[1;36m135\u001b[0m months \u001b[1m(\u001b[0m\u001b[1;36m11\u001b[0m years, \u001b[1;36m3\u001b[0m months\u001b[1m)\u001b[0m in federal prison for \n", + "defrauding investors in Theranos, Inc. of hundreds of millions of dollars, announced United States Attorney \n", + "Stephanie M. Hinds, Federal Bureau of Investigation Special Agent in Charge Robert K. Tripp, Food and Drug \n", + "Administration \u001b[1m(\u001b[0mFDA\u001b[1m)\u001b[0m Assistant Commissioner for Criminal Investigations Catherine A. Hermsen, and U.S. Postal \n", + "Inspection Service \u001b[1m(\u001b[0mUSPIS\u001b[1m)\u001b[0m San Francisco Division Acting Inspector in Charge Kevin Rho. The sentence was handed \n", + "down by United States District Judge Edward J. 
Davila.\n", + " “Silicon Valley has seen the rise of companies whose inventions have changed the world and, through intellectual \n", + "prowess, hard work, and sheer determination, this region continues to innovate,” said U.S. Attorney Stephanie M. \n", + "Hinds.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Source: \n", + "https://www.justice.gov/usao-ndca/pr/elizabeth-holmes-sentenced-more-11-years-defrauding-theranos-investors-hundred\n", + "s\n", + "\n" + ], + "text/plain": [ + "Source: \n", + "\u001b[4;94mhttps://www.justice.gov/usao-ndca/pr/elizabeth-holmes-sentenced-more-11-years-defrauding-theranos-investors-hundred\u001b[0m\n", + "\u001b[4;94ms\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Title: Northern District of California | Elizabeth Holmes Sentenced To More Than 11 Years For Defrauding Theranos \n",
+ "Investors Of Hundreds Of Millions | United States Department of Justice\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "Title: Northern District of California | Elizabeth Holmes Sentenced To More Than \u001b[1;36m11\u001b[0m Years For Defrauding Theranos \n",
+ "Investors Of Hundreds Of Millions | United States Department of Justice\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n", + "2. Similarity: 0.6427\n", + "\n" + ], + "text/plain": [ + "\n", + "\u001b[1;36m2\u001b[0m. Similarity: \u001b[1;36m0.6427\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Text: The FBI is committed to investigating corporate fraud and working with our partners to help keep our capital \n", + "markets working effectively,” said FBI Special Agent in Charge Robert K. Tripp. “The FBI and our partners worked \n", + "tirelessly on this multi-year case and are proud justice has been served as a result.”\n", + " “Today’s announcement should serve as a reminder that fraud related to medical products will not be tolerated,” \n", + "said FDA Assistant Commissioner for Criminal Investigations Catherine A. Hermsen. “The FDA will continue to work \n", + "with our law enforcement partners to bring to justice those who place profits above public health.”\n", + " “Postal Inspectors worked closely with our partners at the U.S. Attorney’s Office, the FDA Office of Criminal \n", + "Investigations, and the FBI to bring this case to court,” said USPIS San Francisco Division Acting Inspector in \n", + "Charge Kevin Rho.\n", + "\n" + ], + "text/plain": [ + "Text: The FBI is committed to investigating corporate fraud and working with our partners to help keep our capital \n", + "markets working effectively,” said FBI Special Agent in Charge Robert K. Tripp. “The FBI and our partners worked \n", + "tirelessly on this multi-year case and are proud justice has been served as a result.”\n", + " “Today’s announcement should serve as a reminder that fraud related to medical products will not be tolerated,” \n", + "said FDA Assistant Commissioner for Criminal Investigations Catherine A. Hermsen. “The FDA will continue to work \n", + "with our law enforcement partners to bring to justice those who place profits above public health.”\n", + " “Postal Inspectors worked closely with our partners at the U.S. 
Attorney’s Office, the FDA Office of Criminal \n", + "Investigations, and the FBI to bring this case to court,” said USPIS San Francisco Division Acting Inspector in \n", + "Charge Kevin Rho.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Source: \n", + "https://www.justice.gov/usao-ndca/pr/elizabeth-holmes-sentenced-more-11-years-defrauding-theranos-investors-hundred\n", + "s\n", + "\n" + ], + "text/plain": [ + "Source: \n", + "\u001b[4;94mhttps://www.justice.gov/usao-ndca/pr/elizabeth-holmes-sentenced-more-11-years-defrauding-theranos-investors-hundred\u001b[0m\n", + "\u001b[4;94ms\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Title: Northern District of California | Elizabeth Holmes Sentenced To More Than 11 Years For Defrauding Theranos \n",
+ "Investors Of Hundreds Of Millions | United States Department of Justice\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "Title: Northern District of California | Elizabeth Holmes Sentenced To More Than \u001b[1;36m11\u001b[0m Years For Defrauding Theranos \n",
+ "Investors Of Hundreds Of Millions | United States Department of Justice\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n", + "3. Similarity: 0.6053\n", + "\n" + ], + "text/plain": [ + "\n", + "\u001b[1;36m3\u001b[0m. Similarity: \u001b[1;36m0.6053\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Text: The harsh ruling – which aligned with federal sentencing guidelines but was far more severe than the 18 \n",
+ "months of house arrest requested by Holmes – sends a strong message from the US government to Silicon Valley, said \n",
+ "Anat Alon-Beck, a law professor at Case Western Reserve University in Ohio.\n",
+ "\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "Text: The harsh ruling – which aligned with federal sentencing guidelines but was far more severe than the \u001b[1;36m18\u001b[0m \n",
+ "months of house arrest requested by Holmes – sends a strong message from the US government to Silicon Valley, said \n",
+ "Anat Alon-Beck, a law professor at Case Western Reserve University in Ohio.\n",
+ "\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Source: https://www.theguardian.com/us-news/2022/nov/18/elizabeth-holmes-theranos-trial-sentencing\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "Source: \u001b[4;94mhttps://www.theguardian.com/us-news/2022/nov/18/elizabeth-holmes-theranos-trial-sentencing\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Title: Theranos founder Elizabeth Holmes sentenced to more than 11 years for defrauding investors | Theranos | The \n",
+ "Guardian\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "Title: Theranos founder Elizabeth Holmes sentenced to more than \u001b[1;36m11\u001b[0m years for defrauding investors | Theranos | The \n",
+ "Guardian\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n", + "4. Similarity: 0.5888\n", + "\n" + ], + "text/plain": [ + "\n", + "\u001b[1;36m4\u001b[0m. Similarity: \u001b[1;36m0.5888\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Text: U.S. District Court Judge Edward Davila, who presided over Holmes' trial, handed down the sentence.\n", + "\n", + "\n" + ], + "text/plain": [ + "Text: U.S. District Court Judge Edward Davila, who presided over Holmes' trial, handed down the sentence.\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Source: \n",
+ "https://www.cnbc.com/2022/11/18/former-theranos-ceo-elizabeth-holmes-sentenced-to-more-than-11-years-in-prison.html\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "Source: \n",
+ "\u001b[4;94mhttps://www.cnbc.com/2022/11/18/former-theranos-ceo-elizabeth-holmes-sentenced-to-more-than-11-years-in-prison.html\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Title: Theranos founder Elizabeth Holmes sentenced to more than 11 years in prison\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "Title: Theranos founder Elizabeth Holmes sentenced to more than \u001b[1;36m11\u001b[0m years in prison\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n", + "5. Similarity: 0.5769\n", + "\n" + ], + "text/plain": [ + "\n", + "\u001b[1;36m5\u001b[0m. Similarity: \u001b[1;36m0.5769\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Text: Assistant U.S. Attorneys Robert S. Leach, Jeff Schenk, John C. Bostic, and Kelly Volkar are prosecuting the \n", + "case with the assistance of Madeline Wachs, Lakisha Holliman, Sara Slattery, Elise Etter, Susan Kreider, and Leeya \n", + "Kekona. The prosecution is the result of an investigation by the FBI, USPIS, and the FDA Office of Criminal \n", + "Investigations.\n", + "\n" + ], + "text/plain": [ + "Text: Assistant U.S. Attorneys Robert S. Leach, Jeff Schenk, John C. Bostic, and Kelly Volkar are prosecuting the \n", + "case with the assistance of Madeline Wachs, Lakisha Holliman, Sara Slattery, Elise Etter, Susan Kreider, and Leeya \n", + "Kekona. The prosecution is the result of an investigation by the FBI, USPIS, and the FDA Office of Criminal \n", + "Investigations.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Source: \n", + "https://www.justice.gov/usao-ndca/pr/elizabeth-holmes-sentenced-more-11-years-defrauding-theranos-investors-hundred\n", + "s\n", + "\n" + ], + "text/plain": [ + "Source: \n", + "\u001b[4;94mhttps://www.justice.gov/usao-ndca/pr/elizabeth-holmes-sentenced-more-11-years-defrauding-theranos-investors-hundred\u001b[0m\n", + "\u001b[4;94ms\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Title: Northern District of California | Elizabeth Holmes Sentenced To More Than 11 Years For Defrauding Theranos \n",
+ "Investors Of Hundreds Of Millions | United States Department of Justice\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "Title: Northern District of California | Elizabeth Holmes Sentenced To More Than \u001b[1;36m11\u001b[0m Years For Defrauding Theranos \n",
+ "Investors Of Hundreds Of Millions | United States Department of Justice\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Perform semantic search using spaCy\n",
+ "def semantic_search(query, chunks, nlp, similarity_threshold=0.5):\n",
+ "    \"\"\"Rank chunks by their spaCy similarity to a query.\n",
+ "\n",
+ "    Args:\n",
+ "        query: Text that every chunk is compared against.\n",
+ "        chunks: Iterable of dicts; each one must provide a 'text' entry.\n",
+ "        nlp: spaCy pipeline used to parse both the query and the chunk texts.\n",
+ "        similarity_threshold: Chunks scoring at or below this value are dropped.\n",
+ "\n",
+ "    Returns:\n",
+ "        List of (chunk, similarity) tuples sorted by similarity, highest first.\n",
+ "    \"\"\"\n",
+ "    # Materialise once: the chunks are read for their text and again for pairing\n",
+ "    chunks = list(chunks)\n",
+ "    query_doc = nlp(query)\n",
+ "\n",
+ "    # nlp.pipe processes the texts in batches and yields the docs in input order,\n",
+ "    # which avoids running the whole pipeline separately for every chunk\n",
+ "    chunk_docs = nlp.pipe(chunk['text'] for chunk in chunks)\n",
+ "\n",
+ "    relevant_chunks = []\n",
+ "    for chunk, chunk_doc in zip(chunks, chunk_docs):\n",
+ "        similarity = query_doc.similarity(chunk_doc)\n",
+ "\n",
+ "        # Strict comparison: a score equal to the threshold is not kept\n",
+ "        if similarity > similarity_threshold:\n",
+ "            relevant_chunks.append((chunk, similarity))\n",
+ "\n",
+ "    # Sort the relevant chunks by similarity score in descending order\n",
+ "    relevant_chunks.sort(key=lambda x: x[1], reverse=True)\n",
+ "\n",
+ "    return relevant_chunks\n",
+ "\n",
+ "# Perform the semantic search\n",
+ "relevant_results = semantic_search(optimized_search_query, semantic_chunks, nlp)\n",
+ "\n",
+ "# Print the number of relevant chunks found\n",
+ "console.print(f\"Found {len(relevant_results)} relevant chunks\")\n",
+ "\n",
+ "# Print the top 5 most relevant chunks (or all if fewer than 5)\n",
+ "console.print(\"\\nTop relevant chunks:\")\n",
+ "for i, (chunk, similarity) in enumerate(relevant_results[:5], 1):\n",
+ " console.print(f\"\\n{i}. Similarity: {similarity:.4f}\")\n",
+ " console.print(f\"Text: {chunk['text']}\")\n",
+ " console.print(f\"Source: {chunk['metadata']['source']}\")\n",
+ " console.print(f\"Title: {chunk['metadata']['title']}\")\n",
+ "\n",
+ "# Optionally, you can store these results for further processing\n",
+ "# relevant_chunks = [chunk for chunk, _ in relevant_results]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Formatted chunks:\n", + "\n" + ], + "text/plain": [ + "Formatted chunks:\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + " <source>\n", + " <url>https://www.justice.gov/usao-ndca/pr/elizabeth-holmes-sentenced-more-11-years-defrauding-theranos-investor\n", + "s-hundreds</url>\n", + " <title>Northern District of California | Elizabeth Holmes Sentenced To More Than 11 Years For Defrauding \n", + "Theranos Investors Of Hundreds Of Millions | United States Department of Justice</title>\n", + " <text>Related Content\n", + "Press Release\n", + "SAN JOSE – Elizabeth A. Holmes was sentenced today to 135 months (11 years, 3 months) in federal prison for \n", + "defrauding investors in Theranos, Inc. of hundreds of millions of dollars, announced United States Attorney \n", + "Stephanie M. Hinds, Federal Bureau of Investigation Special Agent in Charge Robert K. Tripp, Food and Drug \n", + "Administration (FDA) Assistant Commissioner for Criminal Investigations Catherine A. Hermsen, and U.S. Postal \n", + "Inspection Service (USPIS) San Francisco Division Acting Inspector in Charge Kevin Rho. The sentence was handed \n", + "down by United States District Judge Edward J. Davila.\n", + " “Silicon Valley has seen the rise of companies whose inventions have changed the world and, through intellectual \n", + "prowess, hard work, and sheer determination, this region continues to innovate,” said U.S. Attorney Stephanie M. \n", + "Hinds.</text>\n", + " </source>\n", + " \n", + " \n", + " <source>\n", + " <url>https://www.justice.gov/usao-ndca/pr/elizabeth-holmes-sentenced-more-11-years-defrauding-theranos-investor\n", + "s-hundreds</url>\n", + " <title>Northern District of California | Elizabeth Holmes Sentenced To More Than 11 Years For Defrauding \n", + "Theranos Investors Of Hundreds Of Millions | United States Department of Justice</title>\n", + " <text>The FBI is committed to investigating corporate fraud and working with our partners to help keep our \n", + "capital markets working effectively,” said FBI Special Agent in Charge Robert K. Tripp. 
“The FBI and our partners \n", + "worked tirelessly on this multi-year case and are proud justice has been served as a result.”\n", + " “Today’s announcement should serve as a reminder that fraud related to medical products will not be tolerated,” \n", + "said FDA Assistant Commissioner for Criminal Investigations Catherine A. Hermsen. “The FDA will continue to work \n", + "with our law enforcement partners to bring to justice those who place profits above public health.”\n", + " “Postal Inspectors worked closely with our partners at the U.S. Attorney’s Office, the FDA Office of Criminal \n", + "Investigations, and the FBI to bring this case to court,” said USPIS San Francisco Division Acting Inspector in \n", + "Charge Kevin Rho.</text>\n", + " </source>\n", + " \n", + " \n", + " <source>\n", + " <url>https://www.theguardian.com/us-news/2022/nov/18/elizabeth-holmes-theranos-trial-sentencing</url>\n", + " <title>Theranos founder Elizabeth Holmes sentenced to more than 11 years for defrauding investors | Theranos | \n", + "The Guardian</title>\n", + " <text>The harsh ruling – which aligned with federal sentencing guidelines but was far more severe than the 18 \n", + "months of house arrest requested by Holmes – sends a strong message from the US government to Silicon Valley, said \n", + "Anat Alon-Beck, a law professor at Case Western Reserve University in Ohio.\n", + "</text>\n", + " </source>\n", + " \n", + " \n", + " <source>\n", + " <url>https://www.cnbc.com/2022/11/18/former-theranos-ceo-elizabeth-holmes-sentenced-to-more-than-11-years-in-pr\n", + "ison.html</url>\n", + " <title>Theranos founder Elizabeth Holmes sentenced to more than 11 years in prison</title>\n", + " <text>U.S. 
District Court Judge Edward Davila, who presided over Holmes' trial, handed down the sentence.\n", + "</text>\n", + " </source>\n", + " \n", + " \n", + " <source>\n", + " <url>https://www.justice.gov/usao-ndca/pr/elizabeth-holmes-sentenced-more-11-years-defrauding-theranos-investor\n", + "s-hundreds</url>\n", + " <title>Northern District of California | Elizabeth Holmes Sentenced To More Than 11 Years For Defrauding \n", + "Theranos Investors Of Hundreds Of Millions | United States Department of Justice</title>\n", + " <text>Assistant U.S. Attorneys Robert S. Leach, Jeff Schenk, John C. Bostic, and Kelly Volkar are prosecuting \n", + "the case with the assistance of Madeline Wachs, Lakisha Holliman, Sara Slattery, Elise Etter, Susan Kreider, and \n", + "Leeya Kekona. The prosecution is the result of an investigation by the FBI, USPIS, and the FDA Office of Criminal \n", + "Investigations.</text>\n", + " </source>\n", + " \n", + " \n", + " <source>\n", + " <url>https://www.bbc.com/news/business-58336998</url>\n", + " <title>Theranos scandal: Who is Elizabeth Holmes and why was she on trial?</title>\n", + " <text>US Treasury Secretary George Schultz, media tycoon Rupert Murdoch and America's richest family, the \n", + "Waltons, were among her backers.\n", + "</text>\n", + " </source>\n", + " \n", + " \n", + " <source>\n", + " <url>https://www.justice.gov/usao-ndca/pr/elizabeth-holmes-sentenced-more-11-years-defrauding-theranos-investor\n", + "s-hundreds</url>\n", + " <title>Northern District of California | Elizabeth Holmes Sentenced To More Than 11 Years For Defrauding \n", + "Theranos Investors Of Hundreds Of Millions | United States Department of Justice</title>\n", + " <text>The New York Times Style Magazine. 
Holmes dined at the White House, joined the Board of Fellows of \n", + "Harvard Medical School, and was named by Time as one of the 100 Most Influential People in the World.</text>\n", + " </source>\n", + " \n", + " \n", + " <source>\n", + " <url>https://www.npr.org/2023/05/30/1178728092/elizabeth-holmes-prison-sentence-theranos-fraud-silicon-valley</\n", + "url>\n", + " <title>Elizabeth Holmes has started her 11-year prison sentence. Here's what to know</title>\n", + " <text>Elizabeth Holmes has started her 11-year prison sentence. Here's what to know\n", + "Disgraced Silicon Valley superstar Elizabeth Holmes has surrendered to federal prison in Texas to begin serving a \n", + "11-year term for defrauding investors with her once high-flying blood-testing company Theranos.\n", + " Holmes, a 39-year-old mother of two, reported Tuesday to the prison camp in Bryan, Texas, an all-female facility \n", + "about 100 miles outside of Houston, where some family members of Holmes reside.\n", + " Lawyers for Holmes attempted to delay the start of her prison sentence by asking that she remain free while she \n", + "appeals her conviction, but the 9th Circuit Court of Appeals denied her request, ruling that Holmes has not raised \n", + "a \"substantial question\" of law or fact about her case.\n", + " Here's what to know about the incarceration of Holmes, the most high-profile tech executive to be sentenced to \n", + "prison time.\n", + "</text>\n", + " </source>\n", + " \n", + " \n", + "\n" + ], + "text/plain": [ + "\n", + " \u001b[1m<\u001b[0m\u001b[1;95msource\u001b[0m\u001b[39m>\u001b[0m\n", + "\u001b[39m