import os from pathlib import Path import gradio as gr import requests from fastapi import BackgroundTasks, Response, status from huggingface_hub import WebhookPayload, WebhooksServer from huggingface_hub.utils import build_hf_headers, get_session from src.build_nomic import build_nomic from src.my_logger import setup_logger from src.readme_update import update_dataset_readme from src.utilities import load_datasets, merge_and_update_datasets from src.visualize_logs import log_file_to_html_string proj_dir = Path(__name__).parent logger = setup_logger(__name__) logger.info("Starting Application...") SUBREDDIT = os.environ["SUBREDDIT"] USERNAME = os.environ["USERNAME"] OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}" PROCESSED_DATASET = os.environ['PROCESSED_DATASET'] # HF_TOKEN = os.environ["HF_TOKEN"] WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret') intro_md = """ # Processing BORU This is a space to visually search the subreddit [/r/bestofredditorupdates](https://www.reddit.com/r/BestofRedditorUpdates/). Have you ever been curious to search for stories that are similar to one of your favorites? This can help! - Each dot represents a post (try clicking on one) - Closer dots are similar in topic - Use the filters on the left to help you narrow down what you are looking for - The lasso can help you search in a smaller range that you drag with your mouse - The filter can help you narrow by field, - Filtering posts that are `CONCLUDED` - Filtering popular posts - Filtering by date - The search can help you look by keyword Check out the original on [Nomic](https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map) """ details_md = """ # Details ## Creation Details 1. This space is triggered by a webhook for changes on [reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates). 2. It then takes the updates from that dataset and get embeddings by making leveraging [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings) - [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings) is using [zero-spaces](https://huggingface.co/zero-gpu-explorers) a free GPU service to compute the model [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) - Im calling this via [gradio_client](https://www.gradio.app/docs/client) which allows any space to be used as an API 3. The calculated embeddings are stored in this dataset [reddit-tools-HF/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/reddit-tools-HF/reddit-bestofredditorupdates-processed) 4. These get visualized by [nomic atlas](https://docs.nomic.ai/atlas/introduction/quick-start). You can see how I process it in [build_nomic.py](https://huggingface.co/spaces/reddit-tools-HF/processing-bestofredditorupdates/blob/main/src/build_nomic.py) """ url = "https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map" html_str = f'