############################################################################################################################################################### # _____ _ ___ _ ___ # |_ _|| |_ ___ | _ )(_) __ _ / __| __ _ _ __ _ _ __ ___ _ _ # | | | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_| # |_| |_||_|\___| |___/|_|\__, | |___/\__||_| \__,_|| .__/\___||_| # |___/ |_| # ############################################################################################################################################################## # _ ______ _ _ _______ _ _ # _ | | (_____ \ | | (_) (_______) (_) (_) # _____ _ _ _| |_ | |__ ___ ____ _ _____) ) ___ __| | ____ _ ____ ___ _ _ _ _____ ___ _ ____ _ # (____ || | | |(_ _)| _ \ / _ \ / ___)(_) | __ / / _ \ / _ | / ___)| | / _ | / _ \ | ||_|| |(____ | /___)| || _ \ | | # / ___ || |_| | | |_ | | | || |_| || | _ | | \ \ | |_| |( (_| || | | |( (_| || |_| | | | | |/ ___ ||___ || || | | || | # \_____||____/ \__)|_| |_| \___/ |_| (_) |_| |_| \___/ \____||_| |_| \___ | \___/ |_| |_|\_____|(___/ |_||_| |_||_| # (_____| ############################################################################################################################################################### # # Last updated in: 8/15/2024 # ############################################################################################################################################################### # ------------------------------------------------------------------------------ # IMPORTS # ------------------------------------------------------------------------------ import os import subprocess from typing import Tuple import gradio as gr from bs4 import BeautifulSoup as Soup from dotenv import load_dotenv from scrapegraphai.graphs import SmartScraperGraph from scrapegraphai.utils import prettify_exec_info from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint from langchain_community.document_loaders import (AsyncHtmlLoader, NewsURLLoader, PubMedLoader, PlaywrightURLLoader, RecursiveUrlLoader, SeleniumURLLoader, UnstructuredURLLoader, WebBaseLoader) # ------------------------------------------------------------------------------ # DEV ENVIRONMENT SETUP # ------------------------------------------------------------------------------ # Load environment variables load_dotenv() HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN') # Foudational Model and Embeeding Model HF repo ID FM_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3" EM_REPO_ID="sentence-transformers/all-MiniLM-l6-v2" # Initialize the model instances llm_model_instance = HuggingFaceEndpoint( repo_id=FM_REPO_ID, max_new_tokens=8192, top_k=10, top_p=0.95, typical_p=0.95, temperature=0.1, repetition_penalty=1.03, huggingfacehub_api_token = HUGGINGFACEHUB_API_TOKEN, ) embedder_model_instance = HuggingFaceInferenceAPIEmbeddings( api_key=HUGGINGFACEHUB_API_TOKEN, model_name=EM_REPO_ID ) graph_config = { "llm": {"model_instance": llm_model_instance}, "embeddings": {"model_instance": embedder_model_instance} } # ------------------------------------------------------------------------------ # THE BIG SCRAPER # ------------------------------------------------------------------------------ def extractDataFromUrls(urls: str, loader_type: str) -> Tuple[list, list]: """Extracts data from provided URLs using specified loader type. Args: urls (str): Comma-separated URLs to extract data from. loader_type (str): Type of loader to use for data extraction. Returns: tuple: A tuple containing the extracted data in JSON format and as a list of Document objects. Returns error messages if an exception occurs. """ try: urls = urls.split(',') data = [] if loader_type == 'AsyncHtmlLoader': loader = AsyncHtmlLoader(urls) elif loader_type == 'UnstructuredURL': loader = UnstructuredURLLoader(urls=urls) elif loader_type == 'RecursiveURL': loader = RecursiveUrlLoader( url=urls[0], max_depth=2, extractor=lambda x: Soup(x, "html.parser").text ) elif loader_type == 'SeleniumURL': loader = SeleniumURLLoader(urls=urls) elif loader_type == 'SeleniumURLH': loader = SeleniumURLLoader(urls=urls, headless=False) elif loader_type == 'PlaywrightURL': loader = PlaywrightURLLoader(urls=urls) elif loader_type == 'PubMed': loader = PubMedLoader(urls[0]) elif loader_type == 'NewsURL': loader = NewsURLLoader(urls) elif loader_type == 'WebBaseLoader': loader = WebBaseLoader(urls) else: return "Not Implemented. Development in Progress", "Work In Progress" data = loader.load() jsonData = [] for item in data: jsonData.append(item.to_json()) return jsonData, data except Exception as err: return "An Error Occurred. Contact Developer" + str(err), "Error Occured. Boom" def scrapeAndSummarize(prompt: str, source: str) -> Tuple[dict, str]: """Scrapes website content based on the prompt and summarizes it. Args: prompt (str): The prompt to guide the scraping process. source (str): The URL of the website to scrape. Returns: tuple: A tuple containing the scraped data as a dictionary and the execution information. """ smart_scraper_graph = SmartScraperGraph( prompt=prompt, source=source, config=graph_config ) result = smart_scraper_graph.run() exec_info = smart_scraper_graph.get_execution_info() return result, prettify_exec_info(exec_info) # ------------------------------------------------------------------------------ # TABBED GRADIO UI # ------------------------------------------------------------------------------ # Define choices for the dropdown menu choices = [ 'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed', 'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup', 'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL', ] # Create the Gradio interface with tabs with gr.Blocks(theme="sudeepshouche/minimalist") as demo: gr.Markdown("# THE BIG SCRAPER") with gr.Tabs(): # Tab 1: Data Extraction with gr.TabItem("Data Extraction"): gr.Markdown("## Extract data from URLs using various loaders") with gr.Row(): url_input = gr.Textbox(label="Enter your comma separated URLs here") loader_dropdown = gr.Dropdown(choices=choices, label="Pick your Loader from here") extract_button = gr.Button("Extract Data") with gr.Row(): extracted_data_json = gr.JSON(label="Extracted Data (JSON)") extracted_data_text = gr.Textbox(label="Extracted Data (Text)") extract_button.click( extractDataFromUrls, inputs=[url_input, loader_dropdown], outputs=[extracted_data_json, extracted_data_text] ) # Tab 2: Website Scraping and Summarization with gr.TabItem("Scraping & Summarization"): with gr.Row(): with gr.Column(): model_dropdown = gr.Textbox(label="Model", value="Mistral-7B-Instruct-v0.2") prompt_input = gr.Textbox(label="Prompt", value="List me all the press releases with their headlines and urls.") source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/") scrape_button = gr.Button("Scrape and Summarize") with gr.Column(): result_output = gr.JSON(label="Result") exec_info_output = gr.Textbox(label="Execution Info") scrape_button.click( scrapeAndSummarize, inputs=[prompt_input, source_input], outputs=[result_output, exec_info_output] ) # Launch the Gradio interface demo.launch()