###############################################################################################################################################################
#                                         _____  _           ___  _         ___                                
#                                        |_   _|| |_   ___  | _ )(_) __ _  / __| __  _ _  __ _  _ __  ___  _ _ 
#                                          | |  | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_|
#                                          |_|  |_||_|\___| |___/|_|\__, | |___/\__||_|  \__,_|| .__/\___||_|  
#                                                                   |___/                      |_|   
#
##############################################################################################################################################################
#                          _                         ______              _         _                  _______               _         _ 
#                     _   | |                       (_____ \            | |       (_)                (_______)             (_)       (_)
#     _____  _   _  _| |_ | |__    ___    ____  _    _____) )  ___    __| |  ____  _   ____   ___     _  _  _  _____   ___  _  ____   _ 
#    (____ || | | |(_   _)|  _ \  / _ \  / ___)(_)  |  __  /  / _ \  / _  | / ___)| | / _  | / _ \   | ||_|| |(____ | /___)| ||  _ \ | |
#    / ___ || |_| |  | |_ | | | || |_| || |     _   | |  \ \ | |_| |( (_| || |    | |( (_| || |_| |  | |   | |/ ___ ||___ || || | | || |
#    \_____||____/    \__)|_| |_| \___/ |_|    (_)  |_|   |_| \___/  \____||_|    |_| \___ | \___/   |_|   |_|\_____|(___/ |_||_| |_||_|
#                                                                                    (_____|                                            
###############################################################################################################################################################
#
# Last updated on: 8/15/2024
#
###############################################################################################################################################################

# ------------------------------------------------------------------------------
# IMPORTS
# ------------------------------------------------------------------------------

import os
from typing import Tuple

import gradio as gr
from bs4 import BeautifulSoup as Soup
from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint
from langchain_community.document_loaders import (AsyncHtmlLoader,
                                                  NewsURLLoader, PubMedLoader,
                                                  PlaywrightURLLoader,
                                                  RecursiveUrlLoader,
                                                  SeleniumURLLoader,
                                                  UnstructuredURLLoader,
                                                  WebBaseLoader)


# ------------------------------------------------------------------------------
# DEV ENVIRONMENT SETUP
# ------------------------------------------------------------------------------

# Load environment variables
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
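# NOTE: this expects HUGGINGFACEHUB_API_TOKEN to be set, e.g. via a .env file
# next to this script:
#   HUGGINGFACEHUB_API_TOKEN=hf_xxx   (placeholder value)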

# Foundational Model and Embedding Model HF repo IDs
FM_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
EM_REPO_ID = "sentence-transformers/all-MiniLM-l6-v2"


# Initialize the model instances
llm_model_instance = HuggingFaceEndpoint(
    repo_id=FM_REPO_ID,
    max_new_tokens=8192,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.1,
    repetition_penalty=1.03,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)
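# Optional smoke test (a minimal sketch; assumes the HF Inference endpoint is
# reachable and the token has access to the model):
# print(llm_model_instance.invoke("Reply with the single word: ready"))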

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN, 
    model_name=EM_REPO_ID
)
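# Optional sanity check (assumes the Inference API is reachable):
# vector = embedder_model_instance.embed_query("hello world")
# print(len(vector))  # all-MiniLM-L6 models produce 384-dimensional embeddings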

graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance}
}
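# Passing "model_instance" hands these pre-initialized LangChain objects to
# scrapegraphai directly, so the graph does not construct its own model clients.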

# ------------------------------------------------------------------------------
# THE BIG SCRAPER
# ------------------------------------------------------------------------------

def extractDataFromUrls(urls: str, loader_type: str) -> Tuple[list, list]:
    """Extracts data from provided URLs using specified loader type.

    Args:
        urls (str): Comma-separated URLs to extract data from.
        loader_type (str): Type of loader to use for data extraction.

    Returns:
        tuple: A tuple containing the extracted data in JSON format and as a list of Document objects.
               Returns error messages if an exception occurs.
    """
    try:
        urls = [url.strip() for url in urls.split(',')]  # tolerate spaces after commas

        if loader_type == 'AsyncHtmlLoader':
            loader = AsyncHtmlLoader(urls)
        elif loader_type == 'UnstructuredURL':
            loader = UnstructuredURLLoader(urls=urls)
        elif loader_type == 'RecursiveURL':
            loader = RecursiveUrlLoader(
                url=urls[0], max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
            )
        elif loader_type == 'SeleniumURL':
            loader = SeleniumURLLoader(urls=urls)
        elif loader_type == 'SeleniumURLH':
            loader = SeleniumURLLoader(urls=urls, headless=False)
        elif loader_type == 'PlaywrightURL':
            loader = PlaywrightURLLoader(urls=urls)
        elif loader_type == 'PubMed':
            loader = PubMedLoader(urls[0])
        elif loader_type == 'NewsURL':
            loader = NewsURLLoader(urls)
        elif loader_type == 'WebBaseLoader':
            loader = WebBaseLoader(urls)
        else:
            return "Not Implemented. Development in Progress", "Work In Progress"

        data = loader.load()
        jsonData = [item.to_json() for item in data]

        return jsonData, data

    except Exception as err:
        return f"An error occurred: {err}. Please contact the developer.", "Error occurred"


def scrapeAndSummarize(prompt: str, source: str) -> Tuple[dict, str]:
    """Scrapes website content based on the prompt and summarizes it.

    Args:
        prompt (str): The prompt to guide the scraping process.
        source (str): The URL of the website to scrape.

    Returns:
        tuple: A tuple containing the scraped data as a dictionary and the execution information.
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source=source,
        config=graph_config
    )
    result = smart_scraper_graph.run()
    exec_info = smart_scraper_graph.get_execution_info()
    return result, prettify_exec_info(exec_info)
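
# Example usage (assumes a valid HF token and a reachable source page):
# result, info = scrapeAndSummarize(
#     "List all the press releases with their headlines and URLs.",
#     "https://www.whitehouse.gov/",
# )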

# ------------------------------------------------------------------------------
# TABBED GRADIO UI
# ------------------------------------------------------------------------------

# Define choices for the dropdown menu
choices = [
    'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
    'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
    'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
]
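# Note: 'Scrapy', 'PySpider', and 'Beautiful Soup' are placeholders for now;
# selecting them returns the "not implemented" message from extractDataFromUrls.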

# Create the Gradio interface with tabs
with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
    gr.Markdown("# THE BIG SCRAPER")

    with gr.Tabs():
        # Tab 1: Data Extraction
        with gr.TabItem("Data Extraction"):
            gr.Markdown("## Extract data from URLs using various loaders")
            with gr.Row():
                url_input = gr.Textbox(label="Enter your comma-separated URLs here")
                loader_dropdown = gr.Dropdown(choices=choices, label="Pick your loader here")
            extract_button = gr.Button("Extract Data")
            with gr.Row():
                extracted_data_json = gr.JSON(label="Extracted Data (JSON)")
                extracted_data_text = gr.Textbox(label="Extracted Data (Text)")
            extract_button.click(
                extractDataFromUrls,
                inputs=[url_input, loader_dropdown],
                outputs=[extracted_data_json, extracted_data_text]
            )

        # Tab 2: Website Scraping and Summarization
        with gr.TabItem("Scraping & Summarization"):
            with gr.Row():
                with gr.Column():
                    model_input = gr.Textbox(label="Model", value=FM_REPO_ID)
                    prompt_input = gr.Textbox(label="Prompt", value="List all the press releases with their headlines and URLs.")
                    source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/")
                    scrape_button = gr.Button("Scrape and Summarize")
                with gr.Column():
                    result_output = gr.JSON(label="Result")
                    exec_info_output = gr.Textbox(label="Execution Info")

            scrape_button.click(
                scrapeAndSummarize,
                inputs=[prompt_input, source_input],
                outputs=[result_output, exec_info_output]
            )

# Launch the Gradio interface
demo.launch()
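
# To share a public link or bind a specific host/port, launch() also accepts, e.g.:
# demo.launch(share=True, server_name="0.0.0.0", server_port=7860)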