###############################################################################################################################################################
#                                         _____  _           ___  _         ___                                
#                                        |_   _|| |_   ___  | _ )(_) __ _  / __| __  _ _  __ _  _ __  ___  _ _ 
#                                          | |  | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_|
#                                          |_|  |_||_|\___| |___/|_|\__, | |___/\__||_|  \__,_|| .__/\___||_|  
#                                                                   |___/                      |_|   
#
##############################################################################################################################################################
#                          _                         ______              _         _                  _______               _         _ 
#                     _   | |                       (_____ \            | |       (_)                (_______)             (_)       (_)
#     _____  _   _  _| |_ | |__    ___    ____  _    _____) )  ___    __| |  ____  _   ____   ___     _  _  _  _____   ___  _  ____   _ 
#    (____ || | | |(_   _)|  _ \  / _ \  / ___)(_)  |  __  /  / _ \  / _  | / ___)| | / _  | / _ \   | ||_|| |(____ | /___)| ||  _ \ | |
#    / ___ || |_| |  | |_ | | | || |_| || |     _   | |  \ \ | |_| |( (_| || |    | |( (_| || |_| |  | |   | |/ ___ ||___ || || | | || |
#    \_____||____/    \__)|_| |_| \___/ |_|    (_)  |_|   |_| \___/  \____||_|    |_| \___ | \___/   |_|   |_|\_____|(___/ |_||_| |_||_|
#                                                                                    (_____|                                            
###############################################################################################################################################################
#
# Last updated on: 8/15/2024
#
###############################################################################################################################################################

# ------------------------------------------------------------------------------
# IMPORTS
# ------------------------------------------------------------------------------

import os
from typing import Tuple

import gradio as gr
from bs4 import BeautifulSoup as Soup
from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint
from langchain_community.document_loaders import (AsyncHtmlLoader,
                                                  NewsURLLoader, PubMedLoader,
                                                  PlaywrightURLLoader,
                                                  RecursiveUrlLoader,
                                                  SeleniumURLLoader,
                                                  UnstructuredURLLoader,
                                                  WebBaseLoader)


# ------------------------------------------------------------------------------
# DEV ENVIRONMENT SETUP
# ------------------------------------------------------------------------------

# Load environment variables
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
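# NOTE: this expects HUGGINGFACEHUB_API_TOKEN to be set, e.g. via a .env file
# next to this script:
#   HUGGINGFACEHUB_API_TOKEN=hf_xxx   (placeholder value)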

# Foundational Model and Embedding Model HF repo IDs
FM_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
EM_REPO_ID = "sentence-transformers/all-MiniLM-l6-v2"


# Initialize the model instances
llm_model_instance = HuggingFaceEndpoint(
    repo_id=FM_REPO_ID,
    max_new_tokens=8192,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.1,
    repetition_penalty=1.03,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)
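# Optional smoke test (a minimal sketch; assumes the HF Inference endpoint is
# reachable and the token has access to the model):
# print(llm_model_instance.invoke("Reply with the single word: ready"))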

embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN, 
    model_name=EM_REPO_ID
)
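# Optional sanity check (assumes the Inference API is reachable):
# vector = embedder_model_instance.embed_query("hello world")
# print(len(vector))  # all-MiniLM-L6 models produce 384-dimensional embeddings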

graph_config = {
    "llm": {"model_instance": llm_model_instance},
    "embeddings": {"model_instance": embedder_model_instance}
}
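# Passing "model_instance" hands these pre-initialized LangChain objects to
# scrapegraphai directly, so the graph does not construct its own model clients.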

# ------------------------------------------------------------------------------
# THE BIG SCRAPER
# ------------------------------------------------------------------------------

def extractDataFromUrls(urls: str, loader_type: str) -> Tuple[list, list]:
    """Extracts data from provided URLs using specified loader type.

    Args:
        urls (str): Comma-separated URLs to extract data from.
        loader_type (str): Type of loader to use for data extraction.

    Returns:
        tuple: A tuple containing the extracted data in JSON format and as a list of Document objects.
               Returns error messages if an exception occurs.
    """
    try:
        urls = [url.strip() for url in urls.split(',')]  # tolerate spaces after commas

        if loader_type == 'AsyncHtmlLoader':
            loader = AsyncHtmlLoader(urls)
        elif loader_type == 'UnstructuredURL':
            loader = UnstructuredURLLoader(urls=urls)
        elif loader_type == 'RecursiveURL':
            loader = RecursiveUrlLoader(
                url=urls[0], max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
            )
        elif loader_type == 'SeleniumURL':
            loader = SeleniumURLLoader(urls=urls)
        elif loader_type == 'SeleniumURLH':
            loader = SeleniumURLLoader(urls=urls, headless=False)
        elif loader_type == 'PlaywrightURL':
            loader = PlaywrightURLLoader(urls=urls)
        elif loader_type == 'PubMed':
            loader = PubMedLoader(urls[0])
        elif loader_type == 'NewsURL':
            loader = NewsURLLoader(urls)
        elif loader_type == 'WebBaseLoader':
            loader = WebBaseLoader(urls)
        else:
            return "Not Implemented. Development in Progress", "Work In Progress"

        data = loader.load()
        jsonData = [item.to_json() for item in data]

        return jsonData, data

    except Exception as err:
        return f"An error occurred: {err}. Please contact the developer.", "Error occurred"


def scrapeAndSummarize(prompt: str, source: str) -> Tuple[dict, str]:
    """Scrapes website content based on the prompt and summarizes it.

    Args:
        prompt (str): The prompt to guide the scraping process.
        source (str): The URL of the website to scrape.

    Returns:
        tuple: A tuple containing the scraped data as a dictionary and the execution information.
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt=prompt,
        source=source,
        config=graph_config
    )
    result = smart_scraper_graph.run()
    exec_info = smart_scraper_graph.get_execution_info()
    return result, prettify_exec_info(exec_info)
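
# Example usage (assumes a valid HF token and a reachable source page):
# result, info = scrapeAndSummarize(
#     "List all the press releases with their headlines and URLs.",
#     "https://www.whitehouse.gov/",
# )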

# ------------------------------------------------------------------------------
# TABBED GRADIO UI
# ------------------------------------------------------------------------------

# Define choices for the dropdown menu
choices = [
    'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
    'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
    'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
]
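# Note: 'Scrapy', 'PySpider', and 'Beautiful Soup' are placeholders for now;
# selecting them returns the "not implemented" message from extractDataFromUrls.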

# Create the Gradio interface with tabs
with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
    gr.Markdown("# THE BIG SCRAPER")

    with gr.Tabs():
        # Tab 1: Data Extraction
        with gr.TabItem("Data Extraction"):
            gr.Markdown("## Extract data from URLs using various loaders")
            with gr.Row():
                url_input = gr.Textbox(label="Enter your comma-separated URLs here")
                loader_dropdown = gr.Dropdown(choices=choices, label="Pick your loader here")
            extract_button = gr.Button("Extract Data")
            with gr.Row():
                extracted_data_json = gr.JSON(label="Extracted Data (JSON)")
                extracted_data_text = gr.Textbox(label="Extracted Data (Text)")
            extract_button.click(
                extractDataFromUrls,
                inputs=[url_input, loader_dropdown],
                outputs=[extracted_data_json, extracted_data_text]
            )

        # Tab 2: Website Scraping and Summarization
        with gr.TabItem("Scraping & Summarization"):
            with gr.Row():
                with gr.Column():
                    model_input = gr.Textbox(label="Model", value=FM_REPO_ID)
                    prompt_input = gr.Textbox(label="Prompt", value="List all the press releases with their headlines and URLs.")
                    source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/")
                    scrape_button = gr.Button("Scrape and Summarize")
                with gr.Column():
                    result_output = gr.JSON(label="Result")
                    exec_info_output = gr.Textbox(label="Execution Info")

            scrape_button.click(
                scrapeAndSummarize,
                inputs=[prompt_input, source_input],
                outputs=[result_output, exec_info_output]
            )

# Launch the Gradio interface
demo.launch()
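
# To share a public link or bind a specific host/port, launch() also accepts, e.g.:
# demo.launch(share=True, server_name="0.0.0.0", server_port=7860)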