rodrigomasini committed
Commit c63697d · verified · 1 Parent(s): abdffe5

Create alternative.py

Files changed (1)
  1. alternative.py +196 -0
alternative.py ADDED
@@ -0,0 +1,196 @@
+ ###############################################################################################################################################################
+ # _____ _ ___ _ ___
+ # |_ _|| |_ ___ | _ )(_) __ _ / __| __ _ _ __ _ _ __ ___ _ _
+ # | | | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_|
+ # |_| |_||_|\___| |___/|_|\__, | |___/\__||_| \__,_|| .__/\___||_|
+ # |___/ |_|
+ #
+ ##############################################################################################################################################################
+ # _ ______ _ _ _______ _ _
+ # _ | | (_____ \ | | (_) (_______) (_) (_)
+ # _____ _ _ _| |_ | |__ ___ ____ _ _____) ) ___ __| | ____ _ ____ ___ _ _ _ _____ ___ _ ____ _
+ # (____ || | | |(_ _)| _ \ / _ \ / ___)(_) | __ / / _ \ / _ | / ___)| | / _ | / _ \ | ||_|| |(____ | /___)| || _ \ | |
+ # / ___ || |_| | | |_ | | | || |_| || | _ | | \ \ | |_| |( (_| || | | |( (_| || |_| | | | | |/ ___ ||___ || || | | || |
+ # \_____||____/ \__)|_| |_| \___/ |_| (_) |_| |_| \___/ \____||_| |_| \___ | \___/ |_| |_|\_____|(___/ |_||_| |_||_|
+ # (_____|
+ ###############################################################################################################################################################
+ #
+ # Last updated: 8/15/2024
+ #
+ ###############################################################################################################################################################
+
+ # ------------------------------------------------------------------------------
+ # IMPORTS
+ # ------------------------------------------------------------------------------
+
+ import os
+ from typing import Tuple
+
+ import gradio as gr
+ from dotenv import load_dotenv
+ from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+ from langchain_community.llms import HuggingFaceEndpoint
+ from scrapegraphai.graphs import SmartScraperGraph
+ from scrapegraphai.utils import prettify_exec_info
+
+ from bs4 import BeautifulSoup as Soup
+ from langchain_community.document_loaders import (AsyncHtmlLoader,
+                                                   NewsURLLoader, PubMedLoader,
+                                                   PlaywrightURLLoader,
+                                                   RecursiveUrlLoader,
+                                                   SeleniumURLLoader,
+                                                   UnstructuredURLLoader,
+                                                   WebBaseLoader)
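+
+ # Note: several of these loaders depend on optional extras that langchain_community
+ # does not install by itself (e.g. selenium/playwright for the browser-based loaders,
+ # newspaper3k for NewsURLLoader, unstructured for UnstructuredURLLoader); exact
+ # package names may vary across versions.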
+
+
+ # ------------------------------------------------------------------------------
+ # DEV ENVIRONMENT SETUP
+ # ------------------------------------------------------------------------------
+
+ # Load environment variables from a local .env file (if present), then read the token
+ load_dotenv()
+ HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+
+ # Initialize the model instances
+ repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+ llm_model_instance = HuggingFaceEndpoint(
+     repo_id=repo_id, max_length=8192, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
+ )
+
+ embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
+     api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
+ )
+
+ graph_config = {
+     "llm": {"model_instance": llm_model_instance},
+     "embeddings": {"model_instance": embedder_model_instance}
+ }
+
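+ # Optional sanity check (a minimal sketch): both instances are ordinary LangChain
+ # objects, so they can be exercised directly before wiring them into the graph.
+ # Uncomment to verify the endpoint and the embedder respond:
+ #
+ # print(llm_model_instance.invoke("Say 'ready'."))
+ # print(len(embedder_model_instance.embed_query("hello world")))
+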
+ # ------------------------------------------------------------------------------
+ # THE BIG SCRAPER
+ # ------------------------------------------------------------------------------
+
+ def extractDataFromUrls(urls: str, loader_type: str) -> Tuple[list, list]:
+     """Extracts data from the provided URLs using the specified loader type.
+
+     Args:
+         urls (str): Comma-separated URLs to extract data from.
+         loader_type (str): Type of loader to use for data extraction.
+
+     Returns:
+         tuple: A tuple containing the extracted data in JSON format and as a list
+             of Document objects. Returns error messages if an exception occurs.
+     """
+     try:
+         # Split the comma-separated input and strip stray whitespace around each URL
+         urls = [url.strip() for url in urls.split(',')]
+
+         if loader_type == 'AsyncHtmlLoader':
+             loader = AsyncHtmlLoader(urls)
+         elif loader_type == 'UnstructuredURL':
+             loader = UnstructuredURLLoader(urls=urls)
+         elif loader_type == 'RecursiveURL':
+             # Crawls links starting from the first URL, up to two levels deep
+             loader = RecursiveUrlLoader(
+                 url=urls[0], max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
+             )
+         elif loader_type == 'SeleniumURL':
+             loader = SeleniumURLLoader(urls=urls)
+         elif loader_type == 'SeleniumURLH':
+             # Same as above, but with a visible (non-headless) browser window
+             loader = SeleniumURLLoader(urls=urls, headless=False)
+         elif loader_type == 'PlaywrightURL':
+             loader = PlaywrightURLLoader(urls=urls)
+         elif loader_type == 'PubMed':
+             # PubMedLoader takes a single query string, so only the first entry is used
+             loader = PubMedLoader(urls[0])
+         elif loader_type == 'NewsURL':
+             loader = NewsURLLoader(urls)
+         elif loader_type == 'WebBaseLoader':
+             loader = WebBaseLoader(urls)
+         else:
+             return "Not Implemented. Development in Progress", "Work In Progress"
+
+         data = loader.load()
+         jsonData = []
+         for item in data:
+             jsonData.append(item.to_json())
+
+         return jsonData, data
+
+     except Exception as err:
+         return "An error occurred. Contact the developer: " + str(err), "An error occurred: " + str(err)
+
+
+ def scrapeAndSummarize(prompt: str, source: str) -> Tuple[dict, str]:
+     """Scrapes website content based on the prompt and summarizes it.
+
+     Args:
+         prompt (str): The prompt to guide the scraping process.
+         source (str): The URL of the website to scrape.
+
+     Returns:
+         tuple: A tuple containing the scraped data as a dictionary and the execution information.
+     """
+     smart_scraper_graph = SmartScraperGraph(
+         prompt=prompt,
+         source=source,
+         config=graph_config
+     )
+     result = smart_scraper_graph.run()
+     exec_info = smart_scraper_graph.get_execution_info()
+     return result, prettify_exec_info(exec_info)
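+
+ # Example (illustrative values only):
+ #   result, info = scrapeAndSummarize("List the page's headlines.", "https://example.com")
+ # result is the structured answer from the graph; info is a human-readable run summary.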
+
+ # ------------------------------------------------------------------------------
+ # TABBED GRADIO UI
+ # ------------------------------------------------------------------------------
+
+ # Define choices for the dropdown menu
+ choices = [
+     'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
+     'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
+     'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
+ ]
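+ # Note: 'Scrapy', 'PySpider', and 'Beautiful Soup' are placeholders for now; they
+ # fall through to the "Not Implemented" branch in extractDataFromUrls above.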
+
+ # Create the Gradio interface with tabs
+ with gr.Blocks() as demo:
+     gr.Markdown("# Web Scraping and Summarization")
+
+     with gr.Tabs():
+         # Tab 1: Data Extraction
+         with gr.TabItem("Data Extraction"):
+             gr.Markdown("## Extract data from URLs using various loaders")
+             with gr.Row():
+                 url_input = gr.Textbox(label="Enter your comma-separated URLs here")
+                 loader_dropdown = gr.Dropdown(choices=choices, label="Pick your loader")
+             extract_button = gr.Button("Extract Data")
+             with gr.Row():
+                 extracted_data_json = gr.JSON(label="Extracted Data (JSON)")
+                 extracted_data_text = gr.Textbox(label="Extracted Data (Text)")
+             extract_button.click(
+                 extractDataFromUrls,
+                 inputs=[url_input, loader_dropdown],
+                 outputs=[extracted_data_json, extracted_data_text]
+             )
+
+         # Tab 2: Website Scraping and Summarization
+         with gr.TabItem("Website Scraping & Summarization"):
+             gr.Markdown("# Scrape websites, no-code version")
+             gr.Markdown("""Easily scrape and summarize web content using advanced AI models on the Hugging Face Hub without writing any code. Input your desired prompt and source URL to get started.
+             This is a no-code version of the excellent library [ScrapeGraphAI](https://github.com/VinciGit00/Scrapegraph-ai).
+             It's a basic demo and a work in progress. Please contribute to make it more useful!""")
+             with gr.Row():
+                 with gr.Column():
+                     model_dropdown = gr.Textbox(label="Model", value="Mistral-7B-Instruct-v0.2")
+                     prompt_input = gr.Textbox(label="Prompt", value="List all the press releases with their headlines and URLs.")
+                     source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/")
+                     scrape_button = gr.Button("Scrape and Summarize")
+                 with gr.Column():
+                     result_output = gr.JSON(label="Result")
+                     exec_info_output = gr.Textbox(label="Execution Info")
+
+             scrape_button.click(
+                 scrapeAndSummarize,
+                 inputs=[prompt_input, source_input],
+                 outputs=[result_output, exec_info_output]
+             )
+
+ # Launch the Gradio interface
+ demo.launch()
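+
+ # demo.launch() keeps Gradio's defaults; if a public link or a container binding is
+ # needed, options such as share=True or server_name="0.0.0.0" can be passed instead.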