###############################################################################################################################################################
# _____ _ ___ _ ___
# |_ _|| |_ ___ | _ )(_) __ _ / __| __ _ _ __ _ _ __ ___ _ _
# | | | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_|
# |_| |_||_|\___| |___/|_|\__, | |___/\__||_| \__,_|| .__/\___||_|
# |___/ |_|
#
##############################################################################################################################################################
# _ ______ _ _ _______ _ _
# _ | | (_____ \ | | (_) (_______) (_) (_)
# _____ _ _ _| |_ | |__ ___ ____ _ _____) ) ___ __| | ____ _ ____ ___ _ _ _ _____ ___ _ ____ _
# (____ || | | |(_ _)| _ \ / _ \ / ___)(_) | __ / / _ \ / _ | / ___)| | / _ | / _ \ | ||_|| |(____ | /___)| || _ \ | |
# / ___ || |_| | | |_ | | | || |_| || | _ | | \ \ | |_| |( (_| || | | |( (_| || |_| | | | | |/ ___ ||___ || || | | || |
# \_____||____/ \__)|_| |_| \___/ |_| (_) |_| |_| \___/ \____||_| |_| \___ | \___/ |_| |_|\_____|(___/ |_||_| |_||_|
# (_____|
###############################################################################################################################################################
#
# Last updated on: 8/15/2024
#
###############################################################################################################################################################
# ------------------------------------------------------------------------------
# IMPORTS
# ------------------------------------------------------------------------------
import os
import subprocess
from typing import Tuple
import gradio as gr
from bs4 import BeautifulSoup as Soup
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint
from langchain_community.document_loaders import (AsyncHtmlLoader,
NewsURLLoader, PubMedLoader,
PlaywrightURLLoader,
RecursiveUrlLoader,
SeleniumURLLoader,
UnstructuredURLLoader,
WebBaseLoader)
# ------------------------------------------------------------------------------
# DEV ENVIRONMENT SETUP
# ------------------------------------------------------------------------------
# Load environment variables
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
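# Optional fail-fast check (a minimal sketch, assuming the token is supplied
# via .env or the host environment): the HF Inference API calls below need
# this token, so raise a clear error now instead of a 401 later.
if not HUGGINGFACEHUB_API_TOKEN:
    raise EnvironmentError(
        "HUGGINGFACEHUB_API_TOKEN is not set; add it to .env or the environment."
    )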
# Foundational Model and Embedding Model HF repo IDs
FM_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
EM_REPO_ID = "sentence-transformers/all-MiniLM-L6-v2"
# Initialize the model instances
llm_model_instance = HuggingFaceEndpoint(
    repo_id=FM_REPO_ID,
    max_new_tokens=8192,       # upper bound on tokens generated per call
    top_k=10,                  # sample only from the 10 most likely tokens
    top_p=0.95,                # nucleus-sampling probability mass
    typical_p=0.95,            # typical-decoding threshold
    temperature=0.1,           # low temperature for focused, stable output
    repetition_penalty=1.03,   # mildly discourage repeated text
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
api_key=HUGGINGFACEHUB_API_TOKEN,
model_name=EM_REPO_ID
)
graph_config = {
"llm": {"model_instance": llm_model_instance},
"embeddings": {"model_instance": embedder_model_instance}
}
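# Note: the "model_instance" keys pass these pre-built LangChain objects to
# ScrapeGraphAI directly, instead of letting it construct its own clients.
# This config shape assumes a scrapegraphai release that accepts model
# instances in the graph config.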
# ------------------------------------------------------------------------------
# THE BIG SCRAPER
# ------------------------------------------------------------------------------
def extractDataFromUrls(urls: str, loader_type: str) -> Tuple[list, list]:
"""Extracts data from provided URLs using specified loader type.
Args:
urls (str): Comma-separated URLs to extract data from.
loader_type (str): Type of loader to use for data extraction.
Returns:
tuple: A tuple containing the extracted data in JSON format and as a list of Document objects.
Returns error messages if an exception occurs.
"""
try:
        urls = [u.strip() for u in urls.split(',')]  # tolerate spaces after commas
        # Map the dropdown choice onto the matching LangChain loader.
if loader_type == 'AsyncHtmlLoader':
loader = AsyncHtmlLoader(urls)
elif loader_type == 'UnstructuredURL':
loader = UnstructuredURLLoader(urls=urls)
elif loader_type == 'RecursiveURL':
loader = RecursiveUrlLoader(
url=urls[0], max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
)
elif loader_type == 'SeleniumURL':
loader = SeleniumURLLoader(urls=urls)
elif loader_type == 'SeleniumURLH':
loader = SeleniumURLLoader(urls=urls, headless=False)
elif loader_type == 'PlaywrightURL':
loader = PlaywrightURLLoader(urls=urls)
elif loader_type == 'PubMed':
loader = PubMedLoader(urls[0])
elif loader_type == 'NewsURL':
loader = NewsURLLoader(urls)
elif loader_type == 'WebBaseLoader':
loader = WebBaseLoader(urls)
else:
return "Not Implemented. Development in Progress", "Work In Progress"
data = loader.load()
        jsonData = [item.to_json() for item in data]
        return jsonData, data
except Exception as err:
return "An Error Occurred. Contact Developer" + str(err), "Error Occured. Boom"
def scrapeAndSummarize(prompt: str, source: str) -> Tuple[dict, str]:
"""Scrapes website content based on the prompt and summarizes it.
Args:
prompt (str): The prompt to guide the scraping process.
source (str): The URL of the website to scrape.
Returns:
tuple: A tuple containing the scraped data as a dictionary and the execution information.
"""
smart_scraper_graph = SmartScraperGraph(
prompt=prompt,
source=source,
config=graph_config
)
result = smart_scraper_graph.run()
exec_info = smart_scraper_graph.get_execution_info()
return result, prettify_exec_info(exec_info)
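# Example invocation (placeholder prompt and URL; the Gradio tab below feeds
# in real values at runtime):
#   result, info = scrapeAndSummarize(
#       "List the page's main headlines.", "https://www.example.com/")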
# ------------------------------------------------------------------------------
# TABBED GRADIO UI
# ------------------------------------------------------------------------------
# Define choices for the dropdown menu
choices = [
'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
]
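# 'Scrapy', 'PySpider', and 'Beautiful Soup' are placeholders for future work;
# extractDataFromUrls currently answers them with its not-implemented message.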
# Create the Gradio interface with tabs
with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
gr.Markdown("# THE BIG SCRAPER")
with gr.Tabs():
# Tab 1: Data Extraction
with gr.TabItem("Data Extraction"):
gr.Markdown("## Extract data from URLs using various loaders")
with gr.Row():
url_input = gr.Textbox(label="Enter your comma separated URLs here")
loader_dropdown = gr.Dropdown(choices=choices, label="Pick your Loader from here")
extract_button = gr.Button("Extract Data")
with gr.Row():
extracted_data_json = gr.JSON(label="Extracted Data (JSON)")
extracted_data_text = gr.Textbox(label="Extracted Data (Text)")
extract_button.click(
extractDataFromUrls,
inputs=[url_input, loader_dropdown],
outputs=[extracted_data_json, extracted_data_text]
)
# Tab 2: Website Scraping and Summarization
with gr.TabItem("Scraping & Summarization"):
with gr.Row():
with gr.Column():
                    model_input = gr.Textbox(label="Model", value=FM_REPO_ID)  # display-only; the scraper always uses FM_REPO_ID
prompt_input = gr.Textbox(label="Prompt", value="List me all the press releases with their headlines and urls.")
source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/")
scrape_button = gr.Button("Scrape and Summarize")
with gr.Column():
result_output = gr.JSON(label="Result")
exec_info_output = gr.Textbox(label="Execution Info")
scrape_button.click(
scrapeAndSummarize,
inputs=[prompt_input, source_input],
outputs=[result_output, exec_info_output]
)
# Launch the Gradio interface
demo.launch()