# the-big-scraper / alternative.py
# rodrigomasini's picture
# Update alternative.py
# 3f9fa90 verified
###############################################################################################################################################################
# _____ _ ___ _ ___
# |_ _|| |_ ___ | _ )(_) __ _ / __| __ _ _ __ _ _ __ ___ _ _
# | | | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_|
# |_| |_||_|\___| |___/|_|\__, | |___/\__||_| \__,_|| .__/\___||_|
# |___/ |_|
#
##############################################################################################################################################################
# _ ______ _ _ _______ _ _
# _ | | (_____ \ | | (_) (_______) (_) (_)
# _____ _ _ _| |_ | |__ ___ ____ _ _____) ) ___ __| | ____ _ ____ ___ _ _ _ _____ ___ _ ____ _
# (____ || | | |(_ _)| _ \ / _ \ / ___)(_) | __ / / _ \ / _ | / ___)| | / _ | / _ \ | ||_|| |(____ | /___)| || _ \ | |
# / ___ || |_| | | |_ | | | || |_| || | _ | | \ \ | |_| |( (_| || | | |( (_| || |_| | | | | |/ ___ ||___ || || | | || |
# \_____||____/ \__)|_| |_| \___/ |_| (_) |_| |_| \___/ \____||_| |_| \___ | \___/ |_| |_|\_____|(___/ |_||_| |_||_|
# (_____|
###############################################################################################################################################################
#
# Last updated on: 8/15/2024
#
###############################################################################################################################################################
# ------------------------------------------------------------------------------
# IMPORTS
# ------------------------------------------------------------------------------
import os
import subprocess
from typing import Tuple
import gradio as gr
from bs4 import BeautifulSoup as Soup
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from langchain_community.embeddings.huggingface import HuggingFaceInferenceAPIEmbeddings
from langchain_huggingface.llms.huggingface_endpoint import HuggingFaceEndpoint
from langchain_community.document_loaders import (AsyncHtmlLoader,
NewsURLLoader, PubMedLoader,
PlaywrightURLLoader,
RecursiveUrlLoader,
SeleniumURLLoader,
UnstructuredURLLoader,
WebBaseLoader)
# ------------------------------------------------------------------------------
# DEV ENVIRONMENT SETUP
# ------------------------------------------------------------------------------
# Load environment variables, then read the Hugging Face API token from them
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.environ.get('HUGGINGFACEHUB_API_TOKEN')

# Foundational model and embedding model HF repo IDs
FM_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
EM_REPO_ID = "sentence-transformers/all-MiniLM-l6-v2"

# Text-generation endpoint used as the LLM of the scraping graph
llm_model_instance = HuggingFaceEndpoint(
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    repo_id=FM_REPO_ID,
    temperature=0.1,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    repetition_penalty=1.03,
    max_new_tokens=8192,
)

# Embedding client that shares the same API token
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    model_name=EM_REPO_ID,
    api_key=HUGGINGFACEHUB_API_TOKEN,
)

# Configuration handed to SmartScraperGraph: both entries are the
# pre-built model instances created above
graph_config = dict(
    llm=dict(model_instance=llm_model_instance),
    embeddings=dict(model_instance=embedder_model_instance),
)
# ------------------------------------------------------------------------------
# THE BIG SCRAPER
# ------------------------------------------------------------------------------
def extractDataFromUrls(urls: str, loader_type: str) -> Tuple[list, list]:
    """Extracts data from provided URLs using specified loader type.

    Args:
        urls (str): Comma-separated URLs to extract data from. Whitespace
            around each entry is ignored and empty entries are dropped.
            The 'RecursiveURL' and 'PubMed' loaders only use the first entry.
        loader_type (str): Type of loader to use for data extraction.

    Returns:
        tuple: On success, ``(json_data, documents)`` where ``documents`` is
            the list returned by the loader's ``load()`` and ``json_data``
            holds ``to_json()`` of each item.
            On failure, a pair of message strings instead of lists: for an
            unsupported loader type, for an input without any URL, or when
            an exception is raised while building or running the loader.
    """
    # Each factory receives the cleaned URL list. They are lambdas so that a
    # loader class is only looked up when that loader is actually selected.
    loader_factories = {
        'AsyncHtmlLoader': lambda url_list: AsyncHtmlLoader(url_list),
        'UnstructuredURL': lambda url_list: UnstructuredURLLoader(urls=url_list),
        'RecursiveURL': lambda url_list: RecursiveUrlLoader(
            url=url_list[0],
            max_depth=2,
            extractor=lambda html: Soup(html, "html.parser").text,
        ),
        'SeleniumURL': lambda url_list: SeleniumURLLoader(urls=url_list),
        'SeleniumURLH': lambda url_list: SeleniumURLLoader(urls=url_list, headless=False),
        'PlaywrightURL': lambda url_list: PlaywrightURLLoader(urls=url_list),
        'PubMed': lambda url_list: PubMedLoader(url_list[0]),
        'NewsURL': lambda url_list: NewsURLLoader(url_list),
        'WebBaseLoader': lambda url_list: WebBaseLoader(url_list),
    }
    try:
        factory = loader_factories.get(loader_type)
        if factory is None:
            return "Not Implemented. Development in Progress", "Work In Progress"

        # "a, b" must yield "b" rather than " b", and blank entries (empty
        # textbox, trailing comma) must never reach a loader.
        url_list = [part.strip() for part in (urls or "").split(',') if part.strip()]
        if not url_list:
            return "No URLs provided. Enter at least one URL.", "No URLs provided"

        documents = factory(url_list).load()
        return [document.to_json() for document in documents], documents
    except Exception as err:
        # UI boundary: report the failure as text instead of raising.
        return "An Error Occurred. Contact Developer" + str(err), "Error Occured. Boom"
def scrapeAndSummarize(prompt: str, source: str) -> Tuple[dict, str]:
    """Run a SmartScraperGraph on one source and report on the run.

    Args:
        prompt (str): Prompt handed to the graph to guide the scraping.
        source (str): Source handed to the graph; the UI supplies a website URL.

    Returns:
        tuple: The value returned by the graph's ``run()`` and the graph's
            execution info formatted by ``prettify_exec_info``.
    """
    graph = SmartScraperGraph(prompt=prompt, source=source, config=graph_config)
    # The execution info is read only after the run has finished.
    scraped = graph.run()
    return scraped, prettify_exec_info(graph.get_execution_info())
# ------------------------------------------------------------------------------
# TABBED GRADIO UI
# ------------------------------------------------------------------------------
# Loader names offered in the dropdown. Entries that extractDataFromUrls does
# not handle ('Scrapy', 'PySpider', 'Beautiful Soup') yield its
# "Not Implemented" message.
choices = [
    'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
    'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
    'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
]

# Create the Gradio interface with tabs
with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
    gr.Markdown("# THE BIG SCRAPER")
    with gr.Tabs():
        # Tab 1: Data Extraction
        with gr.TabItem("Data Extraction"):
            gr.Markdown("## Extract data from URLs using various loaders")
            with gr.Row():
                url_input = gr.Textbox(label="Enter your comma separated URLs here")
                loader_dropdown = gr.Dropdown(choices=choices, label="Pick your Loader from here")
            extract_button = gr.Button("Extract Data")
            with gr.Row():
                extracted_data_json = gr.JSON(label="Extracted Data (JSON)")
                extracted_data_text = gr.Textbox(label="Extracted Data (Text)")
            # NOTE(review): on failure extractDataFromUrls returns plain
            # message strings; confirm gr.JSON renders a non-JSON string
            # as intended.
            extract_button.click(
                extractDataFromUrls,
                inputs=[url_input, loader_dropdown],
                outputs=[extracted_data_json, extracted_data_text],
            )

        # Tab 2: Website Scraping and Summarization
        with gr.TabItem("Scraping & Summarization"):
            with gr.Row():
                with gr.Column():
                    # Display only: this field is not an input of the click
                    # handler, so it shows the configured model (derived from
                    # FM_REPO_ID to stay in sync) and is read-only.
                    model_dropdown = gr.Textbox(
                        label="Model",
                        value=FM_REPO_ID.split("/")[-1],
                        interactive=False,
                    )
                    prompt_input = gr.Textbox(label="Prompt", value="List me all the press releases with their headlines and urls.")
                    source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/")
                    scrape_button = gr.Button("Scrape and Summarize")
                with gr.Column():
                    result_output = gr.JSON(label="Result")
                    exec_info_output = gr.Textbox(label="Execution Info")
            scrape_button.click(
                scrapeAndSummarize,
                inputs=[prompt_input, source_input],
                outputs=[result_output, exec_info_output],
            )

# Launch the Gradio interface
demo.launch()