###############################################################################################################################################################
# _____ _ ___ _ ___
# |_ _|| |_ ___ | _ )(_) __ _ / __| __ _ _ __ _ _ __ ___ _ _
# | | | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_|
# |_| |_||_|\___| |___/|_|\__, | |___/\__||_| \__,_|| .__/\___||_|
# |___/ |_|
#
##############################################################################################################################################################
# _ ______ _ _ _______ _ _
# _ | | (_____ \ | | (_) (_______) (_) (_)
# _____ _ _ _| |_ | |__ ___ ____ _ _____) ) ___ __| | ____ _ ____ ___ _ _ _ _____ ___ _ ____ _
# (____ || | | |(_ _)| _ \ / _ \ / ___)(_) | __ / / _ \ / _ | / ___)| | / _ | / _ \ | ||_|| |(____ | /___)| || _ \ | |
# / ___ || |_| | | |_ | | | || |_| || | _ | | \ \ | |_| |( (_| || | | |( (_| || |_| | | | | |/ ___ ||___ || || | | || |
# \_____||____/ \__)|_| |_| \___/ |_| (_) |_| |_| \___/ \____||_| |_| \___ | \___/ |_| |_|\_____|(___/ |_||_| |_||_|
# (_____|
###############################################################################################################################################################
#
# Last updated: 8/15/2024
#
###############################################################################################################################################################
# ------------------------------------------------------------------------------
# IMPORTS
# ------------------------------------------------------------------------------
import os
from typing import Tuple
import gradio as gr
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders import (AsyncHtmlLoader,
NewsURLLoader, PubMedLoader,
PlaywrightURLLoader,
RecursiveUrlLoader,
SeleniumURLLoader,
UnstructuredURLLoader,
WebBaseLoader)
# ------------------------------------------------------------------------------
# DEV ENVIRONMENT SETUP
# ------------------------------------------------------------------------------
# Load environment variables (reads a local .env file if present)
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
# Initialize the model instances
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
llm_model_instance = HuggingFaceEndpoint(
    repo_id=repo_id, max_length=8192, temperature=0.5,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
)
embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-L6-v2"
)
graph_config = {
"llm": {"model_instance": llm_model_instance},
"embeddings": {"model_instance": embedder_model_instance}
}
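# A minimal, optional sanity check (a hedged sketch, not part of the app flow):
# `invoke` is the standard LangChain entry point for LLMs, so this should return
# a short completion if the token and endpoint are valid. Uncomment to try it.
# print(llm_model_instance.invoke("Reply with one word: ready"))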
# ------------------------------------------------------------------------------
# THE BIG SCRAPER
# ------------------------------------------------------------------------------
def extractDataFromUrls(urls: str, loader_type: str) -> Tuple[list, list]:
"""Extracts data from provided URLs using specified loader type.
Args:
urls (str): Comma-separated URLs to extract data from.
loader_type (str): Type of loader to use for data extraction.
Returns:
tuple: A tuple containing the extracted data in JSON format and as a list of Document objects.
Returns error messages if an exception occurs.
"""
try:
        # Split on commas and strip stray whitespace around each URL
        urls = [url.strip() for url in urls.split(',')]
data = []
if loader_type == 'AsyncHtmlLoader':
loader = AsyncHtmlLoader(urls)
elif loader_type == 'UnstructuredURL':
loader = UnstructuredURLLoader(urls=urls)
        elif loader_type == 'RecursiveURL':
            # Crawls from the first URL only, two levels deep, extracting plain text
            loader = RecursiveUrlLoader(
                url=urls[0], max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
            )
elif loader_type == 'SeleniumURL':
loader = SeleniumURLLoader(urls=urls)
        elif loader_type == 'SeleniumURLH':
            # Same as SeleniumURL, but with a visible (non-headless) browser window
            loader = SeleniumURLLoader(urls=urls, headless=False)
elif loader_type == 'PlaywrightURL':
loader = PlaywrightURLLoader(urls=urls)
        elif loader_type == 'PubMed':
            # PubMedLoader treats its argument as a search query rather than a URL
            loader = PubMedLoader(urls[0])
elif loader_type == 'NewsURL':
loader = NewsURLLoader(urls)
elif loader_type == 'WebBaseLoader':
loader = WebBaseLoader(urls)
        else:
            return "Not implemented yet. Development in progress.", "Work in progress"
        data = loader.load()
        jsonData = [item.to_json() for item in data]
        return jsonData, data
    except Exception as err:
        return "An error occurred. Contact the developer: " + str(err), "Error: " + str(err)
def scrapeAndSummarize(prompt: str, source: str) -> Tuple[dict, str]:
"""Scrapes website content based on the prompt and summarizes it.
Args:
prompt (str): The prompt to guide the scraping process.
source (str): The URL of the website to scrape.
Returns:
tuple: A tuple containing the scraped data as a dictionary and the execution information.
"""
smart_scraper_graph = SmartScraperGraph(
prompt=prompt,
source=source,
config=graph_config
)
result = smart_scraper_graph.run()
exec_info = smart_scraper_graph.get_execution_info()
return result, prettify_exec_info(exec_info)
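# Example usage (a hedged sketch; prompt and URL are illustrative only):
# result, exec_info = scrapeAndSummarize(
#     "List the main headlines on this page.", "https://example.com"
# )
# print(result)
# print(exec_info)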
# ------------------------------------------------------------------------------
# TABBED GRADIO UI
# ------------------------------------------------------------------------------
# Define choices for the dropdown menu.
# Note: 'Scrapy', 'PySpider', and 'Beautiful Soup' are placeholders;
# extractDataFromUrls returns a work-in-progress message for them.
choices = [
    'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
    'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
    'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
]
# Create the Gradio interface with tabs
with gr.Blocks() as demo:
gr.Markdown("# Web Scraping and Summarization")
with gr.Tabs():
# Tab 1: Data Extraction
with gr.TabItem("Data Extraction"):
gr.Markdown("## Extract data from URLs using various loaders")
with gr.Row():
                url_input = gr.Textbox(label="Enter comma-separated URLs here")
                loader_dropdown = gr.Dropdown(choices=choices, label="Choose a loader")
extract_button = gr.Button("Extract Data")
with gr.Row():
extracted_data_json = gr.JSON(label="Extracted Data (JSON)")
extracted_data_text = gr.Textbox(label="Extracted Data (Text)")
extract_button.click(
extractDataFromUrls,
inputs=[url_input, loader_dropdown],
outputs=[extracted_data_json, extracted_data_text]
)
# Tab 2: Website Scraping and Summarization
with gr.TabItem("Website Scraping & Summarization"):
gr.Markdown("# Scrape websites, no-code version")
gr.Markdown("""Easily scrape and summarize web content using advanced AI models on the Hugging Face Hub without writing any code. Input your desired prompt and source URL to get started.
This is a no-code version of the excellent lib [ScrapeGraphAI](https://github.com/VinciGit00/Scrapegraph-ai).
It's a basic demo and a work in progress. Please contribute to it to make it more useful!""")
with gr.Row():
with gr.Column():
                    # Display-only field; the model is fixed in graph_config above
                    model_display = gr.Textbox(label="Model", value="Mistral-7B-Instruct-v0.2")
                    prompt_input = gr.Textbox(label="Prompt", value="List all the press releases with their headlines and URLs.")
source_input = gr.Textbox(label="Source URL", value="https://www.whitehouse.gov/")
scrape_button = gr.Button("Scrape and Summarize")
with gr.Column():
result_output = gr.JSON(label="Result")
exec_info_output = gr.Textbox(label="Execution Info")
scrape_button.click(
scrapeAndSummarize,
inputs=[prompt_input, source_input],
outputs=[result_output, exec_info_output]
)
# Launch the Gradio interface
demo.launch()
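# A hedged local-run note: launch() defaults suit Hugging Face Spaces; when
# running locally, Gradio's standard share flag creates a temporary public link:
# demo.launch(share=True)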