Spaces:
Sleeping
Sleeping
File size: 7,693 Bytes
d03a652 8683afa d03a652 17a80d5 8683afa 06443d3 d03a652 8683afa 1f57449 8683afa 1f57449 8683afa d6f5474 d03a652 0068042 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
###############################################################################################################################################################
# _____ _ ___ _ ___
# |_ _|| |_ ___ | _ )(_) __ _ / __| __ _ _ __ _ _ __ ___ _ _
# | | | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_|
# |_| |_||_|\___| |___/|_|\__, | |___/\__||_| \__,_|| .__/\___||_|
# |___/ |_|
#
##############################################################################################################################################################
# _ ______ _ _ _______ _ _
# _ | | (_____ \ | | (_) (_______) (_) (_)
# _____ _ _ _| |_ | |__ ___ ____ _ _____) ) ___ __| | ____ _ ____ ___ _ _ _ _____ ___ _ ____ _
# (____ || | | |(_ _)| _ \ / _ \ / ___)(_) | __ / / _ \ / _ | / ___)| | / _ | / _ \ | ||_|| |(____ | /___)| || _ \ | |
# / ___ || |_| | | |_ | | | || |_| || | _ | | \ \ | |_| |( (_| || | | |( (_| || |_| | | | | |/ ___ ||___ || || | | || |
# \_____||____/ \__)|_| |_| \___/ |_| (_) |_| |_| \___/ \____||_| |_| \___ | \___/ |_| |_|\_____|(___/ |_||_| |_||_|
# (_____|
###############################################################################################################################################################
#
# Last updated in: 8/20/2024
#
###############################################################################################################################################################
# ------------------------------------------------------------------------------
# IMPORTS
# ------------------------------------------------------------------------------
import gradio as gr
from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders import (AsyncHtmlLoader,
NewsURLLoader, PubMedLoader,
PlaywrightURLLoader,
RecursiveUrlLoader,
SeleniumURLLoader,
UnstructuredURLLoader,
WebBaseLoader)
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from PIL import Image
from io import BytesIO
# ------------------------------------------------------------------------------
# THE BIG SCRAPER METHOD
# ------------------------------------------------------------------------------
def extractDataFromUrls(urls: str, loader_type: str):
    """Extract data from comma-separated URLs using the selected loader.

    Args:
        urls (str): Comma-separated URLs to extract data from.
        loader_type (str): Name of the loader to use; must match one of the
            implemented branches below.

    Returns:
        tuple: ``(json_data, documents, first_url)`` on success, where
        ``json_data`` is the documents serialized via ``to_json()``.
        On error, for empty input, or for unimplemented loaders a 3-tuple of
        status strings is returned, so the Gradio callback (which binds THREE
        outputs: json_output, text_output, extracted_url) always receives the
        number of values it expects.  The original code returned 2-tuples on
        the error/unimplemented paths, which broke the callback.
    """
    try:
        # Normalize input: split on commas and drop surrounding whitespace and
        # empty entries so " a, b ," behaves as expected.
        url_list = [u.strip() for u in urls.split(',') if u.strip()]
        if not url_list:
            return "No URLs provided.", "No URLs provided.", ""
        # Instantiate the selected loader based on loader_type
        if loader_type == 'AsyncHtmlLoader':
            loader = AsyncHtmlLoader(url_list)
        elif loader_type == 'UnstructuredURL':
            loader = UnstructuredURLLoader(urls=url_list)
        elif loader_type == 'RecursiveURL':
            # RecursiveUrlLoader crawls from a single root URL.
            loader = RecursiveUrlLoader(
                url=url_list[0], max_depth=2,
                extractor=lambda x: Soup(x, "html.parser").text
            )
        elif loader_type == 'SeleniumURL':
            loader = SeleniumURLLoader(urls=url_list)
        elif loader_type == 'SeleniumURLH':
            # Same as SeleniumURL but with a visible (non-headless) browser.
            loader = SeleniumURLLoader(urls=url_list, headless=False)
        elif loader_type == 'PlaywrightURL':
            loader = PlaywrightURLLoader(urls=url_list)
        elif loader_type == 'PubMed':
            # PubMedLoader takes a single query string, not a list.
            loader = PubMedLoader(url_list[0])
        elif loader_type == 'NewsURL':
            loader = NewsURLLoader(url_list)
        elif loader_type == 'WebBaseLoader':
            loader = WebBaseLoader(url_list)
        else:
            return "Not Implemented. Development in Progress", "Work In Progress", ""
        # Load data using the selected loader
        data = loader.load()
        # Convert each Document to its JSON representation for display.
        jsonData = [item.to_json() for item in data]
        return jsonData, data, url_list[0]
    except Exception as err:
        # Three values (not two) so Gradio can populate every bound output
        # instead of raising a secondary arity error.
        return "An Error Occurred. Contact Developer: " + str(err), "Error Occurred.", ""
# ------------------------------------------------------------------------------
# WEB DATA AND SCREENSHOT
# ------------------------------------------------------------------------------
def take_webdata(url):
    """Open *url* in headless Chrome and return a screenshot plus page title.

    Args:
        url (str): The URL to load.

    Returns:
        tuple: (PIL.Image of the page screenshot, page title string).
        On WebDriver failure, a 1x1 placeholder image and whatever title was
        captured so far (possibly empty) are returned instead.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    # Pre-initialize so the except/finally branches never touch unbound names:
    # previously a failure inside webdriver.Chrome() left `wd` undefined
    # (NameError in finally), and a failure before `wd.title` left
    # `page_title` undefined (UnboundLocalError in except).
    wd = None
    page_title = ""
    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)
        wd.get(url)
        wd.implicitly_wait(5)
        page_title = wd.title
        screenshot = wd.get_screenshot_as_png()
    except WebDriverException:
        # Return a tiny placeholder image so the Gradio Image output still
        # renders something instead of propagating the error.
        return Image.new('RGB', (1, 1)), page_title
    finally:
        # Always release the browser process, even on failure.
        if wd:
            wd.quit()
    return Image.open(BytesIO(screenshot)), page_title
# ------------------------------------------------------------------------------
# GRADIO
# ------------------------------------------------------------------------------
# Define choices for the dropdown menu.  Scrapy / PySpider / Beautiful Soup
# are placeholders: extractDataFromUrls returns "Not Implemented" for them.
choices = [
    'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
    'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
    'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
]
# Create the Gradio interface
with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
    # gr.State carries the first URL from the scrape callback into the
    # chained screenshot callback.  (The original code also created gr.State
    # placeholders for screenshot_output/title_output, but those were dead:
    # immediately overwritten by the real Image/Textbox components below.)
    extracted_url = gr.State()
    gr.Markdown("# The Big Scraper")
    with gr.Tab("Scraped"):
        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(label="Enter your comma separated URLs here")
                loader_dropdown = gr.Dropdown(choices=choices, label="Pick your Loader from here")
                btn = gr.Button("Extract Data")
            with gr.Column():
                screenshot_output = gr.Image(label="Screenshot")
                title_output = gr.Textbox(label="Page Title")
        json_output = gr.JSON(label="Extracted Data (JSON)")
        text_output = gr.Textbox(label="Extracted Data (Text)")
    # First scrape the URLs, then screenshot the first URL via the State.
    btn.click(extractDataFromUrls, inputs=[url_input, loader_dropdown], outputs=[json_output, text_output, extracted_url]) \
        .then(take_webdata, inputs=extracted_url, outputs=[screenshot_output, title_output], queue=True)
# Launch the Gradio interface
demo.launch(share=True)