################################################################################
#
#   THE BIG SCRAPER
#   author: rodrigomasini
#
#   Last updated: 8/20/2024
#
################################################################################

# ------------------------------------------------------------------------------
# IMPORTS
# ------------------------------------------------------------------------------
import gradio as gr
from bs4 import BeautifulSoup as Soup
from langchain_community.document_loaders import (AsyncHtmlLoader, NewsURLLoader,
                                                  PubMedLoader, PlaywrightURLLoader,
                                                  RecursiveUrlLoader, SeleniumURLLoader,
                                                  UnstructuredURLLoader, WebBaseLoader)
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from PIL import Image
from io import BytesIO

# ------------------------------------------------------------------------------
# THE BIG SCRAPER METHOD
# ------------------------------------------------------------------------------
def extractDataFromUrls(urls: str, loader_type: str):
    """Extracts data from the provided URLs using the specified loader type.

    Args:
        urls (str): Comma-separated URLs to extract data from.
        loader_type (str): Type of loader to use for data extraction.

    Returns:
        tuple: The extracted data in JSON format, the raw Document objects,
            and the first URL (passed on to the screenshot step).
            Returns error messages if an exception occurs.
    """
    try:
        urls = [url.strip() for url in urls.split(',')]

        # Instantiate the selected loader based on loader_type
        if loader_type == 'AsyncHtmlLoader':
            loader = AsyncHtmlLoader(urls)
        elif loader_type == 'UnstructuredURL':
            loader = UnstructuredURLLoader(urls=urls)
        elif loader_type == 'RecursiveURL':
            loader = RecursiveUrlLoader(
                url=urls[0],
                max_depth=2,
                extractor=lambda x: Soup(x, "html.parser").text
            )
        elif loader_type == 'SeleniumURL':
            loader = SeleniumURLLoader(urls=urls)
        elif loader_type == 'SeleniumURLH':
            loader = SeleniumURLLoader(urls=urls, headless=False)
        elif loader_type == 'PlaywrightURL':
            loader = PlaywrightURLLoader(urls=urls)
        elif loader_type == 'PubMed':
            loader = PubMedLoader(urls[0])
        elif loader_type == 'NewsURL':
            loader = NewsURLLoader(urls)
        elif loader_type == 'WebBaseLoader':
            loader = WebBaseLoader(urls)
        else:
            # Dropdown entries without a wired-up loader fall through to here;
            # return three values so the Gradio outputs stay consistent
            return "Not Implemented. Development in Progress", "Work In Progress", urls[0]

        # Load data using the selected loader
        data = loader.load()

        # Convert data to JSON format
        jsonData = [item.to_json() for item in data]

        return jsonData, data, urls[0]

    except Exception as err:
        # Return three values so the Gradio outputs stay consistent
        return "An Error Occurred. Contact Developer: " + str(err), "Error Occurred.", ""
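# ------------------------------------------------------------------------------
# SMOKE TEST (sketch)
# ------------------------------------------------------------------------------
# A minimal, optional helper for exercising the scraper outside the UI; the app
# never calls it. The default URL and loader name are illustrative assumptions,
# not values the app depends on.
def smoke_test(url: str = "https://example.com", loader_type: str = "WebBaseLoader"):
    """Run one extraction from a Python shell and print a short summary (sketch only)."""
    jsonData, docs, first_url = extractDataFromUrls(url, loader_type)
    print(f"Loader {loader_type!r} returned {len(docs)} document(s) from {first_url}")
    return jsonData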
Development in Progress", "Work In Progress" # Load data using the selected loader data = loader.load() # Convert data to JSON format jsonData = [] for item in data: jsonData.append(item.to_json()) return jsonData, data, urls[0] except Exception as err: return "An Error Occurred. Contact Developer" + str(err), "Error Occured. Boom" # ------------------------------------------------------------------------------ # WEB DATA AND SCREENSHOT # ------------------------------------------------------------------------------ def take_webdata(url): options = webdriver.ChromeOptions() options.add_argument('--headless') options.add_argument('--no-sandbox') options.add_argument('--disable-dev-shm-usage') try: wd = webdriver.Chrome(options=options) wd.set_window_size(1080, 720) wd.get(url) wd.implicitly_wait(5) page_title = wd.title screenshot = wd.get_screenshot_as_png() except WebDriverException as e: return Image.new('RGB', (1, 1)), page_title finally: if wd: wd.quit() return Image.open(BytesIO(screenshot)) , page_title # ------------------------------------------------------------------------------ # GRADIO # ------------------------------------------------------------------------------ # Define choices for the dropdown menu choices = [ 'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed', 'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup', 'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL', ] # Create the Gradio interface with gr.Blocks(theme="sudeepshouche/minimalist") as demo: extracted_url = gr.State() # Use gr.State() to store the URL screenshot_output = gr.State() title_output = gr.State() gr.Markdown("# The Big Scraper") with gr.Tab("Scraped"): with gr.Row(): with gr.Column(): url_input = gr.Textbox(label="Enter your comma separated URLs here") loader_dropdown = gr.Dropdown(choices=choices, label="Pick your Loader from here") btn = gr.Button("Extract Data") with gr.Column(): screenshot_output = gr.Image(label="Screenshot") title_output = gr.Textbox(label="Page Title") json_output = gr.JSON(label="Extracted Data (JSON)") text_output = gr.Textbox(label="Extracted Data (Text)") btn.click(extractDataFromUrls, inputs=[url_input, loader_dropdown], outputs=[json_output, text_output, extracted_url]) \ .then(take_webdata, inputs=extracted_url, outputs=[screenshot_output, title_output], queue=True) # Launch the Gradio interface demo.launch(share=True)