rodrigomasini committed on
Commit d03a652 · verified · 1 Parent(s): e744ba8

Create alternative-2.py

Files changed (1)
  1. alternative-2.py  +166 -0
alternative-2.py ADDED
@@ -0,0 +1,166 @@
+ ###############################################################################################################################################################
+ #
+ #   The Big Scraper
+ #
+ ###############################################################################################################################################################
+ #
+ #   author: Rodrigo Masini
+ #
+ ###############################################################################################################################################################
+ #
+ # Last updated on: 8/20/2024
+ #
+ ###############################################################################################################################################################
+
+ # ------------------------------------------------------------------------------
+ # IMPORTS
+ # ------------------------------------------------------------------------------
+ import gradio as gr
+ from bs4 import BeautifulSoup as Soup
+ from langchain_community.document_loaders import (AsyncHtmlLoader,
+                                                   NewsURLLoader, PubMedLoader,
+                                                   PlaywrightURLLoader,
+                                                   RecursiveUrlLoader,
+                                                   SeleniumURLLoader,
+                                                   UnstructuredURLLoader,
+                                                   WebBaseLoader)
+ from selenium import webdriver
+ from selenium.common.exceptions import WebDriverException
+ from PIL import Image
+ from io import BytesIO
+
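+ # NOTE: the loader choices below pull in optional runtime dependencies that are not
+ # imported here -- most likely beautifulsoup4, selenium plus a Chrome/Chromium binary
+ # and matching driver, playwright (with its browsers installed via `playwright install`),
+ # and the parsers wrapped by NewsURLLoader and PubMedLoader. The exact package set is
+ # an assumption and should be pinned in the Space's requirements.txt.
+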
+ # ------------------------------------------------------------------------------
+ # THE BIG SCRAPER METHOD
+ # ------------------------------------------------------------------------------
+
+ def extractDataFromUrls(urls: str, loader_type: str):
+     """Extracts data from the provided URLs using the specified loader type.
+
+     Args:
+         urls (str): Comma-separated URLs to extract data from.
+         loader_type (str): Type of loader to use for data extraction.
+
+     Returns:
+         tuple: A tuple containing the extracted data in JSON format and as a list
+                of Document objects. Returns error messages if an exception occurs.
+     """
+     try:
+         # Split the comma-separated input and strip stray whitespace around each URL
+         urls = [url.strip() for url in urls.split(',')]
+         data = []
+
+         # Instantiate the selected loader based on loader_type
+         if loader_type == 'AsyncHtmlLoader':
+             loader = AsyncHtmlLoader(urls)
+
+         elif loader_type == 'UnstructuredURL':
+             loader = UnstructuredURLLoader(urls=urls)
+
+         elif loader_type == 'RecursiveURL':
+             loader = RecursiveUrlLoader(
+                 url=urls[0], max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
+             )
+
+         elif loader_type == 'SeleniumURL':
+             loader = SeleniumURLLoader(urls=urls)
+
+         elif loader_type == 'SeleniumURLH':
+             # Same loader, but with a visible (non-headless) browser window
+             loader = SeleniumURLLoader(urls=urls, headless=False)
+
+         elif loader_type == 'PlaywrightURL':
+             loader = PlaywrightURLLoader(urls=urls)
+
+         elif loader_type == 'PubMed':
+             # PubMedLoader takes a single query string, so only the first entry is used
+             loader = PubMedLoader(urls[0])
+
+         elif loader_type == 'NewsURL':
+             loader = NewsURLLoader(urls)
+
+         elif loader_type == 'WebBaseLoader':
+             loader = WebBaseLoader(urls)
+
+         else:
+             return "Not Implemented. Development in Progress", "Work In Progress"
+
+         # Load data using the selected loader
+         data = loader.load()
+
+         # Convert the loaded Documents to JSON format
+         jsonData = []
+
+         for item in data:
+             jsonData.append(item.to_json())
+
+         return jsonData, data
+
+     except Exception as err:
+         return "An Error Occurred. Contact Developer: " + str(err), "Error Occurred"
+
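+ # Illustrative usage (hypothetical URLs, not part of the app flow):
+ #   json_docs, docs = extractDataFromUrls("https://example.com,https://example.org", "WebBaseLoader")
+ #   -> json_docs is a list of Document.to_json() dicts, docs the raw Document objects.
+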
+ # ------------------------------------------------------------------------------
+ # WEB DATA AND SCREENSHOT
+ # ------------------------------------------------------------------------------
+ def take_webdata(url):
+     """Loads the URL in headless Chrome and returns a screenshot plus the page title."""
+     options = webdriver.ChromeOptions()
+     options.add_argument('--headless')
+     options.add_argument('--no-sandbox')
+     options.add_argument('--disable-dev-shm-usage')
+
+     # Initialize upfront so the except/finally branches never hit unbound names
+     wd = None
+     page_title = ""
+     try:
+         wd = webdriver.Chrome(options=options)
+         wd.set_window_size(1080, 720)
+         wd.get(url)
+         wd.implicitly_wait(5)
+         page_title = wd.title
+         screenshot = wd.get_screenshot_as_png()
+
+     except WebDriverException:
+         # On failure, return a 1x1 placeholder image and whatever title was captured
+         return Image.new('RGB', (1, 1)), page_title
+     finally:
+         if wd:
+             wd.quit()
+
+     return Image.open(BytesIO(screenshot)), page_title
+
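+ # Illustrative usage (hypothetical URL; assumes Chrome and a matching driver are installed):
+ #   screenshot_img, title = take_webdata("https://example.com")
+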
+ # ------------------------------------------------------------------------------
+ # GRADIO
+ # ------------------------------------------------------------------------------
+
+ # Define choices for the dropdown menu
+ # (Scrapy, PySpider and Beautiful Soup are not implemented yet and fall through
+ #  to the "Not Implemented" branch of extractDataFromUrls)
+ choices = [
+     'AsyncHtmlLoader', 'UnstructuredURL', 'RecursiveURL', 'PubMed',
+     'WebBaseLoader', 'Scrapy', 'PySpider', 'Beautiful Soup',
+     'SeleniumURL', 'SeleniumURLH', 'PlaywrightURL', 'NewsURL',
+ ]
+
+ # Create the Gradio interface
+ # (allow_flagging is an Interface-only option, so it is not passed to Blocks)
+ with gr.Blocks(theme="sudeepshouche/minimalist") as demo:
+     gr.Markdown("# The Big Scraper")
+     with gr.Tab("Scraped"):
+         with gr.Row():
+             url_input = gr.Textbox(label="Enter your comma-separated URLs here")
+             loader_dropdown = gr.Dropdown(choices=choices, label="Pick your Loader from here")
+         with gr.Row():
+             json_output = gr.JSON(label="Extracted Data (JSON)")
+             text_output = gr.Textbox(label="Extracted Data (Text)")
+
+         btn = gr.Button("Extract Data")
+         btn.click(extractDataFromUrls, inputs=[url_input, loader_dropdown], outputs=[json_output, text_output])
+
+     with gr.Tab("Images"):
+         with gr.Row():
+             image_url_input = gr.Textbox(label="Enter URL for Screenshot:")
+         with gr.Row():
+             screenshot_output = gr.Image(label="Screenshot")
+             title_output = gr.Textbox(label="Page Title")
+
+         btn2 = gr.Button("Take Screenshot")
+         btn2.click(take_webdata, inputs=image_url_input, outputs=[screenshot_output, title_output])
+
+ # Launch the Gradio interface
+ demo.launch()