rodrigomasini committed on
Commit
abdffe5
·
verified ·
1 Parent(s): f3f9165

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -0
app.py CHANGED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ###############################################################################################################################################################
2
+ # _____ _ ___ _ ___
3
+ # |_ _|| |_ ___ | _ )(_) __ _ / __| __ _ _ __ _ _ __ ___ _ _
4
+ # | | | ' \ / -_) | _ \| |/ _` | \__ \/ _|| '_|/ _` || '_ \/ -_)| '_|
5
+ # |_| |_||_|\___| |___/|_|\__, | |___/\__||_| \__,_|| .__/\___||_|
6
+ # |___/ |_|
7
+ #
8
+ ##############################################################################################################################################################
9
+ # _ ______ _ _ _______ _ _
10
+ # _ | | (_____ \ | | (_) (_______) (_) (_)
11
+ # _____ _ _ _| |_ | |__ ___ ____ _ _____) ) ___ __| | ____ _ ____ ___ _ _ _ _____ ___ _ ____ _
12
+ # (____ || | | |(_ _)| _ \ / _ \ / ___)(_) | __ / / _ \ / _ | / ___)| | / _ | / _ \ | ||_|| |(____ | /___)| || _ \ | |
13
+ # / ___ || |_| | | |_ | | | || |_| || | _ | | \ \ | |_| |( (_| || | | |( (_| || |_| | | | | |/ ___ ||___ || || | | || |
14
+ # \_____||____/ \__)|_| |_| \___/ |_| (_) |_| |_| \___/ \____||_| |_| \___ | \___/ |_| |_|\_____|(___/ |_||_| |_||_|
15
+ # (_____|
16
+ ###############################################################################################################################################################
17
+ #
18
+ # Last updated on: 8/15/2024
19
+ #
20
+ ###############################################################################################################################################################
21
+
22
+ # ------------------------------------------------------------------------------
23
+ # IMPORTS
24
+ # ------------------------------------------------------------------------------
25
+ import gradio as gr
26
+ from bs4 import BeautifulSoup as Soup
27
+ from langchain_community.document_loaders import (AsyncHtmlLoader,
28
+ NewsURLLoader, PubMedLoader,
29
+ PlaywrightURLLoader,
30
+ RecursiveUrlLoader,
31
+ SeleniumURLLoader,
32
+ UnstructuredURLLoader,
33
+ WebBaseLoader)
34
+
35
+
36
+ # ------------------------------------------------------------------------------
37
+ # THE BIG SCRAPER METHOD
38
+ # ------------------------------------------------------------------------------
39
+
40
def extractDataFromUrls(urls: str, loader_type: str):
    """Extract documents from one or more URLs with the selected loader.

    Args:
        urls (str): Comma-separated URLs. Whitespace around each entry is
            stripped and empty entries are ignored.
        loader_type (str): Name of the loader to use, as offered in the
            UI dropdown.

    Returns:
        tuple: ``(json_data, documents)`` on success, where ``documents`` is
            whatever the loader's ``load()`` returned and ``json_data`` is a
            list holding ``to_json()`` of each loaded item. On any failure a
            pair of human-readable message strings is returned instead; this
            function does not raise.
    """
    # Every factory is a lambda so that a loader class is only looked up when
    # its type is actually requested; unknown types never touch a loader.
    loader_factories = {
        'AsyncHtmlLoader': lambda targets: AsyncHtmlLoader(targets),
        'UnstructuredURL': lambda targets: UnstructuredURLLoader(urls=targets),
        # Only the first URL is used; it is the starting point of the crawl.
        'RecursiveURL': lambda targets: RecursiveUrlLoader(
            url=targets[0],
            max_depth=2,
            extractor=lambda html: Soup(html, "html.parser").text,
        ),
        'SeleniumURL': lambda targets: SeleniumURLLoader(urls=targets),
        'SeleniumURLH': lambda targets: SeleniumURLLoader(
            urls=targets, headless=False
        ),
        'PlaywrightURL': lambda targets: PlaywrightURLLoader(urls=targets),
        # Only the first entry is passed to the PubMed loader.
        'PubMed': lambda targets: PubMedLoader(targets[0]),
        'NewsURL': lambda targets: NewsURLLoader(targets),
        'WebBaseLoader': lambda targets: WebBaseLoader(targets),
    }

    # This is the UI boundary: any failure is reported as text rather than
    # propagated, so the interface always receives two displayable values.
    try:
        build_loader = loader_factories.get(loader_type)
        if build_loader is None:
            return "Not Implemented. Development in Progress", "Work In Progress"

        # "a.com, b.com" must not yield " b.com"; blank entries are dropped.
        stripped = (part.strip() for part in (urls or "").split(','))
        url_list = [entry for entry in stripped if entry]
        if not url_list:
            return "No URLs provided. Enter at least one URL.", "No URLs provided"

        documents = build_loader(url_list).load()
        json_data = [document.to_json() for document in documents]
        return json_data, documents

    except Exception as err:
        return f"An Error Occurred. Contact Developer: {err}", "Error Occured. Boom"
101
+
102
+
103
+ # ------------------------------------------------------------------------------
104
+ # GRADIO
105
+ # ------------------------------------------------------------------------------
106
+
107
# Loader names offered in the dropdown, in display order. Entries that
# extractDataFromUrls has no branch for (e.g. 'Scrapy', 'PySpider',
# 'Beautiful Soup') produce its "Not Implemented" response.
choices = [
    'AsyncHtmlLoader',
    'UnstructuredURL',
    'RecursiveURL',
    'PubMed',
    'WebBaseLoader',
    'Scrapy',
    'PySpider',
    'Beautiful Soup',
    'SeleniumURL',
    'SeleniumURLH',
    'PlaywrightURL',
    'NewsURL',
]

# Input widgets, listed in the positional order of extractDataFromUrls'
# parameters (urls, loader_type).
url_box = gr.Textbox(label="Enter your comma separated URLs here")
loader_picker = gr.Dropdown(choices=choices, label="Pick your Loader from here")

# Wire the scraper function to the UI: two inputs in, a JSON view and a
# text view out.
demo = gr.Interface(
    fn=extractDataFromUrls,
    inputs=[url_box, loader_picker],
    outputs=["json", "textbox"],
    allow_flagging='never',
    theme="sudeepshouche/minimalist",
)

# Start serving the interface.
demo.launch()