bharathmunakala committed on
Commit
5dbc77d
·
verified ·
1 Parent(s): 187bcf5

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +288 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,290 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import os
3
+ import gc
4
+ from firecrawl import FirecrawlApp
5
+ from dotenv import load_dotenv
6
+ import time
7
+ import pandas as pd
8
+ from typing import Dict, Any
9
+ import base64
10
+ from pydantic import BaseModel, Field
11
+ import inspect
12
+ from langchain_openai import ChatOpenAI
13
+ from langchain.schema import HumanMessage
14
+ from langchain.callbacks.base import BaseCallbackHandler
15
+ from langdetect import detect
16
+
17
+ # Page configuration
18
# Browser-tab title/icon and wide layout for the whole app.
_PAGE_SETTINGS = {
    "page_title": "Multilingual Website Data Extractor",
    "page_icon": "🌐",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)
23
+
24
+ # Define supported languages
25
# Display name -> language code for every selectable output language.
# Insertion order is the order shown in the language selector.
language_codes = {
    "English": "en",
    "Hindi": "hi",
    "Gujarati": "gu",
    "Bengali": "bn",
    "Tamil": "ta",
    "Telugu": "te",
    "Kannada": "kn",
    "Malayalam": "ml",
    "Punjabi": "pa",
    "Marathi": "mr",
    "Urdu": "ur",
    "Assamese": "as",
    "Odia": "or",
    "Sanskrit": "sa",
    "Korean": "ko",
    "Japanese": "ja",
    "Arabic": "ar",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Portuguese": "pt",
    "Russian": "ru",
    "Chinese": "zh",
    "Vietnamese": "vi",
    "Thai": "th",
    "Indonesian": "id",
    "Turkish": "tr",
    "Polish": "pl",
    "Ukrainian": "uk",
    "Dutch": "nl",
    "Italian": "it",
    "Greek": "el",
    "Hebrew": "he",
    "Persian": "fa",
    "Swedish": "sv",
    "Norwegian": "no",
    "Danish": "da",
    "Finnish": "fi",
    "Czech": "cs",
    "Hungarian": "hu",
    "Romanian": "ro",
    "Bulgarian": "bg",
    "Croatian": "hr",
    "Serbian": "sr",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Estonian": "et",
    "Latvian": "lv",
    "Lithuanian": "lt",
    "Malay": "ms",
    "Tagalog": "tl",
    "Swahili": "sw",
}

# Supported language names, in the same order as the mapping above.
languages = list(language_codes)
53
+
54
+ # Streaming callback handler
55
class StreamHandler(BaseCallbackHandler):
    """Callback handler that renders LLM tokens into a container as they arrive."""

    def __init__(self, container, initial_text=""):
        """Remember the target container and the text shown before any token."""
        self.container, self.text = container, initial_text
        self.run_id_ignore_token = None

    def on_llm_new_token(self, token: str, **kwargs):
        """Append *token* to the accumulated text and re-render it as markdown."""
        self.text = self.text + token
        self.container.markdown(self.text)
64
+
65
# Pull variables from a local .env file (if any) into the process environment,
# then read both API keys; each is None when its variable is unset.
load_dotenv()
firecrawl_api_key = os.environ.get("FIRECRAWL_API_KEY")
sutra_api_key = os.environ.get("SUTRA_API_KEY")
68
+
69
@st.cache_resource(max_entries=16)
def _build_firecrawl_app(api_key):
    """Create (and cache per API key) a Firecrawl client."""
    return FirecrawlApp(api_key=api_key)


def load_app():
    """Return a Firecrawl client for the API key currently in effect.

    The key entered in the sidebar (stored in ``st.session_state``) takes
    precedence; the module-level ``firecrawl_api_key`` is the fallback.

    The client is cached *per key* rather than once for the whole process.
    A zero-argument ``st.cache_resource`` function would keep returning the
    client built with the first key it ever saw, even after the key changes
    or when a different session supplies its own key.
    """
    api_key = st.session_state.get("firecrawl_api_key") or firecrawl_api_key
    return _build_firecrawl_app(api_key)
73
+
74
+ # Initialize the ChatOpenAI model
75
@st.cache_resource(max_entries=16)
def _build_chat_model(api_key):
    """Create (and cache per API key) the Sutra chat model client."""
    return ChatOpenAI(
        api_key=api_key,
        base_url="https://api.two.ai/v2",
        model="sutra-v2",
        temperature=0.7,
    )


def get_chat_model():
    """Return the Sutra chat model for the current session's API key.

    Raises:
        ValueError: if no SUTRA API key is stored in ``st.session_state``.

    The key is read *outside* the cached builder and passed in as an
    argument, so the cache is keyed by it. Reading session state inside a
    zero-argument ``st.cache_resource`` function would cache the first
    caller's model, and therefore their API key, for every later session
    and would ignore any key change.
    """
    api_key = st.session_state.get("sutra_api_key")
    if not api_key:
        raise ValueError("SUTRA API key is not set. Please enter your API key in the sidebar.")
    return _build_chat_model(api_key)
86
+
87
def translate_text(text: str, target_lang: str = "en") -> str:
    """Translate *text* into *target_lang* using the Sutra chat model.

    Args:
        text: The text to translate.
        target_lang: Target language, as passed by the caller (this file
            passes language codes such as ``"en"``).

    Returns:
        The model's translation with surrounding whitespace stripped. The
        original *text* is returned unchanged when it is empty or
        whitespace-only, or when the translation fails (the failure is
        reported with ``st.error``).
    """
    # Nothing to translate: skip the model round-trip, which would only
    # invite the model to invent content for an empty request.
    if not text or not text.strip():
        return text

    try:
        chat = get_chat_model()
        # Build the prompt line by line so no source-code indentation leaks
        # into what the model sees.
        prompt = "\n".join(
            [
                f"Translate the following text to {target_lang}.",
                "Important:",
                "1. Only provide the translation, no explanations",
                "2. Maintain the exact same format and structure",
                "3. If it's a table, keep the table format",
                "4. If it's a list, keep the list format",
                f"5. Ensure the output is strictly in {target_lang} language",
                "6. Do not include any other language in the response",
                f"7. If the text is already in {target_lang}, return it as is",
                "",
                f"Text to translate: {text}",
            ]
        )
        response = chat.invoke([HumanMessage(content=prompt)])
        return response.content.strip()
    except ValueError as ve:
        # Raised by get_chat_model() when the API key is missing; its message
        # is already user-facing, so show it without a prefix.
        st.error(str(ve))
        return text
    except Exception as e:
        # Best effort: surface the problem but fall back to the original text.
        st.error(f"Translation error: {str(e)}")
        return text
111
+
112
+ # Initialize session state
113
# Seed per-session state on the first run: an empty chat history and a URL
# list holding one blank entry, so one URL input is always rendered.
for _state_key, _initial in (("messages", []), ("urls", [""])):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial
117
+
118
def add_url():
    """Append a blank entry to the session's URL list (adds one input row)."""
    url_list = st.session_state["urls"]
    url_list.append("")
120
+
121
def remove_url(index):
    """Drop the URL at *index*, unless it is the only one left."""
    url_list = st.session_state["urls"]
    if len(url_list) <= 1:
        # Always keep at least one URL input on screen.
        return
    # NOTE(review): the URL inputs are keyed by position (url_{i}), so the
    # rows after *index* may keep showing their previous text -- verify.
    url_list.pop(index)
124
+
125
def reset_chat():
    """Clear the stored chat history and trigger a garbage-collection pass."""
    st.session_state["messages"] = []
    gc.collect()
128
+
129
def convert_to_table(data):
    """Render a dict or a list of dicts as a markdown table.

    Returns an empty string for falsy input and for any value that is
    neither a dict nor a list. A single dict is treated as a one-row table.
    """
    if not data or not isinstance(data, (dict, list)):
        return ""

    rows = [data] if isinstance(data, dict) else data
    return pd.DataFrame(rows).to_markdown(index=False)
143
+
144
def stream_text(text: str, delay: float = 0.001) -> Any:
    """Render *text* one character at a time to give a typing effect.

    Args:
        text: The markdown text to display.
        delay: Seconds to sleep after each character is shown.

    Returns:
        The ``st.empty()`` placeholder holding the rendered text, so the
        caller can update or clear it later. (The previous ``-> None``
        annotation contradicted this return value.)
    """
    placeholder = st.empty()
    displayed_text = ""

    for char in text:
        displayed_text += char
        # Re-render the whole prefix; the placeholder holds a single element.
        placeholder.markdown(displayed_text)
        time.sleep(delay)

    return placeholder
155
+
156
+ # Main content area
157
# Page title flanked by two logo images, rendered as raw HTML.
_SUTRA_LOGO_URL = "https://framerusercontent.com/images/9vH8BcjXKRcC5OrSfkohhSyDgX0.png"
_SEARCH_LOGO_URL = (
    "https://static.vecteezy.com/system/resources/previews/036/004/783/"
    "non_2x/website-logo-searching-illustration-free-png.png"
)
st.markdown(
    f'<h1><img src="{_SUTRA_LOGO_URL}" width="60" style="vertical-align: middle;"/>'
    f'Multilingual Chat via URLs <img src="{_SEARCH_LOGO_URL}" width="70" height="70" '
    'style="vertical-align: middle;"/></h1>',
    unsafe_allow_html=True,
)
161
+
162
+ # Sidebar
163
def _api_key_field(label, state_key):
    """Render a masked key input pre-filled from session state.

    A non-empty entry is written back to ``st.session_state[state_key]``.
    Returns whatever the input currently holds.
    """
    entered = st.text_input(
        label,
        value=st.session_state.get(state_key, ""),
        type="password",
        label_visibility="collapsed",
    )
    if entered:
        st.session_state[state_key] = entered
    return entered


# Sidebar: API keys, output language and the editable list of website URLs.
with st.sidebar:
    st.header("Configuration")

    # --- API keys ---
    st.markdown("### API Keys")
    st.markdown("**SUTRA API**")
    st.markdown("Get your free API key from [SUTRA API](https://www.two.ai/sutra/api)")
    sutra_api_key = _api_key_field("Enter your Sutra API Key:", "sutra_api_key")

    st.markdown("**Firecrawl API**")
    st.markdown("Get your API key from [Firecrawl](https://firecrawl.dev/)")
    firecrawl_api_key = _api_key_field("Enter your Firecrawl API Key:", "firecrawl_api_key")

    # --- Output language ---
    selected_language = st.selectbox("Select output language:", languages)

    # --- Website URLs: one row per URL, with add/remove buttons ---
    st.markdown("### Website URLs")
    url_count = len(st.session_state.urls)
    for row, current_url in enumerate(st.session_state.urls):
        input_col, button_col = st.columns([4, 1])
        with input_col:
            st.session_state.urls[row] = st.text_input(
                f"URL {row+1}",
                value=current_url,
                placeholder="https://example.com",
                key=f"url_{row}",
            )
        with button_col:
            # The add button appears only next to the last URL input.
            if row == url_count - 1:
                st.button("➕", key=f"add_{row}", on_click=add_url)
            # Remove buttons appear only while more than one URL exists.
            if url_count > 1:
                st.button("➖", key=f"remove_{row}", on_click=remove_url, args=(row,))
205
+
206
+ # Chat interface
207
from langdetect import LangDetectException


def _detect_language(text):
    """Return langdetect's language code for *text*, or None if undetectable.

    langdetect raises LangDetectException for text it cannot extract
    features from; that must not abort the whole request.
    """
    try:
        return detect(text)
    except LangDetectException:
        return None


def _primary_subtag(code):
    """Reduce a code such as 'zh-cn' to its primary language subtag ('zh')."""
    return code.split("-", 1)[0].lower()


# Replay the stored conversation so it survives Streamlit reruns.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Ask about the website in any language..."):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        # Ignore URL rows the user left blank.
        valid_urls = [url for url in st.session_state.urls if url.strip()]

        if not valid_urls:
            st.error("Please enter at least one website URL!")
        elif not st.session_state.get("sutra_api_key"):
            st.error("Please enter your SUTRA API key in the sidebar!")
        elif not st.session_state.get("firecrawl_api_key"):
            st.error("Please enter your Firecrawl API key in the sidebar!")
        else:
            try:
                with st.spinner("Processing your request..."):
                    target_lang = language_codes[selected_language]

                    # Extraction is prompted in English. When the language
                    # cannot be detected, the prompt is sent unchanged.
                    input_lang = _detect_language(prompt)
                    if input_lang is not None and input_lang != "en":
                        translated_prompt = translate_text(prompt, "en")
                    else:
                        translated_prompt = prompt

                    app = load_app()
                    extract_params = {"prompt": translated_prompt}

                    # Keep each result paired with the URL that produced it,
                    # so a failed URL cannot shift later results onto the
                    # wrong "Results from ..." label.
                    results = []
                    for url in valid_urls:
                        try:
                            # Firecrawl expects the URLs as a list.
                            data = app.extract([url], extract_params)
                        except Exception as e:
                            st.warning(f"Error processing URL {url}: {str(e)}")
                            continue
                        if isinstance(data, dict) and "data" in data:
                            data = data["data"]
                        results.append((url, data))

                    if not results:
                        st.error("No data could be extracted from any of the provided URLs.")
                    else:
                        if len(results) == 1:
                            response = str(results[0][1])
                        else:
                            response = "\n\n".join(
                                f"Results from {url}:\n{str(data)}" for url, data in results
                            )

                        if target_lang != "en":
                            st.info(f"Translating to {selected_language}...")
                            source_text = response
                            response = translate_text(source_text, target_lang)

                            # Compare primary subtags only: langdetect reports
                            # region variants such as "zh-cn", which never
                            # equal the bare target code "zh".
                            detected_lang = _detect_language(response)
                            if (
                                detected_lang is not None
                                and _primary_subtag(detected_lang) != target_lang
                            ):
                                st.warning(f"Detected language ({detected_lang}) doesn't match target language ({target_lang}). Retrying translation...")
                                # Retry from the untranslated text rather than
                                # re-translating the rejected output.
                                response = translate_text(source_text, target_lang)

                        st.markdown(response)
                        st.session_state.messages.append({"role": "assistant", "content": response})

            except Exception as e:
                # Top-level boundary for the request: report it and let the
                # user try again.
                st.error(f"An error occurred: {str(e)}")
                st.info("Please check your API keys and try again.")