Update src/streamlit_app.py
Browse files- src/streamlit_app.py +288 -38
src/streamlit_app.py
CHANGED
@@ -1,40 +1,290 @@
|
|
1 |
-
import altair as alt
|
2 |
-
import numpy as np
|
3 |
-
import pandas as pd
|
4 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
-
""
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
import os
|
3 |
+
import gc
|
4 |
+
from firecrawl import FirecrawlApp
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
import time
|
7 |
+
import pandas as pd
|
8 |
+
from typing import Dict, Any
|
9 |
+
import base64
|
10 |
+
from pydantic import BaseModel, Field
|
11 |
+
import inspect
|
12 |
+
from langchain_openai import ChatOpenAI
|
13 |
+
from langchain.schema import HumanMessage
|
14 |
+
from langchain.callbacks.base import BaseCallbackHandler
|
15 |
+
from langdetect import detect
|
16 |
+
|
17 |
+
# Page configuration: browser-tab title and icon, plus the wide page layout.
st.set_page_config(layout="wide", page_icon="🌐", page_title="Multilingual Website Data Extractor")
|
23 |
+
|
24 |
+
# Language code mapping: display name -> short language code.
# Insertion order is the order in which the names are offered to the user.
language_codes = {
    "English": "en",
    "Hindi": "hi",
    "Gujarati": "gu",
    "Bengali": "bn",
    "Tamil": "ta",
    "Telugu": "te",
    "Kannada": "kn",
    "Malayalam": "ml",
    "Punjabi": "pa",
    "Marathi": "mr",
    "Urdu": "ur",
    "Assamese": "as",
    "Odia": "or",
    "Sanskrit": "sa",
    "Korean": "ko",
    "Japanese": "ja",
    "Arabic": "ar",
    "French": "fr",
    "German": "de",
    "Spanish": "es",
    "Portuguese": "pt",
    "Russian": "ru",
    "Chinese": "zh",
    "Vietnamese": "vi",
    "Thai": "th",
    "Indonesian": "id",
    "Turkish": "tr",
    "Polish": "pl",
    "Ukrainian": "uk",
    "Dutch": "nl",
    "Italian": "it",
    "Greek": "el",
    "Hebrew": "he",
    "Persian": "fa",
    "Swedish": "sv",
    "Norwegian": "no",
    "Danish": "da",
    "Finnish": "fi",
    "Czech": "cs",
    "Hungarian": "hu",
    "Romanian": "ro",
    "Bulgarian": "bg",
    "Croatian": "hr",
    "Serbian": "sr",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Estonian": "et",
    "Latvian": "lv",
    "Lithuanian": "lt",
    "Malay": "ms",
    "Tagalog": "tl",
    "Swahili": "sw",
}

# Supported languages: the display names, in the same order as the mapping.
languages = list(language_codes)
|
53 |
+
|
54 |
+
# Streaming callback handler
class StreamHandler(BaseCallbackHandler):
    """Callback handler that mirrors streamed LLM tokens into a container.

    Every new token is appended to the accumulated text and the whole text is
    re-rendered as markdown in ``container``.
    """

    def __init__(self, container, initial_text=""):
        """Remember the target container and the text to start from."""
        self.run_id_ignore_token = None
        self.text = initial_text
        self.container = container

    def on_llm_new_token(self, token: str, **kwargs):
        """Append *token* to the running text and redraw it."""
        self.text = self.text + token
        self.container.markdown(self.text)
|
64 |
+
|
65 |
+
# Read the API keys from the environment (after loading any .env file).
load_dotenv()
firecrawl_api_key, sutra_api_key = (
    os.environ.get(variable) for variable in ("FIRECRAWL_API_KEY", "SUTRA_API_KEY")
)
|
68 |
+
|
69 |
+
@st.cache_resource
def _build_firecrawl_app(api_key):
    """Create (and cache, one per distinct API key) a Firecrawl client."""
    return FirecrawlApp(api_key=api_key)


def load_app():
    """Return a Firecrawl client for the API key currently in effect.

    The key stored in ``st.session_state["firecrawl_api_key"]`` is preferred;
    when it is missing or empty the module-level ``firecrawl_api_key`` is used.

    The client is cached per key rather than once for the whole process, so a
    key entered later (or by a different session) is never served a client
    that was built with somebody else's key.
    """
    api_key = st.session_state.get("firecrawl_api_key") or firecrawl_api_key
    return _build_firecrawl_app(api_key)
|
73 |
+
|
74 |
+
# Initialize the ChatOpenAI model
@st.cache_resource
def _build_chat_model(api_key):
    """Create (and cache, one per distinct API key) the Sutra chat model."""
    return ChatOpenAI(
        api_key=api_key,
        base_url="https://api.two.ai/v2",
        model="sutra-v2",
        temperature=0.7,
    )


def get_chat_model():
    """Return the chat model for the SUTRA key stored in the session.

    Returns:
        A ``ChatOpenAI`` instance configured for the ``sutra-v2`` model.

    Raises:
        ValueError: If no SUTRA API key is present in ``st.session_state``.

    The key is read on every call and the model is cached per key, so changing
    the key in the sidebar takes effect immediately and one session's key is
    never reused for another session.
    """
    api_key = st.session_state.get("sutra_api_key")
    if not api_key:
        raise ValueError("SUTRA API key is not set. Please enter your API key in the sidebar.")

    return _build_chat_model(api_key)
|
86 |
+
|
87 |
+
def translate_text(text: str, target_lang: str = "en") -> str:
    """Translate text to target language using Sutra model.

    Args:
        text: The text to translate.
        target_lang: Language (code or name) the model is asked to produce.

    Returns:
        The stripped model response, or ``text`` unchanged when it is empty or
        whitespace-only, or when the translation fails (the failure is shown
        with ``st.error``).
    """
    # Nothing to translate: skip the model call entirely.
    if not text or not text.strip():
        return text

    try:
        chat = get_chat_model()
        # Make the translation prompt more specific and strict
        prompt = (
            f"Translate the following text to {target_lang}.\n"
            "Important:\n"
            "1. Only provide the translation, no explanations\n"
            "2. Maintain the exact same format and structure\n"
            "3. If it's a table, keep the table format\n"
            "4. If it's a list, keep the list format\n"
            f"5. Ensure the output is strictly in {target_lang} language\n"
            "6. Do not include any other language in the response\n"
            f"7. If the text is already in {target_lang}, return it as is\n"
            "\n"
            f"Text to translate: {text}"
        )
        response = chat.invoke([HumanMessage(content=prompt)])
        return response.content.strip()
    except ValueError as ve:
        # Raised by get_chat_model() when no API key is configured.
        st.error(str(ve))
        return text
    except Exception as e:
        st.error(f"Translation error: {str(e)}")
        return text
|
111 |
+
|
112 |
+
# Initialize session state: an empty chat history and a single blank URL slot.
for state_key, initial_value in (("messages", []), ("urls", [""])):
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value
|
117 |
+
|
118 |
+
def add_url():
    """Append one more blank URL slot to the session's URL list."""
    url_slots = st.session_state.urls
    url_slots.append("")
|
120 |
+
|
121 |
+
def remove_url(index):
    """Remove the URL slot at *index*, always keeping at least one slot.

    Out-of-range indexes are ignored. The URL text inputs are keyed by
    position (``url_0``, ``url_1``, ...), so their stored values are shifted
    together with the list; otherwise the input at ``index`` would keep the
    removed URL and write it back into the list on the next run.
    """
    urls = st.session_state.urls
    count = len(urls)
    if count <= 1 or not 0 <= index < count:  # Keep at least one URL input
        return

    # Prefer the value held by each keyed widget; fall back to the list entry.
    remaining = [st.session_state.get(f"url_{pos}", urls[pos]) for pos in range(count)]
    del remaining[index]

    st.session_state.urls = remaining
    for pos, value in enumerate(remaining):
        st.session_state[f"url_{pos}"] = value

    # The last positional key no longer has a matching input.
    stale_key = f"url_{count - 1}"
    if stale_key in st.session_state:
        del st.session_state[stale_key]
|
124 |
+
|
125 |
+
def reset_chat():
    """Drop the stored chat history and run a garbage-collection pass."""
    st.session_state["messages"] = []
    gc.collect()
|
128 |
+
|
129 |
+
def convert_to_table(data):
    """Render a dict or a list of dicts as a markdown table.

    A single dict is treated as a one-row table. Empty input, or input that is
    neither a dict nor a list, yields an empty string.
    """
    if not data or not isinstance(data, (dict, list)):
        return ""

    rows = [data] if isinstance(data, dict) else data
    frame = pd.DataFrame(rows)
    return frame.to_markdown(index=False)
|
143 |
+
|
144 |
+
def stream_text(text: str, delay: float = 0.001) -> "st.delta_generator.DeltaGenerator":
    """Stream text with a typing effect.

    Args:
        text: Markdown text to reveal one character at a time.
        delay: Seconds to pause after each character is shown.

    Returns:
        The ``st.empty()`` placeholder the text was rendered into, so the
        caller can update or clear it later.
    """
    placeholder = st.empty()
    displayed_text = ""

    for char in text:
        displayed_text += char
        # Re-render the whole prefix so markdown formatting stays intact.
        placeholder.markdown(displayed_text)
        time.sleep(delay)

    return placeholder
|
155 |
+
|
156 |
+
# Main content area: page title with a logo image on each side (raw HTML).
st.markdown(
    '<h1>'
    '<img src="https://framerusercontent.com/images/9vH8BcjXKRcC5OrSfkohhSyDgX0.png" width="60" style="vertical-align: middle;"/>'
    'Multilingual Chat via URLs '
    '<img src="https://static.vecteezy.com/system/resources/previews/036/004/783/non_2x/website-logo-searching-illustration-free-png.png" width="70" height="70" style="vertical-align: middle;"/>'
    '</h1>',
    unsafe_allow_html=True,
)
|
161 |
+
|
162 |
+
# Sidebar: API keys, output language and the list of website URLs.
with st.sidebar:
    st.header("Configuration")

    # API keys: a non-empty entry is remembered in session state.
    st.markdown("### API Keys")

    st.markdown("**SUTRA API**")
    st.markdown("Get your free API key from [SUTRA API](https://www.two.ai/sutra/api)")
    sutra_api_key = st.text_input(
        "Enter your Sutra API Key:",
        type="password",
        label_visibility="collapsed",
        value=st.session_state.get("sutra_api_key", ""),
    )
    if sutra_api_key:
        st.session_state["sutra_api_key"] = sutra_api_key

    st.markdown("**Firecrawl API**")
    st.markdown("Get your API key from [Firecrawl](https://firecrawl.dev/)")
    firecrawl_api_key = st.text_input(
        "Enter your Firecrawl API Key:",
        type="password",
        label_visibility="collapsed",
        value=st.session_state.get("firecrawl_api_key", ""),
    )
    if firecrawl_api_key:
        st.session_state["firecrawl_api_key"] = firecrawl_api_key

    # Language the final answer is translated into.
    selected_language = st.selectbox("Select output language:", languages)

    # One row per URL: a text input plus its add / remove buttons.
    st.markdown("### Website URLs")
    for position, current_url in enumerate(st.session_state.urls):
        input_col, button_col = st.columns([4, 1])

        with input_col:
            st.session_state.urls[position] = st.text_input(
                f"URL {position+1}",
                key=f"url_{position}",
                placeholder="https://example.com",
                value=current_url,
            )

        with button_col:
            url_count = len(st.session_state.urls)
            # The plus button appears only next to the last URL input.
            if position == url_count - 1:
                st.button("➕", key=f"add_{position}", on_click=add_url)
            # The remove button appears only when more than one URL exists.
            if url_count > 1:
                st.button("➖", key=f"remove_{position}", on_click=remove_url, args=(position,))
|
205 |
+
|
206 |
+
# Chat interface: replay every stored message under its role.
for message in st.session_state.messages:
    role, content = message["role"], message["content"]
    with st.chat_message(role):
        st.markdown(content)
|
210 |
|
211 |
+
def _detect_language_code(text):
    """Return the base language code detected for *text*, or None.

    Region suffixes are dropped (e.g. "zh-cn" -> "zh") so the result can be
    compared with the codes in ``language_codes``. Any detection failure, such
    as text without usable language features, yields None instead of raising.
    """
    try:
        return detect(text).split("-")[0].lower()
    except Exception:
        return None


# Handle a new chat prompt: translate it to English if needed, run the
# Firecrawl extraction on every URL, then answer in the selected language.
if prompt := st.chat_input("Ask about the website in any language..."):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        # Filter out empty URLs
        valid_urls = [url for url in st.session_state.urls if url.strip()]

        if not valid_urls:
            st.error("Please enter at least one website URL!")
        elif not st.session_state.get("sutra_api_key"):
            st.error("Please enter your SUTRA API key in the sidebar!")
        elif not st.session_state.get("firecrawl_api_key"):
            st.error("Please enter your Firecrawl API key in the sidebar!")
        else:
            try:
                with st.spinner("Processing your request..."):
                    # Get target language code
                    target_lang = language_codes[selected_language]

                    # Detect input language; an undetectable prompt (None) is
                    # sent as typed instead of aborting the whole request.
                    input_lang = _detect_language_code(prompt)

                    # Translate to English if not already in English
                    if input_lang not in (None, 'en'):
                        translated_prompt = translate_text(prompt, "en")
                    else:
                        translated_prompt = prompt

                    # Extract data from website
                    app = load_app()

                    extract_params = {
                        'prompt': translated_prompt
                    }

                    # Process all valid URLs, keeping each result paired with
                    # its own URL so a failed URL cannot shift the labels.
                    results = []
                    for url in valid_urls:
                        try:
                            # Call extract with correct parameters - URLs must be in an array
                            data = app.extract([url], extract_params)
                        except Exception as e:
                            st.warning(f"Error processing URL {url}: {str(e)}")
                            continue

                        if isinstance(data, dict) and 'data' in data:
                            results.append((url, data['data']))
                        else:
                            results.append((url, data))

                    if not results:
                        st.error("No data could be extracted from any of the provided URLs.")
                    else:
                        # Combine all data
                        if len(results) == 1:
                            response = str(results[0][1])
                        else:
                            response = "\n\n".join(
                                f"Results from {url}:\n{str(data)}" for url, data in results
                            )

                        # Always translate to selected language if not English
                        if target_lang != 'en':
                            st.info(f"Translating to {selected_language}...")
                            response = translate_text(response, target_lang)

                            # Verify the translation. The check is skipped when
                            # the language of the response cannot be detected.
                            detected_lang = _detect_language_code(response)
                            if detected_lang is not None and detected_lang != target_lang:
                                # Wrong language detected: request the translation once more
                                st.warning(f"Detected language ({detected_lang}) doesn't match target language ({target_lang}). Retrying translation...")
                                response = translate_text(response, target_lang)

                        st.markdown(response)
                        st.session_state.messages.append({"role": "assistant", "content": response})

            except Exception as e:
                st.error(f"An error occurred: {str(e)}")
                st.info("Please check your API keys and try again.")
|