import streamlit as st import os import gc from firecrawl import FirecrawlApp from dotenv import load_dotenv import time import pandas as pd from typing import Dict, Any import base64 from pydantic import BaseModel, Field import inspect from langchain_openai import ChatOpenAI from langchain.schema import HumanMessage from langchain.callbacks.base import BaseCallbackHandler from langdetect import detect # Page configuration st.set_page_config( page_title="Multilingual Website Data Extractor", page_icon="🌐", layout="wide" ) # Define supported languages languages = [ "English", "Hindi", "Gujarati", "Bengali", "Tamil", "Telugu", "Kannada", "Malayalam", "Punjabi", "Marathi", "Urdu", "Assamese", "Odia", "Sanskrit", "Korean", "Japanese", "Arabic", "French", "German", "Spanish", "Portuguese", "Russian", "Chinese", "Vietnamese", "Thai", "Indonesian", "Turkish", "Polish", "Ukrainian", "Dutch", "Italian", "Greek", "Hebrew", "Persian", "Swedish", "Norwegian", "Danish", "Finnish", "Czech", "Hungarian", "Romanian", "Bulgarian", "Croatian", "Serbian", "Slovak", "Slovenian", "Estonian", "Latvian", "Lithuanian", "Malay", "Tagalog", "Swahili" ] # Language code mapping language_codes = { "English": "en", "Hindi": "hi", "Gujarati": "gu", "Bengali": "bn", "Tamil": "ta", "Telugu": "te", "Kannada": "kn", "Malayalam": "ml", "Punjabi": "pa", "Marathi": "mr", "Urdu": "ur", "Assamese": "as", "Odia": "or", "Sanskrit": "sa", "Korean": "ko", "Japanese": "ja", "Arabic": "ar", "French": "fr", "German": "de", "Spanish": "es", "Portuguese": "pt", "Russian": "ru", "Chinese": "zh", "Vietnamese": "vi", "Thai": "th", "Indonesian": "id", "Turkish": "tr", "Polish": "pl", "Ukrainian": "uk", "Dutch": "nl", "Italian": "it", "Greek": "el", "Hebrew": "he", "Persian": "fa", "Swedish": "sv", "Norwegian": "no", "Danish": "da", "Finnish": "fi", "Czech": "cs", "Hungarian": "hu", "Romanian": "ro", "Bulgarian": "bg", "Croatian": "hr", "Serbian": "sr", "Slovak": "sk", "Slovenian": "sl", "Estonian": "et", "Latvian": "lv", "Lithuanian": "lt", "Malay": "ms", "Tagalog": "tl", "Swahili": "sw" } # Streaming callback handler class StreamHandler(BaseCallbackHandler): def __init__(self, container, initial_text=""): self.container = container self.text = initial_text self.run_id_ignore_token = None def on_llm_new_token(self, token: str, **kwargs): self.text += token self.container.markdown(self.text) load_dotenv() firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") sutra_api_key = os.getenv("SUTRA_API_KEY") @st.cache_resource def load_app(): app = FirecrawlApp(api_key=firecrawl_api_key) return app # Initialize the ChatOpenAI model @st.cache_resource def get_chat_model(): if not st.session_state.get("sutra_api_key"): raise ValueError("SUTRA API key is not set. Please enter your API key in the sidebar.") return ChatOpenAI( api_key=st.session_state.sutra_api_key, base_url="https://api.two.ai/v2", model="sutra-v2", temperature=0.7, ) def translate_text(text: str, target_lang: str = "en") -> str: """Translate text to target language using Sutra model.""" try: chat = get_chat_model() # Make the translation prompt more specific and strict prompt = f"""Translate the following text to {target_lang}. Important: 1. Only provide the translation, no explanations 2. Maintain the exact same format and structure 3. If it's a table, keep the table format 4. If it's a list, keep the list format 5. Ensure the output is strictly in {target_lang} language 6. Do not include any other language in the response 7. If the text is already in {target_lang}, return it as is Text to translate: {text}""" response = chat.invoke([HumanMessage(content=prompt)]) return response.content.strip() except ValueError as ve: st.error(str(ve)) return text except Exception as e: st.error(f"Translation error: {str(e)}") return text # Initialize session state if "messages" not in st.session_state: st.session_state.messages = [] if "urls" not in st.session_state: st.session_state.urls = [""] # Initialize with one empty URL def add_url(): st.session_state.urls.append("") def remove_url(index): if len(st.session_state.urls) > 1: # Keep at least one URL input st.session_state.urls.pop(index) def reset_chat(): st.session_state.messages = [] gc.collect() def convert_to_table(data): """Convert a list of dictionaries to a simple markdown table.""" if not data: return "" if isinstance(data, dict): data = [data] elif isinstance(data, list): pass else: return "" df = pd.DataFrame(data) return df.to_markdown(index=False) def stream_text(text: str, delay: float = 0.001) -> None: """Stream text with a typing effect.""" placeholder = st.empty() displayed_text = "" for char in text: displayed_text += char placeholder.markdown(displayed_text) time.sleep(delay) return placeholder # Main content area st.markdown( f'