"""Streamlit app: scrape a web page into ChromaDB and answer questions with Gemini.

Workflow:
1. The user enters a URL; the page text is fetched, cleaned, split into
   overlapping chunks, embedded, and upserted into a persistent Chroma
   collection.
2. The user asks a question; the two most similar chunks are retrieved and
   passed to Gemini together with a restrictive system prompt.
"""

import hashlib
import os
import re

import chromadb
import google.generativeai as genai
import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Read the Gemini API key from the environment instead of embedding it in
# source; a key that has been committed to a repository must be considered
# leaked and should be rotated.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    st.error("The GOOGLE_API_KEY environment variable is not set.")
    st.stop()
genai.configure(api_key=GOOGLE_API_KEY)

CHROMA_PATH = "chroma_db"
REQUEST_TIMEOUT_SECONDS = 30
SCRAPE_SUCCESS_MESSAGE = "Scraping and processing complete. You can now ask questions!"

SYSTEM_PROMPT = (
    " You are a Formula 1 expert. You answer questions about Formula 1. "
    "But you only answer based on knowledge I'm providing you. \n"
    "You don't use your internal knowledge and you don't make things up. "
    "If you don't know the answer, just say: I don't know. "
)

chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = chroma_client.get_or_create_collection(name="formula_1")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


def clean_text(text: str) -> str:
    """Return *text* with URLs removed and all whitespace runs collapsed.

    Anything starting with ``http`` up to the next whitespace is dropped, then
    every run of whitespace becomes a single space and the ends are stripped.
    """
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def split_content_into_chunks(content: str) -> list[Document]:
    """Split *content* into ~1000-character chunks with 200 characters of overlap."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return text_splitter.split_documents([Document(page_content=content)])


def add_chunks_to_db(chunks: list[Document], id_prefix: str = "ID") -> None:
    """Embed *chunks* and upsert them into the Chroma collection.

    Args:
        chunks: Documents whose ``page_content`` is embedded and stored.
        id_prefix: Prefix for the generated ids (``f"{id_prefix}{index}"``).
            The default reproduces the historical ``ID0``, ``ID1``, ... ids;
            pass a per-source prefix so different sources do not overwrite
            each other's chunks.
    """
    if not chunks:
        # Nothing to embed; avoid calling upsert with empty lists.
        return

    documents = [chunk.page_content for chunk in chunks]
    ids = [f"{id_prefix}{index}" for index in range(len(documents))]
    # encode() returns a NumPy array by default; convert it to plain lists
    # (encode() has no documented ``convert_to_list`` option).
    embeddings = embedding_model.encode(documents).tolist()
    collection.upsert(documents=documents, ids=ids, embeddings=embeddings)


def scrape_text(url: str) -> str:
    """Fetch *url*, index its text in the vector store, and return a status message.

    Returns:
        ``SCRAPE_SUCCESS_MESSAGE`` on success, otherwise a human-readable
        error/notice string. Request failures are reported, not raised.
    """
    try:
        # A timeout keeps an unresponsive server from hanging the app.
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Error scraping {url}: {e}"

    soup = BeautifulSoup(response.text, 'html.parser')
    text = clean_text(soup.get_text())
    chunks = split_content_into_chunks(text)
    if not chunks:
        return f"No text content found at {url}."

    # Derive ids from the URL so that scraping a second page does not
    # overwrite the first page's chunks, while re-scraping the same page
    # updates its own entries.
    # NOTE: entries stored earlier under the plain "ID<n>" ids are left as-is.
    url_digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:12]
    add_chunks_to_db(chunks, id_prefix=f"{url_digest}-")
    return SCRAPE_SUCCESS_MESSAGE


def ask_question(query: str) -> str:
    """Answer *query* with Gemini, grounded on the two most similar stored chunks."""
    # A single string encodes to a 1-D vector; Chroma expects a list of vectors.
    query_embedding = embedding_model.encode(query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=2)
    top_chunks = results.get("documents", [[]])[0]

    full_prompt = SYSTEM_PROMPT + str(top_chunks) + "\nUser Query: " + query
    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content(full_prompt)
    return response.text


st.title("Web Scraping & Chatbot")

url = st.text_input("Enter a URL:")
if url and st.button("Scrape & Process"):
    result = scrape_text(url)
    if result == SCRAPE_SUCCESS_MESSAGE:
        # Remember across Streamlit reruns that there is content to query;
        # without this flag the question section below never appears.
        st.session_state.scraped = True
        st.success(result)
    else:
        st.error(result)

if st.session_state.get("scraped"):
    st.subheader("Ask a Question")
    query = st.text_input("Enter your question:")
    if query and st.button("Get Answer"):
        st.write(ask_question(query))