# Repository file-viewer header (page residue, not part of the program):
# app.py - uploaded by RChaubey16, commit c22f035 (verified), "Create app.py", 2.89 kB
import hashlib
import os
import re

import chromadb
import google.generativeai as genai
import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
# --- Module-level configuration -------------------------------------------
# The Gemini API key is read from the environment rather than being embedded
# in the source: a key committed to a repository is readable by anyone with
# access to the file. NOTE: any key that was previously committed in this
# file's history should be treated as compromised and rotated.
_GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not _GOOGLE_API_KEY:
    # Fail early with an actionable message instead of a late API error.
    st.error(
        "The GOOGLE_API_KEY environment variable is not set. "
        "Set it to your Gemini API key and restart the app."
    )
    st.stop()
genai.configure(api_key=_GOOGLE_API_KEY)

# Directory in which the persistent Chroma client stores its data.
CHROMA_PATH = "chroma_db"
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
# Collection holding the scraped chunks and their embeddings.
collection = chroma_client.get_or_create_collection(name="formula_1")
# Sentence-embedding model used for both stored chunks and user queries.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
def clean_text(text):
    """Return *text* with URLs removed and whitespace normalised.

    Every token starting with ``http`` (up to the next whitespace) is
    dropped, then each run of whitespace is collapsed to a single space and
    leading/trailing whitespace is trimmed.
    """
    without_urls = re.sub(r'http\S+', '', text)
    collapsed = re.sub(r'\s+', ' ', without_urls)
    return collapsed.strip()
def split_content_into_chunks(content):
    """Split *content* into overlapping chunks.

    The text is wrapped in a single ``Document`` and passed to a
    ``RecursiveCharacterTextSplitter`` configured for 1000-character chunks
    with a 200-character overlap (lengths measured with ``len``). Returns
    the result of ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        length_function=len,
        chunk_overlap=200,
        chunk_size=1000,
    )
    wrapped = Document(page_content=content)
    return splitter.split_documents([wrapped])
def add_chunks_to_db(chunks):
    """Embed *chunks* and upsert them into the Chroma collection.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string
            (as produced by ``split_content_into_chunks``).

    Returns:
        None. Does nothing when *chunks* is empty.
    """
    # Key every chunk by a hash of its text. Positional ids ("ID0", "ID1", ...)
    # restart at zero on every call, so chunks from a newly scraped page would
    # overwrite those of earlier pages. Building a dict also drops repeated
    # chunks, keeping the ids unique within the single upsert call.
    unique_docs = {
        hashlib.sha256(chunk.page_content.encode("utf-8")).hexdigest(): chunk.page_content
        for chunk in chunks
    }
    if not unique_docs:
        # Nothing to store; avoid sending an empty batch to the collection.
        return

    ids = list(unique_docs)
    documents = list(unique_docs.values())
    # encode() returns an array by default; convert it to plain nested lists
    # explicitly (it has no documented ``convert_to_list`` option).
    embeddings = embedding_model.encode(documents).tolist()
    collection.upsert(documents=documents, ids=ids, embeddings=embeddings)
def scrape_text(url):
    """Download *url*, extract its visible text and index it in the database.

    Args:
        url: Address of the page to scrape.

    Returns:
        A human-readable status string. Failures are reported as a string
        beginning with ``"Error scraping"`` rather than raised.
    """
    try:
        # A timeout keeps the app from hanging forever on an unresponsive host;
        # timeouts are RequestException subclasses and are reported below.
        response = requests.get(url, timeout=15)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Error scraping {url}: {e}"

    soup = BeautifulSoup(response.text, 'html.parser')
    # Script/style bodies are code, not page content; drop them before
    # extracting text so they are not indexed.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    # A separator prevents words in adjacent tags from being fused together;
    # clean_text() collapses any resulting extra whitespace.
    text = clean_text(soup.get_text(separator=" "))

    chunks = split_content_into_chunks(text)
    if not chunks:
        return f"Error scraping {url}: no text content found"
    add_chunks_to_db(chunks)
    return "Scraping and processing complete. You can now ask questions!"
def ask_question(query):
    """Answer *query* using the most similar stored chunks as context.

    The query is embedded, the two nearest chunks are fetched from the
    collection, and they are appended to a fixed system prompt that is sent
    to the Gemini model together with the query.

    Args:
        query: The user's question.

    Returns:
        The text of the model's response.
    """
    # encode() returns an array by default; convert it to a plain list
    # explicitly (it has no documented ``convert_to_list`` option).
    query_embedding = embedding_model.encode(query).tolist()
    results = collection.query(query_embeddings=[query_embedding], n_results=2)

    # One query embedding was sent, so the first entry holds its matches.
    # Tolerate a missing or empty "documents" entry.
    matches = results.get("documents") or [[]]
    top_chunks = matches[0] or []
    # Join the chunks as plain text so the prompt does not contain Python
    # list syntax (brackets, quotes, escape sequences).
    context = "\n\n".join(top_chunks)

    system_prompt = """
You are a Formula 1 expert. You answer questions about Formula 1.
But you only answer based on knowledge I'm providing you. You don't use your internal
knowledge and you don't make things up.
If you don't know the answer, just say: I don't know.
""" + context
    full_prompt = system_prompt + "\nUser Query: " + query

    model = genai.GenerativeModel('gemini-2.0-flash')
    response = model.generate_content(full_prompt)
    return response.text
# --- Streamlit UI ----------------------------------------------------------
# Streamlit re-runs this script on every interaction, so anything that must
# survive a rerun (here: "a page has been scraped") lives in st.session_state.
st.title("Web Scraping & Chatbot")

url = st.text_input("Enter a URL:")
# The button is only rendered once a URL has been entered.
if url and st.button("Scrape & Process"):
    result = scrape_text(url)
    # scrape_text() reports failures as a string starting with "Error scraping".
    if result.startswith("Error scraping"):
        st.error(result)
    else:
        # Record the success; without this flag the question section below
        # is never shown.
        st.session_state.scraped = True
        st.success(result)

if st.session_state.get("scraped"):
    st.subheader("Ask a Question")
    query = st.text_input("Enter your question:")
    if query and st.button("Get Answer"):
        answer = ask_question(query)
        st.write(answer)