david committed on
Commit 84f799d · 0 Parent(s):

Changed to support multi-level crawling

Files changed (5)
  1. .gitattributes +35 -0
  2. .gitignore +1 -0
  3. README.md +13 -0
  4. app.py +131 -0
  5. requirements.txt +7 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ myenv/
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Web Scraper & Q&A Chatbot with RAG
+ emoji: 🏃
+ colorFrom: blue
+ colorTo: yellow
+ sdk: streamlit
+ sdk_version: 1.43.1
+ app_file: app.py
+ pinned: false
+ short_description: Scrape, store, and query web data using RAG and AI chat.
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,131 @@
+ import streamlit as st
+ import requests
+ import re
+ import hashlib
+ import urllib.parse
+ from bs4 import BeautifulSoup
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain.docstore.document import Document
+ import chromadb
+ from sentence_transformers import SentenceTransformer
+ import google.generativeai as genai
+
+ # Page configuration
+ st.set_page_config(layout="wide")
+
+ # Initialize Gemini API
+ genai.configure(api_key="AIzaSyAxUd2tS-qj9C7frYuHRsv92tziXHgIvLo")
+
+ # Initialize ChromaDB
+ CHROMA_PATH = "chroma_db"
+ chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
+
+ # Initialize session state
+ if 'scraped' not in st.session_state:
+     st.session_state.scraped = False
+ if 'collection_name' not in st.session_state:
+     st.session_state.collection_name = "default_collection"
+ if 'chat_history' not in st.session_state:
+     st.session_state.chat_history = []
+
+ # Initialize embedding model
+ embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+ def clean_text(text):
+     # Remove URLs and collapse whitespace
+     return re.sub(r'\s+', ' ', re.sub(r'http\S+', '', text)).strip()
+
+ def split_content_into_chunks(content):
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
+     return text_splitter.split_documents([Document(page_content=content)])
+
+ def add_chunks_to_db(chunks, collection_name):
+     collection = chroma_client.get_or_create_collection(name=collection_name)
+     documents = [chunk.page_content for chunk in chunks]
+     embeddings = embedding_model.encode(documents, convert_to_list=True)
+     # Derive IDs from chunk content so chunks from different pages don't overwrite each other
+     ids = [hashlib.md5(doc.encode("utf-8")).hexdigest() for doc in documents]
+     collection.upsert(documents=documents, ids=ids, embeddings=embeddings)
+
+ def scrape_text(url, max_depth=1, same_domain=True):
+     visited = set()
+     base_domain = urllib.parse.urlparse(url).netloc
+
+     def _scrape(u, depth):
+         if depth > max_depth or u in visited:
+             return
+         visited.add(u)
+         try:
+             response = requests.get(u)
+             response.raise_for_status()
+             soup = BeautifulSoup(response.text, 'html.parser')
+
+             text = clean_text(soup.get_text())
+             chunks = split_content_into_chunks(text)
+             add_chunks_to_db(chunks, st.session_state.collection_name)
+
+             # Recursively scrape newly found links
+             if depth < max_depth:
+                 for link in soup.find_all('a', href=True):
+                     next_url = urllib.parse.urljoin(u, link['href'])
+                     next_domain = urllib.parse.urlparse(next_url).netloc
+                     if same_domain and next_domain != base_domain:
+                         continue
+                     if next_url.startswith('mailto:') or next_url.startswith('javascript:'):
+                         continue
+                     _scrape(next_url, depth + 1)
+         except requests.exceptions.RequestException:
+             pass  # Ignore errors from individual pages
+
+     _scrape(url, 1)
+     st.session_state.scraped = True
+     return "Scraping and processing complete. You can now ask questions!"
+
+ def ask_question(query, collection_name):
+     collection = chroma_client.get_or_create_collection(name=collection_name)
+     query_embedding = embedding_model.encode(query, convert_to_list=True)
+     results = collection.query(query_embeddings=[query_embedding], n_results=2)
+     top_chunks = results.get("documents", [[]])[0]
+
+     system_prompt = f"""
+     You are a helpful assistant. Answer only from the provided context.
+     If you lack information, say: "I don't have enough information to answer that question."
+     Context:
+     {str(top_chunks)}
+     """
+
+     model = genai.GenerativeModel('gemini-2.0-flash')
+     response = model.generate_content(system_prompt + "\nUser Query: " + query)
+     return response.text
+
+ # Sidebar
+ with st.sidebar:
+     st.header("Database Management")
+     if st.button("Clear Chat History"):
+         st.session_state.chat_history = []
+         st.rerun()
+
+     st.header("Step 1: Scrape a Website")
+     url = st.text_input("Enter URL:")
+     max_depth = st.selectbox("Recursion Depth (levels)", options=[1, 2, 3, 4, 5], index=0, help="How many levels to crawl recursively; defaults to 1")
+     same_domain = st.checkbox("Only follow links on the same domain", value=True, help="By default, only links within the starting domain are crawled")
+     if url and st.button("Scrape & Process"):
+         with st.spinner("Scraping..."):
+             st.success(scrape_text(url, max_depth=max_depth, same_domain=same_domain))
+
+ # Main content
+ st.title("Web Scraper & Q&A Chatbot")
+ if st.session_state.scraped:
+     st.subheader("Step 2: Ask Questions")
+     for message in st.session_state.chat_history:
+         with st.chat_message(message["role"]):
+             st.write(message["content"])
+
+     user_query = st.chat_input("Ask your question here")
+     if user_query:
+         st.session_state.chat_history.append({"role": "user", "content": user_query})
+         with st.spinner("Searching..."):
+             answer = ask_question(user_query, st.session_state.collection_name)
+         st.session_state.chat_history.append({"role": "assistant", "content": answer})
+
+         # Limit chat history to 6 messages
+         st.session_state.chat_history = st.session_state.chat_history[-6:]
+         st.rerun()
+ else:
+     st.info("Please scrape a website first.")
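The multi-level crawl introduced by this commit can be exercised outside Streamlit. Below is a minimal, standalone sketch of the same logic (depth-capped recursion, same-domain filter, mailto/javascript links skipped); it only needs requests and beautifulsoup4 and returns the collected page text instead of writing to ChromaDB. The function and variable names are illustrative, not part of the commit.

```python
# Minimal sketch of the multi-level crawl (illustrative, not the app's code).
import urllib.parse
import requests
from bs4 import BeautifulSoup

def crawl(url, max_depth=2, same_domain=True):
    visited, pages = set(), {}
    base_domain = urllib.parse.urlparse(url).netloc

    def _crawl(u, depth):
        if depth > max_depth or u in visited:
            return
        visited.add(u)
        try:
            resp = requests.get(u, timeout=10)
            resp.raise_for_status()
        except requests.exceptions.RequestException:
            return  # skip pages that fail, keep crawling the rest
        soup = BeautifulSoup(resp.text, "html.parser")
        pages[u] = soup.get_text(" ", strip=True)
        if depth < max_depth:
            for link in soup.find_all("a", href=True):
                next_url = urllib.parse.urljoin(u, link["href"])
                if next_url.startswith(("mailto:", "javascript:")):
                    continue
                if same_domain and urllib.parse.urlparse(next_url).netloc != base_domain:
                    continue
                _crawl(next_url, depth + 1)

    _crawl(url, 1)
    return pages

# e.g. pages = crawl("https://example.com", max_depth=2)
```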
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ requests
+ beautifulsoup4
+ langchain
+ chromadb
+ sentence-transformers
+ google-generativeai
+ streamlit
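The dependencies above cover the whole pipeline: scraping, chunking, embedding, vector storage, and the Gemini chat. As a rough sketch of the retrieval step the app performs at question time, assuming a chroma_db directory already populated by the scraper (the names mirror the app's defaults; the query string is hypothetical):

```python
# Sketch of the retrieval step only (assumes the scraper has already run).
import chromadb
from sentence_transformers import SentenceTransformer

client = chromadb.PersistentClient(path="chroma_db")
collection = client.get_or_create_collection(name="default_collection")
model = SentenceTransformer("all-MiniLM-L6-v2")

query = "What is this site about?"  # hypothetical question
query_embedding = model.encode(query).tolist()
results = collection.query(query_embeddings=[query_embedding], n_results=2)
top_chunks = results.get("documents", [[]])[0]
print(top_chunks)  # these chunks become the context passed to Gemini
```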