Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -67,35 +67,62 @@ def download_file(file_id, file_name):
|
|
67 |
return file_path
|
68 |
|
69 |
# ✅ Process documents
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
75 |
if file_name.endswith(".pdf"):
|
76 |
-
|
77 |
elif file_name.endswith(".txt"):
|
78 |
-
|
79 |
elif file_name.endswith(".docx"):
|
80 |
-
|
81 |
else:
|
82 |
logging.warning(f"⚠️ Unsupported file type: {file_name}")
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
split_docs = text_splitter.split_documents(docs)
|
|
|
|
|
87 |
vector_store = Chroma.from_documents(split_docs, embeddings)
|
|
|
88 |
return "✅ Documents processed successfully!"
|
89 |
|
|
|
90 |
# ✅ Query document
|
91 |
|
92 |
|
93 |
-
|
94 |
-
import time
|
95 |
-
import logging
|
96 |
-
from gtts import gTTS
|
97 |
-
from langchain.chains import RetrievalQA
|
98 |
-
from langchain_google_genai import ChatGoogleGenerativeAI
|
99 |
|
100 |
# ✅ Ensure temp_file_map exists
|
101 |
temp_file_map = {}
|
|
|
67 |
return file_path
|
68 |
|
69 |
# ✅ Process documents
|
70 |
+
import concurrent.futures
|
71 |
+
|
72 |
+
|
73 |
+
def load_document(file_name: str, file_path: str) -> list:
    """Load one file with the loader that matches its extension.

    Args:
        file_name: Name used only to choose the loader. The extension is
            compared case-insensitively, so "REPORT.PDF" is treated like
            "report.pdf".
        file_path: Location handed to the selected loader.

    Returns:
        Whatever the selected loader's ``load()`` returns, or an empty list
        when the extension is unsupported or anything raises while loading.
    """
    try:
        # Normalise once so upper-/mixed-case extensions are not rejected.
        # Kept inside the try so a non-string name is logged, not raised.
        lowered = file_name.lower()
        if lowered.endswith(".pdf"):
            return PyPDFLoader(file_path).load()
        if lowered.endswith(".txt"):
            return TextLoader(file_path).load()
        if lowered.endswith(".docx"):
            return Docx2txtLoader(file_path).load()
        logging.warning(f"⚠️ Unsupported file type: {file_name}")
        return []
    except Exception as e:
        # Deliberate best-effort boundary: the caller gets [] for a file that
        # cannot be read instead of an exception.
        logging.error(f"❌ Error loading {file_name}: {e}")
        return []
|
88 |
+
|
89 |
+
def process_documents(selected_files) -> str:
    """Download, load, split and index the selected files.

    Each name in ``selected_files`` is looked up in ``file_id_map``, fetched
    with ``download_file`` and parsed with ``load_document``. The loaded
    documents are split with a chunk size chosen from their total word count
    and stored in the module-level ``vector_store``.

    Args:
        selected_files: Iterable of file names that are keys of
            ``file_id_map``. ``None`` is treated as an empty selection.

    Returns:
        A status message: success once the vector store has been rebuilt, or
        a warning when no document content could be loaded (in which case the
        existing ``vector_store`` is left untouched).

    Raises:
        Whatever ``file_id_map[...]`` or ``download_file`` raise; only
        ``load_document`` failures are absorbed (it returns an empty list).
    """
    global vector_store

    names = list(selected_files or [])
    docs = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # NOTE(review): download_file is evaluated here, on the calling
        # thread, while the arguments are built; only load_document runs on
        # the pool. Moving the download into the workers would need
        # download_file to be thread-safe — confirm before changing.
        futures = [
            executor.submit(
                load_document, name, download_file(file_id_map[name], name)
            )
            for name in names
        ]
        # Collect in submission order (not completion order) so the resulting
        # chunk order is the same on every run.
        for future in futures:
            docs.extend(future.result())

    if not docs:
        # Nothing was loaded: do not replace a usable vector_store with one
        # built from an empty list, and do not report success.
        logging.warning("⚠️ No documents could be loaded; vector store left unchanged.")
        return "⚠️ No documents could be loaded."

    total_words = sum(len(doc.page_content.split()) for doc in docs)
    chunk_size, chunk_overlap = _choose_chunk_params(total_words)

    logging.info(f"📄 Document Size: {total_words} words | Chunk Size: {chunk_size}, Overlap: {chunk_overlap}")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(docs)

    vector_store = Chroma.from_documents(split_docs, embeddings)

    return "✅ Documents processed successfully!"


def _choose_chunk_params(total_words: int) -> tuple:
    """Return ``(chunk_size, chunk_overlap)`` for the given total word count."""
    if total_words < 1000:
        return 500, 50  # Small
    if total_words < 5000:
        return 1000, 100  # Medium
    return 2000, 200  # Large
|
120 |
|
121 |
+
|
122 |
# ✅ Query document
|
123 |
|
124 |
|
125 |
+
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
# ✅ Ensure temp_file_map exists
|
128 |
temp_file_map = {}
|