Spaces:

sreesh2804
/

Doc_Chatbot

Sleeping

App Files Community

sreesh2804 committed on Apr 1

Commit

3af1614

verified ·

1 Parent(s): b4f2b93

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -17

app.py CHANGED Viewed

@@ -67,35 +67,62 @@ def download_file(file_id, file_name):
     return file_path
 # ✅ Process documents
-def process_documents(selected_files):
-    global vector_store
-    docs = []
-    for file_name in selected_files:
-        file_path = download_file(file_id_map[file_name], file_name)
         if file_name.endswith(".pdf"):
-            loader = PyPDFLoader(file_path)
         elif file_name.endswith(".txt"):
-            loader = TextLoader(file_path)
         elif file_name.endswith(".docx"):
-            loader = Docx2txtLoader(file_path)
         else:
             logging.warning(f"⚠️ Unsupported file type: {file_name}")
-            continue
-        docs.extend(loader.load())
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     split_docs = text_splitter.split_documents(docs)
     vector_store = Chroma.from_documents(split_docs, embeddings)
     return "✅ Documents processed successfully!"
 # ✅ Query document
-import os
-import time
-import logging
-from gtts import gTTS
-from langchain.chains import RetrievalQA
-from langchain_google_genai import ChatGoogleGenerativeAI
 # ✅ Ensure temp_file_map exists
 temp_file_map = {}

     return file_path
 # ✅ Process documents
+import concurrent.futures
+def load_document(file_name, file_path):
+    """Loads a document based on its file type."""
+    try:
         if file_name.endswith(".pdf"):
+            return PyPDFLoader(file_path).load()
         elif file_name.endswith(".txt"):
+            return TextLoader(file_path).load()
         elif file_name.endswith(".docx"):
+            return Docx2txtLoader(file_path).load()
         else:
             logging.warning(f"⚠️ Unsupported file type: {file_name}")
+            return []
+    except Exception as e:
+        logging.error(f"❌ Error loading {file_name}: {e}")
+        return []
+def process_documents(selected_files):
+    global vector_store
+    docs = []
+    # ✅ Use parallel processing to load documents faster
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future_to_file = {executor.submit(load_document, file_name, download_file(file_id_map[file_name], file_name)): file_name for file_name in selected_files}
+        for future in concurrent.futures.as_completed(future_to_file):
+            docs.extend(future.result())
+    # ✅ Calculate total word count
+    total_words = sum(len(doc.page_content.split()) for doc in docs)
+    # ✅ Dynamically adjust chunk size for efficiency
+    if total_words < 1000:
+        chunk_size, chunk_overlap = 500, 50   # Small
+    elif total_words < 5000:
+        chunk_size, chunk_overlap = 1000, 100  # Medium
+    else:
+        chunk_size, chunk_overlap = 2000, 200  # Large
+    logging.info(f"📄 Document Size: {total_words} words | Chunk Size: {chunk_size}, Overlap: {chunk_overlap}")
+    # ✅ Efficient document splitting
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
     split_docs = text_splitter.split_documents(docs)
+    # ✅ Store efficiently in vector database
     vector_store = Chroma.from_documents(split_docs, embeddings)
     return "✅ Documents processed successfully!"
 # ✅ Query document
 # ✅ Ensure temp_file_map exists
 temp_file_map = {}