sreesh2804 committed
Commit 3af1614 · verified · 1 Parent(s): b4f2b93

Update app.py

Files changed (1)
1. app.py +44 -17
app.py CHANGED
@@ -67,35 +67,62 @@ def download_file(file_id, file_name):
    return file_path

# ✅ Process documents
-def process_documents(selected_files):
-    global vector_store
-    docs = []
-    for file_name in selected_files:
-        file_path = download_file(file_id_map[file_name], file_name)
+import concurrent.futures
+
+
+def load_document(file_name, file_path):
+    """Loads a document based on its file type."""
+    try:
        if file_name.endswith(".pdf"):
-            loader = PyPDFLoader(file_path)
+            return PyPDFLoader(file_path).load()
        elif file_name.endswith(".txt"):
-            loader = TextLoader(file_path)
+            return TextLoader(file_path).load()
        elif file_name.endswith(".docx"):
-            loader = Docx2txtLoader(file_path)
+            return Docx2txtLoader(file_path).load()
        else:
            logging.warning(f"⚠️ Unsupported file type: {file_name}")
-            continue
-        docs.extend(loader.load())
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+            return []
+    except Exception as e:
+        logging.error(f"❌ Error loading {file_name}: {e}")
+        return []
+
+def process_documents(selected_files):
+    global vector_store
+    docs = []
+
+    # ✅ Use parallel processing to load documents faster
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future_to_file = {executor.submit(load_document, file_name, download_file(file_id_map[file_name], file_name)): file_name for file_name in selected_files}
+        for future in concurrent.futures.as_completed(future_to_file):
+            docs.extend(future.result())
+
+    # ✅ Calculate total word count
+    total_words = sum(len(doc.page_content.split()) for doc in docs)
+
+    # ✅ Dynamically adjust chunk size for efficiency
+    if total_words < 1000:
+        chunk_size, chunk_overlap = 500, 50    # Small
+    elif total_words < 5000:
+        chunk_size, chunk_overlap = 1000, 100  # Medium
+    else:
+        chunk_size, chunk_overlap = 2000, 200  # Large
+
+    logging.info(f"📄 Document Size: {total_words} words | Chunk Size: {chunk_size}, Overlap: {chunk_overlap}")
+
+    # ✅ Efficient document splitting
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(docs)
+
+    # ✅ Store efficiently in vector database
    vector_store = Chroma.from_documents(split_docs, embeddings)
+
    return "✅ Documents processed successfully!"

+
# ✅ Query document


-import os
-import time
-import logging
-from gtts import gTTS
-from langchain.chains import RetrievalQA
-from langchain_google_genai import ChatGoogleGenerativeAI
+

# ✅ Ensure temp_file_map exists
temp_file_map = {}
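
For readers skimming the diff, the loading change is a fan-out/fan-in pattern: submit one task per file to a thread pool, then collect results as they complete. Below is a minimal, self-contained sketch of that pattern, assuming only the standard library; load_stub and the sample file names are hypothetical stand-ins for the app's download_file/load_document pair.

import concurrent.futures

def load_stub(file_name):
    # Hypothetical loader: returns a list of "documents" for one file.
    return [f"contents of {file_name}"]

def load_all(file_names):
    docs = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Map each future back to its file so failures can be attributed.
        future_to_file = {executor.submit(load_stub, name): name
                          for name in file_names}
        for future in concurrent.futures.as_completed(future_to_file):
            docs.extend(future.result())
    return docs

print(load_all(["a.pdf", "b.txt", "c.docx"]))

One design consequence of the committed version: download_file is called inside the dict comprehension, so downloads still run sequentially on the main thread, and only the parsing inside load_document is parallelized; moving the download into the submitted task would parallelize both steps. Catching exceptions inside load_document and returning [] also means future.result() never raises, so one bad file cannot abort the whole batch.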
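
The new chunking heuristic can likewise be read as a small pure function of corpus size. The thresholds below are the ones committed above; the helper name pick_chunk_params is hypothetical.

def pick_chunk_params(total_words: int) -> tuple[int, int]:
    # Thresholds match the committed heuristic: small / medium / large corpora.
    if total_words < 1000:
        return 500, 50
    elif total_words < 5000:
        return 1000, 100
    return 2000, 200

assert pick_chunk_params(800) == (500, 50)
assert pick_chunk_params(3000) == (1000, 100)
assert pick_chunk_params(12000) == (2000, 200)

Scaling chunk size with total words keeps the chunk count, and therefore the embedding and Chroma insertion cost, roughly bounded as corpora grow, at the price of coarser retrieval granularity for large document sets.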