Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -41,6 +41,7 @@ documents = load_docx_files_from_drive(docs_folder)
|
|
41 |
|
42 |
|
43 |
def split_extracted_text_into_chunks(documents):
|
|
|
44 |
# List to hold all chunks
|
45 |
chunks = []
|
46 |
|
@@ -73,6 +74,7 @@ chunks = split_extracted_text_into_chunks(documents)
|
|
73 |
|
74 |
|
75 |
def save_chunks_to_file(chunks, output_file_path):
|
|
|
76 |
# Open the file in write mode
|
77 |
with open(output_file_path, "w", encoding="utf-8") as file:
|
78 |
for i, chunk in enumerate(chunks, start=1):
|
@@ -100,6 +102,7 @@ embedding_model = HuggingFaceEmbeddings(
|
|
100 |
|
101 |
# Step 2: Embed the chunks (now simplified)
|
102 |
def embed_chunks(chunks):
|
|
|
103 |
return [
|
104 |
{"chunk": chunk, "embedding": embedding_model.embed_query(chunk)}
|
105 |
for chunk in chunks
|
@@ -111,6 +114,7 @@ embeddings = embed_chunks(chunks)
|
|
111 |
|
112 |
# Step 3: Prepare documents (unchanged)
|
113 |
def prepare_documents_for_chroma(embeddings):
|
|
|
114 |
return [
|
115 |
Document2(page_content=entry["chunk"], metadata={"chunk_index": i})
|
116 |
for i, entry in enumerate(embeddings, start=1)
|
|
|
41 |
|
42 |
|
43 |
def split_extracted_text_into_chunks(documents):
|
44 |
+
print("Splitting text into chunks")
|
45 |
# List to hold all chunks
|
46 |
chunks = []
|
47 |
|
|
|
74 |
|
75 |
|
76 |
def save_chunks_to_file(chunks, output_file_path):
|
77 |
+
print("Saving chunks to file")
|
78 |
# Open the file in write mode
|
79 |
with open(output_file_path, "w", encoding="utf-8") as file:
|
80 |
for i, chunk in enumerate(chunks, start=1):
|
|
|
102 |
|
103 |
# Step 2: Embed the chunks (now simplified)
|
104 |
def embed_chunks(chunks):
|
105 |
+
print("Embedding the chunks")
|
106 |
return [
|
107 |
{"chunk": chunk, "embedding": embedding_model.embed_query(chunk)}
|
108 |
for chunk in chunks
|
|
|
114 |
|
115 |
# Step 3: Prepare documents (unchanged)
|
116 |
def prepare_documents_for_chroma(embeddings):
|
117 |
+
print("Preparing documents for chroma")
|
118 |
return [
|
119 |
Document2(page_content=entry["chunk"], metadata={"chunk_index": i})
|
120 |
for i, entry in enumerate(embeddings, start=1)
|