Mojo3 committed on
Commit
2a1b8e8
·
verified ·
1 Parent(s): cc4a792

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -0
app.py CHANGED
@@ -41,6 +41,7 @@ documents = load_docx_files_from_drive(docs_folder)
41
 
42
 
43
  def split_extracted_text_into_chunks(documents):
 
44
  # List to hold all chunks
45
  chunks = []
46
 
@@ -73,6 +74,7 @@ chunks = split_extracted_text_into_chunks(documents)
73
 
74
 
75
  def save_chunks_to_file(chunks, output_file_path):
 
76
  # Open the file in write mode
77
  with open(output_file_path, "w", encoding="utf-8") as file:
78
  for i, chunk in enumerate(chunks, start=1):
@@ -100,6 +102,7 @@ embedding_model = HuggingFaceEmbeddings(
100
 
101
  # Step 2: Embed the chunks (now simplified)
102
  def embed_chunks(chunks):
 
103
  return [
104
  {"chunk": chunk, "embedding": embedding_model.embed_query(chunk)}
105
  for chunk in chunks
@@ -111,6 +114,7 @@ embeddings = embed_chunks(chunks)
111
 
112
  # Step 3: Prepare documents (unchanged)
113
  def prepare_documents_for_chroma(embeddings):
 
114
  return [
115
  Document2(page_content=entry["chunk"], metadata={"chunk_index": i})
116
  for i, entry in enumerate(embeddings, start=1)
 
41
 
42
 
43
  def split_extracted_text_into_chunks(documents):
44
+ print("Splitting text into chunks")
45
  # List to hold all chunks
46
  chunks = []
47
 
 
74
 
75
 
76
  def save_chunks_to_file(chunks, output_file_path):
77
+ print("Saving chunks to file")
78
  # Open the file in write mode
79
  with open(output_file_path, "w", encoding="utf-8") as file:
80
  for i, chunk in enumerate(chunks, start=1):
 
102
 
103
  # Step 2: Embed the chunks (now simplified)
104
  def embed_chunks(chunks):
105
+ print("Embedding the chunks")
106
  return [
107
  {"chunk": chunk, "embedding": embedding_model.embed_query(chunk)}
108
  for chunk in chunks
 
114
 
115
  # Step 3: Prepare documents (unchanged)
116
  def prepare_documents_for_chroma(embeddings):
117
+ print("Preparing documents for chroma")
118
  return [
119
  Document2(page_content=entry["chunk"], metadata={"chunk_index": i})
120
  for i, entry in enumerate(embeddings, start=1)