Shreyas094 committed on
Commit
06a7cda
·
verified ·
1 Parent(s): 1f08962

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -8
app.py CHANGED
@@ -66,6 +66,9 @@ def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[
66
  else:
67
  raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")
68
 
 
 
 
69
  class HuggingFaceEmbeddings:
70
  def __init__(self, api_token):
71
  self.api_url = "https://api-inference.huggingface.co/models/dunzhang/stella_en_1.5B_v5"
@@ -76,12 +79,44 @@ class HuggingFaceEmbeddings:
76
  return response.json()
77
 
78
  def embed_documents(self, texts):
79
- payload = {"inputs": texts}
80
- response = self.query(payload)
81
- if isinstance(response, list):
82
- return [np.array(embedding) for embedding in response]
83
- else:
84
- raise ValueError(f"Unexpected response format: {response}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  def embed_query(self, text):
87
  return self.embed_documents([text])[0]
@@ -114,7 +149,12 @@ def update_vectors(files, parser):
114
  logging.warning("No files provided for update_vectors")
115
  return "Please upload at least one PDF file.", display_documents()
116
 
117
- embed = get_embeddings()
 
 
 
 
 
118
  total_chunks = 0
119
 
120
  all_data = []
@@ -163,7 +203,6 @@ def update_vectors(files, parser):
163
  save_documents(uploaded_documents)
164
 
165
  return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()
166
-
167
  def delete_documents(selected_docs):
168
  global uploaded_documents
169
 
 
66
  else:
67
  raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")
68
 
69
+ import requests
70
+ import numpy as np
71
+
72
  class HuggingFaceEmbeddings:
73
  def __init__(self, api_token):
74
  self.api_url = "https://api-inference.huggingface.co/models/dunzhang/stella_en_1.5B_v5"
 
79
  return response.json()
80
 
81
  def embed_documents(self, texts):
82
+ # Split long texts into smaller chunks
83
+ max_chunk_length = 512 # Adjust this value based on the model's requirements
84
+ chunked_texts = []
85
+ for text in texts:
86
+ if len(text) > max_chunk_length:
87
+ chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)]
88
+ chunked_texts.extend(chunks)
89
+ else:
90
+ chunked_texts.append(text)
91
+
92
+ # Process chunks in batches
93
+ batch_size = 8 # Adjust this value based on API limits and performance
94
+ all_embeddings = []
95
+ for i in range(0, len(chunked_texts), batch_size):
96
+ batch = chunked_texts[i:i+batch_size]
97
+ payload = {
98
+ "inputs": batch,
99
+ "task": "sentence-similarity" # Specify the task
100
+ }
101
+ response = self.query(payload)
102
+ if isinstance(response, list):
103
+ all_embeddings.extend(response)
104
+ elif isinstance(response, dict) and 'error' in response:
105
+ raise ValueError(f"API Error: {response['error']}")
106
+ else:
107
+ raise ValueError(f"Unexpected response format: {response}")
108
+
109
+ # Average embeddings for chunks of the same original text
110
+ final_embeddings = []
111
+ i = 0
112
+ for text in texts:
113
+ num_chunks = max(1, len(text) // max_chunk_length)
114
+ text_embeddings = all_embeddings[i:i+num_chunks]
115
+ avg_embedding = np.mean(text_embeddings, axis=0)
116
+ final_embeddings.append(avg_embedding)
117
+ i += num_chunks
118
+
119
+ return final_embeddings
120
 
121
  def embed_query(self, text):
122
  return self.embed_documents([text])[0]
 
149
  logging.warning("No files provided for update_vectors")
150
  return "Please upload at least one PDF file.", display_documents()
151
 
152
+ try:
153
+ embed = get_embeddings()
154
+ except Exception as e:
155
+ logging.error(f"Error initializing embeddings: {str(e)}")
156
+ return f"Error initializing embeddings: {str(e)}", display_documents()
157
+
158
  total_chunks = 0
159
 
160
  all_data = []
 
203
  save_documents(uploaded_documents)
204
 
205
  return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()
 
206
  def delete_documents(selected_docs):
207
  global uploaded_documents
208