Update app.py
app.py (CHANGED)
@@ -66,6 +66,9 @@ def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[
     else:
         raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")
 
+import requests
+import numpy as np
+
 class HuggingFaceEmbeddings:
     def __init__(self, api_token):
         self.api_url = "https://api-inference.huggingface.co/models/dunzhang/stella_en_1.5B_v5"
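The `query` helper that the new code calls is unchanged by this commit, so the next hunk only shows its final `return response.json()` line as context. For orientation, a minimal sketch of what such a helper typically looks like against the HF Inference API, assuming `__init__` also stores the token as `self.api_token` (that attribute is an assumption, not shown in the diff):

```python
import requests

# Hypothetical sketch of the unchanged query() helper; assumes __init__
# stores the token as self.api_token. The diff context below confirms only
# that the real method ends with `return response.json()`.
def query(self, payload):
    headers = {"Authorization": f"Bearer {self.api_token}"}
    response = requests.post(self.api_url, headers=headers, json=payload)
    return response.json()
```

Also worth noting: the new `import requests` and `import numpy as np` land mid-module here; Python accepts this, though they would conventionally sit at the top of app.py.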
@@ -76,12 +79,44 @@ class HuggingFaceEmbeddings:
         return response.json()
 
     def embed_documents(self, texts):
-
-
-
-
-
-
+        # Split long texts into smaller chunks
+        max_chunk_length = 512  # Adjust this value based on the model's requirements
+        chunked_texts = []
+        for text in texts:
+            if len(text) > max_chunk_length:
+                chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)]
+                chunked_texts.extend(chunks)
+            else:
+                chunked_texts.append(text)
+
+        # Process chunks in batches
+        batch_size = 8  # Adjust this value based on API limits and performance
+        all_embeddings = []
+        for i in range(0, len(chunked_texts), batch_size):
+            batch = chunked_texts[i:i+batch_size]
+            payload = {
+                "inputs": batch,
+                "task": "sentence-similarity"  # Specify the task
+            }
+            response = self.query(payload)
+            if isinstance(response, list):
+                all_embeddings.extend(response)
+            elif isinstance(response, dict) and 'error' in response:
+                raise ValueError(f"API Error: {response['error']}")
+            else:
+                raise ValueError(f"Unexpected response format: {response}")
+
+        # Average embeddings for chunks of the same original text
+        final_embeddings = []
+        i = 0
+        for text in texts:
+            num_chunks = max(1, len(text) // max_chunk_length)
+            text_embeddings = all_embeddings[i:i+num_chunks]
+            avg_embedding = np.mean(text_embeddings, axis=0)
+            final_embeddings.append(avg_embedding)
+            i += num_chunks
+
+        return final_embeddings
 
     def embed_query(self, text):
         return self.embed_documents([text])[0]
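One caveat in the added `embed_documents`: the splitting loop produces a ceiling number of chunks per text (`range(0, len(text), max_chunk_length)`), while the averaging loop recounts them with floor division (`len(text) // max_chunk_length`), so the two disagree for any text whose length is not an exact multiple of 512. A 600-character text yields two chunks but is averaged over one, which misaligns every text after it. A minimal sketch of a ceiling-based count that would keep the two loops in step:

```python
import math

# Sketch: count chunks exactly as the splitting loop produces them, so the
# averaging loop consumes precisely the embeddings each text generated.
def num_chunks_for(text, max_chunk_length=512):
    return max(1, math.ceil(len(text) / max_chunk_length))
```

With that in place, `num_chunks = num_chunks_for(text, max_chunk_length)` replaces the floor-division line, and `embed_query` then returns the averaged vector for its single input.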
@@ -114,7 +149,12 @@ def update_vectors(files, parser):
         logging.warning("No files provided for update_vectors")
         return "Please upload at least one PDF file.", display_documents()
 
-
+    try:
+        embed = get_embeddings()
+    except Exception as e:
+        logging.error(f"Error initializing embeddings: {str(e)}")
+        return f"Error initializing embeddings: {str(e)}", display_documents()
+
     total_chunks = 0
 
     all_data = []
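`get_embeddings()` is not part of this diff, so its body is unknown; presumably it constructs the `HuggingFaceEmbeddings` wrapper from a configured token. A purely hypothetical sketch, assuming the token comes from an environment variable named `HUGGINGFACE_TOKEN`:

```python
import os

# Hypothetical factory; the real get_embeddings() is defined elsewhere in
# app.py. Assumes the API token is provided via an environment variable.
def get_embeddings():
    token = os.environ.get("HUGGINGFACE_TOKEN")
    if not token:
        raise ValueError("HUGGINGFACE_TOKEN is not set")
    return HuggingFaceEmbeddings(api_token=token)
```

Wrapping the call in try/except, as the commit does, turns a bad or missing token into a user-visible message rather than an unhandled crash.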
@@ -163,7 +203,6 @@ def update_vectors(files, parser):
     save_documents(uploaded_documents)
 
     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()
-
 def delete_documents(selected_docs):
     global uploaded_documents
 
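For a quick smoke test of the wrapper outside the app, something along these lines should work once the commit is applied (the token value here is a placeholder, and the response shape depends on the Inference API):

```python
# Placeholder token; substitute a real Hugging Face API token.
embeddings = HuggingFaceEmbeddings(api_token="hf_xxx")

# Embeds one short query; internally this calls embed_documents([text])[0].
vector = embeddings.embed_query("What does the uploaded PDF say about pricing?")
print(len(vector))
```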