Shreyas094 committed on
Commit
ac831ca
·
verified ·
1 Parent(s): 06a7cda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -10
app.py CHANGED
@@ -18,6 +18,10 @@ from huggingface_hub import InferenceClient
18
  import inspect
19
  import logging
20
  import shutil
 
 
 
 
21
 
22
 
23
  # Set up basic configuration for logging
@@ -66,21 +70,21 @@ def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[
66
  else:
67
  raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")
68
 
69
- import requests
70
- import numpy as np
71
-
72
  class HuggingFaceEmbeddings:
73
  def __init__(self, api_token):
74
  self.api_url = "https://api-inference.huggingface.co/models/dunzhang/stella_en_1.5B_v5"
75
  self.headers = {"Authorization": f"Bearer {api_token}"}
76
 
 
77
  def query(self, payload):
78
  response = requests.post(self.api_url, headers=self.headers, json=payload)
79
- return response.json()
 
 
 
80
 
81
  def embed_documents(self, texts):
82
- # Split long texts into smaller chunks
83
- max_chunk_length = 512 # Adjust this value based on the model's requirements
84
  chunked_texts = []
85
  for text in texts:
86
  if len(text) > max_chunk_length:
@@ -89,14 +93,13 @@ class HuggingFaceEmbeddings:
89
  else:
90
  chunked_texts.append(text)
91
 
92
- # Process chunks in batches
93
- batch_size = 8 # Adjust this value based on API limits and performance
94
  all_embeddings = []
95
  for i in range(0, len(chunked_texts), batch_size):
96
  batch = chunked_texts[i:i+batch_size]
97
  payload = {
98
  "inputs": batch,
99
- "task": "sentence-similarity" # Specify the task
100
  }
101
  response = self.query(payload)
102
  if isinstance(response, list):
@@ -106,7 +109,6 @@ class HuggingFaceEmbeddings:
106
  else:
107
  raise ValueError(f"Unexpected response format: {response}")
108
 
109
- # Average embeddings for chunks of the same original text
110
  final_embeddings = []
111
  i = 0
112
  for text in texts:
@@ -203,6 +205,7 @@ def update_vectors(files, parser):
203
  save_documents(uploaded_documents)
204
 
205
  return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()
 
206
  def delete_documents(selected_docs):
207
  global uploaded_documents
208
 
 
18
  import inspect
19
  import logging
20
  import shutil
21
+ import time
22
+ from tenacity import retry, stop_after_attempt, wait_exponential
23
+ import requests
24
+ import numpy as np
25
 
26
 
27
  # Set up basic configuration for logging
 
70
  else:
71
  raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")
72
 
 
 
 
73
  class HuggingFaceEmbeddings:
74
  def __init__(self, api_token):
75
  self.api_url = "https://api-inference.huggingface.co/models/dunzhang/stella_en_1.5B_v5"
76
  self.headers = {"Authorization": f"Bearer {api_token}"}
77
 
78
+ @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
79
  def query(self, payload):
80
  response = requests.post(self.api_url, headers=self.headers, json=payload)
81
+ result = response.json()
82
+ if 'error' in result and 'is currently loading' in result['error']:
83
+ raise Exception("Model is still loading")
84
+ return result
85
 
86
  def embed_documents(self, texts):
87
+ max_chunk_length = 512
 
88
  chunked_texts = []
89
  for text in texts:
90
  if len(text) > max_chunk_length:
 
93
  else:
94
  chunked_texts.append(text)
95
 
96
+ batch_size = 8
 
97
  all_embeddings = []
98
  for i in range(0, len(chunked_texts), batch_size):
99
  batch = chunked_texts[i:i+batch_size]
100
  payload = {
101
  "inputs": batch,
102
+ "task": "sentence-similarity"
103
  }
104
  response = self.query(payload)
105
  if isinstance(response, list):
 
109
  else:
110
  raise ValueError(f"Unexpected response format: {response}")
111
 
 
112
  final_embeddings = []
113
  i = 0
114
  for text in texts:
 
205
  save_documents(uploaded_documents)
206
 
207
  return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()
208
+
209
  def delete_documents(selected_docs):
210
  global uploaded_documents
211