Shreyas094 committed on
Commit
3ea794f
·
verified ·
1 Parent(s): 84f09dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -97
app.py CHANGED
@@ -18,10 +18,10 @@ from huggingface_hub import InferenceClient
18
  import inspect
19
  import logging
20
  import shutil
21
- import time
22
- from tenacity import retry, stop_after_attempt, wait_exponential
23
  import requests
24
- import numpy as np
 
 
25
 
26
 
27
  # Set up basic configuration for logging
@@ -70,61 +70,35 @@ def load_document(file: NamedTemporaryFile, parser: str = "llamaparse") -> List[
70
  else:
71
  raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")
72
 
73
- class HuggingFaceEmbeddings:
74
- def __init__(self, api_token):
75
  self.api_url = "https://api-inference.huggingface.co/models/dunzhang/stella_en_1.5B_v5"
76
- self.headers = {"Authorization": f"Bearer {api_token}"}
77
-
78
- @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=10))
79
- def query(self, payload):
80
- response = requests.post(self.api_url, headers=self.headers, json=payload)
81
- result = response.json()
82
- if 'error' in result and 'is currently loading' in result['error']:
83
- raise Exception("Model is still loading")
84
- return result
85
 
86
- def embed_documents(self, texts):
87
- max_chunk_length = 512
88
- chunked_texts = []
89
  for text in texts:
90
- if len(text) > max_chunk_length:
91
- chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)]
92
- chunked_texts.extend(chunks)
93
- else:
94
- chunked_texts.append(text)
95
-
96
- batch_size = 8
97
- all_embeddings = []
98
- for i in range(0, len(chunked_texts), batch_size):
99
- batch = chunked_texts[i:i+batch_size]
100
- payload = {
101
- "inputs": batch,
102
- "task": "sentence-similarity"
103
  }
104
- response = self.query(payload)
105
- if isinstance(response, list):
106
- all_embeddings.extend(response)
107
- elif isinstance(response, dict) and 'error' in response:
108
- raise ValueError(f"API Error: {response['error']}")
109
- else:
110
- raise ValueError(f"Unexpected response format: {response}")
111
-
112
- final_embeddings = []
113
- i = 0
114
- for text in texts:
115
- num_chunks = max(1, len(text) // max_chunk_length)
116
- text_embeddings = all_embeddings[i:i+num_chunks]
117
- avg_embedding = np.mean(text_embeddings, axis=0)
118
- final_embeddings.append(avg_embedding)
119
- i += num_chunks
120
-
121
- return final_embeddings
122
-
123
- def embed_query(self, text):
124
- return self.embed_documents([text])[0]
125
 
126
  def get_embeddings():
127
- return HuggingFaceEmbeddings(api_token=huggingface_token)
 
128
 
129
  # Add this at the beginning of your script, after imports
130
  DOCUMENTS_FILE = "uploaded_documents.json"
@@ -151,12 +125,7 @@ def update_vectors(files, parser):
151
  logging.warning("No files provided for update_vectors")
152
  return "Please upload at least one PDF file.", display_documents()
153
 
154
- try:
155
- embed = get_embeddings()
156
- except Exception as e:
157
- logging.error(f"Error initializing embeddings: {str(e)}")
158
- return f"Error initializing embeddings: {str(e)}", display_documents()
159
-
160
  total_chunks = 0
161
 
162
  all_data = []
@@ -187,13 +156,11 @@ def update_vectors(files, parser):
187
  try:
188
  if os.path.exists("faiss_database"):
189
  logging.info("Updating existing FAISS database")
190
- database = FAISS.load_local("faiss_database", embed.embed_query, allow_dangerous_deserialization=True)
191
- embeddings = embed.embed_documents([doc.page_content for doc in all_data])
192
- database.add_embeddings(embeddings, all_data)
193
  else:
194
  logging.info("Creating new FAISS database")
195
- embeddings = embed.embed_documents([doc.page_content for doc in all_data])
196
- database = FAISS.from_embeddings(embeddings, all_data, embed)
197
 
198
  database.save_local("faiss_database")
199
  logging.info("FAISS database saved")
@@ -483,42 +450,12 @@ After writing the document, please provide a list of sources used in your respon
483
  if not full_response:
484
  yield "I apologize, but I couldn't generate a response at this time. Please try again later."
485
 
486
- def create_web_search_vectors(search_results):
487
- embed = get_embeddings()
488
-
489
- documents = []
490
- for result in search_results:
491
- if 'body' in result:
492
- content = f"{result['title']}\n{result['body']}\nSource: {result['href']}"
493
- documents.append(Document(page_content=content, metadata={"source": result['href']}))
494
-
495
- return FAISS.from_documents(documents, embed)
496
-
497
- def create_web_search_vectors(search_results):
498
- embed = get_embeddings()
499
-
500
- documents = []
501
- for result in search_results:
502
- if 'body' in result:
503
- content = f"{result['title']}\n{result['body']}\nSource: {result['href']}"
504
- documents.append(Document(page_content=content, metadata={"source": result['href']}))
505
-
506
- return FAISS.from_documents(documents, embed)
507
-
508
  def get_response_with_search(query, model, num_calls=3, temperature=0.2):
509
  search_results = duckduckgo_search(query)
510
- web_search_database = create_web_search_vectors(search_results)
511
-
512
- if not web_search_database:
513
- yield "No web search results available. Please try again.", ""
514
- return
515
 
516
- retriever = web_search_database.as_retriever(search_kwargs={"k": 5})
517
- relevant_docs = retriever.get_relevant_documents(query)
518
-
519
- context = "\n".join([doc.page_content for doc in relevant_docs])
520
-
521
- prompt = f"""Using the following context from web search results:
522
  {context}
523
  Write a detailed and complete research document that fulfills the following user request: '{query}'
524
  After writing the document, please provide a list of sources used in your response."""
@@ -544,7 +481,6 @@ After writing the document, please provide a list of sources used in your respon
544
  main_content += chunk
545
  yield main_content, "" # Yield partial main content without sources
546
 
547
-
548
  def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=0.2):
549
  logging.info(f"Entering get_response_from_pdf with query: {query}, model: {model}, selected_docs: {selected_docs}")
550
 
 
18
  import inspect
19
  import logging
20
  import shutil
 
 
21
  import requests
22
+ from typing import List
23
+ from langchain.embeddings.base import Embeddings
24
+
25
 
26
 
27
  # Set up basic configuration for logging
 
70
  else:
71
  raise ValueError("Invalid parser specified. Use 'pypdf' or 'llamaparse'.")
72
 
73
class HuggingFaceInferenceAPIEmbeddings(Embeddings):
    """LangChain-compatible embeddings backed by the HF Inference API.

    Sends each text to the hosted ``dunzhang/stella_en_1.5B_v5`` model and
    returns its embedding vector, satisfying the ``Embeddings`` interface
    (``embed_documents`` / ``embed_query``) that FAISS expects.
    """

    def __init__(self, api_key: str):
        # Hosted inference endpoint for the embedding model.
        self.api_url = "https://api-inference.huggingface.co/models/dunzhang/stella_en_1.5B_v5"
        self.headers = {"Authorization": f"Bearer {api_key}"}

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each document text; one API call per text."""
        return [self.embed_query(text) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding vector for a single text.

        Raises:
            requests.HTTPError: if the API responds with an error status.
            ValueError: if the response body is not an embedding list.
        """
        # BUG FIX: the previous payload used the sentence-similarity input
        # shape ({"source_sentence": text, "sentences": [text]}), which makes
        # the API return similarity *scores* (a short list of floats, ~1.0
        # here since the text is compared with itself) rather than an
        # embedding vector — FAISS would then index garbage. A plain
        # feature-extraction request returns the actual embedding.
        # "wait_for_model" avoids transient 503s while the model cold-starts.
        payload = {"inputs": text, "options": {"wait_for_model": True}}
        response = requests.post(self.api_url, headers=self.headers, json=payload)
        response.raise_for_status()  # fail loudly on auth / rate-limit errors
        result = response.json()
        if isinstance(result, list) and result:
            # Some models return one row per input ([[...floats...]]); unwrap.
            return result[0] if isinstance(result[0], list) else result
        raise ValueError(f"Unexpected response format: {result}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
def get_embeddings():
    """Create the embeddings client used for FAISS indexing and search.

    Reads the API token from the ``HUGGINGFACE_TOKEN`` environment variable.

    Raises:
        ValueError: if the token is missing — fail fast with a clear message
            instead of building a client whose every request would fail with
            an opaque 401 (``Authorization: Bearer None``).
    """
    huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
    if not huggingface_token:
        raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")
    return HuggingFaceInferenceAPIEmbeddings(api_key=huggingface_token)
102
 
103
  # Add this at the beginning of your script, after imports
104
  DOCUMENTS_FILE = "uploaded_documents.json"
 
125
  logging.warning("No files provided for update_vectors")
126
  return "Please upload at least one PDF file.", display_documents()
127
 
128
+ embed = get_embeddings()
 
 
 
 
 
129
  total_chunks = 0
130
 
131
  all_data = []
 
156
  try:
157
  if os.path.exists("faiss_database"):
158
  logging.info("Updating existing FAISS database")
159
+ database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
160
+ database.add_documents(all_data)
 
161
  else:
162
  logging.info("Creating new FAISS database")
163
+ database = FAISS.from_documents(all_data, embed)
 
164
 
165
  database.save_local("faiss_database")
166
  logging.info("FAISS database saved")
 
450
  if not full_response:
451
  yield "I apologize, but I couldn't generate a response at this time. Please try again later."
452
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453
  def get_response_with_search(query, model, num_calls=3, temperature=0.2):
454
  search_results = duckduckgo_search(query)
455
+ context = "\n".join(f"{result['title']}\n{result['body']}\nSource: {result['href']}\n"
456
+ for result in search_results if 'body' in result)
 
 
 
457
 
458
+ prompt = f"""Using the following context:
 
 
 
 
 
459
  {context}
460
  Write a detailed and complete research document that fulfills the following user request: '{query}'
461
  After writing the document, please provide a list of sources used in your response."""
 
481
  main_content += chunk
482
  yield main_content, "" # Yield partial main content without sources
483
 
 
484
  def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=0.2):
485
  logging.info(f"Entering get_response_from_pdf with query: {query}, model: {model}, selected_docs: {selected_docs}")
486