That1BrainCell committed on
Commit
a5c315b
·
verified ·
1 Parent(s): cde83bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -96
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import streamlit as st
2
  import concurrent.futures
3
- from concurrent.futures import ThreadPoolExecutor,as_completed
4
  from functools import partial
5
  import numpy as np
6
  from io import StringIO
@@ -16,25 +16,29 @@ from io import BytesIO
16
  from PyPDF2 import PdfReader
17
  import hashlib
18
  import os
 
 
 
 
 
19
 
20
  # File Imports
21
- from embedding import get_embeddings,get_image_embeddings,get_embed_chroma,imporve_text # Ensure this file/module is available
22
  from preprocess import filtering # Ensure this file/module is available
 
23
  from search import *
24
 
25
 
26
  # Chroma Connections
27
- client = chromadb.PersistentClient(path = "/home/user/embeddings")
28
-
29
- for item in client.list_collections():
30
- print(item)
31
- collection = client.get_or_create_collection(name="data",metadata={"hnsw:space": "l2"})
32
 
33
 
34
 
35
  def generate_hash(content):
36
  return hashlib.sha256(content.encode('utf-8')).hexdigest()
37
 
 
38
  def get_key(link):
39
  text = ''
40
  try:
@@ -48,11 +52,10 @@ def get_key(link):
48
  # Load the PDF file
49
  reader = PdfReader(pdf_file)
50
  num_pages = len(reader.pages)
51
-
52
  first_page_text = reader.pages[0].extract_text()
53
  if first_page_text:
54
  text += first_page_text
55
-
56
 
57
  last_page_text = reader.pages[-1].extract_text()
58
  if last_page_text:
@@ -62,43 +65,44 @@ def get_key(link):
62
  print(f'HTTP error occurred: {e}')
63
  except Exception as e:
64
  print(f'An error occurred: {e}')
65
-
66
  unique_key = generate_hash(text)
67
-
68
  return unique_key
69
 
 
70
  # Cosine Similarity Function
71
  def cosine_similarity(vec1, vec2):
72
  vec1 = np.array(vec1)
73
  vec2 = np.array(vec2)
74
-
75
  dot_product = np.dot(vec1, vec2.T)
76
  magnitude_vec1 = np.linalg.norm(vec1)
77
  magnitude_vec2 = np.linalg.norm(vec2)
78
-
79
  if magnitude_vec1 == 0 or magnitude_vec2 == 0:
80
  return 0.0
81
-
82
  cosine_sim = dot_product / (magnitude_vec1 * magnitude_vec2)
83
  return cosine_sim
84
 
85
- def update_chroma(product_name,url,key,text,vector,log_area):
86
 
87
- id_list = [key+str(i) for i in range(len(text))]
 
88
 
89
  metadata_list = [
90
- { 'key':key,
91
- 'product_name': product_name,
92
- 'url': url,
93
- 'text':item
94
- }
95
- for item in text
96
- ]
97
 
98
  collection.upsert(
99
- ids = id_list,
100
- embeddings = vector,
101
- metadatas = metadata_list
102
  )
103
 
104
  logger.write(f"\n\u2713 Updated DB - {url}\n\n")
@@ -118,10 +122,9 @@ class StreamCapture:
118
  def __exit__(self, exc_type, exc_val, exc_tb):
119
  sys.stdout = self._stdout
120
 
 
121
  # Main Function
122
  def score(main_product, main_url, product_count, link_count, search, logger, log_area):
123
-
124
-
125
  data = {}
126
  similar_products = extract_similar_products(main_product)[:product_count]
127
 
@@ -132,12 +135,10 @@ def score(main_product, main_url, product_count, link_count, search, logger, log
132
  def process_product(product, search_function, main_product):
133
  search_result = search_function(product)
134
  return filtering(search_result, main_product, product, link_count)
135
-
136
-
137
  search_functions = {
138
  'google': search_google,
139
  'duckduckgo': search_duckduckgo,
140
- # 'archive': search_archive,
141
  'github': search_github,
142
  'wikipedia': search_wikipedia
143
  }
@@ -173,27 +174,24 @@ def score(main_product, main_url, product_count, link_count, search, logger, log
173
  elif search == 'wikipedia':
174
  data[product] = filtering(search_wikipedia(product), main_product, product, link_count)
175
 
176
-
177
  # Filtered Link -----------------------------------------
178
  logger.write("\n\n\u2713 Filtered Links\n")
179
  log_area.text(logger.getvalue())
180
 
181
-
182
  # Main product Embeddings ---------------------------------
183
  logger.write("\n\n--> Creating Main product Embeddings\n")
184
 
185
  main_key = get_key(main_url)
186
- main_text,main_vector = get_embed_chroma(main_url)
187
 
188
- update_chroma(main_product,main_url,main_key,main_text,main_vector,log_area)
189
 
190
  # log_area.text(logger.getvalue())
191
  print("\n\n\u2713 Main Product embeddings Created")
192
 
193
-
194
  logger.write("\n\n--> Creating Similar product Embeddings\n")
195
- log_area.text(logger.getvalue())
196
- test_embedding = [0]*768
197
 
198
  for product in data:
199
  for link in data[product]:
@@ -202,33 +200,31 @@ def score(main_product, main_url, product_count, link_count, search, logger, log
202
  similar_key = get_key(url)
203
 
204
  res = collection.query(
205
- query_embeddings = [test_embedding],
206
- n_results=1,
207
- where={"key": similar_key},
208
- )
209
 
210
  if not res['distances'][0]:
211
- similar_text,similar_vector = get_embed_chroma(url)
212
- update_chroma(product,url,similar_key,similar_text,similar_vector,log_area)
213
-
214
 
215
  logger.write("\n\n\u2713 Similar Product embeddings Created\n")
216
  log_area.text(logger.getvalue())
217
 
218
  top_similar = []
219
 
220
- for idx,chunk in enumerate(main_vector):
221
  res = collection.query(
222
- query_embeddings = [chunk],
223
- n_results=1,
224
- where={"key": {'$ne':main_key}},
225
- include=['metadatas','embeddings','distances']
226
- )
227
-
228
- top_similar.append((main_text[idx],chunk,res,res['distances'][0]))
229
-
230
- most_similar_items = sorted(top_similar,key = lambda x:x[3])[:top_similar_count]
231
 
 
232
 
233
  logger.write("--------------- DONE -----------------\n")
234
  log_area.text(logger.getvalue())
@@ -236,71 +232,81 @@ def score(main_product, main_url, product_count, link_count, search, logger, log
236
  return most_similar_items
237
 
238
 
239
-
240
-
241
-
242
  # Streamlit Interface
243
- st.title("Check Infringement")
244
 
 
245
 
246
  # Inputs
247
- main_product = st.text_input('Enter Main Product Name', 'Philips led 7w bulb')
248
- main_url = st.text_input('Enter Main Product Manual URL', 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf')
249
- search_method = st.selectbox('Choose Search Engine', ['All','duckduckgo', 'google', 'archive', 'github', 'wikipedia'])
 
250
 
251
- col1, col2, col3= st.columns(3)
252
- with col1:
253
- product_count = st.number_input("Number of Simliar Products",min_value=1, step=1, format="%i")
254
- with col2:
255
- link_count = st.number_input("Number of Links per product",min_value=1, step=1, format="%i")
256
- with col3:
257
- need_image = st.selectbox("Process Images", ['True','False'])
258
 
259
- top_similar_count = st.number_input("Top Similarities to be displayed",value=3,min_value=1, step=1, format="%i")
260
- tag_option = "Complete Document Similarity"
 
261
 
 
262
 
263
  if st.button('Check for Infringement'):
264
- global log_output # Placeholder for log output
265
 
266
- tab1, tab2 = st.tabs(["Output", "Console"])
267
 
268
  with tab2:
269
  log_output = st.empty()
270
 
271
  with tab1:
272
  with st.spinner('Processing...'):
 
 
 
 
 
273
  with StreamCapture() as logger:
274
  top_similar_values = score(main_product, main_url, product_count, link_count, search_method, logger, log_output)
275
 
276
- st.success('Processing complete!')
277
 
278
- st.subheader("Cosine Similarity Scores")
279
 
280
  for main_text, main_vector, response, _ in top_similar_values:
281
  product_name = response['metadatas'][0][0]['product_name']
282
  link = response['metadatas'][0][0]['url']
283
  similar_text = response['metadatas'][0][0]['text']
 
 
284
 
285
  cosine_score = cosine_similarity([main_vector], response['embeddings'][0])[0][0]
286
 
287
  # Display the product information
288
- with st.container():
289
- st.markdown(f"### [Product: {product_name}]({link})")
290
- st.markdown(f"#### Cosine Score: {cosine_score:.4f}")
291
- col1, col2 = st.columns(2)
292
- with col1:
293
- st.markdown(f"**Main Text:** \n{imporve_text(main_text)}")
294
- with col2:
295
- st.markdown(f"**Similar Text:** \n{imporve_text(similar_text)}")
296
-
297
- st.markdown("---")
 
 
 
 
 
 
 
298
 
299
  if need_image == 'True':
300
  with st.spinner('Processing Images...'):
301
- emb_main = get_image_embeddings(main_product)
302
  similar_prod = extract_similar_products(main_product)[0]
303
- emb_similar = get_image_embeddings(similar_prod)
304
 
305
  similarity_matrix = np.zeros((5, 5))
306
  for i in range(5):
@@ -323,20 +329,37 @@ if st.button('Check for Infringement'):
323
 
324
 
325
 
 
 
 
 
 
 
 
 
326
 
327
- # main_product = 'Philips led 7w bulb'
328
- # main_url = 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf'
329
- # search_method = 'duckduckgo'
330
 
331
- # product_count = 1
332
- # link_count = 1
333
- # need_image = False
 
 
 
334
 
 
 
335
 
336
- # tag_option = "Field Wise Document Similarity"
337
 
338
- # logger = StreamCapture()
339
- # score(main_product, main_url,product_count, link_count, search_method, logger, st.empty())
340
 
 
 
341
 
 
 
342
 
 
 
1
  import streamlit as st
2
  import concurrent.futures
3
+ from concurrent.futures import ThreadPoolExecutor, as_completed
4
  from functools import partial
5
  import numpy as np
6
  from io import StringIO
 
16
  from PyPDF2 import PdfReader
17
  import hashlib
18
  import os
19
+ from plotly.subplots import make_subplots
20
+ import plotly.graph_objects as go
21
+ from PIL import Image
22
+ import datetime
23
+ from apscheduler.schedulers.blocking import BlockingScheduler
24
 
25
  # File Imports
26
+ from embedding import get_embeddings, get_image_embeddings, get_embed_chroma,imporve_text # Ensure this file/module is available
27
  from preprocess import filtering # Ensure this file/module is available
28
+ from github_storage import update_db,download_db
29
  from search import *
30
 
31
 
32
  # Chroma Connections
33
+ client = chromadb.PersistentClient(path="embeddings")
34
+ collection = client.get_or_create_collection(name="data", metadata={"hnsw:space": "l2"})
 
 
 
35
 
36
 
37
 
38
  def generate_hash(content):
39
  return hashlib.sha256(content.encode('utf-8')).hexdigest()
40
 
41
+
42
  def get_key(link):
43
  text = ''
44
  try:
 
52
  # Load the PDF file
53
  reader = PdfReader(pdf_file)
54
  num_pages = len(reader.pages)
55
+
56
  first_page_text = reader.pages[0].extract_text()
57
  if first_page_text:
58
  text += first_page_text
 
59
 
60
  last_page_text = reader.pages[-1].extract_text()
61
  if last_page_text:
 
65
  print(f'HTTP error occurred: {e}')
66
  except Exception as e:
67
  print(f'An error occurred: {e}')
68
+
69
  unique_key = generate_hash(text)
70
+
71
  return unique_key
72
 
73
+
74
  # Cosine Similarity Function
75
  def cosine_similarity(vec1, vec2):
76
  vec1 = np.array(vec1)
77
  vec2 = np.array(vec2)
78
+
79
  dot_product = np.dot(vec1, vec2.T)
80
  magnitude_vec1 = np.linalg.norm(vec1)
81
  magnitude_vec2 = np.linalg.norm(vec2)
82
+
83
  if magnitude_vec1 == 0 or magnitude_vec2 == 0:
84
  return 0.0
85
+
86
  cosine_sim = dot_product / (magnitude_vec1 * magnitude_vec2)
87
  return cosine_sim
88
 
 
89
 
90
+ def update_chroma(product_name, url, key, text, vector, log_area):
91
+ id_list = [key + str(i) for i in range(len(text))]
92
 
93
  metadata_list = [
94
+ {'key': key,
95
+ 'product_name': product_name,
96
+ 'url': url,
97
+ 'text': item
98
+ }
99
+ for item in text
100
+ ]
101
 
102
  collection.upsert(
103
+ ids=id_list,
104
+ embeddings=vector,
105
+ metadatas=metadata_list
106
  )
107
 
108
  logger.write(f"\n\u2713 Updated DB - {url}\n\n")
 
122
  def __exit__(self, exc_type, exc_val, exc_tb):
123
  sys.stdout = self._stdout
124
 
125
+
126
  # Main Function
127
  def score(main_product, main_url, product_count, link_count, search, logger, log_area):
 
 
128
  data = {}
129
  similar_products = extract_similar_products(main_product)[:product_count]
130
 
 
135
  def process_product(product, search_function, main_product):
136
  search_result = search_function(product)
137
  return filtering(search_result, main_product, product, link_count)
138
+
 
139
  search_functions = {
140
  'google': search_google,
141
  'duckduckgo': search_duckduckgo,
 
142
  'github': search_github,
143
  'wikipedia': search_wikipedia
144
  }
 
174
  elif search == 'wikipedia':
175
  data[product] = filtering(search_wikipedia(product), main_product, product, link_count)
176
 
 
177
  # Filtered Link -----------------------------------------
178
  logger.write("\n\n\u2713 Filtered Links\n")
179
  log_area.text(logger.getvalue())
180
 
 
181
  # Main product Embeddings ---------------------------------
182
  logger.write("\n\n--> Creating Main product Embeddings\n")
183
 
184
  main_key = get_key(main_url)
185
+ main_text, main_vector = get_embed_chroma(main_url)
186
 
187
+ update_chroma(main_product, main_url, main_key, main_text, main_vector, log_area)
188
 
189
  # log_area.text(logger.getvalue())
190
  print("\n\n\u2713 Main Product embeddings Created")
191
 
 
192
  logger.write("\n\n--> Creating Similar product Embeddings\n")
193
+ log_area.text(logger.getvalue())
194
+ test_embedding = [0] * 768
195
 
196
  for product in data:
197
  for link in data[product]:
 
200
  similar_key = get_key(url)
201
 
202
  res = collection.query(
203
+ query_embeddings=[test_embedding],
204
+ n_results=1,
205
+ where={"key": similar_key},
206
+ )
207
 
208
  if not res['distances'][0]:
209
+ similar_text, similar_vector = get_embed_chroma(url)
210
+ update_chroma(product, url, similar_key, similar_text, similar_vector, log_area)
 
211
 
212
  logger.write("\n\n\u2713 Similar Product embeddings Created\n")
213
  log_area.text(logger.getvalue())
214
 
215
  top_similar = []
216
 
217
+ for idx, chunk in enumerate(main_vector):
218
  res = collection.query(
219
+ query_embeddings=[chunk],
220
+ n_results=1,
221
+ where={"key": {'$ne': main_key}},
222
+ include=['metadatas', 'embeddings', 'distances']
223
+ )
224
+
225
+ top_similar.append((main_text[idx], chunk, res, res['distances'][0]))
 
 
226
 
227
+ most_similar_items = sorted(top_similar, key=lambda x: x[3])[:top_similar_count]
228
 
229
  logger.write("--------------- DONE -----------------\n")
230
  log_area.text(logger.getvalue())
 
232
  return most_similar_items
233
 
234
 
 
 
 
235
  # Streamlit Interface
236
+ # st.set_page_config(layout="wide", page_title="Infringement Checker", page_icon="🔍")
237
 
238
+ st.title("🔍 Infringement Checker")
239
 
240
  # Inputs
241
+ with st.sidebar:
242
+ st.header("📋 Product Information")
243
+ main_product = st.text_input('Enter Main Product Name', 'Philips led 7w bulb')
244
+ main_url = st.text_input('Enter Main Product Manual URL', 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf')
245
 
246
+ st.header("🔎 Search Settings")
247
+ search_method = st.selectbox('Choose Search Engine', ['All', 'duckduckgo', 'google', 'archive', 'github', 'wikipedia'])
 
 
 
 
 
248
 
249
+ product_count = st.number_input("Number of Similar Products", min_value=1, step=1, format="%i")
250
+ link_count = st.number_input("Number of Links per Product", min_value=1, step=1, format="%i")
251
+ need_image = st.selectbox("Process Images", ['True', 'False'])
252
 
253
+ top_similar_count = st.number_input("Top Similarities to be Displayed", value=3, min_value=1, step=1, format="%i")
254
 
255
  if st.button('Check for Infringement'):
256
+ global log_output
257
 
258
+ tab1, tab2 = st.tabs(["📊 Output", "🖥️ Console"])
259
 
260
  with tab2:
261
  log_output = st.empty()
262
 
263
  with tab1:
264
  with st.spinner('Processing...'):
265
+
266
+ if not os.path.exists('/home/user/app/embeddings'):
267
+ download_db()
268
+ print("\u2713 Downloaded Database\n\n")
269
+
270
  with StreamCapture() as logger:
271
  top_similar_values = score(main_product, main_url, product_count, link_count, search_method, logger, log_output)
272
 
273
+ st.success('Processing complete!')
274
 
275
+ st.subheader("📈 Cosine Similarity Scores")
276
 
277
  for main_text, main_vector, response, _ in top_similar_values:
278
  product_name = response['metadatas'][0][0]['product_name']
279
  link = response['metadatas'][0][0]['url']
280
  similar_text = response['metadatas'][0][0]['text']
281
+ # similar_text_refined = imporve_text(similar_text)
282
+ # main_text_refined = imporve_text(main_text)
283
 
284
  cosine_score = cosine_similarity([main_vector], response['embeddings'][0])[0][0]
285
 
286
  # Display the product information
287
+ with st.expander(f"### Product: {product_name} - Score: {cosine_score:.4f}"):
288
+ link = link.replace(" ","%20")
289
+ st.markdown(f"[View Product Manual]({link})")
290
+ tab1, tab2 = st.tabs(["Raw Text", "Refined Text"])
291
+ with tab2:
292
+ col1, col2 = st.columns(2)
293
+ with col1:
294
+ st.markdown(f"*Main Text:\n* {imporve_text(main_text)}")
295
+ with col2:
296
+ st.markdown(f"*Similar Text\n:* {imporve_text(similar_text)}")
297
+
298
+ with tab1:
299
+ col1, col2 = st.columns(2)
300
+ with col1:
301
+ st.markdown(f"*Main Text:* {main_text}")
302
+ with col2:
303
+ st.markdown(f"*Similar Text:* {similar_text}")
304
 
305
  if need_image == 'True':
306
  with st.spinner('Processing Images...'):
307
+ emb_main , main_prod_imgs = get_image_embeddings(main_product)
308
  similar_prod = extract_similar_products(main_product)[0]
309
+ emb_similar , similar_prod_imgs = get_image_embeddings(similar_prod)
310
 
311
  similarity_matrix = np.zeros((5, 5))
312
  for i in range(5):
 
329
 
330
 
331
 
332
+ @st.experimental_fragment
333
+ def image_viewer():
334
+ # Form to handle image selection
335
+
336
+ st.subheader("Image Viewer")
337
+
338
+ selected_row = st.selectbox('Select a row (Main Product Image)', [f'Image {i+1}' for i in range(5)])
339
+ selected_col = st.selectbox('Select a column (Similar Product Image)', [f'Image {i+1}' for i in range(5)])
340
 
341
+ # Convert the selected 'Image N' labels into zero-based row/column indices
342
+ row_idx = int(selected_row.split()[1]) - 1
343
+ col_idx = int(selected_col.split()[1]) - 1
344
 
345
+ col1, col2 = st.columns(2)
346
+
347
+ with col1:
348
+ st.image(main_prod_imgs[row_idx], caption=f'Main Product Image {row_idx+1}', use_column_width=True)
349
+ with col2:
350
+ st.image(similar_prod_imgs[col_idx], caption=f'Similar Product Image {col_idx+1}', use_column_width=True)
351
 
352
+ # Call the fragment
353
+ image_viewer()
354
 
 
355
 
356
+ def job_function():
357
+ print("Job executed at:", datetime.datetime.now())
358
 
359
+ # Create an instance of scheduler
360
+ scheduler = BlockingScheduler()
361
 
362
+ # Schedule job_function to be executed every 5 seconds
363
+ scheduler.add_job(job_function, 'interval', seconds=5)
364
 
365
+ scheduler.start()