jackkuo committed on
Commit
c1f38dc
·
verified ·
1 Parent(s): ec2dfeb

Create streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +541 -0
streamlit_app.py ADDED
@@ -0,0 +1,541 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import os
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer, models
6
+ import torch
7
+ from sentence_transformers.quantization import semantic_search_faiss
8
+ from pathlib import Path
9
+ import time
10
+ import plotly.express as px
11
+ import doi
12
+ import requests
13
+ from groq import Groq
14
+ import dropbox
15
+ from datetime import datetime, timedelta
16
+
17
+
18
from openai import OpenAI

# Hugging Face Inference API endpoints.
# API_URL is the embedding model queried by query_hf_api() by default.
API_URL = "https://api-inference.huggingface.co/models/mixedbread-ai/mxbai-embed-large-v1"
summarization_API_URL = "https://api-inference.huggingface.co/models/Falconsai/text_summarization"
LLM_API_URL = "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"

# Credentials and endpoint for the OpenAI-compatible chat client, read from the
# environment (both are None when the variables are unset).
api_key = os.getenv('API_KEY')
base_url = os.getenv("BASE_URL")
client_openai = OpenAI(api_key=api_key, base_url=base_url)

# Bearer token sent with every Hugging Face Inference API request.
API_TOKEN = os.getenv('hf_token')
headers = {"Authorization": f"Bearer {API_TOKEN}"}
42
+
43
+
44
def query_hf_api(text, api=API_URL, parameters=None):
    """
    Sends a request to a Hugging Face Inference API endpoint and returns the decoded JSON.

    If the server reports that the model is still loading, a warning and a progress
    bar are shown for the estimated loading time (plus a 5 second buffer) and the
    Streamlit script is then rerun.

    Args:
        text: The value sent as the "inputs" field of the request payload.
        api (str): URL of the inference endpoint. Defaults to the embedding model.
        parameters (dict, optional): Extra options sent as the "parameters" field.

    Returns:
        The decoded JSON response, or an empty dict when the request fails or the
        response body is not valid JSON.
    """
    payload = {"inputs": text}
    if parameters:
        payload["parameters"] = parameters

    try:
        # The timeout keeps a stalled connection from blocking the app indefinitely.
        response = requests.post(api, headers=headers, json=payload, timeout=60)
        response_data = response.json()
    except (requests.exceptions.RequestException, ValueError):
        # RequestException covers connection errors, timeouts and (in recent
        # versions of requests) JSON decoding errors; ValueError covers JSON
        # decoding errors raised by older versions.
        st.error("Failed to get a valid response from the server. Please try again later.")
        return {}

    # Only dict responses can carry an "error" entry; successful calls may return lists.
    error = response_data.get("error") if isinstance(response_data, dict) else None

    # Check if the model is currently loading
    if error and "loading" in error:
        estimated_time = response_data.get("estimated_time", 30)  # Default wait time to 30 seconds if not provided
        total_wait = estimated_time + 5  # Adding a buffer time to ensure the model is loaded

        # Placeholder that holds the warning and progress bar while waiting
        progress_placeholder = st.empty()
        with progress_placeholder.container():
            st.warning(f"Model from :hugging_face: is currently loading. Estimated wait time: {estimated_time:.1f} seconds. Please wait...")

            # Create a progress bar within the container
            progress_bar = st.progress(0)
            for i in range(int(estimated_time) + 5):
                # Update progress bar. The factor of 100 is used to convert to percentage completion
                progress_bar.progress(int((i / total_wait) * 100))
                time.sleep(1)  # Wait for a second

        # Clear the placeholder once loading is complete
        progress_placeholder.empty()

        st.rerun()  # Rerun the app after waiting

    return response_data
85
+
86
+
87
def normalize_embeddings(embeddings):
    """
    Scale every row of an embeddings tensor to unit L2 norm.

    A 1-dimensional tensor is treated as a single embedding and is returned
    as a matrix with one row.

    Args:
        embeddings (Tensor): The embeddings tensor to normalize.

    Returns:
        Tensor: The embeddings, L2-normalized along dimension 1.
    """
    is_single_vector = embeddings.dim() == 1
    matrix = embeddings.unsqueeze(0) if is_single_vector else embeddings
    unit_rows = torch.nn.functional.normalize(matrix, p=2, dim=1)
    return unit_rows
101
+
102
+
103
def quantize_embeddings(
    embeddings, precision="ubinary", ranges=None, calibration_embeddings=None
):
    """
    Quantizes embeddings to a specified precision using PyTorch and numpy.

    Args:
        embeddings (Tensor): The embeddings to quantize, assumed to be a Tensor.
        precision (str): The precision to convert to. One of "float32", "int8",
            "uint8", "binary" or "ubinary".
        ranges (Tensor or array-like, optional): Two rows holding the per-dimension
            minimum (row 0) and maximum (row 1) used for "int8"/"uint8". When omitted
            they are computed from ``calibration_embeddings`` if given, otherwise
            from ``embeddings`` itself.
        calibration_embeddings (Tensor, optional): Embeddings used for calibration.

    Returns:
        Tensor: The quantized embeddings. "binary"/"ubinary" pack 8 dimensions
        into each output byte along the last axis.

    Raises:
        ValueError: If ``precision`` is not one of the supported values.
    """
    if precision == "float32":
        return embeddings.float()

    if precision in ("int8", "uint8"):
        if ranges is None:
            reference = (
                calibration_embeddings
                if calibration_embeddings is not None
                else embeddings
            )
            ranges = torch.stack(
                (torch.min(reference, dim=0)[0], torch.max(reference, dim=0)[0])
            )
        elif not isinstance(ranges, torch.Tensor):
            # Accept numpy arrays / nested lists as documented.
            ranges = torch.as_tensor(
                ranges, dtype=torch.float32, device=embeddings.device
            )

        starts, ends = ranges[0], ranges[1]
        steps = (ends - starts) / 255
        # A dimension whose minimum equals its maximum (e.g. a single embedding
        # without calibration data) would otherwise divide by zero and yield NaN.
        steps = torch.where(steps == 0, torch.ones_like(steps), steps)

        scaled = (embeddings - starts) / steps
        if precision == "uint8":
            return torch.clip(scaled, 0, 255).byte()
        return torch.clip(scaled - 128, -128, 127).char()

    if precision in ("binary", "ubinary"):
        # detach()/cpu() make the numpy conversion work for tensors that require
        # grad or live on another device.
        positive_bits = embeddings.detach().cpu().numpy() > 0
        packed_bits = np.packbits(positive_bits, axis=-1)
        if precision == "binary":
            # Shift the unsigned bytes into the signed int8 range explicitly
            # instead of relying on uint8 wrap-around.
            signed = (packed_bits.astype(np.int16) - 128).astype(np.int8)
            return torch.from_numpy(signed)
        return torch.from_numpy(packed_bits).byte()

    raise ValueError(f"Precision {precision} is not supported")
159
+
160
+
161
def process_embeddings(embeddings, precision="ubinary", calibration_embeddings=None):
    """
    Turn raw embeddings into normalized, quantized NumPy arrays.

    Args:
        embeddings (list or Tensor): Raw embeddings, e.g. the JSON list returned
            by an external API, or an existing Tensor.
        precision (str): Desired precision for quantization.
        calibration_embeddings (list or Tensor, optional): Embeddings for calibration.

    Returns:
        numpy.ndarray: The embeddings after normalization and quantization.

    Raises:
        TypeError: If ``embeddings`` (or a provided ``calibration_embeddings``)
            is neither a list nor a torch.Tensor. For ``embeddings`` the offending
            value is also shown in the app via ``st.error``.
    """
    # Lists are converted to float32 tensors; tensors pass through unchanged.
    if isinstance(embeddings, list):
        raw = torch.tensor(embeddings, dtype=torch.float32)
    elif isinstance(embeddings, torch.Tensor):
        raw = embeddings
    else:
        # Anything else is typically an error payload from the server.
        st.error(embeddings)
        raise TypeError(
            f"Embeddings must be a list or a torch.Tensor. Message from the server: {embeddings}"
        )

    # Same conversion for the optional calibration data.
    calibration = calibration_embeddings
    if isinstance(calibration, list):
        calibration = torch.tensor(calibration, dtype=torch.float32)
    elif not (calibration is None or isinstance(calibration, torch.Tensor)):
        raise TypeError(
            "Calibration embeddings must be a list or a torch.Tensor if provided. "
        )

    quantized = quantize_embeddings(
        normalize_embeddings(raw),
        precision=precision,
        calibration_embeddings=calibration,
    )
    return quantized.cpu().numpy()
202
+
203
+
204
+ # Load data and embeddings
205
@st.cache_resource(ttl="1d")
def load_data_embeddings():
    """
    Loads the manuscript database and its embeddings, merged with all update files.

    The base data is read from the "aggregated_data" parquet dataset and
    "biorxiv_ubin_embaddings.npy". Every ``*.parquet`` file in "db_update" and
    "db_update_med" is appended when a ``.npy`` file with the same stem exists in
    "embed_update" / "embed_update_med" and its shape matches. Rows with a
    duplicated "title" are dropped (the last occurrence is kept) from both the
    DataFrame and the embeddings so the two stay row-aligned.

    Returns:
        tuple: ``(df_combined, embeddings_combined)``; row ``i`` of the DataFrame
        (by position) corresponds to row ``i`` of the embeddings array.

    Raises:
        ValueError: If the combined DataFrame and embeddings have different row counts.
    """
    existing_data_path = "aggregated_data"
    new_data_directory_bio = "db_update"
    existing_embeddings_path = "biorxiv_ubin_embaddings.npy"
    updated_embeddings_directory_bio = "embed_update"

    new_data_directory_med = "db_update_med"
    updated_embeddings_directory_med = "embed_update_med"

    # Load existing database and embeddings
    df_existing = pd.read_parquet(existing_data_path)
    # NOTE(review): allow_pickle=True can execute arbitrary code when loading an
    # untrusted file; only use it with embedding files produced by this project.
    embeddings_existing = np.load(existing_embeddings_path, allow_pickle=True)

    print(f"Existing data shape: {df_existing.shape}, Existing embeddings shape: {embeddings_existing.shape}")

    # Determine the embedding size from existing embeddings
    embedding_size = embeddings_existing.shape[1]

    # Prepare lists to collect new updates (always appended to in lockstep)
    df_updates_list = []
    embeddings_updates_list = []

    # Helper function to process updates from a specified directory
    def process_updates(new_data_directory, updated_embeddings_directory):
        new_data_files = sorted(Path(new_data_directory).glob("*.parquet"))
        print(new_data_files)
        for data_file in new_data_files:
            corresponding_embedding_file = Path(updated_embeddings_directory) / (
                data_file.stem + ".npy"
            )

            if not corresponding_embedding_file.exists():
                print(f"No corresponding embedding file found for {data_file.name}")
                continue

            df = pd.read_parquet(data_file)
            new_embeddings = np.load(corresponding_embedding_file, allow_pickle=True)

            # Check if the number of rows in the DataFrame matches the number of rows in the embeddings
            if df.shape[0] != new_embeddings.shape[0]:
                print(f"Shape mismatch for {data_file.name}: DataFrame has {df.shape[0]} rows, embeddings have {new_embeddings.shape[0]} rows. Skipping.")
                continue

            # Skip files whose embeddings are not 2-D or have a different width
            if new_embeddings.ndim != 2 or new_embeddings.shape[1] != embedding_size:
                print(f"Skipping {data_file.name} due to embedding size mismatch.")
                continue

            df_updates_list.append(df)
            embeddings_updates_list.append(new_embeddings)

    # Process updates from both BioRxiv and MedArXiv
    process_updates(new_data_directory_bio, updated_embeddings_directory_bio)
    process_updates(new_data_directory_med, updated_embeddings_directory_med)

    # Append new data to existing. Concatenation is skipped entirely when there
    # are no updates, which avoids concatenating an empty DataFrame.
    if df_updates_list:
        df_combined = pd.concat([df_existing, *df_updates_list])
        embeddings_combined = np.vstack([embeddings_existing, *embeddings_updates_list])
    else:
        df_combined = df_existing
        embeddings_combined = embeddings_existing

    if len(df_combined) != embeddings_combined.shape[0]:
        raise ValueError(
            f"Data/embeddings mismatch: {len(df_combined)} rows vs "
            f"{embeddings_combined.shape[0]} embeddings."
        )

    # Positional boolean mask of the rows to keep (last occurrence of each title).
    # A plain ndarray is used because the concatenated index may contain
    # duplicate labels, so label-based alignment must be avoided.
    keep = (~df_combined.duplicated(subset=["title"], keep="last")).to_numpy()

    # Filter the DataFrame and the embeddings with the same mask to keep them aligned
    df_combined = df_combined[keep]
    embeddings_combined = embeddings_combined[keep]

    return df_combined, embeddings_combined
289
+
290
+
291
LLM_prompt = "Review the abstracts listed above and create a list and summary that captures their main themes and findings. Identify any commonalities across the abstracts and highlight these in your summary. Ensure your response is concise, avoids external links, and is formatted in markdown.\n\n"

def summarize_abstract(abstract, llm_model="llama-3.1-70b-versatile", instructions=LLM_prompt, api_key=""):
    """
    Summarizes one or more abstracts with the OpenAI-compatible chat client.

    The abstracts are numbered, joined into a single text and sent together with
    ``instructions`` as one user message to the "gpt-4o-mini" model through the
    module-level ``client_openai``.

    Parameters:
    - abstract (str or iterable of str): A single abstract or a collection of abstracts.
    - llm_model (str): Kept for backward compatibility; currently ignored because
      the model is fixed to "gpt-4o-mini".
    - instructions (str): Prompt placed before the numbered abstracts.
    - api_key (str): Kept for backward compatibility; currently ignored.

    Returns:
    - str: The model's reply, or a fallback error message if the request fails.
    """
    # Use the argument that was passed in (not a global) and accept a bare string
    # so that a single abstract is not enumerated character by character.
    abstract_list = [abstract] if isinstance(abstract, str) else list(abstract)

    print("use openai api: gpt-4o-mini")
    formatted_text = "\n".join(
        f"{idx + 1}. {text}" for idx, text in enumerate(abstract_list)
    )
    try:
        # Create a chat completion with the numbered abstracts
        chat_completion = client_openai.chat.completions.create(
            messages=[{"role": "user", "content": f'{instructions} "{formatted_text}"'}],
            model="gpt-4o-mini",
        )
    except Exception as e:  # Best effort: any API failure becomes a user-facing message
        print(f"An error occurred: {e}")  # Print the error
        return 'The summarization model is not available or the usage limit was exceeded. Please try again later.'

    # Return the summarized content
    return chat_completion.choices[0].message.content
320
+
321
+
322
def define_style():
    """Inject the app's custom CSS (expander buttons, expander content, links) into the page."""
    css = """
        <style>
        .stExpander > .stButton > button {
        width: 100%;
        border: none;
        background-color: #f0f2f6;
        color: #333;
        text-align: left;
        padding: 15px;
        font-size: 18px;
        border-radius: 10px;
        margin-top: 5px;
        }
        .stExpander > .stExpanderContent {
        padding-left: 10px;
        padding-top: 10px;
        }
        a {
        color: #FF4B4B;
        text-decoration: none;
        }
        </style>
        """
    # unsafe_allow_html is required so the <style> element is not escaped.
    st.markdown(css, unsafe_allow_html=True)
349
+
350
+
351
def logo(db_update_date, db_size_bio, db_size_med):
    """
    Render the page header: both preprint-server logos, the title, usage
    instructions and a line with database statistics.

    Args:
        db_update_date: Value shown after "Last database update:".
        db_size_bio: Entry count shown for bioRxiv.
        db_size_med: Entry count shown for medRxiv.
    """
    biorxiv_logo_url = "https://www.biorxiv.org/sites/default/files/biorxiv_logo_homepage.png"
    medrxiv_logo_url = "https://www.medrxiv.org/sites/default/files/medRxiv_homepage_logo.png"

    header_html = f"""
        <div style='display: flex; justify-content: center; align-items: center;'>
        <div style='margin-right: 20px;'>
        <img src='{biorxiv_logo_url}' alt='BioRxiv logo' style='max-height: 100px;'>
        </div>
        <div style='margin-left: 20px;'>
        <img src='{medrxiv_logo_url}' alt='medRxiv logo' style='max-height: 100px;'>
        </div>
        </div>
        <div style='text-align: center; margin-top: 10px;'>
        <h3 style='color: black;'>Manuscript Semantic Search [bMSS]</h3>
        <h1 align="center"> Automated Enzyme Kinetics Extractor for Markdown</h1>
        <p>How to use:
        <br><strong>1</strong>: Enter your search query (Optional modification "Number of results to show")
        <br><strong>2</strong>: Click the Enter key or Click "Search" to preview it.
        <br><strong>3</strong>: Enter summary prompt in the below input box.
        <br><strong>4</strong>: Click "AI summary" to summarize the search results above.
        </p>
        Last database update: {db_update_date}; Database size: bioRxiv: {db_size_bio} / medRxiv: {db_size_med} entries
        </div>
        <br>
        """
    # The header is raw HTML, so it must be rendered with unsafe_allow_html.
    st.markdown(header_html, unsafe_allow_html=True)
380
+
381
+
382
# ---------------------------------------------------------------------------
# Streamlit page: configuration, data loading, search form, results, AI
# summary and charts. Streamlit re-executes this script on every interaction.
# ---------------------------------------------------------------------------
st.set_page_config(
    page_title="bMSS",
    page_icon=":scroll:",
)
define_style()

df, embeddings_unique = load_data_embeddings()
logo(df["date"].max(), df[df['server']=='biorxiv'].shape[0], df[df['server']=='medrxiv'].shape[0])

# model = model_to_device()

# No FAISS index is kept between runs, so the index is built from the corpus
# embeddings on each search.
corpus_index = None
corpus_precision = "ubinary"
use_hf = False

query = st.text_input("Enter your search query:")

num_to_show = st.number_input(
    "Number of results to show:",
    min_value=1,
    max_value=50,
    value=10,
)

# Run the search whenever a query has been entered
if query:
    with st.spinner("Searching..."):
        # Encode the query
        search_start_time = time.time()
        # query_embedding = model.encode([query], normalize_embeddings=True, precision=corpus_precision)
        embedding_time = time.time()

        raw_embadding = query_hf_api(query)
        query_embedding = process_embeddings(raw_embadding)

        embedding_time_total = time.time() - embedding_time

        # Perform the search
        results, search_time, corpus_index = semantic_search_faiss(
            query_embedding,
            corpus_index=corpus_index,
            corpus_embeddings=embeddings_unique if corpus_index is None else None,
            corpus_precision=corpus_precision,
            top_k=num_to_show,  # type: ignore
            calibration_embeddings=None,
            rescore=False,
            rescore_multiplier=4,
            exact=True,
            output_index=True,
        )

        search_end_time = time.time()
        search_duration = search_end_time - search_start_time

        # Closing tag now matches the opening <h6>.
        st.markdown(
            f"<h6 style='text-align: center; color: #7882af;'>Search Completed in {search_duration:.2f} seconds (embeddings time: {embedding_time_total:.2f})</h6>",
            unsafe_allow_html=True,
        )

        # Prepare the results for plotting
        plot_data = {"Date": [], "Title": [], "Score": [], "DOI": [], "category": [], "server": []}

        search_df = pd.DataFrame(results[0])

        # Find the minimum and maximum original scores
        min_score = search_df["score"].min()
        max_score = search_df["score"].max()

        # Mirror the scores within their observed range (new = max + min - old):
        # the lowest raw score is displayed as the highest value and vice versa.
        search_df["score"] = abs(search_df["score"] - max_score) + min_score

        abstracts = []

        # Iterate over each row in the search_df DataFrame
        for index, entry in search_df.iterrows():
            # corpus_id is a positional index into the de-duplicated DataFrame
            row = df.iloc[int(entry["corpus_id"])]

            # Construct the DOI link; fall back to the generic resolver URL if
            # the lookup fails for any reason.
            try:
                doi_link = str(doi.get_real_url_from_doi(row['doi']))
            except Exception:
                doi_link = 'https://www.doi.org/' + row['doi']

            # Append information to plot_data for visualization
            plot_data["Date"].append(row["date"])
            plot_data["Title"].append(row["title"])
            plot_data["Score"].append(entry["score"])
            plot_data["DOI"].append(row["doi"])
            plot_data["category"].append(row["category"])
            plot_data["server"].append(row["server"])

            #summary_text = summarize_abstract(row['abstract'])

            # The escaped dot keeps Markdown from rendering the title as a list item.
            with st.expander(f"{index+1}\\. {row['title']}"):  # type: ignore
                col1, col2 = st.columns(2)
                col1.markdown(f"**Score:** {entry['score']:.1f}")
                col2.markdown(f"**Server:** [{row['server']}]")
                st.markdown(f"**Authors:** {row['authors']}")
                col1, col2 = st.columns(2)
                col2.markdown(f"**Category:** {row['category']}")
                col1.markdown(f"**Date:** {row['date']}")
                #st.markdown(f"**Summary:**\n{summary_text}", unsafe_allow_html=False)
                abstracts.append(row['abstract'])
                st.markdown(
                    f"**Abstract:**\n{row['abstract']}", unsafe_allow_html=False
                )
                st.markdown(
                    f"**[Full Text Read]({doi_link})** 🔗", unsafe_allow_html=True
                )

        plot_df = pd.DataFrame(plot_data)

        # Convert 'Date' to datetime if it's not already in that format
        plot_df["Date"] = pd.to_datetime(plot_df["Date"])

        # Sort the DataFrame based on the Date to make sure it's ordered
        plot_df = plot_df.sort_values(by="Date")

        prompt = st.text_area("Enter your summary prompt", value=LLM_prompt)
        summary_button = st.button("AI summary")
        if summary_button:
            ai_gen_start = time.time()
            # Summarize at most the first 10 results and report the real count.
            abstracts_for_summary = abstracts[:10]
            st.markdown(f'**AI Summary of {len(abstracts_for_summary)} abstracts:**')
            st.markdown(summarize_abstract(abstracts_for_summary, instructions=prompt))
            total_ai_time = time.time()-ai_gen_start
            st.markdown(f'**Time to generate summary:** {total_ai_time:.2f} s')

        # Create a Plotly figure. hover_name assigns each trace its own titles;
        # setting one hovertext list on all traces would mislabel points because
        # the figure has one trace per server.
        fig = px.scatter(
            plot_df,
            x="Date",
            y="Score",
            hover_name="Title",
            hover_data=["Title", "DOI"],
            color='server',
            title="Publication Times and Scores",
        )
        # Show only the bold title when hovering over a point
        fig.update_traces(
            marker=dict(size=10),
            hovertemplate="<b>%{hovertext}</b>",
        )

        # Show the figure in the Streamlit app
        st.plotly_chart(fig, use_container_width=True)

        # Generate category counts for the pie chart
        category_counts = plot_df["category"].value_counts().reset_index()
        category_counts.columns = ["category", "count"]

        # Create a pie chart with Plotly Express
        fig = px.pie(
            category_counts,
            values="count",
            names="category",
            title="Category Distribution",
        )

        # Show the pie chart in the Streamlit app
        st.plotly_chart(fig, use_container_width=True)