jackkuo committed
Commit 7bab2b1 · verified · 1 Parent(s): 6364266

Update app.py

Files changed (1)
  1. app.py +139 -43
app.py CHANGED
@@ -12,17 +12,9 @@ import doi
 import requests
 from datetime import datetime, timedelta
 
-
 API_URL = (
     "https://api-inference.huggingface.co/models/mixedbread-ai/mxbai-embed-large-v1"
 )
-summarization_API_URL = (
-    "https://api-inference.huggingface.co/models/Falconsai/text_summarization"
-)
-
-LLM_API_URL = (
-    "https://api-inference.huggingface.co/models/HuggingFaceH4/zephyr-7b-beta"
-)
 
 from openai import OpenAI
 
@@ -33,6 +25,13 @@ client_openai = OpenAI(
     api_key=api_key,
     base_url=base_url,
 )
+api_key_kimi = os.getenv('API_KEY_KIMI')
+base_url_kimi = os.getenv("BASE_URL_KIMI")
+
+client_openai_kimi = OpenAI(
+    api_key=api_key_kimi,
+    base_url=base_url_kimi,
+)
 
 API_TOKEN = os.getenv('hf_token')
 
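Note: client_openai_kimi reuses the OpenAI SDK against Moonshot's OpenAI-compatible endpoint. As a hedged sketch (not part of the commit), this is how such a client is typically exercised, assuming API_KEY_KIMI and BASE_URL_KIMI are set in the environment; the base URL value itself never appears in this diff:

    import os
    from openai import OpenAI

    # Assumed env vars, mirroring the diff above.
    client = OpenAI(
        api_key=os.getenv("API_KEY_KIMI"),
        base_url=os.getenv("BASE_URL_KIMI"),
    )
    # "moonshot-v1-32k" is the model id used later in this commit.
    resp = client.chat.completions.create(
        model="moonshot-v1-32k",
        messages=[{"role": "user", "content": "Say hello."}],
    )
    print(resp.choices[0].message.content)
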
@@ -40,7 +39,6 @@ headers = {"Authorization": f"Bearer {API_TOKEN}"}
 
 
 def query_hf_api(text, api=API_URL, parameters=None):
-
     if not parameters:
         payload = {"inputs": text}
     else:
@@ -64,8 +62,9 @@ def query_hf_api(text, api=API_URL, parameters=None):
     if "error" in response_data and "loading" in response_data["error"]:
         estimated_time = response_data.get("estimated_time", 30)  # Default wait time to 30 seconds if not provided
         with progress_placeholder.container():
-            st.warning(f"Model from :hugging_face: is currently loading. Estimated wait time: {estimated_time:.1f} seconds. Please wait...")
-
+            st.warning(
+                f"Model from :hugging_face: is currently loading. Estimated wait time: {estimated_time:.1f} seconds. Please wait...")
+
             # Create a progress bar within the container
             progress_bar = st.progress(0)
             for i in range(int(estimated_time) + 5):  # Adding a buffer time to ensure the model is loaded
@@ -76,7 +75,7 @@ def query_hf_api(text, api=API_URL, parameters=None):
 
             # Clear the placeholder once loading is complete
             progress_placeholder.empty()
-            
+
             st.rerun()  # Rerun the app after waiting
 
     return response_data
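Note: the two hunks above handle the Hugging Face Inference API's "model is loading" response by showing a progress bar and rerunning the app. The same retry pattern, reduced to a minimal non-Streamlit sketch (hypothetical helper, same API_URL and hf_token as in the file):

    import os
    import time
    import requests

    API_URL = "https://api-inference.huggingface.co/models/mixedbread-ai/mxbai-embed-large-v1"
    headers = {"Authorization": f"Bearer {os.getenv('hf_token')}"}

    def embed_with_retry(text, max_attempts=3):
        # Retry while the hosted model is still being loaded onto a worker.
        for _ in range(max_attempts):
            data = requests.post(API_URL, headers=headers, json={"inputs": text}).json()
            if isinstance(data, dict) and "error" in data and "loading" in data["error"]:
                # The API reports roughly how long the warm-up should take.
                time.sleep(data.get("estimated_time", 30))
                continue
            return data
        raise RuntimeError("Model did not load in time")
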
@@ -99,7 +98,7 @@ def normalize_embeddings(embeddings):
 
 
 def quantize_embeddings(
-    embeddings, precision="ubinary", ranges=None, calibration_embeddings=None
+        embeddings, precision="ubinary", ranges=None, calibration_embeddings=None
 ):
     """
     Quantizes embeddings to a specified precision using PyTorch and numpy.
@@ -184,7 +183,7 @@ def process_embeddings(embeddings, precision="ubinary", calibration_embeddings=N
             calibration_embeddings, dtype=torch.float32
         )
     elif calibration_embeddings is not None and not isinstance(
-        calibration_embeddings, torch.Tensor
+            calibration_embeddings, torch.Tensor
     ):
         raise TypeError(
             "Calibration embeddings must be a list or a torch.Tensor if provided. "
@@ -209,7 +208,7 @@ def load_data_embeddings():
 
     new_data_directory_med = "db_update_med"
     updated_embeddings_directory_med = "embed_update_med"
-    
+
     # Load existing database and embeddings
     df_existing = pd.read_parquet(existing_data_path)
     embeddings_existing = np.load(existing_embeddings_path, allow_pickle=True)
@@ -229,18 +228,19 @@ def load_data_embeddings():
     print(new_data_files)
     for data_file in new_data_files:
         corresponding_embedding_file = Path(updated_embeddings_directory) / (
-            data_file.stem + ".npy"
+                data_file.stem + ".npy"
         )
 
         if corresponding_embedding_file.exists():
             df = pd.read_parquet(data_file)
             new_embeddings = np.load(corresponding_embedding_file, allow_pickle=True)
-            
+
             # Check if the number of rows in the DataFrame matches the number of rows in the embeddings
             if df.shape[0] != new_embeddings.shape[0]:
-                print(f"Shape mismatch for {data_file.name}: DataFrame has {df.shape[0]} rows, embeddings have {new_embeddings.shape[0]} rows. Skipping.")
+                print(
+                    f"Shape mismatch for {data_file.name}: DataFrame has {df.shape[0]} rows, embeddings have {new_embeddings.shape[0]} rows. Skipping.")
                 continue
-            
+
             # Check embedding size and adjust if necessary
             if new_embeddings.shape[1] != embedding_size:
                 print(f"Skipping {data_file.name} due to embedding size mismatch.")
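Note: the loop above pairs each update parquet with a same-stem .npy file and skips any pair whose shapes disagree, rather than corrupting the combined index. The invariant it enforces, as a condensed sketch with hypothetical file names:

    from pathlib import Path
    import numpy as np
    import pandas as pd

    # Hypothetical paired update files, following the stem convention in the diff.
    data_file = Path("db_update/2024-06-01.parquet")
    embedding_file = Path("embed_update") / (data_file.stem + ".npy")

    df = pd.read_parquet(data_file)
    emb = np.load(embedding_file, allow_pickle=True)

    # One embedding row per DataFrame row; width must match the existing index.
    assert df.shape[0] == emb.shape[0]
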
@@ -288,14 +288,15 @@
 
 LLM_prompt = "Review the abstracts listed above and create a list and summary that captures their main themes and findings. Identify any commonalities across the abstracts and highlight these in your summary. Ensure your response is concise, avoids external links, and is formatted in markdown.\n\n"
 
+
 def summarize_abstract(abstract, llm_model="llama-3.1-70b-versatile", instructions=LLM_prompt, api_key=""):
     """
     Summarizes the provided abstract using a specified LLM model.
-    
+
     Parameters:
     - abstract (str): The abstract text to be summarized.
     - llm_model (str): The LLM model used for summarization. Defaults to "llama-3.1-70b-versatile".
-    
+
     Returns:
     - str: A summary of the abstract, condensed into one to two sentences.
     """
@@ -311,8 +312,90 @@
     except Exception as e:  # Catch the exception
         print(f"An error occurred: {e}")  # Print the error
         return 'LLM API not available or above the usage limit.'
-
-
+
+    # Return the summarized content
+    return chat_completion.choices[0].message.content
+
+
+def summarize_abstract_kimi(title, link):
+    """
+    Summarizes the paper behind a given title and link using the Moonshot (Kimi) LLM.
+
+    Parameters:
+    - title (str): The title of the paper to be summarized.
+    - link (str): A DOI or full-text link to the paper.
+
+    Returns:
+    - str: A structured, multi-section summary of the paper.
+    """
+    print("use openai api: moonshot-v1-32k")
+    print(title, link)
+    client = client_openai_kimi
+    formatted_text = "The paper we are going to discuss is " + title + ". The link is " + link + """ .
+    Please use this as a basis to answer my questions. Please output your answers according to the following format. Please pay attention to the logic of subheading stratification and ensure that each layer includes 4-10 points.
+    **Q: What problem does this paper try to solve?**
+
+    A: [Use one sentence to summarize what problem this paper tries to solve]
+    1. Subheading 1: [Content under subheading 1]
+    2. Subheading 2: [Content under subheading 2]
+    3. Subheading 3: […]
+    […]
+
+    **Q: What are the related studies?**
+
+    A: [Use one sentence to summarize the relevant research]
+    1. Subheading 1: [Subheading 1]
+    2. Subheading 2: [Subheading 2]
+    3. Subheading 3: […]
+    […]
+
+    **Q: How does the paper solve this problem?**
+
+    A: [Use one sentence to summarize how the paper solves this problem]
+    1. Subheading 1: [Subheading 1]
+    2. Subheading 2: [Subheading 2]
+    3. Subheading 3: […]
+    […]
+
+    **Q: What experiments were done in the paper?**
+
+    A: [Use one sentence to summarize the experiments done in the paper]
+    1. Subheading 1: [Subheading 1]
+    2. Subheading 2: [Subheading 2]
+    3. Subheading 3: […]
+    […]
+
+    **Q: Is there anything that can be further explored?**
+
+    A: [Use one sentence here to summarize what can be further explored]
+    1. Subheading 1: [Content under subheading 1]
+    2. Subheading 2: [Content under subheading 2]
+    3. Subheading 3: […]
+    […]
+
+    **Q: Summarize the main content of the paper**
+
+    A: [Use one sentence here to summarize the main content of the paper]
+    1. Research background: […]
+    2. Research methods: […]
+    3. Experimental design: […]
+    4. Main findings: […]
+    5. Research contributions: […]
+    6. Future research directions: […]
+    7. Methods and tools: […]
+    8. Dataset: […]
+    9. Conclusion: […]
+    """
+    try:
+        # Create a chat completion with the paper prompt and the Moonshot model
+        chat_completion = client.chat.completions.create(
+            messages=[{"role": "user", "content": f'"{formatted_text}"'}],
+            model="moonshot-v1-32k",
+        )
+    except Exception as e:  # Catch the exception
+        print(f"An error occurred: {e}")  # Print the error
+        return 'LLM API not available or above the usage limit.'
+
     # Return the summarized content
     return chat_completion.choices[0].message.content
 
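Note: unlike summarize_abstract, which receives abstract text, the new summarize_abstract_kimi sends only a title and a link and relies on the prompt template to elicit a structured Q&A-style summary. A minimal call, with a hypothetical paper:

    # Hypothetical title/link; in the app these come from row['title'] and doi_link.
    summary = summarize_abstract_kimi(
        title="A single-cell atlas of the human heart",
        link="https://doi.org/10.1101/2024.01.01.573999",
    )
    print(summary)  # markdown with the Q/A sections defined in formatted_text
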
@@ -381,13 +464,13 @@ def logo(db_update_date, db_size_bio, db_size_med):
 
 
 st.set_page_config(
-    page_title="bMSS",
+    page_title="BioRxiv Search",
     page_icon=":scroll:",
 )
 define_style()
 
 df, embeddings_unique = load_data_embeddings()
-logo(df["date"].max(), df[df['server']=='biorxiv'].shape[0], df[df['server']=='medrxiv'].shape[0])
+logo(df["date"].max(), df[df['server'] == 'biorxiv'].shape[0], df[df['server'] == 'medrxiv'].shape[0])
 
 # model = model_to_device()
 
@@ -471,7 +554,7 @@ if query or search_button:
         search_df["score"] = abs(search_df["score"] - max_score) + min_score
 
         abstracts = []
-        
+
         # Iterate over each row in the search_df DataFrame
         for index, entry in search_df.iterrows():
             row = df.iloc[int(entry["corpus_id"])]
@@ -480,7 +563,7 @@ if query or search_button:
             try:
                 doi_link = f"{doi.get_real_url_from_doi(row['doi'])}"
             except:
-                doi_link = f'https://www.doi.org/'+row['doi']
+                doi_link = f'https://www.doi.org/' + row['doi']
 
             # Append information to plot_data for visualization
             plot_data["Date"].append(row["date"])
@@ -490,9 +573,7 @@ if query or search_button:
             plot_data["category"].append(row["category"])
             plot_data["server"].append(row["server"])
 
-            #summary_text = summarize_abstract(row['abstract'])
-
-            with st.expander(f"{index+1}\. {row['title']}"):  # type: ignore
+            with st.expander(f"{index + 1}\. {row['title']}"):  # type: ignore
                 col1, col2 = st.columns(2)
                 col1.markdown(f"**Score:** {entry['score']:.1f}")
                 col2.markdown(f"**Server:** [{row['server']}]")
@@ -500,7 +581,7 @@ if query or search_button:
                 col1, col2 = st.columns(2)
                 col2.markdown(f"**Category:** {row['category']}")
                 col1.markdown(f"**Date:** {row['date']}")
-                #st.markdown(f"**Summary:**\n{summary_text}", unsafe_allow_html=False)
+                # st.markdown(f"**Summary:**\n{summary_text}", unsafe_allow_html=False)
                 abstracts.append(row['abstract'])
                 st.markdown(
                     f"**Abstract:**\n{row['abstract']}", unsafe_allow_html=False
@@ -508,17 +589,33 @@ if query or search_button:
                 st.markdown(
                     f"**[Full Text Read]({doi_link})** 🔗", unsafe_allow_html=True
                 )
+                summary_button_one_paper = st.button("AI summary of this Paper", key="b_" + str(index + 1))
+                if summary_button_one_paper:
+                    with st.spinner("AI summary of this Paper..."):
+                        ai_gen_start = time.time()
+                        st.markdown('**AI summary of this Paper:**')
+                        summary_of_this_Paper = summarize_abstract_kimi(title=row['title'], link=doi_link)
+                        st.markdown(summary_of_this_Paper)
+                        new_link = f"https://kimi.moonshot.cn/_prefill_chat?prefill_prompt=The paper we are going to discuss is {row['title']}, the link is {str(doi_link)} or https://www.biorxiv.org/content/{str(row['doi'])}v1 " \
+                                   f" Please use this as a basis to continue summarize this article and answer my follow-up questions &send_immediately=true&force_search=false"
+
+                        total_ai_time = time.time() - ai_gen_start
+                        st.markdown(f'**Time to generate summary:** {total_ai_time:.2f} s')
+
+                        # Make sure the HTML link is formatted correctly
+                        st.markdown(f'<a href="{new_link}" target="_blank">**Full Text Dialogue** 🔗</a>',
+                                    unsafe_allow_html=True)
+
     if plot_data:
         with st.spinner("Under statistics..."):
             plot_df = pd.DataFrame(plot_data)
-            
+
             # Convert 'Date' to datetime if it's not already in that format
             plot_df["Date"] = pd.to_datetime(plot_df["Date"])
-            
+
             # Sort the DataFrame based on the Date to make sure it's ordered
             plot_df = plot_df.sort_values(by="Date")
-
-            
+
             # Create a Plotly figure
             fig = px.scatter(
                 plot_df,
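Note: new_link above embeds the raw title and prompt in a query string; spaces survive in practice, but a title containing "&" or "#" would truncate prefill_prompt. A hardening sketch using urllib.parse.quote (a suggestion, not what the commit does):

    from urllib.parse import quote

    # Hypothetical values; in the app these come from row['title'] and doi_link.
    title = "Example title with spaces & symbols"
    doi_link = "https://doi.org/10.1101/2024.01.01.573999"

    prompt = (
        f"The paper we are going to discuss is {title}, the link is {doi_link}. "
        "Please use this as a basis to continue summarizing this article and "
        "answer my follow-up questions."
    )
    # quote() percent-encodes spaces, '&' and '#' so they stay inside prefill_prompt.
    new_link = (
        "https://kimi.moonshot.cn/_prefill_chat"
        f"?prefill_prompt={quote(prompt)}"
        "&send_immediately=true&force_search=false"
    )
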
@@ -534,14 +631,14 @@ if query or search_button:
                 hovertemplate="<b>%{hovertext}</b>",
                 hovertext=plot_df.apply(lambda row: f"{row['Title']}", axis=1),
             )
-            
+
             # Show the figure in the Streamlit app
             st.plotly_chart(fig, use_container_width=True)
-            
+
             # Generate category counts for the pie chart
             category_counts = plot_df["category"].value_counts().reset_index()
             category_counts.columns = ["category", "count"]
-            
+
             # Create a pie chart with Plotly Express
             fig = px.pie(
                 category_counts,
@@ -549,18 +646,17 @@ if query or search_button:
                 names="category",
                 title="Category Distribution",
             )
-            
+
             # Show the pie chart in the Streamlit app
             st.plotly_chart(fig, use_container_width=True)
-            
+
     if abstracts:
         with st.spinner("LLM is summarizing..."):
             prompt = st.text_area("Enter your summary prompt", value=LLM_prompt)
-            summary_button = st.button("AI summary")
+            summary_button = st.button("AI summary", key="b2")
             if summary_button:
                 ai_gen_start = time.time()
                 st.markdown('**AI Summary of 10 abstracts:**')
                 st.markdown(summarize_abstract(abstracts[:9], instructions=prompt))
-                total_ai_time = time.time()-ai_gen_start
+                total_ai_time = time.time() - ai_gen_start
                 st.markdown(f'**Time to generate summary:** {total_ai_time:.2f} s')
-