Spaces:

ksg-dfci
/

trial_search_alpha

Running on CPU Upgrade

App Files Files Community

kenlkehl committed on Dec 22, 2024

Commit

c3ce722

verified ·

1 Parent(s): 0262ad2

Upload app.py

Browse files

Files changed (1) hide show

app.py +66 -54

app.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import gradio as gr
 import pandas as pd
 import torch
-import tempfile
 import torch.nn.functional as F
 from sentence_transformers import SentenceTransformer
 from safetensors import safe_open
 from transformers import pipeline, AutoTokenizer
@@ -19,43 +19,42 @@ with safe_open("trial_space_embeddings.safetensors", framework="pt") as f:
 # Load checker pipeline
 tokenizer = AutoTokenizer.from_pretrained("roberta-large")
-checker_pipe = pipeline('text-classification', 'ksg-dfci/TrialChecker', tokenizer=tokenizer,
-                        truncation=True, padding='max_length', max_length=512)
-import gradio as gr
-import pandas as pd
-import torch
-import torch.nn.functional as F
-import tempfile
-# Assume the following are already loaded:
-#   trial_spaces (DataFrame), embedding_model (SentenceTransformer),
-#   trial_space_embeddings (torch.tensor), checker_pipe (transformers pipeline)
-#
-# For example:
-# trial_spaces = pd.read_csv("some_file.csv")
-# embedding_model = SentenceTransformer("model-name", device="cuda")
-# trial_space_embeddings = torch.load("trial_space_embeddings.pt")
-# checker_pipe = pipeline(...)
-# etc.
-def match_clinical_trials_dropdown(patient_summary: str):
     """
     1) Runs the trial matching logic.
-    2) Returns a gr.update(...) for the dropdown (setting its choices),
-       plus a DataFrame for further use.
     """
     # 1. Encode user input
     patient_embedding = embedding_model.encode([patient_summary], convert_to_tensor=True)
     # 2. Compute similarities
     similarities = F.cosine_similarity(patient_embedding, trial_space_embeddings)
-    # 3. Pull top 10
     sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)
-    top_indices = sorted_indices[0:20].cpu().numpy()
     # 4. Build DataFrame
     relevant_spaces = trial_spaces.iloc[top_indices].this_space
@@ -85,10 +84,10 @@ def match_clinical_trials_dropdown(patient_summary: str):
     analysis['trial_checker_result'] = [x['label'] for x in classifier_results]
     analysis['trial_checker_score'] = [x['score'] for x in classifier_results]
-    # restrict to trials that pass Checker
-    analysis = analysis[analysis.trial_checker_result == 'POSITIVE'].reset_index()
-    # 7. Final columns
     out_df = analysis[[
         'patient_summary_query',
         'nct_id',
@@ -100,38 +99,41 @@ def match_clinical_trials_dropdown(patient_summary: str):
         'trial_checker_score'
     ]]
-    # Build the dropdown choices, e.g., "NCT001 - Some Title"
     dropdown_options = []
-    for this_index, row in out_df.iterrows():
-        option_str = f"{this_index+1}. {row['nct_id']} - {row['trial_title']}"
         dropdown_options.append(option_str)
-    # Return an update for the dropdown (choices + clear any initial value)
-    dropdown_update = gr.Dropdown(
-        choices=dropdown_options,
-        interactive=True,
-        value=dropdown_options[0]
-    )
-    return dropdown_update, out_df
 def show_selected_trial(selected_option: str, df: pd.DataFrame):
     """
-    1) Given the selected dropdown option, e.g. "NCT001 - Some Title"
     2) Find the row in df and build a summary string.
     """
     if not selected_option:
         return ""
-    # Parse NCT ID from "NCT001 - Some Title"
-    chosen_index = selected_option.split(".")[0].strip()
-    #row = df[df['nct_id'] == nct_id]
-    row = df.iloc[[int(chosen_index) - 1]]
-    if row.empty:
         return "No data found for the selected trial."
-    record = row.iloc[0].to_dict()
     details = (
         f"Patient Summary Query: {record['patient_summary_query']}\n\n"
         f"NCT ID: {record['nct_id']}\n"
@@ -139,9 +141,8 @@ def show_selected_trial(selected_option: str, df: pd.DataFrame):
         f"Trial Space: {record['this_space']}\n\n"
         f"Trial Checker Result: {record['trial_checker_result']}\n"
         f"Trial Checker Score: {record['trial_checker_score']}\n\n"
-        f"Trial Brief Summary: {record['trial_brief_summary']}\n\n"
-        f"Trial Full Eligibility Criteria: {record['trial_eligibility_criteria']}\n\n"
     )
     return details
@@ -153,7 +154,7 @@ def export_results(df: pd.DataFrame):
     df.to_csv(temp.name, index=False)
     return temp.name
-# A little CSS for the input box
 custom_css = """
 #input_box textarea {
     width: 600px !important;
@@ -166,7 +167,12 @@ with gr.Blocks(css=custom_css) as demo:
     gr.HTML("""
     <h3>Demonstration version of clinical trial search based on MatchMiner-AI</h3>
     <p>Based on clinicaltrials.gov cancer trials export 10/31/24.</p>
-    <p>Queries take approximately 60 seconds to run (demo is running on a small CPU instance).</p>
     """)
     # Textbox for patient summary
@@ -176,6 +182,12 @@ with gr.Blocks(css=custom_css) as demo:
         value="metastatic lung adenocarcinoma, KRAS G12C mutation, PD-L1 high, previously treated with pembrolizumab."
     )
     # Button to run the matching
     submit_btn = gr.Button("Find Matches")
@@ -204,7 +216,7 @@ with gr.Blocks(css=custom_css) as demo:
     # 1) "Find Matches" => updates the dropdown choices and the state
     submit_btn.click(
         fn=match_clinical_trials_dropdown,
-        inputs=patient_summary_input,
         outputs=[trial_dropdown, results_state]
     )

 import gradio as gr
 import pandas as pd
 import torch
 import torch.nn.functional as F
+import tempfile
 from sentence_transformers import SentenceTransformer
 from safetensors import safe_open
 from transformers import pipeline, AutoTokenizer
 # Load checker pipeline
 tokenizer = AutoTokenizer.from_pretrained("roberta-large")
+checker_pipe = pipeline(
+    'text-classification',
+    'ksg-dfci/TrialChecker',
+    tokenizer=tokenizer,
+    truncation=True,
+    padding='max_length',
+    max_length=512
+)
+def match_clinical_trials_dropdown(patient_summary: str, max_results_str: str):
     """
     1) Runs the trial matching logic.
+    2) Returns a Dropdown (with the matched trials) and a DataFrame (for further use).
+    3) The user-supplied max_results_str is converted to an int (1-50).
     """
+    # Parse the max_results input
+    try:
+        max_results = int(max_results_str)
+    except ValueError:
+        max_results = 10  # if invalid input, default to 10
+    # Clamp within [1, 50]
+    if max_results < 1:
+        max_results = 1
+    elif max_results > 50:
+        max_results = 50
     # 1. Encode user input
     patient_embedding = embedding_model.encode([patient_summary], convert_to_tensor=True)
     # 2. Compute similarities
     similarities = F.cosine_similarity(patient_embedding, trial_space_embeddings)
+    # 3. Pull top 'max_results'
     sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)
+    top_indices = sorted_indices[:max_results].cpu().numpy()
     # 4. Build DataFrame
     relevant_spaces = trial_spaces.iloc[top_indices].this_space
     analysis['trial_checker_result'] = [x['label'] for x in classifier_results]
     analysis['trial_checker_score'] = [x['score'] for x in classifier_results]
+    # 7. Restrict to POSITIVE results only
+    analysis = analysis[analysis.trial_checker_result == 'POSITIVE'].reset_index(drop=True)
+    # 8. Final columns
     out_df = analysis[[
         'patient_summary_query',
         'nct_id',
         'trial_checker_score'
     ]]
+    # Build the dropdown choices, e.g., "1. NCT001 - Some Title"
     dropdown_options = []
+    for i, row in out_df.iterrows():
+        option_str = f"{i+1}. {row['nct_id']} - {row['trial_title']}"
         dropdown_options.append(option_str)
+    # If we have no results, keep the dropdown empty
+    if len(dropdown_options) == 0:
+        return gr.Dropdown(choices=[], interactive=True, value=None), out_df
+    # Otherwise, pick the first item as the default
+    return (
+        gr.Dropdown(choices=dropdown_options, interactive=True, value=dropdown_options[0]),
+        out_df
+    )
 def show_selected_trial(selected_option: str, df: pd.DataFrame):
     """
+    1) Given the selected dropdown option, e.g. "1. NCT001 - Some Title"
     2) Find the row in df and build a summary string.
     """
     if not selected_option:
         return ""
+    # Parse the index from "1. NCT001 - Some Title"
+    chosen_index_str = selected_option.split(".")[0].strip()
+    try:
+        chosen_index = int(chosen_index_str) - 1
+    except ValueError:
+        return "No data found for the selected trial."
+    if chosen_index < 0 or chosen_index >= len(df):
         return "No data found for the selected trial."
+    record = df.iloc[chosen_index].to_dict()
     details = (
         f"Patient Summary Query: {record['patient_summary_query']}\n\n"
         f"NCT ID: {record['nct_id']}\n"
         f"Trial Space: {record['this_space']}\n\n"
         f"Trial Checker Result: {record['trial_checker_result']}\n"
         f"Trial Checker Score: {record['trial_checker_score']}\n\n"
+        f"Brief Summary: {record['trial_brief_summary']}\n\n"
+        f"Full Eligibility Criteria: {record['trial_eligibility_criteria']}\n\n"
     )
     return details
     df.to_csv(temp.name, index=False)
     return temp.name
+# A little CSS for the input boxes
 custom_css = """
 #input_box textarea {
     width: 600px !important;
     gr.HTML("""
     <h3>Demonstration version of clinical trial search based on MatchMiner-AI</h3>
     <p>Based on clinicaltrials.gov cancer trials export 10/31/24.</p>
+    <p>Queries take approximately 30 seconds to run per ten results returned,
+       since demo is running on a small CPU instance.</p>
+    <p>Disclaimers:</p>
+    <p>1. Not a clinical decision support tool</p>
+    <p>2. AI-extracted trial "spaces" and candidate matches may contain errors</p>
+    <p>3. Will not necessarily return all trials that match a given query</p>
     """)
     # Textbox for patient summary
         value="metastatic lung adenocarcinoma, KRAS G12C mutation, PD-L1 high, previously treated with pembrolizumab."
     )
+    # Textbox for max results
+    max_results_input = gr.Textbox(
+        label="Enter the maximum number of results to return (1-50)",
+        value="10"  # default
+    )
     # Button to run the matching
     submit_btn = gr.Button("Find Matches")
     # 1) "Find Matches" => updates the dropdown choices and the state
     submit_btn.click(
         fn=match_clinical_trials_dropdown,
+        inputs=[patient_summary_input, max_results_input],
         outputs=[trial_dropdown, results_state]
     )