Spaces:

ksg-dfci
/

trial_search_alpha

Running on CPU Upgrade

App Files Files Community

kenlkehl committed on Dec 21, 2024

Commit

fd5440d

verified ·

1 Parent(s): d1e19bb

Upload app.py

Browse files

Files changed (1) hide show

app.py +85 -36

app.py CHANGED Viewed

@@ -22,7 +22,6 @@ tokenizer = AutoTokenizer.from_pretrained("roberta-large")
 checker_pipe = pipeline('text-classification', 'ksg-dfci/TrialChecker', tokenizer=tokenizer,
                         truncation=True, padding='max_length', max_length=512)
 import gradio as gr
 import pandas as pd
 import torch
@@ -32,11 +31,16 @@ from safetensors import safe_open
 from transformers import pipeline, AutoTokenizer
 import tempfile
-# We assume the following objects have already been loaded:
 # trial_spaces (DataFrame), embedding_model (SentenceTransformer),
 # trial_space_embeddings (torch.tensor), checker_pipe (transformers pipeline)
-def match_clinical_trials(patient_summary: str):
     # Encode patient summary
     patient_embedding = embedding_model.encode([patient_summary], convert_to_tensor=True)
@@ -47,12 +51,14 @@ def match_clinical_trials(patient_summary: str):
     sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)
     top_indices = sorted_indices[0:10].cpu().numpy()
     relevant_spaces = trial_spaces.iloc[top_indices].this_space
     relevant_nctid = trial_spaces.iloc[top_indices].nct_id
     relevant_title = trial_spaces.iloc[top_indices].title
     relevant_brief_summary = trial_spaces.iloc[top_indices].brief_summary
     relevant_eligibility_criteria = trial_spaces.iloc[top_indices].eligibility_criteria
     analysis = pd.DataFrame({
         'patient_summary_query': patient_summary,
         'this_space': relevant_spaces,
@@ -62,6 +68,7 @@ def match_clinical_trials(patient_summary: str):
         'trial_eligibility_criteria': relevant_eligibility_criteria
     }).reset_index(drop=True)
     analysis['pt_trial_pair'] = (
         analysis['this_space']
         + "\nNow here is the patient summary:"
@@ -73,7 +80,7 @@ def match_clinical_trials(patient_summary: str):
     analysis['trial_checker_result'] = [x['label'] for x in classifier_results]
     analysis['trial_checker_score'] = [x['score'] for x in classifier_results]
-    # Return the final subset of columns including patient_summary_query as first column
     out_df = analysis[[
         'patient_summary_query',
         'nct_id',
@@ -84,38 +91,91 @@ def match_clinical_trials(patient_summary: str):
         'trial_checker_result',
         'trial_checker_score'
     ]]
-    return out_df, out_df
 def export_results(df: pd.DataFrame):
-    # Save the dataframe to a temporary CSV file and return its path
     temp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
     df.to_csv(temp.name, index=False)
     return temp.name
 custom_css = """
 #input_box textarea {
     width: 600px !important;
     height: 250px !important;
 }
-#output_df table {
-    width: 100% !important;
-    table-layout: auto !important;
-    border-collapse: collapse !important;
 }
-#output_df table td, #output_df table th {
-    min-width: 100px;
-    max-width: 300px;
-    overflow-wrap: anywhere; /* or 'word-wrap: break-word;' */
-    white-space: pre-wrap;   /* or 'white-space: normal;' */
     border: 1px solid #ccc;
-    padding: 4px;
 }
 """
 with gr.Blocks(css=custom_css) as demo:
-    gr.HTML("<h3>Alpha Version of Clinical Trial Search based on MatchMiner-AI models</h3>")
     gr.HTML("<h3>Based on clinicaltrials.gov cancer trials export 10/31/24</h3>")
     patient_summary_input = gr.Textbox(
@@ -126,38 +186,27 @@ with gr.Blocks(css=custom_css) as demo:
     submit_btn = gr.Button("Find Matches")
-    # We'll store the DataFrame in a state so we can export it after generation
     results_state = gr.State()
-    output_df = gr.DataFrame(
-        headers=[
-            "patient_summary_query",
-            "nct_id",
-            "title",
-            "trial_brief_summary",
-            "eligibility_criteria",
-            "this_space",
-            "trial_checker_result",
-            "trial_checker_score"
-        ],
-        elem_id="output_df"
-    )
     export_btn = gr.Button("Export Results")
-    # On "Find Matches", show the DataFrame and store it in state
     submit_btn.click(
-        fn=match_clinical_trials,
         inputs=patient_summary_input,
-        outputs=[output_df, results_state]
     )
-    # On "Export Results", use the state to create and return a CSV file
     export_btn.click(
         fn=export_results,
         inputs=results_state,
         outputs=gr.File(label="Download CSV")
     )
-if __name__ == '__main__':
     demo.launch()

 checker_pipe = pipeline('text-classification', 'ksg-dfci/TrialChecker', tokenizer=tokenizer,
                         truncation=True, padding='max_length', max_length=512)
 import gradio as gr
 import pandas as pd
 import torch
 from transformers import pipeline, AutoTokenizer
 import tempfile
+# We assume the following objects have already been loaded in your environment:
 # trial_spaces (DataFrame), embedding_model (SentenceTransformer),
 # trial_space_embeddings (torch.tensor), checker_pipe (transformers pipeline)
+def match_clinical_trials_html(patient_summary: str):
+    """
+    Takes in a patient_summary string, computes the top 10 matching trials,
+    and returns a tuple of:
+        (html_table_string, df_for_export)
+    """
     # Encode patient summary
     patient_embedding = embedding_model.encode([patient_summary], convert_to_tensor=True)
     sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)
     top_indices = sorted_indices[0:10].cpu().numpy()
+    # Retrieve relevant columns from trial_spaces
     relevant_spaces = trial_spaces.iloc[top_indices].this_space
     relevant_nctid = trial_spaces.iloc[top_indices].nct_id
     relevant_title = trial_spaces.iloc[top_indices].title
     relevant_brief_summary = trial_spaces.iloc[top_indices].brief_summary
     relevant_eligibility_criteria = trial_spaces.iloc[top_indices].eligibility_criteria
+    # Build the main DataFrame for analysis
     analysis = pd.DataFrame({
         'patient_summary_query': patient_summary,
         'this_space': relevant_spaces,
         'trial_eligibility_criteria': relevant_eligibility_criteria
     }).reset_index(drop=True)
+    # Create a merged text input for the reranking checker
     analysis['pt_trial_pair'] = (
         analysis['this_space']
         + "\nNow here is the patient summary:"
     analysis['trial_checker_result'] = [x['label'] for x in classifier_results]
     analysis['trial_checker_score'] = [x['score'] for x in classifier_results]
+    # Subset (and reorder) the final columns we want
     out_df = analysis[[
         'patient_summary_query',
         'nct_id',
         'trial_checker_result',
         'trial_checker_score'
     ]]
+    # Convert that DataFrame to an HTML table
+    html_table = df_to_html(out_df)
+    # Return (HTML for display, DataFrame for exporting)
+    return html_table, out_df
def df_to_html(df: pd.DataFrame) -> str:
    """
    Utility function to convert a DataFrame into an HTML table
    with wrapping text.

    Column names and cell values are converted to strings and HTML-escaped
    before being embedded, so text such as a free-form patient summary
    cannot inject markup into the rendered page. Newlines inside a cell
    are rendered as <br> tags (applied after escaping, so the tags survive).

    Args:
        df: The DataFrame to render; one <th> is emitted per column and one
            <tr> per row.

    Returns:
        An HTML string containing a single <table class="styled-table">.
    """
    # Function-scope import keeps this function self-contained without
    # touching the module-level import block.
    from html import escape

    # Build the table headers
    header_row = "".join(f"<th>{escape(str(col))}</th>" for col in df.columns)

    # Build the table rows
    table_rows = []
    for _, row in df.iterrows():
        cells = "".join(
            "<td>{}</td>".format(escape(str(row[col])).replace("\n", "<br>"))
            for col in df.columns
        )
        table_rows.append(f"<tr>{cells}</tr>")
    table_body = "\n".join(table_rows)

    # Put it all together as an HTML string
    table_html = f"""
    <table class="styled-table">
      <thead><tr>{header_row}</tr></thead>
      <tbody>
        {table_body}
      </tbody>
    </table>
    """
    return table_html
 def export_results(df: pd.DataFrame):
     """
     Save the DataFrame to a temporary CSV file and return the file's path
     so that Gradio can prompt the user to download it.

     Args:
         df: The results DataFrame to export, or None when no results have
             been stored yet (e.g. the export was requested before a search).

     Returns:
         The path of the written ``.csv`` file, or None when ``df`` is None.
     """
     if df is None:
         # Nothing to export; return None rather than raising AttributeError.
         return None
     # delete=False keeps the file on disk after the handle is closed so it
     # can still be served for download. The handle is closed before writing
     # so no file descriptor is leaked per export and to_csv can reopen the
     # path even on platforms that forbid opening a still-open temp file.
     with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp:
         csv_path = temp.name
     df.to_csv(csv_path, index=False)
     return csv_path
 # Stylesheet handed to gr.Blocks(css=...) below. It fixes the size of the
 # textarea under #input_box and styles the "styled-table" class used by the
 # HTML results table (borders, padding, wrapping text, striped rows).
 custom_css = """
 #input_box textarea {
     width: 600px !important;
     height: 250px !important;
 }
+/* Make the custom table more readable: allow wrapping text */
+.styled-table {
+    width: 100%;
+    border-collapse: collapse;
+    table-layout: auto;
+    margin-top: 1em;
 }
+.styled-table th, .styled-table td {
     border: 1px solid #ccc;
+    padding: 8px;
+    vertical-align: top;
+    text-align: left;
+    white-space: pre-wrap;       /* Wrap text */
+    overflow-wrap: anywhere;     /* Break long text automatically */
+}
+.styled-table thead tr {
+    background-color: #f2f2f2;
+    font-weight: bold;
+}
+.styled-table tbody tr:nth-of-type(even) {
+    background-color: #f9f9f9;
 }
 """
+# Build the Gradio interface
 with gr.Blocks(css=custom_css) as demo:
+    gr.HTML("<h3>Alpha Version of Clinical Trial Search (HTML Table Output)</h3>")
     gr.HTML("<h3>Based on clinicaltrials.gov cancer trials export 10/31/24</h3>")
     patient_summary_input = gr.Textbox(
     submit_btn = gr.Button("Find Matches")
+    # We'll store the DataFrame in a state for exporting to CSV
     results_state = gr.State()
+    # The output is now HTML, instead of a DataFrame
+    output_html = gr.HTML(label="Results")
     export_btn = gr.Button("Export Results")
+    # When "Find Matches" is clicked, we get (HTML string, DataFrame)
     submit_btn.click(
+        fn=match_clinical_trials_html,
         inputs=patient_summary_input,
+        outputs=[output_html, results_state]
     )
+    # When "Export Results" is clicked, we export the DataFrame as CSV
     export_btn.click(
         fn=export_results,
         inputs=results_state,
         outputs=gr.File(label="Download CSV")
     )
+if __name__ == "__main__":
     demo.launch()