Spaces:

pendar02
/

biomedical

Sleeping

App Files Files Community

pendar02 committed on Jan 11

Commit

a12289f

verified ·

1 Parent(s): cfd2959

Update app.py

Browse files

Files changed (1) hide show

app.py +79 -18

app.py CHANGED Viewed

@@ -150,46 +150,107 @@ def post_process_summary(summary):
     return cleaned_summary
 def improve_summary_generation(text, model, tokenizer):
-    """Enhanced version of generate_summary with better parameters and post-processing"""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
     word_count = len(text.split())
-    if word_count < 50:
         return text
     formatted_text = preprocess_text(text)
-    # Adjust generation parameters for better coherence
-    inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     with torch.no_grad():
         summary_ids = model.generate(
             **{
                 "input_ids": inputs["input_ids"],
                 "attention_mask": inputs["attention_mask"],
-                "max_length": min(200, word_count + 50),
-                "min_length": min(50, word_count),
-                "num_beams": 5,  # Increased from 4
-                "length_penalty": 1.5,  # Adjusted from 2.0
-                "early_stopping": True,
                 "no_repeat_ngram_size": 3,
-                "temperature": 0.7,  # Added temperature for better diversity
-                "top_p": 0.9,  # Added top_p sampling
-                "repetition_penalty": 1.2  # Added repetition penalty
             }
         )
     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-    # Apply post-processing
-    summary = post_process_summary(summary)
-    # Check if summary is too similar to original
-    if summary.lower() == text.lower() or len(summary.split()) / word_count > 0.9:
-        return text
     return summary
 def generate_focused_summary(question, abstracts, model, tokenizer):

     return cleaned_summary
 def improve_summary_generation(text, model, tokenizer):
     """Summarize a biomedical abstract with the given model and tokenizer.

     Returns a fixed placeholder message when ``text`` is not a non-blank
     string, and returns ``text`` unchanged when it has fewer than 100
     whitespace-separated words. Otherwise the text is run through
     ``preprocess_text``, tokenized (truncated to 1024 tokens), passed to
     ``model.generate`` and the first generated sequence is decoded and
     cleaned with ``post_process_medical_summary``.
     """
     has_content = isinstance(text, str) and bool(text.strip())
     if not has_content:
         return "No abstract available to summarize."
     # Short abstracts are returned verbatim rather than summarized.
     n_words = len(text.split())
     if n_words < 100:
         return text
     prepared = preprocess_text(text)
     encoded = tokenizer(
         prepared,
         return_tensors="pt",
         max_length=1024,
         truncation=True,
         padding=True,
     )
     # Move every encoded tensor onto the device the model lives on.
     encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}
     # Beam search combined with nucleus sampling; no gradients are needed.
     with torch.no_grad():
         output_ids = model.generate(
             input_ids=encoded["input_ids"],
             attention_mask=encoded["attention_mask"],
             max_length=300,
             min_length=100,
             num_beams=4,
             length_penalty=2.0,
             no_repeat_ngram_size=3,
             early_stopping=True,
             do_sample=True,
             top_p=0.95,
             temperature=0.85,
             repetition_penalty=1.5,
         )
     decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
     return post_process_medical_summary(decoded)
def post_process_medical_summary(summary):
    """Tidy a generated medical/scientific summary for display.

    The cleanup normalises the spacing of statistical notation, expands
    "vs." to "versus", restores the period of "et al", capitalises the first
    letter of every sentence and guarantees terminal punctuation.

    Sentences are split only at a period that is followed by whitespace, so
    decimal numbers such as ``0.05`` are never broken apart, and the
    abbreviations "et al.", "e.g." and "i.e." do not start a new sentence.

    Args:
        summary: Raw summary text. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The cleaned summary string, or ``""`` when the input contains only
        whitespace.
    """
    if not summary:
        return summary

    # Function-scope import: the module's import block is not guaranteed to
    # provide ``re``.
    import re

    text = summary.strip()

    # Parentheses: no padding just inside the brackets.
    text = re.sub(r"\(\s+", "(", text)
    text = re.sub(r"\s+\)", ")", text)

    # Expand the abbreviation before periods are normalised.
    text = re.sub(r"\bvs\.", "versus", text)

    # p-values and sample sizes: compact form ("p<0.05", "p=0.01", "n=30").
    # The word boundary keeps words that merely end in "p"/"n" untouched.
    text = re.sub(r"\b[pP]\s*(<=|>=|[<>=≤≥])\s*", r"p\1", text)
    text = re.sub(r"\bn\s*=\s*", "n=", text)

    # Plus/minus: a single symbol without surrounding spaces.
    text = re.sub(r"\s*(?:\+/-|±)\s*", "±", text)

    # Drop whitespace before a period unless it starts a number like ".5".
    text = re.sub(r"\s+\.(?!\d)", ".", text)

    # "et al" always carries exactly one period.
    text = re.sub(r"\bet al\b(?!\.)", "et al.", text)

    # Collapse runs of periods into one.
    text = re.sub(r"\.{2,}", ".", text)

    # A sentence ends at a period followed by whitespace, except after the
    # protected abbreviations (fixed-width negative lookbehinds).
    boundary = r"(?<=\.)(?<!\bet al\.)(?<!\be\.g\.)(?<!\bi\.e\.)\s+"
    # Leave a leading "p<..."/"n=..." token lowercase.
    stat_start = re.compile(r"[pn][<>=≤≥]")

    sentences = [
        part if stat_start.match(part) else part[0].upper() + part[1:]
        for part in re.split(boundary, text)
        if part
    ]
    result = " ".join(sentences)

    # Ensure proper ending without doubling existing terminal punctuation.
    if result and not result.endswith((".", "!", "?")):
        result += "."
    return result
 def generate_focused_summary(question, abstracts, model, tokenizer):