import re
import textwrap

import fitz  # PyMuPDF
import gradio as gr
import nltk
import spacy
from transformers import pipeline

# Download NLTK sentence-tokenizer data if not already present
nltk.download('punkt')
nltk.download('punkt_tab')

# Load spaCy model, downloading it first if it is missing
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Initialize the BigBird-Pegasus summarization pipeline for PubMed texts
summarizer = pipeline("summarization", model="google/bigbird-pegasus-large-pubmed")


# Helper Function: Read PDF with Content Filter
def read_pdf_with_content_filter(file_path,
                                 keywords=("Abstract", "Introduction", "Methods",
                                           "Results", "Conclusions")):
    """
    Reads a PDF file and returns text only from pages that contain one of the
    specified keywords. This helps exclude pages that mainly contain
    header/metadata.
    """
    doc = fitz.open(file_path)
    content_pages = []
    for page in doc:
        page_text = page.get_text()
        if any(keyword.lower() in page_text.lower() for keyword in keywords):
            content_pages.append(page_text)
    return "\n".join(content_pages)


# Helper Function: Clean Text
def clean_text(text):
    """
    Cleans the text by removing citations, extra whitespace, and unwanted characters.
    """
    text = re.sub(r'\[\d+\]', '', text)  # Remove citations like [12]
    text = re.sub(r'\(\d+\)', '', text)  # Remove citations like (3)
    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
    return text.strip()


# Helper Function: Extract Core Sections
def extract_core_sections(text):
    """
    Attempts to extract core sections using common headings.
    Returns a dictionary with section name (lowercase) as key and its content as value.
    """
    pattern = r'(?i)(Abstract|Introduction|Methods|Results|Conclusions|Discussion)\s*[:\n\.]'
    splits = re.split(pattern, text)
    sections = {}
    if len(splits) > 1:
        for i in range(1, len(splits), 2):
            heading = splits[i].strip().lower()
            content = splits[i + 1].strip() if i + 1 < len(splits) else ""
            sections[heading] = content
    return sections


# Helper Function: Remove Header Metadata
def remove_header_metadata(text, marker="Competing Interests:"):
    """
    Removes header/metadata from the text by using a marker.
    If the marker is found, returns text after it; otherwise, returns the original text.
    """
    idx = text.find(marker)
    if idx != -1:
        return text[idx + len(marker):].strip()
    return text


# Helper Function: Split Text into Chunks
def split_into_chunks(text, chunk_size=500):
    """
    Splits the text into chunks of approximately chunk_size words.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[i:i + chunk_size]))
    return chunks


# Helper Function: Summarize Text
def summarize_text(text, max_length=200, min_length=50):
    """
    Summarizes the given text using BigBird-Pegasus.
    Adjusts output lengths if the input is very short.
    """
    input_length = len(text.split())
    if input_length < 60:
        max_length = min(max_length, 40)
        min_length = min(min_length, 10)
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']
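
# Optional: a minimal sketch of sentence-aware chunking, as an alternative to the
# word-count splitter above. It reuses the NLTK punkt data downloaded earlier and
# keeps sentences intact so chunk boundaries never fall mid-sentence. The helper
# name split_into_sentence_chunks is illustrative and not part of the original script.
def split_into_sentence_chunks(text, chunk_size=500):
    """Groups whole sentences into chunks of at most ~chunk_size words."""
    chunks, current, current_len = [], [], 0
    for sentence in nltk.sent_tokenize(text):
        sentence_len = len(sentence.split())
        # Start a new chunk if adding this sentence would exceed the budget
        if current and current_len + sentence_len > chunk_size:
            chunks.append(" ".join(current))
            current, current_len = [], 0
        current.append(sentence)
        current_len += sentence_len
    if current:
        chunks.append(" ".join(current))
    return chunks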
""" sentences = nltk.sent_tokenize(summary) bullets = ["- " + sentence for sentence in sentences] return "\n".join(bullets) # Helper Function: Convert Bullets to Wrapped Paragraph def bullet_to_paragraph_wrapped(bullet_text, width=80): """ Converts bullet point summary into a paragraph and wraps the text to a specified width. """ paragraph = bullet_text.replace("- ", "").replace("", " ") paragraph = re.sub(r'\s+', ' ', paragraph).strip() wrapped_paragraph = textwrap.fill(paragraph, width=width) return wrapped_paragraph # Process PDF Function (Gradio Interface) def process_pdf(file_obj): """ Processes the uploaded PDF file and returns a bullet summary and a wrapped paragraph summary. """ # file_obj is a temporary file path provided by Gradio full_text = read_pdf_with_content_filter(file_obj.name) cleaned_text = clean_text(full_text) sections = extract_core_sections(cleaned_text) if not sections: core_text = remove_header_metadata(cleaned_text) else: order = ['abstract', 'introduction', 'methods', 'results', 'conclusions', 'discussion'] core_content = [sections[sec] for sec in order if sec in sections] core_text = " ".join(core_content) if core_content else cleaned_text chunks = split_into_chunks(core_text, chunk_size=500) chunk_summaries = [] for chunk in chunks: try: chunk_summary = summarize_text(chunk, max_length=200, min_length=50) except Exception as e: chunk_summary = "" chunk_summaries.append(chunk_summary) final_core_summary_text = " ".join(chunk_summaries) final_summary = summarize_text(final_core_summary_text, max_length=200, min_length=50) bullet_points = format_bullet_points(final_summary) paragraph_summary_wrapped = bullet_to_paragraph_wrapped(bullet_points, width=80) return bullet_points, paragraph_summary_wrapped # Create Gradio Interface iface = gr.Interface( fn=process_pdf, inputs=gr.File(label="Upload a Medical PDF"), outputs=[ gr.Textbox(label="Bullet Summary"), gr.Textbox(label="Paragraph Summary") ], title="Medical Document Summarization", description="Upload a medical PDF document to get a summarized bullet-point and paragraph summary of its core content." ) if __name__ == "__main__": iface.launch()