Spaces:

Xindus
/

xindus_summarizer

Sleeping

App Files Files Community

madankn79 committed on May 1

Commit

28b68a3

1 Parent(s): 7245c1f

google

Browse files

Files changed (1) hide show

app.py +47 -34

app.py CHANGED Viewed

@@ -2,14 +2,12 @@ import gradio as gr
 import re
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-from spaces import GPU  # Required for ZeroGPU Spaces
 from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize, sent_tokenize
 import nltk
-# Download NLTK stopwords if not already available
 nltk.download("stopwords")
-nltk.download('punkt')
 stop_words = set(stopwords.words("english"))
 # Model list
@@ -23,32 +21,27 @@ model_choices = {
 model_cache = {}
# Clean text: remove special characters, stop words, SKU codes, and short words
def clean_text(input_text):
    """Reduce raw text to its meaningful ASCII alphanumeric words.

    Every character other than an ASCII letter, digit or whitespace is
    treated as a separator. A word is then dropped when it is an English
    stop word (compared case-insensitively), is two characters or shorter,
    or looks like a SKU/product code: two or more letters followed by three
    or more digits, e.g. ``ST1642``.

    Args:
        input_text: Raw text to clean.

    Returns:
        The surviving words joined by single spaces, or an empty string
        when no word survives.
    """
    # Non-alphanumeric characters become spaces so they act as word breaks.
    cleaned_text = re.sub(r"[^A-Za-z0-9\s]", " ", input_text)

    # SKU codes are rejected word by word. Deleting them from the joined
    # string instead would leave a double space where the code used to be.
    sku_pattern = re.compile(r"[A-Za-z]{2,}[0-9]{3,}")
    filtered_words = [
        word
        for word in cleaned_text.split()
        if len(word) > 2
        and word.lower() not in stop_words
        and not sku_pattern.fullmatch(word)
    ]
    return " ".join(filtered_words)
# Extractive summarization: reuse sentences from the input verbatim
def extractive_summary(input_text, num_sentences=2):
    """Return the leading sentences of *input_text* as a summary.

    The text is split with ``sent_tokenize``; sentences of two words or
    fewer are ignored, and the first ``num_sentences`` remaining sentences
    are joined with single spaces.

    Args:
        input_text: Text to summarize.
        num_sentences: Upper slice bound applied to the kept sentences.

    Returns:
        The selected sentences joined by a space.
    """
    candidates = []
    for sentence in sent_tokenize(input_text):
        # Very short fragments carry too little content to be worth keeping.
        if len(sentence.split()) > 2:
            candidates.append(sentence)

    selected = candidates[:num_sentences]
    return " ".join(selected)
 # Main function triggered by Gradio
 @GPU  # 👈 Required for ZeroGPU to trigger GPU spin-up
@@ -57,11 +50,31 @@ def summarize_text(input_text, model_label, char_limit):
         return "Please enter some text."
     input_text = clean_text(input_text)
-    # For extractive summarization, we don't use the models that generate new tokens.
-    summary = extractive_summary(input_text)
-    # Truncate summary based on the character limit
     return summary[:char_limit].strip()
 # Gradio UI
@@ -73,8 +86,8 @@ iface = gr.Interface(
         gr.Slider(minimum=30, maximum=200, value=65, step=1, label="Max Character Limit")
     ],
     outputs=gr.Textbox(lines=3, label="Summary (truncated to character limit)"),
-    title="🔥 Fast Summarizer (Extractive Only)",
-    description="Summarizes input by selecting key sentences from the input text, without generating new tokens."
 )
 iface.launch()

 import re
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from nltk.corpus import stopwords
+from spaces import GPU  # Required for ZeroGPU Spaces
 import nltk
+# Download stopwords if not already available
 nltk.download("stopwords")
 stop_words = set(stopwords.words("english"))
 # Model list
 model_cache = {}
# Clean text: strip special characters, then drop stop words
def clean_text(input_text):
    """Return *input_text* reduced to its non-stop-word alphanumeric tokens.

    Every character other than an ASCII letter, digit or whitespace is
    replaced by a space. The text is then split on whitespace, and tokens
    whose lowercase form is in ``stop_words`` are discarded.

    Args:
        input_text: Raw text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    alnum_only = re.sub(r"[^A-Za-z0-9\s]", " ", input_text)

    kept = []
    for token in alnum_only.split():
        # Stop-word membership is checked case-insensitively.
        if token.lower() in stop_words:
            continue
        kept.append(token)

    return " ".join(kept).strip()
# Load model and tokenizer (cached per model name)
def load_model(model_name):
    """Return the ``(tokenizer, model)`` pair for *model_name*, loading it once.

    On first use the tokenizer and seq2seq model are loaded with
    ``from_pretrained``, the model is moved to CUDA when
    ``torch.cuda.is_available()`` reports it (CPU otherwise), the pair is
    stored in ``model_cache``, and a short ``generate`` call is run as a
    warm-up. Later calls return the cached pair directly.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        The cached ``(tokenizer, model)`` tuple.
    """
    if model_name in model_cache:
        return model_cache[model_name]

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Half precision is requested only when a CUDA device is available.
    has_cuda = torch.cuda.is_available()
    dtype = torch.float16 if has_cuda else torch.float32
    device = "cuda" if has_cuda else "cpu"

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=dtype)
    model.to(device)

    # The pair is cached before the warm-up, so it stays cached even if the
    # warm-up call below raises.
    model_cache[model_name] = (tokenizer, model)

    # Warm up
    warmup_batch = tokenizer("summarize: warmup", return_tensors="pt")
    warmup_ids = warmup_batch.input_ids.to(model.device)
    model.generate(warmup_ids, max_length=10)

    return model_cache[model_name]
 # Main function triggered by Gradio
 @GPU  # 👈 Required for ZeroGPU to trigger GPU spin-up
         return "Please enter some text."
     input_text = clean_text(input_text)
+    model_name = model_choices[model_label]
+    tokenizer, model = load_model(model_name)
+    # Prefix for T5/FLAN-style models
+    if "t5" in model_name.lower():
+        input_text = "summarize: " + input_text
+    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
+    input_ids = inputs["input_ids"].to(model.device)
+    # Adjust the generation parameters
+    summary_ids = model.generate(
+        input_ids,
+        max_length=30,                # Keep output length short, around the original text's length
+        min_length=15,                # Ensure the summary is not too short
+        do_sample=False,              # Disable sampling to avoid introducing new words
+        num_beams=5,                  # Beam search to find the most likely sequence of tokens
+        early_stopping=True,          # Stop once a reasonable summary is generated
+        no_repeat_ngram_size=2        # Prevent repetition of n-grams (bigrams in this case)
+    )
+    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
     return summary[:char_limit].strip()
 # Gradio UI
         gr.Slider(minimum=30, maximum=200, value=65, step=1, label="Max Character Limit")
     ],
     outputs=gr.Textbox(lines=3, label="Summary (truncated to character limit)"),
+    title="🔥 Fast Summarizer (GPU-Optimized)",
+    description="Summarizes input using Hugging Face models with ZeroGPU support. Now faster with CUDA, float16, and warm start!"
 )
 iface.launch()