Spaces:

Xindus
/

xindus_summarizer

Sleeping

App Files Files Community

madankn79 committed on May 1

Commit

7245c1f

1 Parent(s): f9aee87

google

Browse files

Files changed (1) hide show

app.py +37 -50

app.py CHANGED Viewed

@@ -2,12 +2,14 @@ import gradio as gr
 import re
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-from nltk.corpus import stopwords
 from spaces import GPU  # Required for ZeroGPU Spaces
 import nltk
-# Download stopwords if not already available
 nltk.download("stopwords")
 stop_words = set(stopwords.words("english"))
 # Model list
@@ -21,27 +23,32 @@ model_choices = {
 model_cache = {}
-# Clean text: remove special characters and stop words
 def clean_text(input_text):
-    cleaned = re.sub(r"[^A-Za-z0-9\s]", " ", input_text)
-    words = cleaned.split()
-    words = [word for word in words if word.lower() not in stop_words]
-    return " ".join(words).strip()
-# Load model and tokenizer
-def load_model(model_name):
-    if model_name not in model_cache:
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForSeq2SeqLM.from_pretrained(
-            model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
-        )
-        model.to("cuda" if torch.cuda.is_available() else "cpu")
-        model_cache[model_name] = (tokenizer, model)
-        # Warm up
-        dummy_input = tokenizer("summarize: warmup", return_tensors="pt").input_ids.to(model.device)
-        model.generate(dummy_input, max_length=10)
-    return model_cache[model_name]
 # Main function triggered by Gradio
 @GPU  # 👈 Required for ZeroGPU to trigger GPU spin-up
@@ -50,33 +57,13 @@ def summarize_text(input_text, model_label, char_limit):
         return "Please enter some text."
     input_text = clean_text(input_text)
-    model_name = model_choices[model_label]
-    tokenizer, model = load_model(model_name)
-    # Prefix for T5/FLAN-style models
-    if "t5" in model_name.lower():
-        input_text = "summarize: " + input_text
-    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
-    input_ids = inputs["input_ids"].to(model.device)
-    # Adjust the generation parameters
-    summary_ids = model.generate(
-        input_ids,
-        max_length=20,
-        min_length=10,
-        do_sample=True,               # Enable sampling for more diverse outputs
-        top_k=50,                     # Consider top 50 tokens for each step
-        top_p=0.95,                   # Top-p (nucleus) sampling to control diversity
-        temperature=0.7,              # Control randomness in output (lower is less random)
-        no_repeat_ngram_size=2,       # Restrict repetition of bigrams (2-grams)
-        early_stopping=True           # Stop generating once the model has finished a reasonable output
-    )
-    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
     return summary[:char_limit].strip()
 # Gradio UI
 iface = gr.Interface(
     fn=summarize_text,
@@ -86,8 +73,8 @@ iface = gr.Interface(
         gr.Slider(minimum=30, maximum=200, value=65, step=1, label="Max Character Limit")
     ],
     outputs=gr.Textbox(lines=3, label="Summary (truncated to character limit)"),
-    title="🔥 Fast Summarizer (GPU-Optimized)",
-    description="Summarizes input using Hugging Face models with ZeroGPU support. Now faster with CUDA, float16, and warm start!"
 )
 iface.launch()

 import re
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from spaces import GPU  # Required for ZeroGPU Spaces
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize, sent_tokenize
 import nltk
+# Download NLTK stopwords if not already available
 nltk.download("stopwords")
+nltk.download('punkt')
 stop_words = set(stopwords.words("english"))
 # Model list
 model_cache = {}
+# Clean text: remove special characters, stop words, SKU codes, and short words
 def clean_text(input_text):
+    # Step 1: Replace every character that is not an ASCII letter, digit, or whitespace (punctuation included) with a space
+    cleaned_text = re.sub(r"[^A-Za-z0-9\s]", " ", input_text)
+    cleaned_text = re.sub(r"\s+", " ", cleaned_text)  # Replace multiple spaces with a single space
+    # Step 2: Tokenize the text and remove stopwords and words that are too short to be meaningful
+    words = cleaned_text.split()
+    filtered_words = [word for word in words if word.lower() not in stop_words and len(word) > 2]
+    # Step 3: Rebuild the text from the remaining words
+    filtered_text = " ".join(filtered_words)
+    # Step 4: Remove any product codes or sequences (e.g., ST1642, AB1234)
+    filtered_text = re.sub(r"\b[A-Za-z]{2,}[0-9]{3,}\b", "", filtered_text)  # SKU/product code pattern
+    # Step 5: Strip leading/trailing spaces
+    filtered_text = filtered_text.strip()
+    return filtered_text
+# Extractive Summarization: Select sentences directly from the input text
+def extractive_summary(input_text, num_sentences=2):
+    sentences = sent_tokenize(input_text)  # Tokenize into sentences
+    filtered_sentences = [sentence for sentence in sentences if len(sentence.split()) > 2]  # Filter out very short sentences
+    return " ".join(filtered_sentences[:num_sentences])  # Return first `num_sentences` sentences
 # Main function triggered by Gradio
 @GPU  # 👈 Required for ZeroGPU to trigger GPU spin-up
         return "Please enter some text."
     input_text = clean_text(input_text)
+    # For extractive summarization, we don't use the models that generate new tokens.
+    summary = extractive_summary(input_text)
+    # Truncate summary based on the character limit
     return summary[:char_limit].strip()
 # Gradio UI
 iface = gr.Interface(
     fn=summarize_text,
         gr.Slider(minimum=30, maximum=200, value=65, step=1, label="Max Character Limit")
     ],
     outputs=gr.Textbox(lines=3, label="Summary (truncated to character limit)"),
+    title="🔥 Fast Summarizer (Extractive Only)",
+    description="Summarizes input by selecting key sentences from the input text, without generating new tokens."
 )
 iface.launch()