madankn79 committed on
Commit
7245c1f
·
1 Parent(s): f9aee87
Files changed (1) hide show
  1. app.py +37 -50
app.py CHANGED
@@ -2,12 +2,14 @@ import gradio as gr
2
  import re
3
  import torch
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
- from nltk.corpus import stopwords
6
  from spaces import GPU # Required for ZeroGPU Spaces
 
 
7
  import nltk
8
 
9
- # Download stopwords if not already available
10
  nltk.download("stopwords")
 
11
  stop_words = set(stopwords.words("english"))
12
 
13
  # Model list
@@ -21,27 +23,32 @@ model_choices = {
21
 
22
  model_cache = {}
23
 
24
- # Clean text: remove special characters and stop words
25
  def clean_text(input_text):
26
- cleaned = re.sub(r"[^A-Za-z0-9\s]", " ", input_text)
27
- words = cleaned.split()
28
- words = [word for word in words if word.lower() not in stop_words]
29
- return " ".join(words).strip()
30
-
31
- # Load model and tokenizer
32
- def load_model(model_name):
33
- if model_name not in model_cache:
34
- tokenizer = AutoTokenizer.from_pretrained(model_name)
35
- model = AutoModelForSeq2SeqLM.from_pretrained(
36
- model_name, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
37
- )
38
- model.to("cuda" if torch.cuda.is_available() else "cpu")
39
- model_cache[model_name] = (tokenizer, model)
40
-
41
- # Warm up
42
- dummy_input = tokenizer("summarize: warmup", return_tensors="pt").input_ids.to(model.device)
43
- model.generate(dummy_input, max_length=10)
44
- return model_cache[model_name]
 
 
 
 
 
45
 
46
  # Main function triggered by Gradio
47
  @GPU # 👈 Required for ZeroGPU to trigger GPU spin-up
@@ -50,33 +57,13 @@ def summarize_text(input_text, model_label, char_limit):
50
  return "Please enter some text."
51
 
52
  input_text = clean_text(input_text)
53
- model_name = model_choices[model_label]
54
- tokenizer, model = load_model(model_name)
55
-
56
- # Prefix for T5/FLAN-style models
57
- if "t5" in model_name.lower():
58
- input_text = "summarize: " + input_text
59
-
60
- inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
61
- input_ids = inputs["input_ids"].to(model.device)
62
-
63
-
64
- # Adjust the generation parameters
65
- summary_ids = model.generate(
66
- input_ids,
67
- max_length=20,
68
- min_length=10,
69
- do_sample=True, # Enable sampling for more diverse outputs
70
- top_k=50, # Consider top 50 tokens for each step
71
- top_p=0.95, # Top-p (nucleus) sampling to control diversity
72
- temperature=0.7, # Control randomness in output (lower is less random)
73
- no_repeat_ngram_size=2, # Restrict repetition of bigrams (2-grams)
74
- early_stopping=True # Stop generating once the model has finished a reasonable output
75
- )
76
-
77
- summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
78
  return summary[:char_limit].strip()
79
-
80
  # Gradio UI
81
  iface = gr.Interface(
82
  fn=summarize_text,
@@ -86,8 +73,8 @@ iface = gr.Interface(
86
  gr.Slider(minimum=30, maximum=200, value=65, step=1, label="Max Character Limit")
87
  ],
88
  outputs=gr.Textbox(lines=3, label="Summary (truncated to character limit)"),
89
- title="🔥 Fast Summarizer (GPU-Optimized)",
90
- description="Summarizes input using Hugging Face models with ZeroGPU support. Now faster with CUDA, float16, and warm start!"
91
  )
92
 
93
  iface.launch()
 
2
  import re
3
  import torch
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
5
  from spaces import GPU # Required for ZeroGPU Spaces
6
+ from nltk.corpus import stopwords
7
+ from nltk.tokenize import word_tokenize, sent_tokenize
8
  import nltk
9
 
10
+ # Download the NLTK stopwords corpus and Punkt tokenizer data if not already available
11
  nltk.download("stopwords")
12
+ nltk.download('punkt')
13
  stop_words = set(stopwords.words("english"))
14
 
15
  # Model list
 
23
 
24
  model_cache = {}
25
 
26
# Clean text: remove special characters, stop words, SKU codes, and short words
_NON_ALNUM_RE = re.compile(r"[^A-Za-z0-9\s]")
_SKU_CODE_RE = re.compile(r"[A-Za-z]{2,}[0-9]{3,}")  # e.g. ST1642, AB1234


def clean_text(input_text, stopword_set=None):
    """Return ``input_text`` reduced to its meaningful ASCII words.

    Processing steps:
      1. Every character that is not an ASCII letter, digit, or whitespace
         is replaced by a space.
      2. The text is split on whitespace and a word is dropped when it is
         two characters or shorter, when its lowercase form is a stop word,
         or when the whole word looks like a product/SKU code (two or more
         letters followed by three or more digits).
      3. The surviving words are joined with single spaces.

    Args:
        input_text: Raw text to clean. ``None`` or an empty string yields
            an empty string.
        stopword_set: Optional collection of lowercase stop words. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text, with no leading, trailing, or repeated spaces.
    """
    if not input_text:
        return ""
    if stopword_set is None:
        # Default to the module-level English stop-word set.
        stopword_set = stop_words

    # Substituting a space (rather than deleting) keeps words that were only
    # separated by punctuation from being glued together; split() then
    # collapses any run of whitespace.
    tokens = _NON_ALNUM_RE.sub(" ", input_text).split()

    # SKU codes are rejected per token. Removing them from the re-joined
    # string instead would leave a double space where each code used to be.
    kept = [
        token
        for token in tokens
        if len(token) > 2
        and token.lower() not in stopword_set
        and not _SKU_CODE_RE.fullmatch(token)
    ]
    return " ".join(kept)
46
+
47
# Extractive Summarization: select sentences directly from the input text
def extractive_summary(input_text, num_sentences=2):
    """Return the first ``num_sentences`` usable sentences of ``input_text``.

    The text is split into sentences with ``sent_tokenize``; sentences of
    two words or fewer are discarded, and the leading ``num_sentences`` of
    the remainder are joined with single spaces.

    Args:
        input_text: Text to summarize. ``None`` or an empty string yields
            an empty string.
        num_sentences: Maximum number of sentences to keep. Zero or a
            negative value yields an empty string.

    Returns:
        The selected sentences joined by spaces, or ``""`` when nothing
        qualifies.
    """
    # A negative count would otherwise slice from the end and return all
    # but the last sentences, so non-positive counts are rejected up front.
    if not input_text or num_sentences <= 0:
        return ""

    # NOTE(review): the visible caller passes text that has already had all
    # punctuation stripped, so the tokenizer presumably sees one long
    # "sentence" -- confirm whether this should run on the raw text instead.
    sentences = sent_tokenize(input_text)

    # Drop fragments too short to be meaningful (two words or fewer).
    usable = [sentence for sentence in sentences if len(sentence.split()) > 2]
    return " ".join(usable[:num_sentences])
52
 
53
  # Main function triggered by Gradio
54
  @GPU # 👈 Required for ZeroGPU to trigger GPU spin-up
 
57
  return "Please enter some text."
58
 
59
  input_text = clean_text(input_text)
60
+
61
+ # For extractive summarization, we don't use the models that generate new tokens.
62
+ summary = extractive_summary(input_text)
63
+
64
+ # Truncate summary based on the character limit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  return summary[:char_limit].strip()
66
+
67
  # Gradio UI
68
  iface = gr.Interface(
69
  fn=summarize_text,
 
73
  gr.Slider(minimum=30, maximum=200, value=65, step=1, label="Max Character Limit")
74
  ],
75
  outputs=gr.Textbox(lines=3, label="Summary (truncated to character limit)"),
76
+ title="🔥 Fast Summarizer (Extractive Only)",
77
+ description="Summarizes input by selecting key sentences from the input text, without generating new tokens."
78
  )
79
 
80
  iface.launch()