madankn79 committed on
Commit e4f016c · 1 Parent(s): 26a1a9d
Files changed (1)
  1. app.py +20 -18
app.py CHANGED
@@ -5,7 +5,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from nltk.corpus import stopwords
 import nltk
 
-# Download NLTK stopwords
+# Download NLTK stopwords (only the first time you run)
 nltk.download('stopwords')
 stop_words = set(stopwords.words('english'))
 
@@ -20,25 +20,27 @@ model_choices = {
 
 model_cache = {}
 
-# Clean input text (remove stopwords and SKUs/product codes)
+# Clean input text (remove stopwords, SKU codes, and non-meaningful text)
 def clean_text(input_text):
-    # Remove simple SKU codes (e.g., ST1642, AB1234, etc.)
-    cleaned_text = re.sub(r'\b[A-Za-z]{2,}[0-9]{3,}\b', '', input_text)  # Alphanumeric SKU
-
-    # Replace special characters with a space
-    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', ' ', cleaned_text)
-
-    # Tokenize the input text and remove stop words
+    # Step 1: Remove any non-English characters (like special symbols, non-latin characters)
+    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', ' ', input_text)  # Allow only letters and numbers
+    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
+
+    # Step 2: Tokenize the text and remove stopwords and words that are too short to be meaningful
     words = cleaned_text.split()
-    words = [word for word in words if word.lower() not in stop_words]
-
-    # Rebuild the cleaned text
-    cleaned_text = " ".join(words)
-
-    # Strip leading and trailing spaces
-    cleaned_text = cleaned_text.strip()
-
-    return cleaned_text
+    filtered_words = [word for word in words if word.lower() not in stop_words and len(word) > 2]
+
+    # Step 3: Rebuild the text from the remaining words
+    filtered_text = " ".join(filtered_words)
+
+    # Step 4: Remove any product codes or sequences (e.g., ST1642, AB1234)
+    # Assuming product codes follow a pattern of letters followed by numbers
+    filtered_text = re.sub(r'\b[A-Za-z]{2,}[0-9]{3,}\b', '', filtered_text)  # SKU/product code pattern
+
+    # Strip leading/trailing spaces
+    filtered_text = filtered_text.strip()
+
+    return filtered_text
 
 # Load model and tokenizer
 def load_model(model_name):
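
Below is a minimal, self-contained sketch of the revised clean_text() introduced by this commit, for quick local testing. The tiny hard-coded stopword set and the sample sentence are illustrative stand-ins (the app itself uses nltk.corpus.stopwords); note that the SKU regex now runs after punctuation stripping and stopword/short-token filtering, so a double space can be left where a code was removed.

# Minimal sketch of the revised clean_text() (assumption: a small hard-coded
# stopword set stands in for NLTK stopwords; the sample sentence is made up).
import re

stop_words = {"the", "for", "a", "an", "and", "of"}  # stand-in for stopwords.words('english')

def clean_text(input_text):
    # Step 1: keep only letters, digits and whitespace, then collapse repeated spaces
    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', ' ', input_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    # Step 2: drop stopwords and tokens of 2 characters or fewer
    words = cleaned_text.split()
    filtered_words = [w for w in words if w.lower() not in stop_words and len(w) > 2]

    # Step 3: rebuild the text
    filtered_text = " ".join(filtered_words)

    # Step 4: strip SKU/product codes such as ST1642 or AB1234 (letters followed by digits)
    filtered_text = re.sub(r'\b[A-Za-z]{2,}[0-9]{3,}\b', '', filtered_text)

    return filtered_text.strip()

print(clean_text("Order AB1234 for the premium blue widget!!"))
# -> "Order  premium blue widget" (stopwords and the SKU are gone; a double space is
#    left where the SKU was removed, because whitespace is collapsed before Step 4)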