madankn79 committed on
Commit e4f016c · 1 Parent(s): 26a1a9d
Files changed (1)
  1. app.py +20 -18
app.py CHANGED
@@ -5,7 +5,7 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from nltk.corpus import stopwords
 import nltk
 
-# Download NLTK stopwords
+# Download NLTK stopwords (only the first time you run)
 nltk.download('stopwords')
 stop_words = set(stopwords.words('english'))
 
@@ -20,25 +20,27 @@ model_choices = {
 
 model_cache = {}
 
-# Clean input text (remove stopwords and SKUs/product codes)
+# Clean input text (remove stopwords, SKU codes, and non-meaningful text)
 def clean_text(input_text):
-    # Remove simple SKU codes (e.g., ST1642, AB1234, etc.)
-    cleaned_text = re.sub(r'\b[A-Za-z]{2,}[0-9]{3,}\b', '', input_text)  # Alphanumeric SKU
-
-    # Replace special characters with a space
-    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', ' ', cleaned_text)
-
-    # Tokenize the input text and remove stop words
+    # Step 1: Remove any non-English characters (like special symbols, non-latin characters)
+    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', ' ', input_text)  # Allow only letters and numbers
+    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
+
+    # Step 2: Tokenize the text and remove stopwords and words that are too short to be meaningful
     words = cleaned_text.split()
-    words = [word for word in words if word.lower() not in stop_words]
-
-    # Rebuild the cleaned text
-    cleaned_text = " ".join(words)
-
-    # Strip leading and trailing spaces
-    cleaned_text = cleaned_text.strip()
-
-    return cleaned_text
+    filtered_words = [word for word in words if word.lower() not in stop_words and len(word) > 2]
+
+    # Step 3: Rebuild the text from the remaining words
+    filtered_text = " ".join(filtered_words)
+
+    # Step 4: Remove any product codes or sequences (e.g., ST1642, AB1234)
+    # Assuming product codes follow a pattern of letters followed by numbers
+    filtered_text = re.sub(r'\b[A-Za-z]{2,}[0-9]{3,}\b', '', filtered_text)  # SKU/product code pattern
+
+    # Strip leading/trailing spaces
+    filtered_text = filtered_text.strip()
+
+    return filtered_text
 
 # Load model and tokenizer
 def load_model(model_name):
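
Below is a minimal, self-contained sketch of the revised clean_text() introduced by this commit, for quick local testing. The tiny hard-coded stopword set and the sample sentence are illustrative stand-ins (the app itself uses nltk.corpus.stopwords); note that the SKU regex now runs after punctuation stripping and stopword/short-token filtering, so a double space can be left where a code was removed.

# Minimal sketch of the revised clean_text() (assumption: a small hard-coded
# stopword set stands in for NLTK stopwords; the sample sentence is made up).
import re

stop_words = {"the", "for", "a", "an", "and", "of"}  # stand-in for stopwords.words('english')

def clean_text(input_text):
    # Step 1: keep only letters, digits and whitespace, then collapse repeated spaces
    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', ' ', input_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)

    # Step 2: drop stopwords and tokens of 2 characters or fewer
    words = cleaned_text.split()
    filtered_words = [w for w in words if w.lower() not in stop_words and len(w) > 2]

    # Step 3: rebuild the text
    filtered_text = " ".join(filtered_words)

    # Step 4: strip SKU/product codes such as ST1642 or AB1234 (letters followed by digits)
    filtered_text = re.sub(r'\b[A-Za-z]{2,}[0-9]{3,}\b', '', filtered_text)

    return filtered_text.strip()

print(clean_text("Order AB1234 for the premium blue widget!!"))
# -> "Order  premium blue widget" (stopwords and the SKU are gone; a double space is
#    left where the SKU was removed, because whitespace is collapsed before Step 4)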