kokluch committed on
Commit
504cee7
·
1 Parent(s): 63815c9

Fix official brand exclusion.

Browse files
Files changed (3) hide show
  1. app.py +8 -9
  2. mnemonic_attack.py +13 -4
  3. scam_brands.txt +1 -5
app.py CHANGED
@@ -26,7 +26,7 @@ for handler in logging.root.handlers[:]:
26
 
27
  logging.basicConfig(
28
  level=logging.DEBUG,
29
- format='%(asctime)s [%(levelname)s] %(message)s',
30
  handlers=[logging.StreamHandler(sys.stdout)]
31
  )
32
 
@@ -91,12 +91,6 @@ def predict(model: InputModel) -> OutputModel:
91
 
92
  logging.info(f"[{sender}] {text}")
93
 
94
- # Brand usurpation detection using confusables
95
- confusable_brand = find_confusable_brand(text)
96
- if confusable_brand:
97
- logging.warning(f"[BRAND USURPATION] Confusable/homoglyph variant of brand '{confusable_brand}' detected in message. Classified as JUNK.")
98
- return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
99
-
100
  # Debug sleep
101
  pattern = r"^Sent from your Twilio trial account - sleep (\d+)$"
102
  match = re.search(pattern, text)
@@ -123,6 +117,13 @@ def predict(model: InputModel) -> OutputModel:
123
  case 'promotion':
124
  return OutputModel(action=ActionModel.PROMOTION, sub_action=SubActionModel.NONE)
125
 
 
 
 
 
 
 
 
126
  result = pipe(text)
127
 
128
  label = result[0]['label']
@@ -140,8 +141,6 @@ def predict(model: InputModel) -> OutputModel:
140
 
141
  commercial_stop = False
142
 
143
-
144
-
145
  # Detection of commercial senders (short code or alphanumeric)
146
  if re.search(shorten_sender_pattern, sender):
147
  logging.info("[COMMERCIAL] Commercial sender detected (short code)")
 
26
 
27
  logging.basicConfig(
28
  level=logging.DEBUG,
29
+ format='%(levelname)s: %(asctime)s %(message)s',
30
  handlers=[logging.StreamHandler(sys.stdout)]
31
  )
32
 
 
91
 
92
  logging.info(f"[{sender}] {text}")
93
 
 
 
 
 
 
 
94
  # Debug sleep
95
  pattern = r"^Sent from your Twilio trial account - sleep (\d+)$"
96
  match = re.search(pattern, text)
 
117
  case 'promotion':
118
  return OutputModel(action=ActionModel.PROMOTION, sub_action=SubActionModel.NONE)
119
 
120
+
121
+ # Brand usurpation detection using confusables
122
+ confusable_brand = find_confusable_brand(text)
123
+ if confusable_brand:
124
+ logging.warning(f"[BRAND USURPATION] Confusable/homoglyph variant of brand '{confusable_brand}' detected in message. Classified as JUNK.")
125
+ return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
126
+
127
  result = pipe(text)
128
 
129
  label = result[0]['label']
 
141
 
142
  commercial_stop = False
143
 
 
 
144
  # Detection of commercial senders (short code or alphanumeric)
145
  if re.search(shorten_sender_pattern, sender):
146
  logging.info("[COMMERCIAL] Commercial sender detected (short code)")
mnemonic_attack.py CHANGED
@@ -15,12 +15,19 @@ def find_confusable_brand(message):
15
  """
16
  Check if the message contains a confusable/homoglyph variant of any scam brand.
17
  Returns the matched brand if found, otherwise None.
 
18
  """
19
  for brand in SCAM_BRANDS:
20
  # Build a regex that matches the brand or any confusable variant
21
- regex_string = confusable_regex(brand, include_character_padding=True)
22
  regex = re.compile(regex_string)
23
- if regex.search(message):
 
 
 
 
 
 
24
  return brand
25
  return None
26
 
@@ -30,11 +37,12 @@ def test_find_confusable_brand():
30
  """
31
  test_cases = [
32
  "This is a message from Amazοn support.", # Greek omicron instead of o
33
- "Your Apple account has been locked.",
34
  "Contact S0ciété Générale for more info.", # Zero instead of O
35
  "Welcome to Netflix!",
36
  "This is a message from a random sender.",
37
- "Bonjour, c'est le livreur votre colis ne rentrait pas dans la boite aux lettres merci de choisir un point relais sur : https://mondiaIrelais-expedition.com"
 
38
  ]
39
  for msg in test_cases:
40
  result = find_confusable_brand(msg)
@@ -43,5 +51,6 @@ def test_find_confusable_brand():
43
  else:
44
  print(f"[OK] Message: '{msg}' => No confusable brand detected.")
45
 
 
46
  if __name__ == "__main__":
47
  test_find_confusable_brand()
 
15
  """
16
  Check if the message contains a confusable/homoglyph variant of any scam brand.
17
  Returns the matched brand if found, otherwise None.
18
+ Does not return the brand if the matched text equals the brand itself (compared case-insensitively, ignoring surrounding whitespace).
19
  """
20
  for brand in SCAM_BRANDS:
21
  # Build a regex that matches the brand or any confusable variant
22
+ regex_string = confusable_regex(brand, include_character_padding=False)
23
  regex = re.compile(regex_string)
24
+ for match in regex.finditer(message):
25
+ matched_text = match.group(0)
26
+ # Skip if the matched text is the same as the brand (case-insensitive, surrounding whitespace ignored)
27
+ if matched_text.strip().lower() == brand.lower().strip():
28
+ continue
29
+ else:
30
+ print(f"matched_text: {matched_text.lower().strip()} brand: {brand.lower().strip()}")
31
  return brand
32
  return None
33
 
 
37
  """
38
  test_cases = [
39
  "This is a message from Amazοn support.", # Greek omicron instead of o
40
+ "Your Αpple account has been locked.", # Greek capital alpha instead of Latin capital A
41
  "Contact S0ciété Générale for more info.", # Zero instead of O
42
  "Welcome to Netflix!",
43
  "This is a message from a random sender.",
44
+ "Bonjour, c'est le livreur votre colis ne rentrait pas dans la boite aux lettres merci de choisir un point relais sur : https://mondiaIrelais-expedition.com",
45
+ "599915 est votre code de vérification Leboncoin."
46
  ]
47
  for msg in test_cases:
48
  result = find_confusable_brand(msg)
 
51
  else:
52
  print(f"[OK] Message: '{msg}' => No confusable brand detected.")
53
 
54
+
55
  if __name__ == "__main__":
56
  test_find_confusable_brand()
scam_brands.txt CHANGED
@@ -40,7 +40,6 @@ Doctolib
40
  Douanes françaises
41
  EDF
42
  Engie
43
- Eni
44
  Epic Games
45
  Europ Assistance
46
  Facebook
@@ -49,7 +48,6 @@ Fnac
49
  Fortuneo
50
  Free
51
  GLS
52
- Gendarmerie
53
  GitHub
54
  Google
55
  Groupama
@@ -81,12 +79,12 @@ N26
81
  NMBS
82
  Nespresso
83
  Netflix
84
- Nickel
85
  Norauto
86
  Oney
87
  Orange
88
  PayPal
89
  Paylib
 
90
  Police Nationale
91
  Préfecture
92
  Pôle emploi
@@ -106,9 +104,7 @@ Spotify
106
  Steam
107
  Suez
108
  Sécurité Sociale
109
- TNT
110
  TikTok
111
- Total Direct Energie
112
  TotalEnergies
113
  TransferWise
114
  Trésor Public
 
40
  Douanes françaises
41
  EDF
42
  Engie
 
43
  Epic Games
44
  Europ Assistance
45
  Facebook
 
48
  Fortuneo
49
  Free
50
  GLS
 
51
  GitHub
52
  Google
53
  Groupama
 
79
  NMBS
80
  Nespresso
81
  Netflix
 
82
  Norauto
83
  Oney
84
  Orange
85
  PayPal
86
  Paylib
87
+ Planity
88
  Police Nationale
89
  Préfecture
90
  Pôle emploi
 
104
  Steam
105
  Suez
106
  Sécurité Sociale
 
107
  TikTok
 
108
  TotalEnergies
109
  TransferWise
110
  Trésor Public