kokluch committed on
Commit
05026d9
·
1 Parent(s): 664d4ee

Add mnemonic detection

Browse files
Files changed (5) hide show
  1. app.py +30 -14
  2. mnemonic_attack.py +47 -0
  3. requirements.txt +2 -1
  4. run.py +4 -0
  5. scam_brands.txt +52 -0
app.py CHANGED
@@ -1,5 +1,7 @@
1
  from time import sleep
2
  import logging
 
 
3
 
4
  import httpx
5
  from fastapi import FastAPI
@@ -11,17 +13,21 @@ from phishing_datasets import submit_entry
11
  from url_tools import extract_urls, resolve_short_url, extract_domain_from_url
12
  from urlscan_client import UrlscanClient
13
  import requests
14
- import re
15
 
16
 
17
 
18
  app = FastAPI()
19
  urlscan = UrlscanClient()
20
 
21
- # Configuration de base du logging
 
 
 
22
  logging.basicConfig(
23
  level=logging.DEBUG,
24
- format='%(asctime)s [%(levelname)s] %(message)s'
 
25
  )
26
 
27
  class MessageModel(BaseModel):
@@ -85,6 +91,12 @@ def predict(model: InputModel) -> OutputModel:
85
 
86
  logging.info(f"[{sender}] {text}")
87
 
 
 
 
 
 
 
88
  # Debug sleep
89
  pattern = r"^Sent from your Twilio trial account - sleep (\d+)$"
90
  match = re.search(pattern, text)
@@ -112,6 +124,7 @@ def predict(model: InputModel) -> OutputModel:
112
  return OutputModel(action=ActionModel.PROMOTION, sub_action=SubActionModel.NONE)
113
 
114
  result = pipe(text)
 
115
  label = result[0]['label']
116
  score = result[0]['score']
117
 
@@ -120,19 +133,22 @@ def predict(model: InputModel) -> OutputModel:
120
  if label == 'LABEL_0':
121
  score = 1 - score
122
 
123
- commercial_sender_pattern = r'\b[2-8]\d{4}\b'
124
- commercial_stop_pattern = r'\bSTOP(?:\s+SMS)?(?:\s+au)?\s+([2-8]\d{4})\b'
 
 
 
125
  commercial_stop = False
126
 
127
- if re.search(commercial_sender_pattern, sender):
128
- logging.info("[COMMERCIAL] Commercial sender detected")
129
- score = score * 0.9
130
- if re.search(commercial_stop_pattern, text):
131
- logging.info("[COMMERCIAL] STOP keyword detected")
132
- score = score * 0.9
133
- commercial_stop = True
134
- else:
135
- logging.info("[COMMERCIAL] STOP keyword missing")
136
 
137
  urls = extract_urls(text)
138
 
 
1
  from time import sleep
2
  import logging
3
+ import sys
4
+ import re
5
 
6
  import httpx
7
  from fastapi import FastAPI
 
13
  from url_tools import extract_urls, resolve_short_url, extract_domain_from_url
14
  from urlscan_client import UrlscanClient
15
  import requests
16
+ from mnemonic_attack import find_confusable_brand
17
 
18
 
19
 
20
  app = FastAPI()
21
  urlscan = UrlscanClient()
22
 
23
+ # Remove all handlers associated with the root logger object
24
+ for handler in logging.root.handlers[:]:
25
+ logging.root.removeHandler(handler)
26
+
27
  logging.basicConfig(
28
  level=logging.DEBUG,
29
+ format='%(asctime)s [%(levelname)s] %(message)s',
30
+ handlers=[logging.StreamHandler(sys.stdout)]
31
  )
32
 
33
  class MessageModel(BaseModel):
 
91
 
92
  logging.info(f"[{sender}] {text}")
93
 
94
+ # Brand usurpation detection using confusables
95
+ confusable_brand = find_confusable_brand(text)
96
+ if confusable_brand:
97
+ logging.warning(f"[BRAND USURPATION] Confusable/homoglyph variant of brand '{confusable_brand}' detected in message. Classified as JUNK.")
98
+ return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
99
+
100
  # Debug sleep
101
  pattern = r"^Sent from your Twilio trial account - sleep (\d+)$"
102
  match = re.search(pattern, text)
 
124
  return OutputModel(action=ActionModel.PROMOTION, sub_action=SubActionModel.NONE)
125
 
126
  result = pipe(text)
127
+
128
  label = result[0]['label']
129
  score = result[0]['score']
130
 
 
133
  if label == 'LABEL_0':
134
  score = 1 - score
135
 
136
+ # Pattern for detecting an alphanumeric SenderID
137
+ alphanumeric_sender_pattern = r'^[A-Za-z][A-Za-z0-9\-\.]{2,14}$'
138
+ # Pattern for detecting a short code
139
+ shorten_sender_pattern = r'^(?:3\d{4}|[4-8]\d{4})$'
140
+
141
  commercial_stop = False
142
 
143
+
144
+
145
+ # Detection of commercial senders (short code or alphanumeric)
146
+ if re.search(shorten_sender_pattern, sender):
147
+ logging.info("[COMMERCIAL] Commercial sender detected (short code)")
148
+ score = score * 0.7
149
+ elif re.match(alphanumeric_sender_pattern, sender):
150
+ logging.info("[COMMERCIAL] Alphanumeric SenderID detected")
151
+ score = score * 0.7
152
 
153
  urls = extract_urls(text)
154
 
mnemonic_attack.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from confusables import is_confusable, confusable_regex
4
+
5
+ SCAM_BRANDS_FILE = os.path.join(os.path.dirname(__file__), 'scam_brands.txt')
6
+
7
def load_scam_brands(path=None):
    """Load the list of scam brands from a text file.

    Each non-blank line holds one brand name, optionally wrapped in double
    quotes.

    Args:
        path: File to read. When omitted, ``SCAM_BRANDS_FILE`` is used.

    Returns:
        A list of brand names in file order, with surrounding whitespace and
        double quotes removed. Lines that are empty once cleaned are skipped.
    """
    if path is None:
        path = SCAM_BRANDS_FILE

    brands = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            brand = line.strip().strip('"').strip()
            # A quoted-but-empty line (`""`) must not become an empty brand:
            # presumably an empty name would turn into a pattern that matches
            # every message downstream.
            if brand:
                brands.append(brand)
    return brands
11
+
12
+ SCAM_BRANDS = load_scam_brands()
13
+
14
# Compiled confusable patterns keyed by brand name. Building the pattern for a
# brand is loop-invariant work, so it is done once per brand instead of once
# per brand for every incoming message.
_BRAND_REGEX_CACHE = {}


def _brand_regex(brand):
    """Return the compiled confusable regex for *brand*, building it on first use."""
    compiled = _BRAND_REGEX_CACHE.get(brand)
    if compiled is None:
        compiled = re.compile(
            confusable_regex(brand, include_character_padding=True)
        )
        _BRAND_REGEX_CACHE[brand] = compiled
    return compiled


def find_confusable_brand(message):
    """Check whether the message matches the confusable pattern of a scam brand.

    Brands are tried in the order they appear in ``SCAM_BRANDS``; the first
    brand whose pattern is found anywhere in the message wins.

    Args:
        message: Text to scan. An empty or ``None`` message never matches.

    Returns:
        The matched brand name, or ``None`` when no brand pattern is found.
    """
    # Nothing to scan; also avoids a TypeError from searching ``None``.
    if not message:
        return None

    # NOTE(review): the generated pattern appears to accept the brand's own
    # plain spelling as well as homoglyph variants — confirm whether a
    # legitimate mention of a brand is meant to be reported here.
    for brand in SCAM_BRANDS:
        if _brand_regex(brand).search(message):
            return brand
    return None
26
+
27
def test_find_confusable_brand():
    """Run find_confusable_brand over sample messages and print each outcome."""
    samples = (
        "This is a message from Amazοn support.",  # Greek omicron in place of "o"
        "Your Apple account has been locked.",
        "Contact S0ciété Générale for more info.",  # digit zero in place of "o"
        "Welcome to Netflix!",
        "This is a message from a random sender.",
        "Bonjour, c'est le livreur votre colis ne rentrait pas dans la boite aux lettres merci de choisir un point relais sur : https://mondiaIrelais-expedition.com",
    )
    for sample in samples:
        brand = find_confusable_brand(sample)
        if not brand:
            print(f"[OK] Message: '{sample}' => No confusable brand detected.")
            continue
        print(f"[ALERT] Message: '{sample}' => Confusable brand detected: {brand}")
45
+
46
+ if __name__ == "__main__":
47
+ test_find_confusable_brand()
requirements.txt CHANGED
@@ -7,4 +7,5 @@ datasets~=3.6.0
7
  pandas~=2.2.3
8
  httpx~=0.28.1
9
  numpy~=2.2.5
10
- requests~=2.32.3
 
 
7
  pandas~=2.2.3
8
  httpx~=0.28.1
9
  numpy~=2.2.5
10
+ requests~=2.32.3
11
+ confusables~=1.2.0
run.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import uvicorn
2
+ from app import app
3
+
4
# Start the development server only when this file is executed as a script,
# so that importing the module does not launch uvicorn as a side effect.
if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=7860)
scam_brands.txt ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Amazon
2
+ Apple
3
+ Microsoft
4
+ Google
5
+ La Poste
6
+ SNCF
7
+ Orange
8
+ Free
9
+ Bouygues Telecom
10
+ SFR
11
+ PayPal
12
+ Crédit Agricole
13
+ Société Générale
14
+ BNP Paribas
15
+ La Banque Postale
16
+ EDF
17
+ Engie
18
+ Ameli
19
+ CAF
20
+ Impots.gouv
21
+ Chronopost
22
+ Colissimo
23
+ DHL
24
+ FedEx
25
+ UPS
26
+ Facebook
27
+ Instagram
28
+ WhatsApp
29
+ Netflix
30
+ Disney+
31
+ CIC
32
+ Crédit Mutuel
33
+ Boursorama
34
+ Revolut
35
+ N26
36
+ Hello Bank
37
+ Cdiscount
38
+ Vinted
39
+ Leboncoin
40
+ Carrefour
41
+ Auchan
42
+ Lidl
43
+ MondialRelay
44
+ Mondial
45
+ TotalEnergies
46
+ Oney
47
+ Yves Rocher
48
+ Decathlon
49
+ "Service Public"
50
+ "Sécurité Sociale"
51
+ "Police Nationale"
52
+ "Gendarmerie"