Spaces:
Running
Running
Add mnemonic detection
Browse files- app.py +30 -14
- mnemonic_attack.py +47 -0
- requirements.txt +2 -1
- run.py +4 -0
- scam_brands.txt +52 -0
app.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
from time import sleep
|
2 |
import logging
|
|
|
|
|
3 |
|
4 |
import httpx
|
5 |
from fastapi import FastAPI
|
@@ -11,17 +13,21 @@ from phishing_datasets import submit_entry
|
|
11 |
from url_tools import extract_urls, resolve_short_url, extract_domain_from_url
|
12 |
from urlscan_client import UrlscanClient
|
13 |
import requests
|
14 |
-
import
|
15 |
|
16 |
|
17 |
|
18 |
app = FastAPI()
|
19 |
urlscan = UrlscanClient()
|
20 |
|
21 |
-
#
|
|
|
|
|
|
|
22 |
logging.basicConfig(
|
23 |
level=logging.DEBUG,
|
24 |
-
format='%(asctime)s [%(levelname)s] %(message)s'
|
|
|
25 |
)
|
26 |
|
27 |
class MessageModel(BaseModel):
|
@@ -85,6 +91,12 @@ def predict(model: InputModel) -> OutputModel:
|
|
85 |
|
86 |
logging.info(f"[{sender}] {text}")
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
# Debug sleep
|
89 |
pattern = r"^Sent from your Twilio trial account - sleep (\d+)$"
|
90 |
match = re.search(pattern, text)
|
@@ -112,6 +124,7 @@ def predict(model: InputModel) -> OutputModel:
|
|
112 |
return OutputModel(action=ActionModel.PROMOTION, sub_action=SubActionModel.NONE)
|
113 |
|
114 |
result = pipe(text)
|
|
|
115 |
label = result[0]['label']
|
116 |
score = result[0]['score']
|
117 |
|
@@ -120,19 +133,22 @@ def predict(model: InputModel) -> OutputModel:
|
|
120 |
if label == 'LABEL_0':
|
121 |
score = 1 - score
|
122 |
|
123 |
-
|
124 |
-
|
|
|
|
|
|
|
125 |
commercial_stop = False
|
126 |
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
|
137 |
urls = extract_urls(text)
|
138 |
|
|
|
1 |
from time import sleep
|
2 |
import logging
|
3 |
+
import sys
|
4 |
+
import re
|
5 |
|
6 |
import httpx
|
7 |
from fastapi import FastAPI
|
|
|
13 |
from url_tools import extract_urls, resolve_short_url, extract_domain_from_url
|
14 |
from urlscan_client import UrlscanClient
|
15 |
import requests
|
16 |
+
from mnemonic_attack import find_confusable_brand
|
17 |
|
18 |
|
19 |
|
20 |
app = FastAPI()
|
21 |
urlscan = UrlscanClient()
|
22 |
|
23 |
+
# Send all log output to stdout.
# ``force=True`` (Python 3.8+) makes basicConfig remove and close every handler
# already attached to the root logger before installing the one given here, so
# this configuration takes effect even if logging was configured earlier.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True,
)
|
32 |
|
33 |
class MessageModel(BaseModel):
|
|
|
91 |
|
92 |
logging.info(f"[{sender}] {text}")
|
93 |
|
94 |
+
# Brand usurpation detection using confusables
|
95 |
+
confusable_brand = find_confusable_brand(text)
|
96 |
+
if confusable_brand:
|
97 |
+
logging.warning(f"[BRAND USURPATION] Confusable/homoglyph variant of brand '{confusable_brand}' detected in message. Classified as JUNK.")
|
98 |
+
return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
|
99 |
+
|
100 |
# Debug sleep
|
101 |
pattern = r"^Sent from your Twilio trial account - sleep (\d+)$"
|
102 |
match = re.search(pattern, text)
|
|
|
124 |
return OutputModel(action=ActionModel.PROMOTION, sub_action=SubActionModel.NONE)
|
125 |
|
126 |
result = pipe(text)
|
127 |
+
|
128 |
label = result[0]['label']
|
129 |
score = result[0]['score']
|
130 |
|
|
|
133 |
if label == 'LABEL_0':
|
134 |
score = 1 - score
|
135 |
|
136 |
+
# Pattern for detecting an alphanumeric SenderID
|
137 |
+
alphanumeric_sender_pattern = r'^[A-Za-z][A-Za-z0-9\-\.]{2,14}$'
|
138 |
+
# Pattern for detecting a short code
|
139 |
+
shorten_sender_pattern = r'^(?:3\d{4}|[4-8]\d{4})$'
|
140 |
+
|
141 |
commercial_stop = False
|
142 |
|
143 |
+
|
144 |
+
|
145 |
+
# Detection of commercial senders (short code or alphanumeric)
|
146 |
+
if re.search(shorten_sender_pattern, sender):
|
147 |
+
logging.info("[COMMERCIAL] Commercial sender detected (short code)")
|
148 |
+
score = score * 0.7
|
149 |
+
elif re.match(alphanumeric_sender_pattern, sender):
|
150 |
+
logging.info("[COMMERCIAL] Alphanumeric SenderID detected")
|
151 |
+
score = score * 0.7
|
152 |
|
153 |
urls = extract_urls(text)
|
154 |
|
mnemonic_attack.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
from confusables import is_confusable, confusable_regex
|
4 |
+
|
5 |
+
# Default location of the brand list: a text file next to this module.
SCAM_BRANDS_FILE = os.path.join(os.path.dirname(__file__), 'scam_brands.txt')


def load_scam_brands(path=None):
    """Load the list of scam brands from a text file.

    The file holds one brand per line. Surrounding whitespace and optional
    surrounding double quotes are removed from each entry.

    Args:
        path: File to read. Defaults to ``SCAM_BRANDS_FILE`` (looked up at
            call time) when omitted or ``None``.

    Returns:
        A list of non-empty brand names, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    if path is None:
        path = SCAM_BRANDS_FILE

    brands = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Strip whitespace, then optional quotes, then any whitespace
            # that was inside the quotes.
            brand = line.strip().strip('"').strip()
            # Skip blank lines and entries that are empty once unquoted
            # (e.g. a line containing only ""): an empty brand would turn
            # into a pattern that matches every message.
            if brand:
                brands.append(brand)
    return brands
|
11 |
+
|
12 |
+
SCAM_BRANDS = load_scam_brands()
|
13 |
+
|
14 |
+
# Compiled confusable patterns keyed by brand name. Filled lazily so each
# brand's regex is generated and compiled once instead of on every call.
_BRAND_PATTERNS = {}


def _plain_form(text):
    """Case-fold *text* and trim non-alphanumeric characters from both ends."""
    folded = text.casefold()
    start, end = 0, len(folded)
    while start < end and not folded[start].isalnum():
        start += 1
    while end > start and not folded[end - 1].isalnum():
        end -= 1
    return folded[start:end]


def _brand_pattern(brand):
    """Return the compiled confusable regex for *brand*, building it on first use."""
    pattern = _BRAND_PATTERNS.get(brand)
    if pattern is None:
        # The generated regex matches the brand itself or any confusable variant.
        pattern = re.compile(confusable_regex(brand, include_character_padding=True))
        _BRAND_PATTERNS[brand] = pattern
    return pattern


def find_confusable_brand(message):
    """
    Check if the message contains a confusable/homoglyph variant of any scam brand.

    A match that is just the brand's normal spelling (ignoring letter case and
    punctuation at either end of the match) is not reported: only spellings
    that differ from the real name count as a variant. Without this check any
    message that merely mentions a listed brand would be reported.

    Args:
        message: Text of the message to inspect. ``None`` or an empty string
            yields ``None``.

    Returns:
        The first brand from ``SCAM_BRANDS`` for which a variant spelling is
        found, otherwise ``None``.
    """
    if not message:
        return None

    for brand in SCAM_BRANDS:
        expected = _plain_form(brand)
        # Look at every occurrence: a plain mention may precede a disguised one.
        for match in _brand_pattern(brand).finditer(message):
            matched_text = match.group(0)
            if not matched_text:
                continue
            # NOTE(review): assumes the padding option can pull adjacent
            # punctuation into the match, hence the trimmed comparison —
            # confirm against the confusables documentation.
            if _plain_form(matched_text) != expected:
                return brand
    return None
|
26 |
+
|
27 |
+
def test_find_confusable_brand():
    """
    Run find_confusable_brand over a few sample messages and print each outcome.

    Nothing is asserted: each sample produces one "[ALERT]" or "[OK]" line on
    standard output, depending on whether a brand was returned.
    """
    samples = (
        "This is a message from Amazοn support.",  # Greek omicron instead of o
        "Your Apple account has been locked.",
        "Contact S0ciété Générale for more info.",  # Zero instead of O
        "Welcome to Netflix!",
        "This is a message from a random sender.",
        "Bonjour, c'est le livreur votre colis ne rentrait pas dans la boite aux lettres merci de choisir un point relais sur : https://mondiaIrelais-expedition.com",
    )
    for sample in samples:
        brand = find_confusable_brand(sample)
        if not brand:
            print(f"[OK] Message: '{sample}' => No confusable brand detected.")
            continue
        print(f"[ALERT] Message: '{sample}' => Confusable brand detected: {brand}")
|
45 |
+
|
46 |
+
if __name__ == "__main__":
|
47 |
+
test_find_confusable_brand()
|
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ datasets~=3.6.0
|
|
7 |
pandas~=2.2.3
|
8 |
httpx~=0.28.1
|
9 |
numpy~=2.2.5
|
10 |
-
requests~=2.32.3
|
|
|
|
7 |
pandas~=2.2.3
|
8 |
httpx~=0.28.1
|
9 |
numpy~=2.2.5
|
10 |
+
requests~=2.32.3
|
11 |
+
confusables~=1.2.0
|
run.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import uvicorn
|
2 |
+
from app import app
|
3 |
+
|
4 |
+
if __name__ == "__main__":
    # Start the server only when this file is executed as a script, so that
    # importing the module does not block inside uvicorn.run().
    uvicorn.run(app, host="127.0.0.1", port=7860)
|
scam_brands.txt
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Amazon
|
2 |
+
Apple
|
3 |
+
Microsoft
|
4 |
+
Google
|
5 |
+
La Poste
|
6 |
+
SNCF
|
7 |
+
Orange
|
8 |
+
Free
|
9 |
+
Bouygues Telecom
|
10 |
+
SFR
|
11 |
+
PayPal
|
12 |
+
Crédit Agricole
|
13 |
+
Société Générale
|
14 |
+
BNP Paribas
|
15 |
+
La Banque Postale
|
16 |
+
EDF
|
17 |
+
Engie
|
18 |
+
Ameli
|
19 |
+
CAF
|
20 |
+
Impots.gouv
|
21 |
+
Chronopost
|
22 |
+
Colissimo
|
23 |
+
DHL
|
24 |
+
FedEx
|
25 |
+
UPS
|
26 |
+
Facebook
|
27 |
+
Instagram
|
28 |
+
WhatsApp
|
29 |
+
Netflix
|
30 |
+
Disney+
|
31 |
+
CIC
|
32 |
+
Crédit Mutuel
|
33 |
+
Boursorama
|
34 |
+
Revolut
|
35 |
+
N26
|
36 |
+
Hello Bank
|
37 |
+
Cdiscount
|
38 |
+
Vinted
|
39 |
+
Leboncoin
|
40 |
+
Carrefour
|
41 |
+
Auchan
|
42 |
+
Lidl
|
43 |
+
MondialRelay
|
44 |
+
Mondial
|
45 |
+
TotalEnergies
|
46 |
+
Oney
|
47 |
+
Yves Rocher
|
48 |
+
Decathlon
|
49 |
+
"Service Public"
|
50 |
+
"Sécurité Sociale"
|
51 |
+
"Police Nationale"
|
52 |
+
"Gendarmerie"
|