Spaces:

kokluch
/

phishing-detector-api

Running

App Files Files Community

kokluch committed on Jul 1

Commit

05026d9

1 Parent(s): 664d4ee

Add mnemonic detection

Browse files

Files changed (5) hide show

app.py +30 -14
mnemonic_attack.py +47 -0
requirements.txt +2 -1
run.py +4 -0
scam_brands.txt +52 -0

app.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from time import sleep
 import logging
 import httpx
 from fastapi import FastAPI
@@ -11,17 +13,21 @@ from phishing_datasets import submit_entry
 from url_tools import extract_urls, resolve_short_url, extract_domain_from_url
 from urlscan_client import UrlscanClient
 import requests
-import re
 app = FastAPI()
 urlscan = UrlscanClient()
-# Configuration de base du logging
 logging.basicConfig(
     level=logging.DEBUG,
-    format='%(asctime)s [%(levelname)s] %(message)s'
 )
 class MessageModel(BaseModel):
@@ -85,6 +91,12 @@ def predict(model: InputModel) -> OutputModel:
     logging.info(f"[{sender}] {text}")
     # Debug sleep
     pattern = r"^Sent from your Twilio trial account - sleep (\d+)$"
     match = re.search(pattern, text)
@@ -112,6 +124,7 @@ def predict(model: InputModel) -> OutputModel:
                 return OutputModel(action=ActionModel.PROMOTION, sub_action=SubActionModel.NONE)
     result = pipe(text)
     label = result[0]['label']
     score = result[0]['score']
@@ -120,19 +133,22 @@ def predict(model: InputModel) -> OutputModel:
     if label == 'LABEL_0':
         score = 1 - score
-    commercial_sender_pattern = r'\b[2-8]\d{4}\b'
-    commercial_stop_pattern = r'\bSTOP(?:\s+SMS)?(?:\s+au)?\s+([2-8]\d{4})\b'
     commercial_stop = False
-    if re.search(commercial_sender_pattern, sender):
-        logging.info("[COMMERCIAL] Commercial sender detected")
-        score = score * 0.9
-        if re.search(commercial_stop_pattern, text):
-            logging.info("[COMMERCIAL] STOP keyword detected")
-            score = score * 0.9
-            commercial_stop = True
-        else:
-            logging.info("[COMMERCIAL] STOP keyword missing")
     urls = extract_urls(text)

 from time import sleep
 import logging
+import sys
+import re
 import httpx
 from fastapi import FastAPI
 from url_tools import extract_urls, resolve_short_url, extract_domain_from_url
 from urlscan_client import UrlscanClient
 import requests
+from mnemonic_attack import find_confusable_brand
 app = FastAPI()
 urlscan = UrlscanClient()
+# Remove all handlers associated with the root logger object
+for handler in logging.root.handlers[:]:
+    logging.root.removeHandler(handler)
 logging.basicConfig(
     level=logging.DEBUG,
+    format='%(asctime)s [%(levelname)s] %(message)s',
+    handlers=[logging.StreamHandler(sys.stdout)]
 )
 class MessageModel(BaseModel):
     logging.info(f"[{sender}] {text}")
+    # Brand usurpation detection using confusables
+    confusable_brand = find_confusable_brand(text)
+    if confusable_brand:
+        logging.warning(f"[BRAND USURPATION] Confusable/homoglyph variant of brand '{confusable_brand}' detected in message. Classified as JUNK.")
+        return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
     # Debug sleep
     pattern = r"^Sent from your Twilio trial account - sleep (\d+)$"
     match = re.search(pattern, text)
                 return OutputModel(action=ActionModel.PROMOTION, sub_action=SubActionModel.NONE)
     result = pipe(text)
     label = result[0]['label']
     score = result[0]['score']
     if label == 'LABEL_0':
         score = 1 - score
+    # Pattern for detecting an alphanumeric SenderID
+    alphanumeric_sender_pattern = r'^[A-Za-z][A-Za-z0-9\-\.]{2,14}$'
+    # Pattern for detecting a short code
+    shorten_sender_pattern = r'^(?:3\d{4}|[4-8]\d{4})$'
     commercial_stop = False
+    # Detection of commercial senders (short code or alphanumeric)
+    if re.search(shorten_sender_pattern, sender):
+        logging.info("[COMMERCIAL] Commercial sender detected (short code)")
+        score = score * 0.7
+    elif re.match(alphanumeric_sender_pattern, sender):
+        logging.info("[COMMERCIAL] Alphanumeric SenderID detected")
+        score = score * 0.7
     urls = extract_urls(text)

mnemonic_attack.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import os
+import re
+from confusables import is_confusable, confusable_regex
+SCAM_BRANDS_FILE = os.path.join(os.path.dirname(__file__), 'scam_brands.txt')
+def load_scam_brands():
+    """Load the list of scam brands from the text file."""
+    with open(SCAM_BRANDS_FILE, encoding='utf-8') as f:
+        return [line.strip().strip('"') for line in f if line.strip()]
+SCAM_BRANDS = load_scam_brands()
+def find_confusable_brand(message):
+    """
+    Check if the message contains a confusable/homoglyph variant of any scam brand.
+    Returns the matched brand if found, otherwise None.
+    """
+    for brand in SCAM_BRANDS:
+        # Build a regex that matches the brand or any confusable variant
+        regex_string = confusable_regex(brand, include_character_padding=True)
+        regex = re.compile(regex_string)
+        if regex.search(message):
+            return brand
+    return None
+def test_find_confusable_brand():
+    """
+    Test the find_confusable_brand function with example messages.
+    """
+    test_cases = [
+        "This is a message from Amazοn support.",  # Greek omicron instead of o
+        "Your Apple account has been locked.",
+        "Contact S0ciété Générale for more info.",  # Zero instead of O
+        "Welcome to Netflix!",
+        "This is a message from a random sender.",
+        "Bonjour, c'est le livreur votre colis ne rentrait pas dans la boite aux lettres merci de choisir un point relais sur : https://mondiaIrelais-expedition.com"
+    ]
+    for msg in test_cases:
+        result = find_confusable_brand(msg)
+        if result:
+            print(f"[ALERT] Message: '{msg}' => Confusable brand detected: {result}")
+        else:
+            print(f"[OK] Message: '{msg}' => No confusable brand detected.")
+if __name__ == "__main__":
+    test_find_confusable_brand()

requirements.txt CHANGED Viewed

@@ -7,4 +7,5 @@ datasets~=3.6.0
 pandas~=2.2.3
 httpx~=0.28.1
 numpy~=2.2.5
-requests~=2.32.3

 pandas~=2.2.3
 httpx~=0.28.1
 numpy~=2.2.5
+requests~=2.32.3
+confusables~=1.2.0

run.py ADDED Viewed

	@@ -0,0 +1,4 @@

+import uvicorn
+from app import app
+uvicorn.run(app, host="127.0.0.1", port=7860)

scam_brands.txt ADDED Viewed

	@@ -0,0 +1,52 @@

+Amazon
+Apple
+Microsoft
+Google
+La Poste
+SNCF
+Orange
+Free
+Bouygues Telecom
+SFR
+PayPal
+Crédit Agricole
+Société Générale
+BNP Paribas
+La Banque Postale
+EDF
+Engie
+Ameli
+CAF
+Impots.gouv
+Chronopost
+Colissimo
+DHL
+FedEx
+UPS
+Facebook
+Instagram
+WhatsApp
+Netflix
+Disney+
+CIC
+Crédit Mutuel
+Boursorama
+Revolut
+N26
+Hello Bank
+Cdiscount
+Vinted
+Leboncoin
+Carrefour
+Auchan
+Lidl
+MondialRelay
+Mondial
+TotalEnergies
+Oney
+Yves Rocher
+Decathlon
+"Service Public"
+"Sécurité Sociale"
+"Police Nationale"
+"Gendarmerie"