Spaces:

kokluch
/

phishing-detector-api

Running

App Files Files Community

kokluch committed on Jul 3

Commit

504cee7

1 Parent(s): 63815c9

Fix exclusion of official brand names.

Browse files

Files changed (3) hide show

app.py +8 -9
mnemonic_attack.py +13 -4
scam_brands.txt +1 -5

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ for handler in logging.root.handlers[:]:
 logging.basicConfig(
     level=logging.DEBUG,
-    format='%(asctime)s [%(levelname)s] %(message)s',
     handlers=[logging.StreamHandler(sys.stdout)]
 )
@@ -91,12 +91,6 @@ def predict(model: InputModel) -> OutputModel:
     logging.info(f"[{sender}] {text}")
-    # Brand usurpation detection using confusables
-    confusable_brand = find_confusable_brand(text)
-    if confusable_brand:
-        logging.warning(f"[BRAND USURPATION] Confusable/homoglyph variant of brand '{confusable_brand}' detected in message. Classified as JUNK.")
-        return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
     # Debug sleep
     pattern = r"^Sent from your Twilio trial account - sleep (\d+)$"
     match = re.search(pattern, text)
@@ -123,6 +117,13 @@ def predict(model: InputModel) -> OutputModel:
             case 'promotion':
                 return OutputModel(action=ActionModel.PROMOTION, sub_action=SubActionModel.NONE)
     result = pipe(text)
     label = result[0]['label']
@@ -140,8 +141,6 @@ def predict(model: InputModel) -> OutputModel:
     commercial_stop = False
     # Detection of commercial senders (short code or alphanumeric)
     if re.search(shorten_sender_pattern, sender):
         logging.info("[COMMERCIAL] Commercial sender detected (short code)")

 logging.basicConfig(
     level=logging.DEBUG,
+    format='%(levelname)s:     %(asctime)s     %(message)s',
     handlers=[logging.StreamHandler(sys.stdout)]
 )
     logging.info(f"[{sender}] {text}")
     # Debug sleep
     pattern = r"^Sent from your Twilio trial account - sleep (\d+)$"
     match = re.search(pattern, text)
             case 'promotion':
                 return OutputModel(action=ActionModel.PROMOTION, sub_action=SubActionModel.NONE)
+    # Brand usurpation detection using confusables
+    confusable_brand = find_confusable_brand(text)
+    if confusable_brand:
+        logging.warning(f"[BRAND USURPATION] Confusable/homoglyph variant of brand '{confusable_brand}' detected in message. Classified as JUNK.")
+        return OutputModel(action=ActionModel.JUNK, sub_action=SubActionModel.NONE)
     result = pipe(text)
     label = result[0]['label']
     commercial_stop = False
     # Detection of commercial senders (short code or alphanumeric)
     if re.search(shorten_sender_pattern, sender):
         logging.info("[COMMERCIAL] Commercial sender detected (short code)")

mnemonic_attack.py CHANGED Viewed

@@ -15,12 +15,19 @@ def find_confusable_brand(message):
     """
     Check if the message contains a confusable/homoglyph variant of any scam brand.
     Returns the matched brand if found, otherwise None.
     """
     for brand in SCAM_BRANDS:
         # Build a regex that matches the brand or any confusable variant
-        regex_string = confusable_regex(brand, include_character_padding=True)
         regex = re.compile(regex_string)
-        if regex.search(message):
             return brand
     return None
@@ -30,11 +37,12 @@ def test_find_confusable_brand():
     """
     test_cases = [
         "This is a message from Amazοn support.",  # Greek omicron instead of o
-        "Your Apple account has been locked.",
         "Contact S0ciété Générale for more info.",  # Zero instead of O
         "Welcome to Netflix!",
         "This is a message from a random sender.",
-        "Bonjour, c'est le livreur votre colis ne rentrait pas dans la boite aux lettres merci de choisir un point relais sur : https://mondiaIrelais-expedition.com"
     ]
     for msg in test_cases:
         result = find_confusable_brand(msg)
@@ -43,5 +51,6 @@ def test_find_confusable_brand():
         else:
             print(f"[OK] Message: '{msg}' => No confusable brand detected.")
 if __name__ == "__main__":
     test_find_confusable_brand()

     """
     Check if the message contains a confusable/homoglyph variant of any scam brand.
     Returns the matched brand if found, otherwise None.
+    Does not return the brand if the matched text equals the brand name itself (compared case-insensitively, ignoring surrounding whitespace).
     """
     for brand in SCAM_BRANDS:
         # Build a regex that matches the brand or any confusable variant
+        regex_string = confusable_regex(brand, include_character_padding=False)
         regex = re.compile(regex_string)
+        for match in regex.finditer(message):
+            matched_text = match.group(0)
+            # Skip if the matched text equals the brand itself (case-insensitive, surrounding whitespace ignored)
+            if matched_text.strip().lower() == brand.lower().strip():
+                continue
+            else:
+                print(f"matched_text: {matched_text.lower().strip()} brand: {brand.lower().strip()}")
             return brand
     return None
     """
     test_cases = [
         "This is a message from Amazοn support.",  # Greek omicron instead of o
+        "Your Αpple account has been locked.", # Greek capital alpha instead of Latin capital A
         "Contact S0ciété Générale for more info.",  # Zero instead of O
         "Welcome to Netflix!",
         "This is a message from a random sender.",
+        "Bonjour, c'est le livreur votre colis ne rentrait pas dans la boite aux lettres merci de choisir un point relais sur : https://mondiaIrelais-expedition.com",
+        "599915 est votre code de vérification Leboncoin."
     ]
     for msg in test_cases:
         result = find_confusable_brand(msg)
         else:
             print(f"[OK] Message: '{msg}' => No confusable brand detected.")
 if __name__ == "__main__":
     test_find_confusable_brand()

scam_brands.txt CHANGED Viewed

@@ -40,7 +40,6 @@ Doctolib
 Douanes françaises
 EDF
 Engie
-Eni
 Epic Games
 Europ Assistance
 Facebook
@@ -49,7 +48,6 @@ Fnac
 Fortuneo
 Free
 GLS
-Gendarmerie
 GitHub
 Google
 Groupama
@@ -81,12 +79,12 @@ N26
 NMBS
 Nespresso
 Netflix
-Nickel
 Norauto
 Oney
 Orange
 PayPal
 Paylib
 Police Nationale
 Préfecture
 Pôle emploi
@@ -106,9 +104,7 @@ Spotify
 Steam
 Suez
 Sécurité Sociale
-TNT
 TikTok
-Total Direct Energie
 TotalEnergies
 TransferWise
 Trésor Public

 Douanes françaises
 EDF
 Engie
 Epic Games
 Europ Assistance
 Facebook
 Fortuneo
 Free
 GLS
 GitHub
 Google
 Groupama
 NMBS
 Nespresso
 Netflix
 Norauto
 Oney
 Orange
 PayPal
 Paylib
+Planity
 Police Nationale
 Préfecture
 Pôle emploi
 Steam
 Suez
 Sécurité Sociale
 TikTok
 TotalEnergies
 TransferWise
 Trésor Public