File size: 2,366 Bytes
05026d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504cee7
05026d9
 
 
504cee7
05026d9
504cee7
 
 
 
 
 
 
05026d9
 
 
 
 
 
 
 
 
504cee7
05026d9
 
 
504cee7
 
05026d9
 
 
 
 
 
 
 
504cee7
05026d9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import functools
import logging
import os
import re

from confusables import is_confusable, confusable_regex

# Path of the brand list: a file named 'scam_brands.txt' in this module's directory.
SCAM_BRANDS_FILE = os.path.join(os.path.dirname(__file__), 'scam_brands.txt')

def load_scam_brands(path=None):
    """Load the list of scam brands from a text file.

    Each line holds one brand. Surrounding whitespace and enclosing double
    quotes are removed. Blank lines, and lines that contain nothing but
    quotes/whitespace (e.g. ``""``), are skipped so that no empty brand
    ends up in the result.

    Args:
        path: File to read. When omitted (or None), SCAM_BRANDS_FILE is used.

    Returns:
        A list of brand strings, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    if path is None:
        path = SCAM_BRANDS_FILE
    with open(path, encoding='utf-8') as f:
        unquoted = (line.strip().strip('"') for line in f)
        # Filter AFTER removing the quotes: a line such as `""` is not blank
        # before unquoting but yields an empty (useless) brand afterwards.
        return [brand for brand in unquoted if brand.strip()]

# Brand list, loaded once when this module is imported (the import reads the file).
SCAM_BRANDS = load_scam_brands()

_logger = logging.getLogger(__name__)


@functools.lru_cache(maxsize=None)
def _brand_pattern(brand):
    """Return the compiled confusable-variant regex for *brand*, built once per brand."""
    return re.compile(confusable_regex(brand, include_character_padding=False))


def find_confusable_brand(message):
    """
    Check if the message contains a confusable/homoglyph variant of any scam brand.

    Brands are tried in SCAM_BRANDS order and the first brand that has a
    qualifying match is returned.

    A match does not count when it is the brand itself: the matched text and
    the brand are compared after trimming surrounding whitespace and
    lowercasing, so plain spellings of the brand in any letter case are
    ignored and only look-alike spellings are reported.

    Args:
        message: Text to scan. An empty or None message yields None.

    Returns:
        The matched brand (as listed in SCAM_BRANDS), or None if no
        confusable variant is found.
    """
    if not message:
        return None
    for brand in SCAM_BRANDS:
        plain_brand = brand.strip().lower()
        # The per-brand regex is cached: rebuilding it on every call for
        # every brand is loop-invariant work.
        for match in _brand_pattern(brand).finditer(message):
            matched_text = match.group(0).strip().lower()
            # A plain spelling of the brand is legitimate; keep looking for a
            # look-alike occurrence elsewhere in the message.
            if matched_text == plain_brand:
                continue
            _logger.debug("matched_text: %s brand: %s", matched_text, plain_brand)
            return brand
    return None

def test_find_confusable_brand():
    """
    Run find_confusable_brand over a few sample messages and print one
    [ALERT] or [OK] line per message, depending on whether a brand came back.
    """
    sample_messages = (
        "This is a message from Amazοn support.",  # 'o' replaced by Greek omicron
        "Your Αpple account has been locked.",  # 'A' replaced by Greek capital alpha
        "Contact S0ciété Générale for more info.",  # 'o' replaced by the digit zero
        "Welcome to Netflix!",
        "This is a message from a random sender.",
        "Bonjour, c'est le livreur votre colis ne rentrait pas dans la boite aux lettres merci de choisir un point relais sur : https://mondiaIrelais-expedition.com",
        "599915 est votre code de vérification Leboncoin.",
    )
    for text in sample_messages:
        detected = find_confusable_brand(text)
        if not detected:
            print(f"[OK] Message: '{text}' => No confusable brand detected.")
            continue
        print(f"[ALERT] Message: '{text}' => Confusable brand detected: {detected}")


# Run the sample-message demo only when this file is executed as a script.
if __name__ == "__main__":
    test_find_confusable_brand()