# hoax_filter.py
# Lightweight, stateless misinformation heuristics for language/source/scale

import re
from urllib.parse import urlparse
from dataclasses import dataclass
from typing import Dict, Any, Optional, Tuple, List

_NUMBER_UNIT = re.compile(
    r'(?P<num>[\d,]+(?:\.\d+)?)\s*(?P<unit>miles?|km|kilometers?)',
    re.I,
)

LANG_RED_FLAGS = [
    r'\brecently\s+declassified\b',
    r'\bshocking\b',
    r'\bastonishing\b',
    r'\bexplosive\b',
    r'\bexperts\s+say\b',
    r'\breportedly\b',
    r'\bmothership\b',
    r'\bancient\s+alien\b',
    r'\bdormant\s+(?:observational\s+)?craft\b',
    r'\bangular\s+edges\b',
    r'\bviral\b',
    r'\bnever\s+before\s+seen\b',
    r'\bshaking\s+(?:the\s+)?scientific\s+community\b',
    r'\bfootage\b',
]

# Trusted primary sources (add/remove as you like)
ALLOW_DOMAINS = {
    'nasa.gov', 'jpl.nasa.gov', 'pds.nasa.gov', 'science.nasa.gov', 'heasarc.gsfc.nasa.gov',
    'esa.int', 'esawebservices.esa.int', 'esa-maine.esa.int',
    'noirlab.edu', 'cfa.harvard.edu', 'caltech.edu', 'berkeley.edu', 'mit.edu',
    'nature.com', 'science.org', 'iopscience.iop.org', 'agu.org',
    'arxiv.org', 'adsabs.harvard.edu',
}

# High-virality social/video platforms: treat as high risk for scientific “scoops”
DENY_DOMAINS = {
    'm.facebook.com', 'facebook.com', 'x.com', 'twitter.com', 't.co',
    'tiktok.com', 'youtube.com', 'youtu.be', 'instagram.com', 'reddit.com',
}

# Medium-risk tabloid/aggregator examples (tune to preference).
# Hosts are lowercased before lookup, so entries must be lowercase.
MEDIUM_DOMAINS = {
    'dailymail.co.uk', 'nypost.com', 'the-sun.com',
    'mirror.co.uk', 'sputniknews.com', 'rt.com',
}

@dataclass
class HoaxFilterResult:
    red_flag_hits: int
    source_score: float
    scale_score: float
    combined: float
    notes: Dict[str, Any]

class HoaxFilter:
    """
    Scores are in [0, 1]; higher means more likely hoax/misinformation.
    """

    def __init__(self,
                 red_flag_weight: float = 0.35,
                 source_weight: float = 0.25,
                 scale_weight: float = 0.40,
                 extraordinary_km: float = 50.0):
        """
        extraordinary_km: any single claimed length >= this is 'extraordinary'.
        Raise it (e.g., to 100–500) for a stricter notion of extraordinary;
        lower it for more sensitivity.
        """
        self.red_flag_weight = red_flag_weight
        self.source_weight = source_weight
        self.scale_weight = scale_weight
        self.extraordinary_km = extraordinary_km
        self._flag_res = [re.compile(p, re.I) for p in LANG_RED_FLAGS]

    @staticmethod
    def _km_from_match(num: str, unit: str) -> float:
        n = float(num.replace(',', ''))
        if unit.lower().startswith('mile'):
            return n * 1.609344  # statute miles -> kilometers
        return n

    def language_red_flags(self, text: str) -> Tuple[int, List[str]]:
        hits = []
        for rx in self._flag_res:
            if rx.search(text):
                hits.append(rx.pattern)
        return len(hits), hits

    def source_heuristic(self, url: Optional[str]) -> Tuple[float, str]:
        """
        Returns (risk, note). risk in [0, 1]; higher is worse.
        """
        if not url:
            return 0.5, "no_source"
        host = urlparse(url).netloc.lower().split(':')[0]  # drop any port
        # Take the last two labels as the base domain. This naive split
        # misreads multi-label suffixes such as 'co.uk', so list those
        # sites by full host (e.g. 'dailymail.co.uk').
        parts = host.split('.')
        base = '.'.join(parts[-2:]) if len(parts) >= 2 else host
        if host in ALLOW_DOMAINS or base in ALLOW_DOMAINS:
            return 0.05, f"allow:{host}"
        if host in DENY_DOMAINS or base in DENY_DOMAINS:
            return 0.85, f"deny:{host}"
        if host in MEDIUM_DOMAINS or base in MEDIUM_DOMAINS:
            return 0.7, f"medium:{host}"
        return 0.6, f"unknown:{host}"

    def scale_check(self, text: str, context_keywords: Optional[List[str]] = None) -> Tuple[float, Dict]:
        """
        Parse lengths and judge extraordinariness, boosting risk when context
        suggests planetary/astronomical claims.
        """
        context_keywords = context_keywords or []
        sizes_km = []
        for m in _NUMBER_UNIT.finditer(text):
            sizes_km.append(self._km_from_match(m.group('num'), m.group('unit')))
        if not sizes_km:
            return 0.0, {"sizes_km": []}
        max_km = max(sizes_km)
        extraordinary_context = any(k in text.lower() for k in context_keywords)
        ratio = max_km / max(self.extraordinary_km, 1.0)
        base = min(ratio, 1.0)  # saturate at 1.0
        if extraordinary_context:
            base = min(1.0, base * 1.25)  # slight boost in relevant context
        return base, {"sizes_km": sizes_km, "max_km": max_km, "extraordinary_context": extraordinary_context}

    def score(self, text: str, url: Optional[str] = None,
              context_keywords: Optional[List[str]] = None) -> HoaxFilterResult:
        rf_count, rf_hits = self.language_red_flags(text)
        rf_score = min(rf_count / 4.0, 1.0)  # four or more red flags saturate
        src_risk, src_note = self.source_heuristic(url)
        scale_risk, scale_notes = self.scale_check(text, context_keywords=context_keywords)
        combined = (self.red_flag_weight * rf_score
                    + self.source_weight * src_risk
                    + self.scale_weight * scale_risk)
        return HoaxFilterResult(
            red_flag_hits=rf_count,
            source_score=src_risk,
            scale_score=scale_risk,
            combined=min(combined, 1.0),
            notes={
                "red_flag_patterns": rf_hits,
                "source": src_note,
                **scale_notes,
            },
        )
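

# --- Minimal usage sketch (illustrative; the sample text, URL, and keyword
# list below are made-up inputs, not part of the module) ---
if __name__ == "__main__":
    hf = HoaxFilter()
    claim = ("SHOCKING footage shows a dormant craft 250 miles across with "
             "angular edges near the Moon, experts say.")
    result = hf.score(
        claim,
        url="https://www.tiktok.com/@someuser/video/123",
        context_keywords=["moon", "mars", "asteroid"],
    )
    # Expect a combined score near 1.0: several red-flag phrases, a
    # deny-listed host, and a claimed size far above extraordinary_km.
    print(f"combined={result.combined:.2f}")
    print(f"red_flag_hits={result.red_flag_hits}")
    print(f"notes={result.notes}")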