SamanthaStorm committed on
Commit
b9947a5
·
verified ·
1 Parent(s): 666c665

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -123
app.py CHANGED
@@ -130,51 +130,6 @@ ESCALATION_QUESTIONS = [
130
  ("Violence has increased in frequency or severity", 3),
131
  ("Partner monitors your calls/GPS/social media", 2)
132
  ]
133
- DARVO_PATTERNS = [
134
- "blame shifting", # "You're the reason this happens"
135
- "projection", # "You're the abusive one"
136
- "deflection", # "This isn't about that"
137
- "dismissiveness", # "You're overreacting"
138
- "insults", # Personal attacks that redirect attention
139
- "aggression", # Escalates tone to destabilize
140
- "recovery phase", # Sudden affection following aggression
141
- "contradictory statements" # “I never said that” immediately followed by a version of what they said
142
- ]
143
- DARVO_MOTIFS = [
144
- "I never said that.", "You’re imagining things.", "That never happened.",
145
- "You’re making a big deal out of nothing.", "It was just a joke.", "You’re too sensitive.",
146
- "I don’t know what you’re talking about.", "You’re overreacting.", "I didn’t mean it that way.",
147
- "You’re twisting my words.", "You’re remembering it wrong.", "You’re always looking for something to complain about.",
148
- "You’re just trying to start a fight.", "I was only trying to help.", "You’re making things up.",
149
- "You’re blowing this out of proportion.", "You’re being paranoid.", "You’re too emotional.",
150
- "You’re always so dramatic.", "You’re just trying to make me look bad.",
151
-
152
- "You’re crazy.", "You’re the one with the problem.", "You’re always so negative.",
153
- "You’re just trying to control me.", "You’re the abusive one.", "You’re trying to ruin my life.",
154
- "You’re just jealous.", "You’re the one who needs help.", "You’re always playing the victim.",
155
- "You’re the one causing all the problems.", "You’re just trying to make me feel guilty.",
156
- "You’re the one who can’t let go of the past.", "You’re the one who’s always angry.",
157
- "You’re the one who’s always complaining.", "You’re the one who’s always starting arguments.",
158
- "You’re the one who’s always making things worse.", "You’re the one who’s always making me feel bad.",
159
- "You’re the one who’s always making me look like the bad guy.",
160
- "You’re the one who’s always making me feel like a failure.",
161
- "You’re the one who’s always making me feel like I’m not good enough.",
162
-
163
- "I can’t believe you’re doing this to me.", "You’re hurting me.",
164
- "You’re making me feel like a terrible person.", "You’re always blaming me for everything.",
165
- "You’re the one who’s abusive.", "You’re the one who’s controlling.", "You’re the one who’s manipulative.",
166
- "You’re the one who’s toxic.", "You’re the one who’s gaslighting me.",
167
- "You’re the one who’s always putting me down.", "You’re the one who’s always making me feel bad.",
168
- "You’re the one who’s always making me feel like I’m not good enough.",
169
- "You’re the one who’s always making me feel like I’m the problem.",
170
- "You’re the one who’s always making me feel like I’m the bad guy.",
171
- "You’re the one who’s always making me feel like I’m the villain.",
172
- "You’re the one who’s always making me feel like I’m the one who needs to change.",
173
- "You’re the one who’s always making me feel like I’m the one who’s wrong.",
174
- "You’re the one who’s always making me feel like I’m the one who’s crazy.",
175
- "You’re the one who’s always making me feel like I’m the one who’s abusive.",
176
- "You’re the one who’s always making me feel like I’m the one who’s toxic."
177
- ]
178
  def get_emotional_tone_tag(emotions, sentiment, patterns, abuse_score):
179
  sadness = emotions.get("sadness", 0)
180
  joy = emotions.get("joy", 0)
@@ -304,42 +259,21 @@ def get_emotional_tone_tag(emotions, sentiment, patterns, abuse_score):
304
  return "emotional instability"
305
 
306
  return None
307
- def detect_contradiction(message):
308
- patterns = [
309
- (r"\b(i love you).{0,15}(i hate you|you ruin everything)", re.IGNORECASE),
310
- (r"\b(i’m sorry).{0,15}(but you|if you hadn’t)", re.IGNORECASE),
311
- (r"\b(i’m trying).{0,15}(you never|why do you)", re.IGNORECASE),
312
- (r"\b(do what you want).{0,15}(you’ll regret it|i always give everything)", re.IGNORECASE),
313
- (r"\b(i don’t care).{0,15}(you never think of me)", re.IGNORECASE),
314
- (r"\b(i guess i’m just).{0,15}(the bad guy|worthless|never enough)", re.IGNORECASE)
315
- ]
316
- return any(re.search(p, message, flags) for p, flags in patterns)
317
-
318
- def calculate_darvo_score(patterns, sentiment_before, sentiment_after, motifs_found, contradiction_flag=False):
319
- # Count all detected DARVO-related patterns
320
- pattern_hits = sum(1 for p in patterns if p.lower() in DARVO_PATTERNS)
321
-
322
- # Sentiment delta
323
- sentiment_shift_score = max(0.0, sentiment_after - sentiment_before)
324
-
325
- # Match against DARVO motifs more loosely
326
- motif_hits = sum(
327
- any(phrase.lower() in motif.lower() or motif.lower() in phrase.lower()
328
- for phrase in DARVO_MOTIFS)
329
- for motif in motifs_found
330
- )
331
- motif_score = motif_hits / max(len(DARVO_MOTIFS), 1)
332
-
333
- # Contradiction still binary
334
- contradiction_score = 1.0 if contradiction_flag else 0.0
335
-
336
- # Final DARVO score
337
- return round(min(
338
- 0.3 * pattern_hits +
339
- 0.3 * sentiment_shift_score +
340
- 0.25 * motif_score +
341
- 0.15 * contradiction_score, 1.0
342
- ), 3)
343
  def detect_weapon_language(text):
344
  weapon_keywords = [
345
  "knife", "knives", "stab", "cut you", "cutting",
@@ -420,35 +354,6 @@ def generate_risk_snippet(abuse_score, top_label, escalation_score, stage):
420
  base += "🧠 You can review the pattern in context. This tool highlights possible dynamics—not judgments."
421
  return base
422
 
423
- WHY_FLAGGED = {
424
- "control": "This message may reflect efforts to restrict someone’s autonomy, even if it's framed as concern or care.",
425
- "gaslighting": "This message could be manipulating someone into questioning their perception or feelings.",
426
- "dismissiveness": "This message may include belittling, invalidating, or ignoring the other person’s experience.",
427
- "insults": "Direct insults often appear in escalating abusive dynamics and can erode emotional safety.",
428
- "blame shifting": "This message may redirect responsibility to avoid accountability, especially during conflict.",
429
- "guilt tripping": "This message may induce guilt in order to control or manipulate behavior.",
430
- "recovery phase": "This message may be part of a tension-reset cycle, appearing kind but avoiding change.",
431
- "projection": "This message may involve attributing the abuser’s own behaviors to the victim.",
432
- "contradictory statements": "This message may contain internal contradictions used to confuse, destabilize, or deflect responsibility.",
433
- "obscure language": "This message may use overly formal, vague, or complex language to obscure meaning or avoid accountability.",
434
- "default": "This message contains language patterns that may affect safety, clarity, or emotional autonomy."
435
- }
436
- explanation = WHY_FLAGGED.get(pattern_label.lower(), WHY_FLAGGED["default"])
437
-
438
- base = f"\n\n🛑 Risk Level: {risk_level.capitalize()}\n"
439
- base += f"This message shows strong indicators of **{pattern_label}**. "
440
-
441
- if risk_level == "high":
442
- base += "The language may reflect patterns of emotional control, even when expressed in soft or caring terms.\n"
443
- elif risk_level == "moderate":
444
- base += "There are signs of emotional pressure or indirect control that may escalate if repeated.\n"
445
- else:
446
- base += "The message does not strongly indicate abuse, but it's important to monitor for patterns.\n"
447
-
448
- base += f"\n💡 *Why this might be flagged:*\n{explanation}\n"
449
- base += f"\nDetected Pattern: **{pattern_label} ({pattern_score})**\n"
450
- base += "🧠 You can review the pattern in context. This tool highlights possible dynamics—not judgments."
451
- return base
452
 
453
  # --- Step X: Detect Immediate Danger Threats ---
454
  THREAT_MOTIFS = [
@@ -535,23 +440,14 @@ def analyze_single_message(text, thresholds):
535
  k: v + 0.05 if sentiment == "supportive" else v
536
  for k, v in thresholds.items()
537
  }
538
-
539
- contradiction_flag = detect_contradiction(text)
540
 
541
  threshold_labels = [
542
  label for label, score in zip(LABELS, scores)
543
  if score > adjusted_thresholds[label]
544
  ]
545
  tone_tag = get_emotional_tone_tag(emotion_profile, sentiment, threshold_labels, 0)
546
- motifs = [phrase for _, phrase in matched_phrases]
547
-
548
- darvo_score = calculate_darvo_score(
549
- threshold_labels,
550
- sentiment_before=0.0,
551
- sentiment_after=sentiment_score,
552
- motifs_found=motifs,
553
- contradiction_flag=contradiction_flag
554
- )
555
 
556
  top_patterns = sorted(
557
  [(label, score) for label, score in zip(LABELS, scores)],
@@ -621,8 +517,6 @@ def analyze_single_message(text, thresholds):
621
  print(f" {label:25} → {score:.3f} {passed}")
622
  print(f"Matched for score: {[(l, round(s, 3)) for l, s, _ in matched_scores]}")
623
  print(f"Abuse Score Raw: {round(abuse_score_raw, 1)}")
624
- print(f"Motifs: {motifs}")
625
- print(f"Contradiction: {contradiction_flag}")
626
  print("------------------\n")
627
 
628
  return abuse_score, threshold_labels, top_patterns, {"label": sentiment}, stage, darvo_score, tone_tag
 
130
  ("Violence has increased in frequency or severity", 3),
131
  ("Partner monitors your calls/GPS/social media", 2)
132
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  def get_emotional_tone_tag(emotions, sentiment, patterns, abuse_score):
134
  sadness = emotions.get("sadness", 0)
135
  joy = emotions.get("joy", 0)
 
259
  return "emotional instability"
260
 
261
  return None
262
+ # 🔄 New DARVO score model (regression-based)
263
+ from torch.nn.functional import sigmoid
264
+ import torch
265
+
266
+ # Load your trained DARVO regressor from Hugging Face Hub
267
+ darvo_model = AutoModelForSequenceClassification.from_pretrained("SamanthaStorm/tether-darvo-regressor-v1")
268
+ darvo_tokenizer = AutoTokenizer.from_pretrained("SamanthaStorm/tether-darvo-regressor-v1", use_fast=False)
269
+ darvo_model.eval()
270
+
271
def predict_darvo_score(text):
    """Return the DARVO regressor's sigmoid-mapped score for ``text``.

    The text is tokenized (PyTorch tensors, truncation and padding enabled),
    run through the module-level ``darvo_model`` with gradient tracking
    disabled, and the resulting logits are passed through a sigmoid.

    Args:
        text: Message to score; handed to ``darvo_tokenizer`` unchanged.

    Returns:
        The sigmoid of the model's logit as a Python number rounded to four
        decimal places.

    Note:
        The result is extracted with ``.item()``, so the logits tensor is
        expected to hold exactly one value (presumably a single input and a
        single-output regression head -- confirm against the model config).
    """
    encoded = darvo_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
        model_output = darvo_model(**encoded)
    squashed = sigmoid(model_output.logits).item()
    # Rounded for display/output
    return round(squashed, 4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  def detect_weapon_language(text):
278
  weapon_keywords = [
279
  "knife", "knives", "stab", "cut you", "cutting",
 
354
  base += "🧠 You can review the pattern in context. This tool highlights possible dynamics—not judgments."
355
  return base
356
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
 
358
  # --- Step X: Detect Immediate Danger Threats ---
359
  THREAT_MOTIFS = [
 
440
  k: v + 0.05 if sentiment == "supportive" else v
441
  for k, v in thresholds.items()
442
  }
443
+ darvo_score = predict_darvo_score(text)
 
444
 
445
  threshold_labels = [
446
  label for label, score in zip(LABELS, scores)
447
  if score > adjusted_thresholds[label]
448
  ]
449
  tone_tag = get_emotional_tone_tag(emotion_profile, sentiment, threshold_labels, 0)
450
+
 
 
 
 
 
 
 
 
451
 
452
  top_patterns = sorted(
453
  [(label, score) for label, score in zip(LABELS, scores)],
 
517
  print(f" {label:25} → {score:.3f} {passed}")
518
  print(f"Matched for score: {[(l, round(s, 3)) for l, s, _ in matched_scores]}")
519
  print(f"Abuse Score Raw: {round(abuse_score_raw, 1)}")
 
 
520
  print("------------------\n")
521
 
522
  return abuse_score, threshold_labels, top_patterns, {"label": sentiment}, stage, darvo_score, tone_tag