LinkLinkWu committed on
Commit
ae44182
·
verified ·
1 Parent(s): 64d5a00

Update func.py

Browse files
Files changed (1) hide show
  1. func.py +69 -33
func.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import List
2
 
3
  from transformers import (
4
  pipeline,
@@ -10,16 +10,14 @@ from bs4 import BeautifulSoup
10
  import requests
11
 
12
  # ---------------------------------------------------------------------------
13
- # Model identifiers – use your custom sentiment model hosted on Hugging Face
14
  # ---------------------------------------------------------------------------
15
- SENTIMENT_MODEL_ID = "LinkLinkWu/Stock_Analysis_Test_Ahamed" # binary sentiment
16
  NER_MODEL_ID = "dslim/bert-base-NER"
17
 
18
  # ---------------------------------------------------------------------------
19
- # Eager initialisation (singletons shared by the whole Streamlit session)
20
  # ---------------------------------------------------------------------------
21
- # Sentiment pipeline – returns one label with its score. We will *ignore* the
22
- # numeric score down‑stream to satisfy the "no numbers" requirement.
23
  sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
24
  sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
25
  sentiment_pipeline = pipeline(
@@ -28,7 +26,6 @@ sentiment_pipeline = pipeline(
28
  tokenizer=sentiment_tokenizer,
29
  )
30
 
31
- # Named‑entity‑recognition pipeline (ORG extraction)
32
  ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
33
  ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
34
  ner_pipeline = pipeline(
@@ -43,7 +40,7 @@ ner_pipeline = pipeline(
43
  # ---------------------------------------------------------------------------
44
 
45
  def fetch_news(ticker: str) -> List[dict]:
46
- """Return at most 30 latest Finviz headlines for *ticker* ("title" & "link")."""
47
  try:
48
  url = f"https://finviz.com/quote.ashx?t={ticker}"
49
  headers = {
@@ -59,7 +56,7 @@ def fetch_news(ticker: str) -> List[dict]:
59
 
60
  soup = BeautifulSoup(r.text, "html.parser")
61
  if ticker.upper() not in (soup.title.text if soup.title else "").upper():
62
- return [] # possibly a redirect page
63
 
64
  table = soup.find(id="news-table")
65
  if table is None:
@@ -75,48 +72,87 @@ def fetch_news(ticker: str) -> List[dict]:
75
  return []
76
 
77
  # ---------------------------------------------------------------------------
78
- # Sentiment helpers – binary classification, *no* numeric score exposed
79
  # ---------------------------------------------------------------------------
80
- _LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive"} # adjust if model config differs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
 
83
- def analyze_sentiment(text: str, pipe=None) -> str:
84
- """Return **"Positive"** or **"Negative"** for a single headline.
85
 
86
- *Neutral* outputs (if ever returned by the model) are coerced to *Negative*.
87
- Numeric confidence scores are deliberately discarded to honour the
88
- "no numbers" requirement.
89
  """
90
  try:
91
  sentiment_pipe = pipe or sentiment_pipeline
92
- result = sentiment_pipe(text, truncation=True, return_all_scores=False)[0]
93
- raw_label = result.get("label", "").upper()
94
- label = _LABEL_MAP.get(raw_label, "Negative") # default to Negative
95
- return label
 
 
 
 
96
  except Exception:
97
- return "Unknown"
98
 
99
  # ---------------------------------------------------------------------------
100
- # Aggregation – majority vote (Positive‑ratio) → binary label
101
  # ---------------------------------------------------------------------------
102
-
103
- _POS_RATIO_THRESHOLD = 0.6 # ≥60 % positives → overall Positive
104
 
105
 
106
- def aggregate_sentiments(labels: List[str], pos_ratio_threshold: float = _POS_RATIO_THRESHOLD) -> str:
107
- """Combine individual headline labels into an overall binary sentiment.
108
 
109
- * If *Positive* proportion ≥ *pos_ratio_threshold* → *Positive*.
110
- * Otherwise → *Negative*.
111
  * Empty list → *Unknown*.
 
112
  """
113
- if not labels:
114
  return "Unknown"
115
 
116
- total = len(labels)
117
- positives = sum(1 for l in labels if l == "Positive")
118
- ratio = positives / total
119
- return "Positive" if ratio >= pos_ratio_threshold else "Negative"
120
 
121
  # ---------------------------------------------------------------------------
122
  # ORG‑entity extraction (ticker discovery)
 
1
+ from typing import List, Tuple
2
 
3
  from transformers import (
4
  pipeline,
 
10
  import requests
11
 
12
  # ---------------------------------------------------------------------------
13
+ # Model identifiers – custom binary‑sentiment model hosted on Hugging Face
14
  # ---------------------------------------------------------------------------
15
+ SENTIMENT_MODEL_ID = "LinkLinkWu/Stock_Analysis_Test_Ahamed" # LABEL_0 = Negative, LABEL_1 = Positive
16
  NER_MODEL_ID = "dslim/bert-base-NER"
17
 
18
  # ---------------------------------------------------------------------------
19
+ # Pipeline singletons (initialised once per session)
20
  # ---------------------------------------------------------------------------
 
 
21
  sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
22
  sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
23
  sentiment_pipeline = pipeline(
 
26
  tokenizer=sentiment_tokenizer,
27
  )
28
 
 
29
  ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
30
  ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
31
  ner_pipeline = pipeline(
 
40
  # ---------------------------------------------------------------------------
41
 
42
  def fetch_news(ticker: str) -> List[dict]:
43
+ """Return 30 latest Finviz headlines for *ticker* ("title" & "link")."""
44
  try:
45
  url = f"https://finviz.com/quote.ashx?t={ticker}"
46
  headers = {
 
56
 
57
  soup = BeautifulSoup(r.text, "html.parser")
58
  if ticker.upper() not in (soup.title.text if soup.title else "").upper():
59
+ return [] # redirect / placeholder page
60
 
61
  table = soup.find(id="news-table")
62
  if table is None:
 
72
  return []
73
 
74
  # ---------------------------------------------------------------------------
75
+ # Sentiment helpers – binary output, internal probabilities retained
76
  # ---------------------------------------------------------------------------
77
+ _LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive", "NEUTRAL": "Positive"}
78
+ _POSITIVE_RAW = "LABEL_1"
79
+ _NEUTRAL_RAW = "NEUTRAL" # rarely returned; mapped to Positive on purpose
80
+ _SINGLE_THRESHOLD = 0.55 # per‑headline cut‑off
81
+
82
+
83
+ def analyze_sentiment(
84
+ text: str,
85
+ pipe=None,
86
+ threshold: float = _SINGLE_THRESHOLD,
87
+ ) -> Tuple[str, float]:
88
+ """Return ``(label, positive_probability)`` for *text*.
89
+
90
+ * Neutral predictions – if produced by the model – are **treated as Positive**.
91
+ * Numeric probability is kept for aggregation; front‑end may discard it to
92
+ satisfy the "no numbers" display requirement.
93
+ """
94
+ try:
95
+ sentiment_pipe = pipe or sentiment_pipeline
96
+ all_scores = sentiment_pipe(text, return_all_scores=True, truncation=True)[0]
97
+ score_map = {item["label"].upper(): item["score"] for item in all_scores}
98
+
99
+ # Positive probability: include Neutral as positive when present
100
+ pos_prob = score_map.get(_POSITIVE_RAW, 0.0)
101
+ if _NEUTRAL_RAW in score_map:
102
+ pos_prob = max(pos_prob, score_map[_NEUTRAL_RAW])
103
+
104
+ # Determine final label (Neutral → Positive by design)
105
+ label = "Positive" if (
106
+ (_NEUTRAL_RAW in score_map) or (pos_prob >= threshold)
107
+ ) else "Negative"
108
+ return label, pos_prob
109
+ except Exception:
110
+ return "Unknown", 0.0
111
+
112
+ # ---------------------------------------------------------------------------
113
+ _LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive"}
114
+ _POSITIVE_RAW = "LABEL_1"
115
+ _SINGLE_THRESHOLD = 0.55 # per‑headline cut‑off
116
 
117
 
118
+ def analyze_sentiment(text: str, pipe=None, threshold: float = _SINGLE_THRESHOLD) -> Tuple[str, float]:
119
+ """Return ``(label, positive_probability)`` for *text*.
120
 
121
+ * Neutral is not expected from a binary model; if encountered, treat as Negative.
122
+ * Numeric probability is for internal aggregation only — front‑end can ignore
123
+ it to satisfy the "no numbers" requirement.
124
  """
125
  try:
126
  sentiment_pipe = pipe or sentiment_pipeline
127
+ scores = sentiment_pipe(text, return_all_scores=True, truncation=True)[0]
128
+ pos_prob = 0.0
129
+ for item in scores:
130
+ if item["label"].upper() == _POSITIVE_RAW:
131
+ pos_prob = item["score"]
132
+ break
133
+ label = "Positive" if pos_prob >= threshold else "Negative"
134
+ return label, pos_prob
135
  except Exception:
136
+ return "Unknown", 0.0
137
 
138
  # ---------------------------------------------------------------------------
139
+ # Aggregation – average positive probability → binary overall label
140
  # ---------------------------------------------------------------------------
141
+ _AVG_THRESHOLD = 0.55 # ≥55 % mean positive probability → overall Positive
 
142
 
143
 
144
+ def aggregate_sentiments(results: List[Tuple[str, float]], avg_threshold: float = _AVG_THRESHOLD) -> str:
145
+ """Compute overall **Positive/Negative** via *average positive probability*.
146
 
147
+ * *results* list of tuples from ``analyze_sentiment``.
 
148
  * Empty list → *Unknown*.
149
+ * The returned label is **binary**; numeric values remain internal.
150
  """
151
+ if not results:
152
  return "Unknown"
153
 
154
+ avg_pos = sum(prob for _, prob in results) / len(results)
155
+ return "Positive" if avg_pos >= avg_threshold else "Negative"
 
 
156
 
157
  # ---------------------------------------------------------------------------
158
  # ORG‑entity extraction (ticker discovery)