LinkLinkWu committed on
Commit
6eecf76
·
verified ·
1 Parent(s): ae44182

Update func.py

Browse files
Files changed (1) hide show
  1. func.py +100 -99
func.py CHANGED
@@ -1,3 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from typing import List, Tuple
2
 
3
  from transformers import (
@@ -10,37 +23,98 @@ from bs4 import BeautifulSoup
10
  import requests
11
 
12
  # ---------------------------------------------------------------------------
13
- # Model identifiers custom binary‑sentiment model hosted on Hugging Face
14
  # ---------------------------------------------------------------------------
15
  SENTIMENT_MODEL_ID = "LinkLinkWu/Stock_Analysis_Test_Ahamed" # LABEL_0 = Negative, LABEL_1 = Positive
16
  NER_MODEL_ID = "dslim/bert-base-NER"
17
 
18
  # ---------------------------------------------------------------------------
19
- # Pipeline singletons (initialised once per session)
20
  # ---------------------------------------------------------------------------
21
- sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
22
- sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
 
23
  sentiment_pipeline = pipeline(
24
- "sentiment-analysis",
25
- model=sentiment_model,
26
- tokenizer=sentiment_tokenizer,
 
27
  )
28
 
29
- ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
30
- ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
 
31
  ner_pipeline = pipeline(
32
  "ner",
33
- model=ner_model,
34
- tokenizer=ner_tokenizer,
35
  grouped_entities=True,
36
  )
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # ---------------------------------------------------------------------------
39
  # Web‑scraping helper (Finviz)
40
  # ---------------------------------------------------------------------------
41
 
42
- def fetch_news(ticker: str) -> List[dict]:
43
- """Return ≤30 latest Finviz headlines for *ticker* ("title" & "link")."""
 
 
 
 
44
  try:
45
  url = f"https://finviz.com/quote.ashx?t={ticker}"
46
  headers = {
@@ -56,110 +130,34 @@ def fetch_news(ticker: str) -> List[dict]:
56
 
57
  soup = BeautifulSoup(r.text, "html.parser")
58
  if ticker.upper() not in (soup.title.text if soup.title else "").upper():
59
- return [] # redirect / placeholder page
60
 
61
  table = soup.find(id="news-table")
62
  if table is None:
63
  return []
64
 
65
  headlines: List[dict] = []
66
- for row in table.find_all("tr")[:30]:
67
  link_tag = row.find("a")
68
  if link_tag:
69
- headlines.append({"title": link_tag.get_text(strip=True), "link": link_tag["href"]})
 
 
70
  return headlines
71
  except Exception:
72
  return []
73
 
74
- # ---------------------------------------------------------------------------
75
- # Sentiment helpers – binary output, internal probabilities retained
76
- # ---------------------------------------------------------------------------
77
- _LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive", "NEUTRAL": "Positive"}
78
- _POSITIVE_RAW = "LABEL_1"
79
- _NEUTRAL_RAW = "NEUTRAL" # rarely returned; mapped to Positive on purpose
80
- _SINGLE_THRESHOLD = 0.55 # per‑headline cut‑off
81
-
82
-
83
- def analyze_sentiment(
84
- text: str,
85
- pipe=None,
86
- threshold: float = _SINGLE_THRESHOLD,
87
- ) -> Tuple[str, float]:
88
- """Return ``(label, positive_probability)`` for *text*.
89
-
90
- * Neutral predictions – if produced by the model – are **treated as Positive**.
91
- * Numeric probability is kept for aggregation; front‑end may discard it to
92
- satisfy the "no numbers" display requirement.
93
- """
94
- try:
95
- sentiment_pipe = pipe or sentiment_pipeline
96
- all_scores = sentiment_pipe(text, return_all_scores=True, truncation=True)[0]
97
- score_map = {item["label"].upper(): item["score"] for item in all_scores}
98
-
99
- # Positive probability: include Neutral as positive when present
100
- pos_prob = score_map.get(_POSITIVE_RAW, 0.0)
101
- if _NEUTRAL_RAW in score_map:
102
- pos_prob = max(pos_prob, score_map[_NEUTRAL_RAW])
103
-
104
- # Determine final label (Neutral → Positive by design)
105
- label = "Positive" if (
106
- (_NEUTRAL_RAW in score_map) or (pos_prob >= threshold)
107
- ) else "Negative"
108
- return label, pos_prob
109
- except Exception:
110
- return "Unknown", 0.0
111
 
112
  # ---------------------------------------------------------------------------
113
- _LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive"}
114
- _POSITIVE_RAW = "LABEL_1"
115
- _SINGLE_THRESHOLD = 0.55 # per‑headline cut‑off
116
-
117
-
118
- def analyze_sentiment(text: str, pipe=None, threshold: float = _SINGLE_THRESHOLD) -> Tuple[str, float]:
119
- """Return ``(label, positive_probability)`` for *text*.
120
-
121
- * Neutral is not expected from a binary model; if encountered, treat as Negative.
122
- * Numeric probability is for internal aggregation only – front‑end can ignore
123
- it to satisfy the "no numbers" requirement.
124
- """
125
- try:
126
- sentiment_pipe = pipe or sentiment_pipeline
127
- scores = sentiment_pipe(text, return_all_scores=True, truncation=True)[0]
128
- pos_prob = 0.0
129
- for item in scores:
130
- if item["label"].upper() == _POSITIVE_RAW:
131
- pos_prob = item["score"]
132
- break
133
- label = "Positive" if pos_prob >= threshold else "Negative"
134
- return label, pos_prob
135
- except Exception:
136
- return "Unknown", 0.0
137
-
138
- # ---------------------------------------------------------------------------
139
- # Aggregation – average positive probability → binary overall label
140
  # ---------------------------------------------------------------------------
141
- _AVG_THRESHOLD = 0.55 # ≥55 % mean positive probability → overall Positive
142
 
 
 
143
 
144
- def aggregate_sentiments(results: List[Tuple[str, float]], avg_threshold: float = _AVG_THRESHOLD) -> str:
145
- """Compute overall **Positive/Negative** via *average positive probability*.
146
-
147
- * *results* – list of tuples from ``analyze_sentiment``.
148
- * Empty list → *Unknown*.
149
- * The returned label is **binary**; numeric values remain internal.
150
  """
151
- if not results:
152
- return "Unknown"
153
-
154
- avg_pos = sum(prob for _, prob in results) / len(results)
155
- return "Positive" if avg_pos >= avg_threshold else "Negative"
156
-
157
- # ---------------------------------------------------------------------------
158
- # ORG‑entity extraction (ticker discovery)
159
- # ---------------------------------------------------------------------------
160
-
161
- def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[str]:
162
- """Extract up to *max_entities* unique ORG tokens (upper‑case, de‑hashed)."""
163
  try:
164
  ner_pipe = pipe or ner_pipeline
165
  entities = ner_pipe(text)
@@ -175,13 +173,16 @@ def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[st
175
  except Exception:
176
  return []
177
 
 
178
  # ---------------------------------------------------------------------------
179
  # Public accessors (legacy compatibility)
180
  # ---------------------------------------------------------------------------
181
 
182
  def get_sentiment_pipeline():
 
183
  return sentiment_pipeline
184
 
185
 
186
  def get_ner_pipeline():
 
187
  return ner_pipeline
 
1
+ """func.py – utility functions for EquiPulse
2
+ Cleaned‑up single‑source version (2025‑05‑18).
3
+
4
+ Highlights
5
+ ----------
6
+ * **Single** `analyze_sentiment` implementation – no more duplicates.
7
+ * Returns **label string by default**, optional probability via `return_prob`.
8
+ * Threshold lowered to **0.50** and Neutral treated as Positive.
9
+ * Helper pipelines cached at module level.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
  from typing import List, Tuple
15
 
16
  from transformers import (
 
23
  import requests
24
 
25
# ---------------------------------------------------------------------------
# Model identifiers (Hugging Face)
# ---------------------------------------------------------------------------
# Binary sentiment model: LABEL_0 = Negative, LABEL_1 = Positive
SENTIMENT_MODEL_ID = "LinkLinkWu/Stock_Analysis_Test_Ahamed"
NER_MODEL_ID = "dslim/bert-base-NER"

# ---------------------------------------------------------------------------
# Pipeline singletons – constructed exactly once, at first import
# ---------------------------------------------------------------------------
# Sentiment classifier. `return_all_scores=True` makes the pipeline emit a
# score for every label, which analyze_sentiment reads to pick out the
# positive-class probability.
_sent_tok = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
_sent_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_pipeline = pipeline(
    "text-classification",
    tokenizer=_sent_tok,
    model=_sent_model,
    return_all_scores=True,
)

# Named-entity recogniser. `grouped_entities=True` merges sub-word pieces
# back into whole entity spans (e.g. "App" + "##le" -> "Apple").
_ner_tok = AutoTokenizer.from_pretrained(NER_MODEL_ID)
_ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
ner_pipeline = pipeline(
    "ner",
    tokenizer=_ner_tok,
    model=_ner_model,
    grouped_entities=True,
)
53
 
54
# ---------------------------------------------------------------------------
# Sentiment helpers
# ---------------------------------------------------------------------------
_POSITIVE_RAW = "LABEL_1"  # positive class id in model output
_NEUTRAL_RAW = "NEUTRAL"   # some models add a neutral class
_SINGLE_THRESHOLD = 0.50   # >=50% positive prob -> Positive
_LABEL_NEG = "Negative"
_LABEL_POS = "Positive"
_LABEL_UNK = "Unknown"


def analyze_sentiment(
    text: str,
    *,
    pipe=None,
    threshold: float = _SINGLE_THRESHOLD,
    return_prob: bool = False,
):
    """Classify *text* as Positive / Negative.

    Parameters
    ----------
    text : str
        Input sentence (e.g. a news headline).
    pipe : transformers.Pipeline, optional
        Custom sentiment pipeline; falls back to the module singleton.
    threshold : float, default 0.50
        Positive-probability cut-off.
    return_prob : bool, default False
        When *True*, return ``(label, positive_probability)``;
        otherwise return just the label string.

    Notes
    -----
    * A *NEUTRAL* score, when the model emits one, is folded into the
      positive probability (finance headlines often sound cautious).
    * Never raises: any failure yields ``"Unknown"`` (or
      ``("Unknown", 0.0)`` when *return_prob* is set).
    """
    try:
        classifier = pipe or sentiment_pipeline
        # Pipeline is configured with return_all_scores=True, so element 0
        # is the per-label score list for this single input.
        raw_scores = classifier(text, truncation=True)[0]
        probs = {}
        for entry in raw_scores:
            probs[entry["label"].upper()] = entry["score"]

        positive = probs.get(_POSITIVE_RAW, 0.0)
        neutral = probs.get(_NEUTRAL_RAW)
        if neutral is not None:  # fold Neutral into Positive by design
            positive = max(positive, neutral)

        verdict = _LABEL_POS if positive >= threshold else _LABEL_NEG
        if return_prob:
            return verdict, positive
        return verdict
    except Exception:
        # Deliberate best-effort contract: callers get a sentinel, not an
        # exception, when the model/pipeline misbehaves.
        if return_prob:
            return _LABEL_UNK, 0.0
        return _LABEL_UNK
106
+
107
+
108
  # ---------------------------------------------------------------------------
109
  # Web‑scraping helper (Finviz)
110
  # ---------------------------------------------------------------------------
111
 
112
+ def fetch_news(ticker: str, max_items: int = 30) -> List[dict]:
113
+ """Return up to *max_items* latest Finviz headlines for *ticker*.
114
+
115
+ Result format:
116
+ ``[{'title': str, 'link': str}, ...]``
117
+ """
118
  try:
119
  url = f"https://finviz.com/quote.ashx?t={ticker}"
120
  headers = {
 
130
 
131
  soup = BeautifulSoup(r.text, "html.parser")
132
  if ticker.upper() not in (soup.title.text if soup.title else "").upper():
133
+ return [] # redirected / placeholder page
134
 
135
  table = soup.find(id="news-table")
136
  if table is None:
137
  return []
138
 
139
  headlines: List[dict] = []
140
+ for row in table.find_all("tr")[:max_items]:
141
  link_tag = row.find("a")
142
  if link_tag:
143
+ headlines.append(
144
+ {"title": link_tag.text.strip(), "link": link_tag["href"]}
145
+ )
146
  return headlines
147
  except Exception:
148
  return []
149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
  # ---------------------------------------------------------------------------
152
+ # Named‑entity extraction helper
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  # ---------------------------------------------------------------------------
 
154
 
155
+ def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[str]:
156
+ """Extract *ORG* tokens (upper‑cased) from *text*.
157
 
158
+ Returns at most *max_entities* unique ticker‑like strings suitable
159
+ for Finviz / Yahoo queries.
 
 
 
 
160
  """
 
 
 
 
 
 
 
 
 
 
 
 
161
  try:
162
  ner_pipe = pipe or ner_pipeline
163
  entities = ner_pipe(text)
 
173
  except Exception:
174
  return []
175
 
176
+
177
  # ---------------------------------------------------------------------------
178
  # Public accessors (legacy compatibility)
179
  # ---------------------------------------------------------------------------
180
 
181
def get_sentiment_pipeline():
    """Legacy accessor: hand back the shared sentiment pipeline instance."""
    return sentiment_pipeline
184
 
185
 
186
def get_ner_pipeline():
    """Legacy accessor: hand back the shared NER pipeline instance."""
    return ner_pipeline