LinkLinkWu committed on
Commit
c7f60fc
·
verified ·
1 Parent(s): 7c727fa

Update func.py

Browse files
Files changed (1) hide show
  1. func.py +55 -74
func.py CHANGED
@@ -12,13 +12,12 @@ import requests
12
  # ---------------------------------------------------------------------------
13
  # Model identifiers
14
  # ---------------------------------------------------------------------------
15
- SENTIMENT_MODEL_ID = "ahmedrachid/FinancialBERT-Sentiment-Analysis" # returns: positive / neutral / negative
16
  NER_MODEL_ID = "dslim/bert-base-NER"
17
 
18
  # ---------------------------------------------------------------------------
19
- # Eager initialisation of Hugging Face pipelines (shared across requests)
20
  # ---------------------------------------------------------------------------
21
- # Sentiment pipeline (binary decision will be made later)
22
  sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
23
  sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
24
  sentiment_pipeline = pipeline(
@@ -27,7 +26,6 @@ sentiment_pipeline = pipeline(
27
  tokenizer=sentiment_tokenizer,
28
  )
29
 
30
- # Named‑entity‑recognition pipeline (ORG extraction)
31
  ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
32
  ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
33
  ner_pipeline = pipeline(
@@ -38,14 +36,14 @@ ner_pipeline = pipeline(
38
  )
39
 
40
  # ---------------------------------------------------------------------------
41
- # Core functionality
42
  # ---------------------------------------------------------------------------
43
 
44
  def fetch_news(ticker: str) -> List[dict]:
45
- """Scrape *up to* 30 recent headlines from Finviz for a given *ticker*.
46
 
47
- Returns a list of dictionaries with ``{"title": str, "link": str}`` or an
48
- empty list on any error/edge‑case (e.g. anti‑scraping redirect).
49
  """
50
  try:
51
  url = f"https://finviz.com/quote.ashx?t={ticker}"
@@ -56,80 +54,85 @@ def fetch_news(ticker: str) -> List[dict]:
56
  "Referer": "https://finviz.com/",
57
  "Connection": "keep-alive",
58
  }
59
- response = requests.get(url, headers=headers, timeout=10)
60
- if response.status_code != 200:
61
  return []
62
 
63
- soup = BeautifulSoup(response.text, "html.parser")
64
- page_title = soup.title.text if soup.title else ""
65
- if ticker.upper() not in page_title.upper():
66
- # Finviz sometimes redirects to a placeholder page if the ticker is unknown.
67
- return []
68
 
69
- news_table = soup.find(id="news-table")
70
- if news_table is None:
71
  return []
72
 
73
- latest_news: List[dict] = []
74
- for row in news_table.find_all("tr")[:30]: # keep only the 30 most recent rows
75
  link_tag = row.find("a")
76
  if link_tag:
77
- latest_news.append({
78
- "title": link_tag.get_text(strip=True),
79
- "link": link_tag["href"],
80
- })
81
- return latest_news
82
  except Exception:
83
- # swallow all exceptions and degrade gracefully
84
  return []
85
 
86
  # ---------------------------------------------------------------------------
87
- # Sentiment analysis helpers
88
  # ---------------------------------------------------------------------------
89
- # Raw labels coming from the FinancialBERT model
90
  _POSITIVE = "positive"
91
- _NEGATIVE = "negative"
92
 
93
- _DEFAULT_THRESHOLD = 0.55 # default probability threshold; callers may override
94
 
95
  def analyze_sentiment(
96
  text: str,
97
  pipe=None,
98
  threshold: float = _DEFAULT_THRESHOLD,
99
  ) -> Tuple[str, float]:
100
- """Classify *text* as **Positive/Negative** and return its positive probability.
101
 
102
- The underlying model is three‑class (positive/neutral/negative). We keep the
103
- **positive** score only and compare it against *threshold* to obtain a binary
104
- label. The function is **side‑effect free** and will never raise; on any
105
- internal error it falls back to ``("Unknown", 0.0)``.
106
  """
107
  try:
108
  sentiment_pipe = pipe or sentiment_pipeline
109
- raw_scores = sentiment_pipe(text, return_all_scores=True, truncation=True)[0]
110
- score_lookup = {item["label"].lower(): item["score"] for item in raw_scores}
111
- pos_score = score_lookup.get(_POSITIVE, 0.0)
112
- label = "Positive" if pos_score >= threshold else "Negative"
113
- return label, pos_score
 
 
 
114
  except Exception:
115
  return "Unknown", 0.0
116
 
117
  # ---------------------------------------------------------------------------
118
- # Organisation‑entity extraction helper (kept for backward compatibility)
119
  # ---------------------------------------------------------------------------
120
 
121
- def extract_org_entities(
122
- text: str,
123
- pipe=None,
124
- max_entities: int = 5,
125
- ) -> List[str]:
126
- """Extract up to *max_entities* unique organisation tokens from *text*.
127
-
128
- Uses the pre‑initialised NER pipeline unless an alternative *pipe* is
129
- supplied. Tokens are upper‑cased and de‑hashed ("##") to make them ticker‑
130
- friendly. The function is side‑effect free and falls back to an empty list
131
- on any exception.
132
  """
 
 
 
 
 
 
 
 
 
 
 
 
133
  try:
134
  ner_pipe = pipe or ner_pipeline
135
  entities = ner_pipe(text)
@@ -146,34 +149,12 @@ def extract_org_entities(
146
  return []
147
 
148
  # ---------------------------------------------------------------------------
149
- # Aggregation logic turning many headlines into one overall label
150
- # ---------------------------------------------------------------------------
151
-
152
- def aggregate_sentiments(
153
- results: List[Tuple[str, float]],
154
- avg_threshold: float = _DEFAULT_THRESHOLD,
155
- ) -> str:
156
- """Combine individual headline results into a single overall label.
157
-
158
- The rule is simple: compute the *mean* positive probability across all
159
- headlines and compare it with *avg_threshold*. If the list is empty, the
160
- function returns ``"Unknown"``.
161
- """
162
- if not results:
163
- return "Unknown"
164
-
165
- avg_pos = sum(score for _, score in results) / len(results)
166
- return "Positive" if avg_pos >= avg_threshold else "Negative"
167
-
168
- # ---------------------------------------------------------------------------
169
- # Public helpers (kept for backward compatibility with app.py)
170
  # ---------------------------------------------------------------------------
171
 
172
  def get_sentiment_pipeline():
173
- """Expose the initialised sentiment pipeline (singleton)."""
174
  return sentiment_pipeline
175
 
176
 
177
  def get_ner_pipeline():
178
- """Expose the initialised NER pipeline (singleton)."""
179
  return ner_pipeline
 
12
  # ---------------------------------------------------------------------------
13
  # Model identifiers
14
  # ---------------------------------------------------------------------------
15
+ SENTIMENT_MODEL_ID = "LinkLinkWu/Stock_Analysis_Test_Ahamed"
16
  NER_MODEL_ID = "dslim/bert-base-NER"
17
 
18
  # ---------------------------------------------------------------------------
19
+ # Eager initialisation of Hugging Face pipelines (shared singletons)
20
  # ---------------------------------------------------------------------------
 
21
  sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
22
  sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
23
  sentiment_pipeline = pipeline(
 
26
  tokenizer=sentiment_tokenizer,
27
  )
28
 
 
29
  ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
30
  ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
31
  ner_pipeline = pipeline(
 
36
  )
37
 
38
  # ---------------------------------------------------------------------------
39
+ # Web‑scraping helper
40
  # ---------------------------------------------------------------------------
41
 
42
  def fetch_news(ticker: str) -> List[dict]:
43
+ """Return up to 30 latest Finviz headlines for *ticker* (title & link).
44
 
45
+ Empty list on network / parsing errors or if Finviz redirects to a generic
46
+ page (e.g. wrong ticker).
47
  """
48
  try:
49
  url = f"https://finviz.com/quote.ashx?t={ticker}"
 
54
  "Referer": "https://finviz.com/",
55
  "Connection": "keep-alive",
56
  }
57
+ r = requests.get(url, headers=headers, timeout=10)
58
+ if r.status_code != 200:
59
  return []
60
 
61
+ soup = BeautifulSoup(r.text, "html.parser")
62
+ if ticker.upper() not in (soup.title.text if soup.title else "").upper():
63
+ return [] # Finviz placeholder page
 
 
64
 
65
+ table = soup.find(id="news-table")
66
+ if table is None:
67
  return []
68
 
69
+ news: List[dict] = []
70
+ for row in table.find_all("tr")[:30]:
71
  link_tag = row.find("a")
72
  if link_tag:
73
+ news.append({"title": link_tag.get_text(strip=True), "link": link_tag["href"]})
74
+ return news
 
 
 
75
  except Exception:
 
76
  return []
77
 
78
  # ---------------------------------------------------------------------------
79
+ # Sentiment helpers
80
  # ---------------------------------------------------------------------------
 
81
  _POSITIVE = "positive"
82
+ _DEFAULT_THRESHOLD = 0.55 # per‑headline probability cut‑off
83
 
 
84
 
85
  def analyze_sentiment(
86
  text: str,
87
  pipe=None,
88
  threshold: float = _DEFAULT_THRESHOLD,
89
  ) -> Tuple[str, float]:
90
+ """Classify *text* and return ``(label, positive_probability)``.
91
 
92
+ * Binary label (*Positive* / *Negative*) is determined by comparing the
93
+ *positive* probability with *threshold*.
94
+ * Neutral headlines are mapped to *Negative* by design.
95
+ * On any internal error ("Unknown", 0.0).
96
  """
97
  try:
98
  sentiment_pipe = pipe or sentiment_pipeline
99
+ scores = sentiment_pipe(text, return_all_scores=True, truncation=True)[0]
100
+ pos_prob = 0.0
101
+ for item in scores:
102
+ if item["label"].lower() == _POSITIVE:
103
+ pos_prob = item["score"]
104
+ break
105
+ label = "Positive" if pos_prob >= threshold else "Negative"
106
+ return label, pos_prob
107
  except Exception:
108
  return "Unknown", 0.0
109
 
110
  # ---------------------------------------------------------------------------
111
+ # Aggregation – average positive probability → binary overall label
112
  # ---------------------------------------------------------------------------
113
 
114
+ def aggregate_sentiments(
115
+ results: List[Tuple[str, float]],
116
+ avg_threshold: float = _DEFAULT_THRESHOLD,
117
+ ) -> str:
118
+ """Compute overall **Positive/Negative** based on *mean* positive probability.
119
+
120
+ * *results* – list returned by ``analyze_sentiment`` for each headline.
121
+ * If the average positive probability ≥ *avg_threshold* → *Positive*.
122
+ * Empty list → *Unknown*.
 
 
123
  """
124
+ if not results:
125
+ return "Unknown"
126
+
127
+ avg_pos = sum(prob for _, prob in results) / len(results)
128
+ return "Positive" if avg_pos >= avg_threshold else "Negative"
129
+
130
+ # ---------------------------------------------------------------------------
131
+ # ORG‑entity extraction (for ticker discovery)
132
+ # ---------------------------------------------------------------------------
133
+
134
+ def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[str]:
135
+ """Return up to *max_entities* unique ORG tokens (upper‑case, de‑hashed)."""
136
  try:
137
  ner_pipe = pipe or ner_pipeline
138
  entities = ner_pipe(text)
 
149
  return []
150
 
151
  # ---------------------------------------------------------------------------
152
+ # Public accessors (backward compatibility with app.py)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  # ---------------------------------------------------------------------------
154
 
155
  def get_sentiment_pipeline():
 
156
  return sentiment_pipeline
157
 
158
 
159
  def get_ner_pipeline():
 
160
  return ner_pipeline