"""
* **Single** `analyze_sentiment` implementation – no more duplicates.
* Returns **label string by default**, optional probability via `return_prob`.
* Threshold lowered to **0.50** and Neutral treated as Positive.
* Helper pipelines cached at module level.
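Typical usage (sketch only; the ticker is an example)::

    headlines = fetch_news("AAPL")
    labels = [analyze_sentiment(h["title"]) for h in headlines]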
"""

from __future__ import annotations

from typing import List, Tuple

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
)
from bs4 import BeautifulSoup
import requests


SENTIMENT_MODEL_ID = "LinkLinkWu/Boss_Stock_News_Analysis"
NER_MODEL_ID = "dslim/bert-base-NER"


_sent_tok = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
_sent_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
sentiment_pipeline = pipeline(
    "text-classification",
    model=_sent_model,
    tokenizer=_sent_tok,
    return_all_scores=True,
)
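# With ``return_all_scores=True`` the pipeline returns one list of score dicts per
# input (newer transformers releases spell this ``top_k=None``), e.g. – label names
# follow the constants below, values are illustrative:
#   sentiment_pipeline("Shares surge after earnings beat")
#   # -> [[{'label': 'LABEL_0', 'score': 0.08}, {'label': 'LABEL_1', 'score': 0.92}]]
# ``analyze_sentiment`` relies on this shape when it indexes ``[0]``.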

_ner_tok = AutoTokenizer.from_pretrained(NER_MODEL_ID)
_ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
ner_pipeline = pipeline(
    "ner",
    model=_ner_model,
    tokenizer=_ner_tok,
    grouped_entities=True,
)
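# With ``grouped_entities=True`` each result is a dict with keys such as
# ``entity_group`` and ``word``, e.g. (values illustrative):
#   ner_pipeline("Apple hires a new CFO")
#   # -> [{'entity_group': 'ORG', 'word': 'Apple', 'score': 0.99, ...}]
# ``extract_org_entities`` filters these dicts for ORG entries.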


_POSITIVE_RAW = "LABEL_1"
_NEUTRAL_RAW = "NEUTRAL"
_SINGLE_THRESHOLD = 0.50
_LABEL_NEG = "Negative"
_LABEL_POS = "Positive"
_LABEL_UNK = "Unknown"


def analyze_sentiment(
    text: str,
    *,
    pipe=None,
    threshold: float = _SINGLE_THRESHOLD,
    return_prob: bool = False,
) -> str | Tuple[str, float]:
    """Classify *text* as Positive / Negative.

    Parameters
    ----------
    text : str
        Input sentence (e.g. a news headline).
    pipe : transformers.Pipeline, optional
        Custom sentiment pipeline; defaults to the module-level singleton.
    threshold : float, default 0.50
        Positive-probability cut-off.
    return_prob : bool, default False
        If *True*, return a ``(label, positive_probability)`` tuple;
        otherwise return just the label string.

    Notes
    -----
    * When the underlying model emits *NEUTRAL*, it is treated the same
      as *Positive* – finance headlines often sound cautious.
    * The function never raises; on failure it returns ``"Unknown"`` (or
      ``("Unknown", 0.0)`` when *return_prob* is *True*).
    """
    try:
        s_pipe = pipe or sentiment_pipeline
        scores = s_pipe(text, truncation=True)[0]
        score_map = {item["label"].upper(): item["score"] for item in scores}

        pos_prob = score_map.get(_POSITIVE_RAW, 0.0)
        if _NEUTRAL_RAW in score_map:
            pos_prob = max(pos_prob, score_map[_NEUTRAL_RAW])

        label = _LABEL_POS if pos_prob >= threshold else _LABEL_NEG
        return (label, pos_prob) if return_prob else label
    except Exception:
        return (_LABEL_UNK, 0.0) if return_prob else _LABEL_UNK


def fetch_news(ticker: str, max_items: int = 30) -> List[dict]:
    """Return up to *max_items* latest Finviz headlines for *ticker*.

    Result format: ``[{'title': str, 'link': str}, ...]``
    """
    try:
        url = f"https://finviz.com/quote.ashx?t={ticker}"
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "https://finviz.com/",
            "Connection": "keep-alive",
        }
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            return []

        soup = BeautifulSoup(r.text, "html.parser")
        if ticker.upper() not in (soup.title.text if soup.title else "").upper():
            return []

        table = soup.find(id="news-table")
        if table is None:
            return []

        headlines: List[dict] = []
        for row in table.find_all("tr")[:max_items]:
            link_tag = row.find("a")
            if link_tag:
                headlines.append(
                    {"title": link_tag.text.strip(), "link": link_tag["href"]}
                )
        return headlines
    except Exception:
        return []


def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[str]:
    """Extract *ORG* tokens (upper-cased) from *text*.

    Returns at most *max_entities* unique ticker-like strings suitable
    for Finviz / Yahoo queries.
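
    Examples
    --------
    Output is illustrative – it depends on the NER model:

    >>> extract_org_entities("Apple sues Samsung over patents")  # doctest: +SKIP
    ['APPLE', 'SAMSUNG']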
    """
    try:
        ner_pipe = pipe or ner_pipeline
        entities = ner_pipe(text)
        orgs: List[str] = []
        for ent in entities:
            if ent.get("entity_group") == "ORG":
                token = ent["word"].replace("##", "").strip().upper()
                if token and token not in orgs:
                    orgs.append(token)
                if len(orgs) >= max_entities:
                    break
        return orgs
    except Exception:
        return []


def get_sentiment_pipeline():
    """Return the module-level sentiment pipeline singleton."""
    return sentiment_pipeline


def get_ner_pipeline():
    """Return the module-level NER pipeline singleton."""
    return ner_pipeline
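

# Minimal end-to-end sketch (not part of the library API): it downloads both models
# and hits Finviz, so it needs network access. Headline and ticker are examples only.
if __name__ == "__main__":
    sample_headline = "Apple shares climb after strong iPhone sales"
    print("ORG entities:", extract_org_entities(sample_headline))
    print("Sentiment:", analyze_sentiment(sample_headline, return_prob=True))

    for item in fetch_news("AAPL", max_items=5):
        print(analyze_sentiment(item["title"]), "-", item["title"])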