""" * **Single** `analyze_sentiment` implementation – no more duplicates. * Returns **label string by default**, optional probability via `return_prob`. * Threshold lowered to **0.50** and Neutral treated as Positive. * Helper pipelines cached at module level. """ from __future__ import annotations from typing import List, Tuple from transformers import ( pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, ) from bs4 import BeautifulSoup import requests # --------------------------------------------------------------------------- # Model identifiers (Hugging Face) # --------------------------------------------------------------------------- SENTIMENT_MODEL_ID = "LinkLinkWu/Boss_Stock_News_Analysis" # LABEL_0 = Negative, LABEL_1 = Positive NER_MODEL_ID = "dslim/bert-base-NER" # --------------------------------------------------------------------------- # Pipeline singletons – loaded once on first import # --------------------------------------------------------------------------- # Sentiment _sent_tok = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID) _sent_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID) sentiment_pipeline = pipeline( "text-classification", model=_sent_model, tokenizer=_sent_tok, return_all_scores=True, ) # NER _ner_tok = AutoTokenizer.from_pretrained(NER_MODEL_ID) _ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID) ner_pipeline = pipeline( "ner", model=_ner_model, tokenizer=_ner_tok, grouped_entities=True, ) # --------------------------------------------------------------------------- # Sentiment helpers # --------------------------------------------------------------------------- _POSITIVE_RAW = "LABEL_1" # positive class id in model output _NEUTRAL_RAW = "NEUTRAL" # some models add a neutral class _SINGLE_THRESHOLD = 0.50 # ≥50% positive prob → Positive _LABEL_NEG = "Negative" _LABEL_POS = "Positive" _LABEL_UNK = "Unknown" def analyze_sentiment( text: str, *, pipe=None, threshold: float = _SINGLE_THRESHOLD, return_prob: bool = False, ): """Classify *text* as Positive / Negative. Parameters ---------- text : str Input sentence (e.g. news headline). pipe : transformers.Pipeline, optional Custom sentiment pipeline; defaults to module-level singleton. threshold : float, default 0.50 Positive-probability cut-off. return_prob : bool, default False If *True*, returns ``(label, positive_probability)`` tuple; otherwise returns just the label string. Notes ----- * When the underlying model emits *NEUTRAL*, we treat it the same as *Positive* – finance headlines often sound cautious. * Function never raises; on failure returns ``"Unknown"`` (or ``("Unknown", 0.0)`` when *return_prob* is *True*). """ try: s_pipe = pipe or sentiment_pipeline scores = s_pipe(text, truncation=True)[0] # list[dict] score_map = {item["label"].upper(): item["score"] for item in scores} pos_prob = score_map.get(_POSITIVE_RAW, 0.0) if _NEUTRAL_RAW in score_map: # treat Neutral as Positive pos_prob = max(pos_prob, score_map[_NEUTRAL_RAW]) label = _LABEL_POS if pos_prob >= threshold else _LABEL_NEG return (label, pos_prob) if return_prob else label except Exception: return (_LABEL_UNK, 0.0) if return_prob else _LABEL_UNK # --------------------------------------------------------------------------- # Web-scraping helper (Finviz) # --------------------------------------------------------------------------- def fetch_news(ticker: str, max_items: int = 30) -> List[dict]: """Return up to *max_items* latest Finviz headlines for *ticker*. Result format: ``[{'title': str, 'link': str}, ...]`` """ try: url = f"https://finviz.com/quote.ashx?t={ticker}" headers = { "User-Agent": "Mozilla/5.0", "Accept": "text/html", "Accept-Language": "en-US,en;q=0.5", "Referer": "https://finviz.com/", "Connection": "keep-alive", } r = requests.get(url, headers=headers, timeout=10) if r.status_code != 200: return [] soup = BeautifulSoup(r.text, "html.parser") if ticker.upper() not in (soup.title.text if soup.title else "").upper(): return [] # redirected / placeholder page table = soup.find(id="news-table") if table is None: return [] headlines: List[dict] = [] for row in table.find_all("tr")[:max_items]: link_tag = row.find("a") if link_tag: headlines.append( {"title": link_tag.text.strip(), "link": link_tag["href"]} ) return headlines except Exception: return [] # --------------------------------------------------------------------------- # Named-entity extraction helper # --------------------------------------------------------------------------- def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[str]: """Extract *ORG* tokens (upper-cased) from *text*. Returns at most *max_entities* unique ticker-like strings suitable for Finviz / Yahoo queries. """ try: ner_pipe = pipe or ner_pipeline entities = ner_pipe(text) orgs: List[str] = [] for ent in entities: if ent.get("entity_group") == "ORG": token = ent["word"].replace("##", "").strip().upper() if token and token not in orgs: orgs.append(token) if len(orgs) >= max_entities: break return orgs except Exception: return [] # --------------------------------------------------------------------------- # Public accessors (legacy compatibility) # --------------------------------------------------------------------------- def get_sentiment_pipeline(): """Return the module-level sentiment pipeline singleton.""" return sentiment_pipeline def get_ner_pipeline(): """Return the module-level NER pipeline singleton.""" return ner_pipeline