Sze_Link_ISOM_5240_MODEL

Running

App Files Community

LinkLinkWu committed on May 18

Commit

64d5a00

verified ·

1 Parent(s): c7f60fc

Update func.py

Browse files

Files changed (1) hide show

func.py +41 -50

func.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import List, Tuple
 from transformers import (
     pipeline,
@@ -10,14 +10,16 @@ from bs4 import BeautifulSoup
 import requests
 # ---------------------------------------------------------------------------
-# Model identifiers
 # ---------------------------------------------------------------------------
-SENTIMENT_MODEL_ID = "LinkLinkWu/Stock_Analysis_Test_Ahamed"
 NER_MODEL_ID = "dslim/bert-base-NER"
 # ---------------------------------------------------------------------------
-# Eager initialisation of Hugging Face pipelines (shared singletons)
 # ---------------------------------------------------------------------------
 sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
 sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
 sentiment_pipeline = pipeline(
@@ -26,6 +28,7 @@ sentiment_pipeline = pipeline(
     tokenizer=sentiment_tokenizer,
 )
 ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
 ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
 ner_pipeline = pipeline(
@@ -36,15 +39,11 @@ ner_pipeline = pipeline(
 )
 # ---------------------------------------------------------------------------
-# Web‑scraping helper
 # ---------------------------------------------------------------------------
 def fetch_news(ticker: str) -> List[dict]:
-    """Return up to 30 latest Finviz headlines for *ticker* (title & link).
-    Empty list on network / parsing errors or if Finviz redirects to a generic
-    page (e.g. wrong ticker).
-    """
     try:
         url = f"https://finviz.com/quote.ashx?t={ticker}"
         headers = {
@@ -60,79 +59,71 @@ def fetch_news(ticker: str) -> List[dict]:
         soup = BeautifulSoup(r.text, "html.parser")
         if ticker.upper() not in (soup.title.text if soup.title else "").upper():
-            return []  # Finviz placeholder page
         table = soup.find(id="news-table")
         if table is None:
             return []
-        news: List[dict] = []
         for row in table.find_all("tr")[:30]:
             link_tag = row.find("a")
             if link_tag:
-                news.append({"title": link_tag.get_text(strip=True), "link": link_tag["href"]})
-        return news
     except Exception:
         return []
 # ---------------------------------------------------------------------------
-# Sentiment helpers
 # ---------------------------------------------------------------------------
-_POSITIVE = "positive"
-_DEFAULT_THRESHOLD = 0.55  # per‑headline probability cut‑off
-def analyze_sentiment(
-    text: str,
-    pipe=None,
-    threshold: float = _DEFAULT_THRESHOLD,
-) -> Tuple[str, float]:
-    """Classify *text* and return ``(label, positive_probability)``.
-    * Binary label (*Positive* / *Negative*) is determined by comparing the
-      *positive* probability with *threshold*.
-    * Neutral headlines are mapped to *Negative* by design.
-    * On any internal error → ("Unknown", 0.0).
     """
     try:
         sentiment_pipe = pipe or sentiment_pipeline
-        scores = sentiment_pipe(text, return_all_scores=True, truncation=True)[0]
-        pos_prob = 0.0
-        for item in scores:
-            if item["label"].lower() == _POSITIVE:
-                pos_prob = item["score"]
-                break
-        label = "Positive" if pos_prob >= threshold else "Negative"
-        return label, pos_prob
     except Exception:
-        return "Unknown", 0.0
 # ---------------------------------------------------------------------------
-# Aggregation – average positive probability → binary overall label
 # ---------------------------------------------------------------------------
-def aggregate_sentiments(
-    results: List[Tuple[str, float]],
-    avg_threshold: float = _DEFAULT_THRESHOLD,
-) -> str:
-    """Compute overall **Positive/Negative** based on *mean* positive probability.
-    * *results* – list returned by ``analyze_sentiment`` for each headline.
-    * If the average positive probability ≥ *avg_threshold* → *Positive*.
     * Empty list → *Unknown*.
     """
-    if not results:
         return "Unknown"
-    avg_pos = sum(prob for _, prob in results) / len(results)
-    return "Positive" if avg_pos >= avg_threshold else "Negative"
 # ---------------------------------------------------------------------------
-# ORG‑entity extraction (for ticker discovery)
 # ---------------------------------------------------------------------------
 def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[str]:
-    """Return up to *max_entities* unique ORG tokens (upper‑case, de‑hashed)."""
     try:
         ner_pipe = pipe or ner_pipeline
         entities = ner_pipe(text)
@@ -149,7 +140,7 @@ def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[st
         return []
 # ---------------------------------------------------------------------------
-# Public accessors (backward compatibility with app.py)
 # ---------------------------------------------------------------------------
 def get_sentiment_pipeline():

+from typing import List
 from transformers import (
     pipeline,
 import requests
 # ---------------------------------------------------------------------------
+# Model identifiers – use your custom sentiment model hosted on Hugging Face
 # ---------------------------------------------------------------------------
+SENTIMENT_MODEL_ID = "LinkLinkWu/Stock_Analysis_Test_Ahamed"  # binary sentiment
 NER_MODEL_ID = "dslim/bert-base-NER"
 # ---------------------------------------------------------------------------
+# Eager initialisation (singletons shared by the whole Streamlit session)
 # ---------------------------------------------------------------------------
+# Sentiment pipeline – returns one label with its score. We will *ignore* the
+# numeric score down‑stream to satisfy the "no numbers" requirement.
 sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
 sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
 sentiment_pipeline = pipeline(
     tokenizer=sentiment_tokenizer,
 )
+# Named‑entity‑recognition pipeline (ORG extraction)
 ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
 ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
 ner_pipeline = pipeline(
 )
 # ---------------------------------------------------------------------------
+# Web‑scraping helper (Finviz)
 # ---------------------------------------------------------------------------
 def fetch_news(ticker: str) -> List[dict]:
+    """Return at most 30 latest Finviz headlines for *ticker* ("title" & "link")."""
     try:
         url = f"https://finviz.com/quote.ashx?t={ticker}"
         headers = {
         soup = BeautifulSoup(r.text, "html.parser")
         if ticker.upper() not in (soup.title.text if soup.title else "").upper():
+            return []  # possibly a redirect page
         table = soup.find(id="news-table")
         if table is None:
             return []
+        headlines: List[dict] = []
         for row in table.find_all("tr")[:30]:
             link_tag = row.find("a")
             if link_tag:
+                headlines.append({"title": link_tag.get_text(strip=True), "link": link_tag["href"]})
+        return headlines
     except Exception:
         return []
 # ---------------------------------------------------------------------------
+# Sentiment helpers – binary classification, *no* numeric score exposed
 # ---------------------------------------------------------------------------
# Raw pipeline label (upper-cased) -> label exposed to callers.
# Both the generic "LABEL_n" ids and the textual names are accepted, so the
# mapping keeps working whichever naming the model config uses.
_LABEL_MAP = {
    "LABEL_0": "Negative",
    "LABEL_1": "Positive",
    "NEGATIVE": "Negative",
    "POSITIVE": "Positive",
}


def analyze_sentiment(text: str, pipe=None) -> str:
    """Return **"Positive"** or **"Negative"** for a single headline.

    The raw label reported by the pipeline is upper-cased and looked up in
    ``_LABEL_MAP``. Any label that is not in the map (for example a
    *neutral* class) is coerced to *Negative*. The numeric confidence score
    is deliberately discarded to honour the "no numbers" requirement.

    Parameters
    ----------
    text:
        Headline to classify.
    pipe:
        Optional callable used instead of the module-level
        ``sentiment_pipeline``. It is called as
        ``pipe(text, truncation=True, return_all_scores=False)`` and must
        return a sequence whose first item is a mapping with a ``"label"``
        key.

    Returns
    -------
    str
        ``"Positive"`` or ``"Negative"``; ``"Unknown"`` if the pipeline call
        or the result handling raises any exception.
    """
    try:
        sentiment_pipe = pipe or sentiment_pipeline
        result = sentiment_pipe(text, truncation=True, return_all_scores=False)[0]
        # Normalise case so "positive", "Positive" and "POSITIVE" all match.
        raw_label = str(result.get("label", "")).strip().upper()
        return _LABEL_MAP.get(raw_label, "Negative")  # default to Negative
    except Exception:
        # Best-effort by design: one bad headline must not abort the run.
        return "Unknown"
 # ---------------------------------------------------------------------------
+# Aggregation – majority vote (Positive‑ratio) → binary label
 # ---------------------------------------------------------------------------
_POS_RATIO_THRESHOLD = 0.6  # ≥60 % positives → overall Positive


def aggregate_sentiments(labels: List[str], pos_ratio_threshold: float = _POS_RATIO_THRESHOLD) -> str:
    """Combine individual headline labels into an overall binary sentiment.

    "Unknown" entries (headlines whose classification failed) are ignored so
    that failures do not count as votes against *Positive*.

    * If the *Positive* share of the remaining labels ≥ *pos_ratio_threshold*
      → *Positive*.
    * Otherwise → *Negative*.
    * Empty list, or a list holding only "Unknown" entries → *Unknown*.

    Parameters
    ----------
    labels:
        Per-headline labels, e.g. "Positive", "Negative" or "Unknown".
    pos_ratio_threshold:
        Minimum fraction of *Positive* labels required for an overall
        *Positive* verdict.

    Returns
    -------
    str
        "Positive", "Negative" or "Unknown".
    """
    # Drop failed classifications before computing the ratio.
    rated = [label for label in labels if label != "Unknown"]
    if not rated:
        return "Unknown"
    ratio = rated.count("Positive") / len(rated)
    return "Positive" if ratio >= pos_ratio_threshold else "Negative"
 # ---------------------------------------------------------------------------
+# ORG‑entity extraction (ticker discovery)
 # ---------------------------------------------------------------------------
 def extract_org_entities(text: str, pipe=None, max_entities: int = 5) -> List[str]:
+    """Extract up to *max_entities* unique ORG tokens (upper‑case, de‑hashed)."""
     try:
         ner_pipe = pipe or ner_pipeline
         entities = ner_pipe(text)
         return []
 # ---------------------------------------------------------------------------
+# Public accessors (legacy compatibility)
 # ---------------------------------------------------------------------------
 def get_sentiment_pipeline():