LinkLinkWu committed on
Commit
d25b499
·
verified ·
1 Parent(s): bdaffbb

Update func.py

Browse files
Files changed (1) hide show
  1. func.py +107 -58
func.py CHANGED
@@ -1,99 +1,148 @@
1
- from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
 
 
 
 
 
 
 
2
  from bs4 import BeautifulSoup
3
  import requests
4
 
5
- # ----------- Eager Initialization of Pipelines -----------
6
- # Sentiment pipeline
7
- model_id = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
8
- sentiment_tokenizer = AutoTokenizer.from_pretrained(model_id)
9
- sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_id)
 
 
 
 
 
 
 
10
  sentiment_pipeline = pipeline(
11
  "sentiment-analysis",
12
  model=sentiment_model,
13
- tokenizer=sentiment_tokenizer
14
  )
15
 
16
- # NER pipeline
17
- ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
18
- ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
19
  ner_pipeline = pipeline(
20
  "ner",
21
  model=ner_model,
22
  tokenizer=ner_tokenizer,
23
- grouped_entities=True
24
  )
25
 
26
- # ----------- Core Functions -----------
27
- def fetch_news(ticker):
 
 
 
 
 
 
 
 
28
  try:
29
  url = f"https://finviz.com/quote.ashx?t={ticker}"
30
  headers = {
31
- 'User-Agent': 'Mozilla/5.0',
32
- 'Accept': 'text/html',
33
- 'Accept-Language': 'en-US,en;q=0.5',
34
- 'Referer': 'https://finviz.com/',
35
- 'Connection': 'keep-alive',
36
  }
37
- response = requests.get(url, headers=headers)
38
  if response.status_code != 200:
39
  return []
40
 
41
- soup = BeautifulSoup(response.text, 'html.parser')
42
- title = soup.title.text if soup.title else ""
43
- if ticker not in title:
 
44
  return []
45
 
46
- news_table = soup.find(id='news-table')
47
  if news_table is None:
48
  return []
49
 
50
- news = []
51
- for row in news_table.findAll('tr')[:30]:
52
- a_tag = row.find('a')
53
- if a_tag:
54
- title_text = a_tag.get_text()
55
- link = a_tag['href']
56
- news.append({'title': title_text, 'link': link})
57
- return news
 
58
  except Exception:
 
59
  return []
60
 
61
- def analyze_sentiment(text, pipe=None):
62
- """
63
- 兼容两种调用:
64
- - analyze_sentiment(text) -> 使用全局 sentiment_pipeline
65
- - analyze_sentiment(text, some_pipeline) -> 使用传入的 some_pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  """
67
  try:
68
  sentiment_pipe = pipe or sentiment_pipeline
69
- result = sentiment_pipe(text)[0]
70
- return "Positive" if result['label'] == 'POSITIVE' else "Negative"
 
 
 
71
  except Exception:
72
- return "Unknown"
73
 
74
- def extract_org_entities(text, pipe=None):
75
- """
76
- - extract_org_entities(text)
77
- - extract_org_entities(text, some_pipeline)
 
 
 
 
 
 
 
 
 
78
  """
79
- try:
80
- ner_pipe = pipe or ner_pipeline
81
- entities = ner_pipe(text)
82
- orgs = []
83
- for ent in entities:
84
- if ent["entity_group"] == "ORG":
85
- w = ent["word"].replace("##", "").strip().upper()
86
- if w not in orgs:
87
- orgs.append(w)
88
- if len(orgs) >= 5:
89
- break
90
- return orgs
91
- except Exception:
92
- return []
93
 
94
- # ----------- Helper Functions for Imports -----------
95
  def get_sentiment_pipeline():
 
96
  return sentiment_pipeline
97
 
 
98
  def get_ner_pipeline():
 
99
  return ner_pipeline
 
1
+ from typing import List, Tuple
2
+
3
+ from transformers import (
4
+ pipeline,
5
+ AutoTokenizer,
6
+ AutoModelForSequenceClassification,
7
+ AutoModelForTokenClassification,
8
+ )
9
  from bs4 import BeautifulSoup
10
  import requests
11
 
12
+ # ---------------------------------------------------------------------------
13
+ # Model identifiers
14
+ # ---------------------------------------------------------------------------
15
+ SENTIMENT_MODEL_ID = "ahmedrachid/FinancialBERT-Sentiment-Analysis" # returns: positive / neutral / negative
16
+ NER_MODEL_ID = "dslim/bert-base-NER"
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Eager initialisation of Hugging Face pipelines (shared across requests)
20
+ # ---------------------------------------------------------------------------
21
+ # Sentiment pipeline (binary decision will be made later)
22
+ sentiment_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_ID)
23
+ sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_ID)
24
  sentiment_pipeline = pipeline(
25
  "sentiment-analysis",
26
  model=sentiment_model,
27
+ tokenizer=sentiment_tokenizer,
28
  )
29
 
30
+ # Named‑entity‑recognition pipeline (ORG extraction)
31
+ ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_ID)
32
+ ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_ID)
33
  ner_pipeline = pipeline(
34
  "ner",
35
  model=ner_model,
36
  tokenizer=ner_tokenizer,
37
+ grouped_entities=True,
38
  )
39
 
40
+ # ---------------------------------------------------------------------------
41
+ # Core functionality
42
+ # ---------------------------------------------------------------------------
43
+
44
+ def fetch_news(ticker: str) -> List[dict]:
45
+ """Scrape *up to* 30 recent headlines from Finviz for a given *ticker*.
46
+
47
+ Returns a list of dictionaries with ``{"title": str, "link": str}`` or an
48
+ empty list on any error/edge‑case (e.g. anti‑scraping redirect).
49
+ """
50
  try:
51
  url = f"https://finviz.com/quote.ashx?t={ticker}"
52
  headers = {
53
+ "User-Agent": "Mozilla/5.0",
54
+ "Accept": "text/html",
55
+ "Accept-Language": "en-US,en;q=0.5",
56
+ "Referer": "https://finviz.com/",
57
+ "Connection": "keep-alive",
58
  }
59
+ response = requests.get(url, headers=headers, timeout=10)
60
  if response.status_code != 200:
61
  return []
62
 
63
+ soup = BeautifulSoup(response.text, "html.parser")
64
+ page_title = soup.title.text if soup.title else ""
65
+ if ticker.upper() not in page_title.upper():
66
+ # Finviz sometimes redirects to a placeholder page if the ticker is unknown.
67
  return []
68
 
69
+ news_table = soup.find(id="news-table")
70
  if news_table is None:
71
  return []
72
 
73
+ latest_news: List[dict] = []
74
+ for row in news_table.find_all("tr")[:30]: # keep only the 30 most recent rows
75
+ link_tag = row.find("a")
76
+ if link_tag:
77
+ latest_news.append({
78
+ "title": link_tag.get_text(strip=True),
79
+ "link": link_tag["href"],
80
+ })
81
+ return latest_news
82
  except Exception:
83
+ # swallow all exceptions and degrade gracefully
84
  return []
85
 
116
 
117
+ # ---------------------------------------------------------------------------
118
+ # Aggregation logic – turning many headlines into one overall label
119
+ # ---------------------------------------------------------------------------
120
+
121
+ def aggregate_sentiments(
122
+ results: List[Tuple[str, float]],
123
+ avg_threshold: float = _DEFAULT_THRESHOLD,
124
+ ) -> str:
125
+ """Combine individual headline results into a single overall label.
126
+
127
+ The rule is simple: compute the *mean* positive probability across all
128
+ headlines and compare it with *avg_threshold*. If the list is empty, the
129
+ function returns ``"Unknown"``.
130
  """
131
+ if not results:
132
+ return "Unknown"
133
+
134
+ avg_pos = sum(score for _, score in results) / len(results)
135
+ return "Positive" if avg_pos >= avg_threshold else "Negative"
136
+
137
+ # ---------------------------------------------------------------------------
138
+ # Public helpers (kept for backward compatibility with app.py)
139
+ # ---------------------------------------------------------------------------
 
 
 
 
 
140
 
 
141
  def get_sentiment_pipeline():
142
+ """Expose the initialised sentiment pipeline (singleton)."""
143
  return sentiment_pipeline
144
 
145
+
146
  def get_ner_pipeline():
147
+ """Expose the initialised NER pipeline (singleton)."""
148
  return ner_pipeline