Spaces:

william1324
/

1234

Running

File size: 2,727 Bytes

856d4f7
4ab5677
 
2cc4dc9
4ab5677
 
2cc4dc9
 
a5dc391
4ab5677
 
 
 
 
 
2cc4dc9
a5dc391
4ab5677
e4fff6a
 
 
 
 
 
4ab5677
a5dc391
4ab5677
 
 
 
 
a5dc391
4ab5677
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5dc391
4ab5677
 
 
 
 
 
 
 
 
 
 
 
 
 
a5dc391
4ab5677
 
 
 
 
 
 
a5dc391
1b8dd75
4ab5677
 
 
 
 
 
 
 
 
a5dc391
e4fff6a
1b8dd75

import os
import tempfile

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from transformers import pipeline
from prophet import Prophet
import matplotlib.pyplot as plt
import gradio as gr

# Models: both are instantiated once at import time and reused by every call.
EMBEDDING_MODEL_ID = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
SENTIMENT_MODEL_ID = "uer/roberta-base-finetuned-dianping-chinese"

# Sentence encoder that turns each text into a vector for clustering.
embedder = SentenceTransformer(EMBEDDING_MODEL_ID)

# Text classifier used for per-text sentiment; the tokenizer is loaded from the
# same checkpoint as the model.
sentiment_model = pipeline(
    task="text-classification",
    model=SENTIMENT_MODEL_ID,
    tokenizer=SENTIMENT_MODEL_ID,
)

# main

# Generic label ids mapped to the sentiment strings written to the output CSV.
_GENERIC_SENTIMENT_LABELS = {
    "LABEL_0": "負向",
    "LABEL_1": "中立",
    "LABEL_2": "正向",
}


def _label_to_sentiment(label):
    """Translate a raw classifier label into the sentiment string stored in the CSV."""
    if label in _GENERIC_SENTIMENT_LABELS:
        return _GENERIC_SENTIMENT_LABELS[label]
    # NOTE(review): the configured Dianping checkpoint is documented to emit
    # descriptive labels such as "positive (stars 4 and 5)" rather than
    # LABEL_n, so match on the label's leading word as well. Confirm against
    # the model card if the checkpoint is ever swapped.
    lowered = str(label).lower()
    if lowered.startswith("neg"):
        return "負向"
    if lowered.startswith("neu"):
        return "中立"
    if lowered.startswith("pos"):
        return "正向"
    return "未知"


def _classify_sentiment(text):
    """Return the sentiment string for one text, or "錯誤" if classification fails."""
    try:
        # Truncate so that long texts are classified instead of failing.
        # NOTE(review): 512 is assumed to be the model's maximum sequence
        # length (usual for BERT-family checkpoints) - confirm.
        result = sentiment_model(text, truncation=True, max_length=512)[0]
        label = result["label"]
    except Exception:
        # Best effort: one bad row must not abort the whole batch.
        return "錯誤"
    return _label_to_sentiment(label)


def full_pipeline(file, num_clusters):
    """Cluster the uploaded texts, tag their sentiment and forecast topic 0 volume.

    Args:
        file: The uploaded CSV, as a path or as an object exposing the path in
            ``.name``. The CSV must contain ``text`` and ``timestamp`` columns.
        num_clusters: Number of KMeans clusters; coerced to ``int``.

    Returns:
        A ``(csv_path, image_path)`` tuple: the input rows with added ``topic``
        and ``sentiment`` columns, and the Prophet forecast plot (7 extra
        periods) of daily row counts for ``topic == 0``.

    Raises:
        gr.Error: If no file was uploaded, the CSV is empty, a required column
            is missing, the cluster count is invalid, the timestamps cannot be
            parsed, or topic 0 spans fewer than two distinct days.
    """
    if file is None:
        raise gr.Error("❌ 錯誤：請先上傳 CSV 檔案")

    # Accept both a plain path and a file wrapper that carries the path in `.name`.
    df = pd.read_csv(getattr(file, "name", file))

    # Raise instead of returning a lone string: the interface declares two
    # outputs, so a single return value cannot be rendered as an error message.
    if "text" not in df.columns:
        raise gr.Error("❌ 錯誤：CSV 檔案需包含 text 欄位")
    if "timestamp" not in df.columns:
        raise gr.Error("❌ 錯誤：CSV 檔案需包含 timestamp 欄位（例如新聞時間）")
    if df.empty:
        raise gr.Error("❌ 錯誤：CSV 檔案沒有資料")

    texts = df["text"].astype(str).tolist()

    # The UI number field may deliver a float; KMeans needs an int that does
    # not exceed the number of samples.
    try:
        n_clusters = int(num_clusters)
    except (TypeError, ValueError, OverflowError):
        raise gr.Error("❌ 錯誤：聚類數必須是整數") from None
    if not 1 <= n_clusters <= len(texts):
        raise gr.Error(
            f"❌ 錯誤：聚類數需介於 1 與資料筆數（{len(texts)}）之間"
        )

    # Parse timestamps before the expensive model calls so bad input fails fast.
    try:
        df["timestamp"] = pd.to_datetime(df["timestamp"])
    except (ValueError, TypeError) as exc:
        raise gr.Error("❌ 錯誤：timestamp 欄位無法解析為日期時間") from exc

    # Topic clustering: embed every text, then assign a KMeans cluster id.
    embeddings = embedder.encode(texts, show_progress_bar=True)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df["topic"] = kmeans.fit_predict(embeddings)

    # Sentiment analysis, one text at a time.
    df["sentiment"] = [_classify_sentiment(text) for text in texts]

    # Volume forecast: daily row counts for topic 0, in Prophet's ds/y layout.
    topic0 = df[df["topic"] == 0]
    daily_counts = (
        topic0.groupby(topic0["timestamp"].dt.date).size().reset_index(name="count")
    )
    daily_counts.columns = ["ds", "y"]

    if len(daily_counts) < 2:
        raise gr.Error("❌ 無法預測：topic=0 數據太少")

    m = Prophet()
    m.fit(daily_counts)
    future = m.make_future_dataframe(periods=7)
    forecast = m.predict(future)
    fig = m.plot(forecast)

    # Output: a fresh directory per call so concurrent requests cannot
    # overwrite each other's files.
    out_dir = tempfile.mkdtemp(prefix="topic_radar_")
    output_csv = os.path.join(out_dir, "final_output.csv")
    output_img = os.path.join(out_dir, "forecast.png")
    df.to_csv(output_csv, index=False)
    try:
        fig.savefig(output_img)
    finally:
        # Release the figure; pyplot otherwise keeps every figure alive.
        plt.close(fig)

    return output_csv, output_img

# gradio
# Web UI: a CSV upload and a cluster count go in; the annotated CSV and the
# topic-0 forecast plot come out.
demo = gr.Interface(
    fn=full_pipeline,
    inputs=[
        gr.File(label="上傳 CSV（需含 text 與 timestamp 欄）"),
        # precision=0 rounds the entry to an integer, because the value is
        # used as the KMeans cluster count.
        gr.Number(label="分幾群？（聚類數）", value=5, precision=0),
    ],
    outputs=[
        gr.File(label="結果 CSV（含 topic, sentiment）"),
        gr.Image(label="topic=0 熱度預測圖（Prophet）"),
    ],
    title="話題雷達",
    description="自動分群、分析情緒，並預測熱度走勢（topic=0 為例）",
)
demo.launch()