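"""Streamlit app for political-leaning analysis and opposing-viewpoint generation.

Combines the Hugging Face jacobvs/PoliticalTweets dataset with Naver News
search results, incrementally trains a scikit-learn classifier, and asks
GPT-4 to write an article from the opposite viewpoint of each news item.
"""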
import streamlit as st
import requests
import openai
import os
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier  # supports incremental learning via partial_fit
from sklearn.metrics import classification_report, accuracy_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# Streamlit page configuration
st.set_page_config(page_title="정치적 성향 분석 및 반대 관점 생성", page_icon="📰", layout="wide")
# OpenAI API key (read from the environment)
openai.api_key = os.getenv("OPENAI_API_KEY")
# ํ—ˆ๊น…ํŽ˜์ด์Šค ๋ฐ์ดํ„ฐ์…‹ ๋กœ๋“œ
@st.cache_data
def load_huggingface_data():
dataset = load_dataset("jacobvs/PoliticalTweets")
return dataset
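# Note: the code below assumes the dataset's "train" split exposes "text" and
# "party" columns, with party values including "Democrat" and "Republican"
# (see combine_datasets / incremental_training).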
# ๋„ค์ด๋ฒ„ ๋‰ด์Šค API๋ฅผ ํ†ตํ•ด ๋‰ด์Šค ๋ฐ์ดํ„ฐ ๊ฐ€์ ธ์˜ค๊ธฐ
def fetch_naver_news(query, display=15):
client_id = "I_8koTJh3R5l4wLurQbG" # ๋„ค์ด๋ฒ„ ๊ฐœ๋ฐœ์ž ์„ผํ„ฐ์—์„œ ๋ฐœ๊ธ‰๋ฐ›์€ Client ID
client_secret = "W5oWYlAgur" # ๋„ค์ด๋ฒ„ ๊ฐœ๋ฐœ์ž ์„ผํ„ฐ์—์„œ ๋ฐœ๊ธ‰๋ฐ›์€ Client Secret
url = "https://openapi.naver.com/v1/search/news.json"
headers = {
"X-Naver-Client-Id": client_id,
"X-Naver-Client-Secret": client_secret,
}
params = {
"query": query,
"display": display, # ๋‰ด์Šค 15๊ฐœ ๊ฐ€์ ธ์˜ค๊ธฐ
"start": 1,
"sort": "date", # ์ตœ์‹ ์ˆœ์œผ๋กœ ์ •๋ ฌ
}
response = requests.get(url, headers=headers, params=params)
if response.status_code == 200:
return response.json()['items']
else:
st.error("๋‰ด์Šค ๋ฐ์ดํ„ฐ๋ฅผ ๋ถˆ๋Ÿฌ์˜ค๋Š” ๋ฐ ์‹คํŒจํ–ˆ์Šต๋‹ˆ๋‹ค.")
return []
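# The Naver search response is JSON with an "items" list; each item carries at
# least "title", "description", and "link" fields (the only ones used here).
# Titles and descriptions may contain HTML markup such as <b> tags around
# matched keywords.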
# ํ—ˆ๊น…ํŽ˜์ด์Šค ๋ฐ์ดํ„ฐ์™€ ๋„ค์ด๋ฒ„ ๋‰ด์Šค ๋ฐ์ดํ„ฐ๋ฅผ ๊ฒฐํ•ฉ
def combine_datasets(huggingface_data, naver_data):
additional_texts = [item['title'] + ". " + item['description'] for item in naver_data]
additional_labels = ["NEUTRAL"] * len(additional_texts) # ๊ธฐ๋ณธ์ ์œผ๋กœ ์ค‘๋ฆฝ์œผ๋กœ ๋ผ๋ฒจ๋ง
hf_texts = huggingface_data['train']['text']
hf_labels = huggingface_data['train']['party']
return hf_texts + additional_texts, hf_labels + additional_labels
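# Caveat: labelling every Naver article NEUTRAL is a weak-label heuristic, and
# it mixes Korean news text with the dataset's English tweets; treat the
# resulting classifier as a rough demo rather than a calibrated model.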
# ๋ชจ๋ธ ์ดˆ๊ธฐํ™”
def initialize_model():
if os.path.exists("incremental_model.pkl") and os.path.exists("tfidf_vectorizer.pkl"):
model = joblib.load("incremental_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")
else:
# ์ดˆ๊ธฐ ๋ชจ๋ธ ๋ฐ ๋ฒกํ„ฐ๋ผ์ด์ € ์„ค์ •
model = SGDClassifier(loss='log_loss', max_iter=5, tol=None) # 'log_loss'๋กœ ์ˆ˜์ •
vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
return model, vectorizer
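# Note: a freshly constructed model/vectorizer is unfitted, so transform() and
# predict() raise until training has run at least once; the analysis section
# below guards against this.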
# ์ถ”๊ฐ€ ํ•™์Šต ์ˆ˜ํ–‰
def incremental_training(texts, labels, model, vectorizer):
X = vectorizer.fit_transform(texts)
y = [0 if label == "Democrat" else 1 if label == "Republican" else 2 for label in labels]
model.partial_fit(X, y, classes=[0, 1, 2]) # Incremental Learning
# ๋ชจ๋ธ ๋ฐ ๋ฒกํ„ฐ๋ผ์ด์ € ์ €์žฅ
joblib.dump(model, "incremental_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
return model, vectorizer
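# For fully order-independent incremental learning, a stateless
# HashingVectorizer avoids the one-shot vocabulary fit entirely. A minimal
# sketch, not wired into the app (n_features is an arbitrary choice):
#
#   from sklearn.feature_extraction.text import HashingVectorizer
#   hashing = HashingVectorizer(n_features=2**12, alternate_sign=False)
#   X = hashing.transform(texts)  # no fit step, so feature indices never shift
#   model.partial_fit(X, y, classes=[0, 1, 2])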
# GPT-4๋ฅผ ์ด์šฉํ•ด ๋ฐ˜๋Œ€ ๊ด€์  ๊ธฐ์‚ฌ ์ƒ์„ฑ
def generate_article_gpt4(prompt):
try:
response = openai.ChatCompletion.create(
model="gpt-4",
messages=[
{"role": "system", "content": "You are a helpful assistant that generates articles."},
{"role": "user", "content": prompt}
],
max_tokens=1024,
temperature=0.7
)
return response['choices'][0]['message']['content']
except Exception as e:
return f"Error generating text: {e}"
# Streamlit application entry point
st.title("📰 정치적 성향 분석 및 반대 관점 생성 도구")
st.markdown("네이버 뉴스와 허깅페이스 데이터를 활용하여 뉴스 성향을 분석하고, 반대 관점을 생성합니다.")

# Load data
huggingface_data = load_huggingface_data()
query = st.text_input("네이버 뉴스에서 검색할 키워드를 입력하세요", value="정치")
# ๋ฐ์ดํ„ฐ ๊ฒฐํ•ฉ ๋ฐ ํ•™์Šต
if st.button("๋ฐ์ดํ„ฐ ๊ฒฐํ•ฉ ๋ฐ ํ•™์Šต"):
texts, labels = combine_datasets(huggingface_data, fetch_naver_news(query))
model, vectorizer = initialize_model()
model, vectorizer = incremental_training(texts, labels, model, vectorizer)
# ์„ฑ๋Šฅ ํ‰๊ฐ€
X_test = vectorizer.transform(texts)
y_test = [0 if label == "Democrat" else 1 if label == "Republican" else 2 for label in labels]
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
st.write(f"๋ชจ๋ธ ์ •ํ™•๋„: {accuracy:.2f}")
st.text("๋ถ„๋ฅ˜ ๋ฆฌํฌํŠธ:")
st.text(classification_report(y_test, y_pred, target_names=["Democrat", "Republican", "NEUTRAL"]))
st.success("๋ชจ๋ธ์ด ์ƒˆ๋กœ์šด ๋ฐ์ดํ„ฐ๋กœ ์ถ”๊ฐ€ ํ•™์Šต๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
# ๋‰ด์Šค ๋ฐ์ดํ„ฐ ๋ถ„์„ ๋ฐ ๋ฐ˜๋Œ€ ๊ด€์  ๊ธฐ์‚ฌ ์ƒ์„ฑ
if st.button("๋‰ด์Šค ์„ฑํ–ฅ ๋ถ„์„"):
model, vectorizer = initialize_model()
news_items = fetch_naver_news(query, display=15) # ๋‰ด์Šค 15๊ฐœ ๊ฐ€์ ธ์˜ค๊ธฐ
if news_items:
st.subheader("๋‰ด์Šค ์„ฑํ–ฅ ๋ถ„์„ ๊ฒฐ๊ณผ")
for item in news_items:
title = item["title"]
description = item["description"]
link = item["link"]
combined_text = f"{title}. {description}"
# ํ…์ŠคํŠธ ๋ฒกํ„ฐํ™” ๋ฐ ์˜ˆ์ธก
vectorized_text = vectorizer.transform([combined_text])
prediction = model.predict(vectorized_text)[0]
sentiment = ["Democrat", "Republican", "NEUTRAL"][prediction]
# ๋ฐ˜๋Œ€ ๊ด€์  ์ƒ์„ฑ
opposite_perspective = "๋ณด์ˆ˜์ " if sentiment == "Democrat" else "์ง„๋ณด์ " if sentiment == "Republican" else "์ค‘๋ฆฝ์ "
prompt = f"๋‹ค์Œ ๊ธฐ์‚ฌ์˜ ๋ฐ˜๋Œ€ ๊ด€์ ์œผ๋กœ ๊ธฐ์‚ฌ๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”:\n\n{combined_text}\n\n๋ฐ˜๋Œ€ ๊ด€์ : {opposite_perspective}"
opposite_article = generate_article_gpt4(prompt)
st.write(f"**์ œ๋ชฉ:** {title}")
st.write(f"**๊ธฐ์‚ฌ ๋‚ด์šฉ:** {description}")
st.write(f"**์„ฑํ–ฅ:** {sentiment}")
st.write(f"**๋ฐ˜๋Œ€ ๊ด€์  ๊ธฐ์‚ฌ:** {opposite_article}")
st.write(f"**๋งํฌ:** [๊ธฐ์‚ฌ ๋งํฌ]({link})")
st.markdown("---")