import streamlit as st
import requests
import openai
import os
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
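
# Streamlit app: fetch Naver News articles, combine them with the Hugging Face
# "jacobvs/PoliticalTweets" dataset, incrementally train a TF-IDF + SGDClassifier
# political-orientation model, and ask GPT-4 to write an opposing-perspective
# article for each news item.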

st.set_page_config(page_title="Political Orientation Analysis and Opposing Perspective Generation", page_icon="📰", layout="wide")

# The OpenAI API key is read from the environment.
openai.api_key = os.getenv("OPENAI_API_KEY")


@st.cache_data
def load_huggingface_data():
    """Load the political tweets dataset from the Hugging Face Hub (cached by Streamlit)."""
    dataset = load_dataset("jacobvs/PoliticalTweets")
    return dataset


def fetch_naver_news(query, display=15):
    """Fetch recent news items for `query` from the Naver News search API."""
    client_id = "I_8koTJh3R5l4wLurQbG"
    client_secret = "W5oWYlAgur"

    url = "https://openapi.naver.com/v1/search/news.json"
    headers = {
        "X-Naver-Client-Id": client_id,
        "X-Naver-Client-Secret": client_secret,
    }
    params = {
        "query": query,
        "display": display,
        "start": 1,
        "sort": "date",
    }

    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        return response.json()['items']
    else:
        st.error("Failed to fetch news data.")
        return []


def combine_datasets(huggingface_data, naver_data):
    """Append Naver news texts (labelled NEUTRAL) to the Hugging Face training split."""
    additional_texts = [item['title'] + ". " + item['description'] for item in naver_data]
    additional_labels = ["NEUTRAL"] * len(additional_texts)
    hf_texts = huggingface_data['train']['text']
    hf_labels = huggingface_data['train']['party']
    return hf_texts + additional_texts, hf_labels + additional_labels


def initialize_model():
    """Load a previously saved model/vectorizer pair, or create fresh (unfitted) ones."""
    if os.path.exists("incremental_model.pkl") and os.path.exists("tfidf_vectorizer.pkl"):
        model = joblib.load("incremental_model.pkl")
        vectorizer = joblib.load("tfidf_vectorizer.pkl")
    else:
        model = SGDClassifier(loss='log_loss', max_iter=5, tol=None)
        vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
    return model, vectorizer


def incremental_training(texts, labels, model, vectorizer):
    """Run one partial_fit pass over the given texts and persist model and vectorizer."""
    # Fit the vectorizer only once; refitting it on every batch would change the
    # feature space and invalidate the previously trained model.
    if hasattr(vectorizer, "vocabulary_"):
        X = vectorizer.transform(texts)
    else:
        X = vectorizer.fit_transform(texts)
    y = [0 if label == "Democrat" else 1 if label == "Republican" else 2 for label in labels]
    model.partial_fit(X, y, classes=[0, 1, 2])

    joblib.dump(model, "incremental_model.pkl")
    joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
    return model, vectorizer


def generate_article_gpt4(prompt):
    """Generate an article with GPT-4 (uses the legacy pre-1.0 openai SDK interface)."""
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that generates articles."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=1024,
            temperature=0.7
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        return f"Error generating text: {e}"
st.title("📰 Political Orientation Analysis and Opposing Perspective Generation Tool")
st.markdown("Analyzes the political orientation of news articles using Naver News and Hugging Face data, and generates opposing perspectives.")

huggingface_data = load_huggingface_data()
query = st.text_input("Enter a keyword to search on Naver News", value="politics")

if st.button("Combine Data and Train"):
    texts, labels = combine_datasets(huggingface_data, fetch_naver_news(query))
    model, vectorizer = initialize_model()
    model, vectorizer = incremental_training(texts, labels, model, vectorizer)

    # Evaluation is run on the same data used for training, so the score is optimistic.
    X_test = vectorizer.transform(texts)
    y_test = [0 if label == "Democrat" else 1 if label == "Republican" else 2 for label in labels]
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    st.write(f"Model accuracy: {accuracy:.2f}")
    st.text("Classification report:")
    st.text(classification_report(y_test, y_pred, labels=[0, 1, 2], target_names=["Democrat", "Republican", "NEUTRAL"]))
    st.success("The model has been incrementally trained on the new data.")

if st.button("Analyze News Orientation"):
    model, vectorizer = initialize_model()
    news_items = fetch_naver_news(query, display=15)

    # A freshly created model/vectorizer cannot predict yet; require a prior training run.
    if not hasattr(vectorizer, "vocabulary_"):
        st.error('No trained model found. Please run "Combine Data and Train" first.')
    elif news_items:
        st.subheader("News Orientation Analysis Results")
        for item in news_items:
            title = item["title"]
            description = item["description"]
            link = item["link"]
            combined_text = f"{title}. {description}"

            vectorized_text = vectorizer.transform([combined_text])
            prediction = model.predict(vectorized_text)[0]
            sentiment = ["Democrat", "Republican", "NEUTRAL"][prediction]

            opposite_perspective = "conservative" if sentiment == "Democrat" else "progressive" if sentiment == "Republican" else "neutral"
            prompt = f"Write an article presenting the opposite perspective of the following article:\n\n{combined_text}\n\nOpposite perspective: {opposite_perspective}"
            opposite_article = generate_article_gpt4(prompt)

            st.write(f"**Title:** {title}")
            st.write(f"**Article content:** {description}")
            st.write(f"**Orientation:** {sentiment}")
            st.write(f"**Opposing-perspective article:** {opposite_article}")
            st.write(f"**Link:** [Article link]({link})")
            st.markdown("---")