import streamlit as st
import tensorflow as tf
import numpy as np
import json
from transformers import BertTokenizer, TFAutoModel
import requests
from bs4 import BeautifulSoup


PATH = './checkpoint-7500/'  # local fine-tuned BERT checkpoint directory
SEQ_LEN = 128  # maximum token sequence length
# Assumes the checkpoint was fine-tuned from multilingual BERT, so the
# stock multilingual tokenizer matches the one used during training
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def create_sentiment_bert():
    # Load the fine-tuned BERT model from the local checkpoint
    model = TFAutoModel.from_pretrained(PATH, local_files_only=True)
    # Define the token, mask, and segment inputs
    token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
    mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
    segment_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')
    # Build a model whose inputs are [tokens, masks, segments]
    bert_outputs = model([token_inputs, mask_inputs, segment_inputs])

    # Index 1 is the pooled [CLS] output, used for sentence-level classification
    bert_outputs = bert_outputs[1]
    sentiment_first = tf.keras.layers.Dense(
        1, activation='sigmoid',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(bert_outputs)
    sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)

    sentiment_model.compile(loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
    return sentiment_model
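
# Streamlit re-executes this whole script on every interaction, so building
# the model inside main() reloads the checkpoint on each button click. A
# minimal caching sketch, assuming Streamlit >= 1.18 (st.cache_resource);
# main() below could call load_sentiment_model() instead of
# create_sentiment_bert():
@st.cache_resource
def load_sentiment_model():
    # Built once per process; subsequent reruns reuse the cached model
    return create_sentiment_bert()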

def sentence_convert_data(data):
    tokens, masks, segments = [], [], []
    token = tokenizer.encode(data, max_length=SEQ_LEN, truncation=True, padding='max_length')

    # The pad token id is 0, so the count of zeros gives the padding length
    num_zeros = token.count(0)
    mask = [1] * (SEQ_LEN - num_zeros) + [0] * num_zeros
    segment = [0] * SEQ_LEN

    tokens.append(token)
    segments.append(segment)
    masks.append(mask)

    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    # Each array has shape (1, SEQ_LEN): a single-sentence batch
    return [tokens, masks, segments]

def movie_evaluation_predict(sentence):
    data_x = sentence_convert_data(sentence)
    predict = sentiment_model.predict(data_x)
    predict_value = np.ravel(predict)
    # Round the sigmoid output: 0 = negative, 1 = positive
    predict_answer = np.round(predict_value, 0).item()
    return predict_answer
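
# main() below calls movie_evaluation_predict() once per comment, which is
# slow for long comment lists. A hypothetical batched variant (not wired into
# main()) that encodes all sentences and runs a single forward pass:
def predict_comments_batch(sentences):
    encoded = [sentence_convert_data(s) for s in sentences]
    tokens = np.concatenate([e[0] for e in encoded])
    masks = np.concatenate([e[1] for e in encoded])
    segments = np.concatenate([e[2] for e in encoded])
    predictions = sentiment_model.predict([tokens, masks, segments])
    # One rounded 0/1 label per input sentence
    return np.round(np.ravel(predictions)).astype(int)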

def get_comments(news_url):
    # Extract the office id (oid) and article id (aid) from the URL
    parts = news_url.split("/")
    oid = parts[-2]
    aid = parts[-1]
    if len(aid) > 10:
        # Article ids are 10 digits; drop any trailing query string
        aid = aid[:10]

    # Build the comment API URL
    api_url = "https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json"
    params = {
        "ticket": "news",
        "templateId": "default_society",
        "pool": "cbox5",
        "lang": "ko",
        "country": "KR",
        "objectId": f"news{oid},{aid}",
        "pageSize": 100,
        "indexSize": 10,
        "page": 1,
        "sort": "FAVORITE"  # 'NEW' (newest first), 'FAVORITE' (most liked)
    }

    headers = {
        "User-Agent": "Mozilla/5.0",
        "Referer": news_url
    }

    # Call the API and strip the JSONP wrapper to get plain JSON
    response = requests.get(api_url, params=params, headers=headers)
    content = response.text.replace("_callback(", "").replace(");", "")
    json_data = json.loads(content)

    response = requests.get(news_url)
    article_soup = BeautifulSoup(response.text, "html.parser")

    # Extract the title, falling back to the older page layout
    title = article_soup.select_one("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2")
    if title is None:
        title = article_soup.select_one("#content > div.end_ct > div > h2")

    # Extract the article body, falling back to the older page layout
    # (assumes at least one selector matches the page)
    article = article_soup.select_one("#dic_area")
    if article is None:
        article = article_soup.select_one("#articeBody")

    return title.text.strip(), article.text.strip(), processing_data(json_data['result']['commentList'])
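
# The comment API is paged via the "page" and "pageSize" params used above.
# A hypothetical helper for collecting several pages, assuming every page
# keeps the same result/commentList response shape:
def get_comments_paged(api_url, params, headers, max_pages=3):
    all_comments = []
    for page in range(1, max_pages + 1):
        params["page"] = page
        resp = requests.get(api_url, params=params, headers=headers)
        body = resp.text.replace("_callback(", "").replace(");", "")
        data = json.loads(body)
        all_comments.extend(processing_data(data['result']['commentList']))
    return all_comments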

def processing_data(comments):
    # Keep only the comment text, dropping empty entries
    comment_list = [comment['contents'] for comment in comments]
    return [x for x in comment_list if x]


def main():
    global sentiment_model
    sentiment_model = create_sentiment_bert()
    st.title("Comment Filtering Service")

    # Get the article URL from the user
    url = st.text_input("Enter a news URL")

    if st.button("Start scraping"):
        if url:
            title, content, comments = get_comments(url)

            # Display the results
            st.subheader("Article Title")
            st.write(title)

            st.subheader("Article Body")
            st.write(content)

            st.subheader("Comments")
            # Show only the comments the model classifies as positive
            for comment in comments:
                if movie_evaluation_predict(comment) == 1:
                    st.write(comment)

if __name__ == "__main__":
    main()
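
# To launch the app locally (assuming this file is saved as app.py, a
# hypothetical filename):
#   streamlit run app.py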