# Streamlit app: YouTube comments sentiment analysis.
# Source listing size: 5,952 bytes. The per-line commit-hash column and the
# line-number gutter from the web file viewer were removed; they are not code.
import re
import time

import pandas as pd
import plotly.express as px
import streamlit as st
import torch
import transformers
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.chrome import ChromeType
st.subheader("YouTube Comments Sentiment Analysis", divider="red")

_MODEL_NAME = "tabularisai/robust-sentiment-analysis"


@st.cache_resource
def _load_sentiment_model():
    """Load the tokenizer and the sequence classifier a single time.

    Streamlit re-executes the whole script on every widget interaction.
    Caching the pair as a resource keeps both ``from_pretrained`` calls from
    running again on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple for ``_MODEL_NAME``.
    """
    loaded_tokenizer = transformers.DistilBertTokenizer.from_pretrained(_MODEL_NAME)
    loaded_model = transformers.DistilBertForSequenceClassification.from_pretrained(_MODEL_NAME)
    return loaded_tokenizer, loaded_model


tokenizer, model = _load_sentiment_model()

# Per-session counter; created on the first run of a session only.
if 'url_count' not in st.session_state:
    st.session_state['url_count'] = 0

# Upper bound that url_count is compared against before an analysis is run.
max_attempts = 2
def update_url_count():
    """Increment the session's ``url_count`` counter by one."""
    current = st.session_state['url_count']
    st.session_state['url_count'] = current + 1
def clear_question():
    """Reset the ``url`` entry in session state to an empty string."""
    empty_value = ""
    st.session_state["url"] = empty_value
# The text box is keyed as "url", the same session-state entry that
# clear_question overwrites with an empty string.
url = st.text_input(label="Enter YouTube URL:", key="url")

# clear_question runs as the button's on_click callback.
st.button(label="Clear question", on_click=clear_question)
# Matches relative ages such as "3 days ago" or "1 year ago".
_RELATIVE_DATE_RE = re.compile(r'\d+ (day|week|month|year)s? ago')

# Number of 500px scroll steps performed before comments are collected.
_SCROLL_STEPS = 30

_CHROME_FLAGS = (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--start-maximized",
)


def _scrape_comments(page_url):
    """Open *page_url* in headless Chromium, scroll, and collect comment text.

    Scrolling and extraction problems are reported with ``st.error`` and end
    the respective phase early; whatever was collected so far is returned.
    An exception raised while starting the page load propagates to the
    caller, but the browser is always shut down first.

    Args:
        page_url: Address handed to ``driver.get``.
            NOTE(review): this is user-supplied and not validated here —
            consider restricting it to YouTube hosts.

    Returns:
        A list of ``{"Comment": str, "comment_date": str | None}`` dicts,
        one per element matching ``#content #content-text``.
    """
    options = Options()
    for flag in _CHROME_FLAGS:
        options.add_argument(flag)
    service = Service(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install())
    driver = webdriver.Chrome(service=service, options=options)

    data = []
    try:
        wait = WebDriverWait(driver, 30)
        driver.get(page_url)

        placeholder = st.empty()
        progress_bar = st.progress(0)
        for step in range(1, _SCROLL_STEPS + 1):
            try:
                driver.execute_script("window.scrollBy(0, 500);")
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#content #content-text")))
                placeholder.text(f"Scrolled {step} times")
                progress_bar.progress(step / _SCROLL_STEPS)
                time.sleep(1)  # give dynamically loaded comments time to render
            except Exception as e:
                st.error(f"Exception during scrolling: {e}")
                break
        placeholder.text("Scrolling complete.")
        progress_bar.empty()

        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#contents #contents")))
            for comment in driver.find_elements(By.CSS_SELECTOR, "#content #content-text"):
                # Read the text once; every access is a round trip to the browser.
                comment_text = comment.text
                date_match = _RELATIVE_DATE_RE.search(comment_text)
                data.append({
                    "Comment": comment_text,
                    "comment_date": date_match.group(0) if date_match else None,
                })
        except Exception as e:
            st.error(f"Exception during comment extraction: {e}")
    finally:
        # Always release the browser process, even if the page load failed.
        driver.quit()
    return data


def _classify_comments(comments):
    """Label each comment with the model's most likely sentiment class.

    Args:
        comments: Non-empty list of comment strings.

    Returns:
        A DataFrame with columns ``Review Number`` (1-based position) and
        ``Sentiment`` (label from ``model.config.id2label``), in input order.
    """
    inputs = tokenizer(comments, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax is monotonic, so the argmax of the logits is the argmax of the
    # probabilities; the probabilities themselves are not needed.
    label_ids = logits.argmax(dim=-1).tolist()
    return pd.DataFrame(
        [
            {'Review Number': number, 'Sentiment': model.config.id2label[label_id]}
            for number, label_id in enumerate(label_ids, start=1)
        ],
        columns=['Review Number', 'Sentiment'],
    )


if st.button("Sentiment Analysis", type="secondary"):
    if st.session_state['url_count'] >= max_attempts:
        st.warning(f"You have reached the maximum URL attempts ({max_attempts}).")
    elif not url:
        st.warning("Please enter a URL.")
    else:
        # Count this attempt so the max_attempts limit above can take effect.
        update_url_count()
        with st.spinner("Wait for it...", show_time=True):
            df = pd.DataFrame(_scrape_comments(url), columns=["Comment", "comment_date"])
            if df.empty:
                st.warning("No comments were scraped. Sentiment analysis could not be performed.")
            else:
                st.dataframe(df)
                sentiment_df = _classify_comments(df['Comment'].tolist())
                final_df = (
                    sentiment_df['Sentiment']
                    .value_counts()
                    .rename_axis('Sentiment')
                    .reset_index(name='count')
                )
                # Both frames carry the default 0..n-1 index, so a column-wise
                # concat lines each comment up with its label.
                result = pd.concat([df, sentiment_df], axis=1)

                tab1, tab2 = st.tabs(["Pie Chart", "Bar Chart"])
                with tab1:
                    fig1 = px.pie(final_df, values='count', names='Sentiment', hover_data=['count'], labels={'count': 'count'})
                    fig1.update_traces(textposition='inside', textinfo='percent+label')
                    st.plotly_chart(fig1)
                    st.dataframe(result)
                with tab2:
                    # Plot comments per sentiment; comment_date is None or free
                    # text and is not a usable y-axis.
                    fig2 = px.bar(final_df, x="Sentiment", y="count", color="Sentiment")
                    st.plotly_chart(fig2)

                csv = result.to_csv(index=False)
                st.download_button(label="Download data as CSV", data=csv, file_name='Summary of the results.csv', mime='text/csv')
# Display the current value of the session's url_count counter.
if 'url_count' in st.session_state:
    st.write(f"URL pasted {st.session_state['url_count']} times.")