Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import streamlit as st
|
2 |
-
from transformers import pipeline
|
3 |
import matplotlib.pyplot as plt
|
4 |
from wordcloud import WordCloud
|
5 |
import pandas as pd
|
@@ -19,15 +19,40 @@ def load_models():
|
|
19 |
model="AndrewLi403/CustomModel_tripadvisor_finetuned"
|
20 |
)
|
21 |
ner_model = pipeline("ner", model="dslim/bert-base-NER")
|
22 |
-
|
|
|
23 |
|
24 |
-
def analyze_sentiment(text, model):
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
return {
|
28 |
-
'rating':
|
29 |
-
'label': RATING_MAP[
|
30 |
-
'score':
|
31 |
}
|
32 |
|
33 |
def extract_aspects(text, model):
|
@@ -57,22 +82,16 @@ def extract_aspects(text, model):
|
|
57 |
|
58 |
def plot_sentiment_distribution(df):
|
59 |
fig, ax = plt.subplots()
|
60 |
-
|
61 |
-
# Get counts for all possible ratings
|
62 |
counts = df['label'].value_counts()
|
63 |
|
64 |
-
# Ensure all rating categories are present (even with 0 counts)
|
65 |
for rating in RATING_MAP.values():
|
66 |
if rating not in counts.index:
|
67 |
counts[rating] = 0
|
68 |
|
69 |
-
# Sort by the predefined rating order
|
70 |
counts = counts.loc[list(RATING_MAP.values())]
|
71 |
-
|
72 |
-
# Plot with consistent colors
|
73 |
counts.plot.pie(
|
74 |
autopct='%1.1f%%',
|
75 |
-
colors=['#ff9999','#66b3ff','#99ff99'],
|
76 |
ax=ax
|
77 |
)
|
78 |
ax.set_ylabel('')
|
@@ -96,7 +115,7 @@ def main():
|
|
96 |
st.title("Restaurant Review Analyzer")
|
97 |
st.markdown("Using fine-tuned model for sentiment and aspect analysis")
|
98 |
|
99 |
-
sentiment_model, ner_model = load_models()
|
100 |
|
101 |
st.sidebar.header("Analysis Options")
|
102 |
analysis_mode = st.sidebar.radio(
|
@@ -115,7 +134,7 @@ def main():
|
|
115 |
if st.button("Analyze"):
|
116 |
if user_input:
|
117 |
with st.spinner("Analyzing..."):
|
118 |
-
sentiment = analyze_sentiment(user_input, sentiment_model)
|
119 |
aspects = extract_aspects(user_input, ner_model)
|
120 |
|
121 |
new_entry = pd.DataFrame([{
|
@@ -159,7 +178,7 @@ def main():
|
|
159 |
results = []
|
160 |
|
161 |
for i, row in enumerate(df.itertuples()):
|
162 |
-
sentiment = analyze_sentiment(row.text, sentiment_model)
|
163 |
aspects = extract_aspects(row.text, ner_model)
|
164 |
|
165 |
results.append({
|
|
|
1 |
import streamlit as st
|
2 |
+
from transformers import pipeline, AutoTokenizer
|
3 |
import matplotlib.pyplot as plt
|
4 |
from wordcloud import WordCloud
|
5 |
import pandas as pd
|
|
|
19 |
model="AndrewLi403/CustomModel_tripadvisor_finetuned"
|
20 |
)
|
21 |
ner_model = pipeline("ner", model="dslim/bert-base-NER")
|
22 |
+
tokenizer = AutoTokenizer.from_pretrained("AndrewLi403/CustomModel_tripadvisor_finetuned")
|
23 |
+
return sentiment_model, ner_model, tokenizer
|
24 |
|
25 |
+
def _rating_from_label(label):
    """Return the integer after the last underscore of a pipeline label.

    For example ``"LABEL_3"`` yields ``3``. Raises ``ValueError`` if the
    suffix is not an integer.
    """
    return int(label.split('_')[-1])


def analyze_sentiment(text, model, tokenizer, chunk_size=400, max_tokens=510):
    """Classify the sentiment of ``text``, chunking it when it is too long.

    The text is tokenized with ``tokenizer``. If it fits within
    ``max_tokens`` it is classified in a single call to ``model``.
    Otherwise the token list is cut into consecutive pieces, each piece is
    turned back into a string and classified on its own, and the per-chunk
    predictions are combined.

    Args:
        text: The review text to classify.
        model: Callable taking a string and returning a list whose first
            item is a dict with ``'label'`` (ending in ``_<int>``) and
            ``'score'`` keys.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)``.
        chunk_size: Number of tokens per chunk for long texts. Values
            larger than ``max_tokens`` are capped to ``max_tokens``.
        max_tokens: Largest token count (excluding special tokens) that is
            sent to the model in one call. The default of 510 assumes a
            512-position model that adds two special tokens per input --
            NOTE(review): confirm against the actual checkpoint.

    Returns:
        A dict with:
            ``'rating'``: integer parsed from the winning label,
            ``'label'``: ``RATING_MAP[rating]`` (``RATING_MAP`` is a
                module-level mapping defined elsewhere in this file),
            ``'score'``: the model score for short texts, or the mean score
                over *all* chunks for long texts (regardless of which label
                each chunk predicted).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    tokens = tokenizer.tokenize(text)

    # Short text: a single pass is enough. The limit leaves room for the
    # special tokens the pipeline adds when it encodes the text; comparing
    # against the raw model length would let 511/512-token inputs overflow.
    if len(tokens) <= max_tokens:
        result = model(text)[0]
        rating = _rating_from_label(result['label'])
        return {
            'rating': rating,
            'label': RATING_MAP[rating],
            'score': result['score']
        }

    # Long text: classify fixed-size token windows independently. The window
    # is capped so an oversized chunk_size cannot exceed the model limit.
    step = min(chunk_size, max_tokens)
    results = [
        model(tokenizer.convert_tokens_to_string(tokens[start:start + step]))[0]
        for start in range(0, len(tokens), step)
    ]

    # Tally (vote count, summed score) per label in one pass. Dict insertion
    # order makes the outcome reproducible from run to run.
    tally = {}
    for result in results:
        votes, total = tally.get(result['label'], (0, 0.0))
        tally[result['label']] = (votes + 1, total + result['score'])

    # Majority vote; ties are broken by the higher summed score, then by the
    # label that appeared first in the text.
    final_label = max(tally, key=tally.get)
    avg_score = sum(r['score'] for r in results) / len(results)

    rating = _rating_from_label(final_label)
    return {
        'rating': rating,
        'label': RATING_MAP[rating],
        'score': avg_score
    }
|
57 |
|
58 |
def extract_aspects(text, model):
|
|
|
82 |
|
83 |
def plot_sentiment_distribution(df):
|
84 |
fig, ax = plt.subplots()
|
|
|
|
|
85 |
counts = df['label'].value_counts()
|
86 |
|
|
|
87 |
for rating in RATING_MAP.values():
|
88 |
if rating not in counts.index:
|
89 |
counts[rating] = 0
|
90 |
|
|
|
91 |
counts = counts.loc[list(RATING_MAP.values())]
|
|
|
|
|
92 |
counts.plot.pie(
|
93 |
autopct='%1.1f%%',
|
94 |
+
colors=['#ff9999','#66b3ff','#99ff99'],
|
95 |
ax=ax
|
96 |
)
|
97 |
ax.set_ylabel('')
|
|
|
115 |
st.title("Restaurant Review Analyzer")
|
116 |
st.markdown("Using fine-tuned model for sentiment and aspect analysis")
|
117 |
|
118 |
+
sentiment_model, ner_model, tokenizer = load_models()
|
119 |
|
120 |
st.sidebar.header("Analysis Options")
|
121 |
analysis_mode = st.sidebar.radio(
|
|
|
134 |
if st.button("Analyze"):
|
135 |
if user_input:
|
136 |
with st.spinner("Analyzing..."):
|
137 |
+
sentiment = analyze_sentiment(user_input, sentiment_model, tokenizer)
|
138 |
aspects = extract_aspects(user_input, ner_model)
|
139 |
|
140 |
new_entry = pd.DataFrame([{
|
|
|
178 |
results = []
|
179 |
|
180 |
for i, row in enumerate(df.itertuples()):
|
181 |
+
sentiment = analyze_sentiment(row.text, sentiment_model, tokenizer)
|
182 |
aspects = extract_aspects(row.text, ner_model)
|
183 |
|
184 |
results.append({
|