mxiean committed on
Commit
db24cbc
·
verified ·
1 Parent(s): 3515dc3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -194
app.py CHANGED
@@ -1,9 +1,16 @@
1
  import streamlit as st
2
- from transformers import pipeline, AutoTokenizer
3
  import matplotlib.pyplot as plt
4
  from wordcloud import WordCloud
5
  import pandas as pd
6
  from datetime import datetime
 
 
 
 
 
 
 
7
 
8
  # Constants
9
  RATING_MAP = {
@@ -18,213 +25,121 @@ def load_models():
18
  "text-classification",
19
  model="AndrewLi403/CustomModel_tripadvisor_finetuned"
20
  )
21
- ner_model = pipeline("ner", model="dslim/bert-base-NER")
22
- tokenizer = AutoTokenizer.from_pretrained("AndrewLi403/CustomModel_tripadvisor_finetuned")
23
- return sentiment_model, ner_model, tokenizer
24
 
25
- def analyze_sentiment(text, model, tokenizer, chunk_size=400):
26
- tokens = tokenizer.tokenize(text)
27
-
28
- # Short text processing
29
- if len(tokens) <= 512:
30
- result = model(text)[0]
31
- rating = int(result['label'].split('_')[-1])
32
- return {
33
- 'rating': rating,
34
- 'label': RATING_MAP[rating],
35
- 'score': result['score']
36
- }
37
-
38
- # Long text chunk processing
39
- chunks = [tokens[i:i+chunk_size] for i in range(0, len(tokens), chunk_size)]
40
- results = []
41
-
42
- for chunk in chunks:
43
- chunk_text = tokenizer.convert_tokens_to_string(chunk)
44
- result = model(chunk_text)[0]
45
- results.append(result)
46
-
47
- # Aggregate results (majority vote + average confidence)
48
- final_label = max(set(r['label'] for r in results),
49
- key=lambda x: sum(1 for r in results if r['label'] == x))
50
- avg_score = sum(r['score'] for r in results) / len(results)
51
-
52
  return {
53
- 'rating': int(final_label.split('_')[-1]),
54
- 'label': RATING_MAP[int(final_label.split('_')[-1])],
55
- 'score': avg_score
56
  }
57
 
58
- def extract_aspects(text, model):
59
- entities = model(text)
60
- aspects = []
61
- current_entity = ""
62
 
63
- for entity in entities:
64
- if entity['word'].startswith('##'):
65
- current_entity += entity['word'][2:]
66
- else:
67
- if current_entity:
68
- aspects.append({
69
- 'entity': current_entity,
70
- 'type': prev_type
71
- })
72
- current_entity = entity['word']
73
- prev_type = entity['entity']
74
-
75
- if current_entity:
76
- aspects.append({
77
- 'entity': current_entity,
78
- 'type': prev_type
79
- })
80
-
81
- return [a for a in aspects if a['type'] in ['PRODUCT', 'ORG', 'PERSON']]
82
-
83
- def plot_sentiment_distribution(df):
84
- fig, ax = plt.subplots()
85
- counts = df['label'].value_counts()
86
-
87
- for rating in RATING_MAP.values():
88
- if rating not in counts.index:
89
- counts[rating] = 0
90
-
91
- counts = counts.loc[list(RATING_MAP.values())]
92
- counts.plot.pie(
93
- autopct='%1.1f%%',
94
- colors=['#ff9999','#66b3ff','#99ff99'],
95
- ax=ax
96
- )
97
- ax.set_ylabel('')
98
- return fig
99
-
100
- def plot_wordcloud(negative_reviews):
101
- text = " ".join(negative_reviews)
102
- wordcloud = WordCloud(
103
- width=800,
104
  height=400,
105
  background_color='white',
106
- colormap='Reds'
107
- ).generate(text)
 
108
 
109
  fig, ax = plt.subplots(figsize=(10, 5))
110
- ax.imshow(wordcloud, interpolation='bilinear')
111
  ax.axis('off')
112
  return fig
113
 
 
 
 
 
 
 
 
 
 
 
 
114
  def main():
115
- st.title("Restaurant Review Analyzer")
116
- st.markdown("Using fine-tuned model for sentiment and aspect analysis")
117
-
118
- sentiment_model, ner_model, tokenizer = load_models()
119
-
120
- st.sidebar.header("Analysis Options")
121
- analysis_mode = st.sidebar.radio(
122
- "Select Mode",
123
- ["Single Review", "Batch Analysis"]
124
- )
125
-
126
- if 'history' not in st.session_state:
127
- st.session_state.history = pd.DataFrame(
128
- columns=['text', 'rating', 'label', 'date', 'aspects']
129
- )
130
-
131
- if analysis_mode == "Single Review":
132
- user_input = st.text_area("Enter or paste a restaurant review:", height=150)
133
-
134
- if st.button("Analyze"):
135
- if user_input:
136
- with st.spinner("Analyzing..."):
137
- sentiment = analyze_sentiment(user_input, sentiment_model, tokenizer)
138
- aspects = extract_aspects(user_input, ner_model)
139
-
140
- new_entry = pd.DataFrame([{
141
- 'text': user_input,
142
- 'rating': sentiment['rating'],
143
- 'label': sentiment['label'],
144
- 'date': datetime.now(),
145
- 'aspects': aspects
146
- }])
147
- st.session_state.history = pd.concat(
148
- [st.session_state.history, new_entry],
149
- ignore_index=True
150
- )
151
-
152
- st.subheader("Analysis Results")
153
- col1, col2 = st.columns(2)
154
- with col1:
155
- st.metric("Rating", sentiment['label'])
156
- with col2:
157
- st.metric("Confidence", f"{sentiment['score']:.2f}")
158
-
159
- if aspects:
160
- st.subheader("Identified Aspects")
161
- for aspect in aspects:
162
- st.markdown(f"- **{aspect['type']}**: `{aspect['entity']}`")
163
- else:
164
- st.info("No specific entities identified")
165
- else:
166
- st.warning("Please enter a review")
167
-
168
- else:
169
- uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
170
-
171
- if uploaded_file:
172
- df = pd.read_csv(uploaded_file)
173
- if 'text' not in df.columns:
174
- st.error("CSV must contain 'text' column")
175
- else:
176
- if st.button("Analyze All"):
177
- progress_bar = st.progress(0)
178
- results = []
179
-
180
- for i, row in enumerate(df.itertuples()):
181
- sentiment = analyze_sentiment(row.text, sentiment_model, tokenizer)
182
- aspects = extract_aspects(row.text, ner_model)
183
-
184
- results.append({
185
- 'text': row.text,
186
- 'rating': sentiment['rating'],
187
- 'label': sentiment['label'],
188
- 'date': datetime.now(),
189
- 'aspects': aspects
190
- })
191
-
192
- progress_bar.progress((i + 1) / len(df))
193
-
194
- st.session_state.history = pd.concat(
195
- [st.session_state.history, pd.DataFrame(results)],
196
- ignore_index=True
197
- )
198
- st.success(f"Completed analysis of {len(df)} reviews")
199
-
200
- if not st.session_state.history.empty:
201
- st.divider()
202
- st.header("Analysis History")
203
-
204
- with st.expander("View Raw Data"):
205
- st.dataframe(st.session_state.history)
206
-
207
- st.subheader("Sentiment Distribution")
208
- fig1 = plot_sentiment_distribution(st.session_state.history)
209
- st.pyplot(fig1)
210
-
211
- negative_reviews = st.session_state.history[
212
- st.session_state.history['rating'] == 0
213
- ]['text'].tolist()
214
-
215
- if negative_reviews:
216
- st.subheader("Negative Reviews Word Cloud")
217
- fig2 = plot_wordcloud(negative_reviews)
218
- st.pyplot(fig2)
219
  else:
220
- st.info("No negative reviews yet")
221
-
222
- if len(st.session_state.history) > 1:
223
- st.subheader("Rating Trend Over Time")
224
- time_df = st.session_state.history.copy()
225
- time_df['date'] = pd.to_datetime(time_df['date'])
226
- time_df = time_df.set_index('date').resample('D')['rating'].mean()
227
- st.line_chart(time_df)
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  if __name__ == "__main__":
230
  main()
 
1
  import streamlit as st
2
+ from transformers import pipeline
3
  import matplotlib.pyplot as plt
4
  from wordcloud import WordCloud
5
  import pandas as pd
6
  from datetime import datetime
7
+ from collections import Counter
8
+ import re
9
+ from nltk.corpus import stopwords
10
+ import nltk
11
+
12
# Fetch the NLTK stop-word corpus only when it is not already installed.
# Streamlit re-executes this script on every interaction, so an unconditional
# nltk.download() would be invoked on each rerun.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
14
 
15
  # Constants
16
  RATING_MAP = {
 
25
  "text-classification",
26
  model="AndrewLi403/CustomModel_tripadvisor_finetuned"
27
  )
28
+ return sentiment_model
 
 
29
 
30
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, built once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Clean and tokenize English text into keyword candidates.

    Args:
        text: Raw review text.
        stop_words: Optional collection of lower-case words to discard.
            When omitted, the NLTK English stop-word list is used.

    Returns:
        A list of lower-case tokens in their original order, with
        punctuation removed, excluding stop words and tokens of two
        characters or fewer.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # Treat the typographic apostrophe like the ASCII one.
    lowered = text.lower().replace("\u2019", "'")
    # Keep apostrophes during the first pass: stop-word lists store
    # contractions with them ("don't"), so stripping them up front would let
    # "dont" / "wasnt" slip through as keywords.
    cleaned = re.sub(r"[^\w\s']", '', lowered)

    words = []
    for token in cleaned.split():
        token = token.strip("'")  # drop quote marks wrapped around a word
        if token in stop_words:
            continue
        word = token.replace("'", '')
        if len(word) > 2 and word not in stop_words:
            words.append(word)
    return words
42
+
43
def analyze_sentiment(text, model):
    """Classify a review and map the model label onto a rating.

    Args:
        text: Review text to classify.
        model: Callable text-classification pipeline; it is expected to
            return a list whose first item has 'label' and 'score' keys.

    Returns:
        dict with 'rating' (int parsed from the label), 'label' (the
        RATING_MAP entry for that rating) and 'score' (model confidence).

    Raises:
        KeyError: If the parsed rating is not a key of RATING_MAP.
        ValueError: If the label does not end in an integer.
    """
    # Ask the tokenizer to truncate: this function no longer splits long
    # reviews into chunks, so an over-length review would otherwise reach the
    # model untruncated.
    result = model(text, truncation=True)[0]
    # Assumes labels look like "LABEL_<n>" -- TODO confirm against the
    # fine-tuned model's config.
    rating = int(result['label'].split('_')[-1])
    return {
        'rating': rating,
        'label': RATING_MAP[rating],
        'score': result['score'],
    }
51
 
52
def generate_wordcloud(text, sentiment):
    """Generate a word-cloud figure from English text.

    Args:
        text: Review text; it is tokenized with preprocess_text().
        sentiment: Mapping with a 'rating' key. A rating of 0 selects the
            'Reds' colormap, any other value selects 'Greens'.

    Returns:
        A matplotlib Figure containing the word cloud, or a placeholder
        message when the text yields no keywords.
    """
    word_freq = Counter(preprocess_text(text))

    fig, ax = plt.subplots(figsize=(10, 5))

    if not word_freq:
        # WordCloud rejects an empty frequency table, which happens for very
        # short reviews or reviews made up only of stop words.
        ax.text(0.5, 0.5, "No keywords to display",
                ha='center', va='center', fontsize=16)
        ax.axis('off')
        return fig

    wc = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='Reds' if sentiment['rating'] == 0 else 'Greens',
        collocations=False  # Better for single documents
    ).generate_from_frequencies(word_freq)

    ax.imshow(wc, interpolation='bilinear')
    ax.axis('off')
    return fig
69
 
70
def display_top_keywords(text, n=10):
    """Show the most frequent keywords of a review as Streamlit metrics.

    Args:
        text: Review text; it is tokenized with preprocess_text().
        n: Maximum number of keywords to display.
    """
    top_words = Counter(preprocess_text(text)).most_common(n)

    if not top_words:
        # Avoid rendering a header with nothing underneath it.
        st.subheader(f"Top {n} Keywords")
        st.info("No keywords found in this review")
        return

    # Report how many keywords are actually shown; short reviews may have
    # fewer than n distinct keywords.
    st.subheader(f"Top {len(top_words)} Keywords")
    cols = st.columns(2)
    for i, (word, count) in enumerate(top_words):
        # Alternate between the two columns.
        cols[i % 2].metric(f"{word.title()}", f"{count} mentions")
80
+
81
def _render_history():
    """Show the five most recent analyses stored in the session, if any."""
    history = st.session_state.get('history')
    if not history:
        return

    st.divider()
    with st.expander("Recent Analyses (Last 5)"):
        history_df = pd.DataFrame(history[-5:])
        st.dataframe(
            history_df,
            column_config={
                "text": "Review Excerpt",
                # NOTE(review): the rating is shown as a star count, but
                # generate_wordcloud treats rating 0 as the negative class --
                # confirm that "<n> ⭐" is the intended display.
                "rating": st.column_config.NumberColumn(
                    "Rating",
                    format="%d ⭐",
                ),
                "date": "Analyzed At"
            },
            hide_index=True
        )


def main():
    """Run the Streamlit page: analyze one review and show recent history."""
    st.title("Tripadvisor Hotel Review Analyzer")
    st.markdown("Instant sentiment and keyword analysis for English reviews")

    # Keep the loaded model in the session so reruns reuse it.
    if 'model' not in st.session_state:
        st.session_state.model = load_models()

    user_input = st.text_area("Paste your English review here:", height=150)

    if st.button("Analyze Review"):
        # Whitespace-only input counts as empty; it carries no review text
        # to classify or to extract keywords from.
        review = user_input.strip()
        if review:
            with st.spinner("Analyzing..."):
                # Sentiment analysis
                sentiment = analyze_sentiment(review, st.session_state.model)

                # Display results
                st.subheader("Analysis Results")
                col1, col2 = st.columns(2)
                with col1:
                    st.metric("Overall Rating", sentiment['label'])
                with col2:
                    st.metric("Confidence Score", f"{sentiment['score']:.0%}")

                # Generate visualizations
                st.subheader("Keyword Visualization")
                tab1, tab2 = st.tabs(["Word Cloud", "Top Keywords"])

                with tab1:
                    fig = generate_wordcloud(review, sentiment)
                    st.pyplot(fig)
                    # pyplot keeps every figure it creates registered until
                    # it is closed; release it so reruns do not accumulate
                    # figures.
                    plt.close(fig)

                with tab2:
                    display_top_keywords(review)

                # Store in session history (excerpt limited to 100 characters)
                if 'history' not in st.session_state:
                    st.session_state.history = []
                excerpt = review[:100] + "..." if len(review) > 100 else review
                st.session_state.history.append({
                    'text': excerpt,
                    'rating': sentiment['rating'],
                    'date': datetime.now().strftime("%Y-%m-%d %H:%M")
                })
        else:
            st.warning("Please enter a review to analyze")

    # Display history if exists
    _render_history()
143
 
144
  if __name__ == "__main__":
145
  main()