mxiean committed on
Commit
f130149
·
verified ·
1 Parent(s): ad0b640

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +136 -159
app.py CHANGED
@@ -1,10 +1,9 @@
1
  import streamlit as st
2
- from transformers import pipeline, AutoTokenizer
3
  import matplotlib.pyplot as plt
4
  from wordcloud import WordCloud
5
  import pandas as pd
6
  from datetime import datetime
7
- from tqdm import tqdm
8
 
9
  # Constants
10
  RATING_MAP = {
@@ -15,88 +14,89 @@ RATING_MAP = {
15
 
16
  @st.cache_resource
17
  def load_models():
18
- # Load tokenizer first
19
- tokenizer = AutoTokenizer.from_pretrained("AndrewLi403/CustomModel_tripadvisor_finetuned")
20
-
21
- # Load models with proper tokenizer
22
  sentiment_model = pipeline(
23
  "text-classification",
24
- model="AndrewLi403/CustomModel_tripadvisor_finetuned",
25
- tokenizer=tokenizer,
26
- device=0 if st.secrets.get("USE_GPU", False) else -1
27
  )
28
  ner_model = pipeline("ner", model="dslim/bert-base-NER")
29
- return sentiment_model, ner_model, tokenizer
30
 
31
def safe_analyze_sentiment(text, model, tokenizer):
    """Classify one review without ever raising.

    Args:
        text: Review text to classify.
        model: Text-classification pipeline (or any callable with the same
            contract). It is called as
            ``model(text, truncation=True, max_length=512)`` and must return
            a list whose first item has ``'label'`` (ending in ``_<int>``)
            and ``'score'``.
        tokenizer: Unused. Kept so existing callers that pass the tokenizer
            keep working; the pipeline tokenizes internally.

    Returns:
        dict: ``{'rating', 'label', 'score', 'error'}``. On success ``error``
        is ``None``. On any failure ``rating`` is -1, ``label`` is
        ``"Error"``, ``score`` is 0.0 and ``error`` holds the message.
    """
    try:
        # A pipeline expects raw text. Passing pre-tokenized tensors via
        # model(**inputs) fails on every call, so every review used to be
        # reported as an error. Truncation is requested from the pipeline
        # instead, keeping the 512-token cap the manual tokenization had.
        result = model(text, truncation=True, max_length=512)[0]
        rating = int(result['label'].split('_')[-1])
        return {
            'rating': rating,
            'label': RATING_MAP[rating],
            'score': result['score'],
            'error': None
        }
    except Exception as e:
        # Deliberate best-effort: callers inspect 'error' instead of
        # handling exceptions, so every failure is folded into the result.
        return {
            'rating': -1,
            'label': "Error",
            'score': 0.0,
            'error': str(e)
        }
58
 
59
def batch_analyze(df, sentiment_model, ner_model, tokenizer):
    """Analyze every row of ``df`` and return the results as a DataFrame.

    Sentiment analysis runs for each row. Aspect extraction runs only when
    sentiment analysis reported no error; an aspect-extraction failure is
    recorded in the row's ``error`` field instead of being raised. A
    Streamlit progress bar and status line are updated per row and cleared
    at the end.

    Args:
        df: DataFrame with a ``text`` column.
        sentiment_model: Passed through to ``safe_analyze_sentiment``.
        ner_model: Passed through to ``extract_aspects``.
        tokenizer: Passed through to ``safe_analyze_sentiment``.

    Returns:
        pandas.DataFrame: One row per input row with the columns ``text``,
        ``rating``, ``label``, ``score``, ``date``, ``aspects``, ``error``.
    """
    rows = []
    bar = st.progress(0)
    status = st.empty()
    total = len(df)

    for done, record in tqdm(enumerate(df.itertuples(), start=1), total=total):
        # Report progress as an integer percentage.
        bar.progress(int(done / total * 100))
        status.text(f"Processing {done}/{total} reviews...")

        sentiment = safe_analyze_sentiment(record.text, sentiment_model, tokenizer)

        found = []
        if sentiment['error'] is None:
            try:
                found = extract_aspects(record.text, ner_model)
            except Exception as e:
                sentiment['error'] = f"NER Error: {str(e)}"

        rows.append({
            'text': record.text,
            'rating': sentiment['rating'],
            'label': sentiment['label'],
            'score': sentiment['score'],
            'date': datetime.now(),
            'aspects': found,
            'error': sentiment['error'],
        })

    bar.empty()
    status.empty()
    return pd.DataFrame(rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  def main():
96
  st.title("Restaurant Review Analyzer")
97
  st.markdown("Using fine-tuned model for sentiment and aspect analysis")
98
 
99
- sentiment_model, ner_model, tokenizer = load_models()
100
 
101
  st.sidebar.header("Analysis Options")
102
  analysis_mode = st.sidebar.radio(
@@ -106,7 +106,7 @@ def main():
106
 
107
  if 'history' not in st.session_state:
108
  st.session_state.history = pd.DataFrame(
109
- columns=['text', 'rating', 'label', 'score', 'date', 'aspects', 'error']
110
  )
111
 
112
  if analysis_mode == "Single Review":
@@ -115,120 +115,97 @@ def main():
115
  if st.button("Analyze"):
116
  if user_input:
117
  with st.spinner("Analyzing..."):
118
- sentiment = safe_analyze_sentiment(user_input, sentiment_model, tokenizer)
119
- aspects = []
120
-
121
- if sentiment['error'] is None:
122
- try:
123
- aspects = extract_aspects(user_input, ner_model)
124
- except Exception as e:
125
- sentiment['error'] = f"NER Error: {str(e)}"
126
 
127
  new_entry = pd.DataFrame([{
128
  'text': user_input,
129
  'rating': sentiment['rating'],
130
  'label': sentiment['label'],
131
- 'score': sentiment['score'],
132
  'date': datetime.now(),
133
- 'aspects': aspects,
134
- 'error': sentiment['error']
135
  }])
136
-
137
  st.session_state.history = pd.concat(
138
  [st.session_state.history, new_entry],
139
  ignore_index=True
140
  )
141
 
142
- if sentiment['error']:
143
- st.error(f"Analysis error: {sentiment['error']}")
 
 
 
 
 
 
 
 
 
144
  else:
145
- st.subheader("Analysis Results")
146
- col1, col2 = st.columns(2)
147
- with col1:
148
- st.metric("Rating", sentiment['label'])
149
- with col2:
150
- st.metric("Confidence", f"{sentiment['score']:.2f}")
151
-
152
- if aspects:
153
- st.subheader("Identified Aspects")
154
- for aspect in aspects:
155
- st.markdown(f"- **{aspect['type']}**: `{aspect['entity']}`")
156
- else:
157
- st.info("No specific entities identified")
158
  else:
159
  st.warning("Please enter a review")
160
 
161
- else: # Batch Analysis
162
  uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
163
 
164
  if uploaded_file:
165
- try:
166
- df = pd.read_csv(uploaded_file)
167
- if 'text' not in df.columns:
168
- st.error("CSV must contain a 'text' column with review content")
169
- else:
170
- if st.button("Analyze All Reviews"):
171
- with st.spinner("Batch processing started..."):
172
- results_df = batch_analyze(df, sentiment_model, ner_model, tokenizer)
173
-
174
- # Calculate stats
175
- success_count = len(results_df[results_df['error'].isna()])
176
- error_count = len(results_df) - success_count
177
-
178
- # Update history
179
- st.session_state.history = pd.concat(
180
- [st.session_state.history, results_df],
181
- ignore_index=True
182
- )
183
-
184
- st.success(f"""
185
- Batch analysis completed:
186
- - Successful analyses: {success_count}
187
- - Failed analyses: {error_count}
188
- """)
189
-
190
- # Show error details if any
191
- if error_count > 0:
192
- with st.expander("Show error details"):
193
- st.dataframe(results_df[results_df['error'].notna()][['text', 'error']])
194
- except Exception as e:
195
- st.error(f"Failed to process CSV file: {str(e)}")
196
 
197
- # Display results
198
  if not st.session_state.history.empty:
199
  st.divider()
200
  st.header("Analysis History")
201
 
202
- # Filter out failed analyses for visualization
203
- valid_results = st.session_state.history[st.session_state.history['error'].isna()]
204
 
205
- if not valid_results.empty:
206
- with st.expander("View Processed Data"):
207
- st.dataframe(valid_results)
208
-
209
- st.subheader("Sentiment Distribution")
210
- fig1 = plot_sentiment_distribution(valid_results)
211
- st.pyplot(fig1)
212
-
213
- negative_reviews = valid_results[
214
- valid_results['rating'] == 0
215
- ]['text'].tolist()
216
-
217
- if negative_reviews:
218
- st.subheader("Negative Reviews Word Cloud")
219
- fig2 = plot_wordcloud(negative_reviews)
220
- st.pyplot(fig2)
221
- else:
222
- st.info("No negative reviews yet")
223
-
224
- if len(valid_results) > 1:
225
- st.subheader("Rating Trend Over Time")
226
- time_df = valid_results.copy()
227
- time_df['date'] = pd.to_datetime(time_df['date'])
228
- time_df = time_df.set_index('date').resample('D')['rating'].mean()
229
- st.line_chart(time_df)
230
  else:
231
- st.warning("No valid analyses to display")
 
 
 
 
 
 
 
232
 
233
  if __name__ == "__main__":
234
  main()
 
1
  import streamlit as st
2
+ from transformers import pipeline
3
  import matplotlib.pyplot as plt
4
  from wordcloud import WordCloud
5
  import pandas as pd
6
  from datetime import datetime
 
7
 
8
  # Constants
9
  RATING_MAP = {
 
14
 
15
  @st.cache_resource
16
  def load_models():
 
 
 
 
17
  sentiment_model = pipeline(
18
  "text-classification",
19
+ model="AndrewLi403/CustomModel_tripadvisor_finetuned"
 
 
20
  )
21
  ner_model = pipeline("ner", model="dslim/bert-base-NER")
22
+ return sentiment_model, ner_model
23
 
24
def analyze_sentiment(text, model):
    """Classify one review and map the predicted label to a rating.

    Args:
        text: Review text to classify.
        model: Text-classification pipeline (or any callable with the same
            contract). It is called as
            ``model(text, truncation=True, max_length=512)`` and must return
            a list whose first item has ``'label'`` (ending in ``_<int>``)
            and ``'score'``.

    Returns:
        dict: ``{'rating': int, 'label': RATING_MAP[rating], 'score': ...}``.

    Raises:
        ValueError: If the label does not end in ``_<int>``.
        KeyError: If the parsed rating is not a key of ``RATING_MAP``.
    """
    # Without truncation, a review longer than the model's input window
    # makes the pipeline raise. Cap the input at 512 tokens instead.
    result = model(text, truncation=True, max_length=512)[0]
    rating = int(result['label'].split('_')[-1])
    return {
        'rating': rating,
        'label': RATING_MAP[rating],
        'score': result['score']
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
def extract_aspects(text, model):
    """Extract named entities of interest from ``text``.

    Args:
        text: Review text.
        model: Token-classification pipeline (or any callable with the same
            contract) returning one dict per tagged token. Each dict has
            ``'word'`` and ``'entity'`` and may have a 1-based ``'index'``
            giving the token position.

    Returns:
        list[dict]: ``{'entity': str, 'type': str}`` items whose type is
        ``'PRODUCT'``, ``'ORG'`` or ``'PERSON'``, in order of appearance.
    """
    allowed_types = {'PRODUCT', 'ORG', 'PERSON'}
    # The NER checkpoint loaded in load_models labels people as "PER";
    # expose that under the name the rest of the app filters and displays.
    type_aliases = {'PER': 'PERSON'}

    aspects = []
    last_index = None

    for token in model(text):
        word = token['word']
        tag = token['entity']

        # Tags arrive in BIO form ("B-ORG", "I-PER"). Comparing the raw tag
        # with the allowed types never matched, so strip the prefix first.
        prefix, dash, base = tag.partition('-')
        if dash and prefix in ('B', 'I'):
            entity_type = base
        else:
            prefix, entity_type = '', tag
        entity_type = type_aliases.get(entity_type, entity_type)

        # Only glue an "I-" word onto the previous entity when it directly
        # follows it (or when no position information is available).
        index = token.get('index')
        adjacent = (
            last_index is None or index is None or index == last_index + 1
        )
        last_index = index

        if word.startswith('##') and aspects:
            # Word-piece continuation: append without the "##" marker.
            aspects[-1]['entity'] += word[2:]
        elif (
            prefix == 'I'
            and adjacent
            and aspects
            and aspects[-1]['type'] == entity_type
        ):
            # Next word of a multi-word entity.
            aspects[-1]['entity'] += ' ' + word
        else:
            # A new entity. A leading "##" piece with nothing to attach to
            # starts its own entity instead of hitting an unbound type.
            aspects.append({
                'entity': word[2:] if word.startswith('##') else word,
                'type': entity_type
            })

    return [
        a for a in aspects
        if a['entity'] and a['type'] in allowed_types
    ]
57
+
58
def plot_sentiment_distribution(df):
    """Draw a pie chart of how often each sentiment label occurs.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        The matplotlib figure holding the pie chart.
    """
    figure, axis = plt.subplots()

    # Count each label, then line the counts up with the order of
    # RATING_MAP's values. Labels with no rows get 0; labels that are not
    # in RATING_MAP are dropped.
    ordered_labels = list(RATING_MAP.values())
    label_counts = df['label'].value_counts().reindex(ordered_labels, fill_value=0)

    # Fixed palette so each category keeps its colour between reruns.
    palette = ['#ff9999', '#66b3ff', '#99ff99']  # Negative, Neutral, Positive
    label_counts.plot.pie(autopct='%1.1f%%', colors=palette, ax=axis)

    axis.set_ylabel('')
    return figure
80
+
81
def plot_wordcloud(negative_reviews):
    """Render a word cloud built from the given review texts.

    Args:
        negative_reviews: Iterable of review strings; they are joined with
            single spaces before the cloud is generated.

    Returns:
        The matplotlib figure showing the word cloud with its axes hidden.
    """
    corpus = " ".join(negative_reviews)

    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='Reds',
    )
    rendered = cloud.generate(corpus)

    figure, axis = plt.subplots(figsize=(10, 5))
    axis.imshow(rendered, interpolation='bilinear')
    axis.axis('off')
    return figure
94
 
95
  def main():
96
  st.title("Restaurant Review Analyzer")
97
  st.markdown("Using fine-tuned model for sentiment and aspect analysis")
98
 
99
+ sentiment_model, ner_model = load_models()
100
 
101
  st.sidebar.header("Analysis Options")
102
  analysis_mode = st.sidebar.radio(
 
106
 
107
  if 'history' not in st.session_state:
108
  st.session_state.history = pd.DataFrame(
109
+ columns=['text', 'rating', 'label', 'date', 'aspects']
110
  )
111
 
112
  if analysis_mode == "Single Review":
 
115
  if st.button("Analyze"):
116
  if user_input:
117
  with st.spinner("Analyzing..."):
118
+ sentiment = analyze_sentiment(user_input, sentiment_model)
119
+ aspects = extract_aspects(user_input, ner_model)
 
 
 
 
 
 
120
 
121
  new_entry = pd.DataFrame([{
122
  'text': user_input,
123
  'rating': sentiment['rating'],
124
  'label': sentiment['label'],
 
125
  'date': datetime.now(),
126
+ 'aspects': aspects
 
127
  }])
 
128
  st.session_state.history = pd.concat(
129
  [st.session_state.history, new_entry],
130
  ignore_index=True
131
  )
132
 
133
+ st.subheader("Analysis Results")
134
+ col1, col2 = st.columns(2)
135
+ with col1:
136
+ st.metric("Rating", sentiment['label'])
137
+ with col2:
138
+ st.metric("Confidence", f"{sentiment['score']:.2f}")
139
+
140
+ if aspects:
141
+ st.subheader("Identified Aspects")
142
+ for aspect in aspects:
143
+ st.markdown(f"- **{aspect['type']}**: `{aspect['entity']}`")
144
  else:
145
+ st.info("No specific entities identified")
 
 
 
 
 
 
 
 
 
 
 
 
146
  else:
147
  st.warning("Please enter a review")
148
 
149
+ else:
150
  uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
151
 
152
  if uploaded_file:
153
+ df = pd.read_csv(uploaded_file)
154
+ if 'text' not in df.columns:
155
+ st.error("CSV must contain 'text' column")
156
+ else:
157
+ if st.button("Analyze All"):
158
+ progress_bar = st.progress(0)
159
+ results = []
160
+
161
+ for i, row in enumerate(df.itertuples()):
162
+ sentiment = analyze_sentiment(row.text, sentiment_model)
163
+ aspects = extract_aspects(row.text, ner_model)
164
+
165
+ results.append({
166
+ 'text': row.text,
167
+ 'rating': sentiment['rating'],
168
+ 'label': sentiment['label'],
169
+ 'date': datetime.now(),
170
+ 'aspects': aspects
171
+ })
172
+
173
+ progress_bar.progress((i + 1) / len(df))
174
+
175
+ st.session_state.history = pd.concat(
176
+ [st.session_state.history, pd.DataFrame(results)],
177
+ ignore_index=True
178
+ )
179
+ st.success(f"Completed analysis of {len(df)} reviews")
 
 
 
 
180
 
 
181
  if not st.session_state.history.empty:
182
  st.divider()
183
  st.header("Analysis History")
184
 
185
+ with st.expander("View Raw Data"):
186
+ st.dataframe(st.session_state.history)
187
 
188
+ st.subheader("Sentiment Distribution")
189
+ fig1 = plot_sentiment_distribution(st.session_state.history)
190
+ st.pyplot(fig1)
191
+
192
+ negative_reviews = st.session_state.history[
193
+ st.session_state.history['rating'] == 0
194
+ ]['text'].tolist()
195
+
196
+ if negative_reviews:
197
+ st.subheader("Negative Reviews Word Cloud")
198
+ fig2 = plot_wordcloud(negative_reviews)
199
+ st.pyplot(fig2)
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  else:
201
+ st.info("No negative reviews yet")
202
+
203
+ if len(st.session_state.history) > 1:
204
+ st.subheader("Rating Trend Over Time")
205
+ time_df = st.session_state.history.copy()
206
+ time_df['date'] = pd.to_datetime(time_df['date'])
207
+ time_df = time_df.set_index('date').resample('D')['rating'].mean()
208
+ st.line_chart(time_df)
209
 
210
  if __name__ == "__main__":
211
  main()