mxiean committed on
Commit
ad0b640
·
verified ·
1 Parent(s): a8ec16d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -136
app.py CHANGED
@@ -1,9 +1,10 @@
1
  import streamlit as st
2
- from transformers import pipeline
3
  import matplotlib.pyplot as plt
4
  from wordcloud import WordCloud
5
  import pandas as pd
6
  from datetime import datetime
 
7
 
8
  # Constants
9
  RATING_MAP = {
@@ -14,89 +15,88 @@ RATING_MAP = {
14
 
15
@st.cache_resource
def load_models():
    """Create the two Hugging Face pipelines the app relies on.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        tuple: ``(sentiment_model, ner_model)`` where the first element is a
        "text-classification" pipeline and the second an "ner" pipeline.
    """
    sentiment_checkpoint = "AndrewLi403/CustomModel_tripadvisor_finetuned"
    ner_checkpoint = "dslim/bert-base-NER"

    classifier = pipeline("text-classification", model=sentiment_checkpoint)
    tagger = pipeline("ner", model=ner_checkpoint)
    return classifier, tagger
23
 
24
def analyze_sentiment(text, model):
    """Classify one review and map the predicted label to a rating.

    Args:
        text: Review text, passed unchanged to ``model``.
        model: Callable returning a sequence whose first item is a dict with
            'label' and 'score' keys.

    Returns:
        dict: 'rating' (the integer after the last underscore of the
        predicted label), 'label' (the ``RATING_MAP`` entry for that rating)
        and 'score' (the model's score for the prediction).
    """
    top_prediction = model(text)[0]

    # The numeric rating is whatever follows the final underscore.
    _, _, rating_suffix = top_prediction['label'].rpartition('_')
    rating = int(rating_suffix)

    summary = {'rating': rating}
    summary['label'] = RATING_MAP[rating]
    summary['score'] = top_prediction['score']
    return summary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
def extract_aspects(text, model):
    """Merge sub-word NER tokens into whole entities and keep relevant types.

    Tokens whose 'word' starts with '##' are glued onto the entity currently
    being built; any other token closes that entity and starts a new one.

    Args:
        text: Review text, passed unchanged to ``model``.
        model: Callable returning an iterable of dicts with 'word' and
            'entity' keys.

    Returns:
        list[dict]: One ``{'entity': str, 'type': str}`` per merged entity
        whose type is PRODUCT, ORG, PERSON or PER. A leading ``B-``/``I-``
        prefix is stripped from the reported type.
    """
    # NOTE(review): the NER checkpoint used by this app is assumed to emit
    # BIO-prefixed tags (B-PER, I-ORG, ...) - confirm against its model card.
    # Unprefixed tags keep working exactly as before.
    accepted_types = {'PRODUCT', 'ORG', 'PERSON', 'PER'}

    aspects = []
    current_entity = ""
    current_type = None

    for token in model(text):
        word = token['word']
        is_continuation = word.startswith('##')

        if is_continuation and current_entity:
            current_entity += word[2:]
            continue

        if current_entity:
            aspects.append({'entity': current_entity, 'type': current_type})

        # A continuation piece with nothing to attach to starts its own
        # entity; previously this left the type unbound and crashed later.
        current_entity = word[2:] if is_continuation else word
        tag = token['entity']
        current_type = tag[2:] if tag[:2] in ('B-', 'I-') else tag

    if current_entity:
        aspects.append({'entity': current_entity, 'type': current_type})

    return [a for a in aspects if a['type'] in accepted_types]
57
-
58
def plot_sentiment_distribution(df):
    """Draw a pie chart of how often each rating label occurs.

    Args:
        df: DataFrame with a 'label' column.

    Returns:
        The matplotlib figure holding the pie chart.
    """
    fig, ax = plt.subplots()

    ordered_labels = list(RATING_MAP.values())
    tally = df['label'].value_counts()

    # Give every known label a slice, even when it never occurred.
    absent = [lbl for lbl in ordered_labels if lbl not in tally.index]
    for lbl in absent:
        tally[lbl] = 0

    # Fixed label order keeps each slice paired with the same colour.
    tally = tally.loc[ordered_labels]

    palette = ['#ff9999', '#66b3ff', '#99ff99']  # Negative, Neutral, Positive
    tally.plot.pie(autopct='%1.1f%%', colors=palette, ax=ax)
    ax.set_ylabel('')
    return fig
80
-
81
def plot_wordcloud(negative_reviews):
    """Render a word cloud built from the given review texts.

    Args:
        negative_reviews: Iterable of strings; they are joined with single
            spaces before the cloud is generated.

    Returns:
        The matplotlib figure showing the cloud with its axes hidden.
    """
    cloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='Reds',
    )
    cloud.generate(" ".join(negative_reviews))

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    return fig
94
 
95
  def main():
96
  st.title("Restaurant Review Analyzer")
97
  st.markdown("Using fine-tuned model for sentiment and aspect analysis")
98
 
99
- sentiment_model, ner_model = load_models()
100
 
101
  st.sidebar.header("Analysis Options")
102
  analysis_mode = st.sidebar.radio(
@@ -106,7 +106,7 @@ def main():
106
 
107
  if 'history' not in st.session_state:
108
  st.session_state.history = pd.DataFrame(
109
- columns=['text', 'rating', 'label', 'date', 'aspects']
110
  )
111
 
112
  if analysis_mode == "Single Review":
@@ -115,97 +115,120 @@ def main():
115
  if st.button("Analyze"):
116
  if user_input:
117
  with st.spinner("Analyzing..."):
118
- sentiment = analyze_sentiment(user_input, sentiment_model)
119
- aspects = extract_aspects(user_input, ner_model)
 
 
 
 
 
 
120
 
121
  new_entry = pd.DataFrame([{
122
  'text': user_input,
123
  'rating': sentiment['rating'],
124
  'label': sentiment['label'],
 
125
  'date': datetime.now(),
126
- 'aspects': aspects
 
127
  }])
 
128
  st.session_state.history = pd.concat(
129
  [st.session_state.history, new_entry],
130
  ignore_index=True
131
  )
132
 
133
- st.subheader("Analysis Results")
134
- col1, col2 = st.columns(2)
135
- with col1:
136
- st.metric("Rating", sentiment['label'])
137
- with col2:
138
- st.metric("Confidence", f"{sentiment['score']:.2f}")
139
-
140
- if aspects:
141
- st.subheader("Identified Aspects")
142
- for aspect in aspects:
143
- st.markdown(f"- **{aspect['type']}**: `{aspect['entity']}`")
144
  else:
145
- st.info("No specific entities identified")
 
 
 
 
 
 
 
 
 
 
 
 
146
  else:
147
  st.warning("Please enter a review")
148
 
149
- else:
150
  uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
151
 
152
  if uploaded_file:
153
- df = pd.read_csv(uploaded_file)
154
- if 'text' not in df.columns:
155
- st.error("CSV must contain 'text' column")
156
- else:
157
- if st.button("Analyze All"):
158
- progress_bar = st.progress(0)
159
- results = []
160
-
161
- for i, row in enumerate(df.itertuples()):
162
- sentiment = analyze_sentiment(row.text, sentiment_model)
163
- aspects = extract_aspects(row.text, ner_model)
164
-
165
- results.append({
166
- 'text': row.text,
167
- 'rating': sentiment['rating'],
168
- 'label': sentiment['label'],
169
- 'date': datetime.now(),
170
- 'aspects': aspects
171
- })
172
-
173
- progress_bar.progress((i + 1) / len(df))
174
-
175
- st.session_state.history = pd.concat(
176
- [st.session_state.history, pd.DataFrame(results)],
177
- ignore_index=True
178
- )
179
- st.success(f"Completed analysis of {len(df)} reviews")
 
 
 
 
180
 
 
181
  if not st.session_state.history.empty:
182
  st.divider()
183
  st.header("Analysis History")
184
 
185
- with st.expander("View Raw Data"):
186
- st.dataframe(st.session_state.history)
187
 
188
- st.subheader("Sentiment Distribution")
189
- fig1 = plot_sentiment_distribution(st.session_state.history)
190
- st.pyplot(fig1)
191
-
192
- negative_reviews = st.session_state.history[
193
- st.session_state.history['rating'] == 0
194
- ]['text'].tolist()
195
-
196
- if negative_reviews:
197
- st.subheader("Negative Reviews Word Cloud")
198
- fig2 = plot_wordcloud(negative_reviews)
199
- st.pyplot(fig2)
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  else:
201
- st.info("No negative reviews yet")
202
-
203
- if len(st.session_state.history) > 1:
204
- st.subheader("Rating Trend Over Time")
205
- time_df = st.session_state.history.copy()
206
- time_df['date'] = pd.to_datetime(time_df['date'])
207
- time_df = time_df.set_index('date').resample('D')['rating'].mean()
208
- st.line_chart(time_df)
209
 
210
  if __name__ == "__main__":
211
  main()
 
1
  import streamlit as st
2
+ from transformers import pipeline, AutoTokenizer
3
  import matplotlib.pyplot as plt
4
  from wordcloud import WordCloud
5
  import pandas as pd
6
  from datetime import datetime
7
+ from tqdm import tqdm
8
 
9
  # Constants
10
  RATING_MAP = {
 
15
 
16
@st.cache_resource
def load_models():
    """Create the sentiment pipeline, the NER pipeline and the tokenizer.

    The function is wrapped in ``st.cache_resource``. The sentiment pipeline
    is placed on device 0 when the ``USE_GPU`` secret is truthy and on
    device -1 otherwise.

    Returns:
        tuple: ``(sentiment_model, ner_model, tokenizer)``.
    """
    sentiment_checkpoint = "AndrewLi403/CustomModel_tripadvisor_finetuned"

    # Reading st.secrets raises when no secrets file is configured; that must
    # not stop the app from loading, so fall back to CPU in that case.
    # NOTE(review): relies on Streamlit's missing-secrets error being a
    # FileNotFoundError (subclass) - confirm for the pinned Streamlit version.
    try:
        use_gpu = bool(st.secrets.get("USE_GPU", False))
    except FileNotFoundError:
        use_gpu = False

    # Load tokenizer first so the pipeline is built with it explicitly.
    tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)

    sentiment_model = pipeline(
        "text-classification",
        model=sentiment_checkpoint,
        tokenizer=tokenizer,
        device=0 if use_gpu else -1,
    )
    ner_model = pipeline("ner", model="dslim/bert-base-NER")
    return sentiment_model, ner_model, tokenizer
30
 
31
def safe_analyze_sentiment(text, model, tokenizer):
    """Classify one review, reporting failures in the result instead of raising.

    Args:
        text: Review text. Anything that is not a non-blank string is
            reported as an error without calling ``model``.
        model: Callable invoked as ``model(text, truncation=True,
            max_length=512)``; expected to return a sequence whose first item
            is a dict with 'label' and 'score' keys.
        tokenizer: Unused. Kept so existing three-argument callers keep
            working; the model callable is expected to tokenize the text
            itself.

    Returns:
        dict: On success 'rating' (integer after the last underscore of the
        label), 'label' (``RATING_MAP`` entry), 'score', and 'error' set to
        None. On failure 'rating' is -1, 'label' is "Error", 'score' is 0.0
        and 'error' holds the message.
    """
    if not isinstance(text, str) or not text.strip():
        # E.g. NaN cells from a CSV: nothing meaningful to classify.
        return {
            'rating': -1,
            'label': "Error",
            'score': 0.0,
            'error': "Empty or non-text input",
        }

    try:
        # Pass the raw text plus truncation settings. Feeding pre-tokenized
        # tensors as keyword arguments leaves the callable without its
        # required text argument, so every call ended in the except branch.
        result = model(text, truncation=True, max_length=512)[0]
        rating = int(result['label'].split('_')[-1])
        return {
            'rating': rating,
            'label': RATING_MAP[rating],
            'score': result['score'],
            'error': None,
        }
    except Exception as e:
        # Deliberately broad: a single bad review must not abort the caller;
        # the failure is carried in the returned record instead.
        return {
            'rating': -1,
            'label': "Error",
            'score': 0.0,
            'error': str(e),
        }
58
 
59
def batch_analyze(df, sentiment_model, ner_model, tokenizer):
    """Analyze every row of ``df`` and return the results as a DataFrame.

    A Streamlit progress bar and status line are shown while rows are
    processed and are cleared afterwards, including when processing raises.

    Args:
        df: DataFrame whose rows expose a ``text`` attribute via
            ``itertuples()``.
        sentiment_model: Forwarded to ``safe_analyze_sentiment``.
        ner_model: Forwarded to ``extract_aspects``.
        tokenizer: Forwarded to ``safe_analyze_sentiment``.

    Returns:
        pandas.DataFrame with columns 'text', 'rating', 'label', 'score',
        'date', 'aspects' and 'error'; the columns exist even when ``df``
        has no rows.
    """
    # NOTE(review): extract_aspects is not among the definitions visible on
    # the post-commit side of this diff - confirm it still exists, otherwise
    # every row ends up with an "NER Error".
    columns = ['text', 'rating', 'label', 'score', 'date', 'aspects', 'error']
    total = len(df)
    results = []
    progress_bar = st.progress(0)
    status_text = st.empty()

    try:
        for position, row in enumerate(df.itertuples(), start=1):
            # Update progress
            progress_bar.progress(int(position / total * 100))
            status_text.text(f"Processing {position}/{total} reviews...")

            # Analyze sentiment with error handling
            sentiment = safe_analyze_sentiment(row.text, sentiment_model, tokenizer)

            # Only proceed with NER if sentiment analysis succeeded
            aspects = []
            if sentiment['error'] is None:
                try:
                    aspects = extract_aspects(row.text, ner_model)
                except Exception as e:
                    # Best effort per row: record the failure and keep going.
                    sentiment['error'] = f"NER Error: {str(e)}"

            results.append({
                'text': row.text,
                'rating': sentiment['rating'],
                'label': sentiment['label'],
                'score': sentiment['score'],
                'date': datetime.now(),
                'aspects': aspects,
                'error': sentiment['error'],
            })
    finally:
        # Never leave stale progress widgets on the page.
        progress_bar.empty()
        status_text.empty()

    # Explicit columns keep callers that index result['error'] working when
    # there were no rows to process.
    return pd.DataFrame(results, columns=columns)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  def main():
96
  st.title("Restaurant Review Analyzer")
97
  st.markdown("Using fine-tuned model for sentiment and aspect analysis")
98
 
99
+ sentiment_model, ner_model, tokenizer = load_models()
100
 
101
  st.sidebar.header("Analysis Options")
102
  analysis_mode = st.sidebar.radio(
 
106
 
107
  if 'history' not in st.session_state:
108
  st.session_state.history = pd.DataFrame(
109
+ columns=['text', 'rating', 'label', 'score', 'date', 'aspects', 'error']
110
  )
111
 
112
  if analysis_mode == "Single Review":
 
115
  if st.button("Analyze"):
116
  if user_input:
117
  with st.spinner("Analyzing..."):
118
+ sentiment = safe_analyze_sentiment(user_input, sentiment_model, tokenizer)
119
+ aspects = []
120
+
121
+ if sentiment['error'] is None:
122
+ try:
123
+ aspects = extract_aspects(user_input, ner_model)
124
+ except Exception as e:
125
+ sentiment['error'] = f"NER Error: {str(e)}"
126
 
127
  new_entry = pd.DataFrame([{
128
  'text': user_input,
129
  'rating': sentiment['rating'],
130
  'label': sentiment['label'],
131
+ 'score': sentiment['score'],
132
  'date': datetime.now(),
133
+ 'aspects': aspects,
134
+ 'error': sentiment['error']
135
  }])
136
+
137
  st.session_state.history = pd.concat(
138
  [st.session_state.history, new_entry],
139
  ignore_index=True
140
  )
141
 
142
+ if sentiment['error']:
143
+ st.error(f"Analysis error: {sentiment['error']}")
 
 
 
 
 
 
 
 
 
144
  else:
145
+ st.subheader("Analysis Results")
146
+ col1, col2 = st.columns(2)
147
+ with col1:
148
+ st.metric("Rating", sentiment['label'])
149
+ with col2:
150
+ st.metric("Confidence", f"{sentiment['score']:.2f}")
151
+
152
+ if aspects:
153
+ st.subheader("Identified Aspects")
154
+ for aspect in aspects:
155
+ st.markdown(f"- **{aspect['type']}**: `{aspect['entity']}`")
156
+ else:
157
+ st.info("No specific entities identified")
158
  else:
159
  st.warning("Please enter a review")
160
 
161
+ else: # Batch Analysis
162
  uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
163
 
164
  if uploaded_file:
165
+ try:
166
+ df = pd.read_csv(uploaded_file)
167
+ if 'text' not in df.columns:
168
+ st.error("CSV must contain a 'text' column with review content")
169
+ else:
170
+ if st.button("Analyze All Reviews"):
171
+ with st.spinner("Batch processing started..."):
172
+ results_df = batch_analyze(df, sentiment_model, ner_model, tokenizer)
173
+
174
+ # Calculate stats
175
+ success_count = len(results_df[results_df['error'].isna()])
176
+ error_count = len(results_df) - success_count
177
+
178
+ # Update history
179
+ st.session_state.history = pd.concat(
180
+ [st.session_state.history, results_df],
181
+ ignore_index=True
182
+ )
183
+
184
+ st.success(f"""
185
+ Batch analysis completed:
186
+ - Successful analyses: {success_count}
187
+ - Failed analyses: {error_count}
188
+ """)
189
+
190
+ # Show error details if any
191
+ if error_count > 0:
192
+ with st.expander("Show error details"):
193
+ st.dataframe(results_df[results_df['error'].notna()][['text', 'error']])
194
+ except Exception as e:
195
+ st.error(f"Failed to process CSV file: {str(e)}")
196
 
197
+ # Display results
198
  if not st.session_state.history.empty:
199
  st.divider()
200
  st.header("Analysis History")
201
 
202
+ # Filter out failed analyses for visualization
203
+ valid_results = st.session_state.history[st.session_state.history['error'].isna()]
204
 
205
+ if not valid_results.empty:
206
+ with st.expander("View Processed Data"):
207
+ st.dataframe(valid_results)
208
+
209
+ st.subheader("Sentiment Distribution")
210
+ fig1 = plot_sentiment_distribution(valid_results)
211
+ st.pyplot(fig1)
212
+
213
+ negative_reviews = valid_results[
214
+ valid_results['rating'] == 0
215
+ ]['text'].tolist()
216
+
217
+ if negative_reviews:
218
+ st.subheader("Negative Reviews Word Cloud")
219
+ fig2 = plot_wordcloud(negative_reviews)
220
+ st.pyplot(fig2)
221
+ else:
222
+ st.info("No negative reviews yet")
223
+
224
+ if len(valid_results) > 1:
225
+ st.subheader("Rating Trend Over Time")
226
+ time_df = valid_results.copy()
227
+ time_df['date'] = pd.to_datetime(time_df['date'])
228
+ time_df = time_df.set_index('date').resample('D')['rating'].mean()
229
+ st.line_chart(time_df)
230
  else:
231
+ st.warning("No valid analyses to display")
 
 
 
 
 
 
 
232
 
233
  if __name__ == "__main__":
234
  main()