Spaces:

mxiean
/

G10_TripAdvisor

Sleeping

App Files Files Community

mxiean committed on Mar 27

Commit

ad0b640

verified ·

1 Parent(s): a8ec16d

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -136

app.py CHANGED Viewed

@@ -1,9 +1,10 @@
 import streamlit as st
-from transformers import pipeline
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
 import pandas as pd
 from datetime import datetime
 # Constants
 RATING_MAP = {
@@ -14,89 +15,88 @@ RATING_MAP = {
 @st.cache_resource
 def load_models():
     sentiment_model = pipeline(
         "text-classification",
-        model="AndrewLi403/CustomModel_tripadvisor_finetuned"
     )
     ner_model = pipeline("ner", model="dslim/bert-base-NER")
-    return sentiment_model, ner_model
-def analyze_sentiment(text, model):
-    result = model(text)[0]
-    rating = int(result['label'].split('_')[-1])
-    return {
-        'rating': rating,
-        'label': RATING_MAP[rating],
-        'score': result['score']
-    }
-def extract_aspects(text, model):
-    entities = model(text)
-    aspects = []
-    current_entity = ""
-    for entity in entities:
-        if entity['word'].startswith('##'):
-            current_entity += entity['word'][2:]
-        else:
-            if current_entity:
-                aspects.append({
-                    'entity': current_entity,
-                    'type': prev_type
-                })
-            current_entity = entity['word']
-            prev_type = entity['entity']
-    if current_entity:
-        aspects.append({
-            'entity': current_entity,
-            'type': prev_type
         })
-    return [a for a in aspects if a['type'] in ['PRODUCT', 'ORG', 'PERSON']]
-def plot_sentiment_distribution(df):
-    fig, ax = plt.subplots()
-    # Get counts for all possible ratings
-    counts = df['label'].value_counts()
-    # Ensure all rating categories are present (even with 0 counts)
-    for rating in RATING_MAP.values():
-        if rating not in counts.index:
-            counts[rating] = 0
-    # Sort by the predefined rating order
-    counts = counts.loc[list(RATING_MAP.values())]
-    # Plot with consistent colors
-    counts.plot.pie(
-        autopct='%1.1f%%',
-        colors=['#ff9999','#66b3ff','#99ff99'],  # Negative, Neutral, Positive
-        ax=ax
-    )
-    ax.set_ylabel('')
-    return fig
-def plot_wordcloud(negative_reviews):
-    text = " ".join(negative_reviews)
-    wordcloud = WordCloud(
-        width=800,
-        height=400,
-        background_color='white',
-        colormap='Reds'
-    ).generate(text)
-    fig, ax = plt.subplots(figsize=(10, 5))
-    ax.imshow(wordcloud, interpolation='bilinear')
-    ax.axis('off')
-    return fig
 def main():
     st.title("Restaurant Review Analyzer")
     st.markdown("Using fine-tuned model for sentiment and aspect analysis")
-    sentiment_model, ner_model = load_models()
     st.sidebar.header("Analysis Options")
     analysis_mode = st.sidebar.radio(
@@ -106,7 +106,7 @@ def main():
     if 'history' not in st.session_state:
         st.session_state.history = pd.DataFrame(
-            columns=['text', 'rating', 'label', 'date', 'aspects']
         )
     if analysis_mode == "Single Review":
@@ -115,97 +115,120 @@ def main():
         if st.button("Analyze"):
             if user_input:
                 with st.spinner("Analyzing..."):
-                    sentiment = analyze_sentiment(user_input, sentiment_model)
-                    aspects = extract_aspects(user_input, ner_model)
                     new_entry = pd.DataFrame([{
                         'text': user_input,
                         'rating': sentiment['rating'],
                         'label': sentiment['label'],
                         'date': datetime.now(),
-                        'aspects': aspects
                     }])
                     st.session_state.history = pd.concat(
                         [st.session_state.history, new_entry],
                         ignore_index=True
                     )
-                    st.subheader("Analysis Results")
-                    col1, col2 = st.columns(2)
-                    with col1:
-                        st.metric("Rating", sentiment['label'])
-                    with col2:
-                        st.metric("Confidence", f"{sentiment['score']:.2f}")
-                    if aspects:
-                        st.subheader("Identified Aspects")
-                        for aspect in aspects:
-                            st.markdown(f"- **{aspect['type']}**: `{aspect['entity']}`")
                     else:
-                        st.info("No specific entities identified")
             else:
                 st.warning("Please enter a review")
-    else:
         uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
         if uploaded_file:
-            df = pd.read_csv(uploaded_file)
-            if 'text' not in df.columns:
-                st.error("CSV must contain 'text' column")
-            else:
-                if st.button("Analyze All"):
-                    progress_bar = st.progress(0)
-                    results = []
-                    for i, row in enumerate(df.itertuples()):
-                        sentiment = analyze_sentiment(row.text, sentiment_model)
-                        aspects = extract_aspects(row.text, ner_model)
-                        results.append({
-                            'text': row.text,
-                            'rating': sentiment['rating'],
-                            'label': sentiment['label'],
-                            'date': datetime.now(),
-                            'aspects': aspects
-                        })
-                        progress_bar.progress((i + 1) / len(df))
-                    st.session_state.history = pd.concat(
-                        [st.session_state.history, pd.DataFrame(results)],
-                        ignore_index=True
-                    )
-                    st.success(f"Completed analysis of {len(df)} reviews")
     if not st.session_state.history.empty:
         st.divider()
         st.header("Analysis History")
-        with st.expander("View Raw Data"):
-            st.dataframe(st.session_state.history)
-        st.subheader("Sentiment Distribution")
-        fig1 = plot_sentiment_distribution(st.session_state.history)
-        st.pyplot(fig1)
-        negative_reviews = st.session_state.history[
-            st.session_state.history['rating'] == 0
-        ]['text'].tolist()
-        if negative_reviews:
-            st.subheader("Negative Reviews Word Cloud")
-            fig2 = plot_wordcloud(negative_reviews)
-            st.pyplot(fig2)
         else:
-            st.info("No negative reviews yet")
-        if len(st.session_state.history) > 1:
-            st.subheader("Rating Trend Over Time")
-            time_df = st.session_state.history.copy()
-            time_df['date'] = pd.to_datetime(time_df['date'])
-            time_df = time_df.set_index('date').resample('D')['rating'].mean()
-            st.line_chart(time_df)
 if __name__ == "__main__":
     main()

 import streamlit as st
+from transformers import pipeline, AutoTokenizer
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
 import pandas as pd
 from datetime import datetime
+from tqdm import tqdm
 # Constants
 RATING_MAP = {
 @st.cache_resource
 def load_models():
+    """Create the tokenizer and the two pipelines used by the app.
+
+    Wrapped in ``st.cache_resource``.
+
+    Returns:
+        tuple: ``(sentiment_model, ner_model, tokenizer)`` -- the
+        text-classification pipeline, the NER pipeline, and the tokenizer
+        loaded for the sentiment checkpoint.
+    """
+    # Load tokenizer first
+    tokenizer = AutoTokenizer.from_pretrained("AndrewLi403/CustomModel_tripadvisor_finetuned")
+    # Load models with proper tokenizer
     sentiment_model = pipeline(
         "text-classification",
+        model="AndrewLi403/CustomModel_tripadvisor_finetuned",
+        tokenizer=tokenizer,
+        # Device index 0 when the USE_GPU secret is truthy, otherwise -1.
+        # NOTE(review): -1 is presumably the pipeline's CPU setting -- confirm.
+        device=0 if st.secrets.get("USE_GPU", False) else -1
     )
+    # The NER pipeline names its own checkpoint; the tokenizer loaded above
+    # is not passed to it.
     ner_model = pipeline("ner", model="dslim/bert-base-NER")
+    return sentiment_model, ner_model, tokenizer
+def safe_analyze_sentiment(text, model, tokenizer):
+    """Classify one review and report failures instead of raising.
+
+    Args:
+        text: Review text to classify.
+        model: Sentiment pipeline. It is called with the raw text and must
+            return a list whose first item has ``label`` and ``score`` keys.
+        tokenizer: Kept so existing callers keep working. It is not called
+            here; the pipeline tokenizes the text itself.
+
+    Returns:
+        dict: ``rating``, ``label``, ``score`` and ``error``. On success
+        ``error`` is None. On any failure ``rating`` is -1, ``label`` is
+        "Error", ``score`` is 0.0 and ``error`` holds the exception text.
+    """
+    try:
+        # A pipeline takes raw text, not pre-tokenized tensors. Calling it
+        # as ``model(**inputs)`` raised for every review, so each result
+        # came back as an error record. Truncation is requested through
+        # the pipeline call so long reviews are still cut to 512 tokens.
+        result = model(text, truncation=True, max_length=512)[0]
+        # The rating is the integer after the last underscore of the label
+        # (presumably labels look like "LABEL_<n>" -- confirm with the model).
+        rating = int(result['label'].split('_')[-1])
+        return {
+            'rating': rating,
+            'label': RATING_MAP[rating],
+            'score': result['score'],
+            'error': None
+        }
+    except Exception as e:
+        # Deliberate catch-all: one bad review must not abort a batch, so
+        # the failure is returned as data for the caller to display.
+        return {
+            'rating': -1,
+            'label': "Error",
+            'score': 0.0,
+            'error': str(e)
+        }
+def batch_analyze(df, sentiment_model, ner_model, tokenizer):
+    """Analyze every row of ``df`` and return the results as a DataFrame.
+
+    Args:
+        df: DataFrame with a ``text`` column (read as ``row.text``).
+        sentiment_model: Passed through to ``safe_analyze_sentiment``.
+        ner_model: Passed through to ``extract_aspects``.
+        tokenizer: Passed through to ``safe_analyze_sentiment``.
+
+    Returns:
+        pandas.DataFrame: One row per input row with the columns ``text``,
+        ``rating``, ``label``, ``score``, ``date``, ``aspects`` and
+        ``error``. The columns are present even when ``df`` has no rows.
+    """
+    columns = ['text', 'rating', 'label', 'score', 'date', 'aspects', 'error']
+    total = len(df)
+    results = []
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+    for i, row in tqdm(enumerate(df.itertuples()), total=total):
+        # Update progress (integer percentage of rows handled so far)
+        progress = int((i + 1) / total * 100)
+        progress_bar.progress(progress)
+        status_text.text(f"Processing {i+1}/{total} reviews...")
+        # Analyze sentiment with error handling
+        sentiment = safe_analyze_sentiment(row.text, sentiment_model, tokenizer)
+        # Only proceed with NER if sentiment analysis succeeded
+        aspects = []
+        if sentiment['error'] is None:
+            try:
+                # NOTE(review): extract_aspects must be defined at module
+                # level; it is not among the lines added in this change --
+                # confirm it still exists, otherwise every row ends up
+                # with a "NER Error".
+                aspects = extract_aspects(row.text, ner_model)
+            except Exception as e:
+                sentiment['error'] = f"NER Error: {str(e)}"
+        results.append({
+            'text': row.text,
+            'rating': sentiment['rating'],
+            'label': sentiment['label'],
+            'score': sentiment['score'],
+            'date': datetime.now(),
+            'aspects': aspects,
+            'error': sentiment['error']
         })
+    progress_bar.empty()
+    status_text.empty()
+    # Naming the columns keeps the schema when there are no rows, so a
+    # caller can still index ``['error']`` on the result of an empty CSV.
+    return pd.DataFrame(results, columns=columns)
 def main():
     st.title("Restaurant Review Analyzer")
     st.markdown("Using fine-tuned model for sentiment and aspect analysis")
+    sentiment_model, ner_model, tokenizer = load_models()
     st.sidebar.header("Analysis Options")
     analysis_mode = st.sidebar.radio(
     if 'history' not in st.session_state:
         st.session_state.history = pd.DataFrame(
+            columns=['text', 'rating', 'label', 'score', 'date', 'aspects', 'error']
         )
     if analysis_mode == "Single Review":
         if st.button("Analyze"):
             if user_input:
                 with st.spinner("Analyzing..."):
+                    sentiment = safe_analyze_sentiment(user_input, sentiment_model, tokenizer)
+                    aspects = []
+                    if sentiment['error'] is None:
+                        try:
+                            aspects = extract_aspects(user_input, ner_model)
+                        except Exception as e:
+                            sentiment['error'] = f"NER Error: {str(e)}"
                     new_entry = pd.DataFrame([{
                         'text': user_input,
                         'rating': sentiment['rating'],
                         'label': sentiment['label'],
+                        'score': sentiment['score'],
                         'date': datetime.now(),
+                        'aspects': aspects,
+                        'error': sentiment['error']
                     }])
                     st.session_state.history = pd.concat(
                         [st.session_state.history, new_entry],
                         ignore_index=True
                     )
+                    if sentiment['error']:
+                        st.error(f"Analysis error: {sentiment['error']}")
                     else:
+                        st.subheader("Analysis Results")
+                        col1, col2 = st.columns(2)
+                        with col1:
+                            st.metric("Rating", sentiment['label'])
+                        with col2:
+                            st.metric("Confidence", f"{sentiment['score']:.2f}")
+                        if aspects:
+                            st.subheader("Identified Aspects")
+                            for aspect in aspects:
+                                st.markdown(f"- **{aspect['type']}**: `{aspect['entity']}`")
+                        else:
+                            st.info("No specific entities identified")
             else:
                 st.warning("Please enter a review")
+    else:  # Batch Analysis
         uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
         if uploaded_file:
+            try:
+                df = pd.read_csv(uploaded_file)
+                if 'text' not in df.columns:
+                    st.error("CSV must contain a 'text' column with review content")
+                else:
+                    if st.button("Analyze All Reviews"):
+                        with st.spinner("Batch processing started..."):
+                            results_df = batch_analyze(df, sentiment_model, ner_model, tokenizer)
+                            # Calculate stats
+                            success_count = len(results_df[results_df['error'].isna()])
+                            error_count = len(results_df) - success_count
+                            # Update history
+                            st.session_state.history = pd.concat(
+                                [st.session_state.history, results_df],
+                                ignore_index=True
+                            )
+                            st.success(f"""
+                            Batch analysis completed:
+                            - Successful analyses: {success_count}
+                            - Failed analyses: {error_count}
+                            """)
+                            # Show error details if any
+                            if error_count > 0:
+                                with st.expander("Show error details"):
+                                    st.dataframe(results_df[results_df['error'].notna()][['text', 'error']])
+            except Exception as e:
+                st.error(f"Failed to process CSV file: {str(e)}")
+    # Display results
     if not st.session_state.history.empty:
         st.divider()
         st.header("Analysis History")
+        # Filter out failed analyses for visualization
+        valid_results = st.session_state.history[st.session_state.history['error'].isna()]
+        if not valid_results.empty:
+            with st.expander("View Processed Data"):
+                st.dataframe(valid_results)
+            st.subheader("Sentiment Distribution")
+            fig1 = plot_sentiment_distribution(valid_results)
+            st.pyplot(fig1)
+            negative_reviews = valid_results[
+                valid_results['rating'] == 0
+            ]['text'].tolist()
+            if negative_reviews:
+                st.subheader("Negative Reviews Word Cloud")
+                fig2 = plot_wordcloud(negative_reviews)
+                st.pyplot(fig2)
+            else:
+                st.info("No negative reviews yet")
+            if len(valid_results) > 1:
+                st.subheader("Rating Trend Over Time")
+                time_df = valid_results.copy()
+                time_df['date'] = pd.to_datetime(time_df['date'])
+                time_df = time_df.set_index('date').resample('D')['rating'].mean()
+                st.line_chart(time_df)
         else:
+            st.warning("No valid analyses to display")
 if __name__ == "__main__":
     main()