Spaces:

mxiean
/

G10_TripAdvisor

Sleeping

App Files Files Community

mxiean committed on Mar 27

Commit

db24cbc

verified ·

1 Parent(s): 3515dc3

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -194

app.py CHANGED Viewed

@@ -1,9 +1,16 @@
 import streamlit as st
-from transformers import pipeline, AutoTokenizer
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
 import pandas as pd
 from datetime import datetime
 # Constants
 RATING_MAP = {
@@ -18,213 +25,121 @@ def load_models():
         "text-classification",
         model="AndrewLi403/CustomModel_tripadvisor_finetuned"
     )
-    ner_model = pipeline("ner", model="dslim/bert-base-NER")
-    tokenizer = AutoTokenizer.from_pretrained("AndrewLi403/CustomModel_tripadvisor_finetuned")
-    return sentiment_model, ner_model, tokenizer
-def analyze_sentiment(text, model, tokenizer, chunk_size=400):
-    tokens = tokenizer.tokenize(text)
-    # Short text processing
-    if len(tokens) <= 512:
-        result = model(text)[0]
-        rating = int(result['label'].split('_')[-1])
-        return {
-            'rating': rating,
-            'label': RATING_MAP[rating],
-            'score': result['score']
-        }
-    # Long text chunk processing
-    chunks = [tokens[i:i+chunk_size] for i in range(0, len(tokens), chunk_size)]
-    results = []
-    for chunk in chunks:
-        chunk_text = tokenizer.convert_tokens_to_string(chunk)
-        result = model(chunk_text)[0]
-        results.append(result)
-    # Aggregate results (majority vote + average confidence)
-    final_label = max(set(r['label'] for r in results),
-                     key=lambda x: sum(1 for r in results if r['label'] == x))
-    avg_score = sum(r['score'] for r in results) / len(results)
     return {
-        'rating': int(final_label.split('_')[-1]),
-        'label': RATING_MAP[int(final_label.split('_')[-1])],
-        'score': avg_score
     }
-def extract_aspects(text, model):
-    entities = model(text)
-    aspects = []
-    current_entity = ""
-    for entity in entities:
-        if entity['word'].startswith('##'):
-            current_entity += entity['word'][2:]
-        else:
-            if current_entity:
-                aspects.append({
-                    'entity': current_entity,
-                    'type': prev_type
-                })
-            current_entity = entity['word']
-            prev_type = entity['entity']
-    if current_entity:
-        aspects.append({
-            'entity': current_entity,
-            'type': prev_type
-        })
-    return [a for a in aspects if a['type'] in ['PRODUCT', 'ORG', 'PERSON']]
-def plot_sentiment_distribution(df):
-    fig, ax = plt.subplots()
-    counts = df['label'].value_counts()
-    for rating in RATING_MAP.values():
-        if rating not in counts.index:
-            counts[rating] = 0
-    counts = counts.loc[list(RATING_MAP.values())]
-    counts.plot.pie(
-        autopct='%1.1f%%',
-        colors=['#ff9999','#66b3ff','#99ff99'],
-        ax=ax
-    )
-    ax.set_ylabel('')
-    return fig
-def plot_wordcloud(negative_reviews):
-    text = " ".join(negative_reviews)
-    wordcloud = WordCloud(
-        width=800,
         height=400,
         background_color='white',
-        colormap='Reds'
-    ).generate(text)
     fig, ax = plt.subplots(figsize=(10, 5))
-    ax.imshow(wordcloud, interpolation='bilinear')
     ax.axis('off')
     return fig
 def main():
-    st.title("Restaurant Review Analyzer")
-    st.markdown("Using fine-tuned model for sentiment and aspect analysis")
-    sentiment_model, ner_model, tokenizer = load_models()
-    st.sidebar.header("Analysis Options")
-    analysis_mode = st.sidebar.radio(
-        "Select Mode",
-        ["Single Review", "Batch Analysis"]
-    )
-    if 'history' not in st.session_state:
-        st.session_state.history = pd.DataFrame(
-            columns=['text', 'rating', 'label', 'date', 'aspects']
-        )
-    if analysis_mode == "Single Review":
-        user_input = st.text_area("Enter or paste a restaurant review:", height=150)
-        if st.button("Analyze"):
-            if user_input:
-                with st.spinner("Analyzing..."):
-                    sentiment = analyze_sentiment(user_input, sentiment_model, tokenizer)
-                    aspects = extract_aspects(user_input, ner_model)
-                    new_entry = pd.DataFrame([{
-                        'text': user_input,
-                        'rating': sentiment['rating'],
-                        'label': sentiment['label'],
-                        'date': datetime.now(),
-                        'aspects': aspects
-                    }])
-                    st.session_state.history = pd.concat(
-                        [st.session_state.history, new_entry],
-                        ignore_index=True
-                    )
-                    st.subheader("Analysis Results")
-                    col1, col2 = st.columns(2)
-                    with col1:
-                        st.metric("Rating", sentiment['label'])
-                    with col2:
-                        st.metric("Confidence", f"{sentiment['score']:.2f}")
-                    if aspects:
-                        st.subheader("Identified Aspects")
-                        for aspect in aspects:
-                            st.markdown(f"- **{aspect['type']}**: `{aspect['entity']}`")
-                    else:
-                        st.info("No specific entities identified")
-            else:
-                st.warning("Please enter a review")
-    else:
-        uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
-        if uploaded_file:
-            df = pd.read_csv(uploaded_file)
-            if 'text' not in df.columns:
-                st.error("CSV must contain 'text' column")
-            else:
-                if st.button("Analyze All"):
-                    progress_bar = st.progress(0)
-                    results = []
-                    for i, row in enumerate(df.itertuples()):
-                        sentiment = analyze_sentiment(row.text, sentiment_model, tokenizer)
-                        aspects = extract_aspects(row.text, ner_model)
-                        results.append({
-                            'text': row.text,
-                            'rating': sentiment['rating'],
-                            'label': sentiment['label'],
-                            'date': datetime.now(),
-                            'aspects': aspects
-                        })
-                        progress_bar.progress((i + 1) / len(df))
-                    st.session_state.history = pd.concat(
-                        [st.session_state.history, pd.DataFrame(results)],
-                        ignore_index=True
-                    )
-                    st.success(f"Completed analysis of {len(df)} reviews")
-    if not st.session_state.history.empty:
-        st.divider()
-        st.header("Analysis History")
-        with st.expander("View Raw Data"):
-            st.dataframe(st.session_state.history)
-        st.subheader("Sentiment Distribution")
-        fig1 = plot_sentiment_distribution(st.session_state.history)
-        st.pyplot(fig1)
-        negative_reviews = st.session_state.history[
-            st.session_state.history['rating'] == 0
-        ]['text'].tolist()
-        if negative_reviews:
-            st.subheader("Negative Reviews Word Cloud")
-            fig2 = plot_wordcloud(negative_reviews)
-            st.pyplot(fig2)
         else:
-            st.info("No negative reviews yet")
-        if len(st.session_state.history) > 1:
-            st.subheader("Rating Trend Over Time")
-            time_df = st.session_state.history.copy()
-            time_df['date'] = pd.to_datetime(time_df['date'])
-            time_df = time_df.set_index('date').resample('D')['rating'].mean()
-            st.line_chart(time_df)
 if __name__ == "__main__":
     main()

 import streamlit as st
+from transformers import pipeline
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
 import pandas as pd
 from datetime import datetime
+from collections import Counter
+import re
+from nltk.corpus import stopwords
+import nltk
+# Download NLTK stopwords (first-time only)
+nltk.download('stopwords')
 # Constants
 RATING_MAP = {
         "text-classification",
         model="AndrewLi403/CustomModel_tripadvisor_finetuned"
     )
+    return sentiment_model
+def preprocess_text(text):
+    """Clean and tokenize English text"""
+    # Convert to lowercase
+    text = text.lower()
+    # Remove special characters
+    text = re.sub(r'[^\w\s]', '', text)
+    # Tokenize
+    words = text.split()
+    # Remove stopwords
+    stop_words = set(stopwords.words('english'))
+    words = [w for w in words if w not in stop_words and len(w) > 2]
+    return words
+def analyze_sentiment(text, model):
+    result = model(text)[0]
+    rating = int(result['label'].split('_')[-1])
     return {
+        'rating': rating,
+        'label': RATING_MAP[rating],
+        'score': result['score']
     }
+def generate_wordcloud(text, sentiment):
+    """Generate word cloud from English text"""
+    words = preprocess_text(text)
+    word_freq = Counter(words)
+    wc = WordCloud(
+        width=800,
         height=400,
         background_color='white',
+        colormap='Reds' if sentiment['rating'] == 0 else 'Greens',
+        collocations=False  # Better for single documents
+    ).generate_from_frequencies(word_freq)
     fig, ax = plt.subplots(figsize=(10, 5))
+    ax.imshow(wc, interpolation='bilinear')
     ax.axis('off')
     return fig
+def display_top_keywords(text, n=10):
+    """Show most frequent keywords"""
+    words = preprocess_text(text)
+    counter = Counter(words)
+    top_words = counter.most_common(n)
+    st.subheader(f"Top {n} Keywords")
+    cols = st.columns(2)
+    for i, (word, count) in enumerate(top_words):
+        cols[i%2].metric(f"{word.title()}", f"{count} mentions")
 def main():
+    st.title("Tripadvisor Hotel Review Analyzer")
+    st.markdown("Instant sentiment and keyword analysis for English reviews")
+    if 'model' not in st.session_state:
+        st.session_state.model = load_models()
+    user_input = st.text_area("Paste your English review here:", height=150)
+    if st.button("Analyze Review"):
+        if user_input:
+            with st.spinner("Analyzing..."):
+                # Sentiment analysis
+                sentiment = analyze_sentiment(user_input, st.session_state.model)
+                # Display results
+                st.subheader("Analysis Results")
+                col1, col2 = st.columns(2)
+                with col1:
+                    st.metric("Overall Rating", sentiment['label'])
+                with col2:
+                    st.metric("Confidence Score", f"{sentiment['score']:.0%}")
+                # Generate visualizations
+                st.subheader("Keyword Visualization")
+                tab1, tab2 = st.tabs(["Word Cloud", "Top Keywords"])
+                with tab1:
+                    fig = generate_wordcloud(user_input, sentiment)
+                    st.pyplot(fig)
+                with tab2:
+                    display_top_keywords(user_input)
+                # Store in session history
+                if 'history' not in st.session_state:
+                    st.session_state.history = []
+                st.session_state.history.append({
+                    'text': user_input[:100] + "..." if len(user_input) > 100 else user_input,
+                    'rating': sentiment['rating'],
+                    'date': datetime.now().strftime("%Y-%m-%d %H:%M")
+                })
         else:
+            st.warning("Please enter a review to analyze")
+    # Display history if exists
+    if 'history' in st.session_state and st.session_state.history:
+        st.divider()
+        with st.expander("Recent Analyses (Last 5)"):
+            history_df = pd.DataFrame(st.session_state.history[-5:])
+            st.dataframe(
+                history_df,
+                column_config={
+                    "text": "Review Excerpt",
+                    "rating": st.column_config.NumberColumn(
+                        "Rating",
+                        format="%d ⭐",
+                    ),
+                    "date": "Analyzed At"
+                },
+                hide_index=True
+            )
 if __name__ == "__main__":
     main()