Spaces:

bacancydataprophets
/

AI-Generated_FAQs

Sleeping

App Files Files Community

prathampatel1 committed on Jun 2

Commit

31e11f7

verified ·

1 Parent(s): acdb85e

Upload 3 files

Browse files

Files changed (3) hide show

app.py +665 -0
davids_bridal_middletown_reviews.csv +0 -0
scraper.py +451 -0

app.py ADDED Viewed

	@@ -0,0 +1,665 @@

+import re
+import os
+import streamlit as st
+import pandas as pd
+import json
+from typing import List, Dict
+from groq import Groq
+import time
+from dotenv import load_dotenv
+import math
+from collections import Counter
+# Load environment variables from a local .env file; main() later reads GROQ_API_KEY via os.getenv
+load_dotenv()
+# Module-level placeholder. main() assigns its own local `reviews_data` and the class methods
+# receive reviews as a parameter, so nothing in the visible code reads this dict.
+reviews_data = {}
+# Configure the Streamlit page (browser tab title/icon, wide layout, sidebar open on load)
+st.set_page_config(
+    page_title="AI FAQ Generator",
+    page_icon="🤖",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+class OptimizedFAQGenerator:
+    def __init__(self, api_key: str):
+        """Initialize the FAQ Generator with Groq API key."""
+        self.client = Groq(api_key=api_key)
+        self.model = "llama3-8b-8192"  # Fast and efficient model
+        self.batch_size = 100  # Process reviews in batches of 100
+        self.max_text_length = 3000  # Maximum text length per API call
+    def chunk_reviews_by_size(self, reviews_data: List[Dict], max_chars: int = 3000) -> List[List[Dict]]:
+        """Chunk reviews by character count to stay within API limits."""
+        chunks = []
+        current_chunk = []
+        current_length = 0
+        for review in reviews_data:
+            review_text = review.get('review_text', '')
+            review_length = len(review_text) + 50  # Add buffer for formatting
+            # If adding this review would exceed the limit, start a new chunk
+            if current_length + review_length > max_chars and current_chunk:
+                chunks.append(current_chunk)
+                current_chunk = [review]
+                current_length = review_length
+            else:
+                current_chunk.append(review)
+                current_length += review_length
+        # Add the last chunk if it has content
+        if current_chunk:
+            chunks.append(current_chunk)
+        return chunks
+    def extract_keywords_from_batch(self, review_batch: List[Dict]) -> List[str]:
+        """Extract keywords from a batch of reviews."""
+        # Combine review texts from the batch
+        batch_text = " ".join([review.get('review_text', '') for review in review_batch if review.get('review_text')])
+        # Truncate if too long
+        if len(batch_text) > self.max_text_length:
+            batch_text = batch_text[:self.max_text_length]
+        prompt = f"""
+        Analyze these customer reviews and extract SEO keywords and phrases for a business website.
+        IMPORTANT RULES:
+        1. DO NOT include any specific brand names or business names
+        2. Focus on generic industry terms and services
+        3. Extract keywords that any similar business could use
+        4. Focus on customer pain points and solutions
+        Extract keywords for:
+        - Products and services mentioned (generic terms only)
+        - Common customer concerns and questions
+        - Industry terminology
+        - Customer experience themes
+        - Service quality aspects
+        Reviews:
+        {batch_text}
+        Return exactly 15 relevant SEO keywords/phrases, one per line, without numbering or bullets.
+        Use generic terms that any business in this industry could use.
+        """
+        try:
+            response = self.client.chat.completions.create(
+                messages=[{"role": "user", "content": prompt}],
+                model=self.model,
+                temperature=0.3,
+                max_tokens=400
+            )
+            keywords = [kw.strip() for kw in response.choices[0].message.content.strip().split('\n') if kw.strip()]
+            return keywords[:15]  # Limit to 15 keywords per batch
+        except Exception as e:
+            st.error(f"Error extracting keywords from batch: {str(e)}")
+            return []
+    def extract_seo_keywords(self, reviews_data: List[Dict]) -> List[str]:
+        """Extract SEO keywords from all reviews using batch processing."""
+        st.info(f"Processing {len(reviews_data)} reviews in batches...")
+        # Create progress bar
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+        # Chunk reviews by character count
+        review_chunks = self.chunk_reviews_by_size(reviews_data, self.max_text_length)
+        all_keywords = []
+        for i, chunk in enumerate(review_chunks):
+            status_text.text(f"Processing batch {i+1}/{len(review_chunks)} ({len(chunk)} reviews)...")
+            # Extract keywords from this batch
+            batch_keywords = self.extract_keywords_from_batch(chunk)
+            all_keywords.extend(batch_keywords)
+            # Update progress
+            progress_bar.progress((i + 1) / len(review_chunks))
+            # Small delay to avoid rate limiting
+            time.sleep(0.5)
+        # Count keyword frequency and get top keywords
+        keyword_counts = Counter(all_keywords)
+        top_keywords = [kw for kw, count in keyword_counts.most_common(25)]
+        progress_bar.empty()
+        status_text.empty()
+        st.success(f"Extracted {len(top_keywords)} unique keywords from {len(review_chunks)} batches")
+        return top_keywords
+    def get_review_insights(self, reviews_data: List[Dict]) -> Dict:
+        """Extract insights from reviews for better FAQ generation."""
+        # Sample reviews for analysis
+        sample_size = min(50, len(reviews_data))
+        sample_reviews = reviews_data[:sample_size]
+        insights = {
+            'total_reviews': len(reviews_data),
+            'avg_rating': sum(int(r.get('rating', 0)) for r in reviews_data) / len(reviews_data),
+            'positive_reviews': sum(1 for r in reviews_data if int(r.get('rating', 0)) >= 4),
+            'common_themes': [],
+            'pain_points': [],
+            'positive_aspects': []
+        }
+        # Analyze positive vs negative reviews
+        positive_reviews = [r for r in sample_reviews if int(r.get('rating', 0)) >= 4]
+        negative_reviews = [r for r in sample_reviews if int(r.get('rating', 0)) <= 2]
+        insights['sample_positive'] = positive_reviews[:5]
+        insights['sample_negative'] = negative_reviews[:3]
+        return insights
+    def clean_json_response(self, response_text: str) -> str:
+        """Clean and extract JSON from AI response."""
+        # Remove markdown code blocks
+        response_text = re.sub(r'```json\s*', '', response_text)
+        response_text = re.sub(r'```\s*', '', response_text)
+        # Find the JSON array
+        json_start = response_text.find('[')
+        json_end = response_text.rfind(']') + 1
+        if json_start != -1 and json_end > json_start:
+            json_content = response_text[json_start:json_end]
+            # Clean common JSON issues
+            json_content = re.sub(r'\n\s*', ' ', json_content)  # Remove newlines and extra spaces
+            json_content = re.sub(r'"\s*,\s*"', '", "', json_content)  # Fix spacing around commas
+            json_content = re.sub(r'}\s*,\s*{', '}, {', json_content)  # Fix object separators
+            return json_content
+        return None
+    def generate_faqs(self, keywords: List[str], reviews_data: List[Dict], num_faqs: int = 20) -> List[Dict]:
+        """Generate FAQs based on SEO keywords and review insights."""
+        # Get review insights
+        insights = self.get_review_insights(reviews_data)
+        # Create sample review context (limit to prevent token overflow)
+        sample_reviews = []
+        for review in insights['sample_positive']:
+            sample_reviews.append(f"Rating: {review.get('rating', 'N/A')}/5 - {review.get('review_text', '')[:150]}...")
+        for review in insights['sample_negative']:
+            sample_reviews.append(f"Rating: {review.get('rating', 'N/A')}/5 - {review.get('review_text', '')[:150]}...")
+        sample_context = "\n".join(sample_reviews[:8])  # Limit to 8 samples
+        # Limit FAQs to maximum of 30
+        num_faqs = min(num_faqs, 15)
+        prompt = f"""
+        Based on the following SEO keywords and customer review insights, generate exactly {num_faqs} comprehensive FAQ pairs for a business website.
+        CRITICAL REQUIREMENTS:
+        1. DO NOT use any specific brand names or business names in questions or answers
+        2. Use generic terms like "our store", "our business", "our team", "our services"
+        3. Focus on universal customer concerns and solutions
+        SEO Keywords: {', '.join(keywords[:20])}
+        Business Insights:
+        - Total Reviews Analyzed: {insights['total_reviews']}
+        - Average Rating: {insights['avg_rating']:.1f}/5
+        - Positive Reviews: {insights['positive_reviews']}/{insights['total_reviews']}
+        Sample Customer Feedback:
+        {sample_context}
+        IMPORTANT: Respond with ONLY a valid JSON array. No additional text or markdown.
+        Format:
+        [
+          {{
+            "question": "Why should I choose your business for my needs?",
+            "answer": "Our experienced team provides personalized service with attention to detail. We focus on understanding your specific requirements and delivering solutions that exceed expectations, backed by our commitment to quality and customer satisfaction."
+          }}
+        ]
+        """
+        try:
+            st.info("Generating FAQs with AI...")
+            response = self.client.chat.completions.create(
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant that generates brand-neutral JSON responses for FAQ content. Always respond with valid JSON only, without any brand names."},
+                    {"role": "user", "content": prompt}
+                ],
+                model=self.model,
+                temperature=0.2,  # Lower temperature for more consistent output
+                max_tokens=3000   # Increased for more FAQs
+            )
+            # Get the response content
+            content = response.choices[0].message.content.strip()
+            # Clean and extract JSON
+            json_content = self.clean_json_response(content)
+            if json_content:
+                try:
+                    faqs = json.loads(json_content)
+                    # Validate that it's a list of dictionaries with required keys
+                    if isinstance(faqs, list) and all(isinstance(faq, dict) and 'question' in faq and 'answer' in faq for faq in faqs):
+                        # Limit to requested number
+                        return faqs[:num_faqs]
+                    else:
+                        st.warning("Invalid FAQ format received, using fallback")
+                        return self._get_fallback_faqs(num_faqs)
+                except json.JSONDecodeError as e:
+                    st.error(f"JSON parsing error: {str(e)}")
+                    return self._get_fallback_faqs(num_faqs)
+            else:
+                st.warning("Could not extract JSON from response, using fallback")
+                return self._get_fallback_faqs(num_faqs)
+        except Exception as e:
+            st.error(f"Error generating FAQs: {str(e)}")
+            return self._get_fallback_faqs(num_faqs)
+    def _get_fallback_faqs(self, num_faqs: int = 20) -> List[Dict]:
+        """Fallback FAQs if API fails - brand neutral and organized by categories."""
+        base_faqs = [
+            # Why choose us questions
+            {
+                "question": "Why should I choose your business over competitors?",
+                "answer": "Our experienced team provides personalized service with attention to detail and a commitment to customer satisfaction. We take time to understand your specific needs and work with you throughout the entire process to ensure you're completely happy with the results."
+            },
+            {
+                "question": "What makes your customer service different?",
+                "answer": "We pride ourselves on patient, welcoming service where customers never feel rushed. Our team focuses on creating a comfortable experience while providing expert guidance to help you make the best decisions for your needs."
+            },
+            {
+                "question": "How experienced is your team?",
+                "answer": "Our team consists of experienced professionals who are passionate about helping customers achieve their goals. We stay updated with the latest trends and techniques to provide you with the best possible service and advice."
+            },
+            # Problem-solving questions
+            {
+                "question": "How do you help customers who feel overwhelmed by choices?",
+                "answer": "Our knowledgeable staff guides you through the selection process based on your preferences, budget, and specific needs. We take time to understand your vision and narrow down options so you can make decisions with confidence."
+            },
+            {
+                "question": "What if I'm not satisfied with the results?",
+                "answer": "Customer satisfaction is our top priority. We work closely with you throughout the process and make adjustments as needed to ensure you're completely happy with the final outcome. Our team is committed to making things right."
+            },
+            {
+                "question": "How do you handle sizing and fit issues?",
+                "answer": "Our professional team provides expert fitting services and makes necessary adjustments to ensure perfect results. We take precise measurements and work with you through multiple fittings if needed to achieve the ideal fit."
+            },
+            # Service questions
+            {
+                "question": "What services do you offer besides your main products?",
+                "answer": "In addition to our primary offerings, we provide professional consultation, customization services, and ongoing support. We also offer accessories and complementary products to complete your experience with us."
+            },
+            {
+                "question": "Do you provide consultation services?",
+                "answer": "Yes, we offer personalized consultations where our experts help you explore options, provide styling advice, and ensure you make choices that align with your vision and budget. These consultations are designed to make your experience as smooth as possible."
+            },
+            {
+                "question": "What additional products and accessories do you carry?",
+                "answer": "We offer a comprehensive selection of complementary products and accessories to complete your needs. Our team can help coordinate everything to ensure a cohesive and polished final result."
+            },
+            # Process questions
+            {
+                "question": "Do I need an appointment or can I walk in?",
+                "answer": "While we welcome walk-ins when possible, we highly recommend scheduling an appointment to ensure you receive dedicated attention and personalized service. Appointments allow us to prepare for your visit and provide the best possible experience."
+            },
+            {
+                "question": "How long does the typical process take?",
+                "answer": "The timeline varies depending on your specific needs, but we work with you to establish realistic expectations from the start. Our team keeps you informed throughout the process and ensures everything is completed according to your schedule."
+            },
+            {
+                "question": "What should I expect during my first visit?",
+                "answer": "During your initial visit, we'll discuss your needs, preferences, and budget. Our team will guide you through available options, provide expert recommendations, and create a plan tailored to your specific requirements."
+            },
+            # Quality questions
+            {
+                "question": "How do you ensure quality in your products and services?",
+                "answer": "We maintain high standards through careful selection of products, skilled craftsmanship, and thorough quality checks. Our experienced team pays attention to every detail to ensure you receive exceptional results that meet our quality standards."
+            },
+            {
+                "question": "What is your experience with customers who have specific requirements?",
+                "answer": "Our team has extensive experience working with diverse customer needs and preferences. We pride ourselves on our ability to accommodate special requirements and provide customized solutions that exceed expectations."
+            },
+            {
+                "question": "How do you stay current with industry trends?",
+                "answer": "Our team continuously educates themselves on the latest trends, techniques, and products in the industry. We attend training sessions and stay connected with industry developments to provide you with current options and expert advice."
+            },
+            # Additional comprehensive questions
+            {
+                "question": "What price ranges do you offer?",
+                "answer": "We offer options across various price points to accommodate different budgets. Our team can help you find quality solutions within your budget and provide transparent pricing information upfront so you can make informed decisions."
+            },
+            {
+                "question": "Do you offer payment plans or financing options?",
+                "answer": "Yes, we understand that significant purchases require financial planning. We offer flexible payment options and financing plans to make our services more accessible and help you achieve your goals within your budget."
+            },
+            {
+                "question": "How far in advance should I start planning?",
+                "answer": "We recommend starting the process several months in advance to allow adequate time for consultation, selection, customization, and any necessary adjustments. Early planning ensures the best selection and reduces stress as your important date approaches."
+            },
+            {
+                "question": "Do you work with customers who have time constraints?",
+                "answer": "Absolutely! We understand that sometimes timelines are tight, and we're experienced in working efficiently to meet urgent deadlines. Our team will discuss your timeline and work diligently to accommodate your schedule while maintaining quality standards."
+            },
+            {
+                "question": "What sets your customer experience apart?",
+                "answer": "We focus on creating a welcoming, pressure-free environment where customers feel comfortable and supported. Our personalized approach, attention to detail, and commitment to customer satisfaction ensure that your experience with us is positive and memorable."
+            },
+            {
+                "question": "How do you handle special requests or customizations?",
+                "answer": "We welcome special requests and customizations as part of our personalized service approach. Our skilled team works with you to understand your vision and explore options for creating something unique that perfectly meets your specific needs and preferences."
+            }
+        ]
+        # Return the requested number of FAQs, up to the available amount
+        return base_faqs[:min(num_faqs, len(base_faqs))]
+def load_sample_data():
+    """Load sample data if no file is uploaded."""
+    sample_data = [
+        {
+            "reviewer_name": "Customer A",
+            "rating": 5,
+            "date": "3 months ago",
+            "review_text": "This past August, I went to the bridal store to look for my dream wedding dress and I found It! I was looking for an elegant, simple, and classic dress. The consultant was extremely helpful, patient, and sweet. The alterations team did a great job making sure I was happy with the alterations done to my dress.",
+            "owner_response": "Thank you so much for taking the time to leave this excellent review!",
+        },
+        {
+            "reviewer_name": "Customer B",
+            "rating": 5,
+            "date": "2 months ago",
+            "review_text": "A very special shout out to the consultant who made my daughters dress shopping so special. Never rushed her and only everything to accommodate her until she found the right dress to say yes to.",
+            "owner_response": "",
+        },
+        {
+            "reviewer_name": "Customer C",
+            "rating": 5,
+            "date": "1 month ago",
+            "review_text": "My wedding dress shopping experience was beyond amazing. The consultant was so wonderful to work with. Everyone was so sweet & welcoming when we walked in. She made me feel so comfortable as we tried on many different dresses.",
+            "owner_response": "Thank you for the wonderful review!",
+        }
+    ]
+    return sample_data
+def main():
+    st.title("🤖 AI FAQ Generator")
+    st.markdown("Generate SEO-optimized, FAQs from customer reviews")
+    # Sidebar for configuration
+    with st.sidebar:
+        st.header("⚙️ Configuration")
+        # API Key input
+        api_key = os.getenv("GROQ_API_KEY")
+        if not api_key:
+            st.warning("Please enter your Groq API key to use AI features")
+            st.markdown("Get your API key from [Groq Console](https://console.groq.com)")
+        st.divider()
+        # FAQ Configuration
+        st.subheader("FAQ Settings")
+        num_faqs = st.slider("Number of FAQs to generate", 5, 15, 10, 1, help="Select how many FAQs to generate based on the reviews")
+        if num_faqs < 5:
+            st.warning("Generating fewer than 5 FAQs may not provide enough coverage of customer concerns")
+        elif num_faqs > 15:
+            st.warning("Generating more than 15 FAQs may lead to less focused content")
+        st.info(f"Will generate {num_faqs} FAQs")
+        st.divider()
+        # File upload
+        uploaded_file = st.file_uploader(
+            "Upload Reviews CSV",
+            type=['csv'],
+            help="Upload a CSV file with customer reviews (supports large files)"
+        )
+        # Use sample data option
+        use_sample = st.checkbox("Use sample data", value=False)
+    # Load data
+    if uploaded_file is not None:
+        try:
+            with st.spinner("Loading CSV file..."):
+                df = pd.read_csv(uploaded_file)
+                # Data cleaning
+                df['rating'] = df['rating'].astype(str)
+                df = df.drop(columns=['review_id', 'scraped_at'], axis=1, errors='ignore')
+                # Remove empty reviews
+                df = df.dropna(subset=['review_text'])
+                df = df[df['review_text'].str.strip() != '']
+                reviews_data = df.to_dict('records')
+            st.success(f"✅ Loaded {len(reviews_data)} reviews from uploaded file")
+            # Show file statistics
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.metric("Total Reviews", len(reviews_data))
+            with col2:
+                avg_rating = sum(int(r.get('rating', 0)) for r in reviews_data if r.get('rating', '0').isdigit()) / len([r for r in reviews_data if r.get('rating', '0').isdigit()])
+                st.metric("Average Rating", f"{avg_rating:.1f}")
+            with col3:
+                positive_reviews = sum(1 for r in reviews_data if r.get('rating', '0').isdigit() and int(r.get('rating', 0)) >= 4)
+                st.metric("Positive Reviews", f"{positive_reviews}/{len(reviews_data)}")
+        except Exception as e:
+            st.error(f"Error loading file: {str(e)}")
+            st.info("Please ensure your CSV has columns: 'review_text', 'rating'")
+            reviews_data = load_sample_data()
+    elif use_sample:
+        reviews_data = load_sample_data()
+        st.info("Using sample data for demonstration")
+    else:
+        st.warning("Please upload a CSV file or use sample data")
+        return
+    # Display data overview
+    if reviews_data:
+        with st.expander("📊 Data Overview", expanded=False):
+            # Sample reviews preview
+            st.subheader("Sample Reviews")
+            for i, review in enumerate(reviews_data[:3]):
+                with st.container():
+                    st.write(f"**{review.get('reviewer_name', 'Anonymous')}** - {review.get('rating', 'N/A')} ⭐")
+                    st.write(review.get('review_text', 'No review text')[:300] + "...")
+                    if i < 2:  # Don't show divider after last item
+                        st.divider()
+    # Generate FAQs
+    if api_key and reviews_data:
+        st.header("🚀 Generate AI-Powered FAQs")
+        col1, col2 = st.columns(2)
+        with col1:
+            if st.button("🤖 Generate Keywords & FAQs with AI", type="primary"):
+                start_time = time.time()
+                with st.spinner("Processing large dataset..."):
+                    # Initialize FAQ generator
+                    faq_gen = OptimizedFAQGenerator(api_key)
+                    # Extract keywords with batch processing
+                    st.info("🔍 Extracting SEO keywords from all reviews...")
+                    keywords = faq_gen.extract_seo_keywords(reviews_data)
+                    # Store in session state
+                    st.session_state.keywords = keywords
+                    # Generate FAQs
+                    st.info("📝 Generating brand-neutral FAQs...")
+                    faqs = faq_gen.generate_faqs(keywords, reviews_data, num_faqs)
+                    # Store in session state
+                    st.session_state.faqs = faqs
+                    generation_time = time.time() - start_time
+                    st.session_state.generation_time = generation_time
+                    st.success(f"✅ Generated {len(faqs)} FAQs in {generation_time:.1f} seconds!")
+        # with col2:
+        #     if st.button("⚡ Use Quick Fallback"):
+        #         faq_gen = OptimizedFAQGenerator("")  # Empty API key for fallback
+        #         st.session_state.keywords = ["customer service", "quality products", "professional consultation", "experienced team", "customer satisfaction"]
+        #         st.session_state.faqs = faq_gen._get_fallback_faqs(num_faqs)
+        #         st.info("Using pre-built content")
+    # Display results
+    if hasattr(st.session_state, 'keywords') and hasattr(st.session_state, 'faqs'):
+        # Performance metrics
+        if hasattr(st.session_state, 'generation_time'):
+            st.info(f"⏱️ Generation completed in {st.session_state.generation_time:.1f} seconds")
+        st.header("📈 Extracted SEO Keywords")
+        # Display keywords in a nice format
+        keywords = st.session_state.keywords
+        # Show keywords in columns
+        cols = st.columns(3)
+        for i, keyword in enumerate(keywords):
+            with cols[i % 3]:
+                st.markdown(f"`{keyword}`")
+        st.header("❓ Generated Brand-Neutral FAQs")
+        st.info(f"Generated {len(st.session_state.faqs)} FAQs that can be used by any business in this industry")
+        # Display FAQs with search functionality
+        search_term = st.text_input("🔍 Search FAQs", placeholder="Enter keywords to filter FAQs...")
+        faqs = st.session_state.faqs
+        # Filter FAQs if search term is provided
+        if search_term:
+            filtered_faqs = [
+                faq for faq in faqs
+                if search_term.lower() in faq.get('question', '').lower()
+                or search_term.lower() in faq.get('answer', '').lower()
+            ]
+            st.info(f"Showing {len(filtered_faqs)} FAQs matching '{search_term}'")
+            faqs_to_show = filtered_faqs
+        else:
+            faqs_to_show = faqs
+        # Display FAQs
+        for i, faq in enumerate(faqs_to_show):
+            with st.expander(f"FAQ {i+1}: {faq.get('question', 'No question')}", expanded=False):
+                st.subheader("Question:")
+                st.write(faq.get('question', 'No question'))
+                st.subheader("Answer:")
+                st.write(faq.get('answer', 'No answer'))
+        # Export options
+        st.header("📥 Export Options")
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            # Export as JSON
+            export_data = {
+                "metadata": {
+                    "generated_at": time.strftime("%Y-%m-%d %H:%M:%S"),
+                    "total_reviews_analyzed": len(reviews_data),
+                    "generation_time_seconds": getattr(st.session_state, 'generation_time', 0),
+                    "brand_neutral": True
+                },
+                "keywords": keywords,
+                "faqs": faqs
+            }
+            st.download_button(
+                label="📄 Download JSON",
+                data=json.dumps(export_data, indent=2),
+                file_name="faqs.json",
+                mime="application/json"
+            )
+        with col2:
+            # Export as CSV
+            faq_df = pd.DataFrame(faqs)
+            csv_data = faq_df.to_csv(index=False)
+            st.download_button(
+                label="📊 Download CSV",
+                data=csv_data,
+                file_name="faqs.csv",
+                mime="text/csv"
+            )
+        with col3:
+            # Export as HTML
+            html_content = f"""
+            <!DOCTYPE html>
+            <html>
+            <head>
+                <title>Brand-Neutral FAQs</title>
+                <style>
+                    body {{ font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; }}
+                    .faq {{ margin-bottom: 30px; border-left: 4px solid #007bff; padding-left: 20px; }}
+                    .question {{ font-weight: bold; font-size: 18px; color: #333; margin-bottom: 10px; }}
+                    .answer {{ color: #666; }}
+                    .header {{ background: #f8f9fa; padding: 20px; border-radius: 5px; margin-bottom: 30px; }}
+                </style>
+            </head>
+            <body>
+                <div class="header">
+                    <h1>Brand-Neutral FAQs</h1>
+                    <p><strong>Generated:</strong> {time.strftime("%Y-%m-%d %H:%M:%S")}</p>
+                    <p><strong>Reviews Analyzed:</strong> {len(reviews_data)}</p>
+                    <p><strong>Keywords:</strong> {', '.join(keywords[:10])}...</p>
+                </div>
+            """
+            for i, faq in enumerate(faqs, 1):
+                html_content += f"""
+                <div class="faq">
+                    <div class="question">{i}. {faq.get('question', '')}</div>
+                    <div class="answer">{faq.get('answer', '')}</div>
+                </div>
+                """
+            html_content += "</body></html>"
+            st.download_button(
+                label="🌐 Download HTML",
+                data=html_content,
+                file_name="faqs.html",
+                mime="text/html"
+            )
+    # Footer
+    st.markdown("---")
+    st.markdown("**Features:** Batch processing for large datasets • Brand-neutral content • SEO optimization • Multiple export formats")
+if __name__ == "__main__":
+    main()

davids_bridal_middletown_reviews.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

scraper.py ADDED Viewed

	@@ -0,0 +1,451 @@

+#!/usr/bin/env python3
+"""
+Enhanced Google Maps Reviews Scraper for David's Bridal
+Scrapes reviews from Google Maps with parallel processing and improved element detection
+"""
+import csv
+import time
+import random
+import asyncio
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException, ElementClickInterceptedException
+from webdriver_manager.chrome import ChromeDriverManager
+import pandas as pd
+from datetime import datetime
+import logging
+import sys
+import threading
+from queue import Queue
+# Set up logging: INFO level, each record rendered as "<timestamp> - <level> - <message>"
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# Module-level logger shared by the scraper class and main() in this file
+logger = logging.getLogger(__name__)
+class EnhancedGoogleMapsReviewsScraper:
+    def __init__(self, headless=True, wait_time=10, max_workers=3):
+        """Initialize the scraper with Chrome driver options"""
+        self.wait_time = wait_time
+        self.max_workers = max_workers
+        self.reviews_queue = Queue()
+        self.processed_reviews = []
+        self.lock = threading.Lock()
+        self.setup_driver(headless)
+    def setup_driver(self, headless):
+        """Set up Chrome driver with appropriate options"""
+        try:
+            chrome_options = Options()
+            if headless:
+                chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+            chrome_options.add_argument("--disable-extensions")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--remote-debugging-port=9222")
+            chrome_options.add_argument("--window-size=1920,1080")
+            chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+            chrome_options.add_experimental_option('useAutomationExtension', False)
+            chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+            logger.info("Setting up ChromeDriver...")
+            service = Service(ChromeDriverManager().install())
+            self.driver = webdriver.Chrome(service=service, options=chrome_options)
+            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
+            self.wait = WebDriverWait(self.driver, self.wait_time)
+            logger.info("ChromeDriver setup successful")
+        except WebDriverException as e:
+            logger.error(f"Failed to setup ChromeDriver: {e}")
+            sys.exit(1)
+    def search_location(self, query):
+        """Search for David's Bridal location on Google Maps"""
+        try:
+            search_url = f"https://www.google.com/maps/search/{query.replace(' ', '+')}"
+            logger.info(f"Navigating to: {search_url}")
+            self.driver.get(search_url)
+            # Wait for page to load
+            time.sleep(5)
+            # Look for search results
+            result_selectors = [
+                "button.hh2c6.G7m0Af",  # Button with class for location
+            ]
+            result_found = False
+            for selector in result_selectors:
+                try:
+                    first_result = self.wait.until(
+                        EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
+                    )
+                    self.driver.execute_script("arguments[0].click();", first_result)
+                    time.sleep(3)
+                    result_found = True
+                    break
+                except TimeoutException:
+                    continue
+            return result_found
+        except Exception as e:
+            logger.error(f"Error in search_location: {e}")
+            return False
+    def click_reviews_tab(self):
+        """Click on the reviews tab using the specific element structure"""
+        try:
+            # Wait for the reviews tab to be clickable
+            reviews_button = self.wait.until(
+                EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='1'][aria-label*='Reviews']"))
+            )
+            # Scroll the button into view
+            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", reviews_button)
+            time.sleep(1)
+            # Click the reviews button
+            self.driver.execute_script("arguments[0].click();", reviews_button)
+            logger.info("Successfully clicked reviews tab")
+            # Wait for reviews to load
+            time.sleep(3)
+            return True
+        except Exception as e:
+            logger.error(f"Could not click reviews tab: {e}")
+            return False
+    def expand_review_text(self, review_element):
+        """Expand review text by clicking 'More' button if present"""
+        try:
+            # Look for the 'More' button within this review
+            more_button = review_element.find_element(
+                By.CSS_SELECTOR,
+                "button.w8nwRe.kyuRq[aria-label='See more']"
+            )
+            # Scroll button into view and click
+            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", more_button)
+            time.sleep(0.5)
+            self.driver.execute_script("arguments[0].click();", more_button)
+            time.sleep(1)  # Wait for text to expand
+            return True
+        except NoSuchElementException:
+            # No 'More' button found - review is already fully visible
+            return False
+        except Exception as e:
+            logger.warning(f"Error expanding review text: {e}")
+            return False
+    def scroll_and_load_reviews(self, target_count=5000):
+        """Scroll through reviews to load all available reviews"""
+        try:
+            scrollable_container = self.driver.find_element(By.CSS_SELECTOR, "div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde")
+            last_review_count = 0
+            stagnant_rounds = 0
+            max_stagnant_rounds = 5
+            scroll_attempts = 0
+            max_scroll_attempts = 1000  # increased max
+            while scroll_attempts < max_scroll_attempts:
+                # Scroll down
+                self.driver.execute_script(
+                    "arguments[0].scrollTo(0, arguments[0].scrollHeight);",
+                    scrollable_container
+                )
+                # Wait for content to load
+                time.sleep(random.uniform(2, 4))
+                # Count reviews
+                current_reviews = len(self.driver.find_elements(By.CSS_SELECTOR, "div[data-review-id]"))
+                logger.info(f"Attempt {scroll_attempts + 1}: Loaded {current_reviews} reviews (target: {target_count})")
+                # Check if we’ve hit the target
+                if current_reviews >= target_count:
+                    logger.info("Reached target review count.")
+                    break
+                # Check if no new reviews are loading
+                if current_reviews == last_review_count:
+                    stagnant_rounds += 1
+                    logger.info(f"No new reviews this round. Stagnant rounds: {stagnant_rounds}/{max_stagnant_rounds}")
+                    if stagnant_rounds >= max_stagnant_rounds:
+                        logger.info("No new reviews after several attempts. Stopping scroll.")
+                        break
+                else:
+                    stagnant_rounds = 0  # reset if progress made
+                last_review_count = current_reviews
+                scroll_attempts += 1
+                # Occasionally wait longer to mimic human behavior
+                if scroll_attempts % 10 == 0:
+                    logger.info("Taking a longer pause to mimic human browsing...")
+                    time.sleep(random.uniform(5, 8))
+            logger.info(f"Finished scrolling. Total reviews found: {current_reviews}")
+            return current_reviews
+        except Exception as e:
+            logger.error(f"Error scrolling reviews: {e}")
+            return 0
+    def extract_single_review_data(self, review_element):
+        """Extract data from a single review element"""
+        try:
+            review_data = {}
+            # First, try to expand the review text if there's a 'More' button
+            self.expand_review_text(review_element)
+            # Extract reviewer name
+            try:
+                name_element = review_element.find_element(By.CSS_SELECTOR, "div[class*='d4r55']")
+                review_data['reviewer_name'] = name_element.text.strip()
+            except NoSuchElementException:
+                review_data['reviewer_name'] = "Anonymous"
+            # Extract rating
+            try:
+                rating_element = review_element.find_element(By.CSS_SELECTOR, "span[role='img'][aria-label*='star']")
+                rating_text = rating_element.get_attribute('aria-label')
+                review_data['rating'] = self.extract_rating_from_text(rating_text)
+            except NoSuchElementException:
+                review_data['rating'] = None
+            # Extract review text using the specific selector you provided
+            try:
+                text_element = review_element.find_element(By.CSS_SELECTOR, "span.wiI7pd")
+                review_data['review_text'] = text_element.text.strip()
+            except NoSuchElementException:
+                review_data['review_text'] = ""
+            # Extract date
+            try:
+                date_element = review_element.find_element(By.CSS_SELECTOR, "span.rsqaWe")
+                review_data['date'] = date_element.text.strip()
+            except NoSuchElementException:
+                review_data['date'] = ""
+            # Extract owner response if any
+            try:
+                response_element = review_element.find_element(By.CSS_SELECTOR, "div[class*='wiI7pd']")
+                review_data['owner_response'] = response_element.text.strip()
+            except NoSuchElementException:
+                review_data['owner_response'] = ""
+            # Add metadata
+            review_data['scraped_at'] = datetime.now().isoformat()
+            review_data['review_id'] = review_element.get_attribute('data-review-id') or f"review_{int(time.time() * 1000)}"
+            return review_data
+        except Exception as e:
+            logger.error(f"Error extracting single review: {e}")
+            return None
+    def extract_rating_from_text(self, text):
+        """Extract numeric rating from aria-label text"""
+        if not text:
+            return None
+        import re
+        # Look for patterns like "5 stars", "Rated 4 out of 5 stars"
+        match = re.search(r'(\d+)\s*(?:out of \d+\s*)?stars?', text.lower())
+        if match:
+            return int(match.group(1))
+        # Fallback: count star characters
+        star_count = text.count('★') or text.count('⭐')
+        if star_count > 0:
+            return star_count
+        return None
+    def process_reviews_batch(self, review_elements, start_idx, end_idx):
+        """Process a batch of reviews in parallel"""
+        batch_results = []
+        for i in range(start_idx, min(end_idx, len(review_elements))):
+            try:
+                review_data = self.extract_single_review_data(review_elements[i])
+                if review_data:
+                    batch_results.append(review_data)
+                    logger.info(f"Processed review {i+1}/{len(review_elements)}")
+            except Exception as e:
+                logger.warning(f"Error processing review {i+1}: {e}")
+                continue
+        return batch_results
+    def extract_all_reviews_parallel(self):
+        """Extract all reviews using parallel processing with duplicate removal"""
+        try:
+            # Get all review elements using a single, specific selector
+            review_elements = self.driver.find_elements(By.CSS_SELECTOR, "div[data-review-id]")
+            total_reviews = len(review_elements)
+            logger.info(f"Found {total_reviews} review elements to process")
+            if total_reviews == 0:
+                return []
+            # Use a set to track processed review IDs and avoid duplicates
+            processed_review_ids = set()
+            all_reviews = []
+            # Process reviews sequentially to better control duplicates
+            for i, review_element in enumerate(review_elements):
+                try:
+                    # Get review ID first to check for duplicates
+                    review_id = review_element.get_attribute('data-review-id')
+                    if review_id and review_id in processed_review_ids:
+                        logger.debug(f"Skipping duplicate review ID: {review_id}")
+                        continue
+                    # Extract review data
+                    review_data = self.extract_single_review_data(review_element)
+                    if review_data and review_data.get('review_id'):
+                        # Add to processed set to prevent duplicates
+                        processed_review_ids.add(review_data['review_id'])
+                        all_reviews.append(review_data)
+                        logger.info(f"Processed review {len(all_reviews)}/{total_reviews}")
+                except Exception as e:
+                    logger.warning(f"Error processing review {i+1}: {e}")
+                    continue
+            logger.info(f"Successfully extracted {len(all_reviews)} unique reviews")
+            return all_reviews
+        except Exception as e:
+            logger.error(f"Error in review extraction: {e}")
+            return []
+    def save_to_csv(self, reviews_data, filename="davids_bridal_reviews.csv"):
+        """Save reviews data to CSV file with duplicate removal and better formatting"""
+        if not reviews_data:
+            logger.warning("No reviews data to save")
+            return
+        try:
+            df = pd.DataFrame(reviews_data)
+            # Remove duplicates based on review_id and review_text
+            initial_count = len(df)
+            df = df.drop_duplicates(subset=['review_id'], keep='first')
+            # If review_id duplicates removed, also check for text duplicates as backup
+            df = df.drop_duplicates(subset=['reviewer_name', 'review_text', 'date'], keep='first')
+            final_count = len(df)
+            if initial_count > final_count:
+                logger.info(f"Removed {initial_count - final_count} duplicate reviews")
+            # Reorder columns for better readability
+            column_order = ['reviewer_name', 'rating', 'date', 'review_text', 'owner_response', 'review_id', 'scraped_at']
+            df = df.reindex(columns=column_order)
+            # Save to CSV with proper encoding
+            df.to_csv(filename, index=False, encoding='utf-8')
+            logger.info(f"Successfully saved {len(df)} unique reviews to {filename}")
+            # Print summary statistics
+            if 'rating' in df.columns and len(df) > 0:
+                avg_rating = df['rating'].mean()
+                logger.info(f"Average rating: {avg_rating:.2f}")
+                logger.info(f"Rating distribution:\n{df['rating'].value_counts().sort_index()}")
+        except Exception as e:
+            logger.error(f"Error saving to CSV: {e}")
+    def scrape_reviews(self, location_query, output_file="davids_bridal_reviews.csv"):
+        """Main method to scrape all reviews"""
+        try:
+            logger.info("Starting enhanced review scraping...")
+            # Search for the location
+            if not self.search_location(location_query):
+                logger.error("Failed to find location")
+                return None
+            # Click reviews tab
+            if not self.click_reviews_tab():
+                logger.error("Failed to access reviews tab")
+                return None
+            # Scroll to load all reviews
+            total_loaded = self.scroll_and_load_reviews(target_count=2394)
+            if total_loaded == 0:
+                logger.error("No reviews found after scrolling")
+                return None
+            # Extract all reviews using parallel processing
+            reviews_data = self.extract_all_reviews_parallel()
+            # Save to CSV
+            if reviews_data:
+                self.save_to_csv(reviews_data, output_file)
+                logger.info(f"Successfully scraped {len(reviews_data)} reviews")
+                return reviews_data
+            else:
+                logger.warning("No reviews extracted")
+                return None
+        except Exception as e:
+            logger.error(f"Error during scraping: {e}")
+            return None
+        finally:
+            self.close()
+    def close(self):
+        """Close the browser driver"""
+        if hasattr(self, 'driver'):
+            self.driver.quit()
+def main():
+    """Enhanced main function with better error handling"""
+    try:
+        # Initialize scraper
+        scraper = EnhancedGoogleMapsReviewsScraper(
+            headless=False,  # Set to True for background operation
+            max_workers=3    # Adjust based on your system
+        )
+        # Define search query
+        search_query = "David's Bridal Middletown NY"
+        logger.info(f"Starting scrape for: {search_query}")
+        # Scrape reviews
+        reviews = scraper.scrape_reviews(
+            location_query=search_query,
+            output_file="davids_bridal_middletown_reviews.csv"
+        )
+        if reviews:
+            logger.info(f"Scraping completed successfully! Total reviews: {len(reviews)}")
+        else:
+            logger.error("Scraping failed - no reviews collected")
+    except KeyboardInterrupt:
+        logger.info("Scraping interrupted by user")
+    except Exception as e:
+        logger.error(f"Unexpected error in main: {e}")
+# Run the scraper only when this file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    main()