prathampatel1 committed on
Commit
31e11f7
·
verified ·
1 Parent(s): acdb85e

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +665 -0
  2. davids_bridal_middletown_reviews.csv +0 -0
  3. scraper.py +451 -0
app.py ADDED
@@ -0,0 +1,665 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import html
import json
import math
import os
import re
import time
from collections import Counter
from typing import Dict, List, Optional

import pandas as pd
import streamlit as st
from dotenv import load_dotenv
from groq import Groq
12
+
13
# Read a local .env file (if any) so os.getenv() can see the values it defines.
load_dotenv()

# Module-level placeholder; main() binds its own local variable of the same name.
reviews_data = dict()

# Page-wide Streamlit settings: wide layout with the sidebar open on load.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="AI FAQ Generator",
    page_icon="🤖",
)
25
+
26
class OptimizedFAQGenerator:
    """Turn customer reviews into SEO keywords and brand-neutral FAQ pairs.

    Keyword extraction and FAQ generation are delegated to a Groq chat
    model; when the model call fails or its output cannot be parsed, a
    built-in list of generic FAQs is returned instead.
    """

    # Leading bullet ("-", "*", "•") or list number ("1.", "2)") on a line of model output.
    _LIST_MARKER_RE = re.compile(r'^\s*(?:[-*•]+|\d+[.)])\s*')

    def __init__(self, api_key: str):
        """Initialize the FAQ Generator with Groq API key."""
        self.client = Groq(api_key=api_key)
        # NOTE(review): confirm this model id is still served by the Groq API.
        self.model = "llama3-8b-8192"
        # Kept for compatibility; no method in this class reads it
        # (batching is done by character count, see chunk_reviews_by_size).
        self.batch_size = 100
        self.max_text_length = 3000  # Maximum review text length per API call

    def chunk_reviews_by_size(self, reviews_data: List[Dict], max_chars: int = 3000) -> List[List[Dict]]:
        """Group consecutive reviews into chunks of roughly ``max_chars`` characters.

        Each review is charged ``len(review_text) + 50`` characters (the 50 is
        a formatting buffer). A review larger than ``max_chars`` still gets a
        chunk of its own rather than being dropped.

        Args:
            reviews_data: Review dicts; only the ``'review_text'`` key is read.
            max_chars: Soft character budget per chunk.

        Returns:
            The reviews, in their original order, split into non-empty chunks.
        """
        chunks = []
        current_chunk = []
        current_length = 0

        for review in reviews_data:
            review_length = len(review.get('review_text', '')) + 50

            # Close the running chunk only if it already holds something,
            # so an oversized review never produces an empty chunk.
            if current_chunk and current_length + review_length > max_chars:
                chunks.append(current_chunk)
                current_chunk = []
                current_length = 0

            current_chunk.append(review)
            current_length += review_length

        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    @staticmethod
    def _parse_keyword_lines(raw_text: str) -> List[str]:
        """Split model output into clean keywords, one per non-empty line.

        Strips list markers, surrounding quotes/asterisks/backticks, and skips
        lead-in lines ending in ``:`` (e.g. "Here are the keywords:").
        """
        keywords = []
        for line in raw_text.splitlines():
            candidate = OptimizedFAQGenerator._LIST_MARKER_RE.sub('', line)
            candidate = candidate.strip().strip('"\'`*').strip()
            if not candidate or candidate.endswith(':'):
                continue
            keywords.append(candidate)
        return keywords

    @staticmethod
    def _rank_keywords(keywords: List[str], limit: int = 25) -> List[str]:
        """Return the ``limit`` most frequent keywords, counted case-insensitively.

        The first spelling seen for each keyword is the one returned.
        """
        counts = Counter()
        first_spelling = {}
        for keyword in keywords:
            key = keyword.casefold()
            counts[key] += 1
            first_spelling.setdefault(key, keyword)
        return [first_spelling[key] for key, _ in counts.most_common(limit)]

    @staticmethod
    def _parse_rating(value):
        """Return ``value`` as a float, or ``None`` if it is missing, NaN or not numeric.

        Accepts ints, floats and strings such as ``"5"`` or ``"5.0"``.
        """
        try:
            rating = float(value)
        except (TypeError, ValueError):
            return None
        return None if math.isnan(rating) else rating

    def extract_keywords_from_batch(self, review_batch: List[Dict]) -> List[str]:
        """Ask the model for up to 15 generic SEO keywords for one batch of reviews.

        Args:
            review_batch: Review dicts; only non-empty ``'review_text'`` values are used.

        Returns:
            At most 15 keywords, or an empty list if the API call fails
            (the error is shown in the Streamlit UI).
        """
        batch_text = " ".join(
            review.get('review_text', '') for review in review_batch if review.get('review_text')
        )

        # Hard cap on prompt size, even if the batch was built with a larger budget.
        if len(batch_text) > self.max_text_length:
            batch_text = batch_text[:self.max_text_length]

        prompt = f"""
        Analyze these customer reviews and extract SEO keywords and phrases for a business website.

        IMPORTANT RULES:
        1. DO NOT include any specific brand names or business names
        2. Focus on generic industry terms and services
        3. Extract keywords that any similar business could use
        4. Focus on customer pain points and solutions

        Extract keywords for:
        - Products and services mentioned (generic terms only)
        - Common customer concerns and questions
        - Industry terminology
        - Customer experience themes
        - Service quality aspects

        Reviews:
        {batch_text}

        Return exactly 15 relevant SEO keywords/phrases, one per line, without numbering or bullets.
        Use generic terms that any business in this industry could use.
        """

        try:
            response = self.client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=self.model,
                temperature=0.3,
                max_tokens=400
            )
            raw_output = response.choices[0].message.content or ""
            # The model does not always follow the "no numbering" rule, so
            # normalize the lines before they are counted.
            return self._parse_keyword_lines(raw_output)[:15]

        except Exception as e:  # UI boundary: report and continue with the next batch
            st.error(f"Error extracting keywords from batch: {str(e)}")
            return []

    def extract_seo_keywords(self, reviews_data: List[Dict]) -> List[str]:
        """Extract the most frequent SEO keywords across all reviews.

        Reviews are chunked by character count, each chunk is sent to the
        model, and the per-chunk keywords are ranked by frequency. Progress
        is reported through Streamlit widgets.

        Returns:
            Up to 25 keywords, most frequent first.
        """
        st.info(f"Processing {len(reviews_data)} reviews in batches...")

        progress_bar = st.progress(0)
        status_text = st.empty()

        review_chunks = self.chunk_reviews_by_size(reviews_data, self.max_text_length)
        total_chunks = len(review_chunks)

        all_keywords = []
        for i, chunk in enumerate(review_chunks):
            status_text.text(f"Processing batch {i+1}/{total_chunks} ({len(chunk)} reviews)...")

            all_keywords.extend(self.extract_keywords_from_batch(chunk))
            progress_bar.progress((i + 1) / total_chunks)

            # Small delay to avoid rate limiting
            time.sleep(0.5)

        # Count case-insensitively so "Bridal Gowns" and "bridal gowns"
        # reinforce each other instead of competing.
        top_keywords = self._rank_keywords(all_keywords, 25)

        progress_bar.empty()
        status_text.empty()

        st.success(f"Extracted {len(top_keywords)} unique keywords from {total_chunks} batches")

        return top_keywords

    def get_review_insights(self, reviews_data: List[Dict]) -> Dict:
        """Summarize ratings and pick sample reviews for the FAQ prompt.

        Ratings may be ints, floats or numeric strings. Reviews whose rating
        is missing or not numeric are left out of the average and of the
        positive/negative samples; an empty input yields an average of 0.0.

        Returns:
            Dict with ``total_reviews``, ``avg_rating``, ``positive_reviews``
            (rating >= 4), three empty placeholder lists (``common_themes``,
            ``pain_points``, ``positive_aspects``) and up to 5 positive /
            3 negative (rating <= 2) samples taken from the first 50 reviews.
        """
        ratings = [self._parse_rating(review.get('rating')) for review in reviews_data]
        rated = [rating for rating in ratings if rating is not None]

        insights = {
            'total_reviews': len(reviews_data),
            'avg_rating': sum(rated) / len(rated) if rated else 0.0,
            'positive_reviews': sum(1 for rating in rated if rating >= 4),
            'common_themes': [],
            'pain_points': [],
            'positive_aspects': []
        }

        # Only the first 50 reviews are considered when picking samples.
        sample_size = min(50, len(reviews_data))
        sampled = list(zip(reviews_data[:sample_size], ratings[:sample_size]))

        positive_reviews = [review for review, rating in sampled if rating is not None and rating >= 4]
        negative_reviews = [review for review, rating in sampled if rating is not None and rating <= 2]

        insights['sample_positive'] = positive_reviews[:5]
        insights['sample_negative'] = negative_reviews[:3]

        return insights

    def clean_json_response(self, response_text: str) -> Optional[str]:
        """Extract the JSON array embedded in a model response.

        Removes markdown code fences, takes the text from the first ``[`` to
        the last ``]``, and collapses newlines so the result sits on one line.

        Returns:
            The JSON array text, or ``None`` when no ``[...]`` span is found.
        """
        # Remove markdown code blocks
        response_text = re.sub(r'```json\s*', '', response_text)
        response_text = re.sub(r'```\s*', '', response_text)

        json_start = response_text.find('[')
        json_end = response_text.rfind(']') + 1

        if json_start == -1 or json_end <= json_start:
            return None

        json_content = response_text[json_start:json_end]

        # Clean common JSON issues
        json_content = re.sub(r'\n\s*', ' ', json_content)  # Remove newlines and extra spaces
        json_content = re.sub(r'"\s*,\s*"', '", "', json_content)  # Fix spacing around commas
        json_content = re.sub(r'}\s*,\s*{', '}, {', json_content)  # Fix object separators

        return json_content

    def generate_faqs(self, keywords: List[str], reviews_data: List[Dict], num_faqs: int = 20) -> List[Dict]:
        """Generate FAQ pairs from SEO keywords and review insights.

        Args:
            keywords: SEO keywords; the first 20 are put in the prompt.
            reviews_data: Review dicts with ``'rating'`` and ``'review_text'``.
            num_faqs: Requested number of FAQs; capped at 15.

        Returns:
            A list of ``{"question": ..., "answer": ...}`` dicts. The built-in
            fallback FAQs are returned when the API call fails or the response
            is not a non-empty JSON array of such dicts.
        """
        insights = self.get_review_insights(reviews_data)

        # Short excerpts keep the prompt small (limit to prevent token overflow).
        sample_reviews = [
            f"Rating: {review.get('rating', 'N/A')}/5 - {review.get('review_text', '')[:150]}..."
            for review in insights['sample_positive'] + insights['sample_negative']
        ]
        sample_context = "\n".join(sample_reviews[:8])  # Limit to 8 samples

        # Limit FAQs to a maximum of 15
        num_faqs = min(num_faqs, 15)

        prompt = f"""
        Based on the following SEO keywords and customer review insights, generate exactly {num_faqs} comprehensive FAQ pairs for a business website.

        CRITICAL REQUIREMENTS:
        1. DO NOT use any specific brand names or business names in questions or answers
        2. Use generic terms like "our store", "our business", "our team", "our services"
        3. Focus on universal customer concerns and solutions

        SEO Keywords: {', '.join(keywords[:20])}

        Business Insights:
        - Total Reviews Analyzed: {insights['total_reviews']}
        - Average Rating: {insights['avg_rating']:.1f}/5
        - Positive Reviews: {insights['positive_reviews']}/{insights['total_reviews']}

        Sample Customer Feedback:
        {sample_context}

        IMPORTANT: Respond with ONLY a valid JSON array. No additional text or markdown.

        Format:
        [
        {{
        "question": "Why should I choose your business for my needs?",
        "answer": "Our experienced team provides personalized service with attention to detail. We focus on understanding your specific requirements and delivering solutions that exceed expectations, backed by our commitment to quality and customer satisfaction."
        }}
        ]
        """

        try:
            st.info("Generating FAQs with AI...")

            response = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that generates brand-neutral JSON responses for FAQ content. Always respond with valid JSON only, without any brand names."},
                    {"role": "user", "content": prompt}
                ],
                model=self.model,
                temperature=0.2,  # Lower temperature for more consistent output
                max_tokens=3000  # Increased for more FAQs
            )

            content = (response.choices[0].message.content or "").strip()
            json_content = self.clean_json_response(content)

            if not json_content:
                st.warning("Could not extract JSON from response, using fallback")
                return self._get_fallback_faqs(num_faqs)

            try:
                faqs = json.loads(json_content)
            except json.JSONDecodeError as e:
                st.error(f"JSON parsing error: {str(e)}")
                return self._get_fallback_faqs(num_faqs)

            # An empty array is rejected too: all() over nothing is True and
            # would otherwise hand the caller zero FAQs.
            is_valid = (
                isinstance(faqs, list)
                and bool(faqs)
                and all(isinstance(faq, dict) and 'question' in faq and 'answer' in faq for faq in faqs)
            )
            if is_valid:
                return faqs[:num_faqs]

            st.warning("Invalid FAQ format received, using fallback")
            return self._get_fallback_faqs(num_faqs)

        except Exception as e:  # UI boundary: any API failure falls back to canned FAQs
            st.error(f"Error generating FAQs: {str(e)}")
            return self._get_fallback_faqs(num_faqs)

    def _get_fallback_faqs(self, num_faqs: int = 20) -> List[Dict]:
        """Return up to ``num_faqs`` built-in, brand-neutral FAQs.

        Used when the API call fails or returns unusable output. A
        non-positive ``num_faqs`` yields an empty list.
        """
        base_faqs = [
            # Why choose us questions
            {
                "question": "Why should I choose your business over competitors?",
                "answer": "Our experienced team provides personalized service with attention to detail and a commitment to customer satisfaction. We take time to understand your specific needs and work with you throughout the entire process to ensure you're completely happy with the results."
            },
            {
                "question": "What makes your customer service different?",
                "answer": "We pride ourselves on patient, welcoming service where customers never feel rushed. Our team focuses on creating a comfortable experience while providing expert guidance to help you make the best decisions for your needs."
            },
            {
                "question": "How experienced is your team?",
                "answer": "Our team consists of experienced professionals who are passionate about helping customers achieve their goals. We stay updated with the latest trends and techniques to provide you with the best possible service and advice."
            },

            # Problem-solving questions
            {
                "question": "How do you help customers who feel overwhelmed by choices?",
                "answer": "Our knowledgeable staff guides you through the selection process based on your preferences, budget, and specific needs. We take time to understand your vision and narrow down options so you can make decisions with confidence."
            },
            {
                "question": "What if I'm not satisfied with the results?",
                "answer": "Customer satisfaction is our top priority. We work closely with you throughout the process and make adjustments as needed to ensure you're completely happy with the final outcome. Our team is committed to making things right."
            },
            {
                "question": "How do you handle sizing and fit issues?",
                "answer": "Our professional team provides expert fitting services and makes necessary adjustments to ensure perfect results. We take precise measurements and work with you through multiple fittings if needed to achieve the ideal fit."
            },

            # Service questions
            {
                "question": "What services do you offer besides your main products?",
                "answer": "In addition to our primary offerings, we provide professional consultation, customization services, and ongoing support. We also offer accessories and complementary products to complete your experience with us."
            },
            {
                "question": "Do you provide consultation services?",
                "answer": "Yes, we offer personalized consultations where our experts help you explore options, provide styling advice, and ensure you make choices that align with your vision and budget. These consultations are designed to make your experience as smooth as possible."
            },
            {
                "question": "What additional products and accessories do you carry?",
                "answer": "We offer a comprehensive selection of complementary products and accessories to complete your needs. Our team can help coordinate everything to ensure a cohesive and polished final result."
            },

            # Process questions
            {
                "question": "Do I need an appointment or can I walk in?",
                "answer": "While we welcome walk-ins when possible, we highly recommend scheduling an appointment to ensure you receive dedicated attention and personalized service. Appointments allow us to prepare for your visit and provide the best possible experience."
            },
            {
                "question": "How long does the typical process take?",
                "answer": "The timeline varies depending on your specific needs, but we work with you to establish realistic expectations from the start. Our team keeps you informed throughout the process and ensures everything is completed according to your schedule."
            },
            {
                "question": "What should I expect during my first visit?",
                "answer": "During your initial visit, we'll discuss your needs, preferences, and budget. Our team will guide you through available options, provide expert recommendations, and create a plan tailored to your specific requirements."
            },

            # Quality questions
            {
                "question": "How do you ensure quality in your products and services?",
                "answer": "We maintain high standards through careful selection of products, skilled craftsmanship, and thorough quality checks. Our experienced team pays attention to every detail to ensure you receive exceptional results that meet our quality standards."
            },
            {
                "question": "What is your experience with customers who have specific requirements?",
                "answer": "Our team has extensive experience working with diverse customer needs and preferences. We pride ourselves on our ability to accommodate special requirements and provide customized solutions that exceed expectations."
            },
            {
                "question": "How do you stay current with industry trends?",
                "answer": "Our team continuously educates themselves on the latest trends, techniques, and products in the industry. We attend training sessions and stay connected with industry developments to provide you with current options and expert advice."
            },

            # Additional comprehensive questions
            {
                "question": "What price ranges do you offer?",
                "answer": "We offer options across various price points to accommodate different budgets. Our team can help you find quality solutions within your budget and provide transparent pricing information upfront so you can make informed decisions."
            },
            {
                "question": "Do you offer payment plans or financing options?",
                "answer": "Yes, we understand that significant purchases require financial planning. We offer flexible payment options and financing plans to make our services more accessible and help you achieve your goals within your budget."
            },
            {
                "question": "How far in advance should I start planning?",
                "answer": "We recommend starting the process several months in advance to allow adequate time for consultation, selection, customization, and any necessary adjustments. Early planning ensures the best selection and reduces stress as your important date approaches."
            },
            {
                "question": "Do you work with customers who have time constraints?",
                "answer": "Absolutely! We understand that sometimes timelines are tight, and we're experienced in working efficiently to meet urgent deadlines. Our team will discuss your timeline and work diligently to accommodate your schedule while maintaining quality standards."
            },
            {
                "question": "What sets your customer experience apart?",
                "answer": "We focus on creating a welcoming, pressure-free environment where customers feel comfortable and supported. Our personalized approach, attention to detail, and commitment to customer satisfaction ensure that your experience with us is positive and memorable."
            },
            {
                "question": "How do you handle special requests or customizations?",
                "answer": "We welcome special requests and customizations as part of our personalized service approach. Our skilled team works with you to understand your vision and explore options for creating something unique that perfectly meets your specific needs and preferences."
            }
        ]

        # Clamp at zero: a negative count would otherwise slice from the end.
        return base_faqs[:max(0, num_faqs)]
379
+
380
def load_sample_data():
    """Return three built-in demo reviews, used when no CSV is available."""
    columns = ("reviewer_name", "rating", "date", "review_text", "owner_response")
    rows = (
        (
            "Customer A",
            5,
            "3 months ago",
            "This past August, I went to the bridal store to look for my dream wedding dress and I found It! I was looking for an elegant, simple, and classic dress. The consultant was extremely helpful, patient, and sweet. The alterations team did a great job making sure I was happy with the alterations done to my dress.",
            "Thank you so much for taking the time to leave this excellent review!",
        ),
        (
            "Customer B",
            5,
            "2 months ago",
            "A very special shout out to the consultant who made my daughters dress shopping so special. Never rushed her and only everything to accommodate her until she found the right dress to say yes to.",
            "",
        ),
        (
            "Customer C",
            5,
            "1 month ago",
            "My wedding dress shopping experience was beyond amazing. The consultant was so wonderful to work with. Everyone was so sweet & welcoming when we walked in. She made me feel so comfortable as we tried on many different dresses.",
            "Thank you for the wonderful review!",
        ),
    )
    # Pair each row with the column names to get one dict per review.
    return [dict(zip(columns, row)) for row in rows]
406
+
407
def _build_faq_html(faqs: List[Dict], keywords: List[str], review_count: int) -> str:
    """Render the FAQs as a standalone HTML page.

    All dynamic text (keywords, questions, answers) is HTML-escaped because
    it originates from model output and uploaded reviews.
    """
    keyword_summary = html.escape(', '.join(str(keyword) for keyword in keywords[:10]))
    parts = [f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Brand-Neutral FAQs</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; }}
.faq {{ margin-bottom: 30px; border-left: 4px solid #007bff; padding-left: 20px; }}
.question {{ font-weight: bold; font-size: 18px; color: #333; margin-bottom: 10px; }}
.answer {{ color: #666; }}
.header {{ background: #f8f9fa; padding: 20px; border-radius: 5px; margin-bottom: 30px; }}
</style>
</head>
<body>
<div class="header">
<h1>Brand-Neutral FAQs</h1>
<p><strong>Generated:</strong> {time.strftime("%Y-%m-%d %H:%M:%S")}</p>
<p><strong>Reviews Analyzed:</strong> {review_count}</p>
<p><strong>Keywords:</strong> {keyword_summary}...</p>
</div>
"""]

    for i, faq in enumerate(faqs, 1):
        question = html.escape(str(faq.get('question', '')))
        answer = html.escape(str(faq.get('answer', '')))
        parts.append(f"""
<div class="faq">
<div class="question">{i}. {question}</div>
<div class="answer">{answer}</div>
</div>
""")

    parts.append("</body></html>")
    return "".join(parts)


def main():
    """Run the Streamlit app: load reviews, generate keywords and FAQs, offer exports."""
    st.title("🤖 AI FAQ Generator")
    st.markdown("Generate SEO-optimized, FAQs from customer reviews")

    # Sidebar for configuration
    with st.sidebar:
        st.header("⚙️ Configuration")

        # The key is read from the environment only; there is no input widget.
        api_key = os.getenv("GROQ_API_KEY")

        if not api_key:
            st.warning("Set the GROQ_API_KEY environment variable (or add it to a .env file) to use AI features")
            st.markdown("Get your API key from [Groq Console](https://console.groq.com)")

        st.divider()

        # FAQ Configuration (the slider itself enforces the 5-15 range)
        st.subheader("FAQ Settings")
        num_faqs = st.slider("Number of FAQs to generate", 5, 15, 10, 1, help="Select how many FAQs to generate based on the reviews")
        st.info(f"Will generate {num_faqs} FAQs")

        st.divider()

        # File upload
        uploaded_file = st.file_uploader(
            "Upload Reviews CSV",
            type=['csv'],
            help="Upload a CSV file with customer reviews (supports large files)"
        )

        # Use sample data option
        use_sample = st.checkbox("Use sample data", value=False)

    # Load data
    if uploaded_file is not None:
        try:
            with st.spinner("Loading CSV file..."):
                df = pd.read_csv(uploaded_file)

                # Data cleaning
                df['rating'] = df['rating'].astype(str)
                df = df.drop(columns=['review_id', 'scraped_at'], errors='ignore')

                # Remove empty reviews
                df = df.dropna(subset=['review_text'])
                df['review_text'] = df['review_text'].astype(str)
                df = df[df['review_text'].str.strip() != '']

                reviews_data = df.to_dict('records')

            st.success(f"✅ Loaded {len(reviews_data)} reviews from uploaded file")

            # Parse ratings numerically so "5" and "5.0" both count and
            # unparseable values are ignored instead of breaking the average.
            rated = pd.to_numeric(df['rating'], errors='coerce').dropna()

            # Show file statistics
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Total Reviews", len(reviews_data))
            with col2:
                st.metric("Average Rating", f"{rated.mean():.1f}" if not rated.empty else "N/A")
            with col3:
                st.metric("Positive Reviews", f"{int((rated >= 4).sum())}/{len(reviews_data)}")

        except Exception as e:  # UI boundary: show the problem and fall back to demo data
            st.error(f"Error loading file: {str(e)}")
            st.info("Please ensure your CSV has columns: 'review_text', 'rating'")
            reviews_data = load_sample_data()
    elif use_sample:
        reviews_data = load_sample_data()
        st.info("Using sample data for demonstration")
    else:
        st.warning("Please upload a CSV file or use sample data")
        return

    # Display data overview
    if reviews_data:
        with st.expander("📊 Data Overview", expanded=False):
            st.subheader("Sample Reviews")
            preview = reviews_data[:3]
            for i, review in enumerate(preview):
                with st.container():
                    st.write(f"**{review.get('reviewer_name', 'Anonymous')}** - {review.get('rating', 'N/A')} ⭐")
                    st.write(review.get('review_text', 'No review text')[:300] + "...")
                    if i < len(preview) - 1:  # Don't show divider after last item
                        st.divider()

    # Generate FAQs
    if api_key and reviews_data:
        st.header("🚀 Generate AI-Powered FAQs")

        # Two columns keep the button at half width; the second stays empty.
        col1, _ = st.columns(2)
        with col1:
            if st.button("🤖 Generate Keywords & FAQs with AI", type="primary"):
                start_time = time.time()

                with st.spinner("Processing large dataset..."):
                    faq_gen = OptimizedFAQGenerator(api_key)

                    # Extract keywords with batch processing
                    st.info("🔍 Extracting SEO keywords from all reviews...")
                    keywords = faq_gen.extract_seo_keywords(reviews_data)
                    st.session_state.keywords = keywords

                    # Generate FAQs
                    st.info("📝 Generating brand-neutral FAQs...")
                    faqs = faq_gen.generate_faqs(keywords, reviews_data, num_faqs)
                    st.session_state.faqs = faqs

                    generation_time = time.time() - start_time
                    st.session_state.generation_time = generation_time

                st.success(f"✅ Generated {len(faqs)} FAQs in {generation_time:.1f} seconds!")

    # Display results (kept in session state so they survive Streamlit reruns)
    if 'keywords' in st.session_state and 'faqs' in st.session_state:

        # Performance metrics
        if 'generation_time' in st.session_state:
            st.info(f"⏱️ Generation completed in {st.session_state.generation_time:.1f} seconds")

        st.header("📈 Extracted SEO Keywords")

        keywords = st.session_state.keywords

        # Show keywords in columns
        cols = st.columns(3)
        for i, keyword in enumerate(keywords):
            with cols[i % 3]:
                st.markdown(f"`{keyword}`")

        st.header("❓ Generated Brand-Neutral FAQs")
        st.info(f"Generated {len(st.session_state.faqs)} FAQs that can be used by any business in this industry")

        # Display FAQs with search functionality
        search_term = st.text_input("🔍 Search FAQs", placeholder="Enter keywords to filter FAQs...")

        faqs = st.session_state.faqs

        # Filter FAQs if search term is provided
        if search_term:
            needle = search_term.lower()
            faqs_to_show = [
                faq for faq in faqs
                if needle in faq.get('question', '').lower()
                or needle in faq.get('answer', '').lower()
            ]
            st.info(f"Showing {len(faqs_to_show)} FAQs matching '{search_term}'")
        else:
            faqs_to_show = faqs

        # Display FAQs
        for i, faq in enumerate(faqs_to_show):
            with st.expander(f"FAQ {i+1}: {faq.get('question', 'No question')}", expanded=False):
                st.subheader("Question:")
                st.write(faq.get('question', 'No question'))
                st.subheader("Answer:")
                st.write(faq.get('answer', 'No answer'))

        # Export options (always export the full list, not the filtered view)
        st.header("📥 Export Options")

        col1, col2, col3 = st.columns(3)

        with col1:
            # Export as JSON
            export_data = {
                "metadata": {
                    "generated_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                    "total_reviews_analyzed": len(reviews_data),
                    "generation_time_seconds": st.session_state.get('generation_time', 0),
                    "brand_neutral": True
                },
                "keywords": keywords,
                "faqs": faqs
            }

            st.download_button(
                label="📄 Download JSON",
                data=json.dumps(export_data, indent=2),
                file_name="faqs.json",
                mime="application/json"
            )

        with col2:
            # Export as CSV
            st.download_button(
                label="📊 Download CSV",
                data=pd.DataFrame(faqs).to_csv(index=False),
                file_name="faqs.csv",
                mime="text/csv"
            )

        with col3:
            # Export as HTML
            st.download_button(
                label="🌐 Download HTML",
                data=_build_faq_html(faqs, keywords, len(reviews_data)),
                file_name="faqs.html",
                mime="text/html"
            )

    # Footer
    st.markdown("---")
    st.markdown("**Features:** Batch processing for large datasets • Brand-neutral content • SEO optimization • Multiple export formats")


if __name__ == "__main__":
    main()
davids_bridal_middletown_reviews.csv ADDED
The diff for this file is too large to render. See raw diff
 
scraper.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Enhanced Google Maps Reviews Scraper for David's Bridal
4
+ Scrapes reviews from Google Maps with parallel processing and improved element detection
5
+ """
6
+
7
+ import csv
8
+ import time
9
+ import random
10
+ import asyncio
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+ from selenium import webdriver
13
+ from selenium.webdriver.common.by import By
14
+ from selenium.webdriver.support.ui import WebDriverWait
15
+ from selenium.webdriver.support import expected_conditions as EC
16
+ from selenium.webdriver.chrome.options import Options
17
+ from selenium.webdriver.chrome.service import Service
18
+ from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException, ElementClickInterceptedException
19
+ from webdriver_manager.chrome import ChromeDriverManager
20
+ import pandas as pd
21
+ from datetime import datetime
22
+ import logging
23
+ import sys
24
+ import threading
25
+ from queue import Queue
26
+
27
+ # Set up logging
28
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
29
+ logger = logging.getLogger(__name__)
30
+
31
class EnhancedGoogleMapsReviewsScraper:
    """Selenium-driven scraper that collects the reviews of one Google Maps place.

    Typical use is ``scrape_reviews(query, output_file)``, which searches for the
    place, opens its reviews tab, scrolls until enough reviews are loaded,
    extracts them one by one and writes a de-duplicated CSV.

    Extraction is sequential. ``max_workers``, ``reviews_queue``,
    ``processed_reviews`` and ``lock`` are stored on the instance but are not
    used by any method of this class.
    """

    def __init__(self, headless=True, wait_time=10, max_workers=3):
        """Store the configuration and start a Chrome session.

        Args:
            headless: Passed to ``setup_driver``; adds ``--headless`` when true.
            wait_time: Timeout in seconds used for the shared ``WebDriverWait``.
            max_workers: Stored on the instance; not used by this class.
        """
        self.wait_time = wait_time
        self.max_workers = max_workers
        self.reviews_queue = Queue()
        self.processed_reviews = []
        self.lock = threading.Lock()
        self.setup_driver(headless)

    def setup_driver(self, headless):
        """Create ``self.driver`` and ``self.wait`` with the scraper's Chrome options.

        Exits the process with status 1 if the driver cannot be started
        (``WebDriverException``).
        """
        try:
            chrome_options = Options()
            if headless:
                chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-blink-features=AutomationControlled")
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--remote-debugging-port=9222")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
            chrome_options.add_experimental_option('useAutomationExtension', False)
            chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

            logger.info("Setting up ChromeDriver...")
            service = Service(ChromeDriverManager().install())

            self.driver = webdriver.Chrome(service=service, options=chrome_options)
            # Hide the navigator.webdriver flag on the current document.
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            self.wait = WebDriverWait(self.driver, self.wait_time)
            logger.info("ChromeDriver setup successful")

        except WebDriverException as e:
            logger.error(f"Failed to setup ChromeDriver: {e}")
            sys.exit(1)

    @staticmethod
    def _build_search_url(query):
        """Return the Google Maps search URL for *query*, URL-encoded.

        Spaces become ``+`` (as before); every other reserved character,
        such as the apostrophe in "David's" or a ``/``, is percent-encoded so
        it cannot alter the URL structure.
        """
        from urllib.parse import quote_plus

        return f"https://www.google.com/maps/search/{quote_plus(query)}"

    def search_location(self, query):
        """Open the Google Maps search for *query* and click the first result.

        Returns:
            True if a result button was found and clicked, False if none of the
            selectors became clickable or any error occurred.
        """
        try:
            search_url = self._build_search_url(query)
            logger.info(f"Navigating to: {search_url}")
            self.driver.get(search_url)

            # Give the page time to load before looking for results.
            time.sleep(5)

            result_selectors = [
                "button.hh2c6.G7m0Af",  # Button with class for location
            ]

            result_found = False
            for selector in result_selectors:
                try:
                    first_result = self.wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                    )
                    # Click through JavaScript rather than a native click.
                    self.driver.execute_script("arguments[0].click();", first_result)
                    time.sleep(3)
                    result_found = True
                    break
                except TimeoutException:
                    continue

            return result_found

        except Exception as e:
            logger.error(f"Error in search_location: {e}")
            return False

    def click_reviews_tab(self):
        """Click the place's "Reviews" tab.

        Returns:
            True once the tab was clicked, False if it could not be located or
            clicked (the error is logged).
        """
        try:
            reviews_button = self.wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button[data-tab-index='1'][aria-label*='Reviews']"))
            )

            # Bring the button into view before clicking it.
            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", reviews_button)
            time.sleep(1)

            self.driver.execute_script("arguments[0].click();", reviews_button)
            logger.info("Successfully clicked reviews tab")

            # Wait for the reviews to load.
            time.sleep(3)
            return True

        except Exception as e:
            logger.error(f"Could not click reviews tab: {e}")
            return False

    def expand_review_text(self, review_element):
        """Click the review's "See more" button, if it has one.

        Returns:
            True if a button was found and clicked, False if the review has no
            such button or the click failed (failures are logged as warnings).
        """
        try:
            more_button = review_element.find_element(
                By.CSS_SELECTOR,
                "button.w8nwRe.kyuRq[aria-label='See more']"
            )

            self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", more_button)
            time.sleep(0.5)
            self.driver.execute_script("arguments[0].click();", more_button)
            time.sleep(1)  # Wait for the text to expand
            return True

        except NoSuchElementException:
            # No button: the review text is already fully visible.
            return False
        except Exception as e:
            logger.warning(f"Error expanding review text: {e}")
            return False

    def scroll_and_load_reviews(self, target_count=5000):
        """Scroll the reviews pane until enough reviews are loaded.

        Scrolling stops when *target_count* review elements are present, when
        the count has not grown for 5 consecutive rounds, or after 1000 scroll
        attempts.

        Returns:
            The number of review elements counted on the last round, or 0 if an
            error occurred.
        """
        try:
            scrollable_container = self.driver.find_element(By.CSS_SELECTOR, "div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde")
            current_reviews = 0  # defined up front so the summary below is always valid
            last_review_count = 0
            stagnant_rounds = 0
            max_stagnant_rounds = 5
            scroll_attempts = 0
            max_scroll_attempts = 1000

            while scroll_attempts < max_scroll_attempts:
                # Jump to the bottom of the pane to trigger loading more reviews.
                self.driver.execute_script(
                    "arguments[0].scrollTo(0, arguments[0].scrollHeight);",
                    scrollable_container
                )

                # Randomised wait for new content to load.
                time.sleep(random.uniform(2, 4))

                current_reviews = len(self.driver.find_elements(By.CSS_SELECTOR, "div[data-review-id]"))
                logger.info(f"Attempt {scroll_attempts + 1}: Loaded {current_reviews} reviews (target: {target_count})")

                if current_reviews >= target_count:
                    logger.info("Reached target review count.")
                    break

                if current_reviews == last_review_count:
                    stagnant_rounds += 1
                    logger.info(f"No new reviews this round. Stagnant rounds: {stagnant_rounds}/{max_stagnant_rounds}")
                    if stagnant_rounds >= max_stagnant_rounds:
                        logger.info("No new reviews after several attempts. Stopping scroll.")
                        break
                else:
                    stagnant_rounds = 0  # progress was made, start counting again

                last_review_count = current_reviews
                scroll_attempts += 1

                # Every tenth attempt, pause longer to mimic human browsing.
                if scroll_attempts % 10 == 0:
                    logger.info("Taking a longer pause to mimic human browsing...")
                    time.sleep(random.uniform(5, 8))

            logger.info(f"Finished scrolling. Total reviews found: {current_reviews}")
            return current_reviews

        except Exception as e:
            logger.error(f"Error scrolling reviews: {e}")
            return 0

    def _find_text(self, review_element, selector, default=""):
        """Return the stripped text of the first match of *selector*, or *default* if absent."""
        try:
            return review_element.find_element(By.CSS_SELECTOR, selector).text.strip()
        except NoSuchElementException:
            return default

    def extract_single_review_data(self, review_element):
        """Build the record dict for one review element.

        The review text is expanded first (if it has a "See more" button).

        Returns:
            A dict with keys ``reviewer_name``, ``rating``, ``review_text``,
            ``date``, ``owner_response``, ``scraped_at`` and ``review_id``, or
            None if an unexpected error occurred (it is logged).
        """
        try:
            review_data = {}

            self.expand_review_text(review_element)

            review_data['reviewer_name'] = self._find_text(
                review_element, "div[class*='d4r55']", "Anonymous"
            )

            # The rating is read from the aria-label of the star image.
            try:
                rating_element = review_element.find_element(By.CSS_SELECTOR, "span[role='img'][aria-label*='star']")
                rating_text = rating_element.get_attribute('aria-label')
                review_data['rating'] = self.extract_rating_from_text(rating_text)
            except NoSuchElementException:
                review_data['rating'] = None

            review_data['review_text'] = self._find_text(review_element, "span.wiI7pd")
            review_data['date'] = self._find_text(review_element, "span.rsqaWe")
            review_data['owner_response'] = self._find_text(review_element, "div[class*='wiI7pd']")

            # Metadata; fall back to a timestamp-based id when the element has none.
            review_data['scraped_at'] = datetime.now().isoformat()
            review_data['review_id'] = review_element.get_attribute('data-review-id') or f"review_{int(time.time() * 1000)}"

            return review_data

        except Exception as e:
            logger.error(f"Error extracting single review: {e}")
            return None

    def extract_rating_from_text(self, text):
        """Parse a numeric rating out of an aria-label string.

        Recognises forms such as "5 stars" and "Rated 4 out of 5 stars"; if no
        such phrase is present, falls back to counting star characters.

        Returns:
            The rating as an int, or None if *text* is empty or holds no rating.
        """
        if not text:
            return None

        import re

        match = re.search(r'(\d+)\s*(?:out of \d+\s*)?stars?', text.lower())
        if match:
            return int(match.group(1))

        # Fallback: count star glyphs.
        star_count = text.count('★') or text.count('⭐')
        if star_count > 0:
            return star_count

        return None

    def process_reviews_batch(self, review_elements, start_idx, end_idx):
        """Extract the reviews at indices ``[start_idx, end_idx)`` one after another.

        Indices past the end of *review_elements* are ignored. Reviews whose
        extraction fails are skipped.

        Returns:
            A list of the review dicts that were extracted successfully.
        """
        batch_results = []
        total = len(review_elements)

        for i in range(start_idx, min(end_idx, total)):
            try:
                review_data = self.extract_single_review_data(review_elements[i])
                if review_data:
                    batch_results.append(review_data)
                    logger.info(f"Processed review {i+1}/{total}")
            except Exception as e:
                logger.warning(f"Error processing review {i+1}: {e}")
                continue

        return batch_results

    def extract_all_reviews_parallel(self):
        """Extract every loaded review, skipping duplicate review ids.

        Despite the name, reviews are processed sequentially.

        Returns:
            A list of unique review dicts; empty if none were found or an error
            occurred.
        """
        try:
            review_elements = self.driver.find_elements(By.CSS_SELECTOR, "div[data-review-id]")
            total_reviews = len(review_elements)
            logger.info(f"Found {total_reviews} review elements to process")

            if total_reviews == 0:
                return []

            processed_review_ids = set()
            all_reviews = []

            for i, review_element in enumerate(review_elements):
                try:
                    # Check the id first so duplicates are skipped before any extraction work.
                    review_id = review_element.get_attribute('data-review-id')

                    if review_id and review_id in processed_review_ids:
                        logger.debug(f"Skipping duplicate review ID: {review_id}")
                        continue

                    review_data = self.extract_single_review_data(review_element)

                    if review_data and review_data.get('review_id'):
                        processed_review_ids.add(review_data['review_id'])
                        all_reviews.append(review_data)
                        logger.info(f"Processed review {len(all_reviews)}/{total_reviews}")

                except Exception as e:
                    logger.warning(f"Error processing review {i+1}: {e}")
                    continue

            logger.info(f"Successfully extracted {len(all_reviews)} unique reviews")
            return all_reviews

        except Exception as e:
            logger.error(f"Error in review extraction: {e}")
            return []

    def save_to_csv(self, reviews_data, filename="davids_bridal_reviews.csv"):
        """Write *reviews_data* to *filename* as UTF-8 CSV after removing duplicates.

        Rows are de-duplicated by ``review_id`` and then by
        (``reviewer_name``, ``review_text``, ``date``). Nothing is written when
        *reviews_data* is empty. Errors are logged, not raised.
        """
        if not reviews_data:
            logger.warning("No reviews data to save")
            return

        try:
            df = pd.DataFrame(reviews_data)

            initial_count = len(df)
            df = df.drop_duplicates(subset=['review_id'], keep='first')

            # Backup check: the same author, text and date under a different id.
            df = df.drop_duplicates(subset=['reviewer_name', 'review_text', 'date'], keep='first')

            final_count = len(df)
            if initial_count > final_count:
                logger.info(f"Removed {initial_count - final_count} duplicate reviews")

            # Fixed column order for readability.
            column_order = ['reviewer_name', 'rating', 'date', 'review_text', 'owner_response', 'review_id', 'scraped_at']
            df = df.reindex(columns=column_order)

            df.to_csv(filename, index=False, encoding='utf-8')
            logger.info(f"Successfully saved {len(df)} unique reviews to {filename}")

            # Summary statistics over the usable ratings only, so missing or
            # non-numeric ratings cannot turn a successful save into a logged error.
            ratings = pd.to_numeric(df['rating'], errors='coerce').dropna()
            if not ratings.empty:
                avg_rating = ratings.mean()
                logger.info(f"Average rating: {avg_rating:.2f}")
                logger.info(f"Rating distribution:\n{ratings.value_counts().sort_index()}")

        except Exception as e:
            logger.error(f"Error saving to CSV: {e}")

    def scrape_reviews(self, location_query, output_file="davids_bridal_reviews.csv", target_count=2394):
        """Run the full scrape: search, open reviews, scroll, extract, save.

        The browser is always closed when this method returns.

        Args:
            location_query: Text searched for on Google Maps.
            output_file: Path of the CSV file to write.
            target_count: Number of loaded reviews at which scrolling stops.

        Returns:
            The list of extracted review dicts, or None if any stage failed or
            no reviews were extracted.
        """
        try:
            logger.info("Starting enhanced review scraping...")

            if not self.search_location(location_query):
                logger.error("Failed to find location")
                return None

            if not self.click_reviews_tab():
                logger.error("Failed to access reviews tab")
                return None

            total_loaded = self.scroll_and_load_reviews(target_count=target_count)

            if total_loaded == 0:
                logger.error("No reviews found after scrolling")
                return None

            reviews_data = self.extract_all_reviews_parallel()

            if reviews_data:
                self.save_to_csv(reviews_data, output_file)
                logger.info(f"Successfully scraped {len(reviews_data)} reviews")
                return reviews_data
            else:
                logger.warning("No reviews extracted")
                return None

        except Exception as e:
            logger.error(f"Error during scraping: {e}")
            return None
        finally:
            self.close()

    def close(self):
        """Quit the browser driver; safe to call repeatedly or before setup."""
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        try:
            driver.quit()
        finally:
            # Drop the reference so a second close() does not quit again.
            self.driver = None
419
+
420
def main():
    """Scrape the David's Bridal Middletown NY reviews and log the outcome.

    Builds a visible-browser scraper, runs the full scrape into
    ``davids_bridal_middletown_reviews.csv`` and reports success or failure
    through the module logger. A keyboard interrupt or any other error is
    logged rather than propagated.
    """
    search_query = "David's Bridal Middletown NY"

    try:
        # headless=False shows the browser; set it to True for background operation.
        # max_workers can be adjusted to suit the system.
        scraper = EnhancedGoogleMapsReviewsScraper(headless=False, max_workers=3)

        logger.info(f"Starting scrape for: {search_query}")

        reviews = scraper.scrape_reviews(
            location_query=search_query,
            output_file="davids_bridal_middletown_reviews.csv",
        )

        if not reviews:
            logger.error("Scraping failed - no reviews collected")
            return

        logger.info(f"Scraping completed successfully! Total reviews: {len(reviews)}")

    except KeyboardInterrupt:
        logger.info("Scraping interrupted by user")
    except Exception as e:
        logger.error(f"Unexpected error in main: {e}")
449
+
450
+ if __name__ == "__main__":
451
+ main()