zavavan committed on
Commit
e781b0c
·
verified ·
1 Parent(s): 5d8f7ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -35
app.py CHANGED
@@ -4,6 +4,8 @@ import requests
4
  import pytz
5
  import yaml
6
  from tools.final_answer import FinalAnswerTool
 
 
7
  from bs4 import BeautifulSoup
8
  import pandas as pd
9
  import time
@@ -27,45 +29,49 @@ def scrape_drug_reviews(drug:str)-> str: #it's important to specify the return type
27
  return f"Error fetching reviews for the target drug you provided: '{drug}'"
28
 
29
 
30
def scrape_drugs_com_reviews(drug_name, max_pages=5, sleep_time=2):
    """
    Scrapes user reviews from Drugs.com for a given drug.

    Pages are requested one after another from
    ``https://www.drugs.com/comments/<drug_name>/`` until ``max_pages`` is
    reached, a request fails, or a page contains no ``.user-comment`` blocks.
    Whatever was collected up to that point is returned.

    Args:
        drug_name: Drug slug used in the Drugs.com comments URL.
        max_pages: Maximum number of result pages to request.
        sleep_time: Seconds to pause after each successfully scraped page.

    Returns:
        A pandas DataFrame with the columns ``condition``, ``rating``,
        ``review``, ``date`` and ``source``; fields missing from a review
        block are ``None``. The columns are present even when no review
        was collected.
    """
    # Local import keeps this block independent of the module's import section.
    from urllib.parse import quote

    columns = ["condition", "rating", "review", "date", "source"]
    # Percent-encode the name so spaces or '?' in it cannot produce a
    # malformed URL or inject a query string.
    base_url = f"https://www.drugs.com/comments/{quote(str(drug_name).strip())}/"
    reviews = []

    def _text(node):
        # select_one() returns None when the selector does not match.
        return node.get_text(strip=True) if node else None

    for page in range(1, max_pages + 1):
        url = base_url if page == 1 else f"{base_url}?page={page}"
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(
                url,
                headers={'User-Agent': 'Mozilla/5.0'},
                timeout=30,
            )
        except requests.RequestException as exc:
            # Keep the pages already scraped instead of losing them.
            print(f"Failed to fetch page {page}: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to fetch page {page}")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        review_blocks = soup.select('.user-comment')

        # An empty page means we ran past the last page of reviews.
        if not review_blocks:
            break

        for block in review_blocks:
            reviews.append({
                "condition": _text(block.select_one('.drug-condition')),
                "rating": _text(block.select_one('.rating-score')),
                "review": _text(block.select_one('.user-comment-text')),
                "date": _text(block.select_one('.comment-date')),
                "source": url,
            })

        print(f"[✓] Page {page} scraped.")
        time.sleep(sleep_time)

    return pd.DataFrame(reviews, columns=columns)
 
 
 
 
69
 
70
 
71
 
 
4
  import pytz
5
  import yaml
6
  from tools.final_answer import FinalAnswerTool
7
+ from playwright.sync_api import sync_playwright
8
+ import time
9
  from bs4 import BeautifulSoup
10
  import pandas as pd
11
  import time
 
29
  return f"Error fetching reviews for the target drug you provided: '{drug}'"
30
 
31
 
32
def scrape_drugs_com_reviews(drug_name, max_pages=3, delay=2):
    """
    Scrapes user reviews from Drugs.com for a given drug.

    Each page of ``https://www.drugs.com/comments/<drug_name>/`` is loaded in
    a headless Chromium browser (Playwright) and the rendered HTML is parsed
    with BeautifulSoup. Scraping stops when ``max_pages`` is reached, a page
    fails to load, or a page contains no ``.user-comment`` blocks; whatever
    was collected up to that point is returned.

    Args:
        drug_name: Drug slug used in the Drugs.com comments URL.
        max_pages: Maximum number of result pages to load.
        delay: Seconds to wait after navigation (to let the page render) and
            again after each page has been parsed.

    Returns:
        A pandas DataFrame with the columns ``condition``, ``rating``,
        ``review``, ``date`` and ``source``; fields missing from a review
        block are ``None``. The columns are present even when no review
        was collected.
    """
    columns = ["condition", "rating", "review", "date", "source"]

    # Nothing to scrape: skip the cost of starting a browser.
    if max_pages < 1:
        return pd.DataFrame([], columns=columns)

    # Local imports keep this block independent of the module's import section.
    from urllib.parse import quote
    from playwright.sync_api import Error as PlaywrightError

    # Percent-encode the name so spaces or '?' in it cannot produce a
    # malformed URL or inject a query string.
    base_url = f"https://www.drugs.com/comments/{quote(str(drug_name).strip())}/"
    all_reviews = []

    def _text(node):
        # select_one() returns None when the selector does not match.
        return node.get_text(strip=True) if node else None

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()

            for page_num in range(1, max_pages + 1):
                url = base_url if page_num == 1 else f"{base_url}?page={page_num}"
                print(f"Scraping: {url}")
                try:
                    page.goto(url, timeout=60000)
                except PlaywrightError as exc:
                    # Navigation errors/timeouts end the scrape but keep the
                    # reviews gathered from earlier pages.
                    print(f"Failed to load page {page_num}: {exc}")
                    break
                time.sleep(delay)  # Give page some time to load

                soup = BeautifulSoup(page.content(), 'html.parser')
                review_blocks = soup.select('.user-comment')

                if not review_blocks:
                    print("No reviews found on this page.")
                    break

                for block in review_blocks:
                    all_reviews.append({
                        "condition": _text(block.select_one('.drug-condition')),
                        "rating": _text(block.select_one('.rating-score')),
                        "review": _text(block.select_one('.user-comment-text')),
                        "date": _text(block.select_one('.comment-date')),
                        "source": url,
                    })

                time.sleep(delay)
        finally:
            # Close the browser even if parsing or navigation raised.
            browser.close()

    return pd.DataFrame(all_reviews, columns=columns)
75
 
76
 
77