Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,8 @@ import requests
|
|
4 |
import pytz
|
5 |
import yaml
|
6 |
from tools.final_answer import FinalAnswerTool
|
|
|
|
|
7 |
from bs4 import BeautifulSoup
|
8 |
import pandas as pd
|
9 |
import time
|
@@ -27,45 +29,49 @@ def scrape_drug_reviews(drug:str)-> str: #it's import to specify the return type
|
|
27 |
return f"Error fetching reviews for the target drug you provided: '{drug}'"
|
28 |
|
29 |
|
30 |
-
def scrape_drugs_com_reviews(drug_name, max_pages=
|
31 |
"""
|
32 |
Scrapes user reviews from Drugs.com for a given drug.
|
33 |
"""
|
34 |
base_url = f"https://www.drugs.com/comments/{drug_name}/"
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
69 |
|
70 |
|
71 |
|
|
|
4 |
import pytz
|
5 |
import yaml
|
6 |
from tools.final_answer import FinalAnswerTool
|
7 |
+
from playwright.sync_api import sync_playwright
|
8 |
+
import time
|
9 |
from bs4 import BeautifulSoup
|
10 |
import pandas as pd
|
11 |
import time
|
|
|
29 |
return f"Error fetching reviews for the target drug you provided: '{drug}'"
|
30 |
|
31 |
|
32 |
+
def scrape_drugs_com_reviews(drug_name, max_pages=3, delay=2):
    """
    Scrapes user reviews from Drugs.com for a given drug.

    Args:
        drug_name: Drug slug as it appears in the Drugs.com comments URL
            (e.g. "ibuprofen" for https://www.drugs.com/comments/ibuprofen/).
        max_pages: Maximum number of review pages to fetch (default 3).
        delay: Seconds to sleep after each page load and between pages,
            both to let the page render and to rate-limit requests.

    Returns:
        pandas.DataFrame with one row per review and columns "condition",
        "rating", "review", "date", and "source" (the URL the review was
        scraped from). Empty DataFrame when no reviews were found.
    """
    base_url = f"https://www.drugs.com/comments/{drug_name}/"
    all_reviews = []

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()

            for page_num in range(1, max_pages + 1):
                # Page 1 has no query parameter; later pages are paginated.
                url = base_url if page_num == 1 else f"{base_url}?page={page_num}"
                print(f"Scraping: {url}")
                page.goto(url, timeout=60000)
                time.sleep(delay)  # Give page some time to load

                html = page.content()
                soup = BeautifulSoup(html, 'html.parser')
                review_blocks = soup.select('.user-comment')

                # Stop paginating once a page yields no review blocks.
                if not review_blocks:
                    print("No reviews found on this page.")
                    break

                for block in review_blocks:
                    # Each selector may be absent on a given review; guard
                    # every extraction so one malformed block doesn't abort.
                    review_text = block.select_one('.user-comment-text')
                    condition = block.select_one('.drug-condition')
                    rating = block.select_one('.rating-score')
                    date = block.select_one('.comment-date')

                    all_reviews.append({
                        "condition": condition.get_text(strip=True) if condition else None,
                        "rating": rating.get_text(strip=True) if rating else None,
                        "review": review_text.get_text(strip=True) if review_text else None,
                        "date": date.get_text(strip=True) if date else None,
                        "source": url,
                    })

                time.sleep(delay)  # Polite pause before the next page
        finally:
            # Fix: always release the headless Chromium process, even when
            # page.goto or parsing raises — the original leaked the browser
            # on any exception inside the scraping loop.
            browser.close()

    return pd.DataFrame(all_reviews)
|
75 |
|
76 |
|
77 |
|