"""googlesearch is a Python library for searching Google, easily."""

from time import sleep
from urllib.parse import unquote

from bs4 import BeautifulSoup
from curl_cffi import requests as curlreq
from requests import get

from tools.googlesearch.gettyimages import get_images
from tools.googlesearch.useragentka import get_useragent
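
# Note: the plain requests client is enough for the search endpoint itself;
# curl_cffi (with browser impersonation) is used further down to fetch result
# pages in advanced mode, presumably because many sites block default Python
# clients.
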
def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
    """Fetch one page of Google search results and return the raw response."""
    resp = get(
        url="https://www.google.com/search",
        headers={
            "User-Agent": get_useragent(),
            "Accept": "*/*",
        },
        params={
            "q": term,
            "num": results + 2,  # request a small buffer of extra results
            "hl": lang,
            "start": start,
            "safe": safe,
            "gl": region,
        },
        proxies=proxies,
        timeout=timeout,
        verify=ssl_verify,
        # Pre-set consent cookies so Google returns results directly rather
        # than redirecting to its cookie-consent page.
        cookies={
            "CONSENT": "PENDING+987",
            "SOCS": "CAESHAgBEhIaAB",
        },
    )
    resp.raise_for_status()
    return resp
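
# For reference, a sketch of the request _req() issues; the parameter values
# here are illustrative, not defaults:
#   https://www.google.com/search?q=python&num=12&hl=en&start=0&safe=active&gl=us
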
class SearchResult:
    """Container for a single parsed search result."""

    def __init__(self, url, title, description):
        self.url = url
        self.title = title
        self.description = description

    def __repr__(self):
        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
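
# Example of the container above (hypothetical values); note that search()
# below currently returns plain links, or dicts in advanced mode, rather than
# SearchResult instances, so the class is presumably kept for compatibility:
#   >>> SearchResult("https://example.com", "Example", "An example page.")
#   SearchResult(url=https://example.com, title=Example, description=An example page.)
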
def search(term, num_results=10, lang="en", proxy=None, advanced=False,
           sleep_interval=0, timeout=5, safe="active", ssl_verify=None,
           region=None, start_num=0, unique=False):
    """Search the Google search engine.

    Returns a dict with "results" (a list of links, or of dicts with link,
    title, description and page_text when advanced=True) and "images".
    """
    # Route both schemes through the same proxy when one is supplied.
    proxies = (
        {"https": proxy, "http": proxy}
        if proxy and proxy.startswith(("https", "http"))
        else None
    )

    start = start_num
    fetched_results = 0    # results collected so far
    fetched_links = set()  # links already seen, for optional deduplication
    results_list = []
    image_results = []

    while fetched_results < num_results:
        resp = _req(term, num_results - start,
                    lang, start, proxies, timeout, safe, ssl_verify, region)

        soup = BeautifulSoup(resp.text, "html.parser")
        # Each organic result on the basic-HTML results page is wrapped in a
        # div of class "ezO2md" (a Google-internal name that may change).
        result_block = soup.find_all("div", class_="ezO2md")
        new_results = 0

        # Collect inline images from the results page itself, both base64
        # data URIs and externally hosted thumbnails.
        try:
            for img in soup.find_all("img"):
                img_src = img.get("src") or img.get("data-src")
                if img_src and img_src.startswith(("data:image", "http")):
                    image_results.append({
                        "src": img_src,
                        "alt": img.get("alt", ""),
                        "class": img.get("class", []),
                    })
        except Exception as e:
            print(f"Error parsing images: {e}")

        for result in result_block:
            link_tag = result.find("a", href=True)
            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
            description_tag = result.find("span", class_="FrIlee")

            if link_tag and title_tag and description_tag:
                # Hrefs look like "/url?q=<target>&sa=...": strip the wrapper
                # and percent-decode to recover the target URL.
                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
                if unique and link in fetched_links:
                    continue
                fetched_links.add(link)
                title = title_tag.text
                description = description_tag.text

                # In advanced mode, scrape full page text only while no earlier
                # entry carries "page_text", i.e. for the first result only;
                # later results get an empty string to keep latency down.
                if advanced and not any('page_text' in r for r in results_list):
                    try:
                        page_scrape = curlreq.get(link, impersonate='chrome110')
                        page_scrape.encoding = 'utf-8'
                        page_soup = BeautifulSoup(page_scrape.text, "html.parser")

                        # Heuristic cascade for the main content container,
                        # from most to least specific, falling back to <body>.
                        main_content = (
                            page_soup.find(['article', 'main']) or
                            page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or
                            page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or
                            page_soup.find('div', {'role': 'main'}) or
                            page_soup.body
                        )
                        if main_content:
                            # Drop non-content elements before extracting text.
                            for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']):
                                element.decompose()

                            text = main_content.get_text(separator=' ', strip=True)
                            text = ' '.join(line.strip() for line in text.splitlines() if line.strip())
                            # Keep words longer than one character, capped at 3000 chars.
                            page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000]
                        else:
                            page_text = ""
                    except Exception as e:
                        print(f"Error scraping {link}: {e}")
                        page_text = ""
                else:
                    page_text = ""

                fetched_results += 1
                new_results += 1

                if advanced:
                    results_list.append({
                        "link": link,
                        "title": title,
                        "description": description,
                        "page_text": page_text,
                    })
                else:
                    results_list.append(link)

                if fetched_results >= num_results:
                    break

        # Stop when a page yields nothing new (end of results, or a page
        # Google served without parseable results).
        if new_results == 0:
            break

        start += 10  # advance to the next results page
        sleep(sleep_interval)

    # If the results page itself contained no images, fall back to a
    # dedicated image search.
    if not image_results:
        image_results = get_images(term)
    return {"results": results_list, "images": image_results}
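
# Minimal usage sketch (assumes network access and the tools.googlesearch
# helpers above; the query and parameter values are illustrative only).
if __name__ == "__main__":
    out = search("python web scraping", num_results=5, advanced=True)
    for item in out["results"]:
        print(item["link"], "-", item["title"])
    print(f"{len(out['images'])} image(s) collected")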
|