# Source: folder upload to the Hugging Face Hub by purpleriann (revision a22e84b, verified)
import time
from typing import Dict, List
from bs4 import BeautifulSoup
from bs4.element import Tag
from loguru import logger
from selenium.webdriver.common.by import By
from llm_engineering.domain.documents import PostDocument
from llm_engineering.domain.exceptions import ImproperlyConfigured
from llm_engineering.settings import settings
from .base import BaseSeleniumCrawler
class LinkedInCrawler(BaseSeleniumCrawler):
    """Selenium-based crawler that scrapes a LinkedIn profile and stores its posts.

    NOTE(review): LinkedIn's updated security/feed structure broke this flow,
    so the crawler is deprecated by default; ``login()`` and ``extract()``
    raise unless ``is_deprecated=False`` is passed explicitly.
    """

    model = PostDocument

    # Seconds to wait after each navigation so dynamically rendered content loads.
    _PAGE_LOAD_WAIT = 5

    def __init__(self, scroll_limit: int = 5, is_deprecated: bool = True) -> None:
        super().__init__(scroll_limit)
        self._is_deprecated = is_deprecated

    def set_extra_driver_options(self, options) -> None:
        # Keep the browser window open after the driver script finishes.
        options.add_experimental_option("detach", True)

    def login(self) -> None:
        """Log in to LinkedIn with credentials taken from ``settings``.

        Raises:
            DeprecationWarning: always, while the crawler is marked deprecated.
            ImproperlyConfigured: if the username/password settings are missing.
        """
        if self._is_deprecated:
            raise DeprecationWarning(
                "As LinkedIn has updated its security measures, the login() method is no longer supported."
            )

        # Fail fast on missing credentials before navigating anywhere.
        if not settings.LINKEDIN_USERNAME or not settings.LINKEDIN_PASSWORD:
            raise ImproperlyConfigured(
                "LinkedIn scraper requires the {LINKEDIN_USERNAME} and {LINKEDIN_PASSWORD} settings."
            )

        self.driver.get("https://www.linkedin.com/login")
        self.driver.find_element(By.ID, "username").send_keys(settings.LINKEDIN_USERNAME)
        self.driver.find_element(By.ID, "password").send_keys(settings.LINKEDIN_PASSWORD)
        self.driver.find_element(By.CSS_SELECTOR, ".login__form_action_container button").click()

    def extract(self, link: str, **kwargs) -> None:
        """Scrape a LinkedIn profile page and bulk-insert its posts.

        Args:
            link: URL of the LinkedIn profile to scrape.
            **kwargs: must contain ``user``, an object exposing ``id`` and
                ``full_name`` used to attribute the stored posts.

        Raises:
            DeprecationWarning: always, while the crawler is marked deprecated.
        """
        if self._is_deprecated:
            raise DeprecationWarning(
                "As LinkedIn has updated its feed structure, the extract() method is no longer supported."
            )

        # BUG FIX: the previous code guarded this lookup with
        # `if self.model.link is not None:`, which reads `link` on the
        # PostDocument *class* (never a stored document's value) and so does
        # not implement deduplication. Query for an existing document instead.
        old_model = self.model.find(link=link)
        if old_model is not None:
            logger.info(f"Post already exists in the database: {link}")
            return

        logger.info(f"Starting scrapping data for profile: {link}")
        self.login()

        soup = self._get_page_content(link)

        data = {  # noqa
            "Name": self._scrape_section(soup, "h1", class_="text-heading-xlarge"),
            "About": self._scrape_section(soup, "div", class_="display-flex ph5 pv3"),
            "Main Page": self._scrape_section(soup, "div", {"id": "main-content"}),
            "Experience": self._scrape_experience(link),
            "Education": self._scrape_education(link),
        }

        # Open the profile's "show all posts" view.
        self.driver.get(link)
        time.sleep(self._PAGE_LOAD_WAIT)
        button = self.driver.find_element(
            By.CSS_SELECTOR, ".app-aware-link.profile-creator-shared-content-view__footer-action"
        )
        button.click()

        # Scroll through the feed, then parse the fully rendered page.
        self.scroll_page()
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        post_elements = soup.find_all(
            "div",
            class_="update-components-text relative update-components-update-v2__commentary",
        )
        buttons = soup.find_all("button", class_="update-components-image__image-link")
        post_images = self._extract_image_urls(buttons)

        posts = self._extract_posts(post_elements, post_images)
        logger.info(f"Found {len(posts)} posts for profile: {link}")

        # NOTE(review): the driver is not closed if an earlier step raises;
        # cleanup responsibility presumably lies with BaseSeleniumCrawler — confirm.
        self.driver.close()

        user = kwargs["user"]
        self.model.bulk_insert(
            [
                PostDocument(platform="linkedin", content=post, author_id=user.id, author_full_name=user.full_name)
                for post in posts
            ]
        )

        logger.info(f"Finished scrapping data for profile: {link}")

    def _scrape_section(self, soup: BeautifulSoup, *args, **kwargs) -> str:
        """Return the stripped text of the first element matching *args/**kwargs.

        Returns an empty string when no matching element exists.
        """
        parent_div = soup.find(*args, **kwargs)
        return parent_div.get_text(strip=True) if parent_div else ""

    def _extract_image_urls(self, buttons: List[Tag]) -> Dict[str, str]:
        """Extract image URLs from post image-button elements.

        Args:
            buttons: BeautifulSoup ``Tag`` objects for the posts' image buttons.

        Returns:
            Mapping of post index keys ("Post_0", "Post_1", ...) to image URLs.
        """
        post_images: Dict[str, str] = {}
        for i, button in enumerate(buttons):
            img_tag = button.find("img")
            if img_tag and "src" in img_tag.attrs:
                post_images[f"Post_{i}"] = img_tag["src"]
            else:
                logger.warning("No image found in this button")
        return post_images

    def _get_page_content(self, url: str) -> BeautifulSoup:
        """Navigate to *url*, wait for it to render, and return its parsed DOM."""
        self.driver.get(url)
        time.sleep(self._PAGE_LOAD_WAIT)
        return BeautifulSoup(self.driver.page_source, "html.parser")

    def _extract_posts(self, post_elements: List[Tag], post_images: Dict[str, str]) -> Dict[str, Dict[str, str]]:
        """Combine post text elements with their matching images by index.

        Args:
            post_elements: BeautifulSoup ``Tag`` objects holding post text.
            post_images: image URLs keyed by post index ("Post_<i>").

        Returns:
            Mapping of "Post_<i>" to {"text": ..., "image": ...} (image optional).
        """
        posts_data: Dict[str, Dict[str, str]] = {}
        for i, post_element in enumerate(post_elements):
            post_text = post_element.get_text(strip=True, separator="\n")
            post_data = {"text": post_text}
            # Images were keyed by the same enumeration index in _extract_image_urls.
            if f"Post_{i}" in post_images:
                post_data["image"] = post_images[f"Post_{i}"]
            posts_data[f"Post_{i}"] = post_data
        return posts_data

    def _scrape_experience(self, profile_url: str) -> str:
        """Scrape the Experience detail page; empty string if the section is absent."""
        self.driver.get(profile_url + "/details/experience/")
        time.sleep(self._PAGE_LOAD_WAIT)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        experience_content = soup.find("section", {"id": "experience-section"})
        return experience_content.get_text(strip=True) if experience_content else ""

    def _scrape_education(self, profile_url: str) -> str:
        """Scrape the Education detail page; empty string if the section is absent."""
        self.driver.get(profile_url + "/details/education/")
        time.sleep(self._PAGE_LOAD_WAIT)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        education_content = soup.find("section", {"id": "education-section"})
        return education_content.get_text(strip=True) if education_content else ""