# tldr_crawler / crawl_archive.py
# (Hugging Face upload residue: "Defender117's picture", "Upload 5 files",
#  commit b6204d2 verified — kept as comments so the module parses.)
import asyncio
from llama_cpp import Llama
from openai import OpenAI
from selenium import webdriver
from selenium.common import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
import concurrent.futures
from webdriver_manager.chrome import ChromeDriverManager
import time
from datetime import datetime
import os
from GenerateAIPodcast import generateMp3
from btts import generateAudioFile
# client = OpenAI(base_url="http://localhost:8080/v1", api_key="lm-studio")
# NOTE(review): the triple-quoted block below is dead code — an earlier
# implementation that summarized links through an LM Studio / OpenAI-compatible
# local server instead of llama-cpp. It is a module-level string literal,
# never executed; consider deleting it.
'''def make_request(link):
print("-----------------------------------------------------------------------------------------")
print("Make Request is called")
try:
completion = client.chat.completions.create(
model="model-identifier",
messages=[
{"role": "system",
"content": "Always answer short and most detailled and dont use * in your answers. It should be good to hear as a Podcast"},
{"role": "user", "content": f"Please summarize this website: {link}."}
],
temperature=0.7,
)
# print(f"Thread: {completion.choices[0].message}")
# print("TEST:", completion.choices[0].message)
message = completion.choices[0].message.content
return message
except Exception as e:
print(f"Thread encountered an error: {e}^")
'''
# Load a quantized Llama 3.2 1B Instruct model (GGUF, Q8_0) from the Hugging
# Face Hub via llama-cpp-python. The weights are downloaded (or read from the
# local HF cache) at import time, so importing this module is slow on first run.
llm = Llama.from_pretrained(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF",
    filename="llama-3.2-1b-instruct-q8_0.gguf",
)
def generate(link: str):
    """Summarize the given URL with the local Llama model.

    Sends a fixed podcast-style system prompt plus the link to the
    module-level ``llm`` and returns the assistant's reply text.

    Args:
        link: URL of the website to summarize.

    Returns:
        The content string of the first chat-completion choice.
    """
    conversation = [
        {"role": "system",
         "content": "Always answer short and most detailled and dont use * in your answers. It should be good to hear as a Podcast"},
        {"role": "user", "content": f"Please summarize this website: {link}."},
    ]
    response = llm.create_chat_completion(messages=conversation)
    # Only the first choice is of interest; unwrap its message text.
    first_choice = response['choices'][0]
    return first_choice['message']['content']
def run_tldr_crawler(date=None):
    """Crawl a TLDR tech newsletter page, summarize each external link,
    and synthesize one audio file per summary.

    Args:
        date: Newsletter date as 'YYYY-MM-DD'. Defaults to today's date.
              TLDR does not publish on weekends, so pass a weekday date
              explicitly when running on a weekend.
              (Fix: previously a hard-coded debug date '2025-03-07'
              unconditionally overrode today's date.)
    """
    # Set up the Selenium Chrome WebDriver.
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # Run in headless mode (no browser UI)
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    try:
        if date is None:
            date = datetime.today().strftime('%Y-%m-%d')
        print(date)
        # Step 1: Navigate to the TLDR newsletter page for the requested date.
        url = f"https://tldr.tech/tech/{date}"
        driver.get(url)
        # Crude fixed wait for the page to load.
        time.sleep(2)
        # Step 2: Fetch every anchor's href exactly once — each
        # get_attribute() call is a WebDriver round-trip, so the original
        # code paid up to four round-trips per link.
        hrefs = [a.get_attribute('href')
                 for a in driver.find_elements(By.TAG_NAME, 'a')]
        # Step 3: Keep only external article links: drop internal TLDR
        # pages, job-board links, and advertising links.
        extracted_links = [
            href for href in hrefs
            if href is not None
            and not href.startswith("https://tldr.tech")
            and not href.startswith("https://jobs")
            and "advertise" not in href
        ]
        print("Extracted Links:")
        print(len(extracted_links))
        for idx, link in enumerate(extracted_links, start=1):
            print(f"{idx}. {link}")
        # Summarize at most 4 links concurrently; the local LLM call is
        # the bottleneck, so a small pool keeps memory use bounded.
        max_threads = 4
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
            futures = [
                (idx, link, executor.submit(generate, link))
                for idx, link in enumerate(extracted_links, start=1)
            ]
            # Collect results in submission order and render each summary
            # to an audio file (index keeps output files ordered).
            for idx, link, future in futures:
                result = future.result()
                asyncio.run(generateAudioFile(result, idx))
    except WebDriverException as e:
        print(f"Fehler beim Laden der Seite: {e}")
    finally:
        # Always release the browser, even on failure.
        driver.quit()