import asyncio
import concurrent.futures
import time
from datetime import datetime

from llama_cpp import Llama
from selenium import webdriver
from selenium.common import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from btts import generateAudioFile
# from GenerateAIPodcast import generateMp3  # currently unused

# Alternative backend: an OpenAI-compatible server (e.g. LM Studio) instead of llama.cpp.
# from openai import OpenAI
# client = OpenAI(base_url="http://localhost:8080/v1", api_key="lm-studio")
'''
def make_request(link):
    print("-----------------------------------------------------------------------------------------")
    print("Make Request is called")
    try:
        completion = client.chat.completions.create(
            model="model-identifier",
            messages=[
                {"role": "system", "content": "Always answer short and detailed, and don't use * in your answers. It should be good to hear as a podcast."},
                {"role": "user", "content": f"Please summarize this website: {link}."}
            ],
            temperature=0.7,
        )
        message = completion.choices[0].message.content
        return message
    except Exception as e:
        print(f"Thread encountered an error: {e}")
'''

llm = Llama.from_pretrained(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF",
    filename="llama-3.2-1b-instruct-q8_0.gguf",
)


def generate(link: str):
    # Use the local model to summarize the given link.
    output = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "Always answer short and detailed, and don't use * in your answers. It should be good to hear as a podcast."},
            {"role": "user", "content": f"Please summarize this website: {link}."}
        ]
    )
    # Return the generated text from the JSON response.
    return output['choices'][0]['message']['content']


def run_tldr_crawler():
    # Set up the Selenium WebDriver.
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # Run in headless mode (no browser UI)
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    # Initialize the WebDriver.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        date = datetime.today().strftime('%Y-%m-%d')
        print(date)
        # TLDR does not publish on weekends; comment this out on weekdays to use today's issue.
        date = '2025-03-07'

        # Step 1: Navigate to the TLDR issue for that date.
        url = f"https://tldr.tech/tech/{date}"
        driver.get(url)

        # Step 2: Wait for the page to load.
        time.sleep(2)

        # Step 3: Extract all links on the page, skipping internal, job, and ad links.
        links = driver.find_elements(By.TAG_NAME, 'a')
        extracted_links = []
        for link in links:
            href = link.get_attribute('href')
            if (href is not None
                    and not href.startswith("https://tldr.tech")
                    and not href.startswith("https://jobs")
                    and "advertise" not in href):
                extracted_links.append(href)

        # Output the extracted links.
        print("Extracted Links:")
        print(len(extracted_links))
        for idx, link in enumerate(extracted_links, start=1):
            print(f"{idx}. {link}")

        # Maximum number of threads allowed to run at the same time.
        max_threads = 4

        # Use a ThreadPoolExecutor to run at most 4 summarization requests in parallel.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
            # Submit generate() for every link; enumerate also yields the index,
            # which is later used to number the generated audio files.
            futures = []
            for idx, link in enumerate(extracted_links, start=1):
                future = executor.submit(generate, link)
                futures.append((idx, link, future))

            for idx, link, future in futures:
                result = future.result()
                asyncio.run(generateAudioFile(result, idx))

    except WebDriverException as e:
        print(f"Error while loading the page: {e}")
    finally:
        # Close the WebDriver.
        driver.quit()