Spaces:
Runtime error
Runtime error
Upload 5 files
Browse files- GenerateAIPodcast.py +25 -0
- app.py +11 -19
- crawl_archive.py +136 -0
GenerateAIPodcast.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import the required module
|
2 |
+
import pyttsx3
|
3 |
+
from numpy.ma.core import concatenate
|
4 |
+
|
5 |
+
|
6 |
+
|
7 |
+
def generateMp3(text, number):
    """Synthesize *text* to an MP3 file named ``Ttldr - <number>.mp3``.

    Parameters:
        text: the text to speak.
        number: index used to build the output filename.

    Blocks until the file has been fully written.
    """
    # One init() call is enough. The previous extra
    # pyttsx3.init(driverName='sapi5') discarded its engine and forced the
    # Windows-only SAPI5 driver, which raises on Linux hosts (e.g. HF Spaces).
    engine = pyttsx3.init()

    voices = engine.getProperty('voices')
    # Guard against hosts that expose fewer than three voices; fall back to
    # the default voice instead of raising IndexError.
    if len(voices) > 2:
        engine.setProperty('voice', voices[2].id)

    # Slightly slower than the default rate for podcast clarity.
    newVoiceRate = 145
    engine.setProperty('rate', newVoiceRate)

    # We can use file extension as mp3 and wav, both will work.
    fileName = "Ttldr - " + str(number) + ".mp3"
    engine.save_to_file(text, fileName)

    # Wait until the queued save_to_file command has finished.
    engine.runAndWait()
|
25 |
+
|
app.py
CHANGED
@@ -1,6 +1,9 @@
|
|
1 |
from fastapi import FastAPI
|
2 |
from transformers import pipeline
|
3 |
|
|
|
|
|
|
|
4 |
|
5 |
from llama_cpp import Llama
|
6 |
|
@@ -12,19 +15,6 @@ llm = Llama.from_pretrained(
|
|
12 |
filename="llama-3.2-1b-instruct-q8_0.gguf",
|
13 |
)
|
14 |
|
15 |
-
check = llm.create_chat_completion(
|
16 |
-
messages = [
|
17 |
-
{
|
18 |
-
"role": "user",
|
19 |
-
"content": "What is the capital of France?"
|
20 |
-
}
|
21 |
-
]
|
22 |
-
)
|
23 |
-
|
24 |
-
print(check['choices'][0]['message']['content'])
|
25 |
-
|
26 |
-
## create a new FASTAPI app instance
|
27 |
-
app=FastAPI()
|
28 |
@app.get("/")
|
29 |
def home():
|
30 |
return {"message":"Hello World"}
|
@@ -32,18 +22,20 @@ def home():
|
|
32 |
# Define a function to handle the GET request at `/generate`
|
33 |
|
34 |
|
|
|
35 |
@app.get("/generate")
|
36 |
-
def generate(
|
37 |
## use the pipeline to generate text from given input text
|
38 |
output= llm.create_chat_completion(
|
39 |
messages = [
|
40 |
-
{
|
41 |
-
|
42 |
-
|
43 |
-
}
|
44 |
]
|
45 |
)
|
46 |
|
47 |
## return the generate text in Json reposnfe
|
48 |
-
return
|
|
|
49 |
|
|
|
|
1 |
from fastapi import FastAPI
|
2 |
from transformers import pipeline
|
3 |
|
4 |
+
import crawl_archive
|
5 |
+
import GenerateAIPodcast
|
6 |
+
|
7 |
|
8 |
from llama_cpp import Llama
|
9 |
|
|
|
15 |
filename="llama-3.2-1b-instruct-q8_0.gguf",
|
16 |
)
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
@app.get("/")
def home():
    """Root endpoint; serves a static greeting (simple liveness check)."""
    payload = {"message": "Hello World"}
    return payload
|
|
|
22 |
# Define a function to handle the GET request at `/generate`
|
23 |
|
24 |
|
25 |
+
|
26 |
@app.get("/generate")
def generate(link: str):
    """Summarize the page at *link* with the local Llama model.

    Returns the generated summary text; FastAPI serializes it as the JSON
    response body.
    """
    output = llm.create_chat_completion(
        messages=[
            {"role": "system",
             "content": "Always answer short and most detailled and dont use * in your answers. It should be good to hear as a Podcast"},
            {"role": "user", "content": f"Please summarize this website: {link}."}
        ]
    )

    # Return only the generated text from the first completion choice.
    return output['choices'][0]['message']['content']


# Run the crawler only when this file is executed directly. Calling
# crawl_archive.run_tldr_crawler() at module import time blocks the
# import (Selenium + LLM work) and prevents uvicorn from ever starting
# the server — the likely cause of the Space's runtime error.
if __name__ == "__main__":
    crawl_archive.run_tldr_crawler()
|
crawl_archive.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
|
3 |
+
from llama_cpp import Llama
|
4 |
+
from openai import OpenAI
|
5 |
+
from selenium import webdriver
|
6 |
+
from selenium.common import WebDriverException
|
7 |
+
from selenium.webdriver.common.by import By
|
8 |
+
from selenium.webdriver.common.keys import Keys
|
9 |
+
from selenium.webdriver.chrome.service import Service
|
10 |
+
import concurrent.futures
|
11 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
12 |
+
import time
|
13 |
+
from datetime import datetime
|
14 |
+
import os
|
15 |
+
from GenerateAIPodcast import generateMp3
|
16 |
+
from btts import generateAudioFile
|
17 |
+
|
18 |
+
# client = OpenAI(base_url="http://localhost:8080/v1", api_key="lm-studio")
|
19 |
+
|
20 |
+
|
21 |
+
'''def make_request(link):
|
22 |
+
print("-----------------------------------------------------------------------------------------")
|
23 |
+
print("Make Request is called")
|
24 |
+
try:
|
25 |
+
|
26 |
+
completion = client.chat.completions.create(
|
27 |
+
model="model-identifier",
|
28 |
+
|
29 |
+
messages=[
|
30 |
+
{"role": "system",
|
31 |
+
"content": "Always answer short and most detailled and dont use * in your answers. It should be good to hear as a Podcast"},
|
32 |
+
{"role": "user", "content": f"Please summarize this website: {link}."}
|
33 |
+
],
|
34 |
+
temperature=0.7,
|
35 |
+
)
|
36 |
+
# print(f"Thread: {completion.choices[0].message}")
|
37 |
+
# print("TEST:", completion.choices[0].message)
|
38 |
+
message = completion.choices[0].message.content
|
39 |
+
return message
|
40 |
+
except Exception as e:
|
41 |
+
print(f"Thread encountered an error: {e}^")
|
42 |
+
'''
|
43 |
+
|
44 |
+
|
45 |
+
|
46 |
+
# Download (cached after the first run) and load the quantized
# Llama 3.2 1B instruct model used for page summarization.
llm = Llama.from_pretrained(
    repo_id="hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF",
    filename="llama-3.2-1b-instruct-q8_0.gguf",
)
|
50 |
+
|
51 |
+
def generate(link: str):
    """Ask the local Llama model for a short, podcast-style summary of *link*."""
    chat = [
        {"role": "system",
         "content": "Always answer short and most detailled and dont use * in your answers. It should be good to hear as a Podcast"},
        {"role": "user", "content": f"Please summarize this website: {link}."},
    ]
    output = llm.create_chat_completion(messages=chat)

    # Hand back just the generated text of the first choice.
    return output['choices'][0]['message']['content']
|
63 |
+
|
64 |
+
|
65 |
+
|
66 |
+
def run_tldr_crawler():
    """Crawl a TLDR tech newsletter issue, summarize each external link with
    the local LLM, and synthesize one audio file per summary.

    Side effects: launches a Chrome WebDriver, prints progress, and writes
    audio files via generateAudioFile. Always quits the driver on exit.
    """
    # Setup Selenium WebDriver.
    options = webdriver.ChromeOptions()
    # options.add_argument('--headless')  # enable to run without a browser UI
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')

    # Initialize the WebDriver.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    try:
        date = datetime.today().strftime('%Y-%m-%d')
        print(date)
        # TLDR publishes on weekdays; pin a known issue date so the crawler
        # also works on weekends. Comment this out to use today's date.
        date = '2025-03-07'
        # Step 1: Navigate to the TLDR issue page.
        url = f"https://tldr.tech/tech/{date}"
        driver.get(url)

        # Give the page time to load.
        time.sleep(2)

        # Step 3: Extract all anchors on the page.
        links = driver.find_elements(By.TAG_NAME, 'a')

        # Read each href exactly once: every get_attribute() call is a
        # WebDriver round-trip, and the old comprehension issued up to
        # four of them per anchor.
        extracted_links = []
        for link in links:
            href = link.get_attribute('href')
            if (href is not None
                    and not href.startswith("https://tldr.tech")
                    and not href.startswith("https://jobs")
                    and "advertise" not in href):
                extracted_links.append(href)

        # Output the extracted links.
        print("Extracted Links:")
        print(len(extracted_links))
        for idx, link in enumerate(extracted_links, start=1):
            print(f"{idx}. {link}")

        # Summarize at most four links concurrently.
        # NOTE(review): all workers share the module-level `llm`; llama_cpp
        # contexts are presumably not thread-safe — confirm before raising
        # max_threads or relying on concurrent speedup here.
        max_threads = 4
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
            futures = []
            for idx, link in enumerate(extracted_links, start=1):
                future = executor.submit(generate, link)
                futures.append((idx, link, future))

            for idx, link, future in futures:
                result = future.result()
                # One audio file per summary, numbered by link index.
                asyncio.run(generateAudioFile(result, idx))

    except WebDriverException as e:
        print(f"Fehler beim Laden der Seite: {e}")

    finally:
        # Close the WebDriver.
        driver.quit()
|