podcastgen

Paused

App Files Community

Rausda6 committed on May 21

Commit

cda1077

verified ·

1 Parent(s): 139ac7f

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -121

app.py CHANGED Viewed

@@ -9,139 +9,132 @@ import os
 import time
 import mimetypes
 from typing import List, Dict
-from transformers import AutoTokenizer, AutoModelForCausalLM  # Replaced gemini
-import torch  # Replaced gemini
 # Constants
 MAX_FILE_SIZE_MB = 20
 MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024  # Convert MB to bytes
-# Model setup
-MODEL_ID = "tabularisai/german-gemma-3-1b-it"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto",
-).eval()
 class PodcastGenerator:
     def __init__(self):
-        pass
-    async def generate_script(self, prompt: str, language: str, api_key: str, file_obj=None, progress=None) -> Dict:
         example = """
 {
-    "topic": "AGI",
-    "podcast": [
-        {
-            "speaker": "Alex",
-            "text": "Hallo und willkommen zur heutigen Folge über AGI."
-        },
-        {
-            "speaker": "Ben",
-            "text": "Danke Alex! Wir sprechen heute über Artificial General Intelligence und ihre möglichen Auswirkungen."
-        }
-    ]
 }
 """
-        full_prompt = (
-            f"Lies den Inhalt und fasse ihn in einem kurzen Podcast-Drehbuch in folgender JSON-Struktur zusammen:\n"
-            f"{example}\n\n"
-            f"Inhalt:\n{prompt}"
-        )
-        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
-        output = model.generate(**inputs, max_new_tokens=1024, do_sample=True)
-        script_text = tokenizer.decode(output[0], skip_special_tokens=True)
         try:
-            podcast_json = json.loads(script_text)
-        except json.JSONDecodeError:
-            return {"error": "Fehler beim Parsen der Antwort: Kein gültiges JSON erkannt."}
-        return podcast_json
-    async def text_to_audio(self, segments: List[Dict], language: str, progress=None) -> str:
-        audio_segments = []
-        temp_files = []
-        for idx, segment in enumerate(segments):
-            speaker = segment.get("speaker", "Sprecher")
-            text = segment.get("text", "")
-            tts_text = f"{speaker} sagt: {text}"
-            filename = f"temp_{uuid.uuid4()}.mp3"
-            communicate = edge_tts.Communicate(tts_text, "de-DE-KasperNeural")
-            await communicate.save(filename)
-            audio = AudioSegment.from_file(filename, format="mp3")
-            audio_segments.append(audio)
-            temp_files.append(filename)
-            if progress:
-                progress((idx + 1) / len(segments), f"Segment {idx+1} erstellt...")
-        podcast = sum(audio_segments[1:], audio_segments[0])
-        final_filename = f"podcast_{uuid.uuid4()}.mp3"
-        podcast.export(final_filename, format="mp3")
-        for f in temp_files:
-            os.remove(f)
-        return final_filename
-async def generate_podcast(prompt, language, api_key, file, progress=gr.Progress(track_tqdm=True)):
-    gen = PodcastGenerator()
-    text = ""
-    if file is not None:
-        if file.size > MAX_FILE_SIZE_BYTES:
-            return "Datei zu groß. Maximal erlaubt sind 20 MB.", None
-        mime_type, _ = mimetypes.guess_type(file.name)
-        if not mime_type or not mime_type.endswith("pdf"):
-            return "Nur PDF-Dateien sind erlaubt.", None
-        async with aiofiles.open(file.name, mode="rb") as f:
-            pdf_data = await f.read()
-            import fitz  # PyMuPDF
-            with fitz.open(stream=pdf_data, filetype="pdf") as doc:
-                text = "\n".join([page.get_text() for page in doc])
-    else:
-        text = prompt
-    progress(0.1, "Erzeuge Skript...")
-    script = await gen.generate_script(text, language, api_key, file_obj=file, progress=progress)
-    if "error" in script:
-        return script["error"], None
-    progress(0.5, "Erzeuge Audio...")
-    podcast_path = await gen.text_to_audio(script["podcast"], language, progress=progress)
-    progress(1.0, "Fertig!")
-    return "Podcast erfolgreich erstellt!", podcast_path
-demo = gr.Interface(
-    fn=generate_podcast,
-    inputs=[
-        gr.Textbox(label="Thema oder Text", lines=5, placeholder="Worum soll es im Podcast gehen?"),
-        gr.Radio(["de"], label="Sprache", value="de"),
-        gr.Textbox(label="API Key (nicht benötigt für dieses Modell)", type="password", placeholder=""),
-        gr.File(label="Optional: PDF-Datei hochladen", file_types=[".pdf"])
-    ],
-    outputs=[
-        gr.Textbox(label="Status"),
-        gr.Audio(label="Erstellter Podcast", type="filepath")
-    ],
-    title="Podcast Generator (German-Gemma)",
-    description="Erstelle Podcasts aus Text oder PDFs mithilfe eines KI-Modells. Nutzt das Modell 'tabularisai/german-gemma-3-1b-it'.",
-    allow_flagging="never"
-)
-if __name__ == "__main__":
-    demo.launch()

 import time
 import mimetypes
 from typing import List, Dict
+# NEW – Hugging Face Transformers
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+# NEW – external model id
+MODEL_ID = "tabularisai/german-gemma-3-1b-it"
 # Constants
 MAX_FILE_SIZE_MB = 20
 MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024  # Convert MB to bytes
 class PodcastGenerator:
     def __init__(self):
+        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+            device_map="auto",
+        ).eval()
+    async def generate_script(
+        self,
+        prompt: str,
+        language: str,
+        api_key: str,
+        file_obj=None,
+        progress=None,
+    ) -> Dict:
         example = """
 {
+  "topic": "AGI",
+  "podcast": [
+    {
+      "speaker": 2,
+      "line": "So, AGI, huh? Seems like everyone's talking about it these days."
+    },
+    {
+      "speaker": 1,
+      "line": "Yeah, it's definitely having a moment, isn't it?"
+    }
+  ]
 }
 """
+        if language == "Auto Detect":
+            language_instruction = (
+                "- The podcast MUST be in the same language as the user input."
+            )
+        else:
+            language_instruction = f"- The podcast MUST be in {language} language"
+        system_prompt = f"""
+You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
+{language_instruction}
+- The podcast should have 2 speakers.
+- The podcast should be long.
+- Do not use names for the speakers.
+- The podcast should be interesting, lively, and engaging, and hook the listener from the start.
+- The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
+- The script must be in JSON format.
+Follow this example structure:
+{example}
+"""
+        if prompt and file_obj:
+            user_prompt = (
+                f"Please generate a podcast script based on the uploaded file following user input:\n{prompt}"
+            )
+        elif prompt:
+            user_prompt = (
+                f"Please generate a podcast script based on the following user input:\n{prompt}"
+            )
+        else:
+            user_prompt = "Please generate a podcast script based on the uploaded file."
+        # If a file is provided we still read it for completeness (not required for HF generation)
+        if file_obj:
+            _ = await self._read_file_bytes(file_obj)
+        if progress:
+            progress(0.3, "Generating podcast script...")
+        inputs = self.tokenizer(
+            f"{system_prompt}\n\n{user_prompt}", return_tensors="pt"
+        ).to(self.model.device)
         try:
+            output = self.model.generate(**inputs, max_new_tokens=2048, temperature=1.0)
+            response_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
+        except Exception as e:
+            raise Exception(f"Failed to generate podcast script: {e}")
+        print(f"Generated podcast script:\n{response_text}")
+        if progress:
+            progress(0.4, "Script generated successfully!")
+        return json.loads(response_text)
+    async def _read_file_bytes(self, file_obj) -> bytes:
+        if hasattr(file_obj, "size"):
+            file_size = file_obj.size
+        else:
+            file_size = os.path.getsize(file_obj.name)
+        if file_size > MAX_FILE_SIZE_BYTES:
+            raise Exception(
+                f"File size exceeds the {MAX_FILE_SIZE_MB}MB limit. Please upload a smaller file."
+            )
+        if hasattr(file_obj, "read"):
+            return file_obj.read()
+        else:
+            async with aiofiles.open(file_obj.name, "rb") as f:
+                return await f.read()
+    @staticmethod
+    def _get_mime_type(filename: str) -> str:
+        ext = os.path.splitext(filename)[1].lower()
+        if ext == ".pdf":
+            return "application/pdf"
+        elif ext == ".txt":
+            return "text/plain"
+        else:
+            mime_type, _ = mimetypes.guess_type(filename)
+            return mime_type or "application/octet-stream"