MoJaff committed on
Commit
fbb7d6a
·
verified ·
1 Parent(s): 2eef82c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -12
app.py CHANGED
@@ -1,10 +1,9 @@
1
  import torch
2
  from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
3
- from gtts import gTTS # Replace KPipeline with gTTS
4
  import soundfile as sf
5
  import numpy as np
6
  import gradio as gr
7
- import os
8
 
9
  # Initialize the image-to-text pipeline
10
  captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
@@ -54,23 +53,28 @@ def Generate_story(textAbout):
54
 
55
  return story
56
 
57
- # Function to generate audio using gTTS
58
- def Generate_audio(text, lang='en'):
59
- tts = gTTS(text=text, lang=lang, slow=False) # Create a gTTS object
60
- audio_file = "output_audio.mp3" # Save as MP3
61
- tts.save(audio_file) # Save the audio file
62
- return audio_file
 
 
 
63
 
64
  # Main function to process the image and generate audio
65
  def Mustalhim(image):
66
  caption = Image_Caption(image)
67
  story = Generate_story(caption)
68
- audio_file = Generate_audio(story)
69
- return audio_file
70
 
71
  # Gradio interface
72
  def gradio_interface(image):
73
- audio_file = Mustalhim(image)
 
 
74
  return audio_file
75
 
76
  # Path to the example image
@@ -81,7 +85,7 @@ app = gr.Interface(
81
  fn=gradio_interface,
82
  inputs=gr.Image(type="pil"),
83
  outputs=gr.Audio(type="filepath"),
84
- title="Image to Audio Story",
85
  description="Upload an image, and the app will generate a story and convert it to audio.",
86
  examples=[[example_image]]
87
  )
 
1
  import torch
2
  from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
3
+ from kokoro import KPipeline
4
  import soundfile as sf
5
  import numpy as np
6
  import gradio as gr
 
7
 
8
  # Initialize the image-to-text pipeline
9
  captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
 
53
 
54
  return story
55
 
56
# Function to generate audio from text using the Kokoro TTS pipeline.
def Generate_audio(text, voice='bm_lewis', speed=1):
    """Synthesize speech for *text* and return (waveform, sample_rate).

    Parameters
    ----------
    text : str
        Text to narrate. Blank-line-separated paragraphs are synthesized
        as separate chunks (split_pattern=r'\\n+').
    voice : str
        Kokoro voice id (default 'bm_lewis').
    speed : float
        Playback-speed multiplier passed through to the pipeline.

    Returns
    -------
    tuple
        (1-D numpy waveform, sample rate). 24000 is used as the rate,
        matching the original code; presumably Kokoro's fixed output
        rate — TODO confirm against the kokoro docs.
    """
    pipeline = KPipeline(lang_code='b')  # 'b' — assumed British-English voice set; verify
    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
    # Gather per-chunk arrays and join them with a single C-level
    # concatenate instead of flattening sample-by-sample through a
    # Python list (the original extend() loop ran once per sample).
    chunks = [np.asarray(audio) for _, _, audio in generator]
    if not chunks:
        # Empty input text: return an empty waveform rather than letting
        # np.concatenate raise on an empty sequence.
        return np.array([]), 24000
    return np.concatenate(chunks), 24000
65
 
66
# Main function: image -> caption -> story -> narrated audio.
def Mustalhim(image):
    """Run the full pipeline on *image*.

    Captions the image, expands the caption into a story, and narrates
    the story. Returns whatever Generate_audio returns for the story.
    """
    return Generate_audio(Generate_story(Image_Caption(image)))
72
 
73
# Gradio interface
def gradio_interface(image):
    """Gradio callback: turn an uploaded image into a narrated-story audio file.

    Runs the Mustalhim pipeline, writes the resulting waveform to a
    unique temporary WAV file, and returns the file path (consumed by
    gr.Audio(type="filepath")).
    """
    import tempfile  # stdlib; local import keeps the scraped top-of-file block untouched

    audio_waveform, sampling_rate = Mustalhim(image)
    # A unique temp file instead of a fixed "output_audio.wav" in the
    # CWD, so concurrent Gradio sessions don't overwrite each other.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_file = tmp.name
    sf.write(audio_file, audio_waveform, sampling_rate)
    return audio_file
79
 
80
  # Path to the example image
 
85
  fn=gradio_interface,
86
  inputs=gr.Image(type="pil"),
87
  outputs=gr.Audio(type="filepath"),
88
+ title="Mustalhim",
89
  description="Upload an image, and the app will generate a story and convert it to audio.",
90
  examples=[[example_image]]
91
  )