MoJaff committed on
Commit
fbb7d6a
·
verified ·
1 Parent(s): 2eef82c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -12
app.py CHANGED
@@ -1,10 +1,9 @@
1
  import torch
2
  from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
3
- from gtts import gTTS # Replace KPipeline with gTTS
4
  import soundfile as sf
5
  import numpy as np
6
  import gradio as gr
7
- import os
8
 
9
  # Initialize the image-to-text pipeline
10
  captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
@@ -54,23 +53,28 @@ def Generate_story(textAbout):
54
 
55
  return story
56
 
57
- # Function to generate audio using gTTS
58
- def Generate_audio(text, lang='en'):
59
- tts = gTTS(text=text, lang=lang, slow=False) # Create a gTTS object
60
- audio_file = "output_audio.mp3" # Save as MP3
61
- tts.save(audio_file) # Save the audio file
62
- return audio_file
 
 
 
63
 
64
  # Main function to process the image and generate audio
65
  def Mustalhim(image):
66
  caption = Image_Caption(image)
67
  story = Generate_story(caption)
68
- audio_file = Generate_audio(story)
69
- return audio_file
70
 
71
  # Gradio interface
72
  def gradio_interface(image):
73
- audio_file = Mustalhim(image)
 
 
74
  return audio_file
75
 
76
  # Path to the example image
@@ -81,7 +85,7 @@ app = gr.Interface(
81
  fn=gradio_interface,
82
  inputs=gr.Image(type="pil"),
83
  outputs=gr.Audio(type="filepath"),
84
- title="Image to Audio Story",
85
  description="Upload an image, and the app will generate a story and convert it to audio.",
86
  examples=[[example_image]]
87
  )
 
1
  import torch
2
  from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
3
+ from kokoro import KPipeline
4
  import soundfile as sf
5
  import numpy as np
6
  import gradio as gr
 
7
 
8
  # Initialize the image-to-text pipeline
9
  captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
 
53
 
54
  return story
55
 
56
# Function to generate audio from text using the Kokoro TTS pipeline.
def Generate_audio(text, voice='bm_lewis', speed=1):
    """Synthesize speech for *text* and return (waveform, sample_rate).

    Parameters
    ----------
    text : str
        Text to narrate. Blank-line-separated paragraphs are synthesized
        as separate chunks (split_pattern=r'\\n+').
    voice : str
        Kokoro voice id (default 'bm_lewis').
    speed : float
        Playback-speed multiplier passed through to the pipeline.

    Returns
    -------
    tuple
        (1-D numpy waveform, sample rate). 24000 is used as the rate,
        matching the original code; presumably Kokoro's fixed output
        rate — TODO confirm against the kokoro docs.
    """
    pipeline = KPipeline(lang_code='b')  # 'b' — assumed British-English voice set; verify
    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
    # Gather per-chunk arrays and join them with a single C-level
    # concatenate instead of flattening sample-by-sample through a
    # Python list (the original extend() loop ran once per sample).
    chunks = [np.asarray(audio) for _, _, audio in generator]
    if not chunks:
        # Empty input text: return an empty waveform rather than letting
        # np.concatenate raise on an empty sequence.
        return np.array([]), 24000
    return np.concatenate(chunks), 24000
65
 
66
# Main function: image -> caption -> story -> narrated audio.
def Mustalhim(image):
    """Run the full pipeline on *image*.

    Captions the image, expands the caption into a story, and narrates
    the story. Returns whatever Generate_audio returns for the story.
    """
    return Generate_audio(Generate_story(Image_Caption(image)))
72
 
73
# Gradio interface
def gradio_interface(image):
    """Gradio callback: turn an uploaded image into a narrated-story audio file.

    Runs the Mustalhim pipeline, writes the resulting waveform to a
    unique temporary WAV file, and returns the file path (consumed by
    gr.Audio(type="filepath")).
    """
    import tempfile  # stdlib; local import keeps the scraped top-of-file block untouched

    audio_waveform, sampling_rate = Mustalhim(image)
    # A unique temp file instead of a fixed "output_audio.wav" in the
    # CWD, so concurrent Gradio sessions don't overwrite each other.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_file = tmp.name
    sf.write(audio_file, audio_waveform, sampling_rate)
    return audio_file
79
 
80
  # Path to the example image
 
85
  fn=gradio_interface,
86
  inputs=gr.Image(type="pil"),
87
  outputs=gr.Audio(type="filepath"),
88
+ title="Mustalhim",
89
  description="Upload an image, and the app will generate a story and convert it to audio.",
90
  examples=[[example_image]]
91
  )