MoJaff commited on
Commit
fd20bfc
·
verified ·
1 Parent(s): 723c2aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -18
app.py CHANGED
@@ -1,9 +1,10 @@
1
  import torch
2
  from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
3
- from kokoro import KPipeline
4
  import soundfile as sf
5
  import numpy as np
6
  import gradio as gr
 
7
 
8
  # Initialize the image-to-text pipeline
9
  captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
@@ -53,31 +54,23 @@ def Generate_story(textAbout):
53
 
54
  return story
55
 
56
- # Function to generate audio
57
- def Generate_audio(text, voice='bm_lewis', speed=1):
58
- pipeline = KPipeline(lang_code='a') # Use 'a' for American English
59
- generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
60
- full_audio = []
61
- for _, _, audio in generator:
62
- if audio is not None: # Check if audio is not None
63
- full_audio.extend(audio)
64
- if not full_audio: # Check if full_audio is empty
65
- raise ValueError("No audio data generated.")
66
- full_audio = np.array(full_audio)
67
- return full_audio, 24000
68
 
69
  # Main function to process the image and generate audio
70
  def Mustalhim(image):
71
  caption = Image_Caption(image)
72
  story = Generate_story(caption)
73
- audio = Generate_audio(story)
74
- return audio
75
 
76
  # Gradio interface
77
  def gradio_interface(image):
78
- audio_waveform, sampling_rate = Mustalhim(image)
79
- audio_file = "output_audio.wav"
80
- sf.write(audio_file, audio_waveform, sampling_rate)
81
  return audio_file
82
 
83
  # Path to the example image
 
1
  import torch
2
  from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
3
+ from gtts import gTTS # Replace KPipeline with gTTS
4
  import soundfile as sf
5
  import numpy as np
6
  import gradio as gr
7
+ import os
8
 
9
  # Initialize the image-to-text pipeline
10
  captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
 
54
 
55
  return story
56
 
57
# Function to generate audio using gTTS (Google Text-to-Speech)
def Generate_audio(text, lang='en'):
    """Convert *text* to speech and save it as an MP3 file.

    Parameters:
        text (str): The text to synthesize; must be non-empty.
        lang (str): Language code understood by gTTS (default 'en').

    Returns:
        str: Path to the saved MP3 file.

    Raises:
        ValueError: If *text* is empty, since gTTS fails opaquely on empty input.
    """
    if not text:
        # Fail early with a clear message instead of letting gTTS error out.
        raise ValueError("No text provided for audio generation.")
    tts = gTTS(text=text, lang=lang, slow=False)  # Create a gTTS object
    # Use a unique temp file rather than a fixed "output_audio.mp3" so that
    # concurrent Gradio requests do not overwrite each other's output.
    import tempfile
    fd, audio_file = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # mkstemp returns an open fd; close it so tts.save can write
    tts.save(audio_file)  # Save the synthesized speech to disk
    return audio_file
 
 
 
 
 
 
63
 
64
# Main pipeline: image -> caption -> story -> narrated audio file
def Mustalhim(image):
    """Run the full image-to-narration pipeline and return the audio file path."""
    return Generate_audio(Generate_story(Image_Caption(image)))
70
 
71
# Gradio interface
def gradio_interface(image):
    """Gradio entry point: delegate to the pipeline and return the audio path."""
    return Mustalhim(image)
75
 
76
  # Path to the example image