Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,9 @@
|
|
1 |
import torch
|
2 |
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
|
3 |
-
from
|
4 |
import soundfile as sf
|
5 |
import numpy as np
|
6 |
import gradio as gr
|
7 |
-
import os
|
8 |
|
9 |
# Initialize the image-to-text pipeline
|
10 |
captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
|
@@ -54,23 +53,28 @@ def Generate_story(textAbout):
|
|
54 |
|
55 |
return story
|
56 |
|
57 |
-
# Function to generate audio
|
58 |
-
def Generate_audio(text,
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
63 |
|
64 |
# Main function to process the image and generate audio
|
65 |
def Mustalhim(image):
|
66 |
caption = Image_Caption(image)
|
67 |
story = Generate_story(caption)
|
68 |
-
|
69 |
-
return
|
70 |
|
71 |
# Gradio interface
|
72 |
def gradio_interface(image):
|
73 |
-
|
|
|
|
|
74 |
return audio_file
|
75 |
|
76 |
# Path to the example image
|
@@ -81,7 +85,7 @@ app = gr.Interface(
|
|
81 |
fn=gradio_interface,
|
82 |
inputs=gr.Image(type="pil"),
|
83 |
outputs=gr.Audio(type="filepath"),
|
84 |
-
title="
|
85 |
description="Upload an image, and the app will generate a story and convert it to audio.",
|
86 |
examples=[[example_image]]
|
87 |
)
|
|
|
1 |
import torch
|
2 |
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
|
3 |
+
from kokoro import KPipeline
|
4 |
import soundfile as sf
|
5 |
import numpy as np
|
6 |
import gradio as gr
|
|
|
7 |
|
8 |
# Initialize the image-to-text pipeline
|
9 |
captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
|
|
|
53 |
|
54 |
return story
|
55 |
|
56 |
+
# Function to generate audio
def Generate_audio(text, voice='bm_lewis', speed=1):
    """Synthesize *text* to speech with the Kokoro TTS pipeline.

    Parameters
    ----------
    text : str
        Text to narrate; runs of newlines split it into chunks
        (``split_pattern=r'\\n+'``) that are synthesized one by one.
    voice : str, optional
        Kokoro voice identifier (default ``'bm_lewis'``).
    speed : float, optional
        Speech-rate multiplier (default ``1``).

    Returns
    -------
    tuple[numpy.ndarray, int]
        The concatenated waveform and its sampling rate
        (24000 Hz -- the rate Kokoro emits; TODO confirm against kokoro docs).
    """
    # Build the TTS pipeline once and reuse it across calls: constructing
    # KPipeline loads model weights, which is wasteful to repeat per request.
    if not hasattr(Generate_audio, "_pipeline"):
        Generate_audio._pipeline = KPipeline(lang_code='b')
    generator = Generate_audio._pipeline(
        text, voice=voice, speed=speed, split_pattern=r'\n+'
    )
    # Gather each chunk's waveform and join them in a single O(n) pass;
    # extending a Python list sample-by-sample is far slower.
    chunks = [np.asarray(audio) for _, _, audio in generator]
    # Guard the empty case: np.concatenate raises on an empty sequence.
    full_audio = np.concatenate(chunks) if chunks else np.array([])
    return full_audio, 24000
|
65 |
|
66 |
# Main function to process the image and generate audio
def Mustalhim(image):
    """Run the full image-to-narration pipeline.

    Captions *image*, expands the caption into a story, then voices the
    story; returns whatever Generate_audio returns (waveform, sample rate).
    """
    # Three stages chained in order; each consumes the previous output.
    caption_text = Image_Caption(image)
    return Generate_audio(Generate_story(caption_text))
|
72 |
|
73 |
# Gradio interface
def gradio_interface(image):
    """Gradio callback: generate narrated audio for an uploaded image.

    Parameters
    ----------
    image : PIL.Image.Image
        Image supplied by the Gradio ``gr.Image(type="pil")`` input.

    Returns
    -------
    str
        Path to a WAV file, consumed by ``gr.Audio(type="filepath")``.
    """
    import tempfile  # stdlib; local import leaves the module imports untouched

    audio_waveform, sampling_rate = Mustalhim(image)
    # Write to a unique temp path rather than a fixed "output_audio.wav":
    # concurrent requests would otherwise overwrite each other's file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_file = tmp.name
    sf.write(audio_file, audio_waveform, sampling_rate)
    return audio_file
|
79 |
|
80 |
# Path to the example image
|
|
|
85 |
fn=gradio_interface,
|
86 |
inputs=gr.Image(type="pil"),
|
87 |
outputs=gr.Audio(type="filepath"),
|
88 |
+
title="Mustalhim",
|
89 |
description="Upload an image, and the app will generate a story and convert it to audio.",
|
90 |
examples=[[example_image]]
|
91 |
)
|