Spaces:
Sleeping
Sleeping
File size: 2,850 Bytes
20faed5 8a3a311 20faed5 8a3a311 20faed5 8a3a311 20faed5 8a3a311 20faed5 8a3a311 20faed5 8a3a311 20faed5 8a3a311 20faed5 8a3a311 3719419 8a3a311 3719419 8a3a311 3719419 8a3a311 20faed5 8a3a311 20faed5 723c2aa 20faed5 21cb76c 20faed5 8a3a311 20faed5 8a3a311 20faed5 8a3a311 20faed5 8a3a311 20faed5 8a3a311 20faed5 8a3a311 20faed5 22b03f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import gradio as gr
# Image-to-text (captioning) pipeline: BLIP-large produces a one-sentence
# English description of an input image.
captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
# --- Text-generation model setup (ALLaM 7B instruct) ---
# NOTE(review): `device` is assigned but never passed to the model or the
# pipeline below — presumably intended for .to(device); confirm and wire up
# or remove.
device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"
# Load the causal LM; torch_dtype="auto" lets transformers pick the dtype
# stored in the checkpoint, and trust_remote_code allows the repo's custom
# modeling code to run.
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype="auto",
trust_remote_code=True,
)
# Use LlamaTokenizer explicitly for compatibility with this checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(model_id)
# Text-generation pipeline used by Generate_story below.
# return_full_text=False strips the prompt from the output;
# do_sample=False makes generation greedy/deterministic.
generator = transformers_pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
return_full_text=False,
max_new_tokens=500,
do_sample=False,
)
# Function to generate caption
def Image_Caption(image):
    """Return a one-sentence text caption for *image* via the BLIP pipeline."""
    predictions = captionImage(image)
    first = predictions[0]
    return first['generated_text']
# Function to generate a story
def Generate_story(textAbout):
    """Generate a short story about *textAbout* using the text-generation pipeline.

    The prompt asks for roughly three minutes of reading time; the output is
    flattened to a single line and cleaned of the BLIP artifact token 'arafed'.
    """
    prompt = f'write a long story about {textAbout} that takes 3 min to read'
    outputs = generator(prompt)
    text = outputs[0]['generated_text']
    # Normalize whitespace and remove the captioning artifact.
    for old, new in (('\n', ' '), ('arafed', ' ')):
        text = text.replace(old, new)
    return text
# Function to generate audio
def Generate_audio(text, voice='bm_lewis', speed=1):
    """Synthesize *text* to speech with Kokoro TTS.

    Returns a tuple ``(samples, sample_rate)`` where ``samples`` is a 1-D
    numpy array and the rate is Kokoro's 24 kHz output.
    Raises ValueError if synthesis yields no audio at all.
    """
    # NOTE(review): lang_code 'a' is American English while the default voice
    # prefix 'b' looks British — confirm this pairing against the Kokoro docs.
    tts = KPipeline(lang_code='a')  # Use 'a' for American English
    segments = tts(text, voice=voice, speed=speed, split_pattern=r'\n+')
    samples = []
    for _, _, chunk in segments:
        if chunk is None:
            continue  # segment produced no audio; skip it
        samples.extend(chunk)
    if not samples:
        raise ValueError("No audio data generated.")
    return np.array(samples), 24000
# Main function to process the image and generate audio
def Mustalhim(image):
    """Run the full pipeline: image -> caption -> story -> (waveform, rate)."""
    return Generate_audio(Generate_story(Image_Caption(image)))
# Gradio interface
def gradio_interface(image):
    """Gradio callback: generate the story audio and persist it as a WAV file.

    Returns the path of the written file, which Gradio serves back as audio.
    """
    waveform, rate = Mustalhim(image)
    path = "output_audio.wav"
    sf.write(path, waveform, rate)
    return path
# Path to the example image shown in the Gradio examples row.
# NOTE(review): "Example.PNG" must exist next to this script, or the examples
# row will fail to load — confirm the file is committed to the Space.
example_image = "Example.PNG"
# Build the Gradio UI: one image input, one audio (file) output.
app = gr.Interface(
fn=gradio_interface,
inputs=gr.Image(type="pil"),
outputs=gr.Audio(type="filepath"),
title="Image to Audio Story",
description="Upload an image, and the app will generate a story and convert it to audio.",
examples=[[example_image]]
)
# Launch the Gradio server (blocks until stopped).
# Fix: the original line ended with a stray " |" — a table-border artifact
# from the page this file was scraped from — which is a SyntaxError.
app.launch()