from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
from gtts import gTTS  # gTTS for text-to-speech (replaces the earlier KPipeline approach)
import gradio as gr
# Initialize the image-to-text pipeline for captioning the uploaded image
captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

# Text-generation model setup
device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)
# Use LlamaTokenizer for compatibility with this checkpoint
tokenizer = LlamaTokenizer.from_pretrained(model_id)
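# Note: AutoTokenizer.from_pretrained(model_id) is the more general choice and
# should also work here; LlamaTokenizer is kept as a deliberate compatibility pick.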

# Initialize the text-generation pipeline
generator = transformers_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)
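# do_sample=False means greedy (deterministic) decoding, so the same caption
# always yields the same story; max_new_tokens=500 caps the story's length.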

# Function to generate a caption for the image
def Image_Caption(image):
    # The pipeline returns a list of dicts like [{'generated_text': ...}]
    caption = captionImage(image)
    return caption[0]['generated_text']

# Function to generate a story from the caption text
def Generate_story(textAbout):
    # Define the prompt as a string
    prompt = f'write a long story about {textAbout} that takes 3 min to read'
    # Generate the story using the generator pipeline
    story = generator(prompt)
    # Extract the generated text
    story = story[0]['generated_text']
    # Clean up the story: drop newlines and the 'arafed' artifact token
    # that BLIP captions sometimes introduce
    story = story.replace('\n', ' ').replace('arafed', ' ')
    return story

# Function to generate audio using gTTS
def Generate_audio(text, lang='en'):
    tts = gTTS(text=text, lang=lang, slow=False)  # Create a gTTS object
    audio_file = "output_audio.mp3"  # Save as MP3
    tts.save(audio_file)  # Write the audio file to disk
    return audio_file
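# Note: gTTS sends the text to Google's TTS web service, so audio generation
# requires an internet connection; Gradio's Audio output takes the MP3 filepath.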

# Main function: image -> caption -> story -> audio
def Mustalhim(image):
    caption = Image_Caption(image)
    story = Generate_story(caption)
    audio_file = Generate_audio(story)
    return audio_file

# Gradio interface wrapper
def gradio_interface(image):
    audio_file = Mustalhim(image)
    return audio_file

# Path to the example image
example_image = "Example.PNG"

# Create the Gradio app
app = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),
    outputs=gr.Audio(type="filepath"),
    title="Image to Audio Story",
    description="Upload an image, and the app will generate a story and convert it to audio.",
    examples=[[example_image]],
)

# Launch the app
app.launch()
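# To run locally: `python app.py`, then open the URL Gradio prints
# (http://127.0.0.1:7860 by default).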