"""Gradio app that turns an uploaded image into a narrated story.

Flow: caption the image, ask a text-generation model for a story about the
caption, synthesize the story to an MP3 with gTTS, and hand the resulting
file path back to Gradio for playback.
"""

import os
import tempfile

import torch
from transformers import (
    AutoModelForCausalLM,
    LlamaTokenizer,
    pipeline as transformers_pipeline,
)
from gtts import gTTS  # gTTS is the speech backend (it replaced KPipeline)
import soundfile as sf
import numpy as np
import gradio as gr

# Image-to-text (captioning) pipeline.
captionImage = transformers_pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
)

# NOTE(review): `device` is not referenced anywhere else in the visible code;
# confirm whether it was meant to be passed to the pipelines below.
device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"

# Causal language model used for story generation.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)

# LlamaTokenizer is used for compatibility with this model.
tokenizer = LlamaTokenizer.from_pretrained(model_id)

# Text-generation pipeline: greedy decoding, returns only the newly generated
# text (not the prompt), capped at 500 new tokens.
generator = transformers_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)


def Image_Caption(image) -> str:
    """Return the caption produced by the captioning pipeline for *image*.

    Args:
        image: The image to describe. The Gradio input below is configured
            with ``type="pil"``, so the app passes a PIL image here.

    Returns:
        The ``generated_text`` of the first captioning result.
    """
    caption = captionImage(image)
    return caption[0]['generated_text']


def Generate_story(textAbout) -> str:
    """Generate a story about *textAbout* and flatten it to a single line.

    Args:
        textAbout: Subject of the story; interpolated into the prompt.

    Returns:
        The generated story with newlines replaced by spaces and every
        occurrence of ``'arafed'`` replaced by a space.
    """
    prompt = f'write a long story about {textAbout} that takes 3 min to read'

    story = generator(prompt)[0]['generated_text']

    # Newlines are flattened to spaces; 'arafed' is presumably a stray token
    # leaking in from the caption -- TODO confirm.
    return story.replace('\n', ' ').replace('arafed', ' ')


def Generate_audio(text, lang='en') -> str:
    """Synthesize *text* to an MP3 file with gTTS and return the file path.

    Each call writes to its own uniquely named temporary file. A single
    fixed filename would be shared by every request handled by the app, so
    one request could overwrite audio that another is still serving.

    Args:
        text: The text to speak.
        lang: Language code passed to gTTS.

    Returns:
        Path of the newly written ``.mp3`` file. The file is not deleted
        automatically; the caller owns it.

    Raises:
        Whatever gTTS raises on construction or while saving; the temporary
        file is removed before the error propagates.
    """
    # Build the gTTS object first so no temp file is created if this raises.
    tts = gTTS(text=text, lang=lang, slow=False)

    # Reserve a unique path, then close the handle so gTTS can open the
    # path itself (required on platforms that forbid a second open).
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        audio_file = tmp.name

    try:
        tts.save(audio_file)
    except Exception:
        # Do not leave an empty/partial file behind when synthesis fails.
        if os.path.exists(audio_file):
            os.remove(audio_file)
        raise
    return audio_file


def Mustalhim(image) -> str:
    """Run the full pipeline: image -> caption -> story -> audio file path."""
    caption = Image_Caption(image)
    story = Generate_story(caption)
    return Generate_audio(story)


def gradio_interface(image) -> str:
    """Gradio callback: return the audio file path generated for *image*."""
    return Mustalhim(image)


# Path to the example image shown in the UI.
example_image = "Example.PNG"

# Create the Gradio app.
app = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),
    outputs=gr.Audio(type="filepath"),
    title="Image to Audio Story",
    description="Upload an image, and the app will generate a story and convert it to audio.",
    examples=[[example_image]],
)

# Launch the app.
app.launch()