import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from gtts import gTTS  # Replace KPipeline with gTTS
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline

# Image-captioning pipeline used to describe the uploaded picture
captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

# Settings for the story-writing language model
device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"

# Load the model; torch_dtype="auto" leaves the dtype choice to the library and
# trust_remote_code=True allows code shipped with the model repository to run.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)

# Use LlamaTokenizer for compatibility
tokenizer = LlamaTokenizer.from_pretrained(model_id)

# Text-generation pipeline: returns only the newly generated text, capped at
# 500 new tokens, with sampling disabled.
generator = transformers_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,  # previously `device` was defined but never applied anywhere
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)

# Function to generate caption
def Image_Caption(image):
    """Return the caption text of the first result the captioning pipeline gives for ``image``."""
    return captionImage(image)[0]['generated_text']

# Function to generate a story
def Generate_story(textAbout):
    """Generate a story about ``textAbout`` with the text-generation pipeline.

    Args:
        textAbout: Subject of the story (the image caption, when called from
            ``Mustalhim``).

    Returns:
        The generated story with newlines and the token ``'arafed'`` replaced
        by spaces.
    """
    # NOTE(review): 'arafed' looks like a stray token from the captioning
    # model - confirm. It is stripped from the subject too, because removing it
    # only from the output still lets it reach the model through the prompt.
    subject = ' '.join(textAbout.replace('arafed', ' ').split())
    prompt = f'write a long story about {subject} that takes 3 min to read'

    # The pipeline returns a list of results; the text sits under 'generated_text'
    outputs = generator(prompt)
    raw_story = outputs[0]['generated_text']

    # Flatten to a single line and drop any 'arafed' that still shows up
    return raw_story.replace('\n', ' ').replace('arafed', ' ')

# Function to generate audio using gTTS
def Generate_audio(text, lang='en'):
    """Convert ``text`` to speech with gTTS and return the path of the MP3 file.

    Args:
        text: The text to be spoken.
        lang: Language code handed to gTTS (defaults to ``'en'``).

    Returns:
        Path of a newly created MP3 file. The file is not deleted
        automatically; the caller (or the OS temp cleanup) owns it.
    """
    tts = gTTS(text=text, lang=lang, slow=False)

    # Reserve a unique path per call: a fixed "output_audio.mp3" in the working
    # directory is overwritten by every call and collides when calls overlap.
    # The handle is closed before gTTS writes so the path can be reopened on
    # every platform.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
        audio_file = handle.name

    tts.save(audio_file)
    return audio_file

# Main function to process the image and generate audio
def Mustalhim(image):
    """Run the full chain for ``image``: caption it, write a story, narrate the story.

    Returns the audio file path produced by ``Generate_audio``.
    """
    return Generate_audio(Generate_story(Image_Caption(image)))

# Gradio interface
def gradio_interface(image):
    """Gradio callback: turn the submitted image into a narrated story.

    Args:
        image: The value delivered by the Gradio image input; ``None`` when the
            form is submitted without an image.

    Returns:
        Path of the generated audio file.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Show a readable message in the UI instead of failing inside the
        # captioning pipeline with an opaque error.
        raise gr.Error("Please upload an image first.")
    return Mustalhim(image)

# Sample image offered as a ready-made example in the UI
example_image = "Example.PNG"

# Build the Gradio app: a PIL image goes in, an audio file path comes out
app = gr.Interface(
    gradio_interface,
    gr.Image(type="pil"),
    gr.Audio(type="filepath"),
    examples=[[example_image]],
    title="Image to Audio Story",
    description="Upload an image, and the app will generate a story and convert it to audio.",
)

# Start serving the app
app.launch()