import os
import tempfile
import warnings

import gradio as gr
from gtts import gTTS

import inference_script
import vit_gpt2

warnings.filterwarnings('ignore')

# Define problem statement
problem_statement = """
### Problem Statement

Visually impaired individuals face challenges in understanding image content. This project
aims to address this issue by generating descriptive spoken captions for images, leveraging
a CNN for feature extraction and an RNN for sequence generation. The model is trained on
the Flickr8K dataset and extended with an attention mechanism for enhanced accessibility.
"""

# Define solution overview
solution_overview = """
### Solution Overview

The basic model, trained for a limited duration without extensive hyperparameter tuning,
primarily focuses on exploring subclassing techniques. To improve inference quality, the
ViT-GPT2 architecture is integrated as an alternative backend.
[Visit the Kaggle notebook](https://www.kaggle.com/code/krishna2308/eye-for-blind) for
implementation details.
"""

# Define real-life scenario application
real_life_scenario = """
### Real-life Scenario Application

While the current implementation does not support real-time processing, the potential for
future development is vast. Imagine a visually impaired individual wearing smart glasses
equipped with a camera. As they move around, the camera captures live footage of their
surroundings, which is processed in real time by the image captioning model integrated
into the glasses. The generated spoken descriptions are streamed directly to the user's
earpiece, providing instant audio feedback about their environment.
"""


def process_image_and_generate_output(image, model_selection):
    """Caption the image with the selected model and synthesize the caption as speech."""
    if image is None:
        return "Please select an image", None

    if model_selection == 'Basic Model':
        result = inference_script.evaluate(image)
        # Drop the trailing end-of-sequence token from the predicted tokens.
        pred_caption = ' '.join(result).rsplit(' ', 1)[0]
        # The token argument below was lost in extraction (the original was
        # probably an angle-bracketed special token such as '<unk>'); it is
        # restored here so stray special tokens are stripped from the caption.
        pred_caption = pred_caption.replace('<unk>', '')
    elif model_selection == 'ViT-GPT2':
        result = vit_gpt2.predict_step(image)
        pred_caption = result[0]
    else:
        return "Invalid model selection", None

    # Generate speech from the caption; write it to a temporary MP3 file
    # (saved after the file handle is closed, for cross-platform safety).
    tts = gTTS(text=pred_caption, lang='en', slow=False)
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
        audio_file_path = temp_audio.name
    tts.save(audio_file_path)

    # Read the audio file back so Gradio can serve its contents.
    with open(audio_file_path, "rb") as f:
        audio_content = f.read()

    # Clean up the temporary audio file.
    os.unlink(audio_file_path)

    return pred_caption, audio_content


# Sample inputs (image path paired with a model choice) shown as clickable examples.
sample_images = [
    [os.path.join(os.path.dirname(__file__), "sample_images/1.jpg"), "ViT-GPT2"],
    [os.path.join(os.path.dirname(__file__), "sample_images/1.jpg"), "Basic Model"],
    [os.path.join(os.path.dirname(__file__), "sample_images/3.jpg"), "ViT-GPT2"],
    [os.path.join(os.path.dirname(__file__), "sample_images/3.jpg"), "Basic Model"],
]

# Image upload widget
image_input = gr.Image(label="Upload Image")

# Radio buttons to choose the captioning model
model_selection_input = gr.Radio(["Basic Model", "ViT-GPT2"], label="Choose Model")

iface = gr.Interface(
    fn=process_image_and_generate_output,
    inputs=[image_input, model_selection_input],
    outputs=[gr.Text(label="Caption"), gr.Audio(label="Audio")],
    examples=sample_images,
    allow_flagging='never',
    title="Eye For Blind | Image Captioning & TTS Demo",
    description=f"{problem_statement}\n\n{solution_overview}\n\n{real_life_scenario}",
)

iface.launch()
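
# ---------------------------------------------------------------------------
# The two captioning backends are imported from modules not shown in this file.
# Below is a minimal sketch of the interfaces this script assumes (the module
# and function names come from the imports above; the bodies are illustrative
# stand-ins only, useful for wiring up the UI without the trained models):
#
#   # inference_script.py -- basic CNN+RNN model (assumed interface)
#   def evaluate(image):
#       """Return the predicted caption as a list of tokens, ending with an
#       end-of-sequence marker that the caller strips off."""
#       return ["a", "dog", "runs", "through", "the", "grass", "<end>"]
#
#   # vit_gpt2.py -- ViT-GPT2 model (assumed interface)
#   def predict_step(image):
#       """Return a list with one decoded caption string per input image."""
#       return ["a dog running through the grass"]
# ---------------------------------------------------------------------------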