Spaces:

krishnapal2308
/

eye_for_blind

Sleeping

File size: 2,759 Bytes

73d4923
 
 
 
 
 
 
3a2d1fe
73d4923
 
 
 
3a2d1fe
 
00eaff9
 
 
73d4923
 
 
00eaff9
73d4923
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
090ccb6
00eaff9
 
 
 
 
 
090ccb6
62d3fbb
090ccb6
 
 
 
 
 
 
3a2d1fe
 
090ccb6
3a2d1fe
4701b71
3a2d1fe
 
00eaff9
 
3a2d1fe
 
73d4923
3a2d1fe
73d4923
3a2d1fe
00eaff9
62d3fbb
73d4923

import tempfile
import gradio as gr
from gtts import gTTS
import inference_script
import vit_gpt2
import os
import warnings

warnings.filterwarnings('ignore')


def process_image_and_generate_output(image, model_selection):
    """Caption an image with the chosen model and read the caption aloud.

    Args:
        image: Image supplied by the UI, or ``None`` when nothing was
            uploaded. It is forwarded unchanged to the captioning model.
        model_selection: ``'Basic Model'`` or ``'ViT-GPT2'``.

    Returns:
        A ``(text, audio)`` tuple. ``text`` is the predicted caption, or a
        short message when the image is missing or the model name is not
        recognised. ``audio`` is the synthesized speech as raw ``bytes``
        read back from an ``.mp3`` temp file, or ``None`` when there is
        nothing to speak.
    """
    if image is None:
        return "Please select an image", None

    if model_selection == 'Basic Model':
        # Original author's note: trained only for 15 epochs without any
        # hyperparameter tuning, utilizing Inception v3.
        result = inference_script.evaluate(image)
        # Join the predicted tokens and drop the last one
        # (presumably the end-of-sequence marker -- TODO confirm).
        pred_caption = ' '.join(result).rsplit(' ', 1)[0]
        # Removing '<unk>' leaves doubled/leading spaces behind; collapse them.
        pred_caption = ' '.join(pred_caption.replace('<unk>', '').split())
    elif model_selection == 'ViT-GPT2':
        # Original author's note: SOTA model for image captioning.
        result = vit_gpt2.predict_step(image)
        pred_caption = result[0]
    else:
        return "Invalid model selection", None

    # gTTS refuses empty text, so skip speech synthesis for a blank caption
    # instead of failing the whole request.
    if not pred_caption or not pred_caption.strip():
        return pred_caption, None

    # Generate speech from the caption.
    tts = gTTS(text=pred_caption, lang='en', slow=False)

    # Only reserve a unique path here; the handle is closed before gTTS
    # writes to it, because some platforms cannot reopen an open temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio:
        audio_file_path = temp_audio.name

    try:
        tts.save(audio_file_path)
        with open(audio_file_path, "rb") as f:
            audio_content = f.read()
    finally:
        # delete=False means nobody else removes the file; clean up even
        # when synthesis or reading fails.
        os.unlink(audio_file_path)

    return pred_caption, audio_content


# sample_images = [
#     [os.path.join(os.path.dirname(__file__), "sample_images/1.jpg"), 'Basic Model'],
#     [os.path.join(os.path.dirname(__file__), "sample_images/2.jpg"), 'Basic Model'],
#     [os.path.join(os.path.dirname(__file__), "sample_images/3.jpg"), 'Basic Model'],
#     [os.path.join(os.path.dirname(__file__), "sample_images/4.jpg"), "ViT-GPT2"],
#     [os.path.join(os.path.dirname(__file__), "sample_images/5.jpg"), "ViT-GPT2"],
#     [os.path.join(os.path.dirname(__file__), "sample_images/6.jpg"), "ViT-GPT2"]
# ]

# Example rows for the interface: one single-column row per bundled sample
# image (sample_images/1.jpg ... sample_images/6.jpg). Only the image input
# is pre-filled; no model choice is attached to an example.
sample_images = [[f"sample_images/{index}.jpg"] for index in range(1, 7)]


# Image component through which the user uploads the picture to caption.
image_input = gr.Image(label="Upload Image")

# Radio buttons for picking which captioning model handles the image.
model_selection_input = gr.Radio(
    choices=["Basic Model", "ViT-GPT2"],
    label="Choose Model",
)

# Wire the captioning + text-to-speech function into a Gradio interface:
# two inputs (image, model choice) and two outputs (caption text, audio).
iface = gr.Interface(
    title="Eye For Blind | Image Captioning & TTS",
    description="To be added",
    fn=process_image_and_generate_output,
    inputs=[image_input, model_selection_input],
    outputs=["text", "audio"],
    examples=sample_images,
    cache_examples=False,
    allow_flagging='never',
)

# Start serving the app.
iface.launch()