File size: 2,495 Bytes
20faed5
8a3a311
fd20bfc
8a3a311
 
 
fd20bfc
20faed5
8a3a311
 
20faed5
8a3a311
20faed5
8a3a311
20faed5
8a3a311
20faed5
 
 
 
 
 
8a3a311
 
 
 
 
 
 
 
 
 
 
20faed5
 
8a3a311
 
 
 
20faed5
8a3a311
 
3719419
 
 
 
 
 
 
8a3a311
3719419
 
8a3a311
3719419
8a3a311
20faed5
fd20bfc
 
 
 
 
 
20faed5
8a3a311
20faed5
8a3a311
 
fd20bfc
 
20faed5
8a3a311
20faed5
fd20bfc
20faed5
 
8a3a311
20faed5
 
8a3a311
20faed5
8a3a311
 
 
20faed5
 
 
 
 
 
22b03f1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
from gtts import gTTS  # Replace KPipeline with gTTS
import soundfile as sf
import numpy as np
import gradio as gr
import os

# Initialize the image-to-text pipeline
# BLIP-large captioning model; used by Image_Caption() below to turn an
# uploaded image into a short text caption.
captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

# Initialize the text-generation pipeline
# NOTE(review): `device` is assigned but never passed to the model or the
# pipeline — generation runs on whatever device transformers picks by
# default. Confirm whether it should be forwarded (e.g. device=device).
device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"

# Load the model
# torch_dtype="auto" lets transformers use the checkpoint's native dtype;
# trust_remote_code=True executes custom model code shipped with the repo.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)

# Use LlamaTokenizer for compatibility
tokenizer = LlamaTokenizer.from_pretrained(model_id)

# Initialize the text-generation pipeline
# return_full_text=False strips the prompt from the returned text;
# do_sample=False means greedy (deterministic) decoding, capped at
# 500 newly generated tokens.
generator = transformers_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)

# Caption an image with the BLIP pipeline.
def Image_Caption(image):
    """Return the caption string produced by the BLIP pipeline for *image*.

    The pipeline returns a list of result dicts; only the first result's
    'generated_text' field is used.
    """
    results = captionImage(image)
    first_result = results[0]
    return first_result['generated_text']

# Turn a caption/topic into a short story via the text-generation pipeline.
def Generate_story(textAbout):
    """Generate a ~3-minute story about *textAbout* and return it cleaned up.

    Cleanup collapses newlines to spaces and removes the BLIP caption
    artifact token 'arafed'.
    """
    prompt = f'write a long story about {textAbout} that takes 3 min to read'

    # Run the pipeline and pull out the generated text of the first result.
    generated = generator(prompt)[0]['generated_text']

    # Normalize: replace newlines and the 'arafed' artifact with spaces.
    for needle in ('\n', 'arafed'):
        generated = generated.replace(needle, ' ')

    return generated

# Function to generate audio using gTTS
def Generate_audio(text, lang='en', output_path="output_audio.mp3"):
    """Synthesize *text* to speech with gTTS and save it as an MP3.

    Parameters
    ----------
    text : str
        The text to speak.
    lang : str
        gTTS language code (default 'en').
    output_path : str
        Where to write the MP3. Defaults to "output_audio.mp3" in the
        current working directory (same behavior as before; each call
        overwrites the previous file unless a different path is given).

    Returns
    -------
    str
        The path of the saved audio file.
    """
    tts = gTTS(text=text, lang=lang, slow=False)  # network call to Google TTS
    tts.save(output_path)
    return output_path

# End-to-end pipeline: image -> caption -> story -> spoken audio file.
def Mustalhim(image):
    """Run the full image-to-audio-story pipeline and return the MP3 path."""
    story_text = Generate_story(Image_Caption(image))
    return Generate_audio(story_text)

# Gradio handler: thin wrapper so the UI callback has its own name.
def gradio_interface(image):
    """Delegate to Mustalhim and return the generated audio file path."""
    return Mustalhim(image)

# Path to the example image
# NOTE(review): "Example.PNG" must exist next to this script, otherwise the
# Gradio examples gallery will fail to resolve it — confirm it is shipped.
example_image = "Example.PNG"

# Create the Gradio app
app = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),  # hands a PIL image straight to the BLIP captioner
    outputs=gr.Audio(type="filepath"),  # the handler returns the saved MP3 path
    title="Image to Audio Story",
    description="Upload an image, and the app will generate a story and convert it to audio.",
    examples=[[example_image]]
)

# Launch the app
# Blocking call: starts the local web server and serves the UI.
app.launch()