File size: 2,442 Bytes
20faed5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# --- Model setup: load the ALLaM-7B instruct model and build a text-generation pipeline ---
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# NOTE(review): `device` is defined but never passed to the model or pipeline;
# everything currently runs on the default device. Kept for compatibility.
device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",          # pick the checkpoint's native dtype
    trust_remote_code=True,      # ALLaM ships custom modeling code
)

# Reuse model_id instead of repeating the literal, so model and tokenizer
# can never drift apart.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE(review): this `messages` list is never used — Generate_story builds its
# own prompt. Kept to avoid breaking any external reference.
messages = [
    {"role": "user", "content": "write a long story that takes 3 min to read"}
]

# Chat-capable text-generation pipeline shared by Generate_story below.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,   # only return newly generated text, not the prompt
    max_new_tokens=500,
    do_sample=False,          # greedy decoding: deterministic output
)

from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf

# NOTE(review): this rebinds the module-level name `pipeline`, shadowing
# transformers.pipeline imported above (which is why the code below has to
# re-import it as `transformers_pipeline`). This KPipeline instance also
# appears unused — Generate_audio constructs its own. Consider renaming or
# removing it; kept as-is here to avoid breaking any external reference.
pipeline = KPipeline(lang_code='b', model=False)

import numpy as np

def Generate_audio(text, voice='bm_lewis', speed=1):
    """Synthesize *text* to speech with Kokoro TTS.

    Parameters
    ----------
    text : str
        Text to narrate; paragraphs (split on newlines) are synthesized
        separately and concatenated.
    voice : str
        Kokoro voice id (default 'bm_lewis').
    speed : float
        Playback speed multiplier.

    Returns
    -------
    (numpy.ndarray, int)
        The full waveform and its sample rate (24 kHz, Kokoro's output rate).
    """
    # Build the TTS pipeline once and cache it on the function — the original
    # version reloaded the model on every call, which is expensive.
    if not hasattr(Generate_audio, "_tts"):
        Generate_audio._tts = KPipeline(lang_code='b')
    tts = Generate_audio._tts

    segment_stream = tts(text, voice=voice, speed=speed, split_pattern=r'\n+')

    # Collect per-segment audio and join in one pass instead of extending a
    # Python list sample-by-sample.
    segments = [np.asarray(audio) for _, _, audio in segment_stream]
    full_audio = np.concatenate(segments) if segments else np.array([])

    return full_audio, 24000

# `pipeline` was rebound to a KPipeline instance above, so the transformers
# factory has to be re-imported under an alias here.
from transformers import pipeline as transformers_pipeline

# BLIP image-captioning pipeline, instantiated once at import time and shared
# by Image_Caption below.
captionImage = transformers_pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
)

def Image_Caption(image):
    """Return the BLIP-generated caption string for *image* (a PIL image)."""
    result = captionImage(image)
    # The pipeline returns a list with one dict per input image.
    return result[0]['generated_text']

def Generate_story(textAbout):
    """Generate a ~3-minute story about *textAbout* with the shared `generator` pipeline.

    Returns the story as a single line of text, ready for TTS.
    """
    # Bug fix: the original had a trailing comma, which wrapped the message
    # dict in a 1-tuple. A chat prompt must be a LIST of message dicts.
    messages = [
        {"role": "user", "content": f'write a long story about {textAbout} that takes 3 min to read'}
    ]

    story = generator(messages)[0]['generated_text']
    # Flatten newlines for the TTS step; 'arafed' is a known BLIP caption
    # artifact that sometimes leaks into the prompt/output.
    return story.replace('\n', ' ').replace('arafed', ' ')

def Mustalhim(image):
    """End-to-end pipeline: image -> caption -> story -> (waveform, sample_rate)."""
    return Generate_audio(Generate_story(Image_Caption(image)))

def gradio_interface(image):
    """Gradio callback: narrate a story about *image* and return the WAV file path."""
    waveform, sample_rate = Mustalhim(image)

    # NOTE: fixed filename — concurrent requests would overwrite each other.
    out_path = "output_audio.wav"
    sf.write(out_path, waveform, sample_rate)

    return out_path

# Bug fix: `gr` was used below but never imported anywhere in the file,
# which raises NameError at startup.
import gradio as gr

example_image = "Example.PNG"

app = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),           # PIL image, as Image_Caption expects
    outputs=gr.Audio(type="filepath"),     # gradio_interface returns a WAV path
    title="Image to Audio Story",
    description="Upload an image, and the app will generate a story and convert it to audio.",
    examples=[[example_image]],
)

# Launch the app
app.launch()