MoJaff committed on
Commit 20faed5 · verified · 1 Parent(s): ccdc1fd

Create app.py

Files changed (1)
1. app.py +109 -0
app.py ADDED
@@ -0,0 +1,109 @@
import torch
import numpy as np
import soundfile as sf
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline as transformers_pipeline
from kokoro import KPipeline

device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"

# Story-generation model and tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Greedy text-generation pipeline that writes the story.
generator = transformers_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)

# Image-captioning pipeline that describes the uploaded image.
captionImage = transformers_pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
)


def Generate_audio(text, voice='bm_lewis', speed=1):
    # Kokoro TTS; lang_code='b' selects British English.
    tts_pipeline = KPipeline(lang_code='b')
    segments = tts_pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')

    # Concatenate the per-segment audio chunks into one waveform.
    full_audio = []
    for _, _, audio in segments:
        full_audio.extend(audio)
    return np.array(full_audio), 24000  # Kokoro renders audio at 24 kHz


def Image_Caption(image):
    caption = captionImage(image)
    return caption[0]['generated_text']


def Generate_story(textAbout):
    # The chat-style prompt must be a list of message dicts, not a bare dict.
    messages = [
        {"role": "user", "content": f'write a long story about {textAbout} that takes 3 min to read'}
    ]
    story = generator(messages)[0]['generated_text']
    # Flatten newlines for TTS and drop the 'arafed' artifact that BLIP captions sometimes inject.
    return story.replace('\n', ' ').replace('arafed', ' ')


def Mustalhim(image):
    # Image -> caption -> story -> (waveform, sampling rate).
    caption = Image_Caption(image)
    story = Generate_story(caption)
    return Generate_audio(story)


def gradio_interface(image):
    audio_waveform, sampling_rate = Mustalhim(image)
    audio_file = "output_audio.wav"
    sf.write(audio_file, audio_waveform, sampling_rate)
    return audio_file


example_image = "Example.PNG"

app = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),
    outputs=gr.Audio(type="filepath"),
    title="Image to Audio Story",
    description="Upload an image, and the app will generate a story and convert it to audio.",
    examples=[[example_image]],
)

# Launch the app
app.launch()