MoJaff committed on
Commit 8a3a311 · verified · 1 Parent(s): 90dfe38

Update app.py

Files changed (1)
  1. app.py +45 -68
app.py CHANGED
@@ -1,109 +1,86 @@
  import torch


  device = "cpu"
- model_id ="ALLaM-AI/ALLaM-7B-Instruct-preview"
-
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-

  model = AutoModelForCausalLM.from_pretrained(
      model_id,
      torch_dtype="auto",
      trust_remote_code=True,
  )

- tokenizer = AutoTokenizer.from_pretrained("ALLaM-AI/ALLaM-7B-Instruct-preview")
-
- messages = [
-     {"role": "user", "content": "write a long story that takes 3 min to read"}
- ]
-
-
- generator = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
-     return_full_text=False,
-     max_new_tokens=500,
-     do_sample=False
  )

- from kokoro import KPipeline
- from IPython.display import display, Audio
- import soundfile as sf
-
- pipeline = KPipeline(lang_code='b', model=False)

- import numpy as np

  def Generate_audio(text, voice='bm_lewis', speed=1):
-
      pipeline = KPipeline(lang_code='b')
-
-
      generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
-
-
      full_audio = []
      for _, _, audio in generator:
          full_audio.extend(audio)
-
-
      full_audio = np.array(full_audio)
-
-
      return full_audio, 24000

- from transformers import pipeline as transformers_pipeline
-
-
-
- captionImage = transformers_pipeline("image-to-text",
-                                      model="Salesforce/blip-image-captioning-large")
-
- def Image_Caption(image):
-     caption = captionImage(image)
-     caption = caption[0]['generated_text']
-     return caption
-
- def Generate_story(textAbout):
-     storyAbout = {"role": "user", "content": f'write a long story about {textAbout} that takes 3 min to read'},
-
-     story = generator(storyAbout)
-     story = story[0]['generated_text']
-     story = story.replace('\n', ' ').replace('arafed', ' ')
-     return story
-
  def Mustalhim(image):
-     caption = Image_Caption(image)
-     story = Generate_story(caption)
-     audio = Generate_audio(story)
-     return audio

  def gradio_interface(image):
-
      audio_waveform, sampling_rate = Mustalhim(image)
-
-
      audio_file = "output_audio.wav"
      sf.write(audio_file, audio_waveform, sampling_rate)
-
-
      return audio_file

  example_image = "Example.PNG"

-
  app = gr.Interface(
-     fn=gradio_interface,
-     inputs=gr.Image(type="pil"),
-     outputs=gr.Audio(type="filepath"),
      title="Image to Audio Story",
      description="Upload an image, and the app will generate a story and convert it to audio.",
      examples=[[example_image]]
  )

  # Launch the app
- app.launch()
-
  import torch
+ from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
+ from kokoro import KPipeline
+ import soundfile as sf
+ import numpy as np
+ import gradio as gr

+ # Initialize the image-to-text pipeline
+ captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
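
The captioning pipeline returns a list with one dict per input, and its `generated_text` field holds the caption (the indexing in `Image_Caption` below relies on this). A minimal standalone sketch, assuming a hypothetical local image `photo.jpg`:

```python
from PIL import Image

result = captionImage(Image.open("photo.jpg"))  # hypothetical input image
print(result[0]['generated_text'])              # e.g. "a dog running on a beach"
```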

+ # Text-generation model configuration
  device = "cpu"
+ model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"

+ # Load the model
  model = AutoModelForCausalLM.from_pretrained(
      model_id,
      torch_dtype="auto",
      trust_remote_code=True,
  )
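
Note that `device` is assigned above but never used; `from_pretrained` loads to CPU by default, so this is cosmetic rather than a bug. If explicit placement is wanted, a minimal sketch:

```python
# Hypothetical: make the device placement explicit.
model = model.to(device)  # device == "cpu" here
```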

+ # Use LlamaTokenizer for compatibility
+ tokenizer = LlamaTokenizer.from_pretrained(model_id)
+
+ # Initialize the text-generation pipeline
+ generator = transformers_pipeline(
+     "text-generation",
+     model=model,
+     tokenizer=tokenizer,
+     return_full_text=False,
+     max_new_tokens=500,
+     do_sample=False,
  )
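
With `return_full_text=False`, only the newly generated text comes back. The pipeline accepts chat-style input as a list of role/content dicts, which is exactly what the removed `messages` variable held; a minimal sketch, assuming the model's tokenizer ships a chat template:

```python
messages = [{"role": "user", "content": "write a long story that takes 3 min to read"}]
out = generator(messages)           # returns [{'generated_text': '...'}]
print(out[0]['generated_text'])
```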

+ # Function to generate a caption
+ def Image_Caption(image):
+     caption = captionImage(image)
+     return caption[0]['generated_text']

+ # Function to generate a story
+ def Generate_story(textAbout):
+     storyAbout = {"role": "user", "content": f'write a long story about {textAbout} that takes 3 min to read'}
+     story = generator([storyAbout])  # wrap in a list: the pipeline expects chat messages as a list of dicts
+     story = story[0]['generated_text']
+     story = story.replace('\n', ' ').replace('arafed', ' ')
+     return story
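
`'arafed'` is a junk token that BLIP-large captions are known to emit; since it originates in the caption rather than the story, the scrub could arguably live in `Image_Caption` instead. A sketch of that variant:

```python
def Image_Caption(image):
    caption = captionImage(image)[0]['generated_text']
    # Drop the known BLIP caption artifact before it reaches the prompt.
    return caption.replace('arafed', '').strip()
```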

+ # Function to generate audio
  def Generate_audio(text, voice='bm_lewis', speed=1):
      pipeline = KPipeline(lang_code='b')
      generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
      full_audio = []
      for _, _, audio in generator:
          full_audio.extend(audio)
      full_audio = np.array(full_audio)
      return full_audio, 24000
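
Each item the Kokoro pipeline yields unpacks as a `(graphemes, phonemes, audio)` triple, with audio chunks sampled at 24 kHz, which is why the function returns that rate. Collecting the chunks and concatenating once is a slightly tidier equivalent of the `extend` loop; a sketch of the body under the same assumptions:

```python
chunks = [audio for _, _, audio in pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')]
full_audio = np.concatenate(chunks) if chunks else np.zeros(0, dtype=np.float32)
```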

+ # Main function to process the image and generate audio
  def Mustalhim(image):
+     caption = Image_Caption(image)
+     story = Generate_story(caption)
+     audio = Generate_audio(story)
+     return audio

+ # Gradio interface
  def gradio_interface(image):
      audio_waveform, sampling_rate = Mustalhim(image)
      audio_file = "output_audio.wav"
      sf.write(audio_file, audio_waveform, sampling_rate)
      return audio_file
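
Writing `output_audio.wav` to disk works, but `gr.Audio` can also consume a `(sample_rate, numpy_array)` tuple directly when the component is created with `type="numpy"`, which skips the intermediate file; a sketch of that variant:

```python
def gradio_interface(image):
    audio_waveform, sampling_rate = Mustalhim(image)
    return sampling_rate, np.asarray(audio_waveform)

# paired with outputs=gr.Audio(type="numpy") in the gr.Interface below
```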

+ # Path to the example image
  example_image = "Example.PNG"

+ # Create the Gradio app
  app = gr.Interface(
+     fn=gradio_interface,
+     inputs=gr.Image(type="pil"),
+     outputs=gr.Audio(type="filepath"),
      title="Image to Audio Story",
      description="Upload an image, and the app will generate a story and convert it to audio.",
      examples=[[example_image]]
  )

  # Launch the app
+ app.launch()