"""Image -> story -> audio Gradio app.

Captions an uploaded image with BLIP, expands the caption into a short
story with ALLaM-7B-Instruct, narrates the story with Kokoro TTS, and
serves the whole flow through a Gradio interface.
"""

import gradio as gr  # BUG FIX: `gr` was used below but never imported
import numpy as np
import soundfile as sf
import torch  # NOTE(review): unused beyond transitive deps; kept to preserve the file's imports
from IPython.display import display, Audio  # NOTE(review): unused here (notebook leftover); kept deliberately
from kokoro import KPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# NOTE(review): `device` is declared but the model is never moved to it;
# `from_pretrained` decides placement. TODO confirm intended device handling.
device = "cpu"

model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"

# Story-generation LLM. trust_remote_code is required by this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Greedy decoding (do_sample=False), up to 500 new tokens, completion text only.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)

# Image-captioning model. (FIX: the original also created an unused module-level
# `KPipeline(..., model=False)` that shadowed the transformers `pipeline` name;
# it loaded no model and was never used, so it has been removed.)
captionImage = pipeline(
    "image-to-text", model="Salesforce/blip-image-captioning-large"
)


def Generate_audio(text, voice='bm_lewis', speed=1):
    """Synthesize `text` with Kokoro TTS.

    Parameters:
        text: the text to narrate.
        voice: Kokoro voice id.
        speed: playback speed multiplier.

    Returns:
        (waveform, sample_rate): 1-D numpy float array and 24000 Hz.
    """
    tts = KPipeline(lang_code='b')
    chunks = tts(text, voice=voice, speed=speed, split_pattern=r'\n+')
    full_audio = []
    # Kokoro yields (graphemes, phonemes, audio) per text chunk; keep audio only.
    for _, _, audio in chunks:
        full_audio.extend(audio)
    return np.array(full_audio), 24000


def Image_Caption(image):
    """Return the BLIP-generated caption string for a PIL image."""
    result = captionImage(image)
    return result[0]['generated_text']


def Generate_story(textAbout):
    """Generate a ~3-minute story about `textAbout` with the LLM.

    Returns the story as a single line of text (newlines flattened).
    """
    # BUG FIX: the original had a trailing comma, producing a 1-tuple
    # containing a dict instead of the chat-message list the pipeline expects.
    storyAbout = [
        {
            "role": "user",
            "content": f'write a long story about {textAbout} that takes 3 min to read',
        }
    ]
    story = generator(storyAbout)[0]['generated_text']
    # Flatten newlines for TTS; drop BLIP's spurious "arafed" caption artifact.
    return story.replace('\n', ' ').replace('arafed', ' ')


def Mustalhim(image):
    """Full pipeline: image -> caption -> story -> (waveform, sample_rate)."""
    caption = Image_Caption(image)
    story = Generate_story(caption)
    return Generate_audio(story)


def gradio_interface(image):
    """Gradio callback: run the pipeline and return a WAV file path."""
    audio_waveform, sampling_rate = Mustalhim(image)
    audio_file = "output_audio.wav"
    sf.write(audio_file, audio_waveform, sampling_rate)
    return audio_file


example_image = "Example.PNG"

app = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),
    outputs=gr.Audio(type="filepath"),
    title="Image to Audio Story",
    description="Upload an image, and the app will generate a story and convert it to audio.",
    examples=[[example_image]],
)

# Launch the app
app.launch()