isom-5240-project

Sleeping

App Files Community

LTH001 committed on May 2

Commit

0546ecd

verified ·

1 Parent(s): ffd9308

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -63

app.py CHANGED Viewed

@@ -1,117 +1,70 @@
-# import part
-!pip install streamlit pyngrok soundfile  # soundfile for audio conversion
 import streamlit as st
 from transformers import pipeline
 from PIL import Image
 import io
-import numpy as np
-import soundfile as sf  # For handling audio file operations
-# function part
 def generate_image_caption(image):
-    """Generates a caption for the given image using a pre-trained model.
-    Args:
-        image: PIL Image object
-    Returns:
-        str: Generated caption text
-    """
-    # Initialize image-to-text pipeline with BLIP model
     img2caption = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
     result = img2caption(image)
     return result[0]['generated_text']
 def text2story(text):
-    """Generates a children's story from text input using story generation model.
-    Args:
-        text: Input text prompt
-    Returns:
-        str: Generated story text
-    """
-    # Craft prompt with specific requirements for children's stories
-    story_prompt = f"Create a funny 100-word story for 8-year-olds about: {text}. Include: "
-    story_prompt += "1) A silly character 2) Magical object 3) Sound effects 4) Happy ending"
-    # Initialize text generation pipeline
     pipe = pipeline("text-generation", model="pranavpsv/genre-story-generator-v2")
-    # Generate story with controlled randomness parameters
     story_text = pipe(
         story_prompt,
-        max_new_tokens=200,  # Limit story length
-        temperature=0.9,     # Control randomness (higher = more creative)
-        top_k=50             # Limit vocabulary choices
     )[0]['generated_text']
-    # Clean output by splitting at the required ending marker
     return story_text.split("Happy ending")[-1].strip()
 def story_to_speech(story_text):
-    """Converts story text to audio using text-to-speech model.
-    Args:
-        story_text: Story text to convert
-    Returns:
-        BytesIO: Audio data in WAV format
-    """
-    # Initialize Bark text-to-speech pipeline
     tts_pipe = pipeline("text-to-speech", model="suno/bark-small")
-    # Generate audio array (numpy array of sound samples)
-    audio_output = tts_pipe(story_text, max_length=400)  # Limit text length for stability
-    # Convert numpy array to playable audio bytes
     audio_bytes = io.BytesIO()
-    sf.write(
-        audio_bytes,
-        audio_output["audio"],
-        audio_output["sampling_rate"],
-        format='WAV'
-    )
-    audio_bytes.seek(0)  # Reset pointer for Streamlit audio player
     return audio_bytes
 def main():
-    """Main function for Streamlit application workflow"""
-    # Configure page header
     st.title("📖 Image Story Generator with Audio")
     st.write("Upload an image to get a magical story read aloud!")
-    # Image upload widget (supports JPG/PNG)
     uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
     if uploaded_image:
-        # Process image
-        image = Image.open(uploaded_image).convert("RGB")  # Ensure RGB format
-        st.image(image, use_column_width=True)  # Display uploaded image
-        # Image analysis section
         with st.spinner("✨ Analyzing image..."):
             caption = generate_image_caption(image)
-        # Display image understanding
         st.subheader("Image Understanding")
         st.write(caption)
-        # Story generation section
         with st.spinner("📖 Writing story..."):
             story = text2story(caption)
-        # Display generated story
         st.subheader("Magical Story")
         st.write(story)
-        # Audio generation section
         if st.button("🎧 Read Story Aloud"):
             with st.spinner("🔊 Generating audio..."):
                 try:
-                    # Convert story to audio (trim to 400 characters for model stability)
-                    audio_bytes = story_to_speech(story[:400])
-                    # Display audio player
                     st.audio(audio_bytes, format="audio/wav")
                 except Exception as e:
                     st.error(f"Error generating audio: {str(e)}")
 if __name__ == "__main__":
-    # Start the Streamlit application
     main()

+# app.py
 import streamlit as st
 from transformers import pipeline
 from PIL import Image
 import io
+from scipy.io.wavfile import write as write_wav
 def generate_image_caption(image):
+    """Generates a caption for the given image"""
     img2caption = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
     result = img2caption(image)
     return result[0]['generated_text']
 def text2story(text):
+    """Generates a children's story from text input"""
+    story_prompt = f"Create a funny 100-word story for 8-year-olds about: {text}. Include: 1) A silly character 2) Magical object 3) Sound effects 4) Happy ending"
     pipe = pipeline("text-generation", model="pranavpsv/genre-story-generator-v2")
     story_text = pipe(
         story_prompt,
+        max_new_tokens=200,
+        temperature=0.9,
+        top_k=50
     )[0]['generated_text']
     return story_text.split("Happy ending")[-1].strip()
 def story_to_speech(story_text):
+    """Converts story text to audio using TTS"""
     tts_pipe = pipeline("text-to-speech", model="suno/bark-small")
+    audio_output = tts_pipe(story_text[:400])
+    # Convert numpy array to bytes using scipy
     audio_bytes = io.BytesIO()
+    write_wav(audio_bytes, audio_output["sampling_rate"], audio_output["audio"])
+    audio_bytes.seek(0)
     return audio_bytes
 def main():
     st.title("📖 Image Story Generator with Audio")
     st.write("Upload an image to get a magical story read aloud!")
     uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
     if uploaded_image:
+        image = Image.open(uploaded_image).convert("RGB")
+        st.image(image, use_column_width=True)
         with st.spinner("✨ Analyzing image..."):
             caption = generate_image_caption(image)
         st.subheader("Image Understanding")
         st.write(caption)
         with st.spinner("📖 Writing story..."):
             story = text2story(caption)
         st.subheader("Magical Story")
         st.write(story)
         if st.button("🎧 Read Story Aloud"):
             with st.spinner("🔊 Generating audio..."):
                 try:
+                    audio_bytes = story_to_speech(story)
                     st.audio(audio_bytes, format="audio/wav")
                 except Exception as e:
                     st.error(f"Error generating audio: {str(e)}")
 if __name__ == "__main__":
     main()