File size: 10,795 Bytes
ac70fac
 
 
 
70fa8c1
 
 
243f6f7
70fa8c1
 
 
 
 
ac70fac
 
 
 
70fa8c1
 
 
4204a24
70fa8c1
 
 
5bd24ba
70fa8c1
 
 
 
 
 
ac70fac
 
 
 
70fa8c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243f6f7
70fa8c1
5bd24ba
 
 
 
 
 
70fa8c1
 
5bd24ba
70fa8c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac70fac
 
 
 
 
 
5227f85
ac70fac
 
70fa8c1
ac70fac
 
1ccbf4f
ac70fac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eeb972c
 
 
 
 
 
 
 
ac70fac
 
 
 
 
 
 
 
70fa8c1
ac70fac
 
 
1ccbf4f
70fa8c1
ac70fac
 
 
5227f85
 
 
 
 
 
 
ac70fac
 
70fa8c1
ac70fac
 
70fa8c1
ac70fac
 
 
 
 
 
 
 
 
 
 
 
 
eeb972c
ac70fac
 
 
 
 
5227f85
ac70fac
 
 
 
 
5227f85
 
 
 
 
 
 
 
 
ac70fac
5227f85
 
eeb972c
 
 
 
 
 
 
 
 
 
5227f85
 
 
c1d7ca6
5227f85
 
 
c1d7ca6
5227f85
 
 
c1d7ca6
5227f85
 
ac70fac
 
5227f85
 
ac70fac
5227f85
 
eeb972c
5227f85
 
 
 
 
 
 
 
 
 
ac70fac
5227f85
 
 
 
 
 
 
 
ac70fac
 
 
 
 
 
 
5227f85
eeb972c
 
ac70fac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# ======================================
# Package Import
# ======================================

import html
import time
from typing import Tuple

import soundfile as sf
import streamlit as st
import torch
from datasets import load_dataset
from PIL import Image
from transformers import pipeline

# ======================================
# Basic Initialization
# ======================================

# Streamlit re-runs this script from the top on every widget interaction.
# Each loader below is wrapped in st.cache_resource so the object it builds is
# created once and then reused on later reruns instead of being rebuilt.
# show_spinner=False stops the cache decorator from drawing a spinner, so no
# UI element is emitted before st.set_page_config() is called further down.


@st.cache_resource(show_spinner=False)
def _load_image_caption_pipeline():
    """Build the image-to-text pipeline used to caption uploaded images."""
    return pipeline(
        task="image-to-text",
        model="cnmoro/tiny-image-captioning",
    )


@st.cache_resource(show_spinner=False)
def _load_text_generation_pipeline():
    """Build the text-generation pipeline used to write the story."""
    return pipeline(
        "text-generation",
        model="Qwen/Qwen1.5-0.5B",
        max_new_tokens=100,
    )


@st.cache_resource(show_spinner=False)
def _load_speech_pipeline():
    """Build the text-to-speech pipeline used to narrate the story."""
    return pipeline("text-to-speech", model="microsoft/speecht5_tts")


@st.cache_resource(show_spinner=False)
def _load_embeddings_dataset():
    """Load the validation split of the speaker x-vector dataset."""
    return load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")


# Image captioning pipeline
_image_caption_pipeline = _load_image_caption_pipeline()

# Story text-generation pipeline
_text_generation_pipeline = _load_text_generation_pipeline()

# TTS components: pipeline, x-vector dataset, and the default speaker embedding
_SPEECH_PIPELINE = _load_speech_pipeline()
_EMBEDDINGS_DATASET = _load_embeddings_dataset()
# Row 7306 supplies the speaker profile; unsqueeze(0) adds a leading dimension
# of size 1 before the tensor is handed to the TTS pipeline.
_DEFAULT_SPEAKER_EMBEDDING = torch.tensor(
    _EMBEDDINGS_DATASET[7306]["xvector"]
).unsqueeze(0)

# ======================================
# Function settings
# ======================================

def generate_image_caption(input_image):
    """
    Describe an image in words using the module's captioning pipeline.

    Args:
        input_image (Union[PIL.Image.Image, str]): The picture to caption,
            given either as a PIL Image object or as a filesystem path
            pointing to an image file.

    Returns:
        str: The ``generated_text`` field of the first result returned by
        the pipeline.

    Example:
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> caption = generate_image_caption(img)
        >>> print(f"Caption: {caption}")
    """
    # Run inference and read the caption straight off the first result entry.
    return _image_caption_pipeline(input_image)[0]['generated_text']

def generate_story_content(system_prompt: str, user_prompt: str) -> str:
    """
    Ask the text-generation pipeline for a story built from two prompts.

    Args:
        system_prompt: Sent as the ``system`` message (the assistant's role
            and writing constraints).
        user_prompt: Sent as the ``user`` message (the scenario the story
            should be about).

    Returns:
        The ``content`` of the message at index 2 of the conversation the
        pipeline returns, i.e. the entry that follows the two messages sent.

    Raises:
        RuntimeError: If the pipeline call or the result extraction fails;
            the original exception is chained as the cause.

    Example:
        >>> story = generate_story_content(
        ...     "You are a helpful children's author...",
        ...     "Kids playing with dogs in a sunny meadow..."
        ... )
    """
    # Chat-style input: one system message followed by one user message.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    try:
        generation = _text_generation_pipeline(messages)
        # Index 2 comes right after the system (0) and user (1) messages.
        return generation[0]["generated_text"][2]["content"]
    except Exception as error:
        raise RuntimeError(f"Story generation failed: {error!s}") from error

def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
    """
    Synthesize speech for a story and write it to an audio file.

    Args:
        story_text: The text to be spoken; must be a non-blank string.
        output_path: Destination of the audio file (default: 'output.wav').

    Returns:
        The same ``output_path`` that was passed in, once the file is written.

    Raises:
        ValueError: If ``story_text`` is not a string or is blank.
        RuntimeError: If synthesis or writing the file fails; the original
            exception is chained as the cause.

    Example:
        >>> generate_audio_from_story("Children playing in the park", "story_audio.wav")
        'story_audio.wav'
    """
    # Guard clause: reject anything that is not a string with visible content.
    text_is_usable = isinstance(story_text, str) and bool(story_text.strip())
    if not text_is_usable:
        raise ValueError("Input story text must be a non-empty string")

    try:
        # Synthesize with the module's default speaker embedding.
        synthesis = _SPEECH_PIPELINE(
            story_text,
            forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING},
        )
        waveform = synthesis["audio"]
        sample_rate = synthesis["sampling_rate"]

        # Persist the waveform at the rate reported by the pipeline.
        sf.write(output_path, waveform, samplerate=sample_rate)
    except Exception as error:
        raise RuntimeError(f"Audio synthesis failed: {error!s}") from error

    return output_path


# ======================================
# Page Configuration & Custom Styling
# ======================================
# Configure the browser tab and page layout. This is the first Streamlit
# command issued by the script.
st.set_page_config(
    page_title="Magic Story Generator",
    # The icon literal had been corrupted into mis-decoded bytes; restored to
    # the single fairy emoji it encodes.
    page_icon="🧚",
    layout="centered",
    initial_sidebar_state="collapsed"
)

# Custom CSS styling for child-friendly interface.
# The .story-container and .image-caption classes are applied by the
# st.markdown() calls in the main flow below.
# NOTE(review): no element in the visible script is given the .prompt-btn
# class - confirm whether that rule is still needed.
st.markdown("""
<style>   
    /* Prompt buttons styling */
    .prompt-btn {
        background: #4CAF50 !important;
        border-radius: 15px !important;
        padding: 15px 30px !important;
        font-size: 1.1rem !important;
        margin: 10px;
    }
    
    /* Story container styling */
    .story-container {
        background: #FFF3E0;
        border-radius: 20px;
        padding: 25px;
        margin: 20px 0;
        box-shadow: 0 4px 8px rgba(0,0,0,0.1);
    }
    /* Image caption styling */
    .image-caption {
        border-left: 4px solid #4CAF50;
        padding-left: 1rem;
        font-size: 1.1rem;
        color: #2c3e50;
        margin: 1.5rem 0;
    }
    
    /* Progress spinner styling */
    .stSpinner > div {
        font-size: 1.2rem !important;
        color: #9C27B0 !important;
    }
</style>
""", unsafe_allow_html=True)

# ======================================
# Main Application Interface
# ======================================
# Main page flow: upload a picture, pick a story style, confirm, then run
# caption -> story -> narration and render each result.
# The emoji in the user-facing strings below had been corrupted into
# mis-decoded bytes; they are restored to the characters they encode.
st.title("🧚 Welcome to Magic Story Maker!")

# File upload section
with st.container():
    st.subheader("Step 1: Upload Your Picture")
    uploaded_image = st.file_uploader(
        "Choose an image...",
        type=["png", "jpg", "jpeg"],
        label_visibility="collapsed",
    )

# Initialize session state for confirmation status
if 'confirmed' not in st.session_state:
    st.session_state.confirmed = False

# Prompt templates, keyed by the value the style buttons store in
# st.session_state.selected_prompt.
PROMPT_TEMPLATES = {
    "educational": {
        "system": "You are a children's educator. Create a simple 100-word story that teaches basic life skills or moral lessons.",
        "icon": "📚",
    },
    "adventure": {
        "system": "You are a fantasy writer. Create a 100-word magical adventure story suitable for children.",
        "icon": "🌠",
    },
    "animal": {
        "system": "You are an animal expert. Create a 100-word story about friendly animals learning together.",
        "icon": "🐻",
    },
}

# Main processing flow
if uploaded_image is not None:
    # Display uploaded image
    with st.spinner("✨ Magical image processing..."):
        image = Image.open(uploaded_image)
        # NOTE(review): newer Streamlit releases deprecate use_column_width in
        # favour of use_container_width - confirm against the pinned version
        # before switching.
        st.image(image, caption="Your Magical Image", use_column_width=True)

    # Prompt selection section
    with st.container():
        st.subheader("Step 2: Choose Story Style")

        # Three columns, one style button each. Picking a style clears the
        # confirmation so nothing is generated until Step 3 is clicked again.
        col1, col2, col3 = st.columns(3)
        with col1:
            if st.button("📚 Learning Story",
                         help="Generate educational story with life lessons",
                         key="edu_btn"):
                st.session_state.selected_prompt = "educational"
                st.session_state.confirmed = False
        with col2:
            if st.button("🌠 Fantasy Adventure",
                         help="Create magical adventure story",
                         key="fantasy_btn"):
                st.session_state.selected_prompt = "adventure"
                st.session_state.confirmed = False
        with col3:
            if st.button("🐻 Animal Friends",
                         help="Make story about friendly animals",
                         key="animal_btn"):
                st.session_state.selected_prompt = "animal"
                st.session_state.confirmed = False

    # Add confirmation button
    with st.container():
        st.subheader("Step 3: Confirm Selection")
        if st.button("🔮 Start Magic Creation!",
                     help="Click to generate story after choosing style",
                     type="primary"):
            st.session_state.confirmed = True

    # Only show generation when confirmed
    if st.session_state.get('confirmed', False):
        # Generate image caption with loading state
        with st.spinner("🔍 Analyzing image and generating description..."):
            image_caption = generate_image_caption(image)

            # The caption is model output rendered as raw HTML, so escape it
            # to keep any markup characters from being interpreted.
            st.subheader("📝 Image Understanding")
            st.markdown(
                f'<div class="story-container image-caption">{html.escape(image_caption)}</div>',
                unsafe_allow_html=True,
            )
            st.write("")  # Add spacing

        # Fall back to the educational template when no style was picked or
        # the stored value is not a known key.
        selected_prompt = st.session_state.get("selected_prompt", "educational")
        selected_template = PROMPT_TEMPLATES.get(
            selected_prompt, PROMPT_TEMPLATES["educational"]
        )

        # Story generation section
        story_text = None
        with st.spinner(f"{selected_template['icon']} Creating your story..."):
            try:
                # Generate story content using the caption
                story_text = generate_story_content(
                    system_prompt=selected_template["system"],
                    user_prompt=image_caption,
                )
            except RuntimeError as error:
                # Show a readable message instead of a raw traceback.
                st.error(f"Sorry, the story could not be created: {error}")
            else:
                # Story text is model output too; escape before embedding.
                st.subheader("✨ Your Magical Story")
                st.markdown(
                    f'<div class="story-container">{html.escape(story_text)}</div>',
                    unsafe_allow_html=True,
                )

        # Audio generation section (skipped when no story was produced)
        if story_text is not None:
            with st.spinner("🔮 Preparing story narration..."):
                try:
                    audio_file = generate_audio_from_story(story_text, "story_audio.wav")
                except (ValueError, RuntimeError) as error:
                    # ValueError covers an empty story; RuntimeError covers
                    # synthesis or file-write failures.
                    st.error(f"Sorry, the narration could not be created: {error}")
                else:
                    st.subheader("🎧 Listen to Your Story")
                    st.audio(audio_file)
    else:
        # Show waiting message
        st.info("ℹ️ Please select a story style and click the confirmation button to continue")

# Help section
st.markdown("---")
st.subheader("🌟 How to Use:")
st.info("""
1. Upload any picture (animals, nature, or people work best!)
2. Choose your favorite story style
3. Click the confirmation button
4. Wait for image analysis to complete
5. Enjoy your personalized story and audio!
""")