import streamlit as st
from PIL import Image
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import soundfile as sf
import torch

# Initialize image captioning pipeline with pretrained model
# Model source: Hugging Face Model Hub
_image_caption_pipeline = pipeline(
    task="image-to-text",
    model="noamrot/FuseCap_Image_Captioning"
)

# Global model configuration constants
_MODEL_NAME = "Qwen/Qwen3-1.7B"
_THINKING_TOKEN_ID = 151668  # Token id of Qwen3's "</think>" marker separating thinking from final content

# Initialize model components once
_tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
_model = AutoModelForCausalLM.from_pretrained(
    _MODEL_NAME,
    torch_dtype="auto",
    device_map="auto"
)

# Initialize TTS components once to avoid reloading
_SPEECH_PIPELINE = pipeline("text-to-speech", model="microsoft/speecht5_tts")
_EMBEDDINGS_DATASET = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
_DEFAULT_SPEAKER_EMBEDDING = torch.tensor(_EMBEDDINGS_DATASET[7306]["xvector"]).unsqueeze(0)
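
# Note: Streamlit re-executes this script from top to bottom on every user
# interaction, so the module-level model loads above are repeated on each rerun.
# A minimal sketch of the usual mitigation (assuming a Streamlit version that
# provides st.cache_resource; the function name below is illustrative only):
#
#     @st.cache_resource
#     def _load_caption_pipeline():
#         return pipeline(task="image-to-text", model="noamrot/FuseCap_Image_Captioning")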

def generate_image_caption(input_image):
    """
    Generate a textual description for an input image using a pretrained model.
    
    Args:
        input_image (Union[PIL.Image.Image, str]): Image to process. Can be either:
            - A PIL Image object
            - A string containing a filesystem path to an image file
    
    Returns:
        str: Generated caption text in natural language
        
    Example:
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> caption = generate_image_caption(img)
        >>> print(f"Caption: {caption}")
    """
    # Process image through the captioning pipeline
    inference_results = _image_caption_pipeline(input_image)
    
    # Extract text from the first (and only) result dictionary
    caption_text = inference_results[0]['generated_text']
    
    return caption_text

def generate_story_content(system_prompt: str, user_prompt: str) -> str:
    """
    Generates a children's story based on provided system and user prompts.
    
    Args:
        system_prompt: Defines the assistant's role and writing constraints
        user_prompt: Describes the story scenario and specific elements to include
        
    Returns:
        Generated story text without any thinking process metadata
        
    Raises:
        RuntimeError: If text generation fails at any stage
    
    Example:
        >>> story = generate_story_content(
        ...     "You are a helpful children's author...",
        ...     "Kids playing with dogs in a sunny meadow..."
        ... )
    """
    try:
        # Prepare chat message structure
        conversation_history = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        
        # Format input using model-specific template
        formatted_input = _tokenizer.apply_chat_template(
            conversation_history,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False
        )
        
        # Tokenize and prepare model inputs
        model_inputs = _tokenizer(
            [formatted_input], 
            return_tensors="pt"
        ).to(_model.device)
        
        # Generate text completion
        generated_sequences = _model.generate(
            **model_inputs,
            max_new_tokens=1000
        )
        
        # Process and clean output
        return _process_generated_output(
            generated_sequences, 
            model_inputs.input_ids
        )
        
    except Exception as error:
        raise RuntimeError(f"Story generation failed: {str(error)}") from error

def _process_generated_output(generated_sequences: torch.Tensor, input_ids: torch.Tensor) -> str:
    """
    Processes raw model output to extract final content.
    
    Args:
        generated_sequences: Raw output sequences from model generation
        input_ids: Original input token IDs used for generation
        
    Returns:
        Cleaned final content text
    """
    # Extract new tokens excluding original prompt
    new_tokens = generated_sequences[0][len(input_ids[0]):].tolist()
    
    # Find separation point between thinking and final content
    separation_index = _find_thinking_separation(new_tokens)
    
    # Decode and clean final content
    return _tokenizer.decode(
        new_tokens[separation_index:],
        skip_special_tokens=True
    ).strip("\n")

def _find_thinking_separation(token_sequence: list) -> int:
    """
    Locates the boundary between thinking process and final content.
    
    Args:
        token_sequence: List of generated token IDs
        
    Returns:
        Index position marking the start of final content
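
    Example:
        >>> _find_thinking_separation([101, 102, 151668, 103, 104])
        3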
    """
    try:
        # Search from end for separation token
        reverse_position = token_sequence[::-1].index(_THINKING_TOKEN_ID)
        return len(token_sequence) - reverse_position
    except ValueError:
        return 0  # Return start if token not found

def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
    """
    Convert text story to speech audio file using text-to-speech synthesis.
    
    Args:
        story_text: Input story text to synthesize
        output_path: Path to save generated audio (default: 'output.wav')
        
    Returns:
        Path to generated audio file
        
    Raises:
        ValueError: For empty/invalid input text
        RuntimeError: If audio generation fails
        
    Example:
        >>> generate_audio_from_story("Children playing in the park", "story_audio.wav")
        'story_audio.wav'
    """
    # Validate input text
    if not isinstance(story_text, str) or not story_text.strip():
        raise ValueError("Input story text must be a non-empty string")
    
    try:
        # Generate speech with default speaker profile
        speech_output = _SPEECH_PIPELINE(
            story_text,
            forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING}
        )
        
        # Save audio to WAV file
        sf.write(
            output_path,
            speech_output["audio"],
            samplerate=speech_output["sampling_rate"]
        )
        
        return output_path
        
    except Exception as error:
        raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error


# App title
st.title("Best Story Teller")

# Write some text
st.write("Upload a picture and start your journey of creativeness and imagination")

# File uploaders for image and audio (the uploaded audio is not used further below)
uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
uploaded_audio = st.file_uploader("Upload an audio file", type=["mp3", "wav", "ogg"])

# Display image with spinner
if uploaded_image is not None:
    with st.spinner("Loading image..."):
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image", use_column_width=True)
    with st.spinner("Captioning image..."):
        caption_from_file = generate_image_caption(image)
    with st.spinner("Adding some magics and imagination..."):
        system_prompt = "You are a helpful kid story writter. You should directly generate a simple, educational and intresting story no more than 150 words."
        user_prompt = caption_from_file
        story = generate_story_content(system_prompt, user_prompt)
        st.write(story)
    with st.spinner("Finding the best voice actor"):
        generated_audio = generate_audio_from_story(story,"childrens_story.wav")
        st.audio(generated_audio)
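
# To try the app locally (assuming this file is saved as app.py and that
# streamlit, transformers, datasets, soundfile and torch are installed):
#
#     streamlit run app.py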