# ======================================
# Package Import
# ======================================
import html
import time
from typing import Tuple

import soundfile as sf
import streamlit as st
import torch
from datasets import load_dataset
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# ======================================
# Basic Initialization
# ======================================

# Global model configuration constants
_MODEL_NAME = "Qwen/Qwen3-1.7B"
_THINKING_TOKEN_ID = 151668  # Special token marking thinking/content separation

# Streamlit re-executes this script from top to bottom on every widget
# interaction. Each heavy resource is therefore created inside an
# ``st.cache_resource`` loader so it is built once per server process instead
# of once per rerun. ``show_spinner=False`` keeps the loaders from emitting UI
# elements, since they run before ``st.set_page_config`` is called below.


@st.cache_resource(show_spinner=False)
def _load_image_caption_pipeline():
    """Build the image captioning pipeline (Hugging Face Model Hub model)."""
    return pipeline(
        task="image-to-text",
        model="noamrot/FuseCap_Image_Captioning"
    )


@st.cache_resource(show_spinner=False)
def _load_story_model():
    """Load the tokenizer and causal language model named by ``_MODEL_NAME``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        _MODEL_NAME,
        torch_dtype="auto",
        device_map="auto"
    )
    return tokenizer, model


@st.cache_resource(show_spinner=False)
def _load_speech_pipeline():
    """Build the text-to-speech pipeline."""
    return pipeline("text-to-speech", model="microsoft/speecht5_tts")


@st.cache_resource(show_spinner=False)
def _load_embeddings_dataset():
    """Load the validation split of the speaker x-vector dataset."""
    return load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")


# Module-level handles used by the functions below (names unchanged).
_image_caption_pipeline = _load_image_caption_pipeline()
_tokenizer, _model = _load_story_model()
_SPEECH_PIPELINE = _load_speech_pipeline()
_EMBEDDINGS_DATASET = _load_embeddings_dataset()
_DEFAULT_SPEAKER_EMBEDDING = torch.tensor(
    _EMBEDDINGS_DATASET[7306]["xvector"]
).unsqueeze(0)


# ======================================
# Function settings
# ======================================
def generate_image_caption(input_image):
    """
    Generate a textual description for an input image using a pretrained model.

    Args:
        input_image (Union[PIL.Image.Image, str]): Image to process. Can be either:
            - A PIL Image object
            - A string containing a filesystem path to an image file

    Returns:
        str: Generated caption text in natural language

    Example:
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> caption = generate_image_caption(img)
        >>> print(f"Caption: {caption}")
    """
    # Process image through the captioning pipeline
    inference_results = _image_caption_pipeline(input_image)

    # Extract text from the first result dictionary
    caption_text = inference_results[0]['generated_text']

    return caption_text


def generate_story_content(system_prompt: str, user_prompt: str) -> str:
    """
    Generates a children's story based on provided system and user prompts.

    Args:
        system_prompt: Defines the assistant's role and writing constraints
        user_prompt: Describes the story scenario and specific elements to include

    Returns:
        Generated story text without any thinking process metadata

    Raises:
        RuntimeError: If text generation fails at any stage

    Example:
        >>> story = generate_story_content(
        ...     "You are a helpful children's author...",
        ...     "Kids playing with dogs in a sunny meadow..."
        ... )
    """
    try:
        # Prepare chat message structure
        conversation_history = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        # Format input using model-specific template
        formatted_input = _tokenizer.apply_chat_template(
            conversation_history,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False
        )

        # Tokenize and move the inputs to the model's device
        model_inputs = _tokenizer(
            [formatted_input],
            return_tensors="pt"
        ).to(_model.device)

        # Generate text completion
        generated_sequences = _model.generate(
            **model_inputs,
            max_new_tokens=1000
        )

        # Process and clean output
        return _process_generated_output(
            generated_sequences,
            model_inputs.input_ids
        )

    except Exception as error:
        raise RuntimeError(f"Story generation failed: {str(error)}") from error


def _process_generated_output(
    generated_sequences: torch.Tensor,
    input_ids: torch.Tensor,
) -> str:
    """
    Processes raw model output to extract final content.

    Args:
        generated_sequences: Raw output sequences from model generation
            (only the first sequence is used)
        input_ids: Original input token IDs used for generation

    Returns:
        Cleaned final content text
    """
    # The generated sequence starts with the prompt tokens; drop them so only
    # the newly generated tokens remain.
    new_tokens = generated_sequences[0][len(input_ids[0]):].tolist()

    # Find separation point between thinking and final content
    separation_index = _find_thinking_separation(new_tokens)

    # Decode and clean final content
    return _tokenizer.decode(
        new_tokens[separation_index:],
        skip_special_tokens=True
    ).strip("\n")


def _find_thinking_separation(token_sequence: list) -> int:
    """
    Locates the boundary between thinking process and final content.

    Args:
        token_sequence: List of generated token IDs

    Returns:
        Index just past the last occurrence of ``_THINKING_TOKEN_ID``,
        or 0 when the token does not occur
    """
    try:
        # Search from the end so the last separation token wins
        reverse_position = token_sequence[::-1].index(_THINKING_TOKEN_ID)
        return len(token_sequence) - reverse_position
    except ValueError:
        return 0  # Return start if token not found


def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
    """
    Convert text story to speech audio file using text-to-speech synthesis.

    Args:
        story_text: Input story text to synthesize
        output_path: Path to save generated audio (default: 'output.wav')

    Returns:
        Path to generated audio file

    Raises:
        ValueError: For empty/invalid input text
        RuntimeError: If audio generation fails

    Example:
        >>> generate_audio_from_story("Children playing in the park", "story_audio.wav")
        'story_audio.wav'
    """
    # Validate input text
    if not isinstance(story_text, str) or not story_text.strip():
        raise ValueError("Input story text must be a non-empty string")

    try:
        # Generate speech with default speaker profile
        speech_output = _SPEECH_PIPELINE(
            story_text,
            forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING}
        )

        # Save audio to WAV file
        sf.write(
            output_path,
            speech_output["audio"],
            samplerate=speech_output["sampling_rate"]
        )

        return output_path

    except Exception as error:
        raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error


# ======================================
# Page Configuration & Custom Styling
# ======================================
st.set_page_config(
    page_title="Magic Story Generator",
    page_icon="🧚",
    layout="centered",
    initial_sidebar_state="collapsed"
)

# Custom CSS styling for child-friendly interface
# NOTE(review): the markup string below is effectively empty - confirm
# whether a <style> block was intended here.
st.markdown(""" """, unsafe_allow_html=True)

# ======================================
# Main Application Interface
# ======================================
st.title("🧚 Welcome to Magic Story Maker!")

# File upload section
with st.container():
    st.subheader("Step 1: Upload Your Picture")
    uploaded_image = st.file_uploader(
        "Choose an image...",
        type=["png", "jpg", "jpeg"],
        label_visibility="collapsed"
    )

# Initialize session state for confirmation status
if 'confirmed' not in st.session_state:
    st.session_state.confirmed = False

# Main processing flow
if uploaded_image is not None:
    # Display uploaded image
    with st.spinner("✨ Magical image processing..."):
        image = Image.open(uploaded_image)
        st.image(image, caption="Your Magical Image", use_column_width=True)

    # Prompt selection section
    with st.container():
        st.subheader("Step 2: Choose Story Style")

        # Create three columns for prompt buttons. Picking a style resets the
        # confirmation so a story is only generated after an explicit confirm.
        col1, col2, col3 = st.columns(3)

        with col1:
            if st.button("📚 Learning Story",
                         help="Generate educational story with life lessons",
                         key="edu_btn"):
                st.session_state.selected_prompt = "educational"
                st.session_state.confirmed = False

        with col2:
            if st.button("🌠 Fantasy Adventure",
                         help="Create magical adventure story",
                         key="fantasy_btn"):
                st.session_state.selected_prompt = "adventure"
                st.session_state.confirmed = False

        with col3:
            if st.button("🐻 Animal Friends",
                         help="Make story about friendly animals",
                         key="animal_btn"):
                st.session_state.selected_prompt = "animal"
                st.session_state.confirmed = False

    # Add confirmation button
    with st.container():
        st.subheader("Step 3: Confirm Selection")
        if st.button("🔮 Start Magic Creation!",
                     help="Click to generate story after choosing style",
                     type="primary"):
            st.session_state.confirmed = True

    # Only show generation when confirmed
    if st.session_state.get('confirmed', False):
        # Generate image caption with loading state
        with st.spinner("🔍 Analyzing image and generating description..."):
            image_caption = generate_image_caption(image)

        # Display caption results. The text is model output rendered with
        # unsafe_allow_html=True, so it is HTML-escaped first.
        # NOTE(review): no wrapper element/CSS class is present in this
        # string - confirm whether styling markup was intended.
        st.subheader("📝 Image Understanding")
        st.markdown(
            f"\n{html.escape(image_caption, quote=False)}\n",
            unsafe_allow_html=True
        )

        st.write("")  # Add spacing

        # Define prompt templates
        PROMPT_TEMPLATES = {
            "educational": {
                "system": "You are a children's educator. Create a simple 100-word story that teaches basic life skills or moral lessons.",
                "icon": "📚"
            },
            "adventure": {
                "system": "You are a fantasy writer. Create a 100-word magical adventure story suitable for children.",
                "icon": "🌠"
            },
            "animal": {
                "system": "You are an animal expert. Create a 100-word story about friendly animals learning together.",
                "icon": "🐻"
            }
        }

        # Safe access with default fallback (no style button clicked yet)
        selected_prompt = st.session_state.get("selected_prompt", "educational")

        # Story generation section
        with st.spinner(f"{PROMPT_TEMPLATES[selected_prompt]['icon']} Creating your story..."):
            # Generate story content using the caption
            selected_template = PROMPT_TEMPLATES[selected_prompt]
            story_text = generate_story_content(
                system_prompt=selected_template["system"],
                user_prompt=image_caption
            )

        # Display formatted story (escaped for the same reason as the caption)
        st.subheader("✨ Your Magical Story")
        st.markdown(
            f"\n{html.escape(story_text, quote=False)}\n",
            unsafe_allow_html=True
        )

        # Audio generation section
        with st.spinner("🔮 Preparing story narration..."):
            audio_file = generate_audio_from_story(story_text, "story_audio.wav")

        st.subheader("🎧 Listen to Your Story")
        st.audio(audio_file)

    else:
        # Show waiting message
        st.info("ℹ️ Please select a story style and click the confirmation button to continue")

# Help section
st.markdown("---")
st.subheader("🌟 How to Use:")
st.info("""
1. Upload any picture (animals, nature, or people work best!)
2. Choose your favorite story style
3. Click the confirmation button
4. Wait for image analysis to complete
5. Enjoy your personalized story and audio!
""")