# ====================================== # Package Import # ====================================== import streamlit as st from PIL import Image import time from transformers import pipeline,AutoModelForCausalLM,AutoTokenizer from typing import Tuple from datasets import load_dataset import soundfile as sf import torch # ====================================== # Basic Initialization # ====================================== # Initialize image captioning pipeline with pretrained model # Model source: Hugging Face Model Hub _image_caption_pipeline = pipeline( task="image-to-text", model="noamrot/FuseCap_Image_Captioning" ) # Global model configuration constants _MODEL_NAME = "Qwen/Qwen3-1.7B" _THINKING_TOKEN_ID = 151668 # Special token marking thinking/content separation # Initialize model components once _tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME) _model = AutoModelForCausalLM.from_pretrained( _MODEL_NAME, torch_dtype="auto", device_map="auto" ) # Initialize TTS components once to avoid reloading _SPEECH_PIPELINE = pipeline("text-to-speech", model="microsoft/speecht5_tts") _EMBEDDINGS_DATASET = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") _DEFAULT_SPEAKER_EMBEDDING = torch.tensor(_EMBEDDINGS_DATASET[7306]["xvector"]).unsqueeze(0) # ====================================== # Function settings # ====================================== def generate_image_caption(input_image): """ Generate a textual description for an input image using a pretrained model. Args: input_image (Union[PIL.Image.Image, str]): Image to process. Can be either: - A PIL Image object - A string containing a filesystem path to an image file Returns: str: Generated caption text in natural language Example: >>> from PIL import Image >>> img = Image.open("photo.jpg") >>> caption = generate_image_caption(img) >>> print(f"Caption: {caption}") """ # Process image through the captioning pipeline inference_results = _image_caption_pipeline(input_image) # Extract text from the first (and only) result dictionary caption_text = inference_results[0]['generated_text'] return caption_text def generate_story_content(system_prompt: str, user_prompt: str) -> str: """ Generates a children's story based on provided system and user prompts. Args: system_prompt: Defines the assistant's role and writing constraints user_prompt: Describes the story scenario and specific elements to include Returns: Generated story text without any thinking process metadata Raises: RuntimeError: If text generation fails at any stage Example: >>> story = generate_story_content( ... "You are a helpful children's author...", ... "Kids playing with dogs in a sunny meadow..." ... ) """ try: # Prepare chat message structure conversation_history = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt} ] # Format input using model-specific template formatted_input = _tokenizer.apply_chat_template( conversation_history, tokenize=False, add_generation_prompt=True, enable_thinking=False ) # Tokenize and prepare model inputs model_inputs = _tokenizer( [formatted_input], return_tensors="pt" ).to(_model.device) # Generate text completion generated_sequences = _model.generate( **model_inputs, max_new_tokens=1000 ) # Process and clean output return _process_generated_output( generated_sequences, model_inputs.input_ids ) except Exception as error: raise RuntimeError(f"Story generation failed: {str(error)}") from error def _process_generated_output(generated_sequences: list, input_ids: list) -> str: """ Processes raw model output to extract final content. Args: generated_sequences: Raw output sequences from model generation input_ids: Original input token IDs used for generation Returns: Cleaned final content text """ # Extract new tokens excluding original prompt new_tokens = generated_sequences[0][len(input_ids[0]):].tolist() # Find separation point between thinking and final content separation_index = _find_thinking_separation(new_tokens) # Decode and clean final content return _tokenizer.decode( new_tokens[separation_index:], skip_special_tokens=True ).strip("\n") def _find_thinking_separation(token_sequence: list) -> int: """ Locates the boundary between thinking process and final content. Args: token_sequence: List of generated token IDs Returns: Index position marking the start of final content """ try: # Search from end for separation token reverse_position = token_sequence[::-1].index(_THINKING_TOKEN_ID) return len(token_sequence) - reverse_position except ValueError: return 0 # Return start if token not found def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str: """ Convert text story to speech audio file using text-to-speech synthesis. Args: story_text: Input story text to synthesize output_path: Path to save generated audio (default: 'output.wav') Returns: Path to generated audio file Raises: ValueError: For empty/invalid input text RuntimeError: If audio generation fails Example: >>> generate_audio_from_story("Children playing in the park", "story_audio.wav") 'story_audio.wav' """ # Validate input text if not isinstance(story_text, str) or not story_text.strip(): raise ValueError("Input story text must be a non-empty string") try: # Generate speech with default speaker profile speech_output = _SPEECH_PIPELINE( story_text, forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING} ) # Save audio to WAV file sf.write( output_path, speech_output["audio"], samplerate=speech_output["sampling_rate"] ) return output_path except Exception as error: raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error # ====================================== # Page Configuration & Custom Styling # ====================================== st.set_page_config( page_title="Magic Story Generator", page_icon="đ§", layout="centered", initial_sidebar_state="collapsed" ) # Custom CSS styling for child-friendly interface st.markdown(""" """, unsafe_allow_html=True) # ====================================== # Main Application Interface # ====================================== st.title("đ§ Welcome to Magic Story Maker!") # File upload section with st.container(): st.subheader("Step 1: Upload Your Picture") uploaded_image = st.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"], label_visibility="collapsed") # Initialize session state for confirmation status if 'confirmed' not in st.session_state: st.session_state.confirmed = False # Main processing flow if uploaded_image is not None: # Display uploaded image with st.spinner("⨠Magical image processing..."): image = Image.open(uploaded_image) st.image(image, caption="Your Magical Image", use_column_width=True) # Prompt selection section with st.container(): st.subheader("Step 2: Choose Story Style") # Create three columns for prompt buttons col1, col2, col3 = st.columns(3) with col1: if st.button("đ Learning Story", help="Generate educational story with life lessons", key="edu_btn"): st.session_state.selected_prompt = "educational" st.session_state.confirmed = False with col2: if st.button("đ Fantasy Adventure", help="Create magical adventure story", key="fantasy_btn"): st.session_state.selected_prompt = "adventure" st.session_state.confirmed = False with col3: if st.button("đģ Animal Friends", help="Make story about friendly animals", key="animal_btn"): st.session_state.selected_prompt = "animal" st.session_state.confirmed = False # Add confirmation button with st.container(): st.subheader("Step 3: Confirm Selection") if st.button("đŽ Start Magic Creation!", help="Click to generate story after choosing style", type="primary"): st.session_state.confirmed = True # Only show generation when confirmed if st.session_state.get('confirmed', False): # Generate image caption with loading state with st.spinner("đ Analyzing image and generating description..."): image_caption = generate_image_caption(image) # Display caption results using CSS class st.subheader("đ Image Understanding") st.markdown(f'