import streamlit as st
from PIL import Image
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import soundfile as sf
import torch

# Initialize image captioning pipeline with pretrained model
# Model source: Hugging Face Model Hub
_image_caption_pipeline = pipeline(
    task="image-to-text",
    model="noamrot/FuseCap_Image_Captioning"
)

# Global model configuration constants
_MODEL_NAME = "Qwen/Qwen3-1.7B"
_THINKING_TOKEN_ID = 151668  # Token id of Qwen3's "</think>" marker separating thinking from final content

# Initialize model components once
_tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
_model = AutoModelForCausalLM.from_pretrained(
    _MODEL_NAME,
    torch_dtype="auto",
    device_map="auto"
)

# Initialize TTS components once to avoid reloading
_SPEECH_PIPELINE = pipeline("text-to-speech", model="microsoft/speecht5_tts")
_EMBEDDINGS_DATASET = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
_DEFAULT_SPEAKER_EMBEDDING = torch.tensor(_EMBEDDINGS_DATASET[7306]["xvector"]).unsqueeze(0)
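
# Note: Streamlit re-executes this script from top to bottom on every user
# interaction, so the module-level model loads above are repeated on each rerun.
# A minimal sketch of the usual mitigation (assuming a Streamlit version that
# provides st.cache_resource; the function name below is illustrative only):
#
#     @st.cache_resource
#     def _load_caption_pipeline():
#         return pipeline(task="image-to-text", model="noamrot/FuseCap_Image_Captioning")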

def generate_image_caption(input_image):
    """
    Generate a textual description for an input image using a pretrained model.
    
    Args:
        input_image (Union[PIL.Image.Image, str]): Image to process. Can be either:
            - A PIL Image object
            - A string containing a filesystem path to an image file
    
    Returns:
        str: Generated caption text in natural language
        
    Example:
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> caption = generate_image_caption(img)
        >>> print(f"Caption: {caption}")
    """
    # Process image through the captioning pipeline
    inference_results = _image_caption_pipeline(input_image)
    
    # Extract text from the first (and only) result dictionary
    caption_text = inference_results[0]['generated_text']
    
    return caption_text

def generate_story_content(system_prompt: str, user_prompt: str) -> str:
    """
    Generates a children's story based on provided system and user prompts.
    
    Args:
        system_prompt: Defines the assistant's role and writing constraints
        user_prompt: Describes the story scenario and specific elements to include
        
    Returns:
        Generated story text without any thinking process metadata
        
    Raises:
        RuntimeError: If text generation fails at any stage
    
    Example:
        >>> story = generate_story_content(
        ...     "You are a helpful children's author...",
        ...     "Kids playing with dogs in a sunny meadow..."
        ... )
    """
    try:
        # Prepare chat message structure
        conversation_history = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        
        # Format input using model-specific template
        formatted_input = _tokenizer.apply_chat_template(
            conversation_history,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False
        )
        
        # Tokenize and prepare model inputs
        model_inputs = _tokenizer(
            [formatted_input], 
            return_tensors="pt"
        ).to(_model.device)
        
        # Generate text completion
        generated_sequences = _model.generate(
            **model_inputs,
            max_new_tokens=1000
        )
        
        # Process and clean output
        return _process_generated_output(
            generated_sequences, 
            model_inputs.input_ids
        )
        
    except Exception as error:
        raise RuntimeError(f"Story generation failed: {str(error)}") from error

def _process_generated_output(generated_sequences: torch.Tensor, input_ids: torch.Tensor) -> str:
    """
    Processes raw model output to extract final content.
    
    Args:
        generated_sequences: Raw output sequences from model generation
        input_ids: Original input token IDs used for generation
        
    Returns:
        Cleaned final content text
    """
    # Extract new tokens excluding original prompt
    new_tokens = generated_sequences[0][len(input_ids[0]):].tolist()
    
    # Find separation point between thinking and final content
    separation_index = _find_thinking_separation(new_tokens)
    
    # Decode and clean final content
    return _tokenizer.decode(
        new_tokens[separation_index:],
        skip_special_tokens=True
    ).strip("\n")

def _find_thinking_separation(token_sequence: list) -> int:
    """
    Locates the boundary between thinking process and final content.
    
    Args:
        token_sequence: List of generated token IDs
        
    Returns:
        Index position marking the start of final content
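
    Example:
        >>> _find_thinking_separation([101, 102, 151668, 103, 104])
        3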
    """
    try:
        # Search from end for separation token
        reverse_position = token_sequence[::-1].index(_THINKING_TOKEN_ID)
        return len(token_sequence) - reverse_position
    except ValueError:
        return 0  # Return start if token not found

def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
    """
    Convert text story to speech audio file using text-to-speech synthesis.
    
    Args:
        story_text: Input story text to synthesize
        output_path: Path to save generated audio (default: 'output.wav')
        
    Returns:
        Path to generated audio file
        
    Raises:
        ValueError: For empty/invalid input text
        RuntimeError: If audio generation fails
        
    Example:
        >>> generate_audio_from_story("Children playing in the park", "story_audio.wav")
        'story_audio.wav'
    """
    # Validate input text
    if not isinstance(story_text, str) or not story_text.strip():
        raise ValueError("Input story text must be a non-empty string")
    
    try:
        # Generate speech with default speaker profile
        speech_output = _SPEECH_PIPELINE(
            story_text,
            forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING}
        )
        
        # Save audio to WAV file
        sf.write(
            output_path,
            speech_output["audio"],
            samplerate=speech_output["sampling_rate"]
        )
        
        return output_path
        
    except Exception as error:
        raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error


# App title
st.title("Best Story Teller")

# Write some text
st.write("Upload a picture and start your journey of creativeness and imagination")

# File uploaders for image and audio (the uploaded audio is not used further below)
uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
uploaded_audio = st.file_uploader("Upload an audio file", type=["mp3", "wav", "ogg"])

# Display image with spinner
if uploaded_image is not None:
    with st.spinner("Loading image..."):
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image", use_column_width=True)
    with st.spinner("Captioning image..."):
        caption_from_file = generate_image_caption(image)
    with st.spinner("Adding some magics and imagination..."):
        system_prompt = "You are a helpful kid story writter. You should directly generate a simple, educational and intresting story no more than 150 words."
        user_prompt = caption_from_file
        story = generate_story_content(system_prompt, user_prompt)
        st.write(story)
    with st.spinner("Finding the best voice actor"):
        generated_audio = generate_audio_from_story(story,"childrens_story.wav")
        st.audio(generated_audio)
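
# To try the app locally (assuming this file is saved as app.py and that
# streamlit, transformers, datasets, soundfile and torch are installed):
#
#     streamlit run app.py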