Spaces:

ccclllwww
/

Assignment_V1

Sleeping

App Files Files Community

ccclllwww committed on Apr 30

Commit

70fa8c1

verified ·

1 Parent(s): 40ae61d

Create app.py

Browse files

Files changed (1) hide show

app.py +223 -0

app.py ADDED Viewed

	@@ -0,0 +1,223 @@

+import streamlit as st
+from PIL import Image
+import time
+from transformers import pipeline,AutoModelForCausalLM,AutoTokenizer
+from typing import Tuple
+from datasets import load_dataset
+import soundfile as sf
+import torch
+# Initialize image captioning pipeline with pretrained model
+# Model source: Hugging Face Model Hub
+_image_caption_pipeline = pipeline(
+    task="image-to-text",
+    model="noamrot/FuseCap_Image_Captioning"
+)
+# Global model configuration constants
+_MODEL_NAME = "Qwen/Qwen3-1.7B"
+_THINKING_TOKEN_ID = 151668  # Special token marking thinking/content separation
+# Initialize model components once
+_tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
+_model = AutoModelForCausalLM.from_pretrained(
+    _MODEL_NAME,
+    torch_dtype="auto",
+    device_map="auto"
+)
+# Initialize TTS components once to avoid reloading
+_SPEECH_PIPELINE = pipeline("text-to-speech", model="microsoft/speecht5_tts")
+_EMBEDDINGS_DATASET = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+_DEFAULT_SPEAKER_EMBEDDING = torch.tensor(_EMBEDDINGS_DATASET[7306]["xvector"]).unsqueeze(0)
+def generate_image_caption(input_image):
+    """
+    Generate a textual description for an input image using a pretrained model.
+    Args:
+        input_image (Union[PIL.Image.Image, str]): Image to process. Can be either:
+            - A PIL Image object
+            - A string containing a filesystem path to an image file
+    Returns:
+        str: Generated caption text in natural language
+    Example:
+        >>> from PIL import Image
+        >>> img = Image.open("photo.jpg")
+        >>> caption = generate_image_caption(img)
+        >>> print(f"Caption: {caption}")
+    """
+    # Process image through the captioning pipeline
+    inference_results = _image_caption_pipeline(input_image)
+    # Extract text from the first (and only) result dictionary
+    caption_text = inference_results[0]['generated_text']
+    return caption_text
+def generate_story_content(system_prompt: str, user_prompt: str) -> str:
+    """
+    Generates a children's story based on provided system and user prompts.
+    Args:
+        system_prompt: Defines the assistant's role and writing constraints
+        user_prompt: Describes the story scenario and specific elements to include
+    Returns:
+        Generated story text without any thinking process metadata
+    Raises:
+        RuntimeError: If text generation fails at any stage
+    Example:
+        >>> story = generate_story_content(
+        ...     "You are a helpful children's author...",
+        ...     "Kids playing with dogs in a sunny meadow..."
+        ... )
+    """
+    try:
+        # Prepare chat message structure
+        conversation_history = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt}
+        ]
+        # Format input using model-specific template
+        formatted_input = _tokenizer.apply_chat_template(
+            conversation_history,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False
+        )
+        # Tokenize and prepare model inputs
+        model_inputs = _tokenizer(
+            [formatted_input],
+            return_tensors="pt"
+        ).to(_model.device)
+        # Generate text completion
+        generated_sequences = _model.generate(
+            **model_inputs,
+            max_new_tokens=1000
+        )
+        # Process and clean output
+        return _process_generated_output(
+            generated_sequences,
+            model_inputs.input_ids
+        )
+    except Exception as error:
+        raise RuntimeError(f"Story generation failed: {str(error)}") from error
+def _process_generated_output(generated_sequences: list, input_ids: list) -> str:
+    """
+    Processes raw model output to extract final content.
+    Args:
+        generated_sequences: Raw output sequences from model generation
+        input_ids: Original input token IDs used for generation
+    Returns:
+        Cleaned final content text
+    """
+    # Extract new tokens excluding original prompt
+    new_tokens = generated_sequences[0][len(input_ids[0]):].tolist()
+    # Find separation point between thinking and final content
+    separation_index = _find_thinking_separation(new_tokens)
+    # Decode and clean final content
+    return _tokenizer.decode(
+        new_tokens[separation_index:],
+        skip_special_tokens=True
+    ).strip("\n")
+def _find_thinking_separation(token_sequence: list) -> int:
+    """
+    Locates the boundary between thinking process and final content.
+    Args:
+        token_sequence: List of generated token IDs
+    Returns:
+        Index position marking the start of final content
+    """
+    try:
+        # Search from end for separation token
+        reverse_position = token_sequence[::-1].index(_THINKING_TOKEN_ID)
+        return len(token_sequence) - reverse_position
+    except ValueError:
+        return 0  # Return start if token not found
+def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
+    """
+    Convert text story to speech audio file using text-to-speech synthesis.
+    Args:
+        story_text: Input story text to synthesize
+        output_path: Path to save generated audio (default: 'output.wav')
+    Returns:
+        Path to generated audio file
+    Raises:
+        ValueError: For empty/invalid input text
+        RuntimeError: If audio generation fails
+    Example:
+        >>> generate_audio_from_story("Children playing in the park", "story_audio.wav")
+        'story_audio.wav'
+    """
+    # Validate input text
+    if not isinstance(story_text, str) or not story_text.strip():
+        raise ValueError("Input story text must be a non-empty string")
+    try:
+        # Generate speech with default speaker profile
+        speech_output = _SPEECH_PIPELINE(
+            story_text,
+            forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING}
+        )
+        # Save audio to WAV file
+        sf.write(
+            output_path,
+            speech_output["audio"],
+            samplerate=speech_output["sampling_rate"]
+        )
+        return output_path
+    except Exception as error:
+        raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error
+# App title
+st.title("Best Story Teller")
+# Write some text
+st.write("Upload a picture and start your journey of creativeness and imagination")
+# File uploader for image and audio
+uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
+uploaded_audio = st.file_uploader("Upload an audio file", type=["mp3", "wav", "ogg"])
+# Display image with spinner
+if uploaded_image is not None:
+    with st.spinner("Loading image..."):
+        image = Image.open(uploaded_image)
+        st.image(image, caption="Uploaded Image", use_column_width=True)
+    with st.spinner("Captioning image..."):
+        caption_from_file = generate_image_caption(image)
+    with st.spinner("Adding some magics and imagination..."):
+        system_prompt = "You are a helpful kid story writter. You should directly generate a simple, educational and intresting story no more than 150 words."
+        user_prompt = caption_from_file
+        story = generate_story_content(system_prompt, user_prompt)
+        st.write(story)
+    with st.spinner("Finding the best voice actor"):
+        generated_audio = generate_audio_from_story(story,"childrens_story.wav")
+        st.audio(generated_audio)