import streamlit as st
from PIL import Image
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import soundfile as sf
import torch
# Initialize the image-captioning pipeline once with a pretrained model
# Model source: Hugging Face Model Hub
_image_caption_pipeline = pipeline(
    task="image-to-text",
    model="noamrot/FuseCap_Image_Captioning"
)
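
# Note: the captioning pipeline returns a list with one dict per input, so a
# quick sanity check looks like this (the caption text itself will vary):
#
#     result = _image_caption_pipeline("photo.jpg")
#     # e.g. [{"generated_text": "a dog running through a sunny meadow"}]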

# Global model configuration constants
_MODEL_NAME = "Qwen/Qwen3-1.7B"
_THINKING_TOKEN_ID = 151668  # Token id of "</think>", which separates the thinking block from the final content

# Initialize model components once
_tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
_model = AutoModelForCausalLM.from_pretrained(
    _MODEL_NAME,
    torch_dtype="auto",
    device_map="auto"
)
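# Note: device_map="auto" requires the `accelerate` package, and
# torch_dtype="auto" loads the weights in the dtype stored in the checkpoint.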
# Initialize TTS components once to avoid reloading
_SPEECH_PIPELINE = pipeline("text-to-speech", model="microsoft/speecht5_tts")
_EMBEDDINGS_DATASET = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
_DEFAULT_SPEAKER_EMBEDDING = torch.tensor(_EMBEDDINGS_DATASET[7306]["xvector"]).unsqueeze(0)
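
# The default embedding (speaker index 7306, the voice used in the SpeechT5
# reference examples) is a (1, 512) float tensor; any other x-vector from the
# dataset can be swapped in the same way, e.g. (index chosen arbitrarily):
#
#     _ALT_SPEAKER_EMBEDDING = torch.tensor(_EMBEDDINGS_DATASET[4077]["xvector"]).unsqueeze(0)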

def generate_image_caption(input_image):
    """
    Generate a textual description for an input image using a pretrained model.

    Args:
        input_image (Union[PIL.Image.Image, str]): Image to process. Can be either:
            - A PIL Image object
            - A string containing a filesystem path to an image file

    Returns:
        str: Generated caption text in natural language

    Example:
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> caption = generate_image_caption(img)
        >>> print(f"Caption: {caption}")
    """
    # Process the image through the captioning pipeline
    inference_results = _image_caption_pipeline(input_image)

    # Extract the text from the first (and only) result dictionary
    caption_text = inference_results[0]['generated_text']

    return caption_text
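
# The underlying pipeline also accepts a filesystem path (or image URL)
# directly, so a usage sketch (the path below is a placeholder) is:
#
#     caption = generate_image_caption("photos/example.jpg")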

def generate_story_content(system_prompt: str, user_prompt: str) -> str:
    """
    Generates a children's story based on provided system and user prompts.

    Args:
        system_prompt: Defines the assistant's role and writing constraints
        user_prompt: Describes the story scenario and specific elements to include

    Returns:
        Generated story text without any thinking-process metadata

    Raises:
        RuntimeError: If text generation fails at any stage

    Example:
        >>> story = generate_story_content(
        ...     "You are a helpful children's author...",
        ...     "Kids playing with dogs in a sunny meadow..."
        ... )
    """
    try:
        # Prepare the chat message structure
        conversation_history = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        # Format the input using the model-specific chat template;
        # thinking mode is disabled so the model answers directly
        formatted_input = _tokenizer.apply_chat_template(
            conversation_history,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False
        )

        # Tokenize and move the inputs to the model's device
        model_inputs = _tokenizer(
            [formatted_input],
            return_tensors="pt"
        ).to(_model.device)

        # Generate the text completion
        generated_sequences = _model.generate(
            **model_inputs,
            max_new_tokens=1000
        )

        # Strip the prompt and any thinking tokens from the output
        return _process_generated_output(
            generated_sequences,
            model_inputs.input_ids
        )

    except Exception as error:
        raise RuntimeError(f"Story generation failed: {error}") from error
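
# Note: with enable_thinking=False the chat template steers the model to answer
# directly, so the "</think>" boundary token normally never appears in the new
# tokens and _find_thinking_separation below falls back to returning index 0.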

def _process_generated_output(generated_sequences: torch.Tensor, input_ids: torch.Tensor) -> str:
    """
    Processes raw model output to extract the final content.

    Args:
        generated_sequences: Raw output sequences from model generation
        input_ids: Original input token IDs used for generation

    Returns:
        Cleaned final content text
    """
    # Extract only the newly generated tokens, excluding the original prompt
    new_tokens = generated_sequences[0][len(input_ids[0]):].tolist()

    # Find the separation point between thinking and final content
    separation_index = _find_thinking_separation(new_tokens)

    # Decode and clean the final content
    return _tokenizer.decode(
        new_tokens[separation_index:],
        skip_special_tokens=True
    ).strip("\n")

def _find_thinking_separation(token_sequence: list) -> int:
    """
    Locates the boundary between the thinking process and the final content.

    Args:
        token_sequence: List of generated token IDs

    Returns:
        Index position marking the start of the final content
    """
    try:
        # Search from the end for the separation token
        reverse_position = token_sequence[::-1].index(_THINKING_TOKEN_ID)
        return len(token_sequence) - reverse_position
    except ValueError:
        return 0  # Token not found; treat the whole sequence as final content
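
# A toy walk-through of the boundary search (token ids are made up, except for
# _THINKING_TOKEN_ID, which stands in for "</think>"):
#
#     tokens = [11, 22, _THINKING_TOKEN_ID, 33, 44]
#     tokens[::-1].index(_THINKING_TOKEN_ID)  # -> 2 positions from the end
#     len(tokens) - 2                         # -> 3, so decoding starts at 33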

def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
    """
    Convert a text story to a speech audio file using text-to-speech synthesis.

    Args:
        story_text: Input story text to synthesize
        output_path: Path to save the generated audio (default: 'output.wav')

    Returns:
        Path to the generated audio file

    Raises:
        ValueError: For empty/invalid input text
        RuntimeError: If audio generation fails

    Example:
        >>> generate_audio_from_story("Children playing in the park", "story_audio.wav")
        'story_audio.wav'
    """
    # Validate the input text
    if not isinstance(story_text, str) or not story_text.strip():
        raise ValueError("Input story text must be a non-empty string")

    try:
        # Generate speech with the default speaker profile
        speech_output = _SPEECH_PIPELINE(
            story_text,
            forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING}
        )

        # Save the audio to a WAV file
        sf.write(
            output_path,
            speech_output["audio"],
            samplerate=speech_output["sampling_rate"]
        )
        return output_path

    except Exception as error:
        raise RuntimeError(f"Audio synthesis failed: {error}") from error
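
# Note: SpeechT5 synthesizes 16 kHz mono audio; writing the file with the
# pipeline-reported speech_output["sampling_rate"] keeps playback speed correct.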

# App title
st.title("Best Story Teller")

# Intro text
st.write("Upload a picture and start your journey of creativity and imagination")

# File uploaders for image and audio (the audio upload is not used yet)
uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
uploaded_audio = st.file_uploader("Upload an audio file", type=["mp3", "wav", "ogg"])

# Run the caption -> story -> audio pipeline once an image is uploaded
if uploaded_image is not None:
    with st.spinner("Loading image..."):
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image", use_column_width=True)
    with st.spinner("Captioning image..."):
        caption_from_file = generate_image_caption(image)
    with st.spinner("Adding some magic and imagination..."):
        system_prompt = (
            "You are a helpful children's story writer. You should directly generate "
            "a simple, educational and interesting story of no more than 150 words."
        )
        user_prompt = caption_from_file
        story = generate_story_content(system_prompt, user_prompt)
        st.write(story)
    with st.spinner("Finding the best voice actor..."):
        generated_audio = generate_audio_from_story(story, "childrens_story.wav")
        st.audio(generated_audio)
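
# To run the app locally (assuming this file is saved as app.py):
#
#     streamlit run app.py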