ccclllwww committed
Commit 70fa8c1 · verified · 1 Parent(s): 40ae61d

Create app.py

Files changed (1)
  1. app.py +223 -0
app.py ADDED
@@ -0,0 +1,223 @@
import streamlit as st
from PIL import Image
import time
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from typing import Tuple
from datasets import load_dataset
import soundfile as sf
import torch

# Initialize image captioning pipeline with pretrained model
# Model source: Hugging Face Model Hub
_image_caption_pipeline = pipeline(
    task="image-to-text",
    model="noamrot/FuseCap_Image_Captioning"
)

# Global model configuration constants
_MODEL_NAME = "Qwen/Qwen3-1.7B"
_THINKING_TOKEN_ID = 151668  # Special token marking thinking/content separation

# Initialize model components once
_tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
_model = AutoModelForCausalLM.from_pretrained(
    _MODEL_NAME,
    torch_dtype="auto",
    device_map="auto"
)

# Initialize TTS components once to avoid reloading
_SPEECH_PIPELINE = pipeline("text-to-speech", model="microsoft/speecht5_tts")
_EMBEDDINGS_DATASET = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
_DEFAULT_SPEAKER_EMBEDDING = torch.tensor(_EMBEDDINGS_DATASET[7306]["xvector"]).unsqueeze(0)
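
# NOTE: Streamlit re-executes this script on every widget interaction, so the
# module-level loads above are in fact repeated on each rerun. A minimal sketch
# of one way to load a pipeline exactly once (assuming a Streamlit version that
# provides st.cache_resource); it is not wired into the app below:
#
# @st.cache_resource
# def _load_caption_pipeline():
#     return pipeline(task="image-to-text", model="noamrot/FuseCap_Image_Captioning")
#
# _image_caption_pipeline = _load_caption_pipeline()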

def generate_image_caption(input_image):
    """
    Generate a textual description for an input image using a pretrained model.

    Args:
        input_image (Union[PIL.Image.Image, str]): Image to process. Can be either:
            - A PIL Image object
            - A string containing a filesystem path to an image file

    Returns:
        str: Generated caption text in natural language

    Example:
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> caption = generate_image_caption(img)
        >>> print(f"Caption: {caption}")
    """
    # Process image through the captioning pipeline
    inference_results = _image_caption_pipeline(input_image)

    # Extract text from the first (and only) result dictionary
    caption_text = inference_results[0]['generated_text']

    return caption_text
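
# The image-to-text pipeline also accepts an http(s) URL string in place of a
# PIL image or local path, e.g. (illustrative URL, not used by this app):
# generate_image_caption("https://example.com/photo.jpg")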

def generate_story_content(system_prompt: str, user_prompt: str) -> str:
    """
    Generates a children's story based on provided system and user prompts.

    Args:
        system_prompt: Defines the assistant's role and writing constraints
        user_prompt: Describes the story scenario and specific elements to include

    Returns:
        Generated story text without any thinking process metadata

    Raises:
        RuntimeError: If text generation fails at any stage

    Example:
        >>> story = generate_story_content(
        ...     "You are a helpful children's author...",
        ...     "Kids playing with dogs in a sunny meadow..."
        ... )
    """
    try:
        # Prepare chat message structure
        conversation_history = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        # Format input using model-specific template
        formatted_input = _tokenizer.apply_chat_template(
            conversation_history,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False
        )

        # Tokenize and prepare model inputs
        model_inputs = _tokenizer(
            [formatted_input],
            return_tensors="pt"
        ).to(_model.device)

        # Generate text completion
        generated_sequences = _model.generate(
            **model_inputs,
            max_new_tokens=1000
        )

        # Process and clean output
        return _process_generated_output(
            generated_sequences,
            model_inputs.input_ids
        )

    except Exception as error:
        raise RuntimeError(f"Story generation failed: {str(error)}") from error
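
# NOTE (assumption about Qwen3's chat template): with enable_thinking=False the
# template already closes an empty think block inside the prompt, so generated
# text normally contains no thinking tokens and the separator search below acts
# as a safety net rather than the common path.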

def _process_generated_output(generated_sequences: list, input_ids: list) -> str:
    """
    Processes raw model output to extract final content.

    Args:
        generated_sequences: Raw output sequences from model generation
        input_ids: Original input token IDs used for generation

    Returns:
        Cleaned final content text
    """
    # Extract new tokens excluding original prompt
    new_tokens = generated_sequences[0][len(input_ids[0]):].tolist()

    # Find separation point between thinking and final content
    separation_index = _find_thinking_separation(new_tokens)

    # Decode and clean final content
    return _tokenizer.decode(
        new_tokens[separation_index:],
        skip_special_tokens=True
    ).strip("\n")

def _find_thinking_separation(token_sequence: list) -> int:
    """
    Locates the boundary between thinking process and final content.

    Args:
        token_sequence: List of generated token IDs

    Returns:
        Index position marking the start of final content
    """
    try:
        # Search from end for separation token
        reverse_position = token_sequence[::-1].index(_THINKING_TOKEN_ID)
        return len(token_sequence) - reverse_position
    except ValueError:
        return 0  # Return start if token not found
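
# Worked example of the boundary arithmetic (hypothetical token IDs around the
# real separator ID, which corresponds to "</think>" in Qwen3's vocabulary):
#   _find_thinking_separation([11, 22, 151668, 33]) -> 3 (content starts at 33)
#   _find_thinking_separation([11, 22, 33])         -> 0 (no separator found)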

def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
    """
    Convert text story to speech audio file using text-to-speech synthesis.

    Args:
        story_text: Input story text to synthesize
        output_path: Path to save generated audio (default: 'output.wav')

    Returns:
        Path to generated audio file

    Raises:
        ValueError: For empty/invalid input text
        RuntimeError: If audio generation fails

    Example:
        >>> generate_audio_from_story("Children playing in the park", "story_audio.wav")
        'story_audio.wav'
    """
    # Validate input text
    if not isinstance(story_text, str) or not story_text.strip():
        raise ValueError("Input story text must be a non-empty string")

    try:
        # Generate speech with default speaker profile
        speech_output = _SPEECH_PIPELINE(
            story_text,
            forward_params={"speaker_embeddings": _DEFAULT_SPEAKER_EMBEDDING}
        )

        # Save audio to WAV file
        sf.write(
            output_path,
            speech_output["audio"],
            samplerate=speech_output["sampling_rate"]
        )

        return output_path

    except Exception as error:
        raise RuntimeError(f"Audio synthesis failed: {str(error)}") from error
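
# NOTE (assumption): SpeechT5 is trained on fairly short utterances, so very
# long inputs can be truncated or degrade in quality. A common workaround is to
# split the story into sentences, synthesize each one, and concatenate the
# resulting audio arrays before writing the WAV file.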


# App title
st.title("Best Story Teller")

# Write some text
st.write("Upload a picture and start your journey of creativity and imagination")

# File uploaders for image and audio (the audio upload is not used further below)
uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
uploaded_audio = st.file_uploader("Upload an audio file", type=["mp3", "wav", "ogg"])

# Display image with spinner
if uploaded_image is not None:
    with st.spinner("Loading image..."):
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image", use_column_width=True)
    with st.spinner("Captioning image..."):
        caption_from_file = generate_image_caption(image)
    with st.spinner("Adding some magic and imagination..."):
        system_prompt = "You are a helpful kids' story writer. You should directly generate a simple, educational and interesting story of no more than 150 words."
        user_prompt = caption_from_file
        story = generate_story_content(system_prompt, user_prompt)
        st.write(story)
    with st.spinner("Finding the best voice actor..."):
        generated_audio = generate_audio_from_story(story, "childrens_story.wav")
        st.audio(generated_audio)
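
# To try the app locally (assuming streamlit, transformers, torch, datasets,
# soundfile, and Pillow are installed): streamlit run app.py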