Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -16,23 +16,13 @@ import torch
|
|
16 |
# ======================================
|
17 |
|
18 |
# Initialize image captioning pipeline with pretrained model
|
19 |
-
# Model source: Hugging Face Model Hub
|
20 |
_image_caption_pipeline = pipeline(
|
21 |
task="image-to-text",
|
22 |
model="noamrot/FuseCap_Image_Captioning"
|
23 |
)
|
24 |
|
25 |
# Global model configuration constants
|
26 |
-
|
27 |
-
_THINKING_TOKEN_ID = 151668 # Special token marking thinking/content separation
|
28 |
-
|
29 |
-
# Initialize model components once
|
30 |
-
_tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
|
31 |
-
_model = AutoModelForCausalLM.from_pretrained(
|
32 |
-
_MODEL_NAME,
|
33 |
-
torch_dtype="auto",
|
34 |
-
device_map="auto"
|
35 |
-
)
|
36 |
|
37 |
# Initialize TTS components once to avoid reloading
|
38 |
_SPEECH_PIPELINE = pipeline("text-to-speech", model="microsoft/speecht5_tts")
|
@@ -95,76 +85,19 @@ def generate_story_content(system_prompt: str, user_prompt: str) -> str:
|
|
95 |
{"role": "system", "content": system_prompt},
|
96 |
{"role": "user", "content": user_prompt}
|
97 |
]
|
98 |
-
|
99 |
-
#
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
enable_thinking=False
|
105 |
-
)
|
106 |
-
|
107 |
-
# Tokenize and prepare model inputs
|
108 |
-
model_inputs = _tokenizer(
|
109 |
-
[formatted_input],
|
110 |
-
return_tensors="pt"
|
111 |
-
).to(_model.device)
|
112 |
-
|
113 |
-
# Generate text completion
|
114 |
-
generated_sequences = _model.generate(
|
115 |
-
**model_inputs,
|
116 |
-
max_new_tokens=150
|
117 |
-
)
|
118 |
|
119 |
# Process and clean output
|
120 |
-
return
|
121 |
-
generated_sequences,
|
122 |
-
model_inputs.input_ids
|
123 |
-
)
|
124 |
|
125 |
except Exception as error:
|
126 |
raise RuntimeError(f"Story generation failed: {str(error)}") from error
|
127 |
|
128 |
-
def _process_generated_output(generated_sequences: list, input_ids: list) -> str:
|
129 |
-
"""
|
130 |
-
Processes raw model output to extract final content.
|
131 |
-
|
132 |
-
Args:
|
133 |
-
generated_sequences: Raw output sequences from model generation
|
134 |
-
input_ids: Original input token IDs used for generation
|
135 |
-
|
136 |
-
Returns:
|
137 |
-
Cleaned final content text
|
138 |
-
"""
|
139 |
-
# Extract new tokens excluding original prompt
|
140 |
-
new_tokens = generated_sequences[0][len(input_ids[0]):].tolist()
|
141 |
-
|
142 |
-
# Find separation point between thinking and final content
|
143 |
-
separation_index = _find_thinking_separation(new_tokens)
|
144 |
-
|
145 |
-
# Decode and clean final content
|
146 |
-
return _tokenizer.decode(
|
147 |
-
new_tokens[separation_index:],
|
148 |
-
skip_special_tokens=True
|
149 |
-
).strip("\n")
|
150 |
-
|
151 |
-
def _find_thinking_separation(token_sequence: list) -> int:
|
152 |
-
"""
|
153 |
-
Locates the boundary between thinking process and final content.
|
154 |
-
|
155 |
-
Args:
|
156 |
-
token_sequence: List of generated token IDs
|
157 |
-
|
158 |
-
Returns:
|
159 |
-
Index position marking the start of final content
|
160 |
-
"""
|
161 |
-
try:
|
162 |
-
# Search from end for separation token
|
163 |
-
reverse_position = token_sequence[::-1].index(_THINKING_TOKEN_ID)
|
164 |
-
return len(token_sequence) - reverse_position
|
165 |
-
except ValueError:
|
166 |
-
return 0 # Return start if token not found
|
167 |
-
|
168 |
def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
|
169 |
"""
|
170 |
Convert text story to speech audio file using text-to-speech synthesis.
|
@@ -238,7 +171,6 @@ st.markdown("""
|
|
238 |
margin: 20px 0;
|
239 |
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
|
240 |
}
|
241 |
-
|
242 |
/* Image caption styling */
|
243 |
.image-caption {
|
244 |
border-left: 4px solid #4CAF50;
|
|
|
16 |
# ======================================
|
17 |
|
18 |
# Initialize image captioning pipeline with pretrained model
|
|
|
19 |
_image_caption_pipeline = pipeline(
|
20 |
task="image-to-text",
|
21 |
model="noamrot/FuseCap_Image_Captioning"
|
22 |
)
|
23 |
|
24 |
# Global model configuration constants
|
25 |
+
_text_generation_pipeline = pipeline("text-generation", model="Qwen/Qwen1.5-0.5B",max_new_tokens=100)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
# Initialize TTS components once to avoid reloading
|
28 |
_SPEECH_PIPELINE = pipeline("text-to-speech", model="microsoft/speecht5_tts")
|
|
|
85 |
{"role": "system", "content": system_prompt},
|
86 |
{"role": "user", "content": user_prompt}
|
87 |
]
|
88 |
+
|
89 |
+
# Generate the story
|
90 |
+
story=_text_generation_pipeline(conversation_history)
|
91 |
+
|
92 |
+
# Extract the story result
|
93 |
+
stroy_result=story[0]["generated_text"][2]["content"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
# Process and clean output
|
96 |
+
return stroy_result
|
|
|
|
|
|
|
97 |
|
98 |
except Exception as error:
|
99 |
raise RuntimeError(f"Story generation failed: {str(error)}") from error
|
100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
|
102 |
"""
|
103 |
Convert text story to speech audio file using text-to-speech synthesis.
|
|
|
171 |
margin: 20px 0;
|
172 |
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
|
173 |
}
|
|
|
174 |
/* Image caption styling */
|
175 |
.image-caption {
|
176 |
border-left: 4px solid #4CAF50;
|