bibibi12345 committed a03de74 (parent: ba2538c)

fixed cot parsing bugs
app/api_helpers.py CHANGED

@@ -19,7 +19,8 @@ from message_processing import (
     convert_chunk_to_openai,
     create_final_chunk,
     split_text_by_completion_tokens,
-    parse_gemini_response_for_reasoning_and_content # Added import
+    parse_gemini_response_for_reasoning_and_content, # Added import
+    extract_reasoning_by_tags # Added for new OpenAI direct reasoning logic
 )
 import config as app_config

@@ -235,16 +236,14 @@ async def gemini_fake_stream_generator( # Changed to async
     # Consider re-raising if auto-mode needs to catch this: raise e_outer_gemini


-async def openai_fake_stream_generator(
+async def openai_fake_stream_generator( # Reverted signature: removed thought_tag_marker
     openai_client: AsyncOpenAI,
     openai_params: Dict[str, Any],
     openai_extra_body: Dict[str, Any],
     request_obj: OpenAIRequest,
-    is_auto_attempt: bool,
-    gcp_credentials: Any,
-    gcp_project_id: str,
-    gcp_location: str,
-    base_model_id_for_tokenizer: str
+    is_auto_attempt: bool
+    # Removed thought_tag_marker as parsing uses a fixed tag now
+    # Removed gcp_credentials, gcp_project_id, gcp_location, base_model_id_for_tokenizer previously
 ):
     api_model_name = openai_params.get("model", "unknown-openai-model")
     print(f"FAKE STREAMING (OpenAI): Prep for '{request_obj.model}' (API model: '{api_model_name}') with reasoning split.")

@@ -254,8 +253,16 @@ async def openai_fake_stream_generator(
         params_for_non_stream_call = openai_params.copy()
         params_for_non_stream_call['stream'] = False

+        # Add the tag marker specifically for the internal non-streaming call in fake streaming
+        extra_body_for_internal_call = openai_extra_body.copy() # Avoid modifying the original dict
+        if 'google' not in extra_body_for_internal_call.get('extra_body', {}):
+            if 'extra_body' not in extra_body_for_internal_call: extra_body_for_internal_call['extra_body'] = {}
+            extra_body_for_internal_call['extra_body']['google'] = {}
+        extra_body_for_internal_call['extra_body']['google']['thought_tag_marker'] = 'vertex_think_tag'
+        print("DEBUG: Adding 'thought_tag_marker' for fake-streaming internal call.")
+
         _api_call_task = asyncio.create_task(
-            openai_client.chat.completions.create(**params_for_non_stream_call, extra_body=openai_extra_body)
+            openai_client.chat.completions.create(**params_for_non_stream_call, extra_body=extra_body_for_internal_call) # Use modified extra_body
         )
         raw_response = await _api_call_task
         full_content_from_api = ""

@@ -264,18 +271,27 @@ async def openai_fake_stream_generator(
         vertex_completion_tokens = 0
         if raw_response.usage and raw_response.usage.completion_tokens is not None:
             vertex_completion_tokens = raw_response.usage.completion_tokens
+        # --- Start Inserted Block (Tag-based reasoning extraction) ---
         reasoning_text = ""
-        actual_content_text = full_content_from_api
-        if full_content_from_api and vertex_completion_tokens > 0:
-            reasoning_text, actual_content_text, _ = await asyncio.to_thread(
-                split_text_by_completion_tokens,
-                gcp_credentials, gcp_project_id, gcp_location,
-                base_model_id_for_tokenizer,
-                full_content_from_api,
-                vertex_completion_tokens
-            )
+        # Ensure actual_content_text is a string even if API returns None
+        actual_content_text = full_content_from_api if isinstance(full_content_from_api, str) else ""
+
+        fixed_tag = "vertex_think_tag" # Use the fixed tag name
+        if actual_content_text: # Check if content exists
+            print(f"INFO: OpenAI Direct Fake-Streaming - Applying tag extraction with fixed marker: '{fixed_tag}'")
+            # Unconditionally attempt extraction with the fixed tag
+            reasoning_text, actual_content_text = extract_reasoning_by_tags(actual_content_text, fixed_tag)
             if reasoning_text:
-                print(f"DEBUG_FAKE_REASONING_SPLIT: Success. Reasoning len: {len(reasoning_text)}, Content len: {len(actual_content_text)}")
+                print(f"DEBUG: Tag extraction success (fixed tag). Reasoning len: {len(reasoning_text)}, Content len: {len(actual_content_text)}")
+            else:
+                print(f"DEBUG: No content found within fixed tag '{fixed_tag}'.")
+        else:
+            print(f"WARNING: OpenAI Direct Fake-Streaming - No initial content found in message.")
+            actual_content_text = "" # Ensure empty string
+
+        # --- End Revised Block ---
+
+        # The return uses the potentially modified variables:
         return raw_response, reasoning_text, actual_content_text

     temp_task_for_keepalive_check = asyncio.create_task(_openai_api_call_and_split_task_creator_wrapper())
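
Note on the extra_body shape: the openai-python SDK merges the dict passed as `extra_body=` into the outgoing JSON request body, which is why the payload above deliberately nests a second `extra_body` key with a `google` dict inside it. Below is a standalone sketch of the mutation both call sites now perform; the helper name `add_thought_tag_marker` is illustrative, not from this repo. Note that `dict.copy()` is shallow, so only the top level is protected from mutation:

# Illustrative helper (not in the repo) mirroring the mutation performed
# before the internal non-streaming request.
def add_thought_tag_marker(openai_extra_body: dict) -> dict:
    body = openai_extra_body.copy()  # shallow copy: nested dicts are still shared
    if 'google' not in body.get('extra_body', {}):
        if 'extra_body' not in body:
            body['extra_body'] = {}
        body['extra_body']['google'] = {}
    body['extra_body']['google']['thought_tag_marker'] = 'vertex_think_tag'
    return body

base = {"extra_body": {"google": {"safety_settings": []}}}
patched = add_thought_tag_marker(base)
print(patched)
# {'extra_body': {'google': {'safety_settings': [], 'thought_tag_marker': 'vertex_think_tag'}}}
print(base is patched, base['extra_body'] is patched['extra_body'])
# False True  <- the nested dicts are shared, so 'base' sees the marker too

If the original dict genuinely must stay untouched, a `copy.deepcopy` would be needed instead of `.copy()`.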
app/message_processing.py CHANGED

@@ -11,6 +11,26 @@ from google import genai as google_genai_client
 from models import OpenAIMessage, ContentPartText, ContentPartImage

 SUPPORTED_ROLES = ["user", "model"]
+# New function to extract reasoning based on specified tags
+# Removed duplicate import
+
+def extract_reasoning_by_tags(full_text: str, tag_name: str) -> Tuple[str, str]:
+    """Extracts reasoning content enclosed in specific tags."""
+    if not tag_name or not isinstance(full_text, str): # Handle empty tag or non-string input
+        return "", full_text if isinstance(full_text, str) else ""
+
+    open_tag = f"<{tag_name}>"
+    close_tag = f"</{tag_name}>"
+    # Make pattern non-greedy and handle potential multiple occurrences
+    pattern = re.compile(f"{re.escape(open_tag)}(.*?){re.escape(close_tag)}", re.DOTALL)
+
+    reasoning_parts = pattern.findall(full_text)
+    # Remove tags and the extracted reasoning content to get normal content
+    normal_text = pattern.sub('', full_text)
+
+    reasoning_content = "".join(reasoning_parts)
+    # Consider trimming whitespace that might be left after tag removal
+    return reasoning_content.strip(), normal_text.strip()

 def create_gemini_prompt(messages: List[OpenAIMessage]) -> Union[types.Content, List[types.Content]]:
     # This function remains unchanged

@@ -203,11 +223,8 @@ def parse_gemini_response_for_reasoning_and_content(gemini_response_candidate: A
     # Check if gemini_response_candidate itself resembles a part_item with 'thought'
     # This might be relevant for direct part processing in stream chunks if candidate structure is shallow
     candidate_part_text = ""
-    is_candidate_itself_thought = False
     if hasattr(gemini_response_candidate, 'text') and gemini_response_candidate.text is not None:
         candidate_part_text = str(gemini_response_candidate.text)
-    if hasattr(gemini_response_candidate, 'thought') and gemini_response_candidate.thought is True:
-        is_candidate_itself_thought = True

     # Primary logic: Iterate through parts of the candidate's content object
     gemini_candidate_content = None

@@ -224,9 +241,7 @@ def parse_gemini_response_for_reasoning_and_content(gemini_response_candidate: A
                 reasoning_text_parts.append(part_text)
             else:
                 normal_text_parts.append(part_text)
-    elif is_candidate_itself_thought: # Candidate itself was a thought part (e.g. direct part from a stream)
-        reasoning_text_parts.append(candidate_part_text)
-    elif candidate_part_text: # Candidate had text but no parts and was not a thought itself
+    if candidate_part_text: # Candidate had text but no parts and was not a thought itself
         normal_text_parts.append(candidate_part_text)
     # If no parts and no direct text on candidate, both lists remain empty.

@@ -235,7 +250,7 @@ def parse_gemini_response_for_reasoning_and_content(gemini_response_candidate: A
         normal_text_parts.append(str(gemini_candidate_content.text))
     # Fallback if no .content but direct .text on candidate
     elif hasattr(gemini_response_candidate, 'text') and gemini_response_candidate.text is not None and not gemini_candidate_content:
-        normal_text_parts.append(str(gemini_response_candidate.text))
+        normal_text_parts.append(str(gemini_response_candidate.text))

     return "".join(reasoning_text_parts), "".join(normal_text_parts)
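
A quick usage sketch for the new helper (the sample strings are illustrative): tagged spans are collected in order, the tags and their contents are removed from the normal text, and both return values are stripped.

from message_processing import extract_reasoning_by_tags

sample = ("<vertex_think_tag>Compare the two dates first.</vertex_think_tag>"
          "The later release is v2.1.")
reasoning, content = extract_reasoning_by_tags(sample, "vertex_think_tag")
print(repr(reasoning))  # 'Compare the two dates first.'
print(repr(content))    # 'The later release is v2.1.'

# Text without the tag passes through unchanged (empty reasoning):
print(extract_reasoning_by_tags("no tags here", "vertex_think_tag"))
# ('', 'no tags here')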
 
app/routes/chat_api.py CHANGED

@@ -23,7 +23,8 @@ from message_processing import (
     create_gemini_prompt,
     create_encrypted_gemini_prompt,
     create_encrypted_full_gemini_prompt,
-    split_text_by_completion_tokens # Added
+    split_text_by_completion_tokens, # Added
+    extract_reasoning_by_tags # Added for new reasoning logic
 )
 from api_helpers import (
     create_generation_config,

@@ -219,29 +220,34 @@ STRICT OPERATING PROTOCOL:
         openai_params = {k: v for k, v in openai_params.items() if v is not None}

         openai_extra_body = {
-            'google': {
-                'safety_settings': openai_safety_settings
+            "extra_body": {
+                'google': {
+                    'safety_settings': openai_safety_settings
+                    # REMOVED 'thought_tag_marker' - will be added conditionally below
+                }
             }
         }

+
         if request.stream:
             if app_config.FAKE_STREAMING_ENABLED:
                 print(f"INFO: OpenAI Fake Streaming (SSE Simulation) ENABLED for model '{request.model}'.")
                 # openai_params already has "stream": True from initial setup,
                 # but openai_fake_stream_generator will make a stream=False call internally.
-                # Call the now async generator
+                # Retrieve the marker before the call
+                openai_extra_body_from_req = getattr(request, 'openai_extra_body', None)
+                thought_tag_marker = openai_extra_body_from_req.get("google", {}).get("thought_tag_marker") if openai_extra_body_from_req else None
+
+                # Call the generator with updated signature
                 return StreamingResponse(
-                    openai_fake_stream_generator( # REMOVED await here
+                    openai_fake_stream_generator(
                         openai_client=openai_client,
                         openai_params=openai_params,
-                        openai_extra_body=openai_extra_body,
+                        openai_extra_body=openai_extra_body, # Keep passing the full extra_body as it might be used elsewhere
                         request_obj=request,
-                        is_auto_attempt=False,
-                        # --- New parameters for tokenizer and reasoning split ---
-                        gcp_credentials=rotated_credentials,
-                        gcp_project_id=PROJECT_ID, # This is rotated_project_id
-                        gcp_location=LOCATION, # This is "global"
-                        base_model_id_for_tokenizer=base_model_name # Stripped model ID for tokenizer
+                        is_auto_attempt=False # Assuming this remains false for direct calls
+                        # Removed thought_tag_marker argument
+                        # Removed gcp_credentials, gcp_project_id, gcp_location, base_model_id_for_tokenizer previously
                     ),
                     media_type="text/event-stream"
                 )

@@ -297,70 +303,71 @@ STRICT OPERATING PROTOCOL:
                     yield "data: [DONE]\n\n"
                 return StreamingResponse(openai_true_stream_generator(), media_type="text/event-stream")
             else: # Not streaming (is_openai_direct_model and not request.stream)
-                try:
-                    # Ensure stream=False is explicitly passed for non-streaming
-                    openai_params_for_non_stream = {**openai_params, "stream": False}
-                    response = await openai_client.chat.completions.create(
-                        **openai_params_for_non_stream,
-                        # Removed redundant **openai_params spread
-                        extra_body=openai_extra_body
-                    )
-                    response_dict = response.model_dump(exclude_unset=True, exclude_none=True)
-
-                    try:
-                        usage = response_dict.get('usage')
-                        vertex_completion_tokens = 0
+                # Conditionally add the tag marker ONLY for non-streaming
+                extra_body_for_call = openai_extra_body.copy() # Avoid modifying the original dict used elsewhere
+                if 'google' not in extra_body_for_call.get('extra_body', {}):
+                    if 'extra_body' not in extra_body_for_call: extra_body_for_call['extra_body'] = {}
+                    extra_body_for_call['extra_body']['google'] = {}
+                extra_body_for_call['extra_body']['google']['thought_tag_marker'] = 'vertex_think_tag'
+                print("DEBUG: Adding 'thought_tag_marker' for non-streaming call.")
+
+                try: # Corrected indentation for entire block
+                    # Ensure stream=False is explicitly passed for non-streaming
+                    openai_params_for_non_stream = {**openai_params, "stream": False}
+                    response = await openai_client.chat.completions.create(
+                        **openai_params_for_non_stream,
+                        # Removed redundant **openai_params spread
+                        extra_body=extra_body_for_call # Use the modified extra_body for non-streaming call
+                    )
+                    response_dict = response.model_dump(exclude_unset=True, exclude_none=True)

-                        if usage and isinstance(usage, dict):
-                            vertex_completion_tokens = usage.get('completion_tokens')
-
-                        choices = response_dict.get('choices')
-                        if choices and isinstance(choices, list) and len(choices) > 0:
-                            message_dict = choices[0].get('message')
-                            if message_dict and isinstance(message_dict, dict):
-                                # Always remove extra_content from the message if it exists, before any splitting
-                                if 'extra_content' in message_dict:
-                                    del message_dict['extra_content']
-                                    print("DEBUG: Removed 'extra_content' from response message.")
-
-                            if isinstance(vertex_completion_tokens, int) and vertex_completion_tokens > 0:
+                    try:
+                        usage = response_dict.get('usage')
+                        vertex_completion_tokens = 0 # Keep this for potential future use, but not used for split
+
+                        if usage and isinstance(usage, dict):
+                            vertex_completion_tokens = usage.get('completion_tokens')
+
+                        choices = response_dict.get('choices')
+                        if choices and isinstance(choices, list) and len(choices) > 0:
+                            message_dict = choices[0].get('message')
+                            if message_dict and isinstance(message_dict, dict):
+                                # Always remove extra_content from the message if it exists
+                                if 'extra_content' in message_dict:
+                                    del message_dict['extra_content']
+                                    # print("DEBUG: Removed 'extra_content' from response message.") # Optional debug log
+
+                                # --- Start Revised Block (Fixed tag reasoning extraction) ---
+                                # No longer need to get marker from request
                                 full_content = message_dict.get('content')
-                                if isinstance(full_content, str) and full_content:
-                                    model_id_for_tokenizer = base_model_name
-
-                                    reasoning_text, actual_content, dbg_all_tokens = await asyncio.to_thread(
-                                        split_text_by_completion_tokens, # Use imported function
-                                        rotated_credentials,
-                                        PROJECT_ID,
-                                        LOCATION,
-                                        model_id_for_tokenizer,
-                                        full_content,
-                                        vertex_completion_tokens
-                                    )
-
-                                    message_dict['content'] = actual_content
-                                    if reasoning_text: # Only add reasoning_content if it's not empty
+                                reasoning_text = ""
+                                actual_content = full_content if isinstance(full_content, str) else "" # Ensure string
+
+                                fixed_tag = "vertex_think_tag" # Use the fixed tag name
+                                if actual_content: # Check if content exists
+                                    print(f"INFO: OpenAI Direct Non-Streaming - Applying tag extraction with fixed marker: '{fixed_tag}'")
+                                    # Unconditionally attempt extraction with the fixed tag
+                                    reasoning_text, actual_content = extract_reasoning_by_tags(actual_content, fixed_tag)
+                                    message_dict['content'] = actual_content # Update the dictionary
+                                    if reasoning_text:
                                         message_dict['reasoning_content'] = reasoning_text
-                                        print(f"DEBUG_REASONING_SPLIT_DIRECT_JOIN: Successful. Reasoning len: {len(reasoning_text)}. Content len: {len(actual_content)}")
-                                        print(f" Vertex completion_tokens: {vertex_completion_tokens}. Our tokenizer total tokens: {len(dbg_all_tokens)}")
-                                    elif "".join(dbg_all_tokens) != full_content : # Content was re-joined from tokens but no reasoning
-                                        print(f"INFO: Content reconstructed from tokens. Original len: {len(full_content)}, Reconstructed len: {len(actual_content)}")
-                                    # else: No reasoning, and content is original full_content because num_completion_tokens was invalid or zero.
-
+                                        print(f"DEBUG: Tag extraction success (fixed tag). Reasoning len: {len(reasoning_text)}, Content len: {len(actual_content)}")
+                                    else:
+                                        print(f"DEBUG: No content found within fixed tag '{fixed_tag}'.")
                                 else:
-                                    print(f"WARNING: Full content is not a string or is empty. Cannot perform split. Content: {full_content}")
-                            else:
-                                print(f"INFO: No positive vertex_completion_tokens ({vertex_completion_tokens}) found in usage, or no message content. No split performed.")
-
-                    except Exception as e_reasoning_processing:
-                        print(f"WARNING: Error during non-streaming reasoning token processing for model {request.model} due to: {e_reasoning_processing}.")
-
-                    return JSONResponse(content=response_dict)
-                except Exception as generate_error:
-                    error_msg_generate = f"Error calling OpenAI client for {request.model}: {str(generate_error)}"
-                    print(f"ERROR: {error_msg_generate}")
-                    error_response = create_openai_error_response(500, error_msg_generate, "server_error")
-                    return JSONResponse(status_code=500, content=error_response)
+                                    print(f"WARNING: OpenAI Direct Non-Streaming - No initial content found in message. Content: {message_dict.get('content')}")
+                                    message_dict['content'] = "" # Ensure content key exists and is empty string
+
+                                # --- End Revised Block ---
+                    except Exception as e_reasoning_processing:
+                        print(f"WARNING: Error during non-streaming reasoning token processing for model {request.model} due to: {e_reasoning_processing}.")
+
+                    return JSONResponse(content=response_dict)
+                except Exception as generate_error: # Corrected indentation for except block
+                    error_msg_generate = f"Error calling OpenAI client for {request.model}: {str(generate_error)}"
+                    print(f"ERROR: {error_msg_generate}")
+                    error_response = create_openai_error_response(500, error_msg_generate, "server_error")
+                    return JSONResponse(status_code=500, content=error_response)
             elif is_auto_model:
                 print(f"Processing auto model: {request.model}")
                 attempts = [
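
The non-streaming branch above boils down to a small transformation of the response dict. A minimal sketch against a hand-built `response_dict` (illustrative values, not a real Vertex payload):

from message_processing import extract_reasoning_by_tags

response_dict = {
    "choices": [{
        "message": {
            "role": "assistant",
            "content": "<vertex_think_tag>Weigh both options.</vertex_think_tag>Option B is better.",
        }
    }]
}

# Mirrors the route's post-processing: split content on the fixed tag and
# expose the reasoning under a separate 'reasoning_content' key.
message_dict = response_dict["choices"][0]["message"]
reasoning, content = extract_reasoning_by_tags(message_dict["content"], "vertex_think_tag")
message_dict["content"] = content
if reasoning:
    message_dict["reasoning_content"] = reasoning

print(message_dict)
# {'role': 'assistant', 'content': 'Option B is better.', 'reasoning_content': 'Weigh both options.'}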