Commit a03de74
Parent(s): ba2538c

fixed cot parsing bugs

Files changed:
- app/api_helpers.py  +35 -19
- app/message_processing.py  +22 -7
- app/routes/chat_api.py  +79 -72
app/api_helpers.py
CHANGED
@@ -19,7 +19,8 @@ from message_processing import (
     convert_chunk_to_openai,
     create_final_chunk,
     split_text_by_completion_tokens,
-    parse_gemini_response_for_reasoning_and_content # Added import
+    parse_gemini_response_for_reasoning_and_content, # Added import
+    extract_reasoning_by_tags # Added for new OpenAI direct reasoning logic
 )
 import config as app_config

@@ -235,16 +236,14 @@ async def gemini_fake_stream_generator( # Changed to async
         # Consider re-raising if auto-mode needs to catch this: raise e_outer_gemini


-async def openai_fake_stream_generator(
+async def openai_fake_stream_generator( # Reverted signature: removed thought_tag_marker
     openai_client: AsyncOpenAI,
     openai_params: Dict[str, Any],
     openai_extra_body: Dict[str, Any],
     request_obj: OpenAIRequest,
-    is_auto_attempt: bool,
-    gcp_credentials,
-    gcp_project_id,
-    gcp_location: str,
-    base_model_id_for_tokenizer: str
+    is_auto_attempt: bool
+    # Removed thought_tag_marker as parsing uses a fixed tag now
+    # Removed gcp_credentials, gcp_project_id, gcp_location, base_model_id_for_tokenizer previously
 ):
     api_model_name = openai_params.get("model", "unknown-openai-model")
     print(f"FAKE STREAMING (OpenAI): Prep for '{request_obj.model}' (API model: '{api_model_name}') with reasoning split.")

@@ -254,8 +253,16 @@ async def openai_fake_stream_generator(
         params_for_non_stream_call = openai_params.copy()
         params_for_non_stream_call['stream'] = False

+        # Add the tag marker specifically for the internal non-streaming call in fake streaming
+        extra_body_for_internal_call = openai_extra_body.copy() # Avoid modifying the original dict
+        if 'google' not in extra_body_for_internal_call.get('extra_body', {}):
+            if 'extra_body' not in extra_body_for_internal_call: extra_body_for_internal_call['extra_body'] = {}
+            extra_body_for_internal_call['extra_body']['google'] = {}
+        extra_body_for_internal_call['extra_body']['google']['thought_tag_marker'] = 'vertex_think_tag'
+        print("DEBUG: Adding 'thought_tag_marker' for fake-streaming internal call.")
+
         _api_call_task = asyncio.create_task(
-            openai_client.chat.completions.create(**params_for_non_stream_call, extra_body=openai_extra_body)
+            openai_client.chat.completions.create(**params_for_non_stream_call, extra_body=extra_body_for_internal_call) # Use modified extra_body
         )
         raw_response = await _api_call_task
         full_content_from_api = ""

@@ -264,18 +271,27 @@ async def openai_fake_stream_generator(
         vertex_completion_tokens = 0
         if raw_response.usage and raw_response.usage.completion_tokens is not None:
             vertex_completion_tokens = raw_response.usage.completion_tokens
+        # --- Start Inserted Block (Tag-based reasoning extraction) ---
         reasoning_text = ""
-        actual_content_text …
-        if full_content_from_api …
-        )
-        if reasoning_text:
-            print(f"…
+        # Ensure actual_content_text is a string even if API returns None
+        actual_content_text = full_content_from_api if isinstance(full_content_from_api, str) else ""
+
+        fixed_tag = "vertex_think_tag" # Use the fixed tag name
+        if actual_content_text: # Check if content exists
+            print(f"INFO: OpenAI Direct Fake-Streaming - Applying tag extraction with fixed marker: '{fixed_tag}'")
+            # Unconditionally attempt extraction with the fixed tag
+            reasoning_text, actual_content_text = extract_reasoning_by_tags(actual_content_text, fixed_tag)
+            if reasoning_text:
+                print(f"DEBUG: Tag extraction success (fixed tag). Reasoning len: {len(reasoning_text)}, Content len: {len(actual_content_text)}")
+            else:
+                print(f"DEBUG: No content found within fixed tag '{fixed_tag}'.")
+        else:
+            print(f"WARNING: OpenAI Direct Fake-Streaming - No initial content found in message.")
+            actual_content_text = "" # Ensure empty string
+
+        # --- End Revised Block ---
+
+        # The return uses the potentially modified variables:
         return raw_response, reasoning_text, actual_content_text

     temp_task_for_keepalive_check = asyncio.create_task(_openai_api_call_and_split_task_creator_wrapper())
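For context, a minimal standalone sketch (not part of the commit) of what the block added to openai_fake_stream_generator produces: the nested extra_body payload after 'thought_tag_marker' is injected for the internal non-streaming call. The starting openai_extra_body value mirrors the shape built in chat_api.py below; the safety_settings value is a placeholder. Note that dict.copy() is shallow, so the nested 'google' dict is still shared with the original object.

# Sketch: shape of the extra_body sent by the internal non-streaming call.
openai_extra_body = {
    "extra_body": {
        "google": {
            "safety_settings": []  # placeholder for openai_safety_settings
        }
    }
}

extra_body_for_internal_call = openai_extra_body.copy()  # shallow copy
if 'google' not in extra_body_for_internal_call.get('extra_body', {}):
    if 'extra_body' not in extra_body_for_internal_call:
        extra_body_for_internal_call['extra_body'] = {}
    extra_body_for_internal_call['extra_body']['google'] = {}
extra_body_for_internal_call['extra_body']['google']['thought_tag_marker'] = 'vertex_think_tag'

print(extra_body_for_internal_call)
# {'extra_body': {'google': {'safety_settings': [], 'thought_tag_marker': 'vertex_think_tag'}}}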
app/message_processing.py
CHANGED
@@ -11,6 +11,26 @@ from google import genai as google_genai_client
 from models import OpenAIMessage, ContentPartText, ContentPartImage

 SUPPORTED_ROLES = ["user", "model"]
+# New function to extract reasoning based on specified tags
+# Removed duplicate import
+
+def extract_reasoning_by_tags(full_text: str, tag_name: str) -> Tuple[str, str]:
+    """Extracts reasoning content enclosed in specific tags."""
+    if not tag_name or not isinstance(full_text, str): # Handle empty tag or non-string input
+        return "", full_text if isinstance(full_text, str) else ""
+
+    open_tag = f"<{tag_name}>"
+    close_tag = f"</{tag_name}>"
+    # Make pattern non-greedy and handle potential multiple occurrences
+    pattern = re.compile(f"{re.escape(open_tag)}(.*?){re.escape(close_tag)}", re.DOTALL)
+
+    reasoning_parts = pattern.findall(full_text)
+    # Remove tags and the extracted reasoning content to get normal content
+    normal_text = pattern.sub('', full_text)
+
+    reasoning_content = "".join(reasoning_parts)
+    # Consider trimming whitespace that might be left after tag removal
+    return reasoning_content.strip(), normal_text.strip()

 def create_gemini_prompt(messages: List[OpenAIMessage]) -> Union[types.Content, List[types.Content]]:
     # This function remains unchanged

@@ -203,11 +223,8 @@ def parse_gemini_response_for_reasoning_and_content(gemini_response_candidate: Any
     # Check if gemini_response_candidate itself resembles a part_item with 'thought'
     # This might be relevant for direct part processing in stream chunks if candidate structure is shallow
     candidate_part_text = ""
-    is_candidate_itself_thought = False
     if hasattr(gemini_response_candidate, 'text') and gemini_response_candidate.text is not None:
         candidate_part_text = str(gemini_response_candidate.text)
-    if hasattr(gemini_response_candidate, 'thought') and gemini_response_candidate.thought is True:
-        is_candidate_itself_thought = True

     # Primary logic: Iterate through parts of the candidate's content object
     gemini_candidate_content = None

@@ -224,9 +241,7 @@ def parse_gemini_response_for_reasoning_and_content(gemini_response_candidate: Any
                 reasoning_text_parts.append(part_text)
             else:
                 normal_text_parts.append(part_text)
-    …
-        reasoning_text_parts.append(candidate_part_text)
-    elif candidate_part_text: # Candidate had text but no parts and was not a thought itself
+    if candidate_part_text: # Candidate had text but no parts and was not a thought itself
         normal_text_parts.append(candidate_part_text)
     # If no parts and no direct text on candidate, both lists remain empty.

@@ -235,7 +250,7 @@ def parse_gemini_response_for_reasoning_and_content(gemini_response_candidate: Any
             normal_text_parts.append(str(gemini_candidate_content.text))
     # Fallback if no .content but direct .text on candidate
     elif hasattr(gemini_response_candidate, 'text') and gemini_response_candidate.text is not None and not gemini_candidate_content:
-        …
+        normal_text_parts.append(str(gemini_response_candidate.text))

     return "".join(reasoning_text_parts), "".join(normal_text_parts)
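A short usage sketch of the new extract_reasoning_by_tags helper. The sample strings are made up; the function body is condensed from the hunk above, and the imports are included so the sketch runs on its own (in message_processing.py they are assumed to already be present).

import re
from typing import Tuple

def extract_reasoning_by_tags(full_text: str, tag_name: str) -> Tuple[str, str]:
    # Condensed copy of the helper added by this commit.
    if not tag_name or not isinstance(full_text, str):
        return "", full_text if isinstance(full_text, str) else ""
    open_tag, close_tag = f"<{tag_name}>", f"</{tag_name}>"
    pattern = re.compile(f"{re.escape(open_tag)}(.*?){re.escape(close_tag)}", re.DOTALL)
    reasoning = "".join(pattern.findall(full_text))
    return reasoning.strip(), pattern.sub('', full_text).strip()

raw = "<vertex_think_tag>First, check the units.</vertex_think_tag>The answer is 42."
reasoning, content = extract_reasoning_by_tags(raw, "vertex_think_tag")
assert reasoning == "First, check the units."
assert content == "The answer is 42."

# Text without the tags passes through unchanged (aside from whitespace stripping):
assert extract_reasoning_by_tags("plain text", "vertex_think_tag") == ("", "plain text")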
app/routes/chat_api.py
CHANGED
@@ -23,7 +23,8 @@ from message_processing import (
     create_gemini_prompt,
     create_encrypted_gemini_prompt,
     create_encrypted_full_gemini_prompt,
-    split_text_by_completion_tokens # Added
+    split_text_by_completion_tokens, # Added
+    extract_reasoning_by_tags # Added for new reasoning logic
 )
 from api_helpers import (
     create_generation_config,

@@ -219,29 +220,34 @@ STRICT OPERATING PROTOCOL:
        openai_params = {k: v for k, v in openai_params.items() if v is not None}

        openai_extra_body = {
-            …
-            '…
+            "extra_body": {
+                'google': {
+                    'safety_settings': openai_safety_settings
+                    # REMOVED 'thought_tag_marker' - will be added conditionally below
+                }
            }
        }

        if request.stream:
            if app_config.FAKE_STREAMING_ENABLED:
                print(f"INFO: OpenAI Fake Streaming (SSE Simulation) ENABLED for model '{request.model}'.")
                # openai_params already has "stream": True from initial setup,
                # but openai_fake_stream_generator will make a stream=False call internally.
-                # …
+                # Retrieve the marker before the call
+                openai_extra_body_from_req = getattr(request, 'openai_extra_body', None)
+                thought_tag_marker = openai_extra_body_from_req.get("google", {}).get("thought_tag_marker") if openai_extra_body_from_req else None
+
+                # Call the generator with updated signature
                return StreamingResponse(
                    openai_fake_stream_generator(
                        openai_client=openai_client,
                        openai_params=openai_params,
                        openai_extra_body=openai_extra_body, # Keep passing the full extra_body as it might be used elsewhere
                        request_obj=request,
-                        is_auto_attempt=False,
-                        # …
-                        gcp_credentials=…,
-                        gcp_project_id=PROJECT_ID, # This is rotated_project_id
-                        gcp_location=LOCATION, # This is "global"
-                        base_model_id_for_tokenizer=base_model_name # Stripped model ID for tokenizer
+                        is_auto_attempt=False # Assuming this remains false for direct calls
+                        # Removed thought_tag_marker argument
+                        # Removed gcp_credentials, gcp_project_id, gcp_location, base_model_id_for_tokenizer previously
                    ),
                    media_type="text/event-stream"
                )

@@ -297,70 +303,71 @@ STRICT OPERATING PROTOCOL:
                    yield "data: [DONE]\n\n"
                return StreamingResponse(openai_true_stream_generator(), media_type="text/event-stream")
        else: # Not streaming (is_openai_direct_model and not request.stream)
-            …
-                            vertex_completion_tokens
-                        )
-                        message_dict['content'] = actual_content
-                        if reasoning_text: # Only add reasoning_content if it's not empty
-                            print(f"…
-                            print(f"INFO: Content reconstructed from tokens. Original len: {len(full_content)}, Reconstructed len: {len(actual_content)}")
-                        # else: No reasoning, and content is original full_content because num_completion_tokens was invalid or zero.
-            …
+            # Conditionally add the tag marker ONLY for non-streaming
+            extra_body_for_call = openai_extra_body.copy() # Avoid modifying the original dict used elsewhere
+            if 'google' not in extra_body_for_call.get('extra_body', {}):
+                if 'extra_body' not in extra_body_for_call: extra_body_for_call['extra_body'] = {}
+                extra_body_for_call['extra_body']['google'] = {}
+            extra_body_for_call['extra_body']['google']['thought_tag_marker'] = 'vertex_think_tag'
+            print("DEBUG: Adding 'thought_tag_marker' for non-streaming call.")
+
+            try: # Corrected indentation for entire block
+                # Ensure stream=False is explicitly passed for non-streaming
+                openai_params_for_non_stream = {**openai_params, "stream": False}
+                response = await openai_client.chat.completions.create(
+                    **openai_params_for_non_stream,
+                    # Removed redundant **openai_params spread
+                    extra_body=extra_body_for_call # Use the modified extra_body for non-streaming call
+                )
+                response_dict = response.model_dump(exclude_unset=True, exclude_none=True)

+                try:
+                    usage = response_dict.get('usage')
+                    vertex_completion_tokens = 0 # Keep this for potential future use, but not used for split
+
+                    if usage and isinstance(usage, dict):
+                        vertex_completion_tokens = usage.get('completion_tokens')
+
+                    choices = response_dict.get('choices')
+                    if choices and isinstance(choices, list) and len(choices) > 0:
+                        message_dict = choices[0].get('message')
+                        if message_dict and isinstance(message_dict, dict):
+                            # Always remove extra_content from the message if it exists
+                            if 'extra_content' in message_dict:
+                                del message_dict['extra_content']
+                                # print("DEBUG: Removed 'extra_content' from response message.") # Optional debug log
+
+                            # --- Start Revised Block (Fixed tag reasoning extraction) ---
+                            # No longer need to get marker from request
                             full_content = message_dict.get('content')
+                            reasoning_text = ""
+                            actual_content = full_content if isinstance(full_content, str) else "" # Ensure string
+
+                            fixed_tag = "vertex_think_tag" # Use the fixed tag name
+                            if actual_content: # Check if content exists
+                                print(f"INFO: OpenAI Direct Non-Streaming - Applying tag extraction with fixed marker: '{fixed_tag}'")
+                                # Unconditionally attempt extraction with the fixed tag
+                                reasoning_text, actual_content = extract_reasoning_by_tags(actual_content, fixed_tag)
+                                message_dict['content'] = actual_content # Update the dictionary
+                                if reasoning_text:
                                     message_dict['reasoning_content'] = reasoning_text
+                                    print(f"DEBUG: Tag extraction success (fixed tag). Reasoning len: {len(reasoning_text)}, Content len: {len(actual_content)}")
+                                else:
+                                    print(f"DEBUG: No content found within fixed tag '{fixed_tag}'.")
                            else:
+                                print(f"WARNING: OpenAI Direct Non-Streaming - No initial content found in message. Content: {message_dict.get('content')}")
+                                message_dict['content'] = "" # Ensure content key exists and is empty string
+
+                            # --- End Revised Block ---
+                except Exception as e_reasoning_processing:
+                    print(f"WARNING: Error during non-streaming reasoning token processing for model {request.model} due to: {e_reasoning_processing}.")
+
+                return JSONResponse(content=response_dict)
+            except Exception as generate_error: # Corrected indentation for except block
+                error_msg_generate = f"Error calling OpenAI client for {request.model}: {str(generate_error)}"
+                print(f"ERROR: {error_msg_generate}")
+                error_response = create_openai_error_response(500, error_msg_generate, "server_error")
+                return JSONResponse(status_code=500, content=error_response)
    elif is_auto_model:
        print(f"Processing auto model: {request.model}")
        attempts = [
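To illustrate the end-to-end effect of the new non-streaming path, a small sketch (not part of the commit) of how a response message carrying the fixed tag is rewritten. The response_dict below is hypothetical sample data; extract_reasoning_by_tags is the helper added in app/message_processing.py, imported the same way chat_api.py imports it.

from message_processing import extract_reasoning_by_tags  # helper added by this commit

response_dict = {
    "choices": [{
        "message": {
            "role": "assistant",
            "content": "<vertex_think_tag>Compare both options first.</vertex_think_tag>Option B is cheaper.",
            "extra_content": {"google": {"token_count": 37}},  # hypothetical; stripped by the handler
        }
    }],
    "usage": {"completion_tokens": 37},
}

message_dict = response_dict["choices"][0]["message"]
message_dict.pop("extra_content", None)  # mirrors the del message_dict['extra_content'] above

reasoning_text, actual_content = extract_reasoning_by_tags(
    message_dict.get("content") or "", "vertex_think_tag"
)
message_dict["content"] = actual_content
if reasoning_text:
    message_dict["reasoning_content"] = reasoning_text

# message_dict now looks like:
# {'role': 'assistant',
#  'content': 'Option B is cheaper.',
#  'reasoning_content': 'Compare both options first.'}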