bibibi12345 committed
Commit ebec74a · 1 Parent(s): 9f566ef

Files changed (2):
  1. app/api_helpers.py +27 -0
  2. app/openai_handler.py +31 -8
app/api_helpers.py CHANGED
@@ -91,6 +91,33 @@ class StreamingReasoningProcessor:
             self.inside_tag = False
 
         return processed_content, current_reasoning
+
+    def flush_remaining(self) -> tuple[str, str]:
+        """
+        Flush any remaining content in the buffer when the stream ends.
+
+        Returns:
+            A tuple of:
+            - remaining_content: Any content that was buffered but not yet output
+            - remaining_reasoning: Any incomplete reasoning if we were inside a tag
+        """
+        remaining_content = ""
+        remaining_reasoning = ""
+
+        if self.tag_buffer and not self.inside_tag:
+            # If we have buffered content and we're not inside a tag,
+            # it's safe to output all of it
+            remaining_content = self.tag_buffer
+            self.tag_buffer = ""
+        elif self.inside_tag:
+            # If we're inside a tag when the stream ends, we have an unclosed tag
+            # Return the partial content as regular content (including the opening tag)
+            remaining_content = f"<{self.tag_name}>{self.reasoning_buffer}{self.tag_buffer}"
+            self.reasoning_buffer = ""
+            self.tag_buffer = ""
+            self.inside_tag = False
+
+        return remaining_content, remaining_reasoning
 
 
 def process_streaming_content_with_reasoning_tags(
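For reference, a minimal sketch of how the new `flush_remaining` pairs with `process_chunk` once a stream ends. The tag name, chunk boundaries, and printed results below are hypothetical, and it assumes `process_chunk` returns a `(visible_content, extracted_reasoning)` pair per chunk, as the handler code suggests:

```python
from app.api_helpers import StreamingReasoningProcessor

# Hypothetical usage sketch: "think" is a made-up tag name (the real handler
# passes VERTEX_REASONING_TAG), and the chunks are contrived so the closing
# tag never arrives, exercising the unclosed-tag path.
processor = StreamingReasoningProcessor("think")

content_parts, reasoning_parts = [], []
for piece in ["Hello <think>step 1,", " step 2"]:  # stream ends mid-tag
    content, reasoning = processor.process_chunk(piece)
    content_parts.append(content)
    reasoning_parts.append(reasoning)

# The stream ended without "</think>", so flush_remaining returns the
# partial reasoning as regular content, opening tag included, and resets
# the processor's buffers.
remaining_content, remaining_reasoning = processor.flush_remaining()
content_parts.append(remaining_content)

print("".join(content_parts))  # e.g. "Hello <think>step 1, step 2"
print(remaining_reasoning)     # "" - unclosed reasoning comes back as content
```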
app/openai_handler.py CHANGED
@@ -120,8 +120,10 @@ class OpenAIDirectHandler:
 
             # Create processor for tag-based extraction across chunks
             reasoning_processor = StreamingReasoningProcessor(VERTEX_REASONING_TAG)
+            chunk_count = 0
 
             async for chunk in stream_response:
+                chunk_count += 1
                 try:
                     chunk_as_dict = chunk.model_dump(exclude_unset=True, exclude_none=True)
 
@@ -134,11 +136,15 @@
                             del delta['extra_content']
 
                     content = delta.get('content', '')
-                    print(content)
                     if content:
+                        print(f"DEBUG: Chunk {chunk_count} - Raw content: '{content}'")
                         # Use the processor to extract reasoning
                         processed_content, current_reasoning = reasoning_processor.process_chunk(content)
 
+                        # Debug logging for processing results
+                        if processed_content or current_reasoning:
+                            print(f"DEBUG: Chunk {chunk_count} - Processed content: '{processed_content}', Reasoning: '{current_reasoning[:50] + '...' if len(current_reasoning) > 50 else current_reasoning}'")
+
                         # Update delta with processed content
                         if current_reasoning:
                             delta['reasoning_content'] = current_reasoning
@@ -159,20 +165,37 @@
                         yield "data: [DONE]\n\n"
                         return
 
-            # Handle any remaining buffer content
-            if reasoning_processor.tag_buffer and not reasoning_processor.inside_tag:
-                # Output any remaining content
+            # Debug logging for buffer state and chunk count
+            print(f"DEBUG: Stream ended after {chunk_count} chunks. Buffer state - tag_buffer: '{reasoning_processor.tag_buffer}', "
+                  f"inside_tag: {reasoning_processor.inside_tag}, "
+                  f"reasoning_buffer: '{reasoning_processor.reasoning_buffer[:50] + '...' if reasoning_processor.reasoning_buffer else ''}'")
+
+            # Flush any remaining buffered content
+            remaining_content, remaining_reasoning = reasoning_processor.flush_remaining()
+
+            if remaining_content:
+                print(f"DEBUG: Flushing remaining content: '{remaining_content}'")
                 final_chunk = {
                     "id": f"chatcmpl-{int(time.time())}",
                     "object": "chat.completion.chunk",
                     "created": int(time.time()),
                     "model": request.model,
-                    "choices": [{"index": 0, "delta": {"content": reasoning_processor.tag_buffer}, "finish_reason": None}]
+                    "choices": [{"index": 0, "delta": {"content": remaining_content}, "finish_reason": None}]
                 }
                 yield f"data: {json.dumps(final_chunk)}\n\n"
-            elif reasoning_processor.inside_tag and reasoning_processor.reasoning_buffer:
-                # We were inside a tag but never found the closing tag
-                print(f"WARNING: Unclosed reasoning tag detected. Partial reasoning: {reasoning_processor.reasoning_buffer[:100]}...")
+
+            # Send a proper finish reason chunk
+            finish_chunk = {
+                "id": f"chatcmpl-{int(time.time())}",
+                "object": "chat.completion.chunk",
+                "created": int(time.time()),
+                "model": request.model,
+                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
+            }
+            yield f"data: {json.dumps(finish_chunk)}\n\n"
+
+            # Note: remaining_reasoning is not used here since incomplete reasoning
+            # is treated as regular content when tags are unclosed
 
             yield "data: [DONE]\n\n"
 
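For clarity, a self-contained sketch of the end-of-stream SSE frames the updated handler emits: a flushed content chunk (only when something was still buffered), an explicit `finish_reason` chunk, then the `[DONE]` sentinel. The model name and leftover text are placeholders:

```python
import json
import time

model = "gemini-2.5-pro"    # placeholder for request.model
remaining_content = "tail"  # placeholder for flush_remaining() output

def sse(payload: dict) -> str:
    # Each SSE frame is a JSON payload on a "data:" line followed by a blank line.
    return f"data: {json.dumps(payload)}\n\n"

frames = []
if remaining_content:
    frames.append(sse({
        "id": f"chatcmpl-{int(time.time())}",
        "object": "chat.completion.chunk",
        "created": int(time.time()),
        "model": model,
        "choices": [{"index": 0, "delta": {"content": remaining_content}, "finish_reason": None}],
    }))
frames.append(sse({
    "id": f"chatcmpl-{int(time.time())}",
    "object": "chat.completion.chunk",
    "created": int(time.time()),
    "model": model,
    "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
}))
frames.append("data: [DONE]\n\n")
print("".join(frames), end="")
```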