import re
import logging
import traceback
from typing import Dict, List, Any, Optional, Set


class ResponseProcessingError(Exception):
    """回應處理相關錯誤的自定義異常"""
    pass


class ResponseProcessor:
    """
    Processes and cleans responses produced by LLM models.
    Includes format cleanup, duplicate-content detection, and grammatical completeness checks.
    """

    def __init__(self):
        """初始化回應處理器"""
        # set the logger
        self.logger = logging.getLogger(self.__class__.__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

        # Initialize cleaning rules and replacement dictionaries
        self._initialize_cleaning_rules()
        self.logger.info("ResponseProcessor initialized successfully")


    def _initialize_cleaning_rules(self):
        """初始化各種清理規則和替換字典,把常見有問題情況優化"""
        try:
            # 設置重複詞彙的替換字典
            self.replacement_alternatives = {
                'visible': ['present', 'evident', 'apparent', 'observable'],
                'positioned': ['arranged', 'placed', 'set', 'organized'],
                'located': ['found', 'placed', 'situated', 'established'],
                'situated': ['placed', 'positioned', 'arranged', 'set'],
                'appears': ['seems', 'looks', 'presents', 'exhibits'],
                'features': ['includes', 'contains', 'displays', 'showcases'],
                'shows': ['reveals', 'presents', 'exhibits', 'demonstrates'],
                'displays': ['presents', 'exhibits', 'shows', 'reveals']
            }

            # Prefix phrases to strip
            self.prefixes_to_remove = [
                "Here's the enhanced description:",
                "Enhanced description:",
                "Here is the enhanced scene description:",
                "I've enhanced the description while preserving all factual details:",
                "Enhanced Description:",
                "Scene Description:",
                "Description:",
                "Here is the enhanced description:",
                "Here's the enhanced description:",
                "Here is a rewritten scene description that adheres to the provided critical rules:",
                "Here is the rewritten scene description:",
                "Here's a rewritten scene description:",
                "The rewritten scene description is as follows:"
            ]

            # Suffix phrases to strip
            self.suffixes_to_remove = [
                "I've maintained all the key factual elements",
                "I've preserved all the factual details",
                "All factual elements have been maintained"
            ]

            # Patterns for detecting repetition
            self.repetitive_patterns = [
                (r'\b(visible)\b.*?\b(visible)\b', 'Multiple uses of "visible" detected'),
                (r'\b(positioned)\b.*?\b(positioned)\b', 'Multiple uses of "positioned" detected'),
                (r'\b(located)\b.*?\b(located)\b', 'Multiple uses of "located" detected'),
                (r'\b(situated)\b.*?\b(situated)\b', 'Multiple uses of "situated" detected'),
                (r'\b(appears)\b.*?\b(appears)\b', 'Multiple uses of "appears" detected'),
                (r'\b(features)\b.*?\b(features)\b', 'Multiple uses of "features" detected'),
                (r'\bThis\s+(\w+)\s+.*?\bThis\s+\1\b', 'Repetitive sentence structure detected')
            ]

            # Replacements for slash-joined adjective pairs (slash formatting issues crop up occasionally)
            self.slash_replacements = {
                'sunrise/sunset': 'warm lighting',
                'sunset/sunrise': 'warm lighting',
                'day/night': 'ambient lighting',
                'night/day': 'ambient lighting',
                'morning/evening': 'soft lighting',
                'evening/morning': 'soft lighting',
                'dawn/dusk': 'gentle lighting',
                'dusk/dawn': 'gentle lighting',
                'sunny/cloudy': 'natural lighting',
                'cloudy/sunny': 'natural lighting',
                'bright/dark': 'varied lighting',
                'dark/bright': 'varied lighting',
                'light/shadow': 'contrasting illumination',
                'shadow/light': 'contrasting illumination',
                'indoor/outdoor': 'mixed environment',
                'outdoor/indoor': 'mixed environment',
                'inside/outside': 'transitional space',
                'outside/inside': 'transitional space',
                'urban/rural': 'diverse landscape',
                'rural/urban': 'diverse landscape',
                'modern/traditional': 'architectural blend',
                'traditional/modern': 'architectural blend',
                'old/new': 'varied architecture',
                'new/old': 'varied architecture',
                'busy/quiet': 'dynamic atmosphere',
                'quiet/busy': 'dynamic atmosphere',
                'crowded/empty': 'varying occupancy',
                'empty/crowded': 'varying occupancy',
                'hot/cold': 'comfortable temperature',
                'cold/hot': 'comfortable temperature',
                'wet/dry': 'mixed conditions',
                'dry/wet': 'mixed conditions',
                'summer/winter': 'seasonal atmosphere',
                'winter/summer': 'seasonal atmosphere',
                'spring/autumn': 'transitional season',
                'autumn/spring': 'transitional season',
                'left/right': 'balanced composition',
                'right/left': 'balanced composition',
                'near/far': 'layered perspective',
                'far/near': 'layered perspective',
                'high/low': 'varied elevation',
                'low/high': 'varied elevation',
                'big/small': 'diverse scale',
                'small/big': 'diverse scale',
                'wide/narrow': 'varied width',
                'narrow/wide': 'varied width',
                'open/closed': 'flexible space',
                'closed/open': 'flexible space',
                'public/private': 'community space',
                'private/public': 'community space',
                'formal/informal': 'relaxed setting',
                'informal/formal': 'relaxed setting',
                'commercial/residential': 'mixed-use area',
                'residential/commercial': 'mixed-use area'
            }

            # Extended replacement dictionary for underscore-joined terms
            self.underscore_replacements = {
                'urban_intersection': 'urban intersection',
                'tourist_landmark': 'tourist landmark',
                'historical_site': 'historical site',
                'religious_building': 'religious building',
                'natural_landmark': 'natural landmark',
                'commercial_area': 'commercial area',
                'residential_area': 'residential area',
                'public_space': 'public space',
                'outdoor_scene': 'outdoor scene',
                'indoor_scene': 'indoor scene',
                'street_scene': 'street scene',
                'city_center': 'city center',
                'shopping_district': 'shopping district',
                'business_district': 'business district',
                'traffic_light': 'traffic light',
                'street_lamp': 'street lamp',
                'parking_meter': 'parking meter',
                'fire_hydrant': 'fire hydrant',
                'bus_stop': 'bus stop',
                'train_station': 'train station',
                'police_car': 'police car',
                'fire_truck': 'fire truck',
                'school_bus': 'school bus',
                'time_of_day': 'time of day',
                'weather_condition': 'weather condition',
                'lighting_condition': 'lighting condition',
                'atmospheric_condition': 'atmospheric condition',
                'human_activity': 'human activity',
                'pedestrian_traffic': 'pedestrian traffic',
                'vehicle_traffic': 'vehicle traffic',
                'social_gathering': 'social gathering',
                'object_detection': 'object detection',
                'scene_analysis': 'scene analysis',
                'image_classification': 'image classification',
                'computer_vision': 'computer vision'
            }

            self.logger.info("Cleaning rules initialized successfully")

        except Exception as e:
            error_msg = f"Failed to initialize cleaning rules: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            raise ResponseProcessingError(error_msg) from e

    def clean_response(self, response: str, model_type: str = "general") -> str:
        """
        Clean an LLM response.

        Args:
            response: Raw LLM response.
            model_type: Model type (selects model-specific cleaning rules).

        Returns:
            str: The cleaned response.

        Raises:
            ResponseProcessingError: If response processing fails.
        """
        if not response:
            raise ResponseProcessingError("Empty response provided for cleaning")

        try:
            self.logger.debug(f"Starting response cleaning (original length: {len(response)})")

            # Keep the original response as a backup
            original_response = response

            # Choose a cleaning strategy based on the model type
            if "llama" in model_type.lower():
                cleaned_response = self._clean_llama_response(response)
            else:
                cleaned_response = self._clean_general_response(response)

            # If the cleaned content is too short, try to recover from the original response
            if len(cleaned_response.strip()) < 40:
                self.logger.warning("Cleaned response too short, attempting recovery")
                cleaned_response = self._recover_from_overcleaning(original_response)

            # Final validation
            self._validate_cleaned_response(cleaned_response)

            self.logger.debug(f"Response cleaning completed (final length: {len(cleaned_response)})")
            return cleaned_response

        except Exception as e:
            error_msg = f"Response cleaning failed: {str(e)}"
            self.logger.error(error_msg)
            self.logger.error(traceback.format_exc())
            raise ResponseProcessingError(error_msg) from e

    def _clean_llama_response(self, response: str) -> str:
        """
        Clean responses from Llama models specifically.

        Args:
            response: Raw Llama response.

        Returns:
            str: The cleaned response.
        """
        # Apply the general cleaning first
        response = self._clean_general_response(response)

        # Llama-specific prefix cleanup
        llama_prefixes = [
            "Here's the enhanced description:",
            "Enhanced description:",
            "Here is the enhanced scene description:",
            "I've enhanced the description while preserving all factual details:"
        ]

        for prefix in llama_prefixes:
            if response.lower().startswith(prefix.lower()):
                response = response[len(prefix):].strip()

        # Llama-specific suffix cleanup
        llama_suffixes = [
            "I've maintained all the key factual elements",
            "I've preserved all the factual details",
            "All factual elements have been maintained"
        ]

        for suffix in llama_suffixes:
            if response.lower().endswith(suffix.lower()):
                response = response[:response.rfind(suffix)].strip()

        return response

    def _clean_general_response(self, response: str) -> str:
        """
        General response cleaning.

        Args:
            response: Raw response.

        Returns:
            str: The cleaned response.
        """
        response = self._critical_format_preprocess(response)

        # 1. Remove system markers
        response = self._remove_system_markers(response)

        # 2. Remove introductory prefixes
        response = self._remove_introduction_prefixes(response)

        # 3. Remove format markers and context tags
        response = self._remove_format_markers(response)

        # 4. Clean scene-type references
        response = self._clean_scene_type_references(response)

        # 5. Normalize punctuation
        response = self._normalize_punctuation(response)

        # 6. Remove duplicate sentences
        response = self._remove_duplicate_sentences(response)

        # 7. Handle repetitive vocabulary
        response = self._handle_repetitive_vocabulary(response)

        # 8. Ensure grammatical completeness
        response = self._ensure_grammatical_completeness(response)

        # 9. Control word count
        response = self._control_word_length(response)

        # 10. Final formatting
        response = self._final_formatting(response)

        return response
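
    # Illustrative end-to-end pass through the steps above (hypothetical,
    # abridged input/output):
    #   "Here's the enhanced description: A busy urban_intersection is visible.."
    #   -> "A busy urban intersection is visible."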


    def _critical_format_preprocess(self, response: str) -> str:
        """
        Critical format preprocessing that handles the most common formatting problems.

        Args:
            response: Raw response.

        Returns:
            str: The preprocessed response.
        """
        if not response:
            return response

        try:
            # First priority: slash patterns.
            # Handle the known slash pairs first, using adjective replacements.
            for slash_combo, replacement in self.slash_replacements.items():
                if slash_combo.lower() in response.lower():
                    # Preserve the original casing
                    if slash_combo.upper() in response:
                        replacement_formatted = replacement.upper()
                    elif slash_combo.title() in response:
                        replacement_formatted = replacement.title()
                    else:
                        replacement_formatted = replacement

                    # Perform the replacement (case-insensitive)
                    response = re.sub(re.escape(slash_combo), replacement_formatted, response, flags=re.IGNORECASE)
                    self.logger.debug(f"Replaced slash pattern '{slash_combo}' with '{replacement_formatted}'")

            # Handle other, unlisted slash patterns
            # Standard slash pattern: word/word
            slash_pattern = r'\b([a-zA-Z]+)/([a-zA-Z]+)\b'
            matches = list(re.finditer(slash_pattern, response))
            for match in reversed(matches):  # Iterate from the end to avoid position shifts
                word1, word2 = match.groups()
                # Keep the shorter (typically more common) word
                if len(word1) <= len(word2):
                    replacement = word1
                else:
                    replacement = word2
                response = response[:match.start()] + replacement + response[match.end():]
                self.logger.debug(f"Replaced general slash pattern '{match.group(0)}' with '{replacement}'")

            # Second priority: underscore formatting.
            # Handle the known underscore pairs first.
            for underscore_combo, replacement in self.underscore_replacements.items():
                if underscore_combo in response:
                    response = response.replace(underscore_combo, replacement)
                    self.logger.debug(f"Replaced underscore pattern '{underscore_combo}' with '{replacement}'")

            # Three-word underscore chains: word_word_word → word word word
            response = re.sub(r'\b([a-z]+)_([a-z]+)_([a-z]+)\b', r'\1 \2 \3', response)

            # Any remaining underscore pattern: word_word → word word
            response = re.sub(r'\b([a-zA-Z]+)_([a-zA-Z]+)\b', r'\1 \2', response)
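
            # Illustrative effect of the two fallbacks above (hypothetical inputs):
            #   "steel/iron structure"  -> "iron structure"   (shorter word kept)
            #   "scene_analysis_result" -> "scene analysis result"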

            # Third priority: fix incomplete sentences
            incomplete_sentence_fixes = [
                (r'\bIn\s*,\s*', 'Throughout the area, '),
                (r'\bOverall,\s+exudes\b', 'Overall, the scene exudes'),
                (r'\bThe overall atmosphere of\s+is\b', 'The overall atmosphere'),
                (r'\bwith its lights turned illuminating\b', 'with its lights illuminating'),
            ]

            for pattern, replacement in incomplete_sentence_fixes:
                response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)

            # Fourth priority: grammar fixes (e.g., "persons" → "people")
            grammar_fixes = [
                (r'\b(\d+)\s+persons\b', r'\1 people'),
                (r'\bone\s+persons\b', 'one person'),
                (r'\btwo\s+persons\b', 'two people'),
                (r'\bthree\s+persons\b', 'three people'),
                (r'\bfour\s+persons\b', 'four people'),
                (r'\bfive\s+persons\b', 'five people'),
                (r'\bsix\s+persons\b', 'six people'),
                (r'\bseven\s+persons\b', 'seven people'),
                (r'\beight\s+persons\b', 'eight people'),
                (r'\bnine\s+persons\b', 'nine people'),
                (r'\bten\s+persons\b', 'ten people'),
                (r'\bmultiple\s+persons\b', 'multiple people'),
                (r'\bseveral\s+persons\b', 'several people'),
                (r'\bmany\s+persons\b', 'many people'),
                (r'\ba\s+few\s+persons\b', 'a few people'),
                (r'\bsome\s+persons\b', 'some people')
            ]

            for pattern, replacement in grammar_fixes:
                response = re.sub(pattern, replacement, response, flags=re.IGNORECASE)
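
            # e.g. (hypothetical) "3 persons wait nearby" -> "3 people wait nearby"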

            return response

        except Exception as e:
            self.logger.warning(f"Error in critical format preprocessing: {str(e)}")
            return response

    def _remove_system_markers(self, response: str) -> str:
        """移除系統樣式標記"""
        # 移除對話remark
        response = re.sub(r'<\|.*?\|>', '', response)

        # Remove output markers
        output_start = response.find("[OUTPUT_START]")
        output_end = response.find("[OUTPUT_END]")
        if output_start != -1 and output_end != -1 and output_end > output_start:
            response = response[output_start + len("[OUTPUT_START]"):output_end].strip()

        # Remove other section markers
        section_markers = [
            r'\[.*?\]',
            r'OUTPUT_START\s*:|OUTPUT_END\s*:',
            r'ENHANCED DESCRIPTION\s*:',
            r'Scene Type\s*:.*?(?=\n|$)',
            r'Original Description\s*:.*?(?=\n|$)',
            r'GOOD\s*:|BAD\s*:',
            r'PROBLEM\s*:.*?(?=\n|$)',
            r'</?\|(?:assistant|system|user)\|>',
            r'\(Note:.*?\)',
            r'\(.*?I\'ve.*?\)',
            r'\(.*?as per your request.*?\)'
        ]

        for marker in section_markers:
            response = re.sub(marker, '', response, flags=re.IGNORECASE)

        return response

    def _remove_introduction_prefixes(self, response: str) -> str:
        """移除介紹性前綴"""
        # 處理 "Here is..." 類型的prefix
        intro_prefixes = [
            r'^Here\s+is\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?scene\s+description.*?:\s*',
            r'^The\s+(?:rewritten\s+|enhanced\s+)?(?:scene\s+)?description\s+is.*?:\s*',
            r'^Here\'s\s+(?:a\s+|the\s+)?(?:rewritten\s+|enhanced\s+)?description.*?:\s*'
        ]

        for prefix_pattern in intro_prefixes:
            response = re.sub(prefix_pattern, '', response, flags=re.IGNORECASE)

        # Handle the fixed prefixes
        for prefix in self.prefixes_to_remove:
            if response.lower().startswith(prefix.lower()):
                response = response[len(prefix):].strip()

        return response

    def _remove_format_markers(self, response: str) -> str:
        """移除格式標記和上下文標籤(保留括號內的地理與細節資訊)"""
        # 移除上下文相關remark
        response = re.sub(r'<\s*Context:.*?>', '', response)
        response = re.sub(r'Context:.*?(?=\n|$)', '', response)
        response = re.sub(r'Note:.*?(?=\n|$)', '', response, flags=re.IGNORECASE)

        # Strip Markdown formatting
        response = re.sub(r'\*\*|\*|__|\|', '', response)

        # Remove any remaining special markers (parenthesized content is skipped
        # so useful information such as geographic details is not stripped)
        response = re.sub(r'</?\|.*?\|>', '', response)
        # NOTE: removal of entire parentheses and their contents is commented out
        # below to preserve geographic/location information
        # response = re.sub(r'\(.*?\)', '', response)

        return response


    def _clean_scene_type_references(self, response: str) -> str:
        """清理不當的場景類型引用"""
        scene_type_pattern = r'This ([a-zA-Z_]+) (features|shows|displays|contains)'
        match = re.search(scene_type_pattern, response)
        if match and '_' in match.group(1):
            fixed_text = f"This scene {match.group(2)}"
            response = re.sub(scene_type_pattern, fixed_text, response)

        return response

    def _normalize_punctuation(self, response: str) -> str:
        """標準化標點符號"""
        # 減少破折號使用
        response = re.sub(r'—', ', ', response)
        response = re.sub(r' - ', ', ', response)

        # Collapse runs of repeated punctuation
        response = re.sub(r'([.,;:!?])\1+', r'\1', response)

        # Fix sentences ending with a trailing comma
        response = re.sub(r',\s*$', '.', response)

        # Add missing spaces after sentence-ending punctuation
        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)

        # Clean extra whitespace and newlines
        response = response.replace('\r', ' ')
        response = re.sub(r'\n+', ' ', response)
        response = re.sub(r'\s{2,}', ' ', response)

        return response


    def _remove_duplicate_sentences(self, response: str, similarity_threshold: float = 0.85) -> str:
        """
        Remove duplicate or highly similar sentences, compared via Jaccard similarity.
        Args:
            response: Original response text.
            similarity_threshold: Similarity threshold (0.0 to 1.0) for treating
                                  sentences as duplicates. A higher threshold means
                                  sentences must be very similar to be removed.
        Returns:
            str: Text with duplicate sentences removed.
        """
        try:
            if not response or not response.strip():
                return ""

            # (?<=[.!?]) keeps the delimiter at the end of each sentence, and \s+
            # consumes the trailing whitespace, so joining with ' ' later leaves
            # exactly one space between punctuation and the next sentence.
            sentences = re.split(r'(?<=[.!?])\s+', response.strip())

            unique_sentences_data = []  # Tuples of (original_sentence, simplified_word_set)

            # Sentences with fewer simplified words than this are only treated as
            # duplicates when their word sets are identical.
            min_sentence_len_for_check = 8

            for sentence in sentences:
                sentence = sentence.strip()
                if not sentence:
                    continue

                # Build a simplified version for comparison (lowercase, punctuation
                # stripped, split into a word set). Digits are kept because they
                # can carry key information.
                simplified_text = re.sub(r'[^\w\s\d]', '', sentence.lower())
                current_sentence_words = set(simplified_text.split())

                if not current_sentence_words:  # Skip if nothing remains after simplification
                    continue

                is_duplicate = False
                replace_index = None  # Index of a kept sentence to swap for this longer one

                # Compare against the unique sentences kept so far
                for i, (kept_sentence_text, kept_sentence_words) in enumerate(unique_sentences_data):
                    # Jaccard index
                    intersection_len = len(current_sentence_words.intersection(kept_sentence_words))
                    union_len = len(current_sentence_words.union(kept_sentence_words))

                    if union_len == 0:  # Both sets empty: identical sentences
                        jaccard_similarity = 1.0
                    else:
                        jaccard_similarity = intersection_len / union_len

                    if jaccard_similarity < similarity_threshold:
                        continue

                    # Two very short sentences (e.g. "Yes." and "No.") are only
                    # merged when their word sets are identical.
                    if (len(current_sentence_words) < min_sentence_len_for_check
                            and len(kept_sentence_words) < min_sentence_len_for_check
                            and current_sentence_words != kept_sentence_words):
                        continue

                    if len(current_sentence_words) > len(kept_sentence_words):
                        # The current sentence is longer and highly similar, so it
                        # replaces the kept, shorter sentence.
                        self.logger.debug(f"Sentence \"{kept_sentence_text[:30]}...\" replaced by longer similar sentence \"{sentence[:30]}...\" Jaccard: {jaccard_similarity:.2f}")
                        replace_index = i
                    else:
                        # Shorter or equal length and highly similar (including exact
                        # duplicates): keep the first occurrence.
                        is_duplicate = True
                        self.logger.debug(f"Sentence \"{sentence[:30]}...\" marked duplicate (similar to \"{kept_sentence_text[:30]}...\") Jaccard: {jaccard_similarity:.2f}")
                    break

                if is_duplicate:
                    continue
                if replace_index is not None:
                    # Swap in place so sentence order is preserved
                    unique_sentences_data[replace_index] = (sentence, current_sentence_words)
                else:
                    unique_sentences_data.append((sentence, current_sentence_words))

            # Reassemble the unique sentences
            final_sentences = [s_data[0] for s_data in unique_sentences_data]

            # Make sure every sentence ends with punctuation (the split can leave
            # a trailing fragment without any)
            reconstructed_response = ""
            for i, s in enumerate(final_sentences):
                s = s.strip()
                if not s:
                    continue
                if s[-1] not in ".!?":
                    s += "."
                reconstructed_response += s
                if i < len(final_sentences) - 1:
                    reconstructed_response += " "  # Space between sentences

            return reconstructed_response.strip()

        except Exception as e:
            self.logger.error(f"Error in _remove_duplicate_sentences: {str(e)}")
            self.logger.error(traceback.format_exc())
            return response  # Return the original response on error

    def _handle_repetitive_vocabulary(self, response: str) -> str:
        """處理重複詞彙,使用 re.sub 和可呼叫的替換函數以提高效率和準確性。"""
        try:
            # 檢測重複模式 (僅警告)
            if hasattr(self, 'repetitive_patterns'):
                for pattern, issue in self.repetitive_patterns:
                    if re.search(pattern, response, re.IGNORECASE | re.DOTALL):
                        self.logger.warning(f"Text quality issue detected: {issue} in response: \"{response[:100]}...\"")

            if not hasattr(self, 'replacement_alternatives') or not self.replacement_alternatives:
                return response

            processed_response = response

            for word_to_replace, alternatives in self.replacement_alternatives.items():
                if not alternatives:  # Skip when no alternatives are available
                    continue

                # Give each word its own counter and alternative index,
                # encapsulated in a small stateful callable class
                class WordReplacer:
                    def __init__(self, alternatives_list):
                        self.count = 0
                        self.alternative_idx = 0
                        self.alternatives_list = alternatives_list

                    def __call__(self, match_obj):
                        self.count += 1
                        original_word = match_obj.group(0)
                        if self.count > 1:  # Replace from the second occurrence onward
                            replacement = self.alternatives_list[self.alternative_idx % len(self.alternatives_list)]
                            self.alternative_idx += 1
                            # Preserve the original casing
                            if original_word.isupper():
                                return replacement.upper()
                            elif original_word.istitle():
                                return replacement.capitalize()
                            return replacement
                        return original_word  # First occurrence stays unchanged

                replacer_instance = WordReplacer(alternatives)
                # \b ensures whole-word matches only
                pattern = re.compile(r'\b' + re.escape(word_to_replace) + r'\b', re.IGNORECASE)
                processed_response = pattern.sub(replacer_instance, processed_response)
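
            # Illustrative effect (hypothetical input): in "The sign is visible.
            # The lamp is visible. The door is visible.", the second "visible"
            # becomes "present" and the third "evident"; the first occurrence
            # is left unchanged.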

            # Remove repetitive descriptors such as "identical"
            identical_cleanup_patterns = [
                (r'\b(\d+)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
                (r'\b(two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)\s+identical\s+([a-zA-Z\s]+)', r'\1 \2'),
                (r'\bidentical\s+([a-zA-Z\s]+)', r'\1'),
                (r'\bcomprehensive arrangement of\b', 'arrangement of'),
                (r'\bcomprehensive view featuring\b', 'scene featuring'),
                (r'\bcomprehensive display of\b', 'display of'),
            ]

            for pattern, replacement in identical_cleanup_patterns:
                processed_response = re.sub(pattern, replacement, processed_response, flags=re.IGNORECASE)

            # Digit-to-word conversions
            number_conversions = {
                '2': 'two', '3': 'three', '4': 'four', '5': 'five', '6': 'six',
                '7': 'seven', '8': 'eight', '9': 'nine', '10': 'ten', 
                '11': 'eleven', '12': 'twelve'
            }

            # Convert digits appearing in various grammatical structures
            for digit, word in number_conversions.items():
                # Pattern 1: digit + plural noun (e.g. "7 chairs")
                pattern1 = rf'\b{digit}\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern1, rf'{word} \1', processed_response)

                # Pattern 2: digit + modifier + plural noun (e.g. "7 more chairs")
                pattern2 = rf'\b{digit}\s+(more|additional|other|identical)\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern2, rf'{word} \1 \2', processed_response, flags=re.IGNORECASE)

                # Pattern 3: digit + adjective + plural noun (e.g. "2 dining tables")
                pattern3 = rf'\b{digit}\s+([a-zA-Z]+)\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern3, rf'{word} \1 \2', processed_response)

                # Pattern 4: digit inside a prepositional phrase (e.g. "around 2 tables")
                pattern4 = rf'\b(around|approximately|about)\s+{digit}\s+([a-zA-Z]+s)\b'
                processed_response = re.sub(pattern4, rf'\1 {word} \2', processed_response, flags=re.IGNORECASE)
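
            # e.g. (hypothetical) "around 2 tables" -> "around two tables" and
            # "7 more chairs" -> "seven more chairs"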

            return processed_response

        except Exception as e:
            self.logger.error(f"Error in _handle_repetitive_vocabulary: {str(e)}")
            self.logger.error(traceback.format_exc())
            return response  # Return the original response on error

    def _ensure_grammatical_completeness(self, response: str) -> str:
        """
        Ensure grammatical completeness, fixing incomplete sentences and formatting issues.

        Args:
            response: Response text to check.

        Returns:
            str: Grammatically complete response text.
        """
        try:
            if not response or not response.strip():
                return response

            # Stage 1: detect and fix incomplete sentence patterns
            incomplete_patterns = [
                # Sentences that end right after a preposition (e.g. "over .")
                (r'\b(over|under|through|across|along|beneath|beyond|throughout)\s*\.', 'incomplete_preposition'),
                (r'\b(with|without|against|towards|beside|between|among)\s*\.', 'incomplete_preposition'),
                (r'\b(into|onto|upon|within|behind|below|above)\s*\.', 'incomplete_preposition'),

                # Missing words, as in "In ,"
                (r'\bIn\s*,', 'incomplete_location'),
                (r'\bAt\s*,', 'incomplete_location'),
                (r'\bOn\s*,', 'incomplete_location'),
                (r'\bWith\s*,', 'incomplete_context'),

                # Incomplete description patterns
                (r'\b(fine|the)\s+(the\s+)?(?:urban|area|scene)\b(?!\s+\w)', 'incomplete_description'),

                # Punctuation immediately after a conjunction or preposition
                (r'\b(and|or|but|with|from|in|at|on|by|for|to)\s*[.!?]', 'incomplete_conjunction'),

                # Repeated words
                (r'\b(\w+)\s+\1\b', 'word_repetition'),

                # Scene-type references left in underscore format (e.g. "urban_intersection")
                (r'\b(\w+)_(\w+)\b', 'underscore_format'),

                # Landmark-scene-specific issues
                (r'\btourist_landmark\b', 'underscore_format'),
                (r'\burban_intersection\b', 'underscore_format'),
                (r'\bIn\s*,\s*(?=\w)', 'incomplete_prepositional'),
                (r'\bOverall,\s+(?=exudes|shows|displays)(?!\s+(?:the|this|it))', 'missing_subject'),
                (r'\batmosphere of\s+is one of\b', 'redundant_structure'),
                (r'\bwith.*?turned\s+illuminating\b', 'redundant_participle')
            ]
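
            # e.g. (hypothetical) "In , the plaza buzzes with activity." loses the
            # dangling "In ," via the incomplete_location rule; later passes
            # re-capitalize and re-punctuate the remainder.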

            for pattern, issue_type in incomplete_patterns:
                try:
                    matches = list(re.finditer(pattern, response, re.IGNORECASE))

                    for match in matches:
                        if issue_type == 'incomplete_preposition':
                            # Handle sentences that end right after a preposition
                            response = self._fix_incomplete_preposition(response, match)

                        elif issue_type == 'underscore_format':
                            # Convert underscore format to space-separated words
                            original = match.group(0)
                            replacement = original.replace('_', ' ')
                            response = response.replace(original, replacement)

                        elif issue_type == 'word_repetition':
                            # Collapse the repeated word; match.group(0) is the exact
                            # matched text, so casing and spacing are handled correctly
                            response = response.replace(match.group(0), match.group(1))

                        elif issue_type == 'incomplete_location' or issue_type == 'incomplete_context':
                            # Remove incomplete location or context references
                            response = response.replace(match.group(0), '')

                        elif issue_type == 'incomplete_prepositional':
                            # Fix incomplete prepositional phrases
                            response = re.sub(r'\bIn\s*,\s*', 'Throughout the scene, ', response)

                        elif issue_type == 'missing_subject':
                            # Add a subject to "Overall" sentences
                            response = re.sub(r'\bOverall,\s+(?=exudes)', 'Overall, the scene ', response)

                        elif issue_type == 'redundant_structure':
                            # Simplify the redundant structure
                            response = re.sub(r'\batmosphere of\s+is one of\b', 'atmosphere is one of', response)

                        elif issue_type == 'redundant_participle':
                            # Clean up the redundant participle
                            response = re.sub(r'turned\s+illuminating', 'illuminating', response)

                        else:
                            # Other incomplete patterns are removed outright
                            response = response.replace(match.group(0), '')

                    # Clean up extra spaces
                    response = re.sub(r'\s{2,}', ' ', response).strip()

                except re.error as e:
                    self.logger.warning(f"Regular expression pattern error for {issue_type}: {pattern} - {str(e)}")
                    continue

            # Stage 2: clean object-class formatting issues
            response = self._clean_object_class_references(response)

            # Stage 3: ensure sentences end properly
            response = self._ensure_proper_sentence_ending(response)

            # Stage 4: final grammar check
            response = self._final_grammar_check(response)

            return response.strip()

        except Exception as e:
            self.logger.error(f"Error in _ensure_grammatical_completeness: {str(e)}")
            return response

    def _fix_incomplete_preposition(self, response: str, match) -> str:
        """
        Fix an incomplete prepositional phrase.

        Args:
            response: Response text.
            match: Regex match object.

        Returns:
            str: The corrected response.
        """
        match_start = match.start()

        # Find where the sentence starts
        sentence_start = response.rfind('.', 0, match_start)
        sentence_start = sentence_start + 1 if sentence_start != -1 else 0

        # Extract the sentence fragment
        sentence_fragment = response[sentence_start:match_start].strip()

        # If the fragment is meaningful, drop only the incomplete prepositional part
        if len(sentence_fragment) > 10:
            # Remove the preposition and everything after it, then close with a period
            response = response[:match_start].rstrip() + '.'
        else:
            # If the fragment is too short, remove the entire incomplete sentence
            response = response[:sentence_start] + response[match.end():]

        return response

    def _clean_object_class_references(self, response: str) -> str:
        """
        Clean up formatting problems in object-class references.

        Args:
            response: Response text.

        Returns:
            str: The cleaned response.
        """
        # Remove class-ID references (e.g. "unknown-class 2", "Class 0")
        class_id_patterns = [
            r'\bunknown[- ]?class\s*\d+\s*objects?',
            r'\bclass[- ]?\d+\s*objects?',
            r'\b[Cc]lass\s*\d+\s*objects?',
            r'\bunknown[- ][Cc]lass\s*\d+\s*objects?'
        ]

        for pattern in class_id_patterns:
            try:
                # Replace with a more natural description
                response = re.sub(pattern, 'objects', response, flags=re.IGNORECASE)
            except re.error as e:
                self.logger.warning(f"Error cleaning class reference pattern {pattern}: {str(e)}")
                continue

        # Fix issues in quantity descriptions
        response = re.sub(r'\b(\w+)\s+unknown[- ]?\w*\s*objects?', r'\1 objects', response, flags=re.IGNORECASE)
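
        # e.g. (hypothetical) "three unknown-class 2 objects" -> "three objects"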

        return response

    def _ensure_proper_sentence_ending(self, response: str) -> str:
        """
        Ensure sentences end properly.

        Args:
            response: Response text.

        Returns:
            str: Response with a proper sentence ending.
        """
        if not response or not response.strip():
            return response

        response = response.strip()

        # Check whether the text ends with punctuation
        if response and response[-1] not in ['.', '!', '?']:

            # Common prepositions and conjunctions
            problematic_endings = [
                "into", "onto", "about", "above", "across", "after", "along", "around",
                "at", "before", "behind", "below", "beneath", "beside", "between",
                "beyond", "by", "down", "during", "except", "for", "from", "in",
                "inside", "near", "of", "off", "on", "over", "through", "to",
                "toward", "under", "up", "upon", "with", "within", "and", "or", "but"
            ]

            words = response.split()
            if words:
                last_word = words[-1].lower().rstrip('.,!?')

                if last_word in problematic_endings:
                    # Find the last complete sentence
                    last_period_pos = max(
                        response.rfind('.'),
                        response.rfind('!'),
                        response.rfind('?')
                    )

                    if last_period_pos > len(response) // 2:  # A complete sentence ends reasonably close by
                        response = response[:last_period_pos + 1]
                    else:
                        # Drop the problematic word and add a period
                        if len(words) > 1:
                            response = " ".join(words[:-1]) + "."
                        else:
                            response = "The scene displays various elements."
                else:
                    # Normal case: just add a period
                    response += "."

        return response
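
    # e.g. (hypothetical) "The cat sits on" has no earlier complete sentence, so
    # the dangling "on" is dropped and a period appended: "The cat sits."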

    def _final_grammar_check(self, response: str) -> str:
        """
        Final grammar check and cleanup.

        Args:
            response: Response text.

        Returns:
            str: The final cleaned response.
        """
        if not response:
            return response

        # Collapse repeated punctuation
        response = re.sub(r'([.!?]){2,}', r'\1', response)

        # Remove spaces before punctuation
        response = re.sub(r'\s+([.!?])', r'\1', response)

        # Add missing spaces after sentence-ending punctuation
        response = re.sub(r'([.!?])([A-Z])', r'\1 \2', response)

        # Ensure the first letter is capitalized
        if response and response[0].islower():
            response = response[0].upper() + response[1:]

        # Remove extra spaces
        response = re.sub(r'\s{2,}', ' ', response)

        # Handle empty or overly short responses
        if len(response.strip()) < 20:
            return "The scene contains various visual elements."

        return response.strip()

    def _control_word_length(self, response: str) -> str:
        """控制文字長度在合理範圍內"""
        words = response.split()
        if len(words) > 200:
            # Find a sentence boundary near the word limit
            truncated = ' '.join(words[:200])
            last_period = max(truncated.rfind('.'), truncated.rfind('!'), truncated.rfind('?'))

            if last_period > 0:
                response = truncated[:last_period+1]
            else:
                response = truncated + "."

        return response

    def _final_formatting(self, response: str) -> str:
        """最終格式化處理"""
        # 確保首字母大寫
        if response and response[0].islower():
            response = response[0].upper() + response[1:]

        # Collapse everything into a single paragraph
        response = re.sub(r'\s*\n\s*', ' ', response)
        response = ' '.join(response.split())

        return response.strip()

    def _recover_from_overcleaning(self, original_response: str) -> str:
        """從過度清理中恢復內容"""
        try:
            # 嘗試從原始回應中找到最佳段落
            paragraphs = [p for p in original_response.split('\n\n') if p.strip()]
            if paragraphs:
                # Use the longest paragraph as the main description
                best_para = max(paragraphs, key=len)
                # Apply basic cleaning rules
                best_para = re.sub(r'\[.*?\]', '', best_para)
                best_para = re.sub(r'\s{2,}', ' ', best_para).strip()

                if len(best_para) >= 40:
                    return best_para

            return "Unable to generate a valid enhanced description."

        except Exception as e:
            self.logger.error(f"Recovery from overcleaning failed: {str(e)}")
            return "Description generation error."

    def _validate_cleaned_response(self, response: str):
        """驗證清理後的回應"""
        if not response:
            raise ResponseProcessingError("Response is empty after cleaning")

        if len(response.strip()) < 20:
            raise ResponseProcessingError("Response is too short after cleaning")

        # Check for basic sentence structure
        if not re.search(r'[.!?]', response):
            raise ResponseProcessingError("Response lacks proper sentence structure")

    def remove_explanatory_notes(self, response: str) -> str:
        """
        Remove explanatory notes and commentary.

        Args:
            response: Response that may contain notes.

        Returns:
            str: Response with the notes removed.
        """
        try:
            # Common note and explanation patterns
            note_patterns = [
                r'(?:^|\n)Note:.*?(?:\n|$)',
                r'(?:^|\n)I have (?:followed|adhered to|ensured).*?(?:\n|$)',
                r'(?:^|\n)This description (?:follows|adheres to|maintains).*?(?:\n|$)',
                r'(?:^|\n)The enhanced description (?:maintains|preserves).*?(?:\n|$)'
            ]

            # Split into paragraphs
            paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]

            # With a single paragraph, clean it in place
            if len(paragraphs) == 1:
                for pattern in note_patterns:
                    paragraphs[0] = re.sub(pattern, '', paragraphs[0], flags=re.IGNORECASE)
                return paragraphs[0].strip()

            # With multiple paragraphs, drop the note paragraphs
            content_paragraphs = []
            for paragraph in paragraphs:
                is_note = False
                for pattern in note_patterns:
                    if re.search(pattern, paragraph, flags=re.IGNORECASE):
                        is_note = True
                        break

                # Check whether the paragraph opens with a common note phrase
                if paragraph.lower().startswith(('note:', 'please note:', 'remember:')):
                    is_note = True

                if not is_note:
                    content_paragraphs.append(paragraph)

            return '\n\n'.join(content_paragraphs).strip()

        except Exception as e:
            self.logger.error(f"Failed to remove explanatory notes: {str(e)}")
            return response

    def get_processor_info(self) -> Dict[str, Any]:
        """
        Get processor information.

        Returns:
            Dict[str, Any]: Processor status and configuration details.
        """
        return {
            "replacement_alternatives_count": len(self.replacement_alternatives),
            "prefixes_to_remove_count": len(self.prefixes_to_remove),
            "suffixes_to_remove_count": len(self.suffixes_to_remove),
            "repetitive_patterns_count": len(self.repetitive_patterns),
            "initialization_status": "success"
        }
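

# Minimal usage sketch (illustrative; raw_output below is a hypothetical model
# response, not real data):
if __name__ == "__main__":
    processor = ResponseProcessor()
    raw_output = (
        "Here's the enhanced description: The urban_intersection shows "
        "busy/quiet traffic near a tourist_landmark. Several persons are "
        "crossing the street."
    )
    cleaned = processor.clean_response(raw_output, model_type="llama")
    print(cleaned)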