[ { "name": "ascii_art_30", "score": 0.07142857142857142, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "humor_explanation", "score": 0.3066666666666667, "eval_type": "llm", "num_demo": 1, "num_query": 15 }, { "name": "science_figure_explanation", "score": 0.5000000000000001, "eval_type": "llm", "num_demo": 1, "num_query": 29 }, { "name": "vibe_eval_phrase", "score": 0.43571428571428567, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "traffic_accident_analysis", "score": 0.042857142857142864, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "figurative_speech_explanation", "score": 0.6137931034482759, "eval_type": "llm", "num_demo": 1, "num_query": 29 }, { "name": "table2latex_complex", "score": 0.43333333333333335, "eval_type": "llm", "num_demo": 1, "num_query": 9 }, { "name": "unusual_images", "score": 0.5689655172413793, "eval_type": "llm", "num_demo": 1, "num_query": 29 }, { "name": "art_explanation", "score": 0.3068965517241379, "eval_type": "llm", "num_demo": 1, "num_query": 29 }, { "name": "ocr_open_ended_qa", "score": 0.6896551724137931, "eval_type": "llm", "num_demo": 1, "num_query": 29 }, { "name": "bar_chart_interpretation", "score": 0.39655172413793105, "eval_type": "llm", "num_demo": 1, "num_query": 29 }, { "name": "scibench_w_solution_open_ended", "score": 0.17, "eval_type": "llm", "num_demo": 1, "num_query": 25 }, { "name": "GUI_Chat_Hard", "score": 0.6153846153846154, "eval_type": "llm", "num_demo": 1, "num_query": 26 }, { "name": "image_humor_understanding", "score": 0.5793103448275864, "eval_type": "llm", "num_demo": 1, "num_query": 29 }, { "name": "defeasible_reasoning", "score": 0.5827586206896552, "eval_type": "llm", "num_demo": 1, "num_query": 29 }, { "name": "funny_image_title", "score": 0.6071428571428571, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "tweets_captioning", "score": 0.2928571428571428, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "graph_interpretation", "score": 0.503448275862069, "eval_type": "llm", "num_demo": 1, "num_query": 29 }, { "name": "meme_explain", "score": 0.07142857142857142, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "guess_image_generation_prompt", "score": 0.6736842105263158, "eval_type": "llm", "num_demo": 1, "num_query": 19 }, { "name": "visualization_with_code", "score": 0.27142857142857146, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "iq_test_open_ended", "score": 0.2827586206896551, "eval_type": "llm", "num_demo": 1, "num_query": 29 }, { "name": "electrocardiogram", "score": 0.1642857142857143, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "image_captioning_with_additional_requirements", "score": 0.742857142857143, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "docci_image_description_long", "score": 0.4642857142857143, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "GUI_Chat_Easy", "score": 0.6884615384615385, "eval_type": "llm", "num_demo": 1, "num_query": 26 }, { "name": "bridge_strategies_advanced", "score": 0.10714285714285714, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "bridge_strategies_worldclass", "score": 0.014285714285714287, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "bridge_strategies_expert", "score": 0.2928571428571428, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "multi_lingual_Ruozhiba_expalnation_Spanish", "score": 0.07857142857142858, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "multi_lingual_Ruozhiba_expalnation_Japanese", "score": 0.09285714285714285, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "multi_lingual_Ruozhiba_expalnation_French", "score": 0.11428571428571428, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "multi_lingual_Ruozhiba_expalnation_Arabic", "score": 0.05714285714285715, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "multi_lingual_Ruozhiba_expalnation_Russian", "score": 0.03571428571428571, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "multi_lingual_Ruozhiba_expalnation_English", "score": 0.03571428571428571, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "table_understanding_fetaqa", "score": 0.35000000000000003, "eval_type": "llm", "num_demo": 1, "num_query": 14 }, { "name": "red_teaming_celebrity", "score": 0.8300000000000001, "eval_type": "llm", "num_demo": 0, "num_query": 20 }, { "name": "red_teaming_captcha", "score": 0.10000000000000003, "eval_type": "llm", "num_demo": 1, "num_query": 19 }, { "name": "red_teaming_jailbreak", "score": 0.6700000000000002, "eval_type": "llm", "num_demo": 0, "num_query": 20 }, { "name": "red_teaming_visualmisleading", "score": 0.8789473684210528, "eval_type": "llm", "num_demo": 1, "num_query": 19 }, { "name": "red_teaming_racial", "score": 0.675, "eval_type": "llm", "num_demo": 0, "num_query": 20 }, { "name": "red_teaming_politics", "score": 0.6499999999999999, "eval_type": "llm", "num_demo": 0, "num_query": 20 }, { "name": "brand_logo_recognition_and_elaboration", "score": 0.34, "eval_type": "rule", "num_demo": 1, "num_query": 25 }, { "name": "exchange_rate_estimate_plot", "score": 0.922935714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "math_parity", "score": 0.4666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "traffic_future_prediction_from_line_plot", "score": 0.5039473684210526, "eval_type": "rule", "num_demo": 1, "num_query": 19 }, { "name": "graph_chordless_cycle", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "youtube_video_info_parsing", "score": 0.3809523809523809, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "super_clevr_scene_understanding", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "figureqa", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "face_keypoint_detection", "score": 0.848854419078294, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "widerface_face_count_and_event_classification", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "average_humidity_estimate_plot", "score": 0.5680000000000001, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "weather_info_parsing", "score": 0.20634920634920634, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "egocentric_analysis_single_image", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 9 }, { "name": "waybill_number_sequence_extraction", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "graph_maxflow", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "TV_show_info_parsing", "score": 0.5634920634920635, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "insect_order_classification", "score": 0.13333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "electricity_plot_future_prediction", "score": 0.4795157894736843, "eval_type": "rule", "num_demo": 1, "num_query": 19 }, { "name": "chemistry_exams_v", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "finance_table_understanding", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "funsd_document_qa", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "vibe_eval_open", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "question_solution_solving", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "graph_theory", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "geometry_analytic", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "geometry_length", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "algebra", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "chess_puzzle_single_step", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "chess_winner_identification", "score": 0.3333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "physical_property_reasoning", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "humor_understand_caption_match", "score": 0.13333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "coco_object_detection_by_query_property", "score": 0.45080300212210156, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "multilingual_game_info_parsing", "score": 0.375, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "mnist_pattern", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "dvqa", "score": 0.9473684210526315, "eval_type": "rule", "num_demo": 1, "num_query": 19 }, { "name": "physics_exams_v", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "snli_ve_visual_entailment", "score": 0.7333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "3d_indoor_scene_text_bbox_selection", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "geometry_descriptive", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "top_rated_hotel_identification", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "science_molecule_chemistry", "score": 0.5333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "game_info_parsing", "score": 0.7857142857142856, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "deciphering_oracle_bone", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "signboard_identification", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "image_style_recognition", "score": 1.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "math_convexity_value_estimation", "score": 0.3504934034834792, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "3d_indoor_scene_text_bbox_prediction", "score": 0.047217819481634095, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "movie_info_parsing", "score": 0.36607142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "human_relationship_reasoning", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "graph_shortest_path_kamada_kawai", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "coco_person_detection", "score": 0.5004101034934384, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "chart_vqa", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "nlvr2_two_image_compare_qa", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "math_exams_v", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "newspaper_ocr_in_query_box", "score": 0.26666666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "mvsa_sentiment_classification", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "egocentric_spatial_reasoning", "score": 0.5555555555555556, "eval_type": "rule", "num_demo": 1, "num_query": 9 }, { "name": "graph_isomorphism", "score": 0.5333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "code_programming_test_easy", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 24 }, { "name": "biology_exams_v", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "long_string_number_recognition", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "kvqa_knowledge_aware_qa", "score": 0.5263157894736842, "eval_type": "rule", "num_demo": 1, "num_query": 19 }, { "name": "math_breakpoint", "score": 0.7333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "landmark_recognition_and_qa", "score": 0.24444444444444446, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "map_diagram_qa", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "pmc_vqa_medical_image_qa", "score": 0.47368421052631576, "eval_type": "rule", "num_demo": 1, "num_query": 19 }, { "name": "newspaper_page_parse_and_count", "score": 0.28888888888888886, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "science_basic_physics", "score": 0.7333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "electricity_future_prediction_from_table", "score": 0.7136842105263157, "eval_type": "rule", "num_demo": 1, "num_query": 19 }, { "name": "license_plate_recognition", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "places365_scene_type_classification", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "music_info_parsing", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "multilingual_movie_info_parsing", "score": 0.26530612244897955, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "iconqa_count_and_reasoning", "score": 0.631578947368421, "eval_type": "rule", "num_demo": 1, "num_query": 19 }, { "name": "graph_connectivity", "score": 0.16666666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "graph_shortest_path_planar", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "famous_building_recognition", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 16 }, { "name": "geometry_transformation", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "long_string_letter_recognition", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "handwritten_math_expression_extraction", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "geometry_solid", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "animal_pose_estimation", "score": 0.20255737352831932, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "single_person_pose_estimation", "score": 0.11257002137534133, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "geometry_area", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "hotel_booking_confirmation_parsing", "score": 0.1928571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ili_ratio_future_prediction", "score": 0.08607142857142856, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "electricity_load_estimate_plot", "score": 0.5109285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "tqa_textbook_qa", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "stock_info_parsing", "score": 0.6890756302521008, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "quizlet_question_solving", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "stock_price_future_prediction", "score": 0.4992142857142858, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "Ad_count_detection", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "recover_masked_word_in_figure", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "polygon_interior_angles", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "web_action_grounding", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "latex_complex_formula_convertion", "score": 0.058823529411764705, "eval_type": "rule", "num_demo": 1, "num_query": 17 }, { "name": "transit_map_intersection_points", "score": 0.11607142857142858, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "arxiv_vqa", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "medical_image_artifacts_indentification", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "song_title_identification_from_lyrics", "score": 0.10714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "actor_recognition_in_Movie", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "bongard_problem", "score": 0.15789473684210525, "eval_type": "rule", "num_demo": 1, "num_query": 19 }, { "name": "ascii_art_understanding", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "calendar_schedule_suggestion", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "geometry_reasoning_overlapped_circle", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "planning_screenshot_barman", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "planning_screenshot_floortile", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "medical_blood_vessels_recognition", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "location_vqa", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "mindmap_elements_parsing", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "mensa_iq_test", "score": 0.25490196078431376, "eval_type": "rule", "num_demo": 1, "num_query": 17 }, { "name": "flowchart_code_generation", "score": 0.4444444444444444, "eval_type": "rule", "num_demo": 1, "num_query": 9 }, { "name": "stackoverflow_debug_QA", "score": 0.4523809523809524, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "logical_reasoning_find_odd_one_out", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "web_action_prediction", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "code_execution", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 16 }, { "name": "music_sheet_format_QA", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "annoying_word_search", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "interpret_force_perspective_illusion", "score": 0.6, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "healthcare_info_judgement", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "geometry_plot_position_relationship", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "relative_depth_of_different_points", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "topological_sort", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "scibench_fundamental_wo_solution", "score": 0.10204081632653061, "eval_type": "rule", "num_demo": 1, "num_query": 49 }, { "name": "geometry_reasoning_nested_squares", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "font_recognition", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "geometry_reasoning_count_line_intersections", "score": 0.4642857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "circuit_diagram_understanding", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "go_capture_stone", "score": 0.06666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "monthly_weather_days_count", "score": 0.16666666666666666, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "weather_map_climate_type_temperature_parsing", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "top_video_creator_identification", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "rebus", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 23 }, { "name": "ishihara_test", "score": 0.21428571428571433, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "paper_vqa", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "signage_navigation", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "webpage_code_understanding", "score": 0.5555555555555556, "eval_type": "rule", "num_demo": 1, "num_query": 9 }, { "name": "medical_counting_lymphocytes", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "game_platform_support_identification", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "GUI_Act_Mobile_swipe", "score": 0.5724262941510554, "eval_type": "rule", "num_demo": 1, "num_query": 13 }, { "name": "mahjong", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "scibench_calculus_wo_solution", "score": 0.12244897959183673, "eval_type": "rule", "num_demo": 1, "num_query": 49 }, { "name": "knowledge_graph_understanding", "score": 0.2, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "image_translation_en2cn", "score": 0.12162170301105525, "eval_type": "rule", "num_demo": 1, "num_query": 9 }, { "name": "realworld_qa_en2cn", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "planning_visual_storage", "score": 0.06666666666666667, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "GUI_Act_Web_Multi", "score": 0.3579909577059959, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "chinese_idiom_recognition", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "number_comparison", "score": 0.7142857142857143, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "planning_screenshot_blocksworld", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "product_ocr_qa", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "geometry_reasoning_circled_letter", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "GUI_Act_Web_Single", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "extract_webpage_headline", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "planning_screenshot_storage", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "soccer_offside", "score": 0.2222222222222222, "eval_type": "rule", "num_demo": 1, "num_query": 9 }, { "name": "geometry_reasoning_grid", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "relative_reflectance_of_different_regions", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "entertainment_web_game_style", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "orchestra_score_recognition", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "icon_arithmetic_puzzle", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "planning_screenshot_grippers", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "MMMU_pro_exam_screenshot", "score": 0.26262626262626265, "eval_type": "rule", "num_demo": 1, "num_query": 99 }, { "name": "clevrer_physics", "score": 0.2, "eval_type": "rule", "num_demo": 1, "num_query": 20 }, { "name": "MMMU_physics_chemistry_selected", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "planning_screenshot_tyreworld", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "music_sheet_note_count", "score": 0.058823529411764705, "eval_type": "rule", "num_demo": 1, "num_query": 17 }, { "name": "hashtag_recommendation", "score": 0.7035714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "llavaguard", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "medical_multi_organ_segmentation_rater", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "cultural_vqa", "score": 0.2, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "logical_reasoning_fit_pattern", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "character_recognition_in_TV_shows", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "highest_discount_game_price_identification", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "remaining_playback_time_calculation", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "medical_cell_recognition", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "chess_find_legal_moves", "score": 0.03355324641748354, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "distinguish_ai_generated_image", "score": 0.5263157894736842, "eval_type": "rule", "num_demo": 1, "num_query": 19 }, { "name": "autonomous_driving_scene_analysis", "score": 0.8571428571428571, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "counting_single_image", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "GUI_Act_Mobile_tap", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "road_map_find_highway_between_two_place", "score": 0.4117647058823529, "eval_type": "rule", "num_demo": 1, "num_query": 17 }, { "name": "chess_sygyzy_endgames", "score": 0.10884353741496598, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "planning_screenshot_termes", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "multiple_states_identify_asia", "score": 0.028571428571428574, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "multiple_states_identify_africa", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "multiple_states_identify_europe", "score": 0.05714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "multiple_states_identify_americas", "score": 0.11428571428571428, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "adapted_cvbench_distance", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "adapted_cvbench_count", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "adapted_cvbench_depth", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "adapted_cvbench_relation", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "symbolic_graphics_programs_computer_aided_design", "score": 0.2857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "symbolic_graphics_programs_scalable_vector_graphics", "score": 0.2222222222222222, "eval_type": "rule", "num_demo": 1, "num_query": 18 }, { "name": "table_understanding_complex_question_answering", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "table_understanding_fact_verification", "score": 0.7261904761904762, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "panel_images_multi_question", "score": 0.4047619047619047, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "panel_images_single_question", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "MMSoc_Misinformation_GossipCop", "score": 0.5714285714285714, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "MMSoc_HatefulMemes", "score": 0.7857142857142857, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "MMSoc_Memotion", "score": 0.6705882352941177, "eval_type": "rule", "num_demo": 1, "num_query": 17 }, { "name": "MMSoc_Misinformation_PolitiFact", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "poetry_acrostic_alliteration", "score": 0.0, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "poetry_acrostic", "score": 0.06666666666666667, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "poetry_limerick", "score": 0.06666666666666667, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "poetry_custom_rhyming_scheme", "score": 0.0, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "poetry_petrarchian_sonnet_optional_meter", "score": 0.0, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "poetry_haiku", "score": 0.0, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "poetry_shakespearean_sonnet", "score": 0.0, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "screenshot_lighteval_math", "score": 0.13333333333333333, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "screenshot_theoremqa", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "number_puzzle_sudoku", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "number_puzzle_kakuro_5x5", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "text_entity_replace", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "background_change", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "face_attribute_edit", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "face_swap", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "text_style", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "out_of_context", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "clip_stable_diffusion_generate", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "veracity", "score": 0.35714285714285715, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "counterfactual_arithmetic", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "maze_2d_8x8", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "shape_composition_shapes", "score": 0.2738095238095238, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "shape_composition_colours", "score": 0.2970521541950113, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "autorater_artifact", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "autorater_artifact_reason", "score": 0.26666666666666666, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "chess_puzzles_crushing", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "chess_puzzles_checkmate", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "chess_puzzles_equality", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "app_layout_understanding_notes", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "app_layout_understanding_twitter", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "app_layout_understanding_youtube", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "app_layout_understanding_tiktok", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "app_layout_understanding_excel", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "app_layout_understanding_amazon", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "app_layout_understanding_instagram", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "app_layout_understanding_zoom", "score": 0.2, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "app_layout_understanding_word", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "app_layout_understanding_iphone_settings", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "app_layout_understanding_leetcode", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "app_layout_understanding_ppt", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "app_layout_understanding_alipay", "score": 0.11764705882352941, "eval_type": "rule", "num_demo": 1, "num_query": 17 }, { "name": "ocr_table_to_markdown", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ocr_table_to_latex", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ocr_resume_employer_plain", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ocr_article_journal", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ocr_resume_experience_plain", "score": 0.21428571428571427, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ocr_math_text_latex", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ocr_article_authors", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ocr_table_to_csv", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ocr_math_equation", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ocr_resume_school_plain", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ocr_table_to_html", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "ocr_resume_skill_plain", "score": 0.42857142857142855, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "crossword_mini_5x5", "score": 0.0071428571428571435, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "contain_position_length", "score": 0.26666666666666666, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "contain_repeat_length", "score": 0.2, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "contain_length", "score": 0.6, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "contain_contain_length", "score": 0.8, "eval_type": "rule", "num_demo": 0, "num_query": 15 }, { "name": "pictionary_skribbl_io", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 20 }, { "name": "pictionary_doodle_guess", "score": 0.2, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "pictionary_genai_output_chinese", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "pictionary_cartoon_drawing_guess", "score": 0.5, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "pictionary_chinese_food_img2en", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "reward_models_i2t_reward", "score": 0.6428571428571429, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "memorization_chinese_celebrity", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "memorization_papers", "score": 0.0, "eval_type": "rule", "num_demo": 1, "num_query": 15 }, { "name": "memorization_famous_treaty", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "memorization_indian_celebrity", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "research_website_parsing_blogpost", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "research_website_parsing_publication", "score": 0.07142857142857142, "eval_type": "rule", "num_demo": 1, "num_query": 14 }, { "name": "research_website_parsing_homepage", "score": 0.14285714285714285, "eval_type": "rule", "num_demo": 1, "num_query": 14 } ]