import time
import json
import numpy as np

import streamlit as st
from pathlib import Path
from collections import defaultdict

import sys
path_root = Path("./")
sys.path.append(str(path_root))


st.set_page_config(page_title="PSC Runtime",
                   page_icon='🌸', layout="centered")
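
# This page loads cached GPT-3.5 reranking runs for TREC DL19, aggregates them
# into a single ranking, and displays the result colour-coded by qrels relevance.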

# cola, colb, colc = st.columns([5, 4, 5])

# colaa, colbb, colcc = st.columns([1, 8, 1])
# with colbb:
#     runtime = st.select_slider(
#         'Select a runtime type',
#         options=['PyTorch', 'ONNX Runtime'])
#     st.write('Now using: ', runtime)


# colaa, colbb, colcc = st.columns([1, 8, 1])
# with colbb:
#     encoder = st.select_slider(
#         'Select a query encoder',
#         options=['uniCOIL', 'SPLADE++ Ensemble Distil', 'SPLADE++ Self Distil'])
#     st.write('Now Running Encoder: ', encoder)

# if runtime == 'PyTorch':
#     runtime = 'pytorch'
#     runtime_index = 1
# else:
#     runtime = 'onnx'
#     runtime_index = 0


col1, col2 = st.columns([9, 1])
with col1:
    search_query = st.text_input(label="search query", placeholder="Search")

with col2:
    st.write('#')
    button_clicked = st.button("🔎")


import torch

# Load the cached reranking runs. The structure below is inferred from how the
# file is unpacked: element 2 holds one entry per query, and each entry is a
# list of per-run dicts with "query" and "hits" keys.
fn = "dl19-gpt-3.5.pt"
cached_runs = torch.load(fn)
outputs = cached_runs[2]

# Group the runs by query text: query2outputs[query] is a list of hit lists,
# one per run of that query.
query2outputs = {}
for output in outputs:
    all_queries = {x['query'] for x in output}
    assert len(all_queries) == 1, "each group of runs should cover exactly one query"
    query = list(all_queries)[0]
    query2outputs[query] = [x['hits'] for x in output]

# The demo always shows the first query in sorted order, regardless of the text box.
search_query = sorted(query2outputs)[0]


def preferences_from_hits(list_of_hits):
    """Map each run's hit list to a row of internal doc ids for rank aggregation."""
    docid2id = {}
    id2doc = {}
    preferences = []

    for result in list_of_hits:
        for doc in result:
            if doc["docid"] not in docid2id:
                internal_id = len(docid2id)
                docid2id[doc["docid"]] = internal_id
                id2doc[internal_id] = doc
        preferences.append([docid2id[doc["docid"]] for doc in result])

    return np.array(preferences), id2doc
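
# Example: with 20 runs of the same k hits, `preferences` has shape (20, k) and
# preferences[i][j] is the internal id of the document ranked j-th by run i.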


def load_qrels(name):
    import ir_datasets
    if name == "dl19":
        ds_name = "msmarco-passage/trec-dl-2019/judged"
    elif name == "dl20":
        ds_name = "msmarco-passage/trec-dl-2020/judged"
    else:
        raise ValueError(name)

    dataset = ir_datasets.load(ds_name)
    qrels = defaultdict(dict)
    for qrel in dataset.qrels_iter():
        qrels[qrel.query_id][qrel.doc_id] = qrel.relevance
    return qrels
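
# ir_datasets downloads and caches the qrels on first use, so the first call may
# need network access.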


def aggregate(list_of_hits):
    """Aggregate multiple hit lists into one ranking via Kemeny-optimal aggregation."""
    from permsc import KemenyOptimalAggregator
    # from permsc import BordaRankAggregator  # faster approximate alternative

    preferences, id2doc = preferences_from_hits(list_of_hits)
    y_optimal = KemenyOptimalAggregator().aggregate(preferences)
    # y_optimal = BordaRankAggregator().aggregate(preferences)

    return [id2doc[doc_id] for doc_id in y_optimal]
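
# Kemeny-optimal aggregation returns the permutation minimizing the total Kendall
# tau distance to the input rankings; Borda count is a cheaper approximation for
# long hit lists.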

aggregated_ranking = aggregate(query2outputs[search_query])
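# NOTE: the qrels split ("dl19") must match the cached run file loaded above.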
qrels = load_qrels("dl19")

col1, col2 = st.columns([5, 5])

with col2:
    if search_query or button_clicked:

        # Render the aggregated ranking (swap in query2outputs[search_query][0]
        # to inspect the first of the individual runs instead).
        search_results = aggregated_ranking

        st.write(
            f'<p align=\"right\" style=\"color:grey;\">Aggregated ranking for query [{search_query}]</p>', unsafe_allow_html=True)

        # All hits for a query must share the same TREC query id.
        qid = {result["qid"] for result in search_results}
        assert len(qid) == 1
        qid = list(qid)[0]
    
        for i, result in enumerate(search_results):
            result_id = result["docid"]
            contents = result["content"]

            # Colour-code each result by its graded qrels relevance (3 is the
            # highest grade); unjudged documents default to 0 and render grey.
            label = qrels[qid].get(result_id, 0)
            if label == 3:
                style = "style=\"color:blue;\""
            elif label == 2:
                style = "style=\"color:green;\""
            elif label == 1:
                style = "style=\"color:red;\""
            else:
                style = "style=\"color:grey;\""

            output = f'<div class="row" {style}> <b>Rank</b>: {i+1} | <b>Document ID</b>: {result_id}</div>'

            try:
                st.write(output, unsafe_allow_html=True)
                st.write(
                    f'<div class="row" {style}>{contents}</div>', unsafe_allow_html=True)
            except Exception:
                # Skip documents whose contents cannot be rendered.
                pass
            st.write('---')
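
# To launch this demo locally (assuming this script is saved as app.py):
#   streamlit run app.py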