|
import pickle |
|
import sklearn.preprocessing as pp |
|
from scipy.sparse import csr_matrix |
|
import numpy as np |
|
import pandas as pd |
|
from scipy.sparse import vstack |
|
|
|
|
|
# --- Data loaded once, at import time -------------------------------------
# Base playlist table; get_best_tid() concatenates it with the extra table.
df_ps_train_ori = pd.read_hdf('model/df_ps_train_new.hdf')

# Extra playlist table; get_best_tid() appends a row to it and rewrites
# this same file on every call.
df_ps_train_extra = pd.read_hdf('data_train/df_ps_train_extra.hdf')

# Pickled base matrix; inference_from_tid() stacks the extra matrix under it.
pickle_path = 'model/giantMatrix_new.pickle'
with open(pickle_path, 'rb') as f:
    ps_matrix_ori = pickle.load(f)
|
|
|
|
|
|
|
def add_row_train(df, list_tid):
    """Append one row to ``df`` in place and return ``df``.

    The new row is labelled with the last index label of ``df`` plus one.
    Its ``tid`` cell holds a copy of ``list_tid`` and its ``pos`` cell holds
    the positions ``0 .. len(list_tid) - 1``.

    Args:
        df: DataFrame to extend; it is mutated. Its last index label must
            support ``+ 1``.
        list_tid: Sequence of track ids for the new row.

    Returns:
        The same ``df`` object, now one row longer.

    Raises:
        IndexError: If ``df`` has no rows (there is no last label to
            continue from).
    """
    # Read the last label straight from the index instead of materialising
    # the whole last row just to get its name.
    next_pid = df.index[-1] + 1

    # Store a private copy: keeping the caller's list object inside the frame
    # would let later mutations by the caller silently change the stored row.
    tids = list(list_tid)
    positions = list(range(len(tids)))

    # Assigning to a label that does not exist yet enlarges the frame.
    df.loc[next_pid] = {'tid': tids, 'pos': positions}
    return df
|
|
|
|
|
def inference_row(list_tid, ps_matrix):
    """Score a list of track ids against every row of ``ps_matrix``.

    Args:
        list_tid: Column indices (track ids) to set in the query row.
        ps_matrix: Matrix whose second dimension is the number of tracks.

    Returns:
        A pair ``(scores, query_row)``: ``scores`` is the product of the
        row-normalised query with the transposed row-normalised
        ``ps_matrix`` (one score per matrix row), and ``query_row`` is the
        un-normalised ``1 x n_tracks`` CSR row built from ``list_tid``.
    """
    n_entries = len(list_tid)
    n_tracks = ps_matrix.shape[1]

    # One entry of 1.0 per listed track, all in row 0. With this
    # (data, (row, col)) constructor form, repeated track ids are summed.
    values = np.ones(n_entries)
    row_idx = np.zeros(n_entries)
    query_row = csr_matrix((values, (row_idx, list_tid)), shape=(1, n_tracks))

    query_unit = pp.normalize(query_row, axis=1)
    matrix_unit = pp.normalize(ps_matrix, axis=1)

    scores = query_unit * matrix_unit.T
    return scores, query_row
|
|
|
|
|
def get_best_tid(current_list, ps_matrix_row, K=50, MAX_tid=10):
    """Recommend track ids for ``current_list`` and record it as a new row.

    Every row of ``ps_matrix_row`` is scored against ``current_list`` with
    ``inference_row``. Rows are ranked by descending score, the top-ranked
    row is skipped and the next ``K`` rows are used as candidates. Their
    ``tid`` lists (looked up by label in the concatenation of
    ``df_ps_train_ori`` and ``df_ps_train_extra``) are merged in rank order,
    dropping ids already in ``current_list`` and duplicates, until more than
    enough ids are collected; the result is cut to ``MAX_tid`` entries.

    Side effects: appends ``current_list`` to the module-level
    ``df_ps_train_extra`` via ``add_row_train`` and rewrites
    ``data_train/df_ps_train_extra.hdf``.

    Args:
        current_list: Track ids of the query playlist (must be hashable).
        ps_matrix_row: Matrix handed to ``inference_row``; row positions are
            used as labels into the combined playlist table.
        K: Maximum number of candidate rows to draw tracks from.
        MAX_tid: Maximum number of track ids to return.

    Returns:
        ``(new_list, sparse_row)``: the recommended track ids and the sparse
        query row produced by ``inference_row``.
    """
    # This name is rebound near the end of the function. Without the
    # declaration Python treats it as a local for the whole body, and the
    # read in pd.concat below raises UnboundLocalError.
    global df_ps_train_extra

    df_ps_train = pd.concat([df_ps_train_ori, df_ps_train_extra])

    sim_vector, sparse_row = inference_row(current_list, ps_matrix_row)
    scores = sim_vector.toarray()[0].tolist()

    # Stable descending sort: rows with equal scores keep ascending order.
    ranked_pids = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    # The best-ranked row is deliberately skipped, as before.
    # NOTE(review): presumably it is expected to be the query itself -
    # confirm, since the query is only added to the matrix after this call.
    topK_pid = ranked_pids[1:K + 1]

    already_present = set(current_list)
    collected = {}  # insertion-ordered dict used as an ordered set

    # Looping over the candidates (instead of indexing with a counter) also
    # covers matrices with fewer than K + 1 rows and K == 0.
    for top_pid in topK_pid:
        for tid in df_ps_train.loc[top_pid].tid:
            if tid not in already_present:
                collected.setdefault(tid, None)
        # Once MAX_tid ids are present, the first MAX_tid can no longer
        # change because later candidates only append; stop early.
        if len(collected) >= MAX_tid:
            break

    new_list = list(collected)[:MAX_tid]

    # Record the query playlist and write the extra table back to disk.
    df_ps_train_extra = add_row_train(df_ps_train_extra, current_list)
    df_ps_train_extra.to_hdf('data_train/df_ps_train_extra.hdf', key='abc')

    return new_list, sparse_row
|
|
|
|
|
def inference_from_tid(list_tid, K=50, MAX_tid=10):
    """Recommend track ids for ``list_tid`` and store the query row on disk.

    Loads the pickled extra matrix, stacks it under ``ps_matrix_ori``, asks
    ``get_best_tid`` for recommendations, then appends the query's sparse
    row to the extra matrix and pickles it back to the same file.

    Args:
        list_tid: Track ids of the query playlist.
        K: Passed through to ``get_best_tid``.
        MAX_tid: Passed through to ``get_best_tid``.

    Returns:
        The list of recommended track ids returned by ``get_best_tid``.
    """
    extra_path = "data_mat/giantMatrix_extra.pickle"

    with open(extra_path, 'rb') as src:
        extra_rows = pickle.load(src)

    # Base rows first, then the rows accumulated by earlier calls.
    stacked = vstack((ps_matrix_ori, extra_rows)).tocsr()
    result, query_row = get_best_tid(list_tid, stacked, K, MAX_tid)

    # Append this query so that later calls can match against it.
    grown = vstack((extra_rows, query_row.todok()))
    with open(extra_path, 'wb') as dst:
        pickle.dump(grown, dst)

    return result
|
|
|
|
|
def inference_from_uri(list_uri, K=50, MAX_tid=10):
    """Recommend track URIs for a list of track URIs.

    URIs missing from the pickled URI-to-id mapping are ignored. The
    remaining ids go through ``inference_from_tid`` and the resulting ids
    are mapped back to URIs with the pickled id-to-URI mapping.

    Args:
        list_uri: Track URIs of the query playlist.
        K: Passed through to ``inference_from_tid``.
        MAX_tid: Passed through to ``inference_from_tid``.

    Returns:
        List of recommended track URIs.
    """
    with open('model/dict_uri2tid.pkl', 'rb') as handle:
        uri_to_tid = pickle.load(handle)
    known_tids = [uri_to_tid[uri] for uri in list_uri if uri in uri_to_tid]

    recommended = inference_from_tid(known_tids, K, MAX_tid)

    # The reverse mapping is loaded only after inference, as before.
    with open('model/dict_tid2uri.pkl', 'rb') as handle:
        tid_to_uri = pickle.load(handle)
    return [tid_to_uri[tid] for tid in recommended]
|
|