# File size: 5,782 Bytes
# 9ace58a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import numpy as np
import random
import os
import glob
import json


def write_notes_file(file_name, text):
    """Append ``text`` plus a trailing newline to the file ``file_name``.

    The file is opened in append mode, so earlier content is kept and the
    file is created when it does not exist yet.
    """
    with open(file_name, 'a') as notes_file:
        line = text + '\n'
        notes_file.write(line)


def get_blank_dataset_dict(dataset_name, is_test, ann_path, wav_path):
    """Return a fresh descriptor dict for a single dataset.

    The dict stores the four arguments under keys of the same name and
    always sets 'is_binary' to False.
    """
    return dict(
        dataset_name=dataset_name,
        is_test=is_test,
        is_binary=False,
        ann_path=ann_path,
        wav_path=wav_path,
    )


def get_short_class_names(class_names, str_len=3):
    """Abbreviate every class name by truncating each of its words.

    Each name is split on single spaces, every piece is cut to at most
    ``str_len`` characters, and the pieces are joined with spaces again.
    Returns a new list in the same order as ``class_names``.
    """
    def _shorten(name):
        return ' '.join(word[:str_len] for word in name.split(' '))

    return [_shorten(name) for name in class_names]


def remove_dupes(data_train, data_test):
    """Drop training items whose 'id' also appears in the test set.

    Args:
        data_train: list of dicts, each with an 'id' entry.
        data_test: list of dicts, each with an 'id' entry.

    Returns:
        A new list with the items of ``data_train`` (original order kept)
        whose 'id' is not present in ``data_test``. The inputs are not
        modified. A message is printed when at least one item was removed.
    """
    # a set gives constant-time membership tests instead of scanning the
    # full list of test ids for every training item
    test_ids = {dd['id'] for dd in data_test}
    data_train_prune = [aa for aa in data_train if aa['id'] not in test_ids]

    diff = len(data_train) - len(data_train_prune)
    if diff != 0:
        print(diff, 'items removed from train set')
    return data_train_prune


def get_genus_mapping(class_names):
    """Map every class name to the index of its genus.

    The genus is taken to be the text before the first space of a class
    name. Returns a tuple ``(genus_names, genus_mapping)`` of plain lists:
    the sorted unique genus names, and for each entry of ``class_names``
    the index of its genus inside ``genus_names``.
    """
    genera = [name.split(' ')[0] for name in class_names]
    unique_genera, inverse_idx = np.unique(genera, return_inverse=True)
    return unique_genera.tolist(), inverse_idx.tolist()


def standardize_low_freq(data, class_of_interest):
    """Replace the low frequency of one class with its dataset-wide mean.

    Low frequency annotations can be highly variable, which often happens
    for constant frequency calls. For every annotation whose 'class' equals
    ``class_of_interest`` the 'low_freq' is overwritten with the mean low
    frequency of that class. The 'high_freq' is left alone unless it lies
    below the new low frequency, in which case it is set to the mean high
    frequency of the class.

    ``data`` is modified in place and also returned. The mean low frequency
    must be smaller than the mean high frequency (checked with an assert).
    """
    # every annotation of the requested class, in the order it is stored
    matches = [ann for entry in data for ann in entry['annotation']
               if ann['class'] == class_of_interest]

    low_mean = np.mean([ann['low_freq'] for ann in matches])
    high_mean = np.mean([ann['high_freq'] for ann in matches])
    assert(low_mean < high_mean)

    print('\nStandardizing low and high frequency for:')
    print(class_of_interest)
    print('low:  ', round(low_mean, 2))
    print('high: ', round(high_mean, 2))

    # the low freq is always replaced; the high freq only when it would
    # otherwise end up below the new low freq
    for ann in matches:
        ann['low_freq'] = low_mean
        if ann['high_freq'] < low_mean:
            ann['high_freq'] = high_mean

    return data


def load_set_of_anns(data, classes_to_ignore=None, events_of_interest=None,
                     convert_to_genus=False, verbose=True, list_of_anns=False,
                     filter_issues=False, name_replace=False):
    """Load annotations, clean them up and summarise the class distribution.

    Args:
        data: when ``list_of_anns`` is False, an iterable of dataset dicts
            that each provide 'ann_path' and 'wav_path'; when True, a single
            such dict whose 'ann_path' points at individual json files.
        classes_to_ignore: class names excluded from the class statistics
            (the annotations themselves are kept). ``None`` is treated as an
            empty list.
        events_of_interest: if not None, only annotations whose 'event' is
            in this collection are kept.
        convert_to_genus: if True, every class name is cut down to the text
            before its first space.
        verbose: if True, print a table with the count of every class.
        list_of_anns: selects how ``data`` is interpreted (see above).
        filter_issues: if True, discard files whose 'issues' flag is not
            False.
        name_replace: optional dict mapping an input class name to the name
            that should replace it; any non-dict value disables renaming.

    Returns:
        Only ``anns`` when ``classes_to_ignore`` is empty, otherwise the
        tuple ``(anns, class_names, class_inv_freq)`` where ``class_names``
        is the sorted list of unique class names and ``class_inv_freq`` the
        matching list of ``total / (num_classes * class_count)`` weights.
    """
    # avoid a shared mutable default argument
    if classes_to_ignore is None:
        classes_to_ignore = []

    # load the annotations
    anns = []
    if list_of_anns:
        # path to list of individual json files
        anns.extend(load_anns_from_path(data['ann_path'], data['wav_path']))
    else:
        # dictionary of datasets
        for dd in data:
            anns.extend(load_anns(dd['ann_path'], dd['wav_path']))

    # discard unannotated files
    anns = [aa for aa in anns if aa['annotated'] is True]

    # filter files that have annotation issues - if the input is a dictionary
    # of datasets, this will likely have already been done
    if filter_issues:
        anns = [aa for aa in anns if aa['issues'] is False]

    # check for some basic formatting errors with class names
    for ann in anns:
        for aa in ann['annotation']:
            aa['class'] = aa['class'].strip()

    # only load specified events - i.e. types of calls
    if events_of_interest is not None:
        for ann in anns:
            ann['annotation'] = [aa for aa in ann['annotation']
                                 if aa['event'] in events_of_interest]

    # change class names
    # name_replace is a dictionary mapping input name to output name
    if isinstance(name_replace, dict):
        for ann in anns:
            for aa in ann['annotation']:
                if aa['class'] in name_replace:
                    aa['class'] = name_replace[aa['class']]

    # convert everything to genus name
    if convert_to_genus:
        for ann in anns:
            for aa in ann['annotation']:
                aa['class'] = aa['class'].split(' ')[0]

    # get unique class names
    class_names_all = [aa['class'] for ann in anns for aa in ann['annotation']
                       if aa['class'] not in classes_to_ignore]

    class_names, class_cnts = np.unique(class_names_all, return_counts=True)
    class_inv_freq = (class_cnts.sum() / (len(class_names) * class_cnts.astype(np.float32)))

    if verbose:
        print('Class count:')
        # default=0 keeps this from raising when no class names are left
        str_len = max((len(cc) for cc in class_names), default=0) + 5
        for idx, (name, cnt) in enumerate(zip(class_names, class_cnts)):
            print(str(idx).ljust(5) + name.ljust(str_len) + str(cnt))

    if len(classes_to_ignore) == 0:
        return anns
    else:
        return anns, class_names.tolist(), class_inv_freq.tolist()


def load_anns(ann_file_name, raw_audio_dir):
    """Read a json file holding a list of per-file annotation dicts.

    Every entry gets a 'file_path' value made by concatenating
    ``raw_audio_dir`` and the entry's 'id' (plain string concatenation, so
    ``raw_audio_dir`` has to bring its own trailing separator).
    Returns the loaded list.
    """
    with open(ann_file_name) as ann_file:
        entries = json.load(ann_file)

    for entry in entries:
        entry.update(file_path=raw_audio_dir + entry['id'])

    return entries


def load_anns_from_path(ann_file_dir, raw_audio_dir):
    """Load one annotation dict from every json file under a directory prefix.

    Args:
        ann_file_dir: prefix that is concatenated with '*.json' to build the
            glob pattern, so it has to end with a path separator to address
            the files inside a directory.
        raw_audio_dir: prefix that is concatenated with each annotation's
            'id' to build its 'file_path'.

    Returns:
        List of the loaded annotation dicts, ordered by json file path.
    """
    # glob returns matches in a file-system dependent order; sort them so
    # the result is reproducible across machines and runs
    files = sorted(glob.glob(ann_file_dir + '*.json'))
    anns = []
    for ff in files:
        with open(ff) as da:
            ann = json.load(da)
        ann['file_path'] = raw_audio_dir + ann['id']
        anns.append(ann)

    return anns


class AverageMeter(object):
    """Keeps the latest value together with a running weighted average.

    Attributes written by ``reset`` and ``update``:
        val: the value passed to the most recent ``update`` call.
        sum: accumulated ``val * n`` over all updates since the last reset.
        count: accumulated ``n`` over all updates since the last reset.
        avg: ``sum / count`` after the most recent update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count