import numpy as np
import random
import os
import glob
import json

def write_notes_file(file_name, text):
    with open(file_name, 'a') as da:
        da.write(text + '\n')

def get_blank_dataset_dict(dataset_name, is_test, ann_path, wav_path):
    ddict = {'dataset_name': dataset_name, 'is_test': is_test, 'is_binary': False,
             'ann_path': ann_path, 'wav_path': wav_path}
    return ddict

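# Example (dataset name and paths are hypothetical):
# get_blank_dataset_dict('example_data', False, '/anns/train.json', '/audio/')
# -> {'dataset_name': 'example_data', 'is_test': False, 'is_binary': False,
#     'ann_path': '/anns/train.json', 'wav_path': '/audio/'}
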
def get_short_class_names(class_names, str_len=3):
    class_names_short = []
    for cc in class_names:
        class_names_short.append(' '.join([sp[:str_len] for sp in cc.split(' ')]))
    return class_names_short

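# Example: get_short_class_names(['Myotis daubentonii']) -> ['Myo dau']
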
def remove_dupes(data_train, data_test):
    test_ids = [dd['id'] for dd in data_test]
    data_train_prune = []
    for aa in data_train:
        if aa['id'] not in test_ids:
            data_train_prune.append(aa)
    diff = len(data_train) - len(data_train_prune)
    if diff != 0:
        print(diff, 'items removed from train set')
    return data_train_prune

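# Example: remove_dupes([{'id': 'a'}, {'id': 'b'}], [{'id': 'b'}]) -> [{'id': 'a'}]
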
def get_genus_mapping(class_names):
    genus_names, genus_mapping = np.unique([cc.split(' ')[0] for cc in class_names], return_inverse=True)
    return genus_names.tolist(), genus_mapping.tolist()

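# Example: get_genus_mapping(['Myotis daubentonii', 'Plecotus auritus'])
# -> (['Myotis', 'Plecotus'], [0, 1])
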
def standardize_low_freq(data, class_of_interest):
    # address the issue of highly variable low frequency annotations
    # this often happens for constant frequency calls
    # for the class of interest, sets the low and high freq to the dataset mean
    low_freqs = []
    high_freqs = []
    for dd in data:
        for aa in dd['annotation']:
            if aa['class'] == class_of_interest:
                low_freqs.append(aa['low_freq'])
                high_freqs.append(aa['high_freq'])
    low_mean = np.mean(low_freqs)
    high_mean = np.mean(high_freqs)
    assert low_mean < high_mean
    print('\nStandardizing low and high frequency for:')
    print(class_of_interest)
    print('low: ', round(low_mean, 2))
    print('high: ', round(high_mean, 2))
    # set the low freq to the mean; the high freq is only raised to the mean
    # if it would otherwise fall below the new low freq
    # assumes that low_mean < high_mean
    for dd in data:
        for aa in dd['annotation']:
            if aa['class'] == class_of_interest:
                aa['low_freq'] = low_mean
                if aa['high_freq'] < low_mean:
                    aa['high_freq'] = high_mean
    return data

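# Example (frequencies are illustrative): if the class mean low/high freqs are
# 20000.0/25000.0, every annotation of the class gets low_freq = 20000.0, and
# an annotation whose high_freq was below 20000.0 has it raised to 25000.0.
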
def load_set_of_anns(data, classes_to_ignore=[], events_of_interest=None,
                     convert_to_genus=False, verbose=True, list_of_anns=False,
                     filter_issues=False, name_replace=False):

    # load the annotations
    anns = []
    if list_of_anns:
        # path to list of individual json files
        anns.extend(load_anns_from_path(data['ann_path'], data['wav_path']))
    else:
        # dictionary of datasets
        for dd in data:
            anns.extend(load_anns(dd['ann_path'], dd['wav_path']))

    # discard unannotated files
    anns = [aa for aa in anns if aa['annotated'] is True]

    # filter files that have annotation issues - if the input is a dictionary of
    # datasets, this will likely have already been done
    if filter_issues:
        anns = [aa for aa in anns if aa['issues'] is False]

    # check for some basic formatting errors with class names
    for ann in anns:
        for aa in ann['annotation']:
            aa['class'] = aa['class'].strip()

    # only load specified events - i.e. types of calls
    if events_of_interest is not None:
        for ann in anns:
            filtered_events = []
            for aa in ann['annotation']:
                if aa['event'] in events_of_interest:
                    filtered_events.append(aa)
            ann['annotation'] = filtered_events

    # change class names
    # name_replace is a dictionary mapping input names to output names
    if type(name_replace) is dict:
        for ann in anns:
            for aa in ann['annotation']:
                if aa['class'] in name_replace:
                    aa['class'] = name_replace[aa['class']]

    # convert everything to genus name
    if convert_to_genus:
        for ann in anns:
            for aa in ann['annotation']:
                aa['class'] = aa['class'].split(' ')[0]

    # get unique class names
    class_names_all = []
    for ann in anns:
        for aa in ann['annotation']:
            if aa['class'] not in classes_to_ignore:
                class_names_all.append(aa['class'])

    class_names, class_cnts = np.unique(class_names_all, return_counts=True)
    class_inv_freq = class_cnts.sum() / (len(class_names) * class_cnts.astype(np.float32))

    if verbose:
        print('Class count:')
        str_len = np.max([len(cc) for cc in class_names]) + 5
        for cc in range(len(class_names)):
            print(str(cc).ljust(5) + class_names[cc].ljust(str_len) + str(class_cnts[cc]))

    if len(classes_to_ignore) == 0:
        return anns
    else:
        return anns, class_names.tolist(), class_inv_freq.tolist()

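# Example usage (dataset name, paths, and class/event labels are hypothetical):
# dataset = get_blank_dataset_dict('my_data', False, '/anns/train.json', '/audio/')
# anns, class_names, class_inv_freq = load_set_of_anns(
#     [dataset], classes_to_ignore=['Unknown'], events_of_interest=['Echolocation'])
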
def load_anns(ann_file_name, raw_audio_dir):
    with open(ann_file_name) as da:
        anns = json.load(da)
    for aa in anns:
        aa['file_path'] = raw_audio_dir + aa['id']
    return anns

def load_anns_from_path(ann_file_dir, raw_audio_dir):
    files = glob.glob(ann_file_dir + '*.json')
    anns = []
    for ff in files:
        with open(ff) as da:
            ann = json.load(da)
        ann['file_path'] = raw_audio_dir + ann['id']
        anns.append(ann)
    return anns

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
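
# Minimal self-check for the pure helpers above; the class names and loss
# values are illustrative, not from any real dataset.
if __name__ == '__main__':
    names = ['Myotis daubentonii', 'Myotis nattereri', 'Plecotus auritus']
    print(get_short_class_names(names))  # ['Myo dau', 'Myo nat', 'Ple aur']
    print(get_genus_mapping(names))      # (['Myotis', 'Plecotus'], [0, 0, 1])

    meter = AverageMeter()
    meter.update(0.9, n=32)  # e.g. mean loss over a batch of 32
    meter.update(0.7, n=32)
    print(meter.avg)         # 0.8 - running mean weighted by n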