RobPruzan committed
Commit 680cdda · 1 Parent(s): dbe5251

Adding word sense disambiguation + definitions to synonym generation

Files changed (1):
  1. app.py  +218  -3
app.py CHANGED
@@ -11,6 +11,7 @@ import gradio as gr
 import readability
 import seaborn as sns
 import torch
+import torch.nn.functional as F
 from fuzzywuzzy import fuzz
 from nltk.corpus import stopwords
 from nltk.corpus import wordnet as wn
@@ -18,6 +19,8 @@ from nltk.tokenize import word_tokenize
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import DistilBertTokenizer
 from transformers import pipeline
+from transformers import BertTokenizer
+from transformers import AutoTokenizer, BertForSequenceClassification
 
 
 nltk.download('wordnet')
@@ -442,6 +445,218 @@ def vocab_level_inter(text):
     interp.append(('', 0))
     return {'original': text, 'interpretation': interp}, f'{level(sum/total*4*2.5)[1:]} Level Vocabulary'
 
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.INFO)
+tokenizer4 = AutoTokenizer.from_pretrained('kanishka/GlossBERT')
+
+def construct_context_gloss_pairs_through_nltk(input, target_start_id, target_end_id):
+    """
+    construct context gloss pairs like sent_cls_ws
+    :param input: str, a sentence
+    :param target_start_id: int
+    :param target_end_id: int
+    :param lemma: lemma of the target word
+    :return: candidate lists
+    """
+
+    sent = tokenizer4.tokenize(input)
+    assert 0 <= target_start_id and target_start_id < target_end_id and target_end_id <= len(sent)
+    target = " ".join(sent[target_start_id:target_end_id])
+    if len(sent) > target_end_id:
+        sent = sent[:target_start_id] + ['"'] + sent[target_start_id:target_end_id] + ['"'] + sent[target_end_id:]
+    else:
+        sent = sent[:target_start_id] + ['"'] + sent[target_start_id:target_end_id] + ['"']
+
+    sent = " ".join(sent)
+
+    candidate = []
+    syns = wn.synsets(target)
+
+    for syn in syns:
+        if target == syn.name().split('.')[0]:
+            continue
+
+        gloss = (syn.definition(), syn.name())
+        candidate.append((sent, f"{target} : {gloss}", target, gloss))
+
+    assert len(candidate) != 0, f'there is no candidate sense of "{target}" in WordNet, please check'
+    # print(f'there are {len(candidate)} candidate senses of "{target}"')
+
+
+    return candidate
+
+
+class InputFeatures(object):
+    """A single set of features of data."""
+
+    def __init__(self, input_ids, input_mask, segment_ids):
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+
+
+def convert_to_features(candidate, tokenizer3, max_seq_length=512):
+
+    candidate_results = []
+    features = []
+    for item in candidate:
+        text_a = item[0]  # sentence
+        text_b = item[1]  # gloss
+        candidate_results.append((item[-2], item[-1]))  # (target, gloss)
+
+
+        tokens_a = tokenizer3.tokenize(text_a)
+        tokens_b = tokenizer3.tokenize(text_b)
+        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
+        segment_ids = [0] * len(tokens)
+        tokens += tokens_b + ["[SEP]"]
+        segment_ids += [1] * (len(tokens_b) + 1)
+
+        input_ids = tokenizer3.convert_tokens_to_ids(tokens)
+
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
+        # tokens are attended to.
+        input_mask = [1] * len(input_ids)
+
+        # Zero-pad up to the sequence length.
+        padding = [0] * (max_seq_length - len(input_ids))
+        input_ids += padding
+        input_mask += padding
+        segment_ids += padding
+
+        assert len(input_ids) == max_seq_length
+        assert len(input_mask) == max_seq_length
+        assert len(segment_ids) == max_seq_length
+
+        features.append(
+            InputFeatures(input_ids=input_ids,
+                          input_mask=input_mask,
+                          segment_ids=segment_ids))
+
+
+    return features, candidate_results
+
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length."""
+
+    # This is a simple heuristic which will always truncate the longer sequence
+    # one token at a time. This makes more sense than truncating an equal percent
+    # of tokens from each, since if one sequence is very short then each token
+    # that's truncated likely contains more information than a longer sequence.
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+
+
+def infer(input, target_start_id, target_end_id, args):
+    sent = tokenizer4.tokenize(input)
+    assert 0 <= target_start_id and target_start_id < target_end_id and target_end_id <= len(sent)
+    target = " ".join(sent[target_start_id:target_end_id])
+
+
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+
+
+    label_list = ["0", "1"]
+    num_labels = len(label_list)
+
+    model = BertForSequenceClassification.from_pretrained(args.bert_model,
+                                                          num_labels=num_labels)
+    model.to(device)
+
+    # print(f"input: {input}\ntarget: {target}")
+    examples = construct_context_gloss_pairs_through_nltk(input, target_start_id, target_end_id)
+    eval_features, candidate_results = convert_to_features(examples, tokenizer4)
+    input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
+    input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
+    segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
+
+
+    model.eval()
+    input_ids = input_ids.to(device)
+    input_mask = input_mask.to(device)
+    segment_ids = segment_ids.to(device)
+    with torch.no_grad():
+        logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None).logits
+    logits_ = F.softmax(logits, dim=-1)
+    logits_ = logits_.detach().cpu().numpy()
+    output = np.argmax(logits_, axis=0)[1]
+    results = []
+    for idx, i in enumerate(logits_):
+        results.append((candidate_results[idx][1], i[1]*100))
+    sorted_results = sorted(results, key=lambda x: x[1], reverse=True)
+
+    return sorted_results
+
+def format_for_gradio(inp):
+    retval = ''
+    for idx, i in enumerate(inp):
+        if idx == len(inp)-1:
+            retval += i.split('.')[0]
+            break
+        retval += f'''{i.split('.')[0]} | '''
+    return retval
+
+
+def smart_synonyms(text, level):
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--bert_model", default="kanishka/GlossBERT", type=str)
+    parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available")
+    args, unknown = parser.parse_known_args()
+
+    location = 0
+    word = ''
+    tokens = tokenizer4.tokenize(text)
+    school_to_level = {"Elementary Level": '1', "Middle School Level": '2', "High School Level": '3', "College Level": '4'}
+    for idx, i in enumerate(tokens):
+        if i[0] == '@':
+            location = idx
+            text = text.replace('@', '')
+            word = tokens[location]
+            break
+    raw_syns = []
+    raw_defs = []
+    raw_scores = []
+    syns = []
+    defs = []
+    scores = []
+    preds = infer(text, location, location+1, args)
+    for i in preds:
+        if not i[0][1].split('.')[0] in data[school_to_level[level]]:
+            continue
+        raw_syns.append(i[0][1])
+        raw_defs.append(i[0][0])
+        raw_scores.append(i[1])
+        if i[1] > 5:
+            syns.append(i[0][1])
+            defs.append(i[0][0])
+            scores.append(i[1])
+
+    if not syns:
+        top_syns = int(len(raw_syns)*.25//1+1)
+        syns = raw_syns[:top_syns]
+        defs = raw_defs[:top_syns]
+        scores = raw_scores[:top_syns]
+
+    cleaned_syns = format_for_gradio(syns)
+    cleaned_defs = format_for_gradio(defs)
+
+    return f'{cleaned_syns}: Definition- {cleaned_defs} | '
+
+
+
 with gr.Blocks(title="Automatic Literacy and Speech Assessment") as demo:
     gr.HTML("""<center><h7 style="font-size: 35px">Automatic Literacy and Speech Assessment</h7></center>""")
     gr.HTML("""<center><h7 style="font-size: 15px">This may take 60s to generate all statistics</h7></center>""")
@@ -460,8 +675,8 @@ with gr.Blocks(title="Automatic Literacy and Speech Assessment") as demo:
     audio_file = gr.Audio(source="microphone", type="filepath")
     grade1 = gr.Button("Grade Your Speech")
     with gr.Group():
-        gr.Markdown("Reading Level Based Synonyms | Enter only one word at a time")
-        words = gr.Textbox(label="Word For Synonyms")
+        gr.Markdown("""Reading Level Based Synonyms | Enter a sentence containing the word you want a synonym for | Mark the target word with @, e.g. "Today is an @amazing day" (target word = amazing)""")
+        words = gr.Textbox(label="Text with word for synonyms")
         lvl = gr.Dropdown(choices=["Elementary Level", "Middle School Level", "High School Level", "College Level"], label="Intended Reading Level For Synonym")
         get_syns = gr.Button("Get Synonyms")
         reccos = gr.Label()
@@ -532,6 +747,6 @@ with gr.Blocks(title="Automatic Literacy and Speech Assessment") as demo:
     grade.click(vocab_level_inter, inputs=in_text, outputs=[interpretation3, vocab_output])
     grade1.click(speech_to_score, inputs=audio_file, outputs=diff_output)
    b1.click(speech_to_text, inputs=[audio_file1, target], outputs=[text, some_val, phones])
-    get_syns.click(gen_syns, inputs=[words, lvl], outputs=reccos)
+    get_syns.click(smart_synonyms, inputs=[words, lvl], outputs=reccos)
     find_sim.click(get_sim_words, inputs=[in_text, words1], outputs=sims)
 demo.launch(debug=True)
 
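A minimal usage sketch of the new synonym path (not part of the commit): it would run inside app.py and assumes the module-level objects defined there are already loaded, in particular tokenizer4 and the reading-level word lists in data. The sentence and reading level below are hypothetical examples.

# The target word is marked with '@'. smart_synonyms tokenizes the sentence,
# scores the candidate WordNet senses of that word with GlossBERT
# (kanishka/GlossBERT via BertForSequenceClassification), keeps the senses
# whose lemma appears in the chosen reading-level word list, and returns the
# synonyms plus their definitions as one formatted string for the gr.Label.
sentence = "Today is an @amazing day"   # hypothetical example input
reading_level = "Middle School Level"   # one of the gr.Dropdown choices
print(smart_synonyms(sentence, reading_level))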