Spaces:

dexay
/

EDC_IE

Runtime error

App Files Files Community

dexay committed on Jun 23, 2022

Commit

d532f55

1 Parent(s): 25a02b8

Update app.py

Browse files

Files changed (1) hide show

app.py +139 -2

app.py CHANGED Viewed

@@ -2,14 +2,151 @@ import streamlit as st
 import transformers
 from transformers import  pipeline, TokenClassificationPipeline, BertForTokenClassification , AutoTokenizer
 #model.to("cpu")
 tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1", truncation = True, padding=True, model_max_length=512,)
 model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF", )
 token_classifier = pipeline("token-classification", tokenizer = tokenizer,model=model_checkpoint,  )
-x = st.text_area('enter text')
 if x:
   out = token_classifier(x)
-  st.json(out)

 import transformers
 from transformers import  pipeline, TokenClassificationPipeline, BertForTokenClassification , AutoTokenizer
+# Streamlit script: read free text, split it into sentences, run a
+# token-classification (NER) pipeline on each sentence, merge word pieces and
+# multi-word entities, then build one "<e1>..</e1> .. <e2>..</e2>" sentence per
+# pair of differently-labelled entities and display the resulting list.
+x = st.text_area('enter')
 #model.to("cpu")
+# Tokenizer comes from the BioBERT checkpoint; the classification weights come
+# from the "dexay/Ner2HgF" checkpoint. Both are wrapped in one pipeline.
 tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1", truncation = True, padding=True, model_max_length=512,)
 model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF", )
 token_classifier = pipeline("token-classification", tokenizer = tokenizer,model=model_checkpoint,  )
+biotext = x
+# Split the input text into sentences, one character at a time.
+# A sentence ends at ".", "?" or ":" unless that character sits between "("
+# and ")". `flag` is 1 while inside parentheses; it is a plain on/off switch,
+# so the first ")" clears it even if parentheses are nested.
+# NOTE(review): text after the last terminator is left in `tempsen` and never
+# appended, so input that does not end in ".", "?" or ":" loses its tail.
+lstbiotext = []
+flag = 0
+tempsen = ""
+for e in biotext:
+  tempsen += e
+  if e=="(":
+      flag = 1
+  if e==")":
+      flag = 0
+  if (e =="." or e =="?" or e ==":" ) and flag == 0 :
+      lstbiotext += [tempsen.strip()]
+      tempsen = ""
+ddata = lstbiotext
+#tokenized_dat = tokenize_function(ddata)
+# Classify every sentence in one call; `az` is then iterated as one list of
+# token dicts per sentence.
+# NOTE(review): this runs on every script execution, including when the text
+# area is empty and `ddata` is []; confirm the pipeline accepts an empty list.
+az = token_classifier(ddata)
+# Convert the NER output into the input format expected by the RE
+# (relation extraction) step.
+# tg_inorder decodes the labels the model was fine-tuned on: the numeric
+# suffix of a predicted label is used below as an index into this list.
+tg_inorder = ['I-RECEPTOR',
+ 'O',
+ 'B-RECEPTOR',
+ 'B-EDC',
+ 'I-EXP_PER',
+ 'B-EXP_PER',
+ 'I-CANCER',
+ 'I-EDC',
+ 'B-HORMONE',
+ 'I-HORMONE',
+ 'B-QUANTITY',
+ 'B-EXP_DUR',
+ 'I-QUANTITY',
+ 'B-CANCER',
+ 'PAD']
+# Accumulators across all sentences, filled in lockstep:
+#   lstSentEnc   - sentences with the entity pair wrapped in <e1>/<e2> tags
+#   lstSentbilbl - [label of e1, label of e2] for each such sentence
+#   lstSentEnt   - [text of e1, text of e2] for each such sentence
+lstSentEnc = []
+lstSentbilbl = []
+lstSentEnt = []
+for itsent in az:
+  sentaz = itsent
+  # ph: whole words of the sentence; phl: the raw label of each word.
+  ph = []
+  phl = []
+  for e in sentaz:
+    # A piece starting with "#" is glued onto the previous word after dropping
+    # its first two characters (presumably the WordPiece "##" prefix - confirm).
+    # Only the label of the word's first piece is kept.
+    if e["word"][0]=="#" and len(ph)!=0:
+      ph[-1]+= e["word"][2:]
+    else:
+      ph += [e["word"]]
+      phl += [e["entity"]]
+  # Translate raw labels into tag names: a 7-character label contributes its
+  # last character as the index, any other label its last two characters.
+  # Presumably the labels look like "LABEL_<n>" - confirm against the model config.
+  phltr = []
+  for e in phl:
+    phltr += [tg_inorder[int(e[-1])] if len(e)==7 else  tg_inorder[int(e[-2:])]]
+  # Merge consecutive words of one entity into a single entry.
+  # nwph holds the merged words/entities, nwphltr the label of each entry
+  # (for a merged entity, the label of its first word).
+  # `flag` is reused here as the merge state (unrelated to the parenthesis
+  # flag above):
+  #   0 - not inside an entity span
+  #   1 - inside a span; the current word is appended to the last entry
+  #   3 - the previous word opened/continued a span that skips one "O" word;
+  #       the current word is appended regardless of its label
+  nwph = []
+  nwphltr = []
+  flag = 0
+  # NOTE(review): the bound keeps phltr[i+1] and phltr[i+2] in range, but it
+  # also means the last two words of every sentence are never copied to nwph.
+  for i in range(len(phltr)-2):
+    # Plain "O" word outside a gap: emitted as its own entry.
+    if phltr[i]=="O" and flag != 3 :
+      nwph += [ph[i]]
+      nwphltr += [phltr[i]]
+      continue
+    # Word sitting in the one-word gap of a span: glue it on, stay in the span.
+    elif flag == 3:
+      nwph[-1] += " "+ph[i]
+      flag = 1
+      continue
+    # Next word is an I- tag of the same type: open a new span here.
+    elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 0:
+      nwph += [ph[i]]
+      nwphltr += [phltr[i]]
+      flag = 1
+      continue
+    # Already in a span and the next word continues it: append and carry on.
+    elif phltr[i][2:]==phltr[i+1][2:] and phltr[i+1][0]=="I" and flag == 1:
+      nwph[-1] += " "+ph[i]
+      continue
+# "x O x" pattern: an entity word, exactly one "O" word, then an I- word of the
+# same type. Setting flag = 3 makes the next iteration absorb the "O" word.
+    elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 0:
+      nwph += [ph[i]]
+      nwphltr += [phltr[i]]
+      flag = 3
+      continue
+    elif phltr[i][2:]==phltr[i+2][2:] and phltr[i+1]=="O" and phltr[i+2][0]=="I" and flag == 1:
+      nwph[-1] += " "+ph[i]
+      flag = 3
+      continue
+# end of "x O x" handling
+    # Last word of the current span: append it and leave the span.
+    elif flag == 1:
+      nwph[-1] += " "+ph[i]
+      flag = 0
+      continue
+    # Single-word entity (or any other tag) with no continuation.
+    else :
+      nwph += [ph[i]]
+      nwphltr += [phltr[i]]
+      continue
+  # nwph,nwphltr,len(nwph),len(nwphltr)
+  # Only build pairs when the sentence has at least two non-"O" entries.
+  if nwphltr.count("O") <= len(nwphltr)-2:
+    for i in range(len(nwph)-1):
+      if nwphltr[i] != "O":
+        for j in range(i,len(nwph)):
+          # Pair entry i with a later entry j when both are entities, their
+          # labels differ (same-label pairs are never produced), and the pair
+          # is not B-CANCER together with B-RECEPTOR.
+          if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER","B-RECEPTOR"}:
+            # Rebuild the sentence as space-separated entries with entry i
+            # wrapped in <e1>..</e1> and entry j wrapped in <e2>..</e2>.
+            sen2ad = ""
+            for g in range(i):
+              sen2ad += nwph[g]+" "
+            sen2ad += "<e1>"+nwph[i]+"</e1> "
+            for t in range(i+1,j):
+              sen2ad += nwph[t]+" "
+            sen2ad += "<e2>"+nwph[j]+"</e2>"
+            # j comes from range(i, len(nwph)), so this condition always holds.
+            if j<len(nwph):
+              for l in range(j+1,len(nwph)):
+                sen2ad += " "+nwph[l]
+            lstSentEnc += [sen2ad]
+            lstSentbilbl += [[nwphltr[i],nwphltr[j]]]
+            lstSentEnt += [[nwph[i],nwph[j]]]
+#lstSentEnc,lstSentEnt,lstSentbilbl
+# NOTE(review): `out` re-runs the classifier on the whole text and is not used
+# afterwards in the visible code; only the tagged sentences are displayed.
 if x:
   out = token_classifier(x)
+  # NOTE(review): st.markdown is given a list rather than a string - confirm
+  # this renders as intended.
+  st.markdown(lstSentEnc)