dexay committed on
Commit d532f55 · 1 Parent(s): 25a02b8

Update app.py

Files changed (1)
  1. app.py +139 -2
app.py CHANGED
@@ -2,14 +2,151 @@ import streamlit as st
 import transformers
 from transformers import pipeline, TokenClassificationPipeline, BertForTokenClassification , AutoTokenizer
 
+x = st.text_area('enter')
+
 #model.to("cpu")
 tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1", truncation = True, padding=True, model_max_length=512,)
 model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF", )
 token_classifier = pipeline("token-classification", tokenizer = tokenizer,model=model_checkpoint, )
 
-x = st.text_area('enter text')
 
+biotext = x
+
+# split the document/text into sentences
+
+lstbiotext = []
+
+flag = 0
+tempsen = ""
+for e in biotext:
+    tempsen += e
+    if e == "(":
+        flag = 1
+    if e == ")":
+        flag = 0
+    # a sentence ends at ".", "?" or ":", unless we are inside parentheses
+    if (e == "." or e == "?" or e == ":") and flag == 0:
+        lstbiotext += [tempsen.strip()]
+        tempsen = ""
+
+ddata = lstbiotext
+
+# tokenized_dat = tokenize_function(ddata)
+
+az = token_classifier(ddata)
+
+
+# convert the NER output into an RE-input-compatible format
+
+# tg_inorder decodes the labels the model was fine-tuned on
+
+tg_inorder = ['I-RECEPTOR',
+              'O',
+              'B-RECEPTOR',
+              'B-EDC',
+              'I-EXP_PER',
+              'B-EXP_PER',
+              'I-CANCER',
+              'I-EDC',
+              'B-HORMONE',
+              'I-HORMONE',
+              'B-QUANTITY',
+              'B-EXP_DUR',
+              'I-QUANTITY',
+              'B-CANCER',
+              'PAD']
+
+lstSentEnc = []
+lstSentbilbl = []
+lstSentEnt = []
+for itsent in az:
+
+    sentaz = itsent
+    ph = []
+    phl = []
+    # merge "##" word-pieces back into whole words, keeping one label per word
+    for e in sentaz:
+        if e["word"][0] == "#" and len(ph) != 0:
+            ph[-1] += e["word"][2:]
+        else:
+            ph += [e["word"]]
+            phl += [e["entity"]]
+
+    # decode "LABEL_n" strings into the tags in tg_inorder
+    phltr = []
+    for e in phl:
+        phltr += [tg_inorder[int(e[-1])] if len(e) == 7 else tg_inorder[int(e[-2:])]]
+
+    # merge consecutive tokens of the same entity into one span
+    # (flag: 0 = outside a span, 1 = inside a span, 3 = span interrupted by a single O token)
+    nwph = []
+    nwphltr = []
+    flag = 0
+    for i in range(len(phltr) - 2):
+        if phltr[i] == "O" and flag != 3:
+            nwph += [ph[i]]
+            nwphltr += [phltr[i]]
+            continue
+        elif flag == 3:
+            nwph[-1] += " " + ph[i]
+            flag = 1
+            continue
+        elif phltr[i][2:] == phltr[i+1][2:] and phltr[i+1][0] == "I" and flag == 0:
+            nwph += [ph[i]]
+            nwphltr += [phltr[i]]
+            flag = 1
+            continue
+        elif phltr[i][2:] == phltr[i+1][2:] and phltr[i+1][0] == "I" and flag == 1:
+            nwph[-1] += " " + ph[i]
+            continue
+        # X-O-X pattern: the same entity resumes after a single O token
+        elif phltr[i][2:] == phltr[i+2][2:] and phltr[i+1] == "O" and phltr[i+2][0] == "I" and flag == 0:
+            nwph += [ph[i]]
+            nwphltr += [phltr[i]]
+            flag = 3
+            continue
+        elif phltr[i][2:] == phltr[i+2][2:] and phltr[i+1] == "O" and phltr[i+2][0] == "I" and flag == 1:
+            nwph[-1] += " " + ph[i]
+            flag = 3
+            continue
+        # end of X-O-X handling
+        elif flag == 1:
+            nwph[-1] += " " + ph[i]
+            flag = 0
+            continue
+        else:
+            nwph += [ph[i]]
+            nwphltr += [phltr[i]]
+            continue
+
+    # nwph, nwphltr, len(nwph), len(nwphltr)
+
+    # keep only sentences that contain at least two entity spans
+    if nwphltr.count("O") <= len(nwphltr) - 2:
+        for i in range(len(nwph) - 1):
+            if nwphltr[i] != "O":
+                for j in range(i, len(nwph)):
+                    if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER", "B-RECEPTOR"}:
+                        # rebuild the sentence with the entity pair marked for relation extraction
+                        sen2ad = ""
+                        for g in range(i):
+                            sen2ad += nwph[g] + " "
+                        sen2ad += "<e1>" + nwph[i] + "</e1> "
+
+                        for t in range(i + 1, j):
+                            sen2ad += nwph[t] + " "
+                        sen2ad += "<e2>" + nwph[j] + "</e2>"
+                        if j < len(nwph):
+                            for l in range(j + 1, len(nwph)):
+                                sen2ad += " " + nwph[l]
+                        lstSentEnc += [sen2ad]
+                        lstSentbilbl += [[nwphltr[i], nwphltr[j]]]
+                        lstSentEnt += [[nwph[i], nwph[j]]]
+
+
+# lstSentEnc, lstSentEnt, lstSentbilbl
 
 if x:
     out = token_classifier(x)
-    st.json(out)
+    st.markdown(lstSentEnc)
+
+
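
The added block relies on two post-processing conventions that are easy to miss when reading the diff: WordPiece tokens prefixed with "##" have to be glued back onto the previous word, and the pipeline's raw labels come out as generic "LABEL_n" strings that are decoded through tg_inorder. The sketch below replays those two steps on a hand-made pipeline output, so it runs without downloading the model; the sentence, tokens and labels are invented for illustration only.

# Minimal sketch of the word-piece merge and label decoding, on fake pipeline output.
tg_inorder = ['I-RECEPTOR', 'O', 'B-RECEPTOR', 'B-EDC', 'I-EXP_PER', 'B-EXP_PER',
              'I-CANCER', 'I-EDC', 'B-HORMONE', 'I-HORMONE', 'B-QUANTITY',
              'B-EXP_DUR', 'I-QUANTITY', 'B-CANCER', 'PAD']

# Invented output for one sentence: "Bisphenol A increases breast cancer risk ."
# Only the "word"/"entity" keys that the script reads are mimicked here.
fake_sentence_output = [
    {"word": "Bis",       "entity": "LABEL_3"},   # B-EDC
    {"word": "##phenol",  "entity": "LABEL_7"},   # I-EDC, word-piece merged below
    {"word": "A",         "entity": "LABEL_7"},   # I-EDC
    {"word": "increases", "entity": "LABEL_1"},   # O
    {"word": "breast",    "entity": "LABEL_13"},  # B-CANCER
    {"word": "cancer",    "entity": "LABEL_6"},   # I-CANCER
    {"word": "risk",      "entity": "LABEL_1"},   # O
    {"word": ".",         "entity": "LABEL_1"},   # O
]

# 1) merge "##" word-pieces back into whole words, one label per word
ph, phl = [], []
for e in fake_sentence_output:
    if e["word"][0] == "#" and len(ph) != 0:
        ph[-1] += e["word"][2:]
    else:
        ph += [e["word"]]
        phl += [e["entity"]]

# 2) decode "LABEL_n" via tg_inorder (equivalent to the len(e) == 7 branch above)
phltr = [tg_inorder[int(lbl.split("_")[-1])] for lbl in phl]

print(list(zip(ph, phltr)))
# [('Bisphenol', 'B-EDC'), ('A', 'I-EDC'), ('increases', 'O'),
#  ('breast', 'B-CANCER'), ('cancer', 'I-CANCER'), ('risk', 'O'), ('.', 'O')]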
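
The last part of the new code turns each sentence's recognised entities into relation-extraction candidates: every pair of entity spans with different types is re-emitted as a copy of the sentence in which the first span is wrapped in <e1>...</e1> and the second in <e2>...</e2>. Below is a condensed, hand-checkable sketch of that pairing on the toy sentence from the previous example; it folds the original's character-by-character string building into a single join and skips same-index pairs directly, but produces the same kind of lstSentEnc / lstSentbilbl / lstSentEnt triples.

# Condensed sketch of the entity-pairing step on invented, already-merged spans.
nwph    = ["Bisphenol A", "increases", "breast cancer", "risk", "."]
nwphltr = ["B-EDC",        "O",         "B-CANCER",      "O",    "O"]

lstSentEnc, lstSentbilbl, lstSentEnt = [], [], []
for i in range(len(nwph)):
    if nwphltr[i] == "O":
        continue
    for j in range(i + 1, len(nwph)):
        # skip O tokens, same-type pairs, and the CANCER/RECEPTOR combination,
        # mirroring the filter in the committed code
        if (nwphltr[j] == "O" or nwphltr[j] == nwphltr[i]
                or {nwphltr[i], nwphltr[j]} == {"B-CANCER", "B-RECEPTOR"}):
            continue
        words = list(nwph)
        words[i] = "<e1>" + words[i] + "</e1>"
        words[j] = "<e2>" + words[j] + "</e2>"
        lstSentEnc += [" ".join(words)]
        lstSentbilbl += [[nwphltr[i], nwphltr[j]]]
        lstSentEnt += [[nwph[i], nwph[j]]]

print(lstSentEnc)
# ['<e1>Bisphenol A</e1> increases <e2>breast cancer</e2> risk .']

One pair of entities yields one marked sentence here; a sentence with three differently typed spans would yield one copy per pair, which is what the nested loops in the commit produce as input for the downstream RE model.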