import streamlit as st
import pandas as pd
import zipfile

from postt import postcor
from transformers import (
    pipeline,
    AutoTokenizer,
    BertForTokenClassification,
    AutoModelForSequenceClassification,
)

st.header("Knowledge extraction on Endocrine disruptors")

st.write("This tool lets you extract relation triples describing interactions between endocrine disrupting chemicals (EDCs), hormones, receptors and cancers.")
st.write("It is the result of a final-year project at ESI school, aimed at biomedical researchers who want to extract precise information on the subject without reading through long publications.")

form = st.form(key='my-form')
x = form.text_area('Enter text', height=250)
submit = form.form_submit_button('Submit')

if submit and len(x) != 0:

    st.write("Execution in progress ... It may take a while, please be patient.")
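    # Load the BioBERT tokenizer and the two fine-tuned models: a token-classification
    # model for entity recognition and a sequence-classification model for relation extraction.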
    tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1", truncation=True, padding=True, model_max_length=512)
    model_checkpoint = BertForTokenClassification.from_pretrained("dexay/Ner2HgF")
    model_re = AutoModelForSequenceClassification.from_pretrained("dexay/reDs3others")

    token_classifier = pipeline("token-classification", tokenizer=tokenizer, model=model_checkpoint)
    # Make sure the text ends with a sentence delimiter.
    if x[-1] not in ".?:":
        x += "."

    biotext = x
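    # Split the input into sentences on ".", "?" and ":", ignoring delimiters that occur
    # inside parentheses.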
    lstbiotext = []

    flag = 0
    tempsen = ""
    for e in biotext:
        tempsen += e
        if e == "(":
            flag = 1
        if e == ")":
            flag = 0
        if (e == "." or e == "?" or e == ":") and flag == 0:
            lstbiotext += [tempsen.strip()]
            tempsen = ""

    ddata = lstbiotext
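    # Run named-entity recognition on every sentence.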
    az = token_classifier(ddata)
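    # NER tag names, indexed by the model's LABEL_<n> ids.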
    tg_inorder = ['O',
                  'B-HORMONE',
                  'B-EXP_PER',
                  'I-HORMONE',
                  'I-CANCER',
                  'I-EDC',
                  'B-RECEPTOR',
                  'B-CANCER',
                  'I-RECEPTOR',
                  'B-EDC',
                  'PAD']
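    # For each sentence: rebuild whole words from sub-tokens, merge multi-token entities,
    # then collect one tagged sentence per candidate entity pair (lstSentEnc), together with
    # the pair's entity types (lstSentbilbl) and surface forms (lstSentEnt).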
    lstSentEnc = []
    lstSentbilbl = []
    lstSentEnt = []
    for itsent in az:

        # Rebuild whole words from WordPiece sub-tokens ("##...") and keep one label per word.
        sentaz = itsent
        ph = []
        phl = []
        for e in sentaz:
            if e["word"][0] == "#" and len(ph) != 0:
                ph[-1] += e["word"][2:]
            else:
                ph += [e["word"]]
                phl += [e["entity"]]

        # Map raw labels ("LABEL_0", "LABEL_10", ...) to their BIO tag names.
        phltr = []
        for e in phl:
            phltr += [tg_inorder[int(e[-1])] if len(e) == 7 else tg_inorder[int(e[-2:])]]
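        # Merge consecutive tokens of the same entity type (B-/I- tags) into a single mention;
        # flag == 3 bridges one intervening "O" token inside an entity.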
        nwph = []
        nwphltr = []
        flag = 0
        for i in range(len(phltr)-2):
            if phltr[i] == "O" and flag != 3:
                nwph += [ph[i]]
                nwphltr += [phltr[i]]
                continue
            elif flag == 3:
                nwph[-1] += " " + ph[i]
                flag = 1
                continue
            elif phltr[i][2:] == phltr[i+1][2:] and phltr[i+1][0] == "I" and flag == 0:
                nwph += [ph[i]]
                nwphltr += [phltr[i]]
                flag = 1
                continue
            elif phltr[i][2:] == phltr[i+1][2:] and phltr[i+1][0] == "I" and flag == 1:
                nwph[-1] += " " + ph[i]
                continue

            elif phltr[i][2:] == phltr[i+2][2:] and phltr[i+1] == "O" and phltr[i+2][0] == "I" and flag == 0:
                nwph += [ph[i]]
                nwphltr += [phltr[i]]
                flag = 3
                continue
            elif phltr[i][2:] == phltr[i+2][2:] and phltr[i+1] == "O" and phltr[i+2][0] == "I" and flag == 1:
                nwph[-1] += " " + ph[i]
                flag = 3
                continue

            elif flag == 1:
                nwph[-1] += " " + ph[i]
                flag = 0
                continue
            else:
                nwph += [ph[i]]
                nwphltr += [phltr[i]]
                continue
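        # If the sentence contains at least two entities, build one copy of the sentence per
        # entity pair, wrapping the first entity in <e1></e1> and the second in <e2></e2>
        # (same-type and CANCER-RECEPTOR pairs are skipped).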
        if nwphltr.count("O") <= len(nwphltr) - 2:
            for i in range(len(nwph) - 1):
                if nwphltr[i] != "O":
                    for j in range(i, len(nwph)):
                        if nwphltr[j] != "O" and nwphltr[j] != nwphltr[i] and {nwphltr[j], nwphltr[i]} != {"B-CANCER", "B-RECEPTOR"}:
                            sen2ad = ""
                            for g in range(i):
                                sen2ad += nwph[g] + " "
                            sen2ad += "<e1>" + nwph[i] + "</e1> "

                            for t in range(i + 1, j):
                                sen2ad += nwph[t] + " "
                            sen2ad += "<e2>" + nwph[j] + "</e2>"
                            if j < len(nwph):
                                for l in range(j + 1, len(nwph)):
                                    sen2ad += " " + nwph[l]
                            lstSentEnc += [sen2ad]
                            lstSentbilbl += [[nwphltr[i], nwphltr[j]]]
                            lstSentEnt += [[nwph[i], nwph[j]]]
    st.text("Entities detected.")
    st.text("")
    st.text("Next: Relation detection ...")
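    # Classify the relation expressed between <e1> and <e2> in each tagged sentence.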
    relation_classifier = pipeline("text-classification", tokenizer=tokenizer, model=model_re)

    rrdata = lstSentEnc

    outre = relation_classifier(rrdata)
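    # Relation labels, in the order used by the relation model's LABEL_<n> outputs.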
    trLABELS = ['INCREASE_RISK(e1,e2)',
                'SPEED_UP(e2,e1)',
                'DECREASE_ACTIVITY(e1,e2)',
                'NO_ASSOCIATION(e1,e2)',
                'DECREASE(e1,e2)',
                'BLOCK(e1,e2)',
                'CAUSE(e1,e2)',
                'ACTIVATE(e2,e1)',
                'DEVELOP(e2,e1)',
                'ALTER(e1,e2)',
                'INCREASE_RISK(e2,e1)',
                'SPEED_UP(e1,e2)',
                'INTERFER(e1,e2)',
                'DECREASE(e2,e1)',
                'NO_ASSOCIATION(e2,e1)',
                'INCREASE(e2,e1)',
                'INTERFER(e2,e1)',
                'ACTIVATE(e1,e2)',
                'INCREASE(e1,e2)',
                'MIMIC(e1,e2)',
                'MIMIC(e2,e1)',
                'BLOCK(e2,e1)',
                'other',
                'BIND(e2,e1)',
                'INCREASE_ACTIVITY(e2,e1)',
                'ALTER(e2,e1)',
                'CAUSE(e2,e1)',
                'BIND(e1,e2)',
                'DEVELOP(e1,e2)',
                'DECREASE_ACTIVITY(e2,e1)']
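    # Map each predicted LABEL_<n> to its relation name, then swap the entity order of
    # "(e2,e1)" relations so that the first entity is always the subject of the triple.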
    outrelbl = []
    for e in outre:
        outrelbl += [trLABELS[int(e['label'][-1])] if len(e["label"]) == 7 else trLABELS[int(e['label'][-2:])]]

    for i in range(len(outrelbl)):
        if "(e2,e1)" in outrelbl[i]:
            lstSentbilbl[i][0], lstSentbilbl[i][1] = lstSentbilbl[i][1], lstSentbilbl[i][0]
            lstSentEnt[i][0], lstSentEnt[i][1] = lstSentEnt[i][1], lstSentEnt[i][0]
    edccan = []
    edccanbis = []

    # Keep only the pairs for which a real relation (not "other") was predicted,
    # dropping the "(e1,e2)"/"(e2,e1)" suffix from the label.
    for i in range(len(outrelbl)):
        if outrelbl[i] != "other":
            edccanbis += [[lstSentEnt[i][0], lstSentEnt[i][1], outrelbl[i][:-7], lstSentEnc[i], lstSentbilbl[i]]]

    # Post-correction of the extracted triples (see postt.postcor).
    edccanbis = postcor(edccanbis)
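    # Group the triples by entity-pair type (EDC-cancer, EDC-hormone, EDC-receptor, ...).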
    edccann = []
    edchorm = []
    edcrecep = []
    hormrecep = []
    hormcan = []

    for e in edccanbis:
        if e[-1] == ["B-EDC", "B-CANCER"]:
            edccann += [[e[0], e[1], e[2]]]

        elif e[-1] == ["B-EDC", "B-HORMONE"]:
            edchorm += [[e[0], e[1], e[2]]]

        elif e[-1] == ["B-EDC", "B-RECEPTOR"]:
            edcrecep += [[e[0], e[1], e[2]]]

        elif e[-1] == ["B-HORMONE", "B-RECEPTOR"]:
            hormrecep += [[e[0], e[1], e[2]]]

        elif e[-1] == ["B-HORMONE", "B-CANCER"]:
            hormcan += [[e[0], e[1], e[2]]]
    edcrecepdf = pd.DataFrame(edcrecep, columns=["EDC", "RECEPTOR", "RELATION"])
    edccanndf = pd.DataFrame(edccann, columns=["EDC", "CANCER", "RELATION"])
    edchormdf = pd.DataFrame(edchorm, columns=["EDC", "HORMONE", "RELATION"])
    hormrecepdf = pd.DataFrame(hormrecep, columns=["HORMONE", "RECEPTOR", "RELATION"])
    hormcandf = pd.DataFrame(hormcan, columns=["HORMONE", "CANCER", "RELATION"])
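    # Write one CSV per entity-pair type, then bundle the non-empty ones into a zip archive.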
    edccanndf.to_csv('edccan.csv', index=False)
    edcrecepdf.to_csv('edcrecep.csv', index=False)
    edchormdf.to_csv('edchorm.csv', index=False)
    hormcandf.to_csv('hormcan.csv', index=False)
    hormrecepdf.to_csv('hormrecep.csv', index=False)

    with zipfile.ZipFile("allcsvs.zip", "w") as zipf:
        if len(edccann) != 0:
            zipf.write('edccan.csv')
        if len(edcrecep) != 0:
            zipf.write('edcrecep.csv')
        if len(edchorm) != 0:
            zipf.write('edchorm.csv')
        if len(hormcan) != 0:
            zipf.write('hormcan.csv')
        if len(hormrecep) != 0:
            zipf.write('hormrecep.csv')
    # Summary table: tagged sentence, both entities with their types, and the relation.
    for e in edccanbis:
        edccan += [[e[3], e[0] + " [" + e[-1][0][2:] + "]", e[1] + " [" + e[-1][1][2:] + "]", e[2]]]

    edccandf = pd.DataFrame(edccan, columns=["Sentence", "Entity 1", "Entity 2", "Relation"])
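    # Display the extracted triples and offer them for download.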
    st.table(edccandf)
    csv = edccandf.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download all data as CSV",
        data=csv,
        file_name='Relation_triples.csv',
        mime='text/csv',
    )

    with open("allcsvs.zip", "rb") as fp:
        btn = st.download_button(
            label="Download ZIP",
            data=fp,
            file_name="SeparateCsvs.zip",
            mime="application/zip",
        )