|
import pandas as pd |
|
import re |
|
|
|
def load_indications(file_path):
    """
    Read the tab-separated indications table at ``file_path``.

    Empty cells and the literal markers ``NA`` / ``N/A`` are parsed as
    missing values.

    Returns the parsed DataFrame. On any failure (missing file, malformed
    content, ...) the error is printed and ``None`` is returned instead of
    raising, so callers must check for ``None``.
    """
    missing_markers = ['', 'NA', 'N/A']
    try:
        table = pd.read_csv(file_path, sep='\t', na_values=missing_markers)
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading file: {e}")
        return None
    else:
        return table
|
|
|
# Substring keywords checked, in insertion order, against the lower-cased
# indication. The first keyword that matches decides the organ, so the order
# of the groups below is significant.
_KEYWORD_TO_ORGAN = {
    # Bowel
    'bowel': 'Bowel',
    'intestinal': 'Bowel',
    'colon': 'Bowel',
    'colorectal': 'Bowel',
    'rectal': 'Bowel',
    'anus': 'Bowel',
    'colitis': 'Bowel',
    'crohn': 'Bowel',
    'inflammatory bowel': 'Bowel',
    'diverticulitis': 'Bowel',
    'ileus': 'Bowel',
    'short bowel': 'Bowel',

    # Lung
    'lung': 'Lung',
    'pulmonary': 'Lung',
    'respiratory': 'Lung',
    'bronch': 'Lung',
    'pneumonia': 'Lung',
    'emphysema': 'Lung',
    'asthma': 'Lung',
    'chronic obstructive pulmonary': 'Lung',
    'idiopathic pulmonary fibrosis': 'Lung',

    # Esophagus/Stomach
    'esophag': 'Esophagus/Stomach',
    'stomach': 'Esophagus/Stomach',
    'gastric': 'Esophagus/Stomach',
    'gastroesophageal': 'Esophagus/Stomach',
    'duodenal': 'Esophagus/Stomach',
    'peptic ulcer': 'Esophagus/Stomach',
    'gastritis': 'Esophagus/Stomach',
    'barrett': 'Esophagus/Stomach',

    # Pancreas
    'pancrea': 'Pancreas',
    'exocrine pancreatic': 'Pancreas',

    # Skin
    'skin': 'Skin',
    'dermat': 'Skin',
    'cutaneous': 'Skin',
    'melanoma': 'Skin',
    'psoriasis': 'Skin',
    'eczema': 'Skin',
    'acne': 'Skin',
    'rosacea': 'Skin',
    'alopecia': 'Skin',
    'vitiligo': 'Skin',
    'keratosis': 'Skin',
    'keloid': 'Skin',
    'ichthyosis': 'Skin',
    'pruritus': 'Skin',
    'urticaria': 'Skin',
    'pemphigus': 'Skin',
    'pemphigoid': 'Skin',
    'scleroderma': 'Skin',

    # Uterus
    'uter': 'Uterus',
    'endometri': 'Uterus',
    'myometri': 'Uterus',
    'leiomyoma': 'Uterus',

    # Breast
    'breast': 'Breast',
    'mammary': 'Breast',

    # Ovary/Fallopian Tube
    'ovar': 'Ovary/Fallopian Tube',
    'fallopian': 'Ovary/Fallopian Tube',
    'polycystic ovary': 'Ovary/Fallopian Tube',

    # Cervix
    'cervix': 'Cervix',
    'cervical': 'Cervix',

    # CNS/Brain
    'brain': 'CNS/Brain',
    'central nervous': 'CNS/Brain',
    'cns': 'CNS/Brain',
    'cerebr': 'CNS/Brain',
    'alzheimer': 'CNS/Brain',
    'parkinson': 'CNS/Brain',
    'glioma': 'CNS/Brain',
    'glioblastoma': 'CNS/Brain',
    'medulloblastoma': 'CNS/Brain',
    'meningioma': 'CNS/Brain',
    'epilepsy': 'CNS/Brain',
    'dementia': 'CNS/Brain',
    'encephalitis': 'CNS/Brain',
    'encephalopathy': 'CNS/Brain',
    'huntington': 'CNS/Brain',
    'multiple sclerosis': 'CNS/Brain',
    'stroke': 'CNS/Brain',

    # Liver
    'liver': 'Liver',
    'hepat': 'Liver',
    'biliary': 'Liver',
    'cholang': 'Liver',
    'cirrhosis': 'Liver',
    'fatty liver': 'Liver',

    # Kidney
    'kidney': 'Kidney',
    'renal': 'Kidney',
    'nephro': 'Kidney',
    'nephri': 'Kidney',
    'glomerulo': 'Kidney',
    'polycystic kidney': 'Kidney',

    # Peripheral Nervous System
    'peripheral nervous': 'Peripheral Nervous System',
    'neuropathy': 'Peripheral Nervous System',
    'neuralgia': 'Peripheral Nervous System',

    # Vulva/Vagina
    'vulva': 'Vulva/Vagina',
    'vagina': 'Vulva/Vagina',
    'vulvar': 'Vulva/Vagina',
    'vaginal': 'Vulva/Vagina',
    'vaginitis': 'Vulva/Vagina',

    # Bladder/Urinary Tract
    'bladder': 'Bladder/Urinary Tract',
    'urinary': 'Bladder/Urinary Tract',
    'urethral': 'Bladder/Urinary Tract',
    'urethra': 'Bladder/Urinary Tract',
    'cystitis': 'Bladder/Urinary Tract',
}

# Indications (exact, case-sensitive match) that are too broad to be pinned
# to a single organ; they are never mapped, whatever `areas` says.
_TOO_GENERAL = frozenset({
    'Neoplasms', 'Inflammation', 'Pain', 'Infection', 'Fever',
    'Autoimmune Diseases', 'Immune System Diseases',
    'Nervous System Diseases', 'Metabolic Diseases',
    'Arthritis', 'Diabetes Mellitus', 'Hypertension',
    'Depression', 'Anxiety Disorders', 'HIV Infections',
    'Sepsis', 'Wound Healing', 'Cachexia', 'Fatigue',
    'Nausea', 'Vomiting', 'Multiple Myeloma', 'Leukemia', 'Lymphoma',
})

# Exact (case-sensitive) indication names with a hand-curated organ.
_SPECIFIC_DISEASE_TO_ORGAN = {
    'Ulcerative Colitis': 'Bowel',
    'Crohn Disease': 'Bowel',
    'Asthma': 'Lung',
    'Psoriasis': 'Skin',
    'Lupus Erythematosus, Cutaneous': 'Skin',
    'Lupus Nephritis': 'Kidney',
    'Alzheimer Disease': 'CNS/Brain',
    'Parkinson Disease': 'CNS/Brain',
    'Amyotrophic Lateral Sclerosis': 'CNS/Brain',
    'Huntington Disease': 'CNS/Brain',
    'Non-alcoholic Fatty Liver Disease': 'Liver',
    'Carcinoma, Non-Small-Cell Lung': 'Lung',
    'Small Cell Lung Carcinoma': 'Lung',
    'Liver Cirrhosis': 'Liver',
    'Liver Neoplasms': 'Liver',
    'Liver Cirrhosis, Biliary': 'Liver',
    'Breast Neoplasms': 'Breast',
    'Ovarian Neoplasms': 'Ovary/Fallopian Tube',
    'Brain Neoplasms': 'CNS/Brain',
    'Brain Injuries': 'CNS/Brain',
    'Brain Injuries, Traumatic': 'CNS/Brain',
    'Brain Ischemia': 'CNS/Brain',
    'Pancreatic Neoplasms': 'Pancreas',
    'Urinary Bladder Neoplasms': 'Bladder/Urinary Tract',
    'Stomach Neoplasms': 'Esophagus/Stomach',
    'Uterine Cervical Neoplasms': 'Cervix',
    'Endometrial Neoplasms': 'Uterus',
    'Kidney Neoplasms': 'Kidney',
    'Colorectal Neoplasms': 'Bowel',
    'Inflammatory Bowel Diseases': 'Bowel',
    'COVID-19': 'Lung',
    'Severe Acute Respiratory Syndrome': 'Lung',
    'Pulmonary Disease, Chronic Obstructive': 'Lung',
    'Idiopathic Pulmonary Fibrosis': 'Lung',
    'Respiratory Distress Syndrome': 'Lung',
    'Esophageal Neoplasms': 'Esophagus/Stomach',
    'Acne Vulgaris': 'Skin',
    'Psoriatic Arthritis': 'Skin',
    'Alopecia': 'Skin',
    'Alopecia Areata': 'Skin',
    'Vitiligo': 'Skin',
    'Rosacea': 'Skin',
    'Hidradenitis Suppurativa': 'Skin',
    'Pemphigus': 'Skin',
    'Pemphigoid, Bullous': 'Skin',
    'Scleroderma, Systemic': 'Skin',
    'Scleroderma, Diffuse': 'Skin',
    'Scleroderma, Limited': 'Skin',
    'Fallopian Tube Neoplasms': 'Ovary/Fallopian Tube',
    'Polycystic Ovary Syndrome': 'Ovary/Fallopian Tube',
    'Endometriosis': 'Uterus',
    'Leiomyoma': 'Uterus',
    'Carcinoma, Squamous Cell': 'Skin',
    'Neoplasms, Basal Cell': 'Skin',
    'Melanoma': 'Skin',
    'Glioblastoma': 'CNS/Brain',
    'Medulloblastoma': 'CNS/Brain',
    'Irritable Bowel Syndrome': 'Bowel',
    'Diabetic Nephropathies': 'Kidney',
    'Diabetic Neuropathies': 'Peripheral Nervous System',
    'Multiple Sclerosis': 'CNS/Brain',
}

# Ordered (terms, organ) rules used when only the therapeutic area hints at
# the organ system; the first rule with a matching term wins.
_DIGESTIVE_FALLBACK = (
    (('bowel', 'colon', 'rectal', 'intestin'), 'Bowel'),
    (('liver', 'hepat', 'biliary'), 'Liver'),
    (('pancrea',), 'Pancreas'),
    (('stomach', 'gastric', 'esophag'), 'Esophagus/Stomach'),
)

_ONCOLOGY_FALLBACK = (
    (('lung', 'pulmonary', 'bronch'), 'Lung'),
    (('brain', 'glioma', 'cerebr'), 'CNS/Brain'),
    (('liver', 'hepatic'), 'Liver'),
    (('skin', 'melanoma', 'basal cell'), 'Skin'),
    (('breast', 'mammary'), 'Breast'),
    (('colon', 'rectal', 'colorectal'), 'Bowel'),
    (('pancreas', 'pancreatic'), 'Pancreas'),
    (('kidney', 'renal'), 'Kidney'),
    (('bladder', 'urinary'), 'Bladder/Urinary Tract'),
    (('cervix', 'cervical'), 'Cervix'),
    (('uterus', 'uterine', 'endometri'), 'Uterus'),
    (('ovary', 'ovarian', 'fallopian'), 'Ovary/Fallopian Tube'),
)

# Short terms that produce false positives as raw substrings
# ("tetANUS", "adRENAL", "deLIVERy"); they must begin a word to count.
# Other keywords stay plain substrings so combining forms such as
# "pyeloNEPHRItis" or "gastroINTESTINAL" keep matching.
_WORD_START_PATTERNS = {
    term: re.compile(r'\b' + re.escape(term))
    for term in ('anus', 'liver', 'renal')
}


def _has_term(text, term):
    """Return True if lower-cased ``text`` contains ``term`` under its matching rule."""
    pattern = _WORD_START_PATTERNS.get(term)
    if pattern is None:
        return term in text
    return pattern.search(text) is not None


def _first_matching_organ(text, rules):
    """Return the organ of the first (terms, organ) rule matching ``text``, else None."""
    for terms, organ in rules:
        if any(_has_term(text, term) for term in terms):
            return organ
    return None


def map_indication_to_organ(indication, areas=None):
    """
    Map a medical indication to an organ.

    Resolution order:
      1. Non-string, empty, or "too general" indications are never mapped.
      2. Exact match against the curated disease table.
      3. First keyword (in table order) found in the lower-cased indication.
      4. If ``areas`` is given, a coarse fallback keyed on the therapeutic
         area ('digestive', 'respiratory', 'neurology', 'dermatology',
         'oncology'); only the first area found, in that order, is used.

    Possible results: 'Bowel', 'Lung', 'Esophagus/Stomach', 'Pancreas',
    'Skin', 'Uterus', 'Breast', 'Ovary/Fallopian Tube', 'Cervix',
    'CNS/Brain', 'Liver', 'Kidney', 'Peripheral Nervous System',
    'Vulva/Vagina', 'Bladder/Urinary Tract'.

    Args:
        indication: Indication name (e.g. a MeSH term). Surrounding
            whitespace is ignored; non-string values such as NaN yield None.
        areas: Optional therapeutic-area value; it is stringified and
            searched case-insensitively.

    Returns the organ name if a confident mapping exists, otherwise None.
    """
    # Missing values (NaN/None) and other non-strings cannot be mapped.
    if not isinstance(indication, str):
        return None

    indication = indication.strip()
    if not indication or indication in _TOO_GENERAL:
        return None

    organ = _SPECIFIC_DISEASE_TO_ORGAN.get(indication)
    if organ is not None:
        return organ

    indication_lower = indication.lower()
    for keyword, organ in _KEYWORD_TO_ORGAN.items():
        if _has_term(indication_lower, keyword):
            return organ

    if not areas:
        return None

    # Only the first recognised area is consulted; if its rules do not match,
    # the result is None rather than trying the remaining areas.
    areas_lower = str(areas).lower()
    if 'digestive' in areas_lower:
        return _first_matching_organ(indication_lower, _DIGESTIVE_FALLBACK)
    if 'respiratory' in areas_lower:
        return 'Lung'
    if 'neurology' in areas_lower:
        if 'peripheral' in indication_lower:
            return 'Peripheral Nervous System'
        return 'CNS/Brain'
    if 'dermatology' in areas_lower:
        return 'Skin'
    if 'oncology' in areas_lower:
        return _first_matching_organ(indication_lower, _ONCOLOGY_FALLBACK)

    return None
|
|
|
def main(file_path='../genetic_support/data/indic.tsv'):
    """
    Map every indication in the input TSV to an organ and write the results.

    Reads ``file_path`` with ``load_indications``, drops rows without an
    ``indication_mesh_term``, maps each remaining row with
    ``map_indication_to_organ`` (passing the row's ``areas`` value when that
    column exists), prints a summary, and writes two TSV files to the
    current directory:

      * ``indication_to_organ_mapping_confident.tsv`` - mapped rows only
      * ``indication_to_organ_mapping_all.tsv`` - every row, mapped or not

    Args:
        file_path: Path of the tab-separated indications file.
    """
    indications_df = load_indications(file_path)

    # The loader reports its own error and returns None; bail out before
    # touching the frame instead of failing on a None attribute access.
    if indications_df is None:
        return

    if 'indication_mesh_term' not in indications_df.columns:
        print("Error: input file has no 'indication_mesh_term' column.")
        return

    indications_df = indications_df.dropna(subset=['indication_mesh_term'])

    # A missing 'areas' column is treated as "no area information".
    if 'areas' in indications_df.columns:
        areas = indications_df['areas']
    else:
        areas = [None] * len(indications_df)

    # A plain list also behaves for an empty frame, where
    # DataFrame.apply(axis=1) returns a frame instead of a Series.
    indications_df['organ'] = [
        map_indication_to_organ(term, area)
        for term, area in zip(indications_df['indication_mesh_term'], areas)
    ]

    all_mappings = indications_df[['indication_mesh_term', 'organ']].copy()
    mapped_df = all_mappings[all_mappings['organ'].notnull()]

    print(f"Successfully mapped {len(mapped_df)} out of {len(indications_df)} indications to organs.")
    print(mapped_df.head(20))

    mapped_df.to_csv('indication_to_organ_mapping_confident.tsv', index=False, sep='\t')
    all_mappings.to_csv('indication_to_organ_mapping_all.tsv', index=False, sep='\t')
    print("Mappings saved to 'indication_to_organ_mapping_confident.tsv'")

    if not mapped_df.empty:
        organ_counts = mapped_df['organ'].value_counts()
        print("\nDistribution of indications across organs:")
        print(organ_counts)

    unmapped = indications_df[indications_df['organ'].isnull()]
    print(f"\nNumber of indications that couldn't be confidently mapped: {len(unmapped)}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()