Spaces:
Sleeping
Sleeping
import xml.etree.ElementTree as ET | |
import re | |
class PubmedXmlParse:
    """Convert PubMed XML (``PubmedArticle`` records) into plain dictionaries.

    Every scalar field is returned as a string; a missing or empty element
    yields ``""`` (never ``None``), and list fields yield ``[]``.
    """

    # Matches any tag-like ``<...>`` run; compiled once instead of per call.
    _TAG_RE = re.compile(r"<.*?>")

    def __init__(self):
        pass

    def remove_xml_tags(self, text):
        """Remove XML tags from *text* and return the remaining plain text."""
        return self._TAG_RE.sub("", text)

    @staticmethod
    def _find_text(elem, path):
        """Return ``.text`` of the first match of *path* under *elem*.

        Returns ``""`` when *elem* is ``None``, when nothing matches, or when
        the matched element has no text.
        """
        if elem is None:
            return ""
        found = elem.find(path)
        if found is None or found.text is None:
            return ""
        return found.text

    @staticmethod
    def _inner_text(elem):
        """Return all text inside *elem*, including text nested in child tags.

        ``Element.text`` stops at the first child element, which truncates
        content carrying inline markup such as ``<i>`` or ``<sub>``;
        ``itertext()`` walks the whole subtree and yields unescaped text.
        """
        if elem is None:
            return ""
        return "".join(elem.itertext())

    @staticmethod
    def _own_article_id(elem, id_type):
        """Return the first non-empty ``ArticleId`` of *id_type* under *elem*.

        ``ArticleIdList`` elements that sit directly inside a ``Reference``
        are skipped so an identifier of a cited paper is never reported as the
        article's own. Returns ``""`` when no identifier is found.
        """
        if elem is None:
            return ""
        cited_lists = set(elem.findall(".//Reference/ArticleIdList"))
        for id_list in elem.iter("ArticleIdList"):
            if id_list in cited_lists:
                continue
            for article_id in id_list.findall("ArticleId"):
                if article_id.get("IdType") == id_type and article_id.text:
                    return article_id.text
        return ""

    def parse_pubmed_xml(self, xml_data):
        """Parse a PubMed XML document and return one dict per article.

        Args:
            xml_data: XML as ``str`` or ``bytes``, passed to
                ``ET.fromstring``.

        Returns:
            A list of dictionaries with the keys ``pmid``, ``pmcid``,
            ``title``, ``abstract``, ``journal``, ``authors``, ``pub_date``,
            ``keywords``, ``doi``, ``mesh_terms`` and ``references``.

        Raises:
            xml.etree.ElementTree.ParseError: if *xml_data* is not
                well-formed XML.
        """
        root = ET.fromstring(xml_data)
        # ``iter`` also yields the root itself, so a document whose root is a
        # single PubmedArticle is handled as well as a PubmedArticleSet.
        return [self._parse_article(node) for node in root.iter("PubmedArticle")]

    def _parse_article(self, article):
        """Build the result dictionary for one ``PubmedArticle`` element."""
        text_of = self._find_text
        medline = article.find("MedlineCitation")
        article_node = medline.find("Article") if medline is not None else None

        # Title: gather nested text directly so inline markup is dropped and
        # entities such as ``&amp;`` come back unescaped.
        title = self._inner_text(article.find(".//ArticleTitle")).strip()

        # Prefer the article's own id list; fall back to MedlineCitation/PMID.
        pmid = self._own_article_id(article, "pubmed") or text_of(medline, "PMID")

        # Structured abstracts have several AbstractText sections; join them.
        abstract = " ".join(
            self._inner_text(section)
            for section in article.findall(".//AbstractText")
        )

        # Author information.
        authors = [
            {
                "lastname": text_of(author, ".//LastName"),
                "forename": text_of(author, ".//ForeName"),
                "initials": text_of(author, ".//Initials"),
                "affiliation": text_of(author, ".//AffiliationInfo/Affiliation"),
            }
            for author in article.findall(".//Author")
        ]

        issue_path = ".//Journal/JournalIssue"
        year = text_of(article, issue_path + "/PubDate/Year")
        journal = {
            "issn": text_of(article, ".//Journal/ISSN"),
            "title": text_of(article, ".//Journal/Title"),
            "abbreviation": text_of(article, ".//Journal/ISOAbbreviation"),
            "startPage": text_of(article, ".//Pagination/StartPage"),
            "endPage": text_of(article, ".//Pagination/EndPage"),
            "volume": text_of(article, issue_path + "/Volume"),
            "issue": text_of(article, issue_path + "/Issue"),
            "year": year,
        }

        if medline is not None:
            keywords = [
                self._inner_text(keyword)
                for keyword in medline.findall(".//KeywordList/Keyword")
            ]
            mesh_terms = [
                self.parse_mesh(heading)
                for heading in medline.findall("MeshHeadingList/MeshHeading")
            ]
        else:
            # No MedlineCitation: return empty lists instead of raising.
            keywords = []
            mesh_terms = []

        references = article.findall(".//PubmedData/ReferenceList/Reference")

        return {
            "pmid": pmid,
            "pmcid": self._own_article_id(article, "pmc"),
            "title": title,
            "abstract": abstract,
            "journal": journal,
            "authors": authors,
            "pub_date": {
                "year": year,
                "month": text_of(article, issue_path + "/PubDate/Month"),
                "day": text_of(article, issue_path + "/PubDate/Day"),
            },
            "keywords": keywords,
            "doi": self.parse_doi(article_node, article),
            "mesh_terms": mesh_terms,
            "references": [self.parse_reference(ref) for ref in references],
        }

    def parse_doi(self, article, article_elem) -> str:
        """Return the article's DOI, or ``""`` when none is available.

        Args:
            article: the ``Article`` element (may be ``None``); its
                ``ELocationID[@EIdType='doi']`` is tried first.
            article_elem: the enclosing ``PubmedArticle`` element (may be
                ``None``); its ``ArticleId[@IdType='doi']`` is the fallback.
                DOIs belonging to cited references are ignored.

        Returns:
            The DOI string; always a ``str``.
        """
        doi = self._find_text(article, ".//ELocationID[@EIdType='doi']")
        if doi:
            return doi
        # An empty or missing ELocationID falls through to the id list.
        return self._own_article_id(article_elem, "doi")

    def parse_mesh(self, mesh_elem):
        """Parse one ``MeshHeading`` element.

        Returns:
            ``{"descriptor": str, "qualifiers": [str, ...]}``.
        """
        return {
            "descriptor": self._find_text(mesh_elem, ".//DescriptorName"),
            # Each match already is a QualifierName; read its text directly.
            "qualifiers": [
                qualifier.text or ""
                for qualifier in mesh_elem.findall(".//QualifierName")
            ],
        }

    def parse_reference(self, reference_elem):
        """Parse one ``Reference`` element.

        Returns:
            A dict with the keys ``citation``, ``doi``, ``pmid`` and
            ``pmcid``; missing values are ``""``.
        """
        text_of = self._find_text
        # Accept both spellings of the PMC id type used in the data.
        pmcid = text_of(
            reference_elem, ".//ArticleId[@IdType='pmcid']"
        ) or text_of(reference_elem, ".//ArticleId[@IdType='pmc']")
        return {
            # Citations may carry inline markup, so gather nested text too.
            "citation": self._inner_text(reference_elem.find("Citation")),
            "doi": text_of(reference_elem, ".//ArticleId[@IdType='doi']"),
            "pmid": text_of(reference_elem, ".//ArticleId[@IdType='pubmed']"),
            "pmcid": pmcid,
        }