import xml.etree.ElementTree as ET import re class PubmedXmlParse: def __init__(self): pass def remove_xml_tags(self, text): """移除XML标签,返回纯文本""" clean = re.compile('<.*?>') return re.sub(clean, '', text) # 解析 XML 数据 def parse_pubmed_xml(self, xml_data): tree = ET.ElementTree(ET.fromstring(xml_data)) root = tree.getroot() articles = [] # 遍历每个 PubmedArticle 元素 for article in root.findall(".//PubmedArticle"): # 提取文章信息 article_title_elem = article.find(".//ArticleTitle") article_title = "" if article_title_elem is not None: # Convert element to string and decode to handle tags title_text = ET.tostring(article_title_elem, encoding='unicode', method='xml') # Remove the ArticleTitle tags but keep inner content and tags title_text = title_text.replace('', '').replace('', '') # Remove all XML tags to get plain text article_title = self.remove_xml_tags(title_text).strip() pmid = ( article.find(".//ArticleId[@IdType='pubmed']").text if article.find(".//ArticleId[@IdType='pubmed']") is not None else "" ) abstract_texts = article.findall(".//AbstractText") abstract_text = ( " ".join( [ abstract.text if abstract.text is not None else "" for abstract in abstract_texts ] ) if abstract_texts else "" ) # 提取作者信息 authors = [] for author in article.findall(".//Author"): authors.append( { "lastname": ( author.find(".//LastName").text if author.find(".//LastName") is not None else "" ), "forename": ( author.find(".//ForeName").text if author.find(".//ForeName") is not None else "" ), "initials": ( author.find(".//Initials").text if author.find(".//Initials") is not None else "" ), "affiliation": ( author.find(".//AffiliationInfo/Affiliation").text if author.find(".//AffiliationInfo/Affiliation") is not None else "" ), } ) journal = { "issn": ( article.find(".//Journal/ISSN").text if article.find(".//Journal/ISSN") is not None else "" ), "title": ( article.find(".//Journal/Title").text if article.find(".//Journal/Title") is not None else "" ), "abbreviation": ( article.find(".//Journal/ISOAbbreviation").text if article.find(".//Journal/ISOAbbreviation") is not None else "" ), "startPage": ( article.find(".//Pagination/StartPage").text if article.find(".//Pagination/StartPage") is not None else "" ), "endPage": ( article.find(".//Pagination/EndPage").text if article.find(".//Pagination/EndPage") is not None else "" ), "volume": ( article.find(".//Journal/JournalIssue/Volume").text if article.find(".//Journal/JournalIssue/Volume") is not None else "" ), "issue": ( article.find(".//Journal/JournalIssue/Issue").text if article.find(".//Journal/JournalIssue/Issue") is not None else "" ), "year": ( article.find(".//Journal/JournalIssue/PubDate/Year").text if article.find(".//Journal/JournalIssue/PubDate/Year") is not None else "" ), } medline = article.find("MedlineCitation") references = article.findall(".//PubmedData/ReferenceList/Reference") # 将每篇文章的信息添加到列表中 articles.append( { "pmid": pmid, "pmcid": ( article.find( ".//PubmedData/ArticleIdList/ArticleId[@IdType='pmc']" ).text if article.find( ".//PubmedData/ArticleIdList/ArticleId[@IdType='pmc']" ) is not None else "" ), "title": article_title, "abstract": abstract_text, "journal": journal, "authors": authors, "pub_date": { "year": ( article.find(".//Journal/JournalIssue/PubDate/Year").text if article.find(".//Journal/JournalIssue/PubDate/Year") is not None else "" ), "month": ( article.find(".//Journal/JournalIssue/PubDate/Month").text if article.find(".//Journal/JournalIssue/PubDate/Month") is not None else "" ), "day": ( article.find(".//Journal/JournalIssue/PubDate/Day").text if article.find(".//Journal/JournalIssue/PubDate/Day") is not None else "" ), }, "keywords": ( [k.text for k in medline.findall(".//KeywordList/Keyword")] if medline.findall(".//KeywordList/Keyword") is not None else "" ), "doi": self.parse_doi(medline.find("Article"), article), "mesh_terms": [ self.parse_mesh(m) for m in medline.findall("MeshHeadingList/MeshHeading") ], "references": [self.parse_reference(r) for r in references], } ) return articles def parse_doi(self, article, article_elem) -> str: if article.find(".//ELocationID[@EIdType='doi']") is not None: doi = article.find(".//ELocationID[@EIdType='doi']").text if doi is not None and doi != "": return doi elif article_elem.find(".//ArticleIdList/ArticleId[@IdType='doi']") is not None: doi = article_elem.find(".//ArticleIdList/ArticleId[@IdType='doi']").text if doi is not None and doi != "": return doi else: return "" def parse_mesh(self, mesh_elem): """解析MeSH主题词""" return { "descriptor": ( mesh_elem.find(".//DescriptorName").text if mesh_elem.find(".//DescriptorName") is not None else "" ), "qualifiers": [ ( q.find(".//QualifierName").text if q.find(".//QualifierName") is not None else "" ) for q in mesh_elem.findall(".//QualifierName") ], } def parse_reference(self, reference_elem): """解析参考文献""" return { "citation": ( reference_elem.find("Citation").text if reference_elem.find("Citation") is not None else "" ), "doi": ( reference_elem.find(".//ArticleId[@IdType='doi']").text if reference_elem.find(".//ArticleId[@IdType='doi']") is not None else "" ), "pmid": ( reference_elem.find(".//ArticleId[@IdType='pubmed']").text if reference_elem.find(".//ArticleId[@IdType='pubmed']") is not None else "" ), "pmcid": ( reference_elem.find(".//ArticleId[@IdType='pmcid']").text if reference_elem.find(".//ArticleId[@IdType='pmcid']") is not None else "" ), }