Spaces:

jackkuo
/

streamlit-macp-agents

Sleeping

App Files Files Community

streamlit-macp-agents / python-services /Retrieve /service /pubmed_xml_parse.py

jackkuo

add QA

79899c0 12 days ago

raw

history blame contribute delete

9.32 kB

	import xml.etree.ElementTree as ET
	import re


	class PubmedXmlParse:
	    """Convert PubMed article XML into plain Python dictionaries.

	    The main entry point is :meth:`parse_pubmed_xml`; the ``parse_*`` helpers
	    each handle one sub-structure (DOI, MeSH heading, reference) and can be
	    called on their own with the matching ``Element``.
	    """

	    def __init__(self):
	        pass

	    def remove_xml_tags(self, text):
	        """Remove XML tags from *text* and return the remaining plain text."""
	        return re.sub(r'<.*?>', '', text)

	    @staticmethod
	    def _text(parent, path):
	        """Return ``.text`` of the first *path* match under *parent*, or ``""``.

	        ``""`` is returned when *parent* is ``None``, when nothing matches,
	        or when the matched element is empty, so callers always get a str.
	        """
	        if parent is None:
	            return ""
	        node = parent.find(path)
	        if node is None or node.text is None:
	            return ""
	        return node.text

	    @staticmethod
	    def _full_text(node):
	        """Return all text inside *node*, including text nested in child tags.

	        ``Element.text`` stops at the first child element, so content such as
	        ``H<sub>2</sub>O`` would be cut to ``H``; ``itertext`` keeps all of it
	        and yields already-unescaped characters (``&amp;`` -> ``&``).
	        """
	        if node is None:
	            return ""
	        return "".join(node.itertext())

	    def parse_pubmed_xml(self, xml_data):
	        """Parse an XML document and return one dict per ``PubmedArticle``.

	        Args:
	            xml_data: XML document as ``str`` or ``bytes``.

	        Returns:
	            A list of dictionaries with the keys ``pmid``, ``pmcid``,
	            ``title``, ``abstract``, ``journal``, ``authors``, ``pub_date``,
	            ``keywords``, ``doi``, ``mesh_terms`` and ``references``.
	            Missing scalar values are ``""``; missing lists are ``[]``.

	        Raises:
	            xml.etree.ElementTree.ParseError: if *xml_data* is not
	                well-formed XML.
	        """
	        root = ET.fromstring(xml_data)
	        # ``iter`` (unlike ``findall(".//...")``) also yields the root itself,
	        # so a document whose root is a single PubmedArticle is handled too.
	        return [
	            self._parse_article(article)
	            for article in root.iter("PubmedArticle")
	        ]

	    def _parse_article(self, article):
	        """Build the result dictionary for one ``PubmedArticle`` element."""
	        text = self._text
	        own_id = ".//PubmedData/ArticleIdList/ArticleId[@IdType='%s']"
	        pub_date = ".//Journal/JournalIssue/PubDate/"

	        # May be None for incomplete records; every use below tolerates that.
	        medline = article.find("MedlineCitation")
	        medline_article = medline.find("Article") if medline is not None else None

	        title = self._full_text(article.find(".//ArticleTitle")).strip()

	        # Look only at the article's own id list: an unscoped ".//ArticleId"
	        # can also match the ids of cited references.
	        pmid = text(article, own_id % "pubmed") or text(article, "MedlineCitation/PMID")

	        abstract = " ".join(
	            self._full_text(part) for part in article.findall(".//AbstractText")
	        )

	        authors = [
	            {
	                "lastname": text(author, ".//LastName"),
	                "forename": text(author, ".//ForeName"),
	                "initials": text(author, ".//Initials"),
	                "affiliation": text(author, ".//AffiliationInfo/Affiliation"),
	            }
	            for author in article.findall(".//Author")
	        ]

	        year = text(article, pub_date + "Year")
	        journal = {
	            "issn": text(article, ".//Journal/ISSN"),
	            "title": text(article, ".//Journal/Title"),
	            "abbreviation": text(article, ".//Journal/ISOAbbreviation"),
	            "startPage": text(article, ".//Pagination/StartPage"),
	            "endPage": text(article, ".//Pagination/EndPage"),
	            "volume": text(article, ".//Journal/JournalIssue/Volume"),
	            "issue": text(article, ".//Journal/JournalIssue/Issue"),
	            "year": year,
	        }

	        if medline is not None:
	            keyword_nodes = medline.findall(".//KeywordList/Keyword")
	            mesh_nodes = medline.findall("MeshHeadingList/MeshHeading")
	        else:
	            keyword_nodes = []
	            mesh_nodes = []

	        references = article.findall(".//PubmedData/ReferenceList/Reference")

	        return {
	            "pmid": pmid,
	            "pmcid": text(article, own_id % "pmc"),
	            "title": title,
	            "abstract": abstract,
	            "journal": journal,
	            "authors": authors,
	            "pub_date": {
	                "year": year,
	                "month": text(article, pub_date + "Month"),
	                "day": text(article, pub_date + "Day"),
	            },
	            "keywords": [self._full_text(k) for k in keyword_nodes],
	            "doi": self.parse_doi(medline_article, article),
	            "mesh_terms": [self.parse_mesh(m) for m in mesh_nodes],
	            "references": [self.parse_reference(r) for r in references],
	        }

	    def parse_doi(self, article, article_elem) -> str:
	        """Return the article's DOI, or ``""`` when none is present.

	        Args:
	            article: the ``Article`` element (may be ``None``); its
	                ``ELocationID[@EIdType='doi']`` is tried first.
	            article_elem: the enclosing ``PubmedArticle`` element; its own
	                ``PubmedData/ArticleIdList`` is used as a fallback.

	        Returns:
	            The first non-empty DOI found, otherwise ``""``.
	        """
	        candidates = (
	            (article, ".//ELocationID[@EIdType='doi']"),
	            # Scoped to PubmedData/ArticleIdList so that a DOI belonging to a
	            # cited reference is never reported as the article's DOI.
	            (article_elem, ".//PubmedData/ArticleIdList/ArticleId[@IdType='doi']"),
	        )
	        for parent, path in candidates:
	            doi = self._text(parent, path)
	            if doi:
	                return doi
	        return ""

	    def parse_mesh(self, mesh_elem):
	        """Parse one ``MeshHeading`` element.

	        Returns:
	            ``{"descriptor": str, "qualifiers": list[str]}``.
	        """
	        return {
	            "descriptor": self._text(mesh_elem, ".//DescriptorName"),
	            # Each match is the QualifierName element itself, so read its
	            # text directly instead of searching inside it.
	            "qualifiers": [
	                qualifier.text or ""
	                for qualifier in mesh_elem.findall(".//QualifierName")
	            ],
	        }

	    def parse_reference(self, reference_elem):
	        """Parse one ``Reference`` element.

	        Returns:
	            A dict with the keys ``citation``, ``doi``, ``pmid`` and
	            ``pmcid``; absent values are ``""``.
	        """
	        id_path = ".//ArticleId[@IdType='%s']"
	        text = self._text
	        return {
	            "citation": self._full_text(reference_elem.find("Citation")),
	            "doi": text(reference_elem, id_path % "doi"),
	            "pmid": text(reference_elem, id_path % "pubmed"),
	            # The article-level lookup uses IdType 'pmc'; accept both
	            # spellings here so references are read consistently.
	            "pmcid": (
	                text(reference_elem, id_path % "pmcid")
	                or text(reference_elem, id_path % "pmc")
	            ),
	        }