Spaces:

haotle
/

LibTesting

Running

LibTesting / tools /sourceformat.py

T Le

Upload updated files

c837e02 22 days ago

9.51 kB

	from io import StringIO, BytesIO
	import pymarc
	import requests
	import string
	import pandas as pd
	import tarfile
	try:
	from lxml import etree as ET
	except ImportError:
	import xml.etree.ElementTree as ET

	#metadata for htrc worksets
	def htrc(self):
	    """Attach MARC subject keywords to every volume of an HTRC workset.

	    For each HathiTrust volume id in the ``htid`` column, the full catalog
	    record is requested from the HathiTrust volumes API, its embedded
	    MARC-XML is parsed with pymarc, and the values of all ``650`` fields
	    are collected. Values containing ``http`` or ``ocolc`` (compared
	    case-insensitively) are skipped; punctuation is stripped from the
	    remaining values and each one is followed by ``"; "``.

	    Args:
	        self: Table of volumes with an ``htid`` column. It is indexed like
	            a pandas DataFrame here (assumed; confirm against the caller).

	    Returns:
	        The same object that was passed in, with a new ``Keywords`` column
	        holding one keyword string per volume.

	    Raises:
	        requests.exceptions.Timeout: If the catalog does not answer within
	            30 seconds.
	        KeyError, IndexError: If the catalog response lacks the expected
	            ``items`` / ``records`` entries.
	    """
	    # Build the punctuation-stripping table once instead of once per term.
	    strip_punctuation = str.maketrans('', '', string.punctuation)

	    # One keyword string per volume, in the same order as the htids.
	    keylist = []

	    for htid in self['htid'].values.tolist():
	        # Bound the wait so a single stalled request cannot hang the run.
	        response = requests.get(
	            "https://catalog.hathitrust.org/api/volumes/full/htid/"+htid+".json",
	            timeout=30,
	        )
	        extradata = response.json()

	        # The item points at its parent record, which carries the MARC-XML.
	        recid = extradata['items'][0]['fromRecord']
	        xmlmarc = extradata['records'][recid]['marc-xml']

	        with StringIO(xmlmarc) as xml:
	            marc = pymarc.parse_xml_to_array(xml)[0]

	        # Start from an empty string for every volume: a string shared
	        # across iterations would make each row repeat the keywords of
	        # all volumes processed before it.
	        keywords = ""
	        for term in marc.get_fields('650'):
	            value = term.value()
	            lowered = value.lower()
	            if "http" in lowered or "ocolc" in lowered:
	                continue
	            keywords += value.translate(strip_punctuation) + "; "
	        keylist.append(keywords)

	    self['Keywords'] = keylist
	    return self

	def htrcxtra(self):
	    """Attach the MARC ``350`` field text to every volume of an HTRC workset.

	    For each HathiTrust volume id in the ``htid`` column, the full catalog
	    record is requested from the HathiTrust volumes API, its embedded
	    MARC-XML is parsed with pymarc, and the values of all ``350`` fields
	    are concatenated without a separator.

	    Args:
	        self: Table of volumes with an ``htid`` column. It is indexed like
	            a pandas DataFrame here (assumed; confirm against the caller).

	    Returns:
	        The same object that was passed in, with a new ``pages`` column
	        holding one string per volume.

	    Raises:
	        requests.exceptions.Timeout: If the catalog does not answer within
	            30 seconds.
	        KeyError, IndexError: If the catalog response lacks the expected
	            ``items`` / ``records`` entries.
	    """
	    # One entry per volume, in the same order as the htids.
	    pagecount = []

	    for htid in self['htid'].values.tolist():
	        # Bound the wait so a single stalled request cannot hang the run.
	        response = requests.get(
	            "https://catalog.hathitrust.org/api/volumes/full/htid/"+htid+".json",
	            timeout=30,
	        )
	        extradata = response.json()

	        # The item points at its parent record, which carries the MARC-XML.
	        recid = extradata['items'][0]['fromRecord']
	        xmlmarc = extradata['records'][recid]['marc-xml']

	        with StringIO(xmlmarc) as xml:
	            marc = pymarc.parse_xml_to_array(xml)[0]

	        # NOTE(review): MARC 21 lists 350 as the (obsolete) price field,
	        # while page extent normally lives in 300 -- confirm that 350 is
	        # really the tag intended for the "pages" column.
	        #
	        # The text is rebuilt from scratch for every volume; a string
	        # shared across iterations would make each row repeat the values
	        # of all volumes processed before it.
	        pages = "".join(term.value() for term in marc.get_fields('350'))
	        pagecount.append(pages)

	    self['pages'] = pagecount
	    return self


	#format files from dimensions
	def dim(file):
	    """Drop the first column and promote the first data row to the header.

	    The table is written to CSV text without its header or index and then
	    parsed again, so the first remaining row becomes the column names and
	    pandas re-infers the type of every column. Presumably meant for
	    Dimensions exports whose real header sits in the first data row.

	    Args:
	        file: pandas DataFrame to reformat.

	    Returns:
	        A new pandas DataFrame built from the re-parsed CSV text.
	    """
	    # Remove the leading column by label.
	    trimmed = file.drop(columns=file.columns[[0]])

	    # Serialise only the cell values, so the first row lands on the line
	    # that read_csv treats as the header.
	    buffer = StringIO()
	    trimmed.to_csv(buffer, header=False, index=False)
	    buffer.seek(0)

	    return pd.read_csv(buffer)



	def readPub(tar):
	    """Build a metadata table from a gzipped tar archive of article XML.

	    Every regular member of the archive is parsed as one XML document.
	    All children of the document root other than ``front`` are discarded
	    before searching, and each document contributes exactly one row.

	    Args:
	        tar: Raw bytes of a ``.tar.gz`` archive.

	    Returns:
	        pandas.DataFrame with one row per parsed member and the columns:

	        * ``Title`` -- text of every ``article-title``, each followed by
	          ``", "``.
	        * ``Keywords`` -- text of every ``kwd``, each followed by ``"; "``.
	        * ``Authors`` -- ``"Given Surname"`` pairs joined by ``", "``;
	          ``given-names`` and ``surname`` elements are paired by position.
	        * ``Year`` -- first ``year`` element that parses as an integer.
	        * ``Document Type`` -- text of the first ``publisher-name``.
	        * ``Source title`` -- text of the first ``journal-title``.

	        A missing year, publisher or journal title is the string
	        ``"empty"``.

	    Raises:
	        tarfile.ReadError: If ``tar`` is not a readable gzip tar archive.
	        The XML parser's syntax error if a member is not well-formed XML.
	    """
	    missing = "empty"

	    # Parse inside the ``with`` block so the archive is always closed.
	    roots = []
	    with tarfile.open(fileobj=BytesIO(tar), mode='r:gz') as archive:
	        for member in archive.getmembers():
	            handle = archive.extractfile(member)
	            if handle is None:
	                # Directories and other non-file members carry no data.
	                continue
	            # Hand the parser raw bytes so it applies the document's own
	            # encoding declaration instead of assuming UTF-8.
	            roots.append(ET.fromstring(handle.read()))

	    # One list per output column; every article appends exactly one value
	    # to each list so the rows stay aligned.
	    titles = []
	    keys = []
	    authors = []
	    years = []
	    publishers = []
	    journaltitles = []

	    for root in roots:
	        # Keep only the front matter; body and back matter can contain
	        # the same tag names (e.g. in reference lists).
	        for child in list(root):
	            if child.tag != "front":
	                root.remove(child)

	        titles.append("".join(
	            node.text + ", " if node.text is not None else " "
	            for node in root.iter('article-title')
	        ))
	        keys.append("".join(
	            node.text + "; " if node.text is not None else " "
	            for node in root.iter('kwd')
	        ))

	        # Empty name elements have ``text`` of None; treat them as blank
	        # rather than failing on string concatenation.
	        given = [node.text or "" for node in root.iter('given-names')]
	        family = [node.text or "" for node in root.iter('surname')]
	        authors.append(', '.join(
	            ' '.join(part for part in pair if part)
	            for pair in zip(given, family)
	        ))

	        # Front matter may hold several dates (or none); keep the first
	        # usable one so the article still yields a single row.
	        year = missing
	        for node in root.iter('year'):
	            try:
	                year = int(node.text)
	            except (TypeError, ValueError):
	                continue
	            break
	        years.append(year)

	        journal = next(root.iter('journal-title'), None)
	        if journal is None or journal.text is None:
	            journaltitles.append(missing)
	        else:
	            journaltitles.append(journal.text)

	        publisher = next(root.iter('publisher-name'), None)
	        if publisher is None or publisher.text is None:
	            publishers.append(missing)
	        else:
	            publishers.append(publisher.text)

	    return pd.DataFrame({
	        "Title": titles,
	        "Keywords": keys,
	        "Authors": authors,
	        "Year": years,
	        # The full per-article list, not the last publisher seen.
	        "Document Type": publishers,
	        "Source title": journaltitles,
	    })


	def readxml(file):
	    """Build a metadata table from one XML document holding many articles.

	    The document root is expected to contain one child per article; only
	    the ``front`` children of each article are read, and each ``front``
	    element contributes exactly one row.

	    Args:
	        file: The XML document as text or bytes (passed to
	            ``ET.fromstring``).

	    Returns:
	        pandas.DataFrame with one row per ``front`` element and the
	        columns:

	        * ``Title`` -- text of the first ``article-title``.
	        * ``Keywords`` -- text of every ``kwd``, each followed by ``"; "``.
	        * ``Authors`` -- ``"Given Surname"`` pairs joined by ``", "``;
	          ``given-names`` and ``surname`` elements are paired by position.
	        * ``Year`` -- first ``year`` element that parses as an integer.
	        * ``Document Type`` -- text of the first ``publisher-name``.
	        * ``Source title`` -- text of the first ``journal-title``.

	        A missing title, year, publisher or journal title is the string
	        ``"empty"``.

	    Raises:
	        The XML parser's syntax error if ``file`` is not well-formed XML.
	    """
	    missing = "empty"
	    root = ET.fromstring(file)

	    # One list per output column; every article appends exactly one value
	    # to each list so the rows stay aligned.
	    titles = []
	    keys = []
	    authors = []
	    years = []
	    journals = []
	    publishers = []

	    for record in root:
	        for front in record:
	            # Body and back matter can repeat the same tag names (e.g. in
	            # reference lists), so only the front matter is searched.
	            if front.tag != "front":
	                continue

	            title = next(front.iter('article-title'), None)
	            if title is None or title.text is None:
	                titles.append(missing)
	            else:
	                titles.append(title.text)

	            keys.append("".join(
	                node.text + "; "
	                for node in front.iter('kwd')
	                if node.text is not None
	            ))

	            # Empty name elements have ``text`` of None; treat them as
	            # blank rather than failing on string concatenation.
	            given = [node.text or "" for node in front.iter('given-names')]
	            family = [node.text or "" for node in front.iter('surname')]
	            authors.append(', '.join(
	                ' '.join(part for part in pair if part)
	                for pair in zip(given, family)
	            ))

	            # Front matter may hold several dates (or none); keep the
	            # first usable one so the article still yields a single row.
	            year = missing
	            for node in front.iter('year'):
	                try:
	                    year = int(node.text)
	                except (TypeError, ValueError):
	                    continue
	                break
	            years.append(year)

	            journal = next(front.iter('journal-title'), None)
	            if journal is None or journal.text is None:
	                journals.append(missing)
	            else:
	                journals.append(journal.text)

	            publisher = next(front.iter('publisher-name'), None)
	            if publisher is None or publisher.text is None:
	                publishers.append(missing)
	            else:
	                publishers.append(publisher.text)

	    return pd.DataFrame({
	        "Title": titles,
	        "Keywords": keys,
	        "Authors": authors,
	        "Year": years,
	        # Journal title belongs under "Source title" and publisher under
	        # "Document Type", the same mapping readPub uses for these columns.
	        "Document Type": publishers,
	        "Source title": journals,
	    })

	def medline(file):
	    """Build a metadata table from a MEDLINE-style tagged text export.

	    Records are separated by blank lines. Inside a record, a line that
	    starts at column 0 has the form ``TAG - value`` (any padding between
	    the tag and the dash is accepted), and a line that starts with
	    whitespace continues the value of the tagged line above it. Each
	    record contributes exactly one row.

	    Args:
	        file: Object whose ``read()`` returns the export as bytes (decoded
	            as UTF-8) or as text.

	    Returns:
	        pandas.DataFrame with one row per record and the columns:

	        * ``Title`` -- value of the first ``TI`` line, with continuation
	          lines appended after a single space.
	        * ``Authors`` -- every ``FAU`` value, each followed by ``"; "``.
	        * ``Year`` -- first four characters of the first ``DP`` value, as
	          an integer.
	        * ``MeSH Keywords`` -- every ``MH`` value, each followed by
	          ``"; "``.
	        * ``Other Keywords`` -- every ``OT`` value, each followed by
	          ``"; "``.

	        A missing title or an absent/non-numeric year is the string
	        ``"empty"``.
	    """
	    missing = "empty"

	    raw = file.read()
	    text = raw.decode() if isinstance(raw, (bytes, bytearray)) else raw

	    # Group non-blank lines into records. Working from splitlines() makes
	    # "\r\n" exports split the same way as "\n" ones, and leading or
	    # trailing blank lines never produce an empty record.
	    records = []
	    pending = []
	    for line in text.splitlines():
	        if line.strip():
	            pending.append(line)
	        elif pending:
	            records.append(pending)
	            pending = []
	    if pending:
	        records.append(pending)

	    titles = []
	    authors = []
	    year = []
	    meshkeys = []
	    otherkeys = []

	    for lines in records:
	        fields = {"TI": [], "FAU": [], "DP": [], "MH": [], "OT": []}
	        # Value list of the most recent tagged line, so continuation lines
	        # know what to extend; None while inside a tag that is not kept.
	        active = None
	        for line in lines:
	            if line[0].isspace():
	                if active is not None:
	                    active[-1] += " " + line.strip()
	                continue
	            # The tag is everything before the first dash, so a field is
	            # only recognised at the start of a line, never inside text.
	            tag, dash, value = line.partition("-")
	            active = fields.get(tag.strip()) if dash else None
	            if active is not None:
	                # Skip the single character that follows the dash.
	                active.append(value[1:])

	        titles.append(fields["TI"][0] if fields["TI"] else missing)
	        authors.append("".join(name + "; " for name in fields["FAU"]))
	        meshkeys.append("".join(term + "; " for term in fields["MH"]))
	        otherkeys.append("".join(term + "; " for term in fields["OT"]))
	        try:
	            year.append(int(fields["DP"][0][:4]))
	        except (IndexError, ValueError):
	            year.append(missing)

	    return pd.DataFrame({
	        'Title': titles,
	        'Authors': authors,
	        'Year': year,
	        'MeSH Keywords': meshkeys,
	        'Other Keywords': otherkeys,
	    })