from io import StringIO, BytesIO
import pymarc
import requests
import string
import pandas as pd
import tarfile
try:
    from lxml import etree as ET
except ImportError:
    import xml.etree.ElementTree as ET
# Metadata enrichment for HTRC worksets; `self` is a pandas DataFrame
# with one row per volume and an 'htid' column.
def htrc(self):
    # One semicolon-separated keyword string per volume/htid.
    keylist = []
    # Get the HathiTrust ids of the volumes.
    htids = self['htid'].values.tolist()
    # Iterate through the list of htids.
    for htid in htids:
        # Keywords for this volume only, reset for each htid.
        keywords = ""
        # API call for the extra metadata using the htid.
        extradata = requests.get("https://catalog.hathitrust.org/api/volumes/full/htid/" + htid + ".json")
        # Turn the response body into JSON.
        extradata = extradata.json()
        # Get the record id and use it to get the MARC-XML with the actual metadata.
        recid = extradata['items'][0]['fromRecord']
        xmlmarc = extradata['records'][recid]['marc-xml']
        # Turn the MARC-XML string into a pymarc Record.
        xml = StringIO(xmlmarc)
        marc = pymarc.parse_xml_to_array(xml)[0]
        xml.close()
        # Field 650 holds subject headings; skip URIs and OCLC control values.
        for term in marc.get_fields('650'):
            value = term.value()
            if "http" in value.lower() or "ocolc" in value.lower():
                continue
            keywords += value.translate(str.maketrans('', '', string.punctuation)) + "; "
        keylist.append(keywords)
    self['Keywords'] = keylist
    return self
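
# A minimal usage sketch for htrc, assuming a workset frame that already has
# an 'htid' column. The id below is illustrative, not a checked volume, and
# the example is wrapped in a function so the module stays import-safe.
def _example_htrc():
    workset = pd.DataFrame({'htid': ['mdp.39015012345678']})
    return htrc(workset)  # adds a 'Keywords' column and returns the frame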
# Page information for HTRC worksets; same record fetch as htrc above.
def htrcxtra(self):
    # One page-description string per volume/htid.
    pagecount = []
    # Get the HathiTrust ids of the volumes.
    htids = self['htid'].values.tolist()
    # Iterate through the list of htids.
    for htid in htids:
        # Page description for this volume only, reset for each htid.
        pages = ""
        # API call for the extra metadata using the htid.
        extradata = requests.get("https://catalog.hathitrust.org/api/volumes/full/htid/" + htid + ".json")
        extradata = extradata.json()
        # Get the record id and use it to get the MARC-XML with the actual metadata.
        recid = extradata['items'][0]['fromRecord']
        xmlmarc = extradata['records'][recid]['marc-xml']
        # Turn the MARC-XML string into a pymarc Record.
        xml = StringIO(xmlmarc)
        marc = pymarc.parse_xml_to_array(xml)[0]
        xml.close()
        # Field 300 is the MARC physical description (pagination/extent).
        for term in marc.get_fields('300'):
            pages += term.value()
        pagecount.append(pages)
    self['pages'] = pagecount
    return self
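
# htrc and htrcxtra repeat the same fetch-and-parse step; a shared helper
# could factor it out. This is a sketch, and the name _marc_record is mine,
# not part of the original code.
def _marc_record(htid):
    # Fetch the full catalog record for one volume and parse its MARC-XML
    # into a pymarc Record.
    url = "https://catalog.hathitrust.org/api/volumes/full/htid/" + htid + ".json"
    extradata = requests.get(url).json()
    recid = extradata['items'][0]['fromRecord']
    xml = StringIO(extradata['records'][recid]['marc-xml'])
    record = pymarc.parse_xml_to_array(xml)[0]
    xml.close()
    return record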
# Reformat CSV exports from Dimensions.
def dim(file):
    # Drop the first column.
    formatted = file.drop(file.columns[[0]], axis=1)
    # Round-trip through CSV text without the current header so the first
    # remaining row is promoted to the header.
    done = pd.read_csv(StringIO(formatted.to_csv(header=False, index=False)))
    return done
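
# A minimal usage sketch for dim (the filename is hypothetical). Dimensions
# CSV exports typically carry a notice line above the real header, so a raw
# read leaves the true header sitting in the first data row.
def _example_dim():
    raw = pd.read_csv("dimensions_export.csv")
    return dim(raw)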
def readPub(tar):
    # Collect the article XML files contained in the gzipped tarball.
    xmllist = []
    readfile = BytesIO(tar)
    files = tarfile.open(fileobj=readfile, mode='r:gz')
    for member in files.getmembers():
        singlefile = files.extractfile(member)
        if singlefile is not None:
            article = singlefile.read()
            article = article.decode("utf-8")
            xmllist.append(StringIO(article))
    files.close()
    # Lists for each data point, one entry per article.
    titles = []
    years = []
    keys = []
    authors = []
    publishers = []
    journaltitles = []
    # Go through each XML file in the list.
    for art in xmllist:
        # Make a parseable element tree out of the XML file.
        tree = ET.parse(art)
        root = tree.getroot()
        # Keep only the <front> branch, which holds the metadata we care about.
        for child in list(root):
            if child.tag != "front":
                root.remove(child)
        # Name parts to pair up for each article.
        firstname = []
        lastname = []
        # Accumulators for articles with multiple keywords/titles.
        key = ""
        title = ""
        for target in root.iter('article-title'):
            if target.text is not None:
                title += target.text + ", "
            else:
                title += " "
        for target in root.iter('kwd'):
            if target.text is not None:
                key += target.text + "; "
            else:
                key += " "
        for target in root.iter('year'):
            years.append(int(target.text))
        for names in root.iter('given-names'):
            firstname.append(names.text)
        for names in root.iter('surname'):
            lastname.append(names.text)
        for target in root.iter('journal-title'):
            journaltitles.append(target.text)
        for target in root.iter('publisher-name'):
            publishers.append(target.text)
        titles.append(title)
        keys.append(key)
        # Pair given names with surnames and join them into one author string.
        fullnames = [first + ' ' + last for first, last in zip(firstname, lastname)]
        authors.append(str.join(', ', fullnames))
    data = pd.DataFrame()
    data["Title"] = pd.Series(titles)
    data["Keywords"] = pd.Series(keys)
    data["Authors"] = pd.Series(authors)
    data["Year"] = pd.Series(years)
    data["Document Type"] = pd.Series(publishers)
    data["Source title"] = pd.Series(journaltitles)
    data.fillna(value="empty", inplace=True)
    return data
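
# A minimal usage sketch for readPub (hypothetical filename): it expects the
# raw bytes of a gzipped tar of article XML, e.g. a PubMed Central OA package.
def _example_readPub():
    with open("pmc_package.tar.gz", "rb") as f:
        return readPub(f.read())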
def readxml(file):
    root = ET.fromstring(file)
    # Keep only each article's <front> branch; drop everything else.
    for child in list(root):
        for lchild in list(child):
            if lchild.tag != "front":
                child.remove(lchild)
    # Lists for each data point, one entry per article.
    keys = []
    titles = []
    authors = []
    jtitle = []
    publishers = []
    years = []
    for child in list(root):
        for article in list(child):
            key = ""
            firstname = []
            lastname = []
            for target in article.iter('article-title'):
                if target.text is not None:
                    titles.append(target.text)
                else:
                    titles.append("empty")
            for target in article.iter('kwd'):
                if target.text is not None:
                    key += target.text + "; "
            keys.append(key)
            for target in article.iter('given-names'):
                firstname.append(target.text)
            for target in article.iter('surname'):
                lastname.append(target.text)
            # Pair given names with surnames and join them into one author string.
            fullnames = [first + ' ' + last for first, last in zip(firstname, lastname)]
            authors.append(str.join(', ', fullnames))
            for target in article.iter('journal-title'):
                jtitle.append(target.text)
            for target in article.iter('publisher-name'):
                publishers.append(target.text)
            for target in article.iter('year'):
                years.append(int(target.text))
    frame = pd.DataFrame()
    frame["Title"] = pd.Series(titles)
    frame["Keywords"] = pd.Series(keys)
    frame["Authors"] = pd.Series(authors)
    frame["Year"] = pd.Series(years)
    # Match readPub's column mapping: publisher under "Document Type",
    # journal title under "Source title".
    frame["Document Type"] = pd.Series(publishers)
    frame["Source title"] = pd.Series(jtitle)
    frame.fillna(value="empty", inplace=True)
    return frame
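
# A minimal usage sketch for readxml (hypothetical filename): it expects a
# single XML string whose second-level children are the article records.
def _example_readxml():
    with open("articles.xml", "r", encoding="utf-8") as f:
        return readxml(f.read())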
# Parse a PubMed export in MEDLINE format from a binary file object.
def medline(file):
    textfile = file.read()
    text = textfile.decode()
    authors = []
    titles = []
    year = []
    meshkeys = []
    otherkeys = []
    # Records are separated by blank lines, so split on them.
    articles = text.split('\n\n')
    for paper in articles:
        names = ""
        meshk = ""
        otherk = ""
        largetext = paper.splitlines()
        for pos, line in enumerate(largetext):
            # MEDLINE tags sit in a short field before "- "; split tag from value.
            tag, _, value = line.partition("- ")
            tag = tag.strip()
            value = value.strip()
            # Title, which may continue onto the next, untagged line.
            if tag == "TI":
                nextline = largetext[pos + 1] if pos + 1 < len(largetext) else ""
                if "- " not in nextline:
                    titles.append(value + " " + nextline.strip())
                else:
                    titles.append(value)
            # Full author names.
            if tag == "FAU":
                names += value + "; "
            # Publication date; keep only the four-digit year.
            if tag == "DP":
                year.append(int(value[:4]))
            # MeSH terms and other key terms.
            if tag == "MH":
                meshk += value + "; "
            if tag == "OT":
                otherk += value + "; "
        authors.append(names)
        meshkeys.append(meshk)
        otherkeys.append(otherk)
    frame = pd.DataFrame()
    frame['Title'] = pd.Series(titles)
    frame['Authors'] = pd.Series(authors)
    frame['Year'] = pd.Series(year)
    frame['MeSH Keywords'] = pd.Series(meshkeys)
    frame['Other Keywords'] = pd.Series(otherkeys)
    frame.fillna(value="empty", inplace=True)
    return frame
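
# A minimal usage sketch for medline (hypothetical filename): it expects a
# binary file object holding a PubMed export saved in MEDLINE format.
def _example_medline():
    with open("pubmed_results.txt", "rb") as f:
        return medline(f)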