from io import StringIO, BytesIO
import string
import tarfile

import pandas as pd
import pymarc
import requests

try:
    from lxml import etree as ET
except ImportError:
    import xml.etree.ElementTree as ET


# Metadata for HTRC worksets. Written method-style (hence `self`), but callable
# as a plain function on any DataFrame with an 'htid' column.
def htrc(self):
    # One semicolon-separated keyword string per volume/htid, to add to the file.
    keylist = []
    # Get the htids of the volumes.
    htids = self['htid'].values.tolist()
    for htid in htids:
        keywords = ""
        # API call for the extra metadata using the htid.
        extradata = requests.get(
            "https://catalog.hathitrust.org/api/volumes/full/htid/" + htid + ".json")
        # Turn the response into JSON.
        extradata = extradata.json()
        # Get the record id and use it to fetch the MARC-XML with the actual metadata.
        recid = extradata['items'][0]['fromRecord']
        xmlmarc = extradata['records'][recid]['marc-xml']
        # Parse the MARC-XML string into a pymarc record.
        xml = StringIO(xmlmarc)
        marc = pymarc.parse_xml_to_array(xml)[0]
        xml.close()
        # Field 650 holds the subject headings; skip URLs and OCLC identifiers.
        for term in marc.get_fields('650'):
            value = term.value()
            if "http" in value.lower() or "ocolc" in value.lower():
                continue
            keywords += value.translate(str.maketrans('', '', string.punctuation)) + "; "
        keylist.append(keywords)
    self['Keywords'] = keylist
    return self


def htrcxtra(self):
    # One page string per volume/htid, to add to the file.
    pagecount = []
    # Get the htids of the volumes.
    htids = self['htid'].values.tolist()
    for htid in htids:
        pages = ""
        # API call for the extra metadata using the htid.
        extradata = requests.get(
            "https://catalog.hathitrust.org/api/volumes/full/htid/" + htid + ".json")
        # Turn the response into JSON.
        extradata = extradata.json()
        # Get the record id and use it to fetch the MARC-XML with the actual metadata.
        recid = extradata['items'][0]['fromRecord']
        xmlmarc = extradata['records'][recid]['marc-xml']
        # Parse the MARC-XML string into a pymarc record.
        xml = StringIO(xmlmarc)
        marc = pymarc.parse_xml_to_array(xml)[0]
        xml.close()
        # Field 350 holds the page data for the volume.
        for term in marc.get_fields('350'):
            pages += term.value()
        pagecount.append(pages)
    self['pages'] = pagecount
    return self


# Format files exported from Dimensions: drop the first column, then promote the
# first remaining row to the header by round-tripping through CSV.
def dim(file):
    formatted = file.drop(file.columns[[0]], axis=1)
    done = pd.read_csv(StringIO(formatted.to_csv(header=False, index=False)))
    return done
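
# A minimal usage sketch for the workset helpers above, assuming a workset
# export with an 'htid' column; the file name 'workset.csv' is hypothetical.
def example_workset_enrichment(path="workset.csv"):
    workset = pd.read_csv(path)
    workset = htrc(workset)      # adds 'Keywords' from MARC field 650
    workset = htrcxtra(workset)  # adds 'pages' from MARC field 350
    return workset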

def readPub(tar):
    # Collect the XML files from the gzipped tarball into a list.
    xmllist = []
    readfile = BytesIO(tar)
    files = tarfile.open(fileobj=readfile, mode='r:gz')
    for member in files.getmembers():
        singlefile = files.extractfile(member)
        if singlefile is not None:
            article = singlefile.read().decode("utf-8")
            xmllist.append(StringIO(article))
    # Lists for each data point.
    titles = []
    years = []
    keys = []
    authors = []
    publishers = []
    journaltitles = []
    # Go through each XML file in the list.
    for art in xmllist:
        # Make a parseable element tree out of the XML file.
        tree = ET.parse(art)
        root = tree.getroot()
        # Remove branches other than <front>, which holds the metadata we care about.
        for child in list(root):
            if child.tag != "front":
                root.remove(child)
        # Names to concatenate for each article.
        firstname = []
        lastname = []
        # Individual strings for multiple keywords/titles.
        key = ""
        title = ""
        for target in root.iter('article-title'):
            if target.text is not None:
                title += target.text + ", "
            else:
                title += " "
        for target in root.iter('kwd'):
            if target.text is not None:
                key += target.text + "; "
            else:
                key += " "
        for target in root.iter('year'):
            years.append(int(target.text))
        for names in root.iter('given-names'):
            firstname.append(names.text)
        for names in root.iter('surname'):
            lastname.append(names.text)
        for target in root.iter('journal-title'):
            journaltitles.append(target.text)
        for target in root.iter('publisher-name'):
            publishers.append(target.text)
        titles.append(title)
        keys.append(key)
        # Join the names into a single string of authors.
        fullnames = [first + ' ' + last for first, last in zip(firstname, lastname)]
        authors.append(', '.join(fullnames))
    data = pd.DataFrame()
    data["Title"] = pd.Series(titles)
    data["Keywords"] = pd.Series(keys)
    data["Authors"] = pd.Series(authors)
    data["Year"] = pd.Series(years)
    data["Document Type"] = pd.Series(publishers)
    data["Source title"] = pd.Series(journaltitles)
    data.fillna(value="empty", inplace=True)
    return data


def readxml(file):
    root = ET.fromstring(file)
    # Remove everything except each article's <front> branch.
    for child in list(root):
        for lchild in list(child):
            if lchild.tag != "front":
                child.remove(lchild)
    # Lists for each data point.
    keys = []
    titles = []
    authors = []
    jtitle = []
    publishers = []
    years = []
    for child in list(root):
        for article in list(child):
            key = ""
            firstname = []
            lastname = []
            for target in article.iter('article-title'):
                if target.text is not None:
                    titles.append(target.text)
                else:
                    titles.append("empty")
            for target in article.iter('kwd'):
                if target.text is not None:
                    key += target.text + "; "
            keys.append(key)
            for target in article.iter('given-names'):
                firstname.append(target.text)
            for target in article.iter('surname'):
                lastname.append(target.text)
            fullnames = [first + ' ' + last for first, last in zip(firstname, lastname)]
            authors.append(', '.join(fullnames))
            for target in article.iter('journal-title'):
                jtitle.append(target.text)
            for target in article.iter('publisher-name'):
                publishers.append(target.text)
            for target in article.iter('year'):
                years.append(int(target.text))
    frame = pd.DataFrame()
    frame["Title"] = pd.Series(titles)
    frame["Keywords"] = pd.Series(keys)
    frame["Authors"] = pd.Series(authors)
    frame["Year"] = pd.Series(years)
    frame["Document Type"] = pd.Series(publishers)
    frame["Source title"] = pd.Series(jtitle)
    frame.fillna(value="empty", inplace=True)
    return frame
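
# A minimal usage sketch for the two readers above: readPub() takes the raw
# bytes of a .tar.gz of article XML, readxml() takes a single XML string. Both
# file names here are hypothetical placeholders.
def example_article_load(tar_path="articles.tar.gz", xml_path="articles.xml"):
    with open(tar_path, "rb") as f:
        tar_frame = readPub(f.read())
    with open(xml_path, "r", encoding="utf-8") as f:
        xml_frame = readxml(f.read())
    # Both readers emit the same columns, so the frames concatenate cleanly.
    return pd.concat([tar_frame, xml_frame], ignore_index=True)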

def medline(file):
    text = file.read().decode()
    authors = []
    titles = []
    year = []
    meshkeys = []
    otherkeys = []
    # Articles are separated by blank lines, so split on them.
    articles = text.split('\n\n')
    for paper in articles:
        names = ""
        meshk = ""
        otherk = ""
        largetext = paper.splitlines()
        for pos, line in enumerate(largetext):
            # Title (TI); if the title wraps onto the next line, append that line too.
            if "TI - " in line:
                startpos = line.index("-") + 2
                if pos + 1 < len(largetext) and "- " not in largetext[pos + 1]:
                    titles.append(line[startpos:] + " " + largetext[pos + 1].strip())
                else:
                    titles.append(line[startpos:])
            # Full author name (FAU).
            if "FAU - " in line:
                startpos = line.index("-") + 2
                names += line[startpos:] + "; "
            # Publication date (DP); keep only the four-digit year.
            if "DP - " in line:
                startpos = line.index("-") + 2
                year.append(int(line[startpos:startpos + 4]))
            # Key terms: MeSH headings (MH) and other keywords (OT).
            if "MH - " in line:
                startpos = line.index("-") + 2
                meshk += line[startpos:] + "; "
            if "OT - " in line:
                startpos = line.index("-") + 2
                otherk += line[startpos:] + "; "
        authors.append(names)
        meshkeys.append(meshk)
        otherkeys.append(otherk)
    frame = pd.DataFrame()
    frame['Title'] = pd.Series(titles)
    frame['Authors'] = pd.Series(authors)
    frame['Year'] = pd.Series(year)
    frame['MeSH Keywords'] = pd.Series(meshkeys)
    frame['Other Keywords'] = pd.Series(otherkeys)
    frame.fillna(value="empty", inplace=True)
    return frame
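
# A minimal usage sketch for medline(): it expects a binary file object holding
# a MEDLINE-format export; 'pubmed-export.txt' is a hypothetical file name.
def example_medline_load(path="pubmed-export.txt"):
    with open(path, "rb") as f:
        return medline(f)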