Spaces:

haotle
/

LibTesting

Running

File size: 9,514 Bytes

c837e02

from io import StringIO, BytesIO
import pymarc
import requests
import string
import pandas as pd
import tarfile
try:
    from lxml import etree as ET
except ImportError:
    import xml.etree.ElementTree as ET

#metadata for htrc worksets
def htrc(self):
    """Add a 'Keywords' column built from each volume's MARC 650 fields.

    For every value in the 'htid' column the HathiTrust catalog API is
    queried, the MARC-XML of the record the volume comes from is parsed with
    pymarc, and the values of its 650 fields are joined into one string
    (punctuation removed, each entry followed by "; ").  650 values that
    contain "http" or "ocolc" are skipped.

    Args:
        self: DataFrame with an 'htid' column of string identifiers.

    Returns:
        The same DataFrame, modified in place, with the 'Keywords' column set.

    Raises:
        requests.RequestException: if the catalog request fails or times out.
        KeyError / IndexError: if the API response lacks the expected fields.
    """
    # translation table that deletes every ASCII punctuation character
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # one keyword string per volume, in the same order as the rows
    keylist = []

    for htid in self['htid'].values.tolist():
        # api call for the extra metadata using htid; the timeout keeps a
        # stalled connection from blocking the caller forever
        response = requests.get(
            "https://catalog.hathitrust.org/api/volumes/full/htid/" + htid + ".json",
            timeout=30,
        )
        extradata = response.json()

        # get record id and use it to get the xml/marc file with the actual metadata
        recid = extradata['items'][0]['fromRecord']
        xmlmarc = extradata['records'][recid]['marc-xml']

        # turn the formatted xml into an actual pymarc record
        with StringIO(xmlmarc) as xml:
            marc = pymarc.parse_xml_to_array(xml)[0]

        # start from an empty string for every volume so that one row never
        # carries the keywords of the volumes processed before it
        keywords = ""
        for term in marc.get_fields('650'):
            value = term.value()
            lowered = value.lower()
            if "http" in lowered or "ocolc" in lowered:
                # skip entries that contain a URL or an OCoLC marker
                continue
            keywords += value.translate(strip_punctuation) + "; "
        keylist.append(keywords)

    self['Keywords'] = keylist
    return self

def htrcxtra(self):
    """Add a 'pages' column built from each volume's MARC 350 fields.

    For every value in the 'htid' column the HathiTrust catalog API is
    queried, the MARC-XML of the record the volume comes from is parsed with
    pymarc, and the values of its 350 fields are concatenated into one string.

    Args:
        self: DataFrame with an 'htid' column of string identifiers.

    Returns:
        The same DataFrame, modified in place, with the 'pages' column set
        (one entry per row; an empty frame gets an empty column).

    Raises:
        requests.RequestException: if the catalog request fails or times out.
        KeyError / IndexError: if the API response lacks the expected fields.
    """
    # one page string per volume, in the same order as the rows
    pagecount = []

    for htid in self['htid'].values.tolist():
        # api call for the extra metadata using htid; the timeout keeps a
        # stalled connection from blocking the caller forever
        response = requests.get(
            "https://catalog.hathitrust.org/api/volumes/full/htid/" + htid + ".json",
            timeout=30,
        )
        extradata = response.json()

        # get record id and use it to get the xml/marc file with the actual metadata
        recid = extradata['items'][0]['fromRecord']
        xmlmarc = extradata['records'][recid]['marc-xml']

        # turn the formatted xml into an actual pymarc record
        with StringIO(xmlmarc) as xml:
            marc = pymarc.parse_xml_to_array(xml)[0]

        # Collect the field values inside the per-volume loop so every row
        # gets its own entry and the column length matches the frame.
        # NOTE(review): MARC 21 keeps the physical description (extent/pages)
        # in field 300; '350' may be a typo -- confirm before changing.
        pages = ""
        for term in marc.get_fields('350'):
            pages += term.value()
        pagecount.append(pages)

    self['pages'] = pagecount
    return self


#format files from dimensions
def dim(file):
    """Drop the first column and promote the first data row to the header.

    The remaining cells are serialised to CSV text without header or index
    and parsed again, so pandas treats the first row as column names and
    infers the column types afresh.

    Args:
        file: DataFrame to reformat.

    Returns:
        A new DataFrame read back from the CSV text.
    """
    first_column = file.columns[[0]]
    trimmed = file.drop(first_column, axis=1)

    # round-trip through CSV text; header=False makes row 0 the new header
    csv_text = trimmed.to_csv(header=False, index=False)
    return pd.read_csv(StringIO(csv_text))



def _pub_first_text(node, tag):
    """Return the text of the first ``tag`` descendant of ``node`` that has text, else None."""
    return next((hit.text for hit in node.iter(tag) if hit.text is not None), None)


def _pub_year(node):
    """Return the first parseable <year> under ``node`` as int, preferring <pub-date>; else None."""
    preferred = [year for date in node.iter('pub-date') for year in date.iter('year')]
    for candidate in preferred + list(node.iter('year')):
        try:
            return int(candidate.text)
        except (TypeError, ValueError):
            # empty or non-numeric <year>; try the next candidate
            continue
    return None


def _pub_authors(node):
    """Return "Given Surname" strings, pairing the name parts that share a parent element."""
    names = []
    for holder in node.iter('*'):
        surname = holder.find('surname')
        if surname is None:
            continue
        given = holder.find('given-names')
        parts = [given.text if given is not None else None, surname.text]
        full = ' '.join(part for part in parts if part)
        if full:
            names.append(full)
    return names


def readPub(tar):
    """Build a metadata table from a gzip-compressed tar archive of article XML files.

    Every regular file in the archive is decoded as UTF-8 and parsed as XML.
    Only the <front> children of each document root are kept, and exactly one
    row per file is produced so that all columns stay aligned.

    Args:
        tar: bytes of a ``.tar.gz`` archive.

    Returns:
        DataFrame with the columns "Title", "Keywords", "Authors", "Year",
        "Document Type" (publisher name) and "Source title" (journal title).
        Missing values are replaced with the string "empty".
    """
    #lists for each data point, one entry per article
    titles = []
    years = []
    keys = []
    authors = []
    publishers = []
    journaltitles = []

    # the context manager closes the archive even if a member fails to parse
    with tarfile.open(fileobj=BytesIO(tar), mode='r:gz') as archive:
        for member in archive.getmembers():
            singlefile = archive.extractfile(member)
            if singlefile is None:
                # directories and other non-file members carry no data
                continue
            article = singlefile.read().decode("utf-8")

            #make a parseable element tree out of the xml file
            root = ET.parse(StringIO(article)).getroot()

            #remove parts of the main branch that do not have metadata that we care about
            for child in list(root):
                if child.tag != "front":
                    root.remove(child)

            # every title is followed by ", " and every keyword by "; ";
            # elements without text contribute a single space
            titles.append("".join(
                target.text + ", " if target.text is not None else " "
                for target in root.iter('article-title')
            ))
            keys.append("".join(
                target.text + "; " if target.text is not None else " "
                for target in root.iter('kwd')
            ))

            # single-valued fields: take one value per article (None when
            # absent) instead of one per matching element, which would shift
            # later rows whenever an article has several or none
            years.append(_pub_year(root))
            journaltitles.append(_pub_first_text(root, 'journal-title'))
            publishers.append(_pub_first_text(root, 'publisher-name'))

            #join the names into a single string with authors
            authors.append(', '.join(_pub_authors(root)))

    data = pd.DataFrame()

    data["Title"] = pd.Series(titles)
    data["Keywords"] = pd.Series(keys)
    data["Authors"] = pd.Series(authors)
    data["Year"] = pd.Series(years)
    # use the per-article list, not the last single publisher string
    data["Document Type"] = pd.Series(publishers)
    data["Source title"] = pd.Series(journaltitles)

    data.fillna(value = "empty", inplace = True)

    return data


def _xml_first_text(node, tag):
    """Return the text of the first ``tag`` descendant of ``node`` that has text, else None."""
    return next((hit.text for hit in node.iter(tag) if hit.text is not None), None)


def _xml_year(node):
    """Return the first parseable <year> under ``node`` as int, preferring <pub-date>; else None."""
    preferred = [year for date in node.iter('pub-date') for year in date.iter('year')]
    for candidate in preferred + list(node.iter('year')):
        try:
            return int(candidate.text)
        except (TypeError, ValueError):
            # empty or non-numeric <year>; try the next candidate
            continue
    return None


def _xml_authors(node):
    """Return "Given Surname" strings, pairing the name parts that share a parent element."""
    names = []
    for holder in node.iter('*'):
        surname = holder.find('surname')
        if surname is None:
            continue
        given = holder.find('given-names')
        parts = [given.text if given is not None else None, surname.text]
        full = ' '.join(part for part in parts if part)
        if full:
            names.append(full)
    return names


def readxml(file):
    """Build a metadata table from one XML document that wraps several articles.

    The document root's children are treated as articles; within each, only
    <front> elements are read.  Exactly one row is produced per <front>
    element so that all columns stay aligned.

    Args:
        file: XML text (str or bytes) accepted by ``ET.fromstring``.

    Returns:
        DataFrame with the columns "Title", "Keywords", "Authors", "Year",
        "Document Type" (publisher name) and "Source title" (journal title).
        Missing values are replaced with the string "empty".
    """
    root = ET.fromstring(file)

    keys = []
    titles = []
    authors = []
    jtitle = []
    publishers = []
    years = []

    for child in list(root):
        for front in list(child):
            #skip the parts of the article that we do not need
            if front.tag != "front":
                continue

            # single-valued fields: take one value per article (None when
            # absent) instead of one per matching element, which would shift
            # later rows whenever an article has several or none
            titles.append(_xml_first_text(front, 'article-title'))
            jtitle.append(_xml_first_text(front, 'journal-title'))
            publishers.append(_xml_first_text(front, 'publisher-name'))
            years.append(_xml_year(front))

            # every keyword is followed by "; "; keywords without text are skipped
            keys.append("".join(
                target.text + "; "
                for target in front.iter('kwd')
                if target.text is not None
            ))

            authors.append(', '.join(_xml_authors(front)))

    frame = pd.DataFrame()

    frame["Title"] = pd.Series(titles)
    frame["Keywords"] = pd.Series(keys)
    frame["Authors"] = pd.Series(authors)
    frame["Year"] = pd.Series(years)
    # same mapping as readPub: publisher under "Document Type", journal
    # title under "Source title"
    frame["Document Type"] = pd.Series(publishers)
    frame["Source title"] = pd.Series(jtitle)

    frame.fillna(value = "empty", inplace = True)

    return frame

def _medline_records(text):
    """Split MEDLINE-format text into records, each a dict of tag -> list of values."""
    records = []
    current = {}
    last_values = None
    for line in text.splitlines():
        if not line.strip():
            # a blank line ends the current record
            if current:
                records.append(current)
            current, last_values = {}, None
        elif line[:1].isspace():
            # indented line: continuation of the previous field's value
            if last_values is not None:
                last_values[-1] += " " + line.strip()
        elif line[4:5] == "-":
            # "TAG - value": tag in the first four columns, value from column 6
            last_values = current.setdefault(line[:4].rstrip(), [])
            last_values.append(line[6:])
        else:
            # not a field line; do not attach later continuations to it
            last_values = None
    if current:
        records.append(current)
    return records


def _medline_year(dates):
    """Return the leading four-digit year of the first parseable DP value, else None."""
    for value in dates:
        try:
            return int(value[:4])
        except ValueError:
            continue
    return None


def medline(file):
    """Build a metadata table from a MEDLINE-format export.

    Records are separated by blank lines.  Each record yields exactly one
    row, so the columns stay aligned even when a record lacks a field.
    Wrapped field values (indented continuation lines) are joined with
    single spaces.

    Args:
        file: binary file-like object; its content is decoded with the
            default ``bytes.decode()`` codec.

    Returns:
        DataFrame with the columns 'Title' (first TI field), 'Authors' (FAU
        fields), 'Year' (first four characters of DP as int), 'MeSH Keywords'
        (MH fields) and 'Other Keywords' (OT fields).  Multi-valued columns
        are strings in which every entry is followed by "; ".  Missing
        titles and years are replaced with the string "empty".
    """
    text = file.read().decode()

    authors = []
    titles = []
    year = []
    meshkeys = []
    otherkeys = []

    for record in _medline_records(text):
        titles.append(record["TI"][0] if "TI" in record else None)
        year.append(_medline_year(record.get("DP", [])))
        authors.append("".join(name + "; " for name in record.get("FAU", [])))
        meshkeys.append("".join(term + "; " for term in record.get("MH", [])))
        otherkeys.append("".join(term + "; " for term in record.get("OT", [])))

    frame = pd.DataFrame()

    frame['Title'] = pd.Series(titles)
    frame['Authors'] = pd.Series(authors)
    frame['Year'] = pd.Series(year)
    frame['MeSH Keywords'] = pd.Series(meshkeys)
    frame['Other Keywords'] = pd.Series(otherkeys)

    frame.fillna(value = "empty", inplace = True)

    return frame