from django.utils.timezone import make_aware, is_naive
from django.utils.dateparse import parse_datetime
from tqdm import tqdm
import glob
import json
from core.models import (
Author, Institution, Affiliation, Domain, Field, Subfield, Topic, AuthorTopic, AuthorYearlyStats, Concept, AuthorConcept
)
from urllib.parse import urlparse
def parse_id_from_url(url):
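    """Return the last path segment of an entity URL (e.g. the OpenAlex ID), or "N/A" if missing."""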
if (not url) or (not url.startswith("http")):
return "N/A"
return urlparse(url).path.strip('/').split('/')[-1]
def add_author(user_info, updated_date):
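    """Create or update an Author row from an OpenAlex-style author record and
    stamp it with the record's updated_date."""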
author_id = parse_id_from_url(user_info["id"])
author, _ = Author.objects.update_or_create(
id=author_id,
defaults={
"name": max(user_info['display_name_alternatives'] + [user_info['display_name']], key=lambda name: len(name)),
"orcid": parse_id_from_url(user_info["orcid"]) if user_info.get("orcid") else None,
"h_index": user_info["summary_stats"]["h_index"],
"i10_index": user_info["summary_stats"]["i10_index"],
"cited_by_count": user_info["cited_by_count"],
"works_count": user_info["works_count"],
"mean_2yr_citedness": user_info["summary_stats"]["2yr_mean_citedness"]
}
)
author.updated_at = updated_date
author.save(update_fields=["updated_at"])
return author
def add_institution(inst_data):
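    """Create or update an Institution row from an institution object."""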
inst_id = parse_id_from_url(inst_data["id"])
inst, _ = Institution.objects.update_or_create(
id=inst_id,
defaults={
"name": inst_data["display_name"],
"ror_id": parse_id_from_url(inst_data["ror"]),
"country_code": 'N/A' or inst_data.get("country_code"),
"institution_type": 'N/A' or inst_data.get("institution_type")
}
)
return inst
def add_affiliations(author, affiliations, last_known_insts):
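    """Upsert one Affiliation row per (author, institution, year), flagging
    institutions that appear in the author's last_known_institutions."""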
last_known_ids = {parse_id_from_url(
inst["id"]) for inst in last_known_insts}
for aff in affiliations:
institution = add_institution(aff["institution"])
for year in aff["years"]:
Affiliation.objects.update_or_create(
author=author,
institution=institution,
year=year,
defaults={"is_last_known": institution.id in last_known_ids}
)
def add_hierarchy(domain_data):
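    """Create or update the top-level Domain of the topic hierarchy."""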
domain_id = parse_id_from_url(domain_data["id"])
domain, _ = Domain.objects.update_or_create(
id=domain_id, defaults={"name": domain_data["display_name"]})
return domain
def add_field(field_data, domain_data):
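    """Create or update a Field, linking it to its parent Domain."""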
domain = add_hierarchy(domain_data)
field_id = parse_id_from_url(field_data["id"])
field, _ = Field.objects.update_or_create(
id=field_id, defaults={
"name": field_data["display_name"], "domain": domain}
)
return field
def add_subfield(subfield_data, field_data, domain_data):
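    """Create or update a Subfield, linking it to its parent Field (and Domain)."""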
field = add_field(field_data, domain_data)
subfield_id = parse_id_from_url(subfield_data["id"])
subfield, _ = Subfield.objects.update_or_create(
id=subfield_id, defaults={
"name": subfield_data["display_name"], "field": field}
)
return subfield
def add_topic(author, topic_data, topic_share_map):
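    """Create or update a Topic (plus its subfield/field/domain ancestry) and
    the AuthorTopic link carrying the author's work count and share value."""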
topic_id = parse_id_from_url(topic_data["id"])
subfield = add_subfield(
topic_data["subfield"], topic_data["field"], topic_data["domain"])
topic, _ = Topic.objects.update_or_create(
id=topic_id, defaults={
"name": topic_data["display_name"], "subfield": subfield}
)
share_value = topic_share_map.get(topic_id, 0)
AuthorTopic.objects.update_or_create(
author=author, topic=topic, defaults={
"count": topic_data["count"], "share_value": share_value}
)
def add_topic_shares(topic_share_list):
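    """Map topic IDs to their share values from the author's topic_share list."""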
return {parse_id_from_url(topic["id"]): topic["value"] for topic in topic_share_list}
def add_yearly_stats(author, stats):
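    """Upsert per-year works and citation counts from counts_by_year."""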
for stat in stats:
AuthorYearlyStats.objects.update_or_create(
author=author,
year=stat["year"],
defaults={
"works_count": stat["works_count"],
"cited_by_count": stat["cited_by_count"]
}
)
def add_concepts(author, concepts):
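    """Upsert Concepts from x_concepts and link each to the author with its level and score."""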
for concept in concepts:
concept_id = parse_id_from_url(concept["id"])
obj, _ = Concept.objects.update_or_create(
id=concept_id,
defaults={
"name": concept["display_name"],
"wikidata_url": concept.get("wikidata"),
"level": concept["level"],
"score": concept["score"]
}
)
AuthorConcept.objects.update_or_create(
author=author,
concept=obj,
defaults={
"level": concept["level"],
"score": concept["score"]
}
)
def populate_user(user_info):
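    """Load a single author record into the database, skipping records that
    are no newer than the stored updated_at timestamp."""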
author_id = parse_id_from_url(user_info["id"])
    updated_date = parse_datetime(user_info["updated_date"])
    # parse_datetime returns None on malformed input; guard before tz handling
    if updated_date and is_naive(updated_date):
        updated_date = make_aware(updated_date)
    author = Author.objects.filter(id=author_id).first()
    # Skip records that are no newer than what is already stored.
    if author and updated_date and author.updated_at and author.updated_at >= updated_date:
        return
author = add_author(user_info, updated_date)
add_affiliations(
author, user_info["affiliations"], user_info["last_known_institutions"])
topic_share_map = add_topic_shares(user_info["topic_share"])
for topic in user_info["topics"]:
add_topic(author, topic, topic_share_map)
add_yearly_stats(author, user_info["counts_by_year"])
add_concepts(author, user_info["x_concepts"])
# Call this function to load data
# populate_user(user_info)
json_dir = "/Users/sgautam/Documents/BridgeMentor/C41008148_authors"
for page, json_file in tqdm(enumerate(sorted(glob.glob(f"{json_dir}/*.json")))):
    with open(json_file, "r") as file:
        user_infos = json.load(file)['results']
    for user_info in tqdm(user_infos, leave=False):
        populate_user(user_info)
        # tqdm.write keeps log lines from mangling the progress bars
        tqdm.write(f"{page}-{user_info['display_name']}")
# python manage.py shell
# from populate_user import populate_user