# Spaces: Running
import json
import os
import sys
from time import sleep

import requests
# Download every author record matching FILTER from the OpenAlex API using
# cursor pagination, saving each response page as its own JSON file in
# OUTPUT_DIR. Pages whose file already exists are not rewritten.

# Base API URL
BASE_URL = "https://api.openalex.org/authors"
FILTER = "last_known_institutions.country_code:NO,x_concepts.id:C41008148"
PER_PAGE = 200
OUTPUT_DIR = "C41008148_authors"
# Seconds to wait on the server before giving up on a request; without a
# timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize cursor: "*" is the value sent with the first request; every
# later value comes from the previous response's meta.next_cursor.
cursor = "*"
page_count = 1  # Track page numbers for saving files
failed = False  # Set when the loop stops because of an error

while cursor:
    # Let requests build the query string so that every value (notably the
    # opaque cursor) is URL-encoded instead of being pasted in raw.
    params = {"filter": FILTER, "per-page": PER_PAGE, "cursor": cursor}
    filename = os.path.join(OUTPUT_DIR, f"{page_count:010}.json")
    try:
        print(f"Fetching page {page_count} with cursor...")
        response = requests.get(
            BASE_URL, params=params, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        data = response.json()
        if not isinstance(data, dict):
            raise ValueError(
                f"expected a JSON object, got {type(data).__name__}"
            )

        if os.path.exists(filename):
            print(f"File {filename} already exists, skipping...")
        else:
            # Write to a temporary name and rename afterwards, so that an
            # interrupted run never leaves a truncated file which a later
            # run would mistake for a finished page and skip.
            tmp_filename = f"{filename}.tmp"
            with open(tmp_filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            os.replace(tmp_filename, filename)
    except (requests.RequestException, ValueError, OSError) as e:
        # Network/HTTP failures, undecodable or unexpected JSON, and file
        # system errors all stop the download; the pages saved so far stay
        # on disk.
        print(f"Error on page {page_count}: {e}")
        failed = True
        break

    # "meta" may be missing or null; treat both as "no further pages".
    cursor = (data.get("meta") or {}).get("next_cursor")
    if not cursor:
        print("No more results.")
        break
    page_count += 1
    sleep(1)  # Rate-limiting (applies to skipped pages too: they still hit the API)

if failed:
    # Do not claim success after an error; report it through the exit status.
    print(f"Download stopped early on page {page_count}.")
    sys.exit(1)
print("Download complete using cursor pagination.")