"""Download all OpenAlex author records matching a fixed filter.

Pages through the OpenAlex /authors endpoint with cursor pagination and
writes each page of results to a zero-padded JSON file in OUTPUT_DIR.
Already-downloaded pages are skipped, so the script can be re-run to
resume an interrupted download.
"""

import json
import os
from time import sleep

import requests

# Base API URL and query configuration
BASE_URL = "https://api.openalex.org/authors"
FILTER = "last_known_institutions.country_code:NO,x_concepts.id:C41008148"
PER_PAGE = 200
OUTPUT_DIR = "C41008148_authors"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# OpenAlex cursor pagination starts with the literal cursor "*"
cursor = "*"
page_count = 1  # Track page numbers for naming output files

while cursor:
    try:
        print(f"Fetching page {page_count} with cursor...")
        # Pass query parameters via `params=` so requests URL-encodes the
        # cursor value (OpenAlex cursors may contain '+', '=' etc. which
        # would be mangled if interpolated into the URL by hand).
        response = requests.get(
            BASE_URL,
            params={"filter": FILTER, "per-page": PER_PAGE, "cursor": cursor},
            timeout=30,  # avoid hanging forever on a stalled connection
        )
        response.raise_for_status()
        data = response.json()

        filename = os.path.join(OUTPUT_DIR, f"{page_count:010}.json")
        if os.path.exists(filename):
            # Resume support: the page is already on disk; still advance
            # the cursor below so pagination continues from the right place.
            print(f"File {filename} already exists, skipping...")
        else:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)

        # Advance the cursor exactly once per page (previously duplicated
        # in the skip branch and the write branch).
        cursor = data.get("meta", {}).get("next_cursor")
        if not cursor:
            print("No more results.")
            break
        page_count += 1
        sleep(1)  # Rate-limiting: be polite to the API
    except (requests.RequestException, OSError, ValueError) as e:
        # RequestException covers HTTP/connection errors; OSError covers
        # file writes; ValueError covers malformed JSON bodies.
        print(f"Error on page {page_count}: {e}")
        break

print("Download complete using cursor pagination.")