File size: 3,382 Bytes
21dd449
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import { HUB_URL } from "../consts";
import { createApiError } from "../error";
import type { ApiDatasetInfo } from "../types/api/api-dataset";
import type { CredentialsParams } from "../types/public";
import { checkCredentials } from "../utils/checkCredentials";
import { parseLinkHeader } from "../utils/parseLinkHeader";
import { pick } from "../utils/pick";

export const DATASET_EXPAND_KEYS = [
	"private",
	"downloads",
	"gated",
	"likes",
	"lastModified",
] as const satisfies readonly (keyof ApiDatasetInfo)[];

export const DATASET_EXPANDABLE_KEYS = [
	"author",
	"cardData",
	"citation",
	"createdAt",
	"disabled",
	"description",
	"downloads",
	"downloadsAllTime",
	"gated",
	"gitalyUid",
	"lastModified",
	"likes",
	"paperswithcode_id",
	"private",
	// "siblings",
	"sha",
	"tags",
] as const satisfies readonly (keyof ApiDatasetInfo)[];

export interface DatasetEntry {
	id: string;
	name: string;
	private: boolean;
	downloads: number;
	gated: false | "auto" | "manual";
	likes: number;
	updatedAt: Date;
}

export async function* listDatasets<
	const T extends Exclude<(typeof DATASET_EXPANDABLE_KEYS)[number], (typeof DATASET_EXPAND_KEYS)[number]> = never,
>(
	params?: {
		search?: {
			/**
			 * Will search in the dataset name for matches
			 */
			query?: string;
			owner?: string;
			tags?: string[];
		};
		hubUrl?: string;
		additionalFields?: T[];
		/**
		 * Set to limit the number of models returned.
		 */
		limit?: number;
		/**
		 * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
		 */
		fetch?: typeof fetch;
	} & Partial<CredentialsParams>
): AsyncGenerator<DatasetEntry & Pick<ApiDatasetInfo, T>> {
	const accessToken = params && checkCredentials(params);
	let totalToFetch = params?.limit ?? Infinity;
	const search = new URLSearchParams([
		...Object.entries({
			limit: String(Math.min(totalToFetch, 500)),
			...(params?.search?.owner ? { author: params.search.owner } : undefined),
			...(params?.search?.query ? { search: params.search.query } : undefined),
		}),
		...(params?.search?.tags?.map((tag) => ["filter", tag]) ?? []),
		...DATASET_EXPAND_KEYS.map((val) => ["expand", val] satisfies [string, string]),
		...(params?.additionalFields?.map((val) => ["expand", val] satisfies [string, string]) ?? []),
	]).toString();
	let url: string | undefined = `${params?.hubUrl || HUB_URL}/api/datasets` + (search ? "?" + search : "");

	while (url) {
		const res: Response = await (params?.fetch ?? fetch)(url, {
			headers: {
				accept: "application/json",
				...(accessToken ? { Authorization: `Bearer ${accessToken}` } : undefined),
			},
		});

		if (!res.ok) {
			throw await createApiError(res);
		}

		const items: ApiDatasetInfo[] = await res.json();

		for (const item of items) {
			yield {
				...(params?.additionalFields && pick(item, params.additionalFields)),
				id: item._id,
				name: item.id,
				private: item.private,
				downloads: item.downloads,
				likes: item.likes,
				gated: item.gated,
				updatedAt: new Date(item.lastModified),
			} as DatasetEntry & Pick<ApiDatasetInfo, T>;
			totalToFetch--;
			if (totalToFetch <= 0) {
				return;
			}
		}

		const linkHeader = res.headers.get("Link");

		url = linkHeader ? parseLinkHeader(linkHeader).next : undefined;
		// Could update limit in url to fetch less items if not all items of next page are needed.
	}
}