abidlabs HF Staff commited on
Commit
538d043
·
verified ·
1 Parent(s): 5196b68

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. __init__.py +200 -0
  3. __pycache__/__init__.cpython-310.pyc +0 -0
  4. __pycache__/__init__.cpython-311.pyc +0 -0
  5. __pycache__/__init__.cpython-312.pyc +0 -0
  6. __pycache__/cli.cpython-311.pyc +0 -0
  7. __pycache__/cli.cpython-312.pyc +0 -0
  8. __pycache__/commit_scheduler.cpython-311.pyc +0 -0
  9. __pycache__/commit_scheduler.cpython-312.pyc +0 -0
  10. __pycache__/context.cpython-312.pyc +0 -0
  11. __pycache__/context_vars.cpython-311.pyc +0 -0
  12. __pycache__/context_vars.cpython-312.pyc +0 -0
  13. __pycache__/deploy.cpython-310.pyc +0 -0
  14. __pycache__/deploy.cpython-311.pyc +0 -0
  15. __pycache__/deploy.cpython-312.pyc +0 -0
  16. __pycache__/dummy_commit_scheduler.cpython-310.pyc +0 -0
  17. __pycache__/dummy_commit_scheduler.cpython-311.pyc +0 -0
  18. __pycache__/dummy_commit_scheduler.cpython-312.pyc +0 -0
  19. __pycache__/file_storage.cpython-311.pyc +0 -0
  20. __pycache__/file_storage.cpython-312.pyc +0 -0
  21. __pycache__/imports.cpython-311.pyc +0 -0
  22. __pycache__/imports.cpython-312.pyc +0 -0
  23. __pycache__/media.cpython-311.pyc +0 -0
  24. __pycache__/media.cpython-312.pyc +0 -0
  25. __pycache__/run.cpython-310.pyc +0 -0
  26. __pycache__/run.cpython-311.pyc +0 -0
  27. __pycache__/run.cpython-312.pyc +0 -0
  28. __pycache__/sqlite_storage.cpython-310.pyc +0 -0
  29. __pycache__/sqlite_storage.cpython-311.pyc +0 -0
  30. __pycache__/sqlite_storage.cpython-312.pyc +0 -0
  31. __pycache__/storage.cpython-312.pyc +0 -0
  32. __pycache__/typehints.cpython-311.pyc +0 -0
  33. __pycache__/typehints.cpython-312.pyc +0 -0
  34. __pycache__/types.cpython-312.pyc +0 -0
  35. __pycache__/ui.cpython-310.pyc +0 -0
  36. __pycache__/ui.cpython-311.pyc +0 -0
  37. __pycache__/ui.cpython-312.pyc +0 -0
  38. __pycache__/utils.cpython-310.pyc +0 -0
  39. __pycache__/utils.cpython-311.pyc +0 -0
  40. __pycache__/utils.cpython-312.pyc +0 -0
  41. assets/trackio_logo_dark.png +0 -0
  42. assets/trackio_logo_light.png +0 -0
  43. assets/trackio_logo_old.png +3 -0
  44. assets/trackio_logo_type_dark.png +0 -0
  45. assets/trackio_logo_type_dark_transparent.png +0 -0
  46. assets/trackio_logo_type_light.png +0 -0
  47. assets/trackio_logo_type_light_transparent.png +0 -0
  48. cli.py +32 -0
  49. commit_scheduler.py +391 -0
  50. context_vars.py +15 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/trackio_logo_old.png filter=lfs diff=lfs merge=lfs -text
__init__.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import os
3
+ import warnings
4
+ import webbrowser
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from gradio.blocks import BUILT_IN_THEMES
9
+ from gradio.themes import Default as DefaultTheme
10
+ from gradio.themes import ThemeClass
11
+ from gradio_client import Client
12
+
13
+ from trackio import context_vars, deploy, utils
14
+ from trackio.imports import import_csv, import_tf_events
15
+ from trackio.run import Run
16
+ from trackio.sqlite_storage import SQLiteStorage
17
+ from trackio.ui import demo
18
+ from trackio.utils import TRACKIO_DIR, TRACKIO_LOGO_DIR
19
+
20
# Package version, read from the version.txt file shipped alongside this module.
__version__ = Path(__file__).parent.joinpath("version.txt").read_text().strip()

__all__ = ["init", "log", "finish", "show", "import_csv", "import_tf_events"]


# Module-level config dict, replaced with the active Run's config by init()
# (for wandb-style `trackio.config` access).
config = {}

DEFAULT_THEME = "citrus"
28
+
29
+
30
def init(
    project: str,
    name: str | None = None,
    space_id: str | None = None,
    dataset_id: str | None = None,
    config: dict | None = None,
    resume: str = "never",
    settings: Any = None,
) -> Run:
    """
    Creates a new Trackio project and returns a Run object.

    Args:
        project: The name of the project (can be an existing project to continue tracking or a new project to start tracking from scratch).
        name: The name of the run (if not provided, a default name will be generated).
        space_id: If provided, the project will be logged to a Hugging Face Space instead of a local directory. Should be a complete Space name like "username/reponame" or "orgname/reponame", or just "reponame" in which case the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not exist, it will be created. If the Space already exists, the project will be logged to it.
        dataset_id: If a space_id is provided, a persistent Hugging Face Dataset will be created and the metrics will be synced to it every 5 minutes. Specify a Dataset with name like "username/datasetname" or "orgname/datasetname", or "datasetname" (uses currently-logged-in Hugging Face user's namespace), or None (uses the same name as the Space but with the "_dataset" suffix). If the Dataset does not exist, it will be created. If the Dataset already exists, the project will be appended to it.
        config: A dictionary of configuration options. Provided for compatibility with wandb.init()
        resume: Controls how to handle resuming a run. Can be one of:
            - "must": Must resume the run with the given name, raises error if run doesn't exist
            - "allow": Resume the run if it exists, otherwise create a new run
            - "never": Never resume a run, always create a new one

    Raises:
        ValueError: If `dataset_id` is given without `space_id`, if `resume`
            is not one of "must"/"allow"/"never", or if `resume="must"` and the
            named run does not exist.
        settings: Not used. Provided for compatibility with wandb.init()
    """
    # `settings` is accepted only for wandb API compatibility; warn so users
    # don't silently assume their settings took effect.
    if settings is not None:
        warnings.warn(
            "* Warning: settings is not used. Provided for compatibility with wandb.init(). Please create an issue at: https://github.com/gradio-app/trackio/issues if you need a specific feature implemented."
        )

    if space_id is None and dataset_id is not None:
        raise ValueError("Must provide a `space_id` when `dataset_id` is provided.")
    space_id, dataset_id = utils.preprocess_space_and_dataset_ids(space_id, dataset_id)
    # Reuse the dashboard server launched by a previous init() in this process,
    # if any; `url` is either a local Gradio URL or a Space id.
    url = context_vars.current_server.get()

    if url is None:
        if space_id is None:
            # No Space requested: launch the local dashboard app in-process,
            # without blocking the calling thread.
            _, url, _ = demo.launch(
                show_api=False,
                inline=False,
                quiet=True,
                prevent_thread_lock=True,
                show_error=True,
            )
        else:
            # Metrics will be sent to the Space instead of a local server.
            url = space_id
        context_vars.current_server.set(url)

    # Print setup instructions only when switching to (or first seeing) this
    # project, so repeated init() calls stay quiet.
    if (
        context_vars.current_project.get() is None
        or context_vars.current_project.get() != project
    ):
        print(f"* Trackio project initialized: {project}")

        if dataset_id is not None:
            # Downstream storage code reads this env var to know where to sync.
            os.environ["TRACKIO_DATASET_ID"] = dataset_id
            print(
                f"* Trackio metrics will be synced to Hugging Face Dataset: {dataset_id}"
            )
        if space_id is None:
            print(f"* Trackio metrics logged to: {TRACKIO_DIR}")
            utils.print_dashboard_instructions(project)
        else:
            deploy.create_space_if_not_exists(space_id, dataset_id)
            print(
                f"* View dashboard by going to: {deploy.SPACE_URL.format(space_id=space_id)}"
            )
    context_vars.current_project.set(project)

    # A Gradio client is only needed for the local server; Space logging goes
    # through a different path inside Run.
    client = None
    if not space_id:
        client = Client(url, verbose=False)

    if resume == "must":
        if name is None:
            raise ValueError("Must provide a run name when resume='must'")
        if name not in SQLiteStorage.get_runs(project):
            raise ValueError(f"Run '{name}' does not exist in project '{project}'")
    elif resume == "allow":
        if name is not None and name in SQLiteStorage.get_runs(project):
            print(f"* Resuming existing run: {name}")
    elif resume == "never":
        # A clashing name is discarded so Run generates a fresh default name
        # instead of resuming the existing run.
        if name is not None and name in SQLiteStorage.get_runs(project):
            name = None
    else:
        raise ValueError("resume must be one of: 'must', 'allow', or 'never'")

    run = Run(
        url=url,
        project=project,
        client=client,
        name=name,
        config=config,
        space_id=space_id,
    )
    context_vars.current_run.set(run)
    # Expose the run's config as the module-level `trackio.config`
    # (wandb-style access).
    globals()["config"] = run.config
    return run
127
+
128
+
129
def log(metrics: dict, step: int | None = None) -> None:
    """
    Logs a dictionary of metrics against the currently active run.

    Args:
        metrics: A dictionary of metrics to log.
        step: The step number. If not provided, the step will be incremented automatically.

    Raises:
        RuntimeError: If no run is active (i.e. trackio.init() was not called).
    """
    active_run = context_vars.current_run.get()
    if active_run is None:
        raise RuntimeError("Call trackio.init() before trackio.log().")
    active_run.log(metrics=metrics, step=step)
144
+
145
+
146
def finish():
    """
    Marks the currently active run as finished.

    Raises:
        RuntimeError: If no run is active (i.e. trackio.init() was not called).
    """
    active_run = context_vars.current_run.get()
    if active_run is None:
        raise RuntimeError("Call trackio.init() before trackio.finish().")
    active_run.finish()
154
+
155
+
156
def show(project: str | None = None, theme: str | ThemeClass = DEFAULT_THEME):
    """
    Launches the Trackio dashboard.

    Args:
        project: The name of the project whose runs to show. If not provided, all projects will be shown and the user can select one.
        theme: The Gradio theme to use. Can be any built-in Gradio theme (e.g. "citrus", "soft", "default"), theme from the Hub (https://huggingface.co/spaces/gradio/theme-gallery), or a custom gradio.themes.ThemeClass
    """
    if theme != DEFAULT_THEME:
        # TODO: It's a little hacky to reproduce this theme-setting logic from Gradio Blocks,
        # but in Gradio 6.0, the theme will be set in `launch()` instead, which means that we
        # will be able to remove this code.
        if isinstance(theme, str):
            if theme.lower() in BUILT_IN_THEMES:
                theme = BUILT_IN_THEMES[theme.lower()]
            else:
                # Not a built-in theme name: try loading it from the HF Hub,
                # falling back to the default theme on any failure.
                try:
                    theme = ThemeClass.from_hub(theme)
                except Exception as e:
                    warnings.warn(f"Cannot load {theme}. Caught Exception: {str(e)}")
                    theme = DefaultTheme()
        if not isinstance(theme, ThemeClass):
            warnings.warn("Theme should be a class loaded from gradio.themes")
            theme = DefaultTheme()
        # Mirror what gradio.Blocks does internally when a theme is passed to
        # its constructor: attach the theme, its generated CSS, stylesheets,
        # and a SHA-256 hash of the CSS to the app.
        demo.theme: ThemeClass = theme
        demo.theme_css = theme._get_theme_css()
        demo.stylesheets = theme._stylesheets
        theme_hasher = hashlib.sha256()
        theme_hasher.update(demo.theme_css.encode("utf-8"))
        demo.theme_hash = theme_hasher.hexdigest()

    _, url, share_url = demo.launch(
        show_api=False,
        quiet=True,
        inline=False,
        prevent_thread_lock=True,
        favicon_path=TRACKIO_LOGO_DIR / "trackio_logo_light.png",
        allowed_paths=[TRACKIO_LOGO_DIR],
    )

    # Prefer the public share URL when one was created; otherwise use the
    # local server URL returned by launch().
    base_url = share_url + "/" if share_url else url
    dashboard_url = base_url + f"?project={project}" if project else base_url
    print(f"* Trackio UI launched at: {dashboard_url}")
    webbrowser.open(dashboard_url)
    # Keep the process alive (outside notebooks) so the dashboard stays up.
    utils.block_except_in_notebook()
__pycache__/__init__.cpython-310.pyc ADDED
Binary file (4.96 kB). View file
 
__pycache__/__init__.cpython-311.pyc ADDED
Binary file (8.77 kB). View file
 
__pycache__/__init__.cpython-312.pyc ADDED
Binary file (9.81 kB). View file
 
__pycache__/cli.cpython-311.pyc ADDED
Binary file (1.25 kB). View file
 
__pycache__/cli.cpython-312.pyc ADDED
Binary file (1.43 kB). View file
 
__pycache__/commit_scheduler.cpython-311.pyc ADDED
Binary file (20.3 kB). View file
 
__pycache__/commit_scheduler.cpython-312.pyc ADDED
Binary file (18.8 kB). View file
 
__pycache__/context.cpython-312.pyc ADDED
Binary file (440 Bytes). View file
 
__pycache__/context_vars.cpython-311.pyc ADDED
Binary file (840 Bytes). View file
 
__pycache__/context_vars.cpython-312.pyc ADDED
Binary file (763 Bytes). View file
 
__pycache__/deploy.cpython-310.pyc ADDED
Binary file (1.72 kB). View file
 
__pycache__/deploy.cpython-311.pyc ADDED
Binary file (7.44 kB). View file
 
__pycache__/deploy.cpython-312.pyc ADDED
Binary file (6.75 kB). View file
 
__pycache__/dummy_commit_scheduler.cpython-310.pyc ADDED
Binary file (936 Bytes). View file
 
__pycache__/dummy_commit_scheduler.cpython-311.pyc ADDED
Binary file (1.19 kB). View file
 
__pycache__/dummy_commit_scheduler.cpython-312.pyc ADDED
Binary file (1.01 kB). View file
 
__pycache__/file_storage.cpython-311.pyc ADDED
Binary file (3.14 kB). View file
 
__pycache__/file_storage.cpython-312.pyc ADDED
Binary file (2.77 kB). View file
 
__pycache__/imports.cpython-311.pyc ADDED
Binary file (12.9 kB). View file
 
__pycache__/imports.cpython-312.pyc ADDED
Binary file (11.8 kB). View file
 
__pycache__/media.cpython-311.pyc ADDED
Binary file (6.39 kB). View file
 
__pycache__/media.cpython-312.pyc ADDED
Binary file (5.78 kB). View file
 
__pycache__/run.cpython-310.pyc ADDED
Binary file (1.01 kB). View file
 
__pycache__/run.cpython-311.pyc ADDED
Binary file (7.61 kB). View file
 
__pycache__/run.cpython-312.pyc ADDED
Binary file (5.12 kB). View file
 
__pycache__/sqlite_storage.cpython-310.pyc ADDED
Binary file (5.37 kB). View file
 
__pycache__/sqlite_storage.cpython-311.pyc ADDED
Binary file (22.4 kB). View file
 
__pycache__/sqlite_storage.cpython-312.pyc ADDED
Binary file (18.7 kB). View file
 
__pycache__/storage.cpython-312.pyc ADDED
Binary file (4.6 kB). View file
 
__pycache__/typehints.cpython-311.pyc ADDED
Binary file (1.06 kB). View file
 
__pycache__/typehints.cpython-312.pyc ADDED
Binary file (535 Bytes). View file
 
__pycache__/types.cpython-312.pyc ADDED
Binary file (531 Bytes). View file
 
__pycache__/ui.cpython-310.pyc ADDED
Binary file (7.83 kB). View file
 
__pycache__/ui.cpython-311.pyc ADDED
Binary file (29.9 kB). View file
 
__pycache__/ui.cpython-312.pyc ADDED
Binary file (24 kB). View file
 
__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.62 kB). View file
 
__pycache__/utils.cpython-311.pyc ADDED
Binary file (16.1 kB). View file
 
__pycache__/utils.cpython-312.pyc ADDED
Binary file (15.4 kB). View file
 
assets/trackio_logo_dark.png ADDED
assets/trackio_logo_light.png ADDED
assets/trackio_logo_old.png ADDED

Git LFS Details

  • SHA256: 3922c4d1e465270ad4d8abb12023f3beed5d9f7f338528a4c0ac21dcf358a1c8
  • Pointer size: 131 Bytes
  • Size of remote file: 487 kB
assets/trackio_logo_type_dark.png ADDED
assets/trackio_logo_type_dark_transparent.png ADDED
assets/trackio_logo_type_light.png ADDED
assets/trackio_logo_type_light_transparent.png ADDED
cli.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ from trackio import show
4
+
5
+
6
def main():
    """Entry point for the Trackio command-line interface.

    Currently supports a single subcommand, ``show``, which launches the
    dashboard UI; any other invocation prints the help text.
    """
    cli = argparse.ArgumentParser(description="Trackio CLI")
    commands = cli.add_subparsers(dest="command")

    show_parser = commands.add_parser(
        "show", help="Show the Trackio dashboard UI for a project"
    )
    show_parser.add_argument(
        "--project", required=False, help="Project name to show in the dashboard"
    )
    show_parser.add_argument(
        "--theme",
        required=False,
        default="citrus",
        help="A Gradio Theme to use for the dashboard instead of the default 'citrus', can be a built-in theme (e.g. 'soft', 'default'), a theme from the Hub (e.g. 'gstaff/xkcd').",
    )

    parsed = cli.parse_args()

    if parsed.command != "show":
        cli.print_help()
    else:
        show(parsed.project, parsed.theme)


if __name__ == "__main__":
    main()
commit_scheduler.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Originally copied from https://github.com/huggingface/huggingface_hub/blob/d0a948fc2a32ed6e557042a95ef3e4af97ec4a7c/src/huggingface_hub/_commit_scheduler.py
2
+
3
+ import atexit
4
+ import logging
5
+ import os
6
+ import time
7
+ from concurrent.futures import Future
8
+ from dataclasses import dataclass
9
+ from io import SEEK_END, SEEK_SET, BytesIO
10
+ from pathlib import Path
11
+ from threading import Lock, Thread
12
+ from typing import Callable, Dict, List, Optional, Union
13
+
14
+ from huggingface_hub.hf_api import (
15
+ DEFAULT_IGNORE_PATTERNS,
16
+ CommitInfo,
17
+ CommitOperationAdd,
18
+ HfApi,
19
+ )
20
+ from huggingface_hub.utils import filter_repo_objects
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
@dataclass(frozen=True)
class _FileToUpload:
    """Temporary dataclass to store info about files to upload. Not meant to be used directly."""

    # Absolute path of the file on the local filesystem.
    local_path: Path
    # Destination path inside the repo (already includes the `path_in_repo` prefix).
    path_in_repo: str
    # File size in bytes at scan time. Intended to cap the upload of files that
    # keep growing; see the TODO in `CommitScheduler.push_to_hub` — currently
    # not passed to `CommitOperationAdd`.
    size_limit: int
    # `st_mtime` at scan time; compared against `last_uploaded` on the next
    # scheduled commit to skip unchanged files.
    last_modified: float
33
+
34
+
35
class CommitScheduler:
    """
    Scheduler to upload a local folder to the Hub at regular intervals (e.g. push to hub every 5 minutes).

    The recommended way to use the scheduler is to use it as a context manager. This ensures that the scheduler is
    properly stopped and the last commit is triggered when the script ends. The scheduler can also be stopped manually
    with the `stop` method. Checkout the [upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#scheduled-uploads)
    to learn more about how to use it.

    Args:
        repo_id (`str`):
            The id of the repo to commit to.
        folder_path (`str` or `Path`):
            Path to the local folder to upload regularly.
        every (`int` or `float`, *optional*):
            The number of minutes between each commit. Defaults to 5 minutes.
        path_in_repo (`str`, *optional*):
            Relative path of the directory in the repo, for example: `"checkpoints/"`. Defaults to the root folder
            of the repository.
        repo_type (`str`, *optional*):
            The type of the repo to commit to. Defaults to `model`.
        revision (`str`, *optional*):
            The revision of the repo to commit to. Defaults to `main`.
        private (`bool`, *optional*):
            Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.
        token (`str`, *optional*):
            The token to use to commit to the repo. Defaults to the token saved on the machine.
        allow_patterns (`List[str]` or `str`, *optional*):
            If provided, only files matching at least one pattern are uploaded.
        ignore_patterns (`List[str]` or `str`, *optional*):
            If provided, files matching any of the patterns are not uploaded.
        squash_history (`bool`, *optional*):
            Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
            useful to avoid degraded performances on the repo when it grows too large.
        hf_api (`HfApi`, *optional*):
            The [`HfApi`] client to use to commit to the Hub. Can be set with custom settings (user agent, token,...).
        on_before_commit (`Callable[[], None]`, *optional*):
            If specified, a function that will be called before the CommitScheduler lists files to create a commit.

    Example:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    # Scheduler uploads every 10 minutes
    >>> csv_path = Path("watched_folder/data.csv")
    >>> CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path=csv_path.parent, every=10)

    >>> with csv_path.open("a") as f:
    ...     f.write("first line")

    # Some time later (...)
    >>> with csv_path.open("a") as f:
    ...     f.write("second line")
    ```

    Example using a context manager:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    >>> with CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path="watched_folder", every=10) as scheduler:
    ...     csv_path = Path("watched_folder/data.csv")
    ...     with csv_path.open("a") as f:
    ...         f.write("first line")
    ...     (...)
    ...     with csv_path.open("a") as f:
    ...         f.write("second line")

    # Scheduler is now stopped and last commit have been triggered
    ```
    """

    def __init__(
        self,
        *,
        repo_id: str,
        folder_path: Union[str, Path],
        every: Union[int, float] = 5,
        path_in_repo: Optional[str] = None,
        repo_type: Optional[str] = None,
        revision: Optional[str] = None,
        private: Optional[bool] = None,
        token: Optional[str] = None,
        allow_patterns: Optional[Union[List[str], str]] = None,
        ignore_patterns: Optional[Union[List[str], str]] = None,
        squash_history: bool = False,
        hf_api: Optional["HfApi"] = None,
        on_before_commit: Optional[Callable[[], None]] = None,
    ) -> None:
        self.api = hf_api or HfApi(token=token)
        self.on_before_commit = on_before_commit

        # Folder
        self.folder_path = Path(folder_path).expanduser().resolve()
        self.path_in_repo = path_in_repo or ""
        self.allow_patterns = allow_patterns

        if ignore_patterns is None:
            ignore_patterns = []
        elif isinstance(ignore_patterns, str):
            ignore_patterns = [ignore_patterns]
        # Always exclude the Hub's default ignore list (e.g. .git) on top of
        # user-provided patterns.
        self.ignore_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS

        if self.folder_path.is_file():
            raise ValueError(
                f"'folder_path' must be a directory, not a file: '{self.folder_path}'."
            )
        self.folder_path.mkdir(parents=True, exist_ok=True)

        # Repository
        repo_url = self.api.create_repo(
            repo_id=repo_id, private=private, repo_type=repo_type, exist_ok=True
        )
        self.repo_id = repo_url.repo_id
        self.repo_type = repo_type
        self.revision = revision
        self.token = token

        # Maps each local file path to the st_mtime it had when last uploaded,
        # so unchanged files can be skipped on subsequent commits.
        self.last_uploaded: Dict[Path, float] = {}
        # Wall-clock time of the last successful push (None until then).
        self.last_push_time: float | None = None

        if not every > 0:
            raise ValueError(f"'every' must be a positive integer, not '{every}'.")
        self.lock = Lock()
        self.every = every
        self.squash_history = squash_history

        # BUGFIX: this flag must be initialized *before* starting the
        # background thread (and registering the atexit hook). In the original
        # ordering the thread could call `_push_to_hub` and read
        # `self.__stopped` before `__init__` assigned it, raising
        # AttributeError.
        self.__stopped = False

        logger.info(
            f"Scheduled job to push '{self.folder_path}' to '{self.repo_id}' every {self.every} minutes."
        )
        self._scheduler_thread = Thread(target=self._run_scheduler, daemon=True)
        self._scheduler_thread.start()
        # Best-effort final push when the interpreter exits.
        atexit.register(self._push_to_hub)

    def stop(self) -> None:
        """Stop the scheduler.

        A stopped scheduler cannot be restarted. Mostly for tests purposes.
        """
        self.__stopped = True

    def __enter__(self) -> "CommitScheduler":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Upload last changes before exiting
        self.trigger().result()
        self.stop()
        return

    def _run_scheduler(self) -> None:
        """Dumb thread waiting between each scheduled push to Hub."""
        while True:
            self.last_future = self.trigger()
            time.sleep(self.every * 60)
            if self.__stopped:
                break

    def trigger(self) -> Future:
        """Trigger a `push_to_hub` and return a future.

        This method is automatically called every `every` minutes. You can also call it manually to trigger a commit
        immediately, without waiting for the next scheduled commit.
        """
        # run_as_future serializes commits through HfApi's internal queue,
        # avoiding concurrent pushes.
        return self.api.run_as_future(self._push_to_hub)

    def _push_to_hub(self) -> Optional[CommitInfo]:
        if self.__stopped:  # If stopped, already scheduled commits are ignored
            return None

        logger.info("(Background) scheduled commit triggered.")
        try:
            value = self.push_to_hub()
            if self.squash_history:
                logger.info("(Background) squashing repo history.")
                self.api.super_squash_history(
                    repo_id=self.repo_id, repo_type=self.repo_type, branch=self.revision
                )
            return value
        except Exception as e:
            logger.error(
                f"Error while pushing to Hub: {e}"
            )  # Depending on the setup, error might be silenced
            raise

    def push_to_hub(self) -> Optional[CommitInfo]:
        """
        Push folder to the Hub and return the commit info.

        <Tip warning={true}>

        This method is not meant to be called directly. It is run in the background by the scheduler, respecting a
        queue mechanism to avoid concurrent commits. Making a direct call to the method might lead to concurrency
        issues.

        </Tip>

        The default behavior of `push_to_hub` is to assume an append-only folder. It lists all files in the folder and
        uploads only changed files. If no changes are found, the method returns without committing anything. If you want
        to change this behavior, you can inherit from [`CommitScheduler`] and override this method. This can be useful
        for example to compress data together in a single file before committing. For more details and examples, check
        out our [integration guide](https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#scheduled-uploads).
        """
        # Check files to upload (with lock)
        with self.lock:
            # Give the caller a chance to flush buffers/DB state to disk
            # before the folder is scanned.
            if self.on_before_commit is not None:
                self.on_before_commit()

            logger.debug("Listing files to upload for scheduled commit.")

            # List files from folder (taken from `_prepare_upload_folder_additions`)
            relpath_to_abspath = {
                path.relative_to(self.folder_path).as_posix(): path
                for path in sorted(
                    self.folder_path.glob("**/*")
                )  # sorted to be deterministic
                if path.is_file()
            }
            prefix = f"{self.path_in_repo.strip('/')}/" if self.path_in_repo else ""

            # Filter with pattern + filter out unchanged files + retrieve current file size
            files_to_upload: List[_FileToUpload] = []
            for relpath in filter_repo_objects(
                relpath_to_abspath.keys(),
                allow_patterns=self.allow_patterns,
                ignore_patterns=self.ignore_patterns,
            ):
                local_path = relpath_to_abspath[relpath]
                stat = local_path.stat()
                # Upload only files whose mtime changed since the last push
                # (or that were never pushed).
                if (
                    self.last_uploaded.get(local_path) is None
                    or self.last_uploaded[local_path] != stat.st_mtime
                ):
                    files_to_upload.append(
                        _FileToUpload(
                            local_path=local_path,
                            path_in_repo=prefix + relpath,
                            size_limit=stat.st_size,
                            last_modified=stat.st_mtime,
                        )
                    )

        # Return if nothing to upload
        if len(files_to_upload) == 0:
            logger.debug("Dropping schedule commit: no changed file to upload.")
            return None

        # Convert `_FileToUpload` as `CommitOperationAdd` (=> compute file shas + limit to file size)
        logger.debug("Removing unchanged files since previous scheduled commit.")
        add_operations = [
            CommitOperationAdd(
                # TODO: Cap the file to its current size, even if the user append data to it while a scheduled commit is happening
                # (requires an upstream fix for XET-535: `hf_xet` should support `BinaryIO` for upload)
                path_or_fileobj=file_to_upload.local_path,
                path_in_repo=file_to_upload.path_in_repo,
            )
            for file_to_upload in files_to_upload
        ]

        # Upload files (append mode expected - no need for lock)
        logger.debug("Uploading files for scheduled commit.")
        commit_info = self.api.create_commit(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            operations=add_operations,
            commit_message="Scheduled Commit",
            revision=self.revision,
        )

        # Record mtimes only after a successful commit so failed pushes are
        # retried on the next schedule.
        for file in files_to_upload:
            self.last_uploaded[file.local_path] = file.last_modified

        self.last_push_time = time.time()

        return commit_info
313
+
314
+
315
class PartialFileIO(BytesIO):
    """A file-like object that reads only the first part of a file.

    Useful to upload a file to the Hub when the user might still be appending data to it. Only the first part of the
    file is uploaded (i.e. the part that was available when the filesystem was first scanned).

    In practice, only used internally by the CommitScheduler to regularly push a folder to the Hub with minimal
    disturbance for the user. The object is passed to `CommitOperationAdd`.

    Only supports `read`, `tell` and `seek` methods.

    Args:
        file_path (`str` or `Path`):
            Path to the file to read.
        size_limit (`int`):
            The maximum number of bytes to read from the file. If the file is larger than this, only the first part
            will be read (and uploaded).
    """

    def __init__(self, file_path: Union[str, Path], size_limit: int) -> None:
        # Subclasses BytesIO only so isinstance checks treat it as a binary
        # stream; BytesIO.__init__ is intentionally not called — all supported
        # operations are delegated to the real file handle below.
        self._file_path = Path(file_path)
        self._file = self._file_path.open("rb")
        # Never expose more than the file currently holds on disk.
        actual_size = os.fstat(self._file.fileno()).st_size
        self._size_limit = min(size_limit, actual_size)

    def __del__(self) -> None:
        self._file.close()
        return super().__del__()

    def __repr__(self) -> str:
        return (
            f"<PartialFileIO file_path={self._file_path} size_limit={self._size_limit}>"
        )

    def __len__(self) -> int:
        # The "length" of the stream is the truncated size, not the on-disk size.
        return self._size_limit

    def __getattribute__(self, name: str):
        # Reject every public BytesIO attribute except the three supported
        # methods; private/dunder names pass through untouched.
        if not name.startswith("_") and name not in ("read", "tell", "seek"):
            raise NotImplementedError(f"PartialFileIO does not support '{name}'.")
        return super().__getattribute__(name)

    def tell(self) -> int:
        """Return the current file position."""
        return self._file.tell()

    def seek(self, __offset: int, __whence: int = SEEK_SET) -> int:
        """Change the stream position to the given offset.

        Behavior is the same as a regular file, except that the position is capped to the size limit.
        """
        target, mode = __offset, __whence
        if mode == SEEK_END:
            # Interpret SEEK_END relative to the truncated end, not the real EOF.
            target += len(self)
            mode = SEEK_SET

        new_pos = self._file.seek(target, mode)
        if new_pos <= self._size_limit:
            return new_pos
        # Clamp any position past the limit back to the limit.
        return self._file.seek(self._size_limit)

    def read(self, __size: Optional[int] = -1) -> bytes:
        """Read at most `__size` bytes from the file.

        Behavior is the same as a regular file, except that it is capped to the size limit.
        """
        pos = self._file.tell()
        remaining = self._size_limit - pos
        if __size is not None and __size >= 0:
            # A bounded read may still not cross the size limit.
            remaining = min(__size, remaining)
        return self._file.read(remaining)
context_vars.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import contextvars
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only for type checking, to avoid a runtime import cycle with trackio.run.
    from trackio.run import Run

# The Run created by the most recent trackio.init() in this context (None before init()).
current_run: contextvars.ContextVar["Run | None"] = contextvars.ContextVar(
    "current_run", default=None
)
# Name of the project passed to the most recent trackio.init() call.
current_project: contextvars.ContextVar[str | None] = contextvars.ContextVar(
    "current_project", default=None
)
# URL of the local dashboard server, or the Space id when logging to a Space.
current_server: contextvars.ContextVar[str | None] = contextvars.ContextVar(
    "current_server", default=None
)