abidlabs (HF Staff) committed
Commit fb8ec19 · verified · 1 parent: 7640fb2

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes.

Files changed (50):
  1. .gitattributes +1 -0
  2. __init__.py +169 -0
  3. __pycache__/__init__.cpython-310.pyc +0 -0
  4. __pycache__/__init__.cpython-311.pyc +0 -0
  5. __pycache__/__init__.cpython-312.pyc +0 -0
  6. __pycache__/cli.cpython-312.pyc +0 -0
  7. __pycache__/commit_scheduler.cpython-311.pyc +0 -0
  8. __pycache__/commit_scheduler.cpython-312.pyc +0 -0
  9. __pycache__/context.cpython-312.pyc +0 -0
  10. __pycache__/context_vars.cpython-311.pyc +0 -0
  11. __pycache__/context_vars.cpython-312.pyc +0 -0
  12. __pycache__/deploy.cpython-310.pyc +0 -0
  13. __pycache__/deploy.cpython-311.pyc +0 -0
  14. __pycache__/deploy.cpython-312.pyc +0 -0
  15. __pycache__/dummy_commit_scheduler.cpython-310.pyc +0 -0
  16. __pycache__/dummy_commit_scheduler.cpython-311.pyc +0 -0
  17. __pycache__/dummy_commit_scheduler.cpython-312.pyc +0 -0
  18. __pycache__/imports.cpython-311.pyc +0 -0
  19. __pycache__/imports.cpython-312.pyc +0 -0
  20. __pycache__/run.cpython-310.pyc +0 -0
  21. __pycache__/run.cpython-311.pyc +0 -0
  22. __pycache__/run.cpython-312.pyc +0 -0
  23. __pycache__/sqlite_storage.cpython-310.pyc +0 -0
  24. __pycache__/sqlite_storage.cpython-311.pyc +0 -0
  25. __pycache__/sqlite_storage.cpython-312.pyc +0 -0
  26. __pycache__/storage.cpython-312.pyc +0 -0
  27. __pycache__/typehints.cpython-311.pyc +0 -0
  28. __pycache__/typehints.cpython-312.pyc +0 -0
  29. __pycache__/types.cpython-312.pyc +0 -0
  30. __pycache__/ui.cpython-310.pyc +0 -0
  31. __pycache__/ui.cpython-311.pyc +0 -0
  32. __pycache__/ui.cpython-312.pyc +0 -0
  33. __pycache__/utils.cpython-310.pyc +0 -0
  34. __pycache__/utils.cpython-311.pyc +0 -0
  35. __pycache__/utils.cpython-312.pyc +0 -0
  36. assets/trackio_logo_dark.png +0 -0
  37. assets/trackio_logo_light.png +0 -0
  38. assets/trackio_logo_old.png +3 -0
  39. assets/trackio_logo_type_dark.png +0 -0
  40. assets/trackio_logo_type_dark_transparent.png +0 -0
  41. assets/trackio_logo_type_light.png +0 -0
  42. assets/trackio_logo_type_light_transparent.png +0 -0
  43. cli.py +26 -0
  44. commit_scheduler.py +392 -0
  45. context_vars.py +15 -0
  46. data_types/__pycache__/__init__.cpython-312.pyc +0 -0
  47. data_types/__pycache__/table.cpython-312.pyc +0 -0
  48. deploy.py +170 -0
  49. dummy_commit_scheduler.py +12 -0
  50. imports.py +245 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/trackio_logo_old.png filter=lfs diff=lfs merge=lfs -text
__init__.py ADDED
@@ -0,0 +1,169 @@
import os
import warnings
import webbrowser
from pathlib import Path
from typing import Any

from gradio_client import Client

from trackio import context_vars, deploy, utils
from trackio.imports import import_csv, import_tf_events
from trackio.run import Run
from trackio.sqlite_storage import SQLiteStorage
from trackio.ui import demo
from trackio.utils import TRACKIO_DIR, TRACKIO_LOGO_DIR

__version__ = Path(__file__).parent.joinpath("version.txt").read_text().strip()

__all__ = ["init", "log", "finish", "show", "import_csv", "import_tf_events"]


config = {}


def init(
    project: str,
    name: str | None = None,
    space_id: str | None = None,
    dataset_id: str | None = None,
    config: dict | None = None,
    resume: str = "never",
    settings: Any = None,
) -> Run:
    """
    Creates a new Trackio project and returns a Run object.

    Args:
        project: The name of the project (can be an existing project to continue tracking or a new project to start tracking from scratch).
        name: The name of the run (if not provided, a default name will be generated).
        space_id: If provided, the project will be logged to a Hugging Face Space instead of a local directory. Should be a complete Space name like "username/reponame" or "orgname/reponame", or just "reponame" in which case the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not exist, it will be created. If the Space already exists, the project will be logged to it.
        dataset_id: If a space_id is provided, a persistent Hugging Face Dataset will be created and the metrics will be synced to it every 5 minutes. Specify a Dataset with a name like "username/datasetname" or "orgname/datasetname", or "datasetname" (uses the currently-logged-in Hugging Face user's namespace), or None (uses the same name as the Space but with the "_dataset" suffix). If the Dataset does not exist, it will be created. If the Dataset already exists, the project will be appended to it.
        config: A dictionary of configuration options. Provided for compatibility with wandb.init().
        resume: Controls how to handle resuming a run. Can be one of:
            - "must": Must resume the run with the given name, raises error if run doesn't exist
            - "allow": Resume the run if it exists, otherwise create a new run
            - "never": Never resume a run, always create a new one
        settings: Not used. Provided for compatibility with wandb.init().
    """
    if settings is not None:
        warnings.warn(
            "* Warning: settings is not used. Provided for compatibility with wandb.init(). Please create an issue at: https://github.com/gradio-app/trackio/issues if you need a specific feature implemented."
        )

    if space_id is None and dataset_id is not None:
        raise ValueError("Must provide a `space_id` when `dataset_id` is provided.")
    space_id, dataset_id = utils.preprocess_space_and_dataset_ids(space_id, dataset_id)
    url = context_vars.current_server.get()

    if url is None:
        if space_id is None:
            _, url, _ = demo.launch(
                show_api=False,
                inline=False,
                quiet=True,
                prevent_thread_lock=True,
                show_error=True,
            )
        else:
            url = space_id
        context_vars.current_server.set(url)

    if (
        context_vars.current_project.get() is None
        or context_vars.current_project.get() != project
    ):
        print(f"* Trackio project initialized: {project}")

        if dataset_id is not None:
            os.environ["TRACKIO_DATASET_ID"] = dataset_id
            print(
                f"* Trackio metrics will be synced to Hugging Face Dataset: {dataset_id}"
            )
        if space_id is None:
            print(f"* Trackio metrics logged to: {TRACKIO_DIR}")
            utils.print_dashboard_instructions(project)
        else:
            deploy.create_space_if_not_exists(space_id, dataset_id)
            print(
                f"* View dashboard by going to: {deploy.SPACE_URL.format(space_id=space_id)}"
            )
    context_vars.current_project.set(project)

    client = None
    if not space_id:
        client = Client(url, verbose=False)

    if resume == "must":
        if name is None:
            raise ValueError("Must provide a run name when resume='must'")
        if name not in SQLiteStorage.get_runs(project):
            raise ValueError(f"Run '{name}' does not exist in project '{project}'")
    elif resume == "allow":
        if name is not None and name in SQLiteStorage.get_runs(project):
            print(f"* Resuming existing run: {name}")
    elif resume == "never":
        if name is not None and name in SQLiteStorage.get_runs(project):
            name = None
    else:
        raise ValueError("resume must be one of: 'must', 'allow', or 'never'")

    run = Run(
        url=url,
        project=project,
        client=client,
        name=name,
        config=config,
        space_id=space_id,
    )
    context_vars.current_run.set(run)
    globals()["config"] = run.config
    return run


def log(metrics: dict, step: int | None = None) -> None:
    """
    Logs metrics to the current run.

    Args:
        metrics: A dictionary of metrics to log.
        step: The step number. If not provided, the step will be incremented automatically.
    """
    run = context_vars.current_run.get()
    if run is None:
        raise RuntimeError("Call trackio.init() before trackio.log().")
    run.log(
        metrics=metrics,
        step=step,
    )


def finish():
    """
    Finishes the current run.
    """
    run = context_vars.current_run.get()
    if run is None:
        raise RuntimeError("Call trackio.init() before trackio.finish().")
    run.finish()


def show(project: str | None = None):
    """
    Launches the Trackio dashboard.

    Args:
        project: The name of the project whose runs to show. If not provided, all projects will be shown and the user can select one.
    """
    _, url, share_url = demo.launch(
        show_api=False,
        quiet=True,
        inline=False,
        prevent_thread_lock=True,
        favicon_path=TRACKIO_LOGO_DIR / "trackio_logo_light.png",
        allowed_paths=[TRACKIO_LOGO_DIR],
    )
    base_url = share_url + "/" if share_url else url
    dashboard_url = base_url + f"?project={project}" if project else base_url
    print(f"* Trackio UI launched at: {dashboard_url}")
    webbrowser.open(dashboard_url)
    utils.block_except_in_notebook()
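
For orientation, here is a minimal usage sketch of the public API defined above; the project, run, and metric names are illustrative, and it assumes trackio is installed locally:

```python
# Hedged sketch of the API above; "demo-project", "baseline", and "loss" are illustrative.
import trackio

run = trackio.init(project="demo-project", name="baseline")  # no space_id: launches a local dashboard
for step in range(3):
    trackio.log({"loss": 1.0 / (step + 1)}, step=step)  # logs to the run set by init()
trackio.finish()  # closes the current run
```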
__pycache__/__init__.cpython-310.pyc ADDED
Binary file (4.96 kB)

__pycache__/__init__.cpython-311.pyc ADDED
Binary file (8.69 kB)

__pycache__/__init__.cpython-312.pyc ADDED
Binary file (8.05 kB)

__pycache__/cli.cpython-312.pyc ADDED
Binary file (1.11 kB)

__pycache__/commit_scheduler.cpython-311.pyc ADDED
Binary file (20.3 kB)

__pycache__/commit_scheduler.cpython-312.pyc ADDED
Binary file (18.7 kB)

__pycache__/context.cpython-312.pyc ADDED
Binary file (440 Bytes)

__pycache__/context_vars.cpython-311.pyc ADDED
Binary file (840 Bytes)

__pycache__/context_vars.cpython-312.pyc ADDED
Binary file (763 Bytes)

__pycache__/deploy.cpython-310.pyc ADDED
Binary file (1.72 kB)

__pycache__/deploy.cpython-311.pyc ADDED
Binary file (7.44 kB)

__pycache__/deploy.cpython-312.pyc ADDED
Binary file (6.6 kB)

__pycache__/dummy_commit_scheduler.cpython-310.pyc ADDED
Binary file (936 Bytes)

__pycache__/dummy_commit_scheduler.cpython-311.pyc ADDED
Binary file (1.19 kB)

__pycache__/dummy_commit_scheduler.cpython-312.pyc ADDED
Binary file (1.01 kB)

__pycache__/imports.cpython-311.pyc ADDED
Binary file (12.9 kB)

__pycache__/imports.cpython-312.pyc ADDED
Binary file (11.8 kB)

__pycache__/run.cpython-310.pyc ADDED
Binary file (1.01 kB)

__pycache__/run.cpython-311.pyc ADDED
Binary file (5.65 kB)

__pycache__/run.cpython-312.pyc ADDED
Binary file (5.1 kB)

__pycache__/sqlite_storage.cpython-310.pyc ADDED
Binary file (5.37 kB)

__pycache__/sqlite_storage.cpython-311.pyc ADDED
Binary file (22.3 kB)

__pycache__/sqlite_storage.cpython-312.pyc ADDED
Binary file (18.7 kB)

__pycache__/storage.cpython-312.pyc ADDED
Binary file (4.6 kB)

__pycache__/typehints.cpython-311.pyc ADDED
Binary file (648 Bytes)

__pycache__/typehints.cpython-312.pyc ADDED
Binary file (535 Bytes)

__pycache__/types.cpython-312.pyc ADDED
Binary file (531 Bytes)

__pycache__/ui.cpython-310.pyc ADDED
Binary file (7.83 kB)

__pycache__/ui.cpython-311.pyc ADDED
Binary file (24.4 kB)

__pycache__/ui.cpython-312.pyc ADDED
Binary file (21.3 kB)

__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.62 kB)

__pycache__/utils.cpython-311.pyc ADDED
Binary file (11.2 kB)

__pycache__/utils.cpython-312.pyc ADDED
Binary file (10.2 kB)
 
assets/trackio_logo_dark.png ADDED
assets/trackio_logo_light.png ADDED
assets/trackio_logo_old.png ADDED

Git LFS Details

  • SHA256: 3922c4d1e465270ad4d8abb12023f3beed5d9f7f338528a4c0ac21dcf358a1c8
  • Pointer size: 131 Bytes
  • Size of remote file: 487 kB
assets/trackio_logo_type_dark.png ADDED
assets/trackio_logo_type_dark_transparent.png ADDED
assets/trackio_logo_type_light.png ADDED
assets/trackio_logo_type_light_transparent.png ADDED
cli.py ADDED
@@ -0,0 +1,26 @@
import argparse

from trackio import show


def main():
    parser = argparse.ArgumentParser(description="Trackio CLI")
    subparsers = parser.add_subparsers(dest="command")

    ui_parser = subparsers.add_parser(
        "show", help="Show the Trackio dashboard UI for a project"
    )
    ui_parser.add_argument(
        "--project", required=False, help="Project name to show in the dashboard"
    )

    args = parser.parse_args()

    if args.command == "show":
        show(args.project)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
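
One quick way to exercise this argparse wiring from Python, without relying on a console entry point being installed; the project name is illustrative:

```python
# Simulates `trackio show --project my-project` by patching argv; "my-project" is illustrative.
import sys

from trackio.cli import main

sys.argv = ["trackio", "show", "--project", "my-project"]
main()  # parses the "show" subcommand and dispatches to trackio.show("my-project")
```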
commit_scheduler.py ADDED
@@ -0,0 +1,392 @@
# Originally copied from https://github.com/huggingface/huggingface_hub/blob/d0a948fc2a32ed6e557042a95ef3e4af97ec4a7c/src/huggingface_hub/_commit_scheduler.py

import atexit
import logging
import os
import time
from concurrent.futures import Future
from dataclasses import dataclass
from io import SEEK_END, SEEK_SET, BytesIO
from pathlib import Path
from threading import Lock, Thread
from typing import Callable, Dict, List, Optional, Union

from huggingface_hub.hf_api import (
    DEFAULT_IGNORE_PATTERNS,
    CommitInfo,
    CommitOperationAdd,
    HfApi,
)
from huggingface_hub.utils import filter_repo_objects

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class _FileToUpload:
    """Temporary dataclass to store info about files to upload. Not meant to be used directly."""

    local_path: Path
    path_in_repo: str
    size_limit: int
    last_modified: float


class CommitScheduler:
    """
    Scheduler to upload a local folder to the Hub at regular intervals (e.g. push to hub every 5 minutes).

    The recommended way to use the scheduler is to use it as a context manager. This ensures that the scheduler is
    properly stopped and the last commit is triggered when the script ends. The scheduler can also be stopped manually
    with the `stop` method. Check out the [upload guide](https://huggingface.co/docs/huggingface_hub/guides/upload#scheduled-uploads)
    to learn more about how to use it.

    Args:
        repo_id (`str`):
            The id of the repo to commit to.
        folder_path (`str` or `Path`):
            Path to the local folder to upload regularly.
        every (`int` or `float`, *optional*):
            The number of minutes between each commit. Defaults to 5 minutes.
        path_in_repo (`str`, *optional*):
            Relative path of the directory in the repo, for example: `"checkpoints/"`. Defaults to the root folder
            of the repository.
        repo_type (`str`, *optional*):
            The type of the repo to commit to. Defaults to `model`.
        revision (`str`, *optional*):
            The revision of the repo to commit to. Defaults to `main`.
        private (`bool`, *optional*):
            Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.
        token (`str`, *optional*):
            The token to use to commit to the repo. Defaults to the token saved on the machine.
        allow_patterns (`List[str]` or `str`, *optional*):
            If provided, only files matching at least one pattern are uploaded.
        ignore_patterns (`List[str]` or `str`, *optional*):
            If provided, files matching any of the patterns are not uploaded.
        squash_history (`bool`, *optional*):
            Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
            useful to avoid degraded performance on the repo when it grows too large.
        hf_api (`HfApi`, *optional*):
            The [`HfApi`] client to use to commit to the Hub. Can be set with custom settings (user agent, token,...).
        on_before_commit (`Callable[[], None]`, *optional*):
            If specified, a function that will be called before the CommitScheduler lists files to create a commit.

    Example:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    # Scheduler uploads every 10 minutes
    >>> csv_path = Path("watched_folder/data.csv")
    >>> CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path=csv_path.parent, every=10)

    >>> with csv_path.open("a") as f:
    ...     f.write("first line")

    # Some time later (...)
    >>> with csv_path.open("a") as f:
    ...     f.write("second line")
    ```

    Example using a context manager:
    ```py
    >>> from pathlib import Path
    >>> from huggingface_hub import CommitScheduler

    >>> with CommitScheduler(repo_id="test_scheduler", repo_type="dataset", folder_path="watched_folder", every=10) as scheduler:
    ...     csv_path = Path("watched_folder/data.csv")
    ...     with csv_path.open("a") as f:
    ...         f.write("first line")
    ...     (...)
    ...     with csv_path.open("a") as f:
    ...         f.write("second line")

    # Scheduler is now stopped and the last commit has been triggered
    ```
    """

    def __init__(
        self,
        *,
        repo_id: str,
        folder_path: Union[str, Path],
        every: Union[int, float] = 5,
        path_in_repo: Optional[str] = None,
        repo_type: Optional[str] = None,
        revision: Optional[str] = None,
        private: Optional[bool] = None,
        token: Optional[str] = None,
        allow_patterns: Optional[Union[List[str], str]] = None,
        ignore_patterns: Optional[Union[List[str], str]] = None,
        squash_history: bool = False,
        hf_api: Optional["HfApi"] = None,
        on_before_commit: Optional[Callable[[], None]] = None,
    ) -> None:
        self.api = hf_api or HfApi(token=token)
        self.on_before_commit = on_before_commit

        # Folder
        self.folder_path = Path(folder_path).expanduser().resolve()
        self.path_in_repo = path_in_repo or ""
        self.allow_patterns = allow_patterns

        if ignore_patterns is None:
            ignore_patterns = []
        elif isinstance(ignore_patterns, str):
            ignore_patterns = [ignore_patterns]
        self.ignore_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS

        if self.folder_path.is_file():
            raise ValueError(
                f"'folder_path' must be a directory, not a file: '{self.folder_path}'."
            )
        self.folder_path.mkdir(parents=True, exist_ok=True)

        # Repository
        repo_url = self.api.create_repo(
            repo_id=repo_id, private=private, repo_type=repo_type, exist_ok=True
        )
        self.repo_id = repo_url.repo_id
        self.repo_type = repo_type
        self.revision = revision
        self.token = token

        # Keep track of already uploaded files
        self.last_uploaded: Dict[
            Path, float
        ] = {}  # key is local path, value is timestamp

        # Scheduler
        if not every > 0:
            raise ValueError(f"'every' must be a positive integer, not '{every}'.")
        self.lock = Lock()
        self.every = every
        self.squash_history = squash_history

        logger.info(
            f"Scheduled job to push '{self.folder_path}' to '{self.repo_id}' every {self.every} minutes."
        )
        self._scheduler_thread = Thread(target=self._run_scheduler, daemon=True)
        self._scheduler_thread.start()
        atexit.register(self._push_to_hub)

        self.__stopped = False

    def stop(self) -> None:
        """Stop the scheduler.

        A stopped scheduler cannot be restarted. Mostly for test purposes.
        """
        self.__stopped = True

    def __enter__(self) -> "CommitScheduler":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Upload last changes before exiting
        self.trigger().result()
        self.stop()
        return

    def _run_scheduler(self) -> None:
        """Dumb thread waiting between each scheduled push to Hub."""
        while True:
            self.last_future = self.trigger()
            time.sleep(self.every * 60)
            if self.__stopped:
                break

    def trigger(self) -> Future:
        """Trigger a `push_to_hub` and return a future.

        This method is automatically called every `every` minutes. You can also call it manually to trigger a commit
        immediately, without waiting for the next scheduled commit.
        """
        return self.api.run_as_future(self._push_to_hub)

    def _push_to_hub(self) -> Optional[CommitInfo]:
        if self.__stopped:  # If stopped, already scheduled commits are ignored
            return None

        logger.info("(Background) scheduled commit triggered.")
        try:
            value = self.push_to_hub()
            if self.squash_history:
                logger.info("(Background) squashing repo history.")
                self.api.super_squash_history(
                    repo_id=self.repo_id, repo_type=self.repo_type, branch=self.revision
                )
            return value
        except Exception as e:
            logger.error(
                f"Error while pushing to Hub: {e}"
            )  # Depending on the setup, error might be silenced
            raise

    def push_to_hub(self) -> Optional[CommitInfo]:
        """
        Push folder to the Hub and return the commit info.

        <Tip warning={true}>

        This method is not meant to be called directly. It is run in the background by the scheduler, respecting a
        queue mechanism to avoid concurrent commits. Making a direct call to the method might lead to concurrency
        issues.

        </Tip>

        The default behavior of `push_to_hub` is to assume an append-only folder. It lists all files in the folder and
        uploads only changed files. If no changes are found, the method returns without committing anything. If you want
        to change this behavior, you can inherit from [`CommitScheduler`] and override this method. This can be useful
        for example to compress data together in a single file before committing. For more details and examples, check
        out our [integration guide](https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#scheduled-uploads).
        """
        # Check files to upload (with lock)
        with self.lock:
            if self.on_before_commit is not None:
                self.on_before_commit()

            logger.debug("Listing files to upload for scheduled commit.")

            # List files from folder (taken from `_prepare_upload_folder_additions`)
            relpath_to_abspath = {
                path.relative_to(self.folder_path).as_posix(): path
                for path in sorted(
                    self.folder_path.glob("**/*")
                )  # sorted to be deterministic
                if path.is_file()
            }
            prefix = f"{self.path_in_repo.strip('/')}/" if self.path_in_repo else ""

            # Filter with pattern + filter out unchanged files + retrieve current file size
            files_to_upload: List[_FileToUpload] = []
            for relpath in filter_repo_objects(
                relpath_to_abspath.keys(),
                allow_patterns=self.allow_patterns,
                ignore_patterns=self.ignore_patterns,
            ):
                local_path = relpath_to_abspath[relpath]
                stat = local_path.stat()
                if (
                    self.last_uploaded.get(local_path) is None
                    or self.last_uploaded[local_path] != stat.st_mtime
                ):
                    files_to_upload.append(
                        _FileToUpload(
                            local_path=local_path,
                            path_in_repo=prefix + relpath,
                            size_limit=stat.st_size,
                            last_modified=stat.st_mtime,
                        )
                    )

        # Return if nothing to upload
        if len(files_to_upload) == 0:
            logger.debug("Dropping schedule commit: no changed file to upload.")
            return None

        # Convert `_FileToUpload` as `CommitOperationAdd` (=> compute file shas + limit to file size)
        logger.debug("Removing unchanged files since previous scheduled commit.")
        add_operations = [
            CommitOperationAdd(
                # TODO: Cap the file to its current size, even if the user appends data to it while a scheduled commit is happening
                # (requires an upstream fix for XET-535: `hf_xet` should support `BinaryIO` for upload)
                path_or_fileobj=file_to_upload.local_path,
                path_in_repo=file_to_upload.path_in_repo,
            )
            for file_to_upload in files_to_upload
        ]

        # Upload files (append mode expected - no need for lock)
        logger.debug("Uploading files for scheduled commit.")
        commit_info = self.api.create_commit(
            repo_id=self.repo_id,
            repo_type=self.repo_type,
            operations=add_operations,
            commit_message="Scheduled Commit",
            revision=self.revision,
        )

        # Successful commit: keep track of the latest "last_modified" for each file
        for file in files_to_upload:
            self.last_uploaded[file.local_path] = file.last_modified
        return commit_info


class PartialFileIO(BytesIO):
    """A file-like object that reads only the first part of a file.

    Useful to upload a file to the Hub when the user might still be appending data to it. Only the first part of the
    file is uploaded (i.e. the part that was available when the filesystem was first scanned).

    In practice, only used internally by the CommitScheduler to regularly push a folder to the Hub with minimal
    disturbance for the user. The object is passed to `CommitOperationAdd`.

    Only supports `read`, `tell` and `seek` methods.

    Args:
        file_path (`str` or `Path`):
            Path to the file to read.
        size_limit (`int`):
            The maximum number of bytes to read from the file. If the file is larger than this, only the first part
            will be read (and uploaded).
    """

    def __init__(self, file_path: Union[str, Path], size_limit: int) -> None:
        self._file_path = Path(file_path)
        self._file = self._file_path.open("rb")
        self._size_limit = min(size_limit, os.fstat(self._file.fileno()).st_size)

    def __del__(self) -> None:
        self._file.close()
        return super().__del__()

    def __repr__(self) -> str:
        return (
            f"<PartialFileIO file_path={self._file_path} size_limit={self._size_limit}>"
        )

    def __len__(self) -> int:
        return self._size_limit

    def __getattribute__(self, name: str):
        if name.startswith("_") or name in (
            "read",
            "tell",
            "seek",
        ):  # only 3 public methods supported
            return super().__getattribute__(name)
        raise NotImplementedError(f"PartialFileIO does not support '{name}'.")

    def tell(self) -> int:
        """Return the current file position."""
        return self._file.tell()

    def seek(self, __offset: int, __whence: int = SEEK_SET) -> int:
        """Change the stream position to the given offset.

        Behavior is the same as a regular file, except that the position is capped to the size limit.
        """
        if __whence == SEEK_END:
            # SEEK_END => set from the truncated end
            __offset = len(self) + __offset
            __whence = SEEK_SET

        pos = self._file.seek(__offset, __whence)
        if pos > self._size_limit:
            return self._file.seek(self._size_limit)
        return pos

    def read(self, __size: Optional[int] = -1) -> bytes:
        """Read at most `__size` bytes from the file.

        Behavior is the same as a regular file, except that it is capped to the size limit.
        """
        current = self._file.tell()
        if __size is None or __size < 0:
            # Read until file limit
            truncated_size = self._size_limit - current
        else:
            # Read until file limit or __size
            truncated_size = min(__size, self._size_limit - current)
        return self._file.read(truncated_size)
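
To make `PartialFileIO`'s capping behavior concrete, a small sketch (the file name and byte values are illustrative): bytes past the recorded size limit are invisible to readers, which is what shields an in-flight commit from concurrent appends:

```python
# Hedged sketch of PartialFileIO; "metrics.log" and its contents are illustrative.
from trackio.commit_scheduler import PartialFileIO

with open("metrics.log", "wb") as f:
    f.write(b"0123456789")  # 10 bytes on disk

partial = PartialFileIO("metrics.log", size_limit=4)
assert partial.read() == b"0123"  # reads stop at the 4-byte cap
partial.seek(0)
assert partial.read(100) == b"0123"  # an oversized read is truncated, not an error
```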
context_vars.py ADDED
@@ -0,0 +1,15 @@
import contextvars
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from trackio.run import Run

current_run: contextvars.ContextVar["Run | None"] = contextvars.ContextVar(
    "current_run", default=None
)
current_project: contextvars.ContextVar[str | None] = contextvars.ContextVar(
    "current_project", default=None
)
current_server: contextvars.ContextVar[str | None] = contextvars.ContextVar(
    "current_server", default=None
)
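
These module-level `ContextVar`s are how `init()`, `log()`, and `finish()` in `__init__.py` share the active run, project, and server URL without ordinary globals. A brief sketch of the set/get/reset lifecycle (the project name is illustrative):

```python
# ContextVar lifecycle sketch; "demo-project" is illustrative.
from trackio import context_vars

token = context_vars.current_project.set("demo-project")
assert context_vars.current_project.get() == "demo-project"
context_vars.current_project.reset(token)  # restore the previous value (the None default)
assert context_vars.current_project.get() is None
```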
data_types/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (155 Bytes)

data_types/__pycache__/table.cpython-312.pyc ADDED
Binary file (1.01 kB)
deploy.py ADDED
@@ -0,0 +1,170 @@
import io
import os
import time
from importlib.resources import files
from pathlib import Path

import gradio
import huggingface_hub
from gradio_client import Client, handle_file
from httpx import ReadTimeout
from huggingface_hub.errors import RepositoryNotFoundError
from requests import HTTPError

from trackio.sqlite_storage import SQLiteStorage

SPACE_URL = "https://huggingface.co/spaces/{space_id}"


def deploy_as_space(
    space_id: str,
    dataset_id: str | None = None,
):
    if (
        os.getenv("SYSTEM") == "spaces"
    ):  # in case a repo with this function is uploaded to spaces
        return

    trackio_path = files("trackio")

    hf_api = huggingface_hub.HfApi()

    try:
        huggingface_hub.create_repo(
            space_id,
            space_sdk="gradio",
            repo_type="space",
            exist_ok=True,
        )
    except HTTPError as e:
        if e.response.status_code in [401, 403]:  # unauthorized or forbidden
            print("Need 'write' access token to create a Spaces repo.")
            huggingface_hub.login(add_to_git_credential=False)
            huggingface_hub.create_repo(
                space_id,
                space_sdk="gradio",
                repo_type="space",
                exist_ok=True,
            )
        else:
            raise ValueError(f"Failed to create Space: {e}")

    with open(Path(trackio_path, "README.md"), "r") as f:
        readme_content = f.read()
    readme_content = readme_content.replace("{GRADIO_VERSION}", gradio.__version__)
    readme_buffer = io.BytesIO(readme_content.encode("utf-8"))
    hf_api.upload_file(
        path_or_fileobj=readme_buffer,
        path_in_repo="README.md",
        repo_id=space_id,
        repo_type="space",
    )

    # We can assume pandas, gradio, and huggingface-hub are already installed in a Gradio Space.
    # Make sure necessary dependencies are installed by creating a requirements.txt.
    requirements_content = """
pyarrow>=21.0
"""
    requirements_buffer = io.BytesIO(requirements_content.encode("utf-8"))
    hf_api.upload_file(
        path_or_fileobj=requirements_buffer,
        path_in_repo="requirements.txt",
        repo_id=space_id,
        repo_type="space",
    )

    huggingface_hub.utils.disable_progress_bars()
    hf_api.upload_folder(
        repo_id=space_id,
        repo_type="space",
        folder_path=trackio_path,
        ignore_patterns=["README.md"],
    )

    hf_token = huggingface_hub.utils.get_token()
    if hf_token is not None:
        huggingface_hub.add_space_secret(space_id, "HF_TOKEN", hf_token)
    if dataset_id is not None:
        huggingface_hub.add_space_variable(space_id, "TRACKIO_DATASET_ID", dataset_id)


def create_space_if_not_exists(
    space_id: str,
    dataset_id: str | None = None,
) -> None:
    """
    Creates a new Hugging Face Space if it does not exist. If a dataset_id is provided, it will be added as a space variable.

    Args:
        space_id: The ID of the Space to create.
        dataset_id: The ID of the Dataset to add to the Space.
    """
    if "/" not in space_id:
        raise ValueError(
            f"Invalid space ID: {space_id}. Must be in the format: username/reponame or orgname/reponame."
        )
    if dataset_id is not None and "/" not in dataset_id:
        raise ValueError(
            f"Invalid dataset ID: {dataset_id}. Must be in the format: username/datasetname or orgname/datasetname."
        )
    try:
        huggingface_hub.repo_info(space_id, repo_type="space")
        print(f"* Found existing space: {SPACE_URL.format(space_id=space_id)}")
        if dataset_id is not None:
            huggingface_hub.add_space_variable(
                space_id, "TRACKIO_DATASET_ID", dataset_id
            )
        return
    except RepositoryNotFoundError:
        pass
    except HTTPError as e:
        if e.response.status_code in [401, 403]:  # unauthorized or forbidden
            print("Need 'write' access token to create a Spaces repo.")
            huggingface_hub.login(add_to_git_credential=False)
            huggingface_hub.add_space_variable(
                space_id, "TRACKIO_DATASET_ID", dataset_id
            )
        else:
            raise ValueError(f"Failed to create Space: {e}")

    print(f"* Creating new space: {SPACE_URL.format(space_id=space_id)}")
    deploy_as_space(space_id, dataset_id)


def wait_until_space_exists(
    space_id: str,
) -> None:
    """
    Blocks the current thread until the space exists.
    May raise a TimeoutError if this takes quite a while.

    Args:
        space_id: The ID of the Space to wait for.
    """
    delay = 1
    for _ in range(10):
        try:
            Client(space_id, verbose=False)
            return
        except (ReadTimeout, ValueError):
            time.sleep(delay)
            delay = min(delay * 2, 30)
    raise TimeoutError("Waiting for space to exist took longer than expected")


def upload_db_to_space(project: str, space_id: str) -> None:
    """
    Uploads the database of a local Trackio project to a Hugging Face Space.

    Args:
        project: The name of the project to upload.
        space_id: The ID of the Space to upload to.
    """
    db_path = SQLiteStorage.get_project_db_path(project)
    client = Client(space_id, verbose=False)
    client.predict(
        api_name="/upload_db_to_space",
        project=project,
        uploaded_db=handle_file(db_path),
        hf_token=huggingface_hub.utils.get_token(),
    )
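
Taken together, these helpers form a small deployment pipeline, and `imports.py` below chains them in exactly this order. A hedged sketch with an illustrative Space ID and project name:

```python
# Deployment pipeline sketch; "username/trackio-demo" and "demo-project" are illustrative.
from trackio import deploy

space_id = "username/trackio-demo"
deploy.create_space_if_not_exists(space_id)  # reuses the Space if it already exists
deploy.wait_until_space_exists(space_id)  # polls with backoff until the Gradio app responds
deploy.upload_db_to_space("demo-project", space_id)  # pushes the local SQLite DB to the Space
```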
dummy_commit_scheduler.py ADDED
@@ -0,0 +1,12 @@
# A dummy object to fit the interface of huggingface_hub's CommitScheduler
class DummyCommitSchedulerLock:
    def __enter__(self):
        return None

    def __exit__(self, exception_type, exception_value, exception_traceback):
        pass


class DummyCommitScheduler:
    def __init__(self):
        self.lock = DummyCommitSchedulerLock()
imports.py ADDED
@@ -0,0 +1,245 @@
import os
from pathlib import Path

import pandas as pd

from trackio import deploy, utils
from trackio.sqlite_storage import SQLiteStorage


def import_csv(
    csv_path: str | Path,
    project: str,
    name: str | None = None,
    space_id: str | None = None,
    dataset_id: str | None = None,
) -> None:
    """
    Imports a CSV file into a Trackio project. The CSV file must contain a "step" column, may optionally
    contain a "timestamp" column, and any other columns will be treated as metrics. It should also include
    a header row with the column names.

    TODO: call init() and return a Run object so that the user can continue to log metrics to it.

    Args:
        csv_path: The str or Path to the CSV file to import.
        project: The name of the project to import the CSV file into. Must not be an existing project.
        name: The name of the run to import the CSV file into. If not provided, a default name will be generated.
        space_id: If provided, the project will be logged to a Hugging Face Space instead of a local directory. Should be a complete Space name like "username/reponame" or "orgname/reponame", or just "reponame" in which case the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not exist, it will be created. If the Space already exists, the project will be logged to it.
        dataset_id: If provided, a persistent Hugging Face Dataset will be created and the metrics will be synced to it every 5 minutes. Should be a complete Dataset name like "username/datasetname" or "orgname/datasetname", or just "datasetname" in which case the Dataset will be created in the currently-logged-in Hugging Face user's namespace. If the Dataset does not exist, it will be created. If the Dataset already exists, the project will be appended to it. If not provided, the metrics will be logged to a local SQLite database, unless a `space_id` is provided, in which case a Dataset will be automatically created with the same name as the Space but with the "_dataset" suffix.
    """
    if SQLiteStorage.get_runs(project):
        raise ValueError(
            f"Project '{project}' already exists. Cannot import CSV into existing project."
        )

    csv_path = Path(csv_path)
    if not csv_path.exists():
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    df = pd.read_csv(csv_path)
    if df.empty:
        raise ValueError("CSV file is empty")

    column_mapping = utils.simplify_column_names(df.columns.tolist())
    df = df.rename(columns=column_mapping)

    step_column = None
    for col in df.columns:
        if col.lower() == "step":
            step_column = col
            break

    if step_column is None:
        raise ValueError("CSV file must contain a 'step' or 'Step' column")

    if name is None:
        name = csv_path.stem

    metrics_list = []
    steps = []
    timestamps = []

    numeric_columns = []
    for column in df.columns:
        if column == step_column:
            continue
        if column == "timestamp":
            continue

        try:
            pd.to_numeric(df[column], errors="raise")
            numeric_columns.append(column)
        except (ValueError, TypeError):
            continue

    for _, row in df.iterrows():
        metrics = {}
        for column in numeric_columns:
            value = row[column]
            if bool(pd.notna(value)):
                metrics[column] = float(value)

        if metrics:
            metrics_list.append(metrics)
            steps.append(int(row[step_column]))

            if "timestamp" in df.columns and bool(pd.notna(row["timestamp"])):
                timestamps.append(str(row["timestamp"]))
            else:
                timestamps.append("")

    if metrics_list:
        SQLiteStorage.bulk_log(
            project=project,
            run=name,
            metrics_list=metrics_list,
            steps=steps,
            timestamps=timestamps,
        )

        print(
            f"* Imported {len(metrics_list)} rows from {csv_path} into project '{project}' as run '{name}'"
        )
        print(f"* Metrics found: {', '.join(metrics_list[0].keys())}")

    space_id, dataset_id = utils.preprocess_space_and_dataset_ids(space_id, dataset_id)
    if dataset_id is not None:
        os.environ["TRACKIO_DATASET_ID"] = dataset_id
        print(f"* Trackio metrics will be synced to Hugging Face Dataset: {dataset_id}")

    if space_id is None:
        utils.print_dashboard_instructions(project)
    else:
        deploy.create_space_if_not_exists(space_id, dataset_id)
        deploy.wait_until_space_exists(space_id)
        deploy.upload_db_to_space(project, space_id)
        print(
            f"* View dashboard by going to: {deploy.SPACE_URL.format(space_id=space_id)}"
        )


def import_tf_events(
    log_dir: str | Path,
    project: str,
    name: str | None = None,
    space_id: str | None = None,
    dataset_id: str | None = None,
) -> None:
    """
    Imports TensorFlow Events files from a directory into a Trackio project.
    Each subdirectory in the log directory will be imported as a separate run.

    Args:
        log_dir: The str or Path to the directory containing TensorFlow Events files.
        project: The name of the project to import the TensorFlow Events files into. Must not be an existing project.
        name: The name prefix for runs (if not provided, will use directory names). Each subdirectory will create a separate run.
        space_id: If provided, the project will be logged to a Hugging Face Space instead of a local directory. Should be a complete Space name like "username/reponame" or "orgname/reponame", or just "reponame" in which case the Space will be created in the currently-logged-in Hugging Face user's namespace. If the Space does not exist, it will be created. If the Space already exists, the project will be logged to it.
        dataset_id: If provided, a persistent Hugging Face Dataset will be created and the metrics will be synced to it every 5 minutes. Should be a complete Dataset name like "username/datasetname" or "orgname/datasetname", or just "datasetname" in which case the Dataset will be created in the currently-logged-in Hugging Face user's namespace. If the Dataset does not exist, it will be created. If the Dataset already exists, the project will be appended to it. If not provided, the metrics will be logged to a local SQLite database, unless a `space_id` is provided, in which case a Dataset will be automatically created with the same name as the Space but with the "_dataset" suffix.
    """
    try:
        from tbparse import SummaryReader
    except ImportError:
        raise ImportError(
            "The `tbparse` package is not installed but is required for `import_tf_events`. Please install trackio with the `tensorboard` extra: `pip install trackio[tensorboard]`."
        )

    if SQLiteStorage.get_runs(project):
        raise ValueError(
            f"Project '{project}' already exists. Cannot import TF events into existing project."
        )

    path = Path(log_dir)
    if not path.exists():
        raise FileNotFoundError(f"TF events directory not found: {path}")

    # Use tbparse to read all tfevents files in the directory structure
    reader = SummaryReader(str(path), extra_columns={"dir_name"})
    df = reader.scalars

    if df.empty:
        raise ValueError(f"No TensorFlow events data found in {path}")

    total_imported = 0
    imported_runs = []

    # Group by dir_name to create separate runs
    for dir_name, group_df in df.groupby("dir_name"):
        try:
            # Determine run name based on directory name
            if dir_name == "":
                run_name = "main"  # For files in the root directory
            else:
                run_name = dir_name  # Use directory name

            if name:
                run_name = f"{name}_{run_name}"

            if group_df.empty:
                print(f"* Skipping directory {dir_name}: no scalar data found")
                continue

            metrics_list = []
            steps = []
            timestamps = []

            for _, row in group_df.iterrows():
                # Convert row values to appropriate types
                tag = str(row["tag"])
                value = float(row["value"])
                step = int(row["step"])

                metrics = {tag: value}
                metrics_list.append(metrics)
                steps.append(step)

                # Use wall_time if present, else fallback
                if "wall_time" in group_df.columns and not bool(
                    pd.isna(row["wall_time"])
                ):
                    timestamps.append(str(row["wall_time"]))
                else:
                    timestamps.append("")

            if metrics_list:
                SQLiteStorage.bulk_log(
                    project=project,
                    run=str(run_name),
                    metrics_list=metrics_list,
                    steps=steps,
                    timestamps=timestamps,
                )

                total_imported += len(metrics_list)
                imported_runs.append(run_name)

                print(
                    f"* Imported {len(metrics_list)} scalar events from directory '{dir_name}' as run '{run_name}'"
                )
                print(f"* Metrics in this run: {', '.join(set(group_df['tag']))}")

        except Exception as e:
            print(f"* Error processing directory {dir_name}: {e}")
            continue

    if not imported_runs:
        raise ValueError("No valid TensorFlow events data could be imported")

    print(f"* Total imported events: {total_imported}")
    print(f"* Created runs: {', '.join(imported_runs)}")

    space_id, dataset_id = utils.preprocess_space_and_dataset_ids(space_id, dataset_id)
    if dataset_id is not None:
        os.environ["TRACKIO_DATASET_ID"] = dataset_id
        print(f"* Trackio metrics will be synced to Hugging Face Dataset: {dataset_id}")

    if space_id is None:
        utils.print_dashboard_instructions(project)
    else:
        deploy.create_space_if_not_exists(space_id, dataset_id)
        deploy.wait_until_space_exists(space_id)
        deploy.upload_db_to_space(project, space_id)
        print(
            f"* View dashboard by going to: {deploy.SPACE_URL.format(space_id=space_id)}"
        )
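
Finally, a minimal end-to-end sketch of `import_csv` under the constraints documented above (a header row, a `step` column, numeric metric columns); the file and project names are illustrative:

```python
# import_csv sketch; "training_log.csv" and "imported-demo" are illustrative.
import csv

import trackio

with open("training_log.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["step", "loss", "accuracy"])  # header row with the required "step" column
    writer.writerow([0, 2.31, 0.12])
    writer.writerow([1, 1.87, 0.34])

trackio.import_csv("training_log.csv", project="imported-demo")  # project must not already exist
```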