# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
import collections
import contextlib
import datetime
import itertools
import os
import random
import shutil
import subprocess
import tempfile
import time
import typing as tp
from pathlib import Path
# pylint: disable=unused-import
# import DelayedSubmission and CommandFunction to populate helpers namespace
from .core import core
from .core.job_environment import JobEnvironment
from .core.utils import CommandFunction as CommandFunction # noqa
from .core.utils import DelayedSubmission as DelayedSubmission # noqa
from .core.utils import environment_variables as environment_variables # noqa
class Checkpointable:
"""Derived callable classes are requeued after timeout with their current
state dumped at checkpoint.
    A __call__ method must be implemented to make your class callable.
Note
----
    The default implementation of the checkpoint method below resubmits the full current
    state of the callable (self) with the initial arguments. You may want to override the
    method to curate the state (e.g. dump a neural network to a standard format and remove
    it from the state so that it is not pickled) and to change or remove the initial parameters.
"""
# pylint: disable=unused-argument
def __new__(cls, *args, **kwargs):
instance = super().__new__(cls)
assert callable(
instance
), f"Class {cls.__name__} is marked as Checkpointable but doesn't have a __call__ method. Please add a __call__ method."
return instance
def checkpoint(self, *args: tp.Any, **kwargs: tp.Any) -> DelayedSubmission:
"""Resubmits the same callable with the same arguments"""
        # The only goal of the DelayedSubmission class is to register and format
        # the arguments of the call "self(*args, **kwargs)" for submission to slurm
return DelayedSubmission(self, *args, **kwargs) # type: ignore
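# Illustrative sketch (not part of the library): a minimal Checkpointable subclass.
# The class name and the toy training loop are hypothetical; the point is that `self`
# carries the progress, so the default checkpoint/resubmission mechanism is enough
# for a requeued job to resume where it stopped.
class _ExampleCheckpointableTrainer(Checkpointable):
    """Toy stateful callable that survives preemption and requeuing."""
    def __init__(self) -> None:
        self.epoch = 0
    def __call__(self, total_epochs: int = 10) -> int:
        while self.epoch < total_epochs:
            self.epoch += 1  # stand-in for one epoch of real work
        return self.epoch
    def checkpoint(self, total_epochs: int = 10) -> DelayedSubmission:
        # `self` (including `self.epoch`) is pickled, so the resubmitted call resumes from it
        return DelayedSubmission(self, total_epochs)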
class FunctionSequence(Checkpointable):
"""This is for gathering several estimations into one function, which
will return the sequence of outputs.
Also this "function" is stateful, hence it can be stopped, and recovered,
which is useful when job can be preempted.
Usage
-----
func = FunctionSequence()
func.add(my_function1, arg1, kwarg1=value_kwarg1)
func.add(my_function2, arg1, arg2)
result1, result2 = func()
Note
----
This function is checkpointable because:
- it derives from Checkpointable
    - it keeps DelayedSubmission objects as attributes, which in turn store the
      results of the computation in memory once they are computed. So at checkpoint
      time, those results will be saved, and only the remaining ones
      will be computed once the job restarts.
"""
def __init__(self, verbose: bool = False) -> None:
self.verbose = verbose
self.delayed_functions: tp.List[DelayedSubmission] = []
def add(self, func: tp.Callable[..., tp.Any], *args: tp.Any, **kwargs: tp.Any) -> None:
self.delayed_functions.append(DelayedSubmission(func, *args, **kwargs))
def __len__(self) -> int:
return len(self.delayed_functions)
def __iter__(self) -> tp.Iterator[DelayedSubmission]:
return iter(self.delayed_functions)
def __call__(self) -> tp.List[tp.Any]: # pylint: disable=arguments-differ
if self.verbose:
done = sum(f.done() for f in self) # those were computed before checkpoint
print(f"Starting from {done}/{len(self.delayed_functions)}", flush=True)
return [
f.result() for f in self.delayed_functions
        ]  # returns all results one by one (running the functions if they are not already done)
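# Illustrative sketch (not part of the library): running a FunctionSequence as a
# single checkpointable job. The log folder and timeout are hypothetical; the local
# import avoids a circular import at module load time.
def _example_function_sequence() -> None:
    import submitit
    func = FunctionSequence(verbose=True)
    for exponent in range(5):
        func.add(pow, 2, exponent)  # queue pow(2, exponent)
    executor = submitit.AutoExecutor(folder="log_example")  # hypothetical folder
    executor.update_parameters(timeout_min=60)
    job = executor.submit(func)
    print(job.result())  # [1, 2, 4, 8, 16] once the job completes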
def as_completed(
jobs: tp.Sequence[core.Job[core.R]],
timeout: tp.Optional[tp.Union[int, float]] = None,
poll_frequency: float = 10,
) -> tp.Iterator[core.Job[core.R]]:
"""
    Yields jobs as they complete (finished, failed or cancelled).
    Raises a TimeoutError if all jobs have not completed after `timeout` seconds.
    `timeout` can be an int or a float. If `timeout` is not specified or is None,
    there is no limit to the wait time.
    Parameters
    ----------
    jobs: list
        Job instances
    timeout: int/float
        Maximum time (in seconds) to wait for job completion
    poll_frequency: float
        Frequency in seconds at which job status is checked
Yields
------
Job
The next completed job
"""
start = time.time()
jobs_done: tp.Set[int] = set()
while True:
if timeout is not None and time.time() - start > timeout:
raise TimeoutError
for i, job in enumerate(jobs):
if i in jobs_done:
continue
if job.done():
jobs_done.add(i)
yield job
if len(jobs_done) == len(jobs):
break
time.sleep(poll_frequency)
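# Illustrative sketch (not part of the library): consuming results as jobs finish
# instead of waiting for the whole batch. `jobs` can come from any submitit
# executor; the timeout and poll frequency are hypothetical.
def _example_as_completed(jobs: tp.Sequence[core.Job[int]]) -> int:
    total = 0
    for job in as_completed(jobs, timeout=3600, poll_frequency=30):
        total += job.result()  # the job is already done, so result() returns immediately (or raises if it failed)
    return total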
def run_cmd(str_args, **kwargs):
return subprocess.check_output(str_args, **kwargs).decode("utf-8").strip()
class RsyncSnapshot:
"""Takes a snapshot of the git repository that the script lives in.
This ensures that remote jobs always use the code from when they are scheduled
and not the code from when they are launched / re-started.
Parameters
----------
    snapshot_dir: Path
        A path to where the snapshot should be created
    root_dir: Path (optional)
        The root of the git repository to snapshot. Defaults to the root of the
        repository containing the current working directory.
    with_submodules: bool
        Whether or not submodules should be included in the snapshot
    exclude: Sequence[str]
        An optional list of patterns to exclude from the snapshot
    include: Sequence[str]
        A list of relative file names to include in the snapshot.
        Useful for .so files or other build artifacts that are generally not tracked by git.
Note
----
- Only files that are checked in to the repository are included in the snapshot.
If you have experimental code that you would like to include in the snapshot,
      you'll need to `git add` the file first for it to be included, or use the `include` argument.
"""
def __init__(
self,
snapshot_dir: Path,
root_dir: tp.Optional[Path] = None,
with_submodules: bool = False,
exclude: tp.Sequence[str] = (),
include: tp.Sequence[str] = (),
):
self.available(throw=True)
self.snapshot_dir = Path(snapshot_dir)
self.root_dir = root_dir or run_cmd(["git", "rev-parse", "--show-toplevel"])
self.original_dir = Path.cwd()
self.with_submodules = with_submodules
self.exclude = exclude
self.include = include
@staticmethod
def available(throw: bool = False) -> bool:
if not shutil.which("rsync"):
if throw:
raise RuntimeError("RsyncSnapshot requires rsync to be installed.")
return False
return True
def __enter__(self) -> None:
self.original_dir = Path.cwd()
# Get the repository root
root_dir = str(self.root_dir)
sub = "--recurse-submodules" if self.with_submodules else "-s"
# Make a shallow git clone
if not self.snapshot_dir.exists():
self.snapshot_dir.parent.mkdir(parents=True, exist_ok=True)
subprocess.check_call(["git", "clone", "--depth=2", f"file://{root_dir}", str(self.snapshot_dir)])
# Get a list of all the checked in files that we can pass to rsync
        # Is rsync faster than a `git pull`?
with tempfile.NamedTemporaryFile() as tfile:
            # https://stackoverflow.com/a/51689219/4876946
            # List the tracked files; with `-s`, each line starts with the file mode, so
            # `grep -v ^16` drops submodule entries (mode 160000) and `cut -f2-` keeps only the path.
            run_cmd(f"git ls-files {sub} | grep -v ^16 | cut -f2- > {tfile.name}", cwd=root_dir, shell=True)
exclude = list(itertools.chain.from_iterable(("--exclude", pat) for pat in self.exclude))
with open(tfile.name, "a", encoding="utf8") as o:
for inc in self.include:
print(inc, file=o)
run_cmd(["rsync", "-a", "--files-from", tfile.name, root_dir, str(self.snapshot_dir)] + exclude)
os.chdir(self.snapshot_dir)
def __exit__(self, *args):
os.chdir(self.original_dir)
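# Illustrative sketch (not part of the library): scheduling a job from inside a
# snapshot so the remote job uses the code as it was at submission time. The
# snapshot path and executor folder are hypothetical.
def _example_rsync_snapshot() -> None:
    import submitit  # local import to avoid a circular import at module load
    snapshot_dir = Path("/tmp/snapshots") / datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    with RsyncSnapshot(snapshot_dir=snapshot_dir):
        # inside the context, the current working directory is the snapshot
        executor = submitit.AutoExecutor(folder=snapshot_dir / "logs")
        job = executor.submit(run_cmd, ["git", "rev-parse", "HEAD"])
    print(job.result())  # commit of the snapshot clone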
def _default_custom_logging(monitoring_start_time: float, n_jobs: int, state_jobs: tp.Dict[str, tp.Set[int]]):
    """Prints a one-line status summary for the monitored jobs, plus the indices of failed jobs if any."""
run_time = time.time() - monitoring_start_time
date_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
failed_job_indices = sorted(state_jobs["FAILED"])
n_chars = len(str(n_jobs))
print(
f"[{date_time}] Launched {int(run_time / 60)} minutes ago,",
f"{len(state_jobs['RUNNING']):{n_chars}}/{n_jobs} jobs running,",
f"{len(failed_job_indices):{n_chars}}/{n_jobs} jobs failed,",
f"{len(state_jobs['DONE']) - len(failed_job_indices):{n_chars}}/{n_jobs} jobs done",
flush=True,
)
if len(failed_job_indices) > 0:
print(f"[{date_time}] Failed jobs, indices {failed_job_indices}", flush=True)
def monitor_jobs(
jobs: tp.Sequence[core.Job[core.R]],
poll_frequency: float = 30,
test_mode: bool = False,
custom_logging: tp.Callable = _default_custom_logging,
) -> None:
"""Continuously monitors given jobs until they are all done or failed.
Parameters
----------
    jobs: List[Job]
        A list of jobs to monitor
    poll_frequency: float
        The time (in seconds) between two refreshes of the monitoring.
        Cannot be less than 30 seconds.
    test_mode: bool
        If in test mode, the minimum poll_frequency is not enforced
"""
if not test_mode:
        assert poll_frequency >= 30, "poll_frequency must be at least 30s to avoid overloading squeue"
n_jobs = len(jobs)
if n_jobs == 0:
print("There are no jobs to monitor")
return
job_arrays = ", ".join(sorted(set(str(job.job_id).split("_", 1)[0] for job in jobs)))
print(f"Monitoring {n_jobs} jobs from job arrays {job_arrays} \n")
monitoring_start_time = time.time()
while True:
if not test_mode:
jobs[0].get_info(mode="force") # Force update once to sync the state
state_jobs = collections.defaultdict(set)
for i, job in enumerate(jobs):
state_jobs[job.state.upper()].add(i)
if job.done():
state_jobs["DONE"].add(i)
failed_job_indices = sorted(state_jobs["FAILED"])
if len(state_jobs["DONE"]) == len(jobs):
print(f"All jobs finished, jobs with indices {failed_job_indices} failed", flush=True)
break
custom_logging(monitoring_start_time, n_jobs, state_jobs)
time.sleep(poll_frequency)
print(f"Whole process is finished, took {int((time.time() - monitoring_start_time) / 60)} minutes")
@contextlib.contextmanager
def clean_env(extra_names: tp.Sequence[str] = ()) -> tp.Iterator[None]:
"""Removes slurm and submitit related environment variables so as to avoid interferences
when submiting a new job from a job.
Parameters
----------
extra_names: Sequence[str]
Additional environment variables to hide inside the context,
e.g. TRITON_CACHE_DIR and TORCHINDUCTOR_CACHE_DIR when using torch.compile.
Note
----
    A slurm job submitted from within a slurm job inherits some of its attributes, which may
    be confusing and cause weird gres errors (or break pytorch distributed setup).
    Submitting within this context should prevent this.
Usage
-----
with submitit.helpers.clean_env():
executor.submit(...)
"""
distrib_names = ("MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE", "LOCAL_RANK", "LOCAL_WORLD_SIZE")
cluster_env = {
x: os.environ.pop(x)
for x in os.environ
if (
x.startswith(("SLURM_", "SLURMD_", "SRUN_", "SBATCH_", "SUBMITIT_"))
or x in distrib_names
or x in extra_names
)
}
try:
yield
finally:
os.environ.update(cluster_env)
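# Illustrative sketch (not part of the library): submitting a follow-up job from
# within a running job without leaking SLURM/submitit variables to it. The folder
# and the extra variable name are hypothetical.
def _example_clean_env() -> None:
    import submitit  # local import to avoid a circular import at module load
    executor = submitit.AutoExecutor(folder="log_followup")
    with clean_env(extra_names=("TRITON_CACHE_DIR",)):
        job = executor.submit(run_cmd, ["hostname"])
    print(job.job_id)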
class TorchDistributedEnvironment:
def __init__(self) -> None:
"""Construct a class holding the parameters required to properly setup
PyTorch distributed (with the default env:// initialization method).
Examples
--------
>>> dist_env = TorchDistributedEnvironment().export()
>>> torch.distributed.init_process_group(backend="nccl")
>>> print(f"master: {dist_env.master_addr}:{dist_env.master_port}")
"""
self._job_env = JobEnvironment()
self.master_addr = self._job_env.hostnames[0]
self.master_port = self._get_master_port()
self.rank = self._job_env.global_rank
self.world_size = self._job_env.num_tasks
self.local_rank = self._job_env.local_rank
self.local_world_size = self._job_env.num_tasks // self._job_env.num_nodes
def _get_master_port(self) -> int:
# MIN_MASTER_PORT, MAX_MASTER_PORT = (1023, 65535)
MIN_MASTER_PORT, MAX_MASTER_PORT = (20000, 60000)
master_port_str = os.environ.get("MASTER_PORT")
if master_port_str is None:
rng = random.Random(self._job_env.job_id)
return rng.randint(MIN_MASTER_PORT, MAX_MASTER_PORT)
master_port = int(master_port_str)
        # assert MIN_MASTER_PORT <= master_port <= MAX_MASTER_PORT
return master_port
def export(
self,
set_cuda_visible_devices: bool = True,
overwrite: bool = False,
) -> "TorchDistributedEnvironment":
"""Export all the environment variables required to properly setup
PyTorch distributed (with the default env:// initialization method) i.e.
MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE (to which LOCAL_RANK and
LOCAL_WORLD_SIZE are added).
        Parameters
        ----------
        set_cuda_visible_devices: bool
if True, updates CUDA_VISIBLE_DEVICES to use only the device
matching the local rank.
overwrite: bool
if True, overwrites the environment variables if they exist;
this can be useful when launching a job from another job.
Returns
        -------
TorchDistributedEnvironment
the current instance
"""
# See the "Environment variable initialization" section from
# https://pytorch.org/docs/stable/distributed.html for the complete list of
# environment variables required for the env:// initialization method.
env_vars = {
"MASTER_ADDR": self.master_addr,
"MASTER_PORT": str(self.master_port),
"RANK": str(self.rank),
"WORLD_SIZE": str(self.world_size),
"LOCAL_RANK": str(self.local_rank), # Not required
"LOCAL_WORLD_SIZE": str(self.local_world_size), # Not required
}
if not overwrite:
for key in env_vars:
if key in os.environ:
raise RuntimeError(f"Cannot export environment variables as {key} is already set")
# Note: CUDA_VISIBLE_DEVICES may already be set with all available GPUs
if set_cuda_visible_devices:
env_vars["CUDA_VISIBLE_DEVICES"] = str(self.local_rank)
os.environ.update(env_vars)
return self
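# Illustrative sketch (not part of the library): a function to be submitted with
# several tasks; each task exports the distributed variables and then initializes
# the process group. PyTorch availability on the cluster is an assumption, hence
# the local import; the backend and the explicit cleanup are hypothetical choices.
def _example_torch_distributed() -> None:
    import torch.distributed  # assumes PyTorch is installed on the cluster
    dist_env = TorchDistributedEnvironment().export(set_cuda_visible_devices=True)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")
    print(f"rank {dist_env.rank}/{dist_env.world_size} via {dist_env.master_addr}:{dist_env.master_port}")
    torch.distributed.destroy_process_group()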