# Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. """ This is the central dispatch of the `dora` command. From there you can check grid files, launch XPs, check their logs etc, as well as doing local runs for debugging. """ import argparse from .grid import grid_action from .info import info_action from .launch import launch_action from .log import fatal, setup_logging, simple_log from .run import run_action from .share import import_action, export_action from ._utils import get_main def add_submit_rules(parser): parser.add_argument("-r", "--retry", action="store_true", help="Retry failed jobs") parser.add_argument("-R", "--replace", action="store_true", help="Replace any running job.") parser.add_argument("-D", "--replace_done", action="store_true", help="Also resubmit done jobs.") parser.add_argument("--no_git_save", action='store_false', dest='git_save', default=None, help="Temporarily deactivate git_save for any scheduled job.") def add_slurm_config(parser): parser.add_argument("-g", "--gpus", type=int, help="Number of gpus.") parser.add_argument("-p", "--partition", help="Partition.") parser.add_argument("--dev", action="store_const", dest="partition", const="devlab", help="Use dev partition.") parser.add_argument("-c", "--comment", help="Comment.") parser.add_argument("--constraint", help="Constraint.") def get_parser(): parser = argparse.ArgumentParser() parser.add_argument( '--package', '-P', default=None, help='Training module. ' 'You can also set the DORA_PACKAGE env. In last resort, ' 'Dora will look for a package in the current folder with module defined ' 'at --runfile flag.') parser.add_argument( '--main_module', default=None, help='Training exec name. ' 'Dora will search for this module to run within the package provided by --package ' 'flag. You can also set DORA_MAIN_MODULE env. Defaults to \'train\' module.') parser.add_argument('--verbose', '-v', action='store_true', help="Show debug info.") subparsers = parser.add_subparsers( title="command", help="Command to execute", required=True, dest='command') grid = subparsers.add_parser("grid") add_submit_rules(grid) add_slurm_config(grid) grid.add_argument("-C", "--cancel", action='store_true', help="Cancel all running jobs.") grid.add_argument("--clear", action='store_true', help="Remove XP folder, reschedule all jobs, starting from scratch.") grid.add_argument("-i", "--interval", default=5, type=float, help="Update status and metrics every that number of minutes. " "Default is 5 min.") grid.add_argument("--no_monitoring", action="store_false", dest="monitor", help="No monitoring, just schedule and print current state.") grid.add_argument("--dry_run", action="store_true", help="Only simulate actions but does not run any call to Slurm.") grid.add_argument("-T", "--trim", type=int, help="Trim history to the length of the exp with the given index.") grid.add_argument("-L", "--trim_last", action="store_true", help="Trim history to the slowest.") group = grid.add_mutually_exclusive_group() group.add_argument("-f", "--folder", type=int, help="Show the folder for the job with the given index") group.add_argument("-l", "--log", type=int, help="Show the log for the job with the given index") group.add_argument("-t", "--tail", type=int, help="Show the log for the job with the given index") grid.add_argument("--init", action='store_true', help="Init the given XPs so that their signature can be referenced.") grid.add_argument( 'grid', nargs='?', help='Name of the grid to run. Name of the module will be `package`.grids.`name`.') grid.add_argument("patterns", nargs='*', help="Only handle experiments matching all the given pattern. " "If empty, handle all experiments") grid.set_defaults(action=grid_action) run = subparsers.add_parser("run", help="Run locally the given command.") run.add_argument("-f", "--from_sig", help="Signature of job to use as baseline.") run.add_argument("-d", "--ddp", action="store_true", help="Distributed training.") run.add_argument("--ddp_workers", type=int, help="Nb of workers for distributed, default to nb of GPUs.") run.add_argument("--git_save", action="store_true", default=False, help="Run from a clean git clone.") run.add_argument("--clear", action='store_true', help="Remove XP folder, reschedule job, starting from scratch.") run.add_argument("argv", nargs='*') run.set_defaults(action=run_action) launch = subparsers.add_parser("launch") launch.add_argument("-f", "--from_sig", help="Signature of job to use as baseline.") launch.add_argument("-a", "--attach", action="store_true", help="Attach to the remote process. Interrupting the command will " "kill the remote job.") launch.add_argument("--no_tail", action="store_false", dest="tail", default=True, help="Does not tail the log once job is started.") launch.add_argument("-C", "--cancel", action='store_true', help="Cancel any existing job and return.") launch.add_argument("--clear", action='store_true', help="Remove XP folder, reschedule job, starting from scratch.") add_submit_rules(launch) add_slurm_config(launch) launch.add_argument("argv", nargs='*') launch.set_defaults(action=launch_action) info = subparsers.add_parser("info") info.add_argument("-f", "--from_sig", help="Signature of job to use as baseline.") info.add_argument("-j", "--job_id", help="Find job by job id.") info.add_argument("-C", "--cancel", action="store_true", help="Cancel job") info.add_argument("-l", "--log", action="store_true", help="Show entire log") info.add_argument("-t", "--tail", action="store_true", help="Tail log") info.add_argument("-m", "--metrics", action="store_true", help="Show last metrics") info.add_argument("argv", nargs='*') info.set_defaults(action=info_action) import_ = subparsers.add_parser("import") import_.set_defaults(action=import_action) export = subparsers.add_parser("export") export.add_argument("sigs", nargs='*', help='All the XP sigs to export.') export.set_defaults(action=export_action) return parser def main(): parser = get_parser() args = parser.parse_args() setup_logging(args.verbose) if args.action is None: fatal("You must give an action.") main = get_main(args.main_module, args.package) if getattr(args, 'from_sig', None) is not None: try: argv = main.get_argv_from_sig(args.from_sig) except RuntimeError: fatal(f"Could not find an existing run with sig {args.from_sig}") simple_log("Parser", "Injecting argv", argv, "from sig", args.from_sig) args.argv = argv + args.argv if getattr(args, 'git_save', None) is not None: main.dora.git_save = args.git_save args.action(args, main) if __name__ == "__main__": main()
Memory