Skip to content

Commit

Permalink
merge latest changes from develop
Browse files Browse the repository at this point in the history
  • Loading branch information
bgunnar5 committed Jan 22, 2024
2 parents 51f9796 + 642f925 commit 4d47689
Show file tree
Hide file tree
Showing 2 changed files with 359 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- The `merlin status` command so that it's consistent in its output whether using redis or rabbitmq as the broker
- The `merlin monitor` command will now keep an allocation up if the queues are empty and workers are still processing tasks
- Add the restart keyword to the specification docs

### Changed
- The entire documentation has been ported to MkDocs and re-organized
Expand Down
358 changes: 358 additions & 0 deletions docs/source/merlin_specification.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,358 @@
Workflow Specification
======================

The merlin input file or spec file is separated into several sections. An
annotated version is given below.

.. note:: The Merlin input file is a yaml file and must adhere to yaml
syntax. The yaml spec relies on the indentation in the file.

The input file can take a number of variables, beyond the examples shown here.
For a complete list and descriptions of the variables,
see :doc:`./merlin_variables`.

.. code-block:: yaml
####################################
# Description Block (Required)
####################################
# The description block is where the description of the study is placed. This
# section is meant primarily for documentation purposes so that when a
# specification is passed to other users they can glean a general understanding
# of what this study is meant to achieve.
#-------------------------------
# Required keys:
# name - Name of the study
# description - Description of what this study does.
#-------------------------------
# NOTE: You can add other keys to this block for custom documentation. Merlin
# currently only looks for the required set.
####################################
description:
description: Run a scan through Merlin
name: MERLIN
####################################
# Batch Block (Required)
####################################
# The batch system to use for each allocation
#-------------------------------
# Required keys:
# type - The scheduler type to use (local|slurm|flux|lsf)
# bank - The allocation bank
# queue - The batch queue
####################################
batch:
type: flux
bank: testbank
queue: pbatch
flux_path: <optional path to flux bin>
flux_start_opts: <optional flux start options>
flux_exec: <optional, flux exec command to launch workers on
all nodes if using flux and flux_exec_workers is True
(flux exec) >
flux_exec_workers: <optional, flux argument to launch workers on
all nodes. (True)>
launch_pre: <Any configuration needed before the srun or jsrun launch>
launch_args: <Optional extra arguments for the parallel launch command>
worker_launch: <Override the parallel launch defined in merlin>
shell: <the interpreter to use for the script after the shebang>
# e.g. /bin/bash, /bin/tcsh, python, /usr/bin/env perl, etc.
nodes: <num nodes> # The number of nodes to use for all workers
This can be overridden in the workers config.
If this is unset the number of nodes will be
queried from the environment, failing that, the
number of nodes will be set to 1.
walltime: The total walltime of the batch allocation (hh:mm:ss or mm:ss or ss)
#####################################
# Environment Block
####################################
# The environment block is where items describing the study's environment are
# defined. This includes static information that the study needs to know about
# and dependencies that the workflow requires for execution.
#-------------------------------
# NOTE: This block isn't strictly required as a study may not depend on anything.
########################################################################
env:
#-------------------------------
# Variables
#-------------------------------
# Values that the workflow substitutes into steps and are similar in
# concept to Unix environment variables. These variables are not dependent
# on values in the environment and so are more portable.
#
# Note that variables defined here can alter the runtime shell
# variable definitions.
# Do not define a variable named "shell" here.
#-------------------------------
variables:
# Set a custom output path for the study workspace. This path is where
# Merlin will place all temporary files, state files, and any output.
# The resulting path is usually a timestamped folder within OUTPUT_PATH
# and in this case would be
# './sample_output/merlin/merlin_sample1_<timestamp>'.
# NOTE: If not specified,
# OUTPUT_PATH is assumed to be the path where Merlin was launched from.
OUTPUT_PATH: ./sample_output/merlin # OUTPUT_PATH is a keyword
# variable that Merlin looks for
# to replace with the study
# directory created for the
# ensemble
####################################
# Study Block (Required)
####################################
# The study block is where the steps in the workflow are defined. This section
# of the specification represents the unexpanded set of tasks that the study
# is composed of.
#
#
# A description of what gets turned into tasks and what type of task
# would be a good addition
#
# study lists the various steps, each of which has these fields
# name: step name
# description: what the step does
# run:
# cmd: the command to run for multilines use cmd: |
# The $(LAUNCHER) macro can be used to substitute a parallel launcher
# based on the batch:type:.
# It will use the nodes and procs values for the task.
# restart: The (optional) restart command to run when $(MERLIN_RESTART)
# is the exit code. The command in cmd will be run if the exit code
# is $(MERLIN_RETRY).
# task_queue: the queue to assign the step to (optional. default: merlin)
# shell: the shell to use for the command (eg /bin/bash /usr/bin/env python)
# (optional. default: /bin/bash)
# depends: a list of steps this step depends upon (ie parents)
# procs: The total number of MPI tasks
# nodes: The total number of MPI nodes
# walltime: The total walltime of the run (hh:mm:ss, mm:ss or ss) (not available in lsf)
# cores per task: The number of hardware threads per MPI task
# gpus per task: The number of GPUs per MPI task
# SLURM specific run flags:
# slurm: Verbatim flags only for the srun parallel launch (srun -n <nodes> -n <procs> <slurm>)
# FLUX specific run flags:
# flux: Verbatim flags for the flux parallel launch (flux mini run <flux>)
# LSF specific run flags:
# bind: Flag for MPI binding of tasks on a node
# num resource set: Number of resource sets
# launch_distribution : The distribution of resources (default: plane:{procs/nodes})
# exit_on_error: Flag to exit on error (default: 1)
# lsf: Verbatim flags only for the lsf parallel launch (jsrun ... <lsf>
#######################################################################
study:
- name: runs1
description: Run on alloc1
run:
cmd: $(LAUNCHER) echo "$(VAR1) $(VAR2)" > simrun.out
nodes: 1
procs: 1
task_queue: queue1
shell: /bin/bash
- name: post-process
description: Post-Process runs on alloc1
run:
cmd: |
cd $(runs1.workspace)/$(MERLIN_SAMPLE_PATH)
<post-process>
# exit $(MERLIN_RESTART) # syntax to send a restart error code
# This will rerun the cmd command. Users can also use $(MERLIN_RETRY).
nodes: 1
procs: 1
depends: [runs1]
task_queue: queue1
- name: runs2
description: Run on alloc2
run:
cmd: |
touch learnrun.out
$(LAUNCHER) echo "$(VAR1) $(VAR2)" >> learnrun.out
exit $(MERLIN_RESTART) # syntax to send a restart error code
# exit $(MERLIN_RETRY) # syntax to send a retry error code to
# run the cmd command again instead of the restart command.
restart: |
# Command to run if the $(MERLIN_RESTART) exit code is used
touch learnrunrs.out
$(LAUNCHER) echo "$(VAR1) $(VAR2)" >> learnrunrs.out
exit $(MERLIN_SUCCESS) # syntax to send a success code
nodes: 1
procs: 1
task_queue: lqueue
max_retries: 3 # maximum number of retries
retry_delay: 10 # delay retry for N seconds (default 1)
batch:
type: <override the default batch type>
- name: monitor
description: Monitor on alloc1
run:
cmd: date > monitor.out
nodes: 1
procs: 1
task_queue: mqueue
####################################
# Parameter Block (Required)
####################################
# The parameter block contains all the things we'd like to vary in the study.
# Currently, there are two modes of operating in the specification:
# 1. If a parameter block is specified, the study is expanded and considered a
# parameterized study.
# 2. If a parameter block is not specified, the study is treated as linear and
# the resulting study is not expanded.
#
# There are three keys per parameter:
# 1. A list of values that the parameter takes.
# 2. A label that represents a "pretty printed" version of the parameter. The
# parameter values is specified by the '%%' moniker (for example, for SIZE --
# when SIZE is equal to 10, the label will be 'SIZE.10'). To access the label
# for SIZE, for example, the token '$(SIZE.label)' is used.
# Labels can take one of two forms: A single string with the '%%' marker or
# a list of per value labels (must be the same length as the list of values).
#
# NOTE: A specified parameter does not necessarily have to be used in every step
# or at all. If a parameter is specified and not used, it simply will not be
# factored into expansion or the naming of expanded steps or their workspaces.
# NOTE: You can also specify custom generation of parameters using a Python
# file containing the definition of a function as follows:
#
# 'def get_custom_generator():'
#
# The 'get_custom_generator' function is required to return a ParameterGenerator
# instance populated with custom filled values. In order to use the file, simply
# call Merlin using 'merlin run <specification path>'.
########################################################################
global.parameters:
STUDY:
label: STUDY.%%
values: [MERLIN1, MERLIN2]
SIZE:
values : [10, 20]
label : SIZE.%%
ITERATIONS:
values : [10, 20]
label : ITER.%%
####################################
# Merlin Block (Required)
####################################
# The merlin specific block will add any configuration to
# the DAG created by the study description.
# including task server config, data management and sample definitions.
#
# merlin will replace all SPECROOT instances with the directory where
# the input yaml was run.
#######################################################################
merlin:
####################################
# Resource definitions
#
# Define the task server configuration and workers to run the tasks.
#
####################################
resources:
task_server: celery
# Flag to determine if multiple workers can pull tasks
# from overlapping queues. (default = False)
overlap: False
# Customize workers. Workers can have any user-defined name (e.g., simworkers, learnworkers).
workers:
simworkers:
args: <celery worker args> <optional>
steps: [runs1, post-process, monitor] # [all] when steps is omitted
nodes: <Number of nodes for this worker or batch num nodes>
# A list of machines to run the given steps can be specified
# in the machines keyword. <optional>
# A full OUTPUT_PATH and the steps argument are required
# when using this option. Currently all machines in the
# list must have access to the OUTPUT_PATH.
machines: [host1, host2]
learnworkers:
args: <celery worker args> <optional>
steps: [runs2]
nodes: <Number of nodes for this worker or batch num nodes>
# An optional batch section in the worker can override the
# main batch config. This is useful if other workers are running
# flux, but some component of the workflow requires the native
# scheduler or cannot run under flux. Another possibility is to
# have the default type as local and workers needed for flux or
# slurm steps.
batch:
type: local
machines: [host3]
###################################################
# Sample definitions
#
# samples file can be one of
# .npy (numpy binary)
# .csv (comma delimited: '#' = comment line)
# .tab (tab/space delimited: '#' = comment line)
###################################################
samples:
column_labels: [VAR1, VAR2]
file: $(SPECROOT)/samples.npy
generate:
cmd: |
python $(SPECROOT)/make_samples.py -dims 2 -n 10 -outfile=$(INPUT_PATH)/samples.npy "[(1.3, 1.3, 'linear'), (3.3, 3.3, 'linear')]"
level_max_dirs: 25
####################################
# User Block (Optional)
####################################
# The user block allows other variables in the workflow file to be propagated
# through to the workflow (including in variables .partial.yaml and .expanded.yaml).
# User block uses yaml anchors, which defines a chunk of configuration and use
# their alias to refer to that specific chunk of configuration elsewhere.
#######################################################################
user:
study:
run:
hello: &hello_run
cmd: |
python3 $(HELLO) -outfile hello_world_output_$(MERLIN_SAMPLE_ID).json $(X0) $(X1) $(X2)
max_retries: 1
collect: &collect_run
cmd: |
echo $(MERLIN_GLOB_PATH)
echo $(hello.workspace)
ls $(hello.workspace)/X2.$(X2)/$(MERLIN_GLOB_PATH)/hello_world_output_*.json > files_to_collect.txt
spellbook collect -outfile results.json -instring "$(cat files_to_collect.txt)"
translate: &translate_run
cmd: spellbook translate -input $(collect.workspace)/results.json -output results.npz -schema $(FEATURES)
learn: &learn_run
cmd: spellbook learn -infile $(translate.workspace)/results.npz
make_samples: &make_samples_run
cmd: spellbook make-samples -n $(N_NEW) -sample_type grid -outfile grid_$(N_NEW).npy
predict: &predict_run
cmd: spellbook predict -infile $(make_new_samples.workspace)/grid_$(N_NEW).npy -outfile prediction_$(N_NEW).npy -reg $(learn.workspace)/random_forest_reg.pkl
verify: &verify_run
cmd: |
if [[ -f $(learn.workspace)/random_forest_reg.pkl && -f $(predict.workspace)/prediction_$(N_NEW).npy ]]
then
touch FINISHED
exit $(MERLIN_SUCCESS)
else
exit $(MERLIN_SOFT_FAIL)
fi
python3:
run: &python3_run
cmd: |
print("OMG is this in python?")
print("Variable X2 is $(X2)")
shell: /usr/bin/env python3
python2:
run: &python2_run
cmd: |
print "OMG is this in python2? Change is bad."
print "Variable X2 is $(X2)"
shell: /usr/bin/env python2

0 comments on commit 4d47689

Please sign in to comment.