# ========================================================= # # MetaBakery: customized reimplementation of the # BioBakery workflow # # ========================================================= # # Config Template for # MetaBakery version 2b, released on 2023-Mar-19 # # Last modification of this document: 2023-Mar-19 # # MetaBakery's web page: # http://metabakery.fe.uni-lj.si # # MetaBakery users' manual: # http://metabakery.fe.uni-lj.si/metabakery_manual.pdf # # Quick start guide: # http://metabakery.fe.uni-lj.si/quick_start.txt # # The latest version of this config template: # http://metabakery.fe.uni-lj.si/config_template.txt # # MetaBakery developers: # Blaz Stres, blaz.stres@fgg.uni-lj.si # Bostjan Murovec, bostjan.murovec@fe.uni-lj.si # # License: # -------- # Creative Commons Attribution CC BY license # https://creativecommons.org/licenses # # # # ------------------------------------------------------ # If you use MetaBakery or its derivatives, please cite: # ... article submitted for publication, please # check back to MetaBakery web site for # reference update # # Please also cite software that MetaBakery contains, # and that you use for your pipeline processing. # Please see chapter Credits in MetaBakery # Users' Manual. # ------------------------------------------------------ # # # # IMPORTANT: MetaBakery is developed and disseminated in a good # faith and desire to work according to expectations, but authors # DO NOT give any guarantees about it correctness. # # USE IT AT YOUR OWN RISK. # # Authors cannot be held legally or morally responsible for any # consequences that may arise from using or misusing MetaBakery. # # IMPORTANT: MetaBakery is a skeleton application for a synergic # execution of many externally developed pieces of software. # These are disseminated as integrated parts of MetaBakery to # provide user-friendly out-of-the-box experience. # Nonetheless, every included piece of software remains OWNED # and COPYRIGHTED by its respective developers. # Please see chapter Credits in MetaBakery Users' Manual. ##################################################### # # General instructions # ##################################################### # This config file describes available parameters, # and is intended to serve as a template for making # actual MetaBakery configuration files. All parameters # are accompanied with their explanation, to make this # document fairly self contained. # # Nonetheless, this file is not a full substitute for # the MetaBakery users' manual, which presents certain # information in a didactically more suitable way. # # MetaBakery users' manual: # http://metabakery.fe.uni-lj.si/metabakery_manual.pdf # # This configuration template is available at: # http://metabakery.fe.uni-lj.si/config_template.txt # # # # ---------------------------------------------------------- # Usage: # ---------------------------------------------------------- # # Fill in parameters below according to your preferences. # # This document is fairly long because it contains # complete instructions about setting MetaBakery parameters. # For production, a user is expected to remove the majority of # comments out of this file to make it easier to navigate. # # Many parameters do not need to be set. In the vast majority of # cases default values suffice. Please do not be overwhelmed # by the number of available settings below. As the first step, # it suffices to specify directory with input files (parameter in_dir). # # # # MetaBakery workflow can be run in two different ways. 
# # ------------------------------------- # THE FIRST WAY OF LAUNCHING MetaBakery # ------------------------------------- # # If MetaBakery default settings are adequate, # then the only piece of information that needs to be # specified is an input directory, which contains # fastq reads to be processed. In this case, MetaBakery # is executed in one of the following ways. # # singularity run path_to/mbakery_{version}.sif /abs/path_to/reads # singularity run path_to/mbakery_{version}.sif /home/user/path_to/reads # singularity run path_to/mbakery_{version}.sif ~/path_to/reads (the same as /home/user/path_to/reads) # singularity run path_to/mbakery_{version}.sif ./relative_to_current_dir/path/reads # singularity run path_to/mbakery_{version}.sif relative_to_current_dir/path/reads (the same as above) # # -------------------------------------- # THE SECOND WAY OF LAUNCHING MetaBakery # -------------------------------------- # # If a workflow configuration is needed, then a config file # according to this template file should be prepared. # In this case, MetaBakery is run by specifying the prepared # config file name instead of a directory with input reads. # # singularity run path_to/mbakery_{version}.sif /abs/path_to/config_file.txt # singularity run path_to/mbakery_{version}.sif /home/user/path_to/config_file.txt # singularity run path_to/mbakery_{version}.sif ~/path_to/config_file.txt (the same as /home/user/path_to/config_file.txt) # singularity run path_to/mbakery_{version}.sif ./relative_to_current_dir/path/config_file.txt # singularity run path_to/mbakery_{version}.sif relative_to_current_dir/path/config_file.txt (the same as above) # # The only required parameter in config file is in_dir, # which takes role of the input-directory specification on # the command line according to the first way of launching MetaBakery. # The following syntax applies to configuration files. # Lines that begin with a dash (#) are comments and they # are ignored by the workflow. Empty lines and lines that # contain only spaces and tabular (tab) characters are # ignored as well. # # Each parameter is specified on a single line, which consists # of a parameter's name, an equal sign (=), and a potential # parameter's value. Names of parameters are lowercase. Spaces # may be optionally inserted to the left and to the right of the # equal sign. Everything after the equal sign and any potential # spaces that follow it, constellates the value of the parameter. # The value may be empty, in which case the equal sign still # needs to be present. # # Parameters may be set in any order. Default settings apply to # undefined parameters, which eliminates the need to define everything. # Typically, only a very tiny subset of known parameters is set within # any config file. This greatly simplifies configuration process. ##################################################### # # Global settings # ##################################################### # THE ONLY REQUIRED PARAMETER: in_dir # # Specification of an input directory with reads. # The workflow figures out by itself which of the # present files are paired or unpaired reads, # as well as it ignores files with non-relevant # extensions. Subdirectories are ignored. # # Workflow can operate either on paired R1_R2 or # unpaired R1 fastq files (with extensions .fastq or .fq). # These may be compressed, in which case an appropriate # extension (.gz, .zip, .bz2 or .xz) needs to be appended # to extension .fastq or .fq (for example: some_file.fastq.gz # or some_file.fq.gz). 
Compressed and uncompressed files # may be intermixed, even among paired and unpaired inputs. # Different compressing formats may also be intermixed # within an input set. # # All mentioned types of files may be present at once # in an input directory. The workflow (hopefully) figures # out what belongs together. If MetaBakery fails to # properly combine input files in pairs, please see # the next section. #in_dir = /abs/path_to/some_reads #in_dir = /home/user/path_to/some_reads #in_dir = ~/path_to/some_reads (the same as /home/user/path_to/some_reads) #in_dir = ./relative_to_current_dir/some_reads #in_dir = relative_to_current_dir/some_reads (the same as above) # OPTIONAL: force_paired (default: No) # OPTIONAL: force_unpaired (default: No) # OPTIONAL: unpaired_indicator (default: none) # # On rare occasions MetaBakery fails to properly recognize # fastq pairs. More or less the only troublesome situation # is that two unpaired fastq files are recognized as a pair # due to an unfortunate similarity of their names. # # To alleviate the difficulty MetaBakery possesses three # above listed configuration directives, which can be enabled, # if a need arises. # # Enabled option force_paired=Yes does not alter the fastq # recognition process. However, if MetaBakery finds any unpaired # reads during enumeration of input files, it aborts processing. # The option is handy, when all input fastq files are supposed # to be of a paired R1_R2 type. If MetaBakery cannot find a pair # for any input fastq file, its refusal to proceed with the # analyses, alerts a user about the error, instead of blindly # presuming that everything is okay. This option must not be activated # together with any of the two remaining options within this set. # # Enabled option force_unpaired=Yes prevents MetaBakery from # attempting to identify fastq pairs. All input fastq files are # treated as unpaired ones. # # Option unpaired_indicator=...string_literal... forces files with # ...string_literal.. substring in their names to be treated as # unpaired files, even if MetaBakery might recognize them as R1_R2 # pair. Other files (without the specified literal in their names) # are not forced to be fastq pairs. For them normal MetaBakery # recognition rules apply. #force_paired=Yes #force_unpaired=Yes #unpaired_indicator=_UN_ #unpaired_indicator=RX #unpaired_indicator=_unpaired ############################################################################### Specification of a sequencer source (fastq format) ############################################################################### # # Sometimes the KneadData preprocessor (more precisely: its Trimmmomatic step) # needs a specification of a sequencer in order to properly interpret fastq # sequence names and select appropriate adapter sequences for processing the # input. This can be done by specifying a configuration directive with one # of the following choices (NexteraPE is default); PLEASE see the warning below. #params_kneaddata = --sequencer-source NexteraPE #params_kneaddata = --sequencer-source TruSeq2 #params_kneaddata = --sequencer-source TruSeq3 #params_kneaddata = --sequencer-source none # If KneadData terminates with an error when running with default # parameters (i.e. without specification of the above parameter), # you may try re-running it with selection TruSeq3, or whatever # is appropriate for your sequences. # # WARNING. The default KneadData parameters are the following # ones, which mimic the behavior of the BioBakery wmgx workflow. 
#params_kneaddata = --cat-final-output --serial --remove-intermediate-output # In order specify parameter --sequencer-source without overriding # the default behavior, it is necessary to repeat the default settings # as well, as the following example demonstrates. #params_kneaddata = --cat-final-output --serial --remove-intermediate-output --sequencer-source TruSeq3 # Two parameters --cat-final-output and --serial are mandatory # and MUST REMAIN set in order for the workflow to execute properly. # Hence, whenever prescribing your own parameters for KneadData, # please also include --cat-final-output --serial to the specifications. # OPTIONAL: out_dir (default: the same as in_dir) # # Specification of an output directory, where the workflow # places generated files. Everything that workflow generates # is placed into subfolders of out_dir, so several MetaBakery # runs may safely target the same output directory, # since directory naming scheme guarantees no collisions. # # Output directory may even be the same as directory # with input files. However, there are several reasons # to select out_dir on a different location. # In may be that disk/partition with input files does # not have enough space to hold intermediate and # resulting files. Also, disk with in_dir may be a # slow one (e.g. USB key), for which it is prudent # to only read input files from. # Reads may also be stored in location for which # a MetaBakery operator has only a read-only access, # in order not to accidentally delete sequenced data. #out_dir = /home/some_user/convenient_output_location # ---------------------------------------------------- # Compression of results # ---------------------------------------------------- # Set the following directive(s) to Yes to compress # output directory into a tar.gz file at the end of the workflow execution. #compress_output = Yes # OPTIONAL: summary_dir (default: none) # # Specification of an alternative output directory, # where MetaBakery places a congested output. # OPTIONAL: verbose (default: No) # # Set to Yes for a more detailed on-screen description # of the ongoing progress and performance tuning. # These are primarily useful for workflow debugging. # # NOTE: if you report an issue to us, please set # this option to yes, and send us the # resulting report directory. #verbose=Yes # OPTIONAL: number_of_threads # (default: as many as there are CPUs) # # Number of threads to use for parallel execution. # # If the parameter is not specified, the number of # available processors will be determined by querying # the underlying operating system. # # If your intention is not to consume all available # resources (e.g. because the same hardware will # execute some other calculations in parallel), then # you may manually set the value of this parameter to # a LOWER value than the number of available CPUs. # Setting this number to a larger value than the # numbers of CPUs, will DECREASE computation speed # (but will have no other adverse consequences). # # Another reason for lowering this number below the # actual CPU count is to lower memory consumption. # If an experience shows that MetaBakery (in fact, # some of program that is integrated into it) consumes # too much memory, sometimes (but not always) the issue # may be alleviated by reducing the number of threads # to execute in parallel. #number_of_threads = 16 # OPTIONAL: dangerous_delete_inputs (default: No) # # This option is meant primarily for running MetaBakery # on High-Performance Computers. 
In such scenarios of use # input fastq files are transferred from a local or some # cache disk to the HPC working disk storage, which is # isolated from the local or cache storage. # In such cases, it is prudent to delete input and # some other files as soon as they are not needed # any more by the remaining workflow steps. # # Specifically, if input fastq files are compressed # e.g. by the gzip utility, then MetaBakery ungzips # them before their processing. From that moment on # the gzipped versions of files are not needed any # more. By setting option dangerous_delete_inputs to # Yes, MetaBakery deletes the gzipped files immediately # upon their decompression. Similarly, the uncompressed # fastq files are subjected to the KneadData preprocessor, # and after that, only the KneadData output is needed # for further steps of analysis. With option # dangerous_delete_inputs set to yes, # MetaBakery deletes the uncompressed fastq files # as soon as KneadData processes them. # # This option is potentially dangerous, since # if it is enabled (perhaps by a mistake) when # running MetaBakery locally, the archive of # fastq files get deleted, which is probably # not intended. Hence, the literal "dangerous" # in the name of the option. # # The intended purpose of this option is not # so much lower disk consumption, since HPC # facilities generally posses ample of space. # Nonetheless, quick deletion of typically # large unneeded files is a good idea, since # files are cached in memory by the underlying # operating system. When files are deleted, # the operating system gets hints for better # utilization of memory for disk cache. #dangerous_delete_inputs = Yes # OPTIONAL: short_work_dir (default: None) # # This option is meant primarily for running MetaBakery # on High-Performance Computers. Names of working # directories on many HPC systems are fairly long. # It turns out that KneadData processing step # cannot cope with the resulting long file names # which leads to its crashes. # # To alleviate the difficulty, MetaBakery possesses # option short_work_dir for relocation of # KneadData's work directory. Below are some # viable options that work in typical cases, # but their feasible set depends on the actual # configuration of HPC facilities. # Note that /dev/shm points to a RAM disk. # If HPC node has plenty of RAM, putting # the KneadData's work directory into RAM # speeds up the processing due to the avoided # disk bottleneck. However, if the available # RAM is exhausted, then KneadData crashes # together with the rest of the workflow. #short_work_dir = /var/tmp #short_work_dir = /tmp #short_work_dir = /dev/shm #short_work_dir = /scratch #short_work_dir = /data ##################################################### # # General workflow behavior # ##################################################### # OPTIONAL: route_direct (default: No) # route_kneaddata (default: Yes) # # Enables or disables available processing routes. # # # # Route direct feeds input data directly to # Metaphlan, Humann, Strainphlan and MelonnPan. # # This route is meant for the cases where inputs: # # 1. are already prepared and filtered by other # means or processing frameworks, # # 2. are inherently high quality and without # contaminants, # # # # Route KneadData mimics original BioBakery # wmgx workflow, by filtering input data # with the KneadData preprocessor before # feeding it to Metaphlan, Humann, Strainphlan # and MelonnPan. # # # # NOTE: generally, only one of the above routes # is expected to be enabled. 
Both routes # may be enabled, if the intention is to # compare their results. # In this case separate Metaphlan, Humann, # etc. runs are performed for each route, # and the entire framework executes longer. # Hence, when a preferred route is known, # it should be the only one enabled for # real-case analyses. #route_direct = Yes #route_kneaddata = No # OPTIONAL for MetaBakery edition 4: # enable_metaphlan (default: Yes) # enable_humann (default: Yes) # # OPTIONAL for MetaBakery edition 3: # enable_metaphlan3 (default: Yes) # enable_humann3 (default: Yes) # # OPTIONAL for MetaBakery edition 2: # enable_metaphlan2 (default: Yes) # enable_humann2 (default: Yes) # OPTIONAL for all MetaBakery editions: # enable_strainphlan (default: Yes) # enable_melonnpan (default: Yes) # # Disables or enables individual processing steps # along the above analysis routes. The settings # apply uniformly to all enabled routes. # # NOTE: if Humann is enabled, then the appropriate # Metaphlan version is also enabled, # regardless of its settings.. # # NOTE: Strainphlan works when Metaphlan is enabled. # If Metaphlan is not enabled, # then Strainphlan does not execute at all. # # NOTE: MelonnPan works when Humann is enabled. # If Humann is not enabled, # then MelonnPan does not execute at all. # # NOTE: different MetaBakery editions integrate different # subsets of the programs. The discussed # parameters that are associated with the # non-existing programs are ignored. #enable_metaphlan = no #enable_humann = no #enable_metaphlan3 = no #enable_humann3 = no #enable_metaphlan2 = no #enable_humann2 = no #enable_strainphlan = no #enable_melonnpan = no # OPTIONAL: contaminants_db # # A list of reference databases with contaminants data. # # Note: this parameter is ignored, if parameter # external_contaminants_dir (described further on) # is defined. # # Possible values are: # human_hg38_refMrna # hg37dec_v0.1 # mouse_C57BL_6NJ # SILVA_128_LSUParc_SSUParc_ribosomal_RNA # # It is possible to specify several of the above databases # as a comma-separated list. The above values are case # sensitive and need to be provided verbatim. # It is advised to copy & paste them to the # below parameter value. # # Default value is: # human_hg38_refMrna,hg37dec_v0.1 # so that both of these databases apply. # #contaminants_db = human_hg38_refMrna #contaminants_db = hg37dec_v0.1 #contaminants_db = mouse_C57BL_6NJ #contaminants_db = SILVA_128_LSUParc_SSUParc_ribosomal_RNA # OPTIONAL: external_contaminants_dir # # Directory that contains external contaminants database # or several databases. Each contaminant database # consists of an entire Bowtie2 index, which can be short # or long. Each database (Bowtie2 index) consists of # six files. # # For a short index these are: # some_name.1.bt2, some_name.2.bt2, some_name.3.bt2, # some_name.4.bt2, some_name.rev.1.bt2, some_name.rev.2.bt2. # # For a long index these are: # some_name.1.bt2l, some_name.2.bt2l, some_name.3.bt2l, # some_name.4.bt2l, some_name.rev.1.bt2l, some_name.rev.2.bt2l. # # External directory can contain both types of indices. # All indices within the specified external contaminants # directory are applied to the input reads. # # Note: this parameter supersedes the previously # introduced parameter contaminants_db. 
#external_contaminants_dir = some_dir_with_contaminants ##################################################### # # KneadData configuration # ##################################################### # OPTIONAL: params_kneaddata # # Parameters for KneadData (others than the ones # that are provided by the workflow itself, # like input and output files). # # If this parameter is not set, then it assumes a # default value: # --cat-final-output --serial --remove-intermediate-output # # to mimic the original behavior of BioBakery # wmgx workflow. # # NOTE: you must not specify any parameters that are # related to input and output files. Only parameters # that tune execution of this program are feasible. # Please see the original documentation. # # NOTE: Two parameters --cat-final-output and --serial are mandatory # and MUST REMAIN set in order for the workflow to execute properly. # Hence, whenever prescribing your own parameters for KneadData, # please also include --cat-final-output --serial to the specifications. #params_kneaddata = --cat-final-output --serial ... # OPTIONAL: params_kneaddata_bowtie2 # # Additional parameters that KneadData passes # to Bowtie2 (default None, which applies --very-sensitive-local). # # Description of bowtie2 parameters can be found here: # http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#options #params_kneaddata_bowtie2 = ... # OPTIONAL: params_kneaddata_trimmomatic # # Additional parameters that KneadData passes # to Trimmomatic (A flexible read trimming tool # for Illumina NGS data). If this parameter is not # set, then KneadData provides its own default values. # # Possible settings are: # # --trimmomatic-options e.g. SLIDINGWINDOW:4:20 MINLEN:50 # MINLEN...is set to 50 percent of total input read length # # Description of trimmomatic parameters can be found here: # http://www.usadellab.org/cms/?page=trimmomatic #params_kneaddata_trimmomatic = ...custom options... ##################################################### # # MetaPhlan configuration # ##################################################### # OPTIONAL: clean_metaphlan_sam (default: False) # OPTIONAL: clean_metaphlan_bowtie2out (default: False) # # Set to Yes to clean Metaphlan SAM and/or Metaphlan # Bowtie2out files at the end of the workflow execution. # These files are rarely needed but they consume large # amount of disk space. #clean_metaphlan_sam = Yes #clean_metaphlan_bowtie2out = Yes # OPTIONAL (edition 4): params_metaphlan # OPTIONAL (edition 3): params_metaphlan3 # OPTIONAL (edition 2): params_metaphlan2 # # Parameters for the respective versions # of MetaPhlan (others than the ones that # are provided by the workflow itself, # like input and output files). # # If these parameters are not set, then they assume # the following default values. # Metaphlan2: (none) # Metaphlan3: --add_viruses # Metaphlan4: --add_viruses # # NOTE: Metaphlan4 databases currently do NOT # support profiling of viral organisms. # Here default setting --add_viruses is # merely a preparation for the future. # # NOTE: you must not specify any parameters that are # related to input and output files. Only parameters # that tune execution of this program are feasible. # Please see the original documentation. #params_metaphlan = ... #params_metaphlan3 = ... #params_metaphlan2 = ... # OPTIONAL (edition 4): metaphlan_external_db_dir # OPTIONAL (edition 3): metaphlan3_external_db_dir # # Directory that contains external Metaphlan database. 
# The specified directory is supposed to have the same # structure as the built-in database directory. # # Here is a directory structure of an official # Metaphlan3 database that a users' provided # database directory should mimic. # # mpa_v30_CHOCOPhlAn_201901.md5 # mpa_v30_CHOCOPhlAn_201901.pkl # mpa_v30_CHOCOPhlAn_201901.fna.bz2 # mpa_v30_CHOCOPhlAn_201901.1.bt2 # mpa_v30_CHOCOPhlAn_201901.2.bt2 # mpa_v30_CHOCOPhlAn_201901.3.bt2 # mpa_v30_CHOCOPhlAn_201901.4.bt2 # mpa_v30_CHOCOPhlAn_201901.rev.1.bt2 # mpa_v30_CHOCOPhlAn_201901.rev.2.bt2 # # For Metaphlan4 appropriate bases with long # Bowtie2 indices are needed. # # mpa_vJan21_CHOCOPhlAnSGB_202103.md5 # mpa_vJan21_CHOCOPhlAnSGB_202103.pkl # mpa_vJan21_CHOCOPhlAnSGB_202103.1.bt2l # mpa_vJan21_CHOCOPhlAnSGB_202103.2.bt2l # mpa_vJan21_CHOCOPhlAnSGB_202103.3.bt2l # mpa_vJan21_CHOCOPhlAnSGB_202103.4.bt2l # mpa_vJan21_CHOCOPhlAnSGB_202103.rev.1.bt2l # mpa_vJan21_CHOCOPhlAnSGB_202103.rev.2.bt2l # mpa_vJan21_CHOCOPhlAnSGB_202103_VINFO.csv # mpa_vJan21_CHOCOPhlAnSGB_202103_VSG.fna # # Note: when parameter metaphlan[x]_external_db_dir is set, it # overrides the use of internally integrated Metaphlan # database. #metaphlan_external_db_dir = /home/some_dir/newer_or_custom_mp_db #metaphlan3_external_db_dir = /home/some_dir/newer_or_custom_mp3_db ##################################################### # # Metaphlan_count_features configuration # ##################################################### # OPTIONAL (edition 4): params_metaphlan_count_features # OPTIONAL (edition 3): params_metaphlan3_count_features # OPTIONAL (edition 2): params_metaphlan2_count_features # # If this parameter is not set, it assumes a value: # --include s__ --filter t__ --reduce-sample-name # # to mimic the original behavior of BioBakery wmgx workflow. # # --include INCLUDE # # --filter FILTER # # --reduce-sample-name # # --ignore-un-features # # --ignore-stratification #params_metaphlan_count_features = --include k__ --filter t__ #params_metaphlan3_count_features = --include k__ --filter t__ #params_metaphlan2_count_features = --include k__ --filter t__ ##################################################### # # Humann2 configuration # ##################################################### # OPTIONAL: humann2_protein_db (default: uniref90) # # Protein databases to use. # # Possible values are: # uniref90 # uniref50 # # The above values are case sensitive and # need to be provided verbatim. # # If this parameter is not set, then # uniref90 is assumed. #humann2_protein_db = uniref90 #humann2_protein_db = uniref50 # OPTIONAL: params_humann2 # # Parameters for Humann2 (others than the ones # that are provided by the workflow itself, # like input and output files). # # If this parameter is not set, then it assumes # the value # # --remove-temp-output --memory-use maximum # # to save disk space and speedup the execution # under the presumption that your computer has # enough memory (no specific numbers, since it # depends on an input set). If Humann2 fails # due to memory exhaustion, set params_humann2 # to something that does not include option # "--memory-use maximum", e.g. only # --remove-temp-output. # # NOTE: you must not specify any parameters that are # related to input and output files. Only parameters # that tune execution of this program are feasible. # Please see the original documentation. 
#params_humann2 = --bypass-prescreen #params_humann2 = --memory-use maximum ##################################################### # # Humann3 and Humann4 configuration # ##################################################### # OPTIONAL (edition 4): humann_protein_db (default: uniref90) # OPTIONAL (edition 3): humann3_protein_db (default: uniref90) # # Protein databases to use. # # Possible values are: # uniref90 # uniref50 # uniref90_ec_filtered # uniref50_ec_filtered # # The above values are case sensitive and # need to be provided verbatim. # # If this parameter is not set, then # uniref90 is assumed. # # Note: it is also possible to work with an externally # supplied database as described further on. #humann3_protein_db = uniref90 #humann3_protein_db = uniref50 #humann3_protein_db = uniref90_ec_filtered #humann3_protein_db = uniref50_ec_filtered # OPTIONAL (edition 4): params_humann # OPTIONAL (edition 3): params_humann3 # # Parameters for Humann3 (others than the ones # that are provided by the workflow itself, # like input and output files). # # If this parameter is not set, then it assumes # the value # # --remove-temp-output --memory-use maximum # # to save disk space and speedup the execution # under the presumption that your computer has # enough memory (no specific numbers, since it # depends on an input set). If Humann3 fails # due to memory exhaustion, set params_humann3 # to something that does not include option # "--memory-use maximum", e.g. only # --remove-temp-output. # # NOTE: you must not specify any parameters that are # related to input and output files. Only parameters # that tune execution of this program are feasible. # Please see the original documentation. #params_humann = --bypass-prescreen #params_humann = --memory-use maximum #params_humann3 = --bypass-prescreen #params_humann3 = --memory-use maximum # OPTIONAL (edition 4): humann_external_protein_db_dir # OPTIONAL (edition 3): humann3_external_protein_db_dir # # Directory that contains external Humann protein database # as a substitution for the built in ones. # # The specified directory is supposed to contain one file # with extension .dmnd, as the original Humann3 protein # database does. The name of this files should begin either # with literal uniref90_ or uniref50_ to indicate the search # mode to be applied. For example, a valid name is: # uniref90_201901.dmnd # # The literal at the beginning of the name is carried over # to the Humann3's parameter --search-mode, so according # to the above example, the applied Humann3's command line # would contain an excerpt "--search-mode uniref90". # # Note: when parameter humann3_external_protein_db_dir is set, # it overrides the use of internally integrated protein # database. Hence, the previously described parameter # humann3_protein_db is ignored in this case. #humann_external_protein_db_dir = /home/john_doe/custom/protein_db #humann3_external_protein_db_dir = /home/john_doe/custom/protein_db # OPTIONAL (edition 4): humann_external_nucleotide_db_dir # OPTIONAL (edition 3): humann3_external_nucleotide_db_dir # # Directory that contains external Humann nucleotide database # as a substitution for the built in ChocoPhlAn. # # The specified directory is supposed to contain a file structure # that Humann is able to as the official ChocoPhlAn database does. 
# # Note: if this parameter is set, then it is probably needed to # specify parameter --bypass-nucleotide-index as well, # by using directive: # params_humann3 = --bypass-nucleotide-index # # To also retain default MetaBakery Humann3' parameters, use the following: # params_humann3 = --remove-temp-output --memory-use maximum --bypass-nucleotide-index #humann_external_nucleotide_db_dir = /home/john_doe/custom/nucleotide_db #humann3_external_nucleotide_db_dir = /home/john_doe/custom/nucleotide_db # Note: the above parameters are suitable for using e.g. external Struo2 prepared databases: # https://www.biorxiv.org/content/10.1101/2021.02.10.430604v1, # https://github.com/leylabmpi/Struo2, # http://ftp.tue.mpg.de/ebio/projects/struo2/GTDB_release95/humann3/ # # In order to use Struo2 (or other similarly prepared database), # set the following parameters accordingly. # for edition 4 (not tested with Struo2, since database structure is changing) #humann_external_protein_db_dir = /home/john_doe/custom/protein_db #humann_external_nucleotide_db_dir = /home/john_doe/custom/nucleotide_db #params_humann = --remove-temp-output --memory-use maximum --bypass-nucleotide-index # # for edition 3 #humann3_external_protein_db_dir = /home/john_doe/custom/protein_db #humann3_external_nucleotide_db_dir = /home/john_doe/custom/nucleotide_db #params_humann3 = --remove-temp-output --memory-use maximum --bypass-nucleotide-index # # If you do not want to include default MetaBakery parameters, # modify Humann3 parameters accordingly. #params_humann3 = --bypass-nucleotide-index # NOTE! NOTE! NOTE! NOTE! NOTE! NOTE! NOTE! NOTE! NOTE! # # The Struo2 database takes A LOT of memory for Humann # processing. Frequently, memory consumption exceeds an # order of 40 GB. If not enough memory is available, then # Humann (more precisely its internal Bowtie2 step) is # killed by an operating system, which is annotated in # a stderr output of the very Humann run. # Typically, MetaBakery runs several Humann tasks in # parallel to utilize CPUs well during periods where # Humann3 executes its non parallelized parts. This # further increases memory needs and makes Humann's # abrupt termination more likely. Running of multiple # Humann tasks in parallel may be prevented by setting # the following parameters (described in detail near the # end of this document): #benchmark_mode = Yes # for edition 4 #benchmark_name = Humann # for edition 3 #benchmark_name = Humann3 #benchmark_single = Yes ##################################################### # # Humann_count_features configuration # ##################################################### # OPTIONAL: params_humann2_count_features # # Note: this parameter also applies to processing # of Humann3's results. # # If this parameter is not set, it assumes a value: # --reduce-sample-name --ignore-un-features --ignore-stratification # # to mimic the original behavior of BioBakery wmgx workflow. # # NOTE: you must not specify any parameters that are # related to input and output files. Only parameters # that tune execution of this program are feasible. # Please see the original documentation. #params_humann2_count_features = --reduce-sample-name --ignore-un-features --ignore-stratification ##################################################### # # sample2markers configuration # ##################################################### # sample2markers step generates a marker file for each sample. 
# The marker files contain the consensus of unique marker genes # for each species found in the sample, which is used for SNP # profiling. # OPTIONAL: params_sample2markers (default: none) # # NOTE: you must not specify any parameters that are # related to input and output files. Only parameters # that tune execution of this program are feasible. # Please see the original documentation. #params_sample2markers = --min_read_depth 10 --min_base_quality 20 (for strainphlan 2) #params_sample2markers = --marker_in_n_samples 20 (for strainphlan 3&4) ##################################################### # # Humann_regroup_table configuration # ##################################################### # OPTIONAL (edition 4): params_humann_regroup_table # (edition 3): params_humann3_regroup_table # (edition 2): params_humann2_regroup_table # # If this parameter is not set, it assumes empty value # to mimic the original behavior of BioBakery wmgx workflow. # # NOTE: you must not specify any parameters that are # related to input and output files. Only parameters # that tune execution of this program are feasible. # Please see the original documentation. #params_humann_regroup_table = --ungrouped N #params_humann3_regroup_table = --ungrouped N #params_humann2_regroup_table = --ungrouped N ##################################################### # # Humann_renorm_table configuration # ##################################################### # OPTIONAL: params_humann2_renorm_table # # Note: this parameter also applies to processing # of Humann3's results. # # If this parameter is not set, it assumes a value: # --units relab --special n # # to mimic the original behavior of BioBakery wmgx workflow. # # NOTE: you must not specify any parameters that are # related to input and output files. Only parameters # that tune execution of this program are feasible. # Please see the original documentation. #params_humann2_renorm_table = --units relab --special n ##################################################### # # Strainphlan configuration # ##################################################### # OPTIONAL: max_number_of_clades (default: 20) # # Limits the number of clades that Strainphlan # analysis processes. #max_number_of_clades = 10 # OPTIONAL: params_strainphlan_list_clades (default: none) # OPTIONAL: params_strainphlan_clade (default: none) # # The value of first of the above parameters is passed to # Strainphlan, when clades are listed. # NOTE: option --print_clades_only is set automatically by # the workflow, and it must NOT be set explicitly. # # The value of second of the above parameters is passed to # Strainphlan, when processing each individual clade. # # NOTE: you must not specify any parameters that are # related to input and output files. Only parameters # that tune execution of this program are feasible. # Please see the original documentation. # # NOTE: previously described configuration of sample2markers # also influences Strainphlan processing. #params_strainphlan_list_clades = ... #params_strainphlan_clade = ... ##################################################### # # MelonnPan configuration # ##################################################### # MelonnPan terminates with error when number of inputs is too large. # To alleviate the difficulty, MetaBakery splits MelonnPan prepared input # into manageable chunks, when the number of inputs exceeds the value of # parameter max_melonnpan_input_size. 
In this case the MelonnPan reported # RTSI numbers are non-informative (the delivered output is obtained by # concatenating RTSI output of individual chunks, whereas in order for the # numbers to be informative, the entire input set should be processed # as a whole collection). Nonetheless, predicted metabolites should be # informative. # The limiting size that MelonnPan successfully processes, varies a bit, # so it is possible to experiment with the following parameter a bit # to empirically deduce maximal possible value (please read chapter # Time Machine in MetaBakery's users' manual first to enormously # speed up such trial-error MetaBakery runs). #max_melonnpan_input_size = 100 # OPTIONAL: melonnpan_critical_point (default: 0.9793) # # From documentation of MelonnPan predict: # https://rdrr.io/github/biobakery/melonnpan/man/melonnpan.predict.html # # A numeric value corresponding to the significance level to find # the top PCs. If the significance level is 0.05, 0.01, 0.005, or 0.001, # the critical point should be set to be 0.9793, 2.0234, 2.4224, or 3.2724, # accordingly. The default is 0.9793 (i.e. 0.05 significance level). #melonnpan_critical_point = 0.9793 #melonnpan_critical_point = 2.0234 #melonnpan_critical_point = 2.4224 #melonnpan_critical_point = 3.2724 # OPTIONAL: melonnpan_correlation_method (default: pearson) # # Specification of correlation method for MelonnPan runs. # Default is Pearson, the other possibility is spearman. #melonnpan_correlation_method = pearson #melonnpan_correlation_method = spearman # OPTIONAL: melonnpan_weight_matrix (default: none) # # File with weight matrix, if available, or none # to use the MelonnPan's built in one. # Please, see MelonnPan's documentation for details. #melonnpan_weight_matrix = some_weights.txt # OPTIONAL: melonnpan_rtsi_file (default: none) # # RTSI file, if available, or none to use the # MelonnPan's built in one. # Please, see MelonnPan's documentation for details. #melonnpan_rtsi_file = some_rtsi.txt ##################################################### ##################################################### ##################################################### # # Advanced topics. The rest of this config file # describes certain advanced parameters as well # as the possibility to fine tune performance # of the MetaBakery workflow. # ##################################################### ##################################################### ##################################################### # OPTIONAL: rerun_failed_steps (default: Yes) # # NOTE: This step is meaningful only if option # preserve_history is enabled. # # Set to Yes to re-run steps that have failed during # previous workflow runs (default). Set to No to avoid # rerunning previously failed steps. # # When some MetaBakery step fails during processing, # this outcome is registered in a repository. Upon # further runs (with the same output directory), MetaBakery # tries to rerun previously failed steps, as part of its # built-in crash recovery, in a hope that new circumstances # will allow the very steps to complete successfully # (for example when disk space is freed after a disk-full condition). # # However, if it is known that previously failed steps would # fail again upon the next run, MetaBakery may be instructed # not to repeat them by setting config parameter rerun_failed_steps # to No. Generally, steps that have failed in the past # are likely to fail again without removing the cause of their # failure, and in that case their repetition is a waste of time. 
# # MetaBakery sets this parameter to Yes by default, because # experience shows that precisely disk-full condition is # frequently responsible for failures. Hence, when a user # cleans up the disk, MetaBakery may be rerun to complete # its processing without any further intervention. #rerun_failed_steps = No # OPTIONAL: symlink_results (default: Yes) # # This option is meaningful only when history # is enabled. In this case, results of individual # steps are located within separate directory, # termed 00_internal. This directory is not meant # to be examined by end users and it is not # structured in a user friendly form. # # Results are delivered to an end user by copying # or symlinking appropriate files from directory # 00_internal. # # Symbolic links occupy less disk space and are # created instantly, the actual copying of files # doubles their disk-space consumption, and it may # take a while in the case of large files. # # On the other hand, true copies of resulting files # enable more user friendly copying and moving # them around (e.g. without ls -L directive). # # Generally, these files are not prohibitively # large, when option deliver_preprocessed_files # below is set to No. In the opposite case, # resulting files may consume a noticeable disk # space. # # NOTE: if history is disabled, this option is # forced to be set to symlink_results=No. #symlink_results = No ##################################################### # # Splitting of large workloads # ##################################################### # ---------------------------------------------- # PLEASE SEE THE METABAKERY USERS' MANUAL # chapter Splitting of large workloads # for detailed instruction. # ---------------------------------------------- # # Recent metagenomic datasets have become enormously big. # Consequently, their computational requirements are larger # than a typical single computer is able to accommodate. # Often, a fairly capable High-Performance Computing facility # is required to complete the analysis in an acceptable time or at all. # # MetaBakery tries to push the limits of reality by being able to split # large datasets into arbitrary pieces, and process each piece on a # separate computing facility. Hence, it is possible to split computation # among several HPC facilities and/or other computers. When the entire set # of separate chunks finishes processing, the obtained results need to be # collected in order to finalize processing at the level of the entire # dataset. These final steps are: MelonnPan and StrainPhlAn analyses, # as well as several MetaPhlAn and HUMAnN post-processing steps. # # The goal of MetaBakery is to deliver the same end results, # as if the entire dataset was processed on a single piece of hardware # with a single workflow execution on the entire dataset at once. # Another advantage of this feature is that processing of a certain # dataset may begin before it is obtained entirely. Collection of # samples may take weeks or months, when e.g. large set of patients # from all over the world is involved. Nonetheless, processing of # individual pieces (R1_R2 fastq pairs or unpaired R1 files) may # begin immediately upon their availability without being forced # to wait for the entire collection to be obtained. # ---------------------------------------------------- # parameters for the splitting phase of processing # # PLEASE SEE THE METABAKERY USERS' MANUAL # chapter Splitting of large workloads # for detailed instruction. 
# ---------------------------------------------------- #summary_root_dir = /abs/path_to/some_summary_place # The parameter specifies a parent directory for # storage of partial results. Several MetaBakery runs # may use the same summary_root_dir value, since # each run stores the partial results under a subdirectory # with a name that is guaranteed to be unique. # # Results of MetaBakery executions on different computers # and especially on different geographic locations # store results separately on their availabe storage # facilities. At the end, all resulting subdirectories # need to be collected in a common subdirectory to # constellate an input for the final processing steps. # # NOTE: when parameter summary_root_dir is set, then # MelonnPan and Strainphlan processing is # disabled, since it is assumed that the # MetaBakery run in question analyzes only a subset # of the actual input dataset. The same holds for # all Metaphlan and Humann post-processing steps. # All these steps are expected to be executed # when results of individual chunks are collected # for the final processing (described below). # # sample2markers step is still carried on by default, # since its individual results are needed for the final # Strainphlan execution. If Strainphlan analysis is not # needed at the end, then sample2markers step may be # disabled as well, in order not to waste wall time of # computing facilities # (config parameter: enable_sample2markers = No). # ---------------------------------------------------- # parameters for the final phase of processing # after the splitting phase is completed # # PLEASE SEE THE METABAKERY USERS' MANUAL # chapter Splitting of large workloads # for detailed instruction. # ---------------------------------------------------- #in_summary_dir = /abs/path_to/collected_partial_results #in_dir = !!! MUST NOT BE SET in this step !!! #out_dir = abs/path_to/directory_for_the_final_output (optional) # Parameter in_summary_dir specifies a parent directory # in which all relevant partial summary_root_dir entries # are collected regardless of where or when they were processed. # ---------------------------------------------------- # Compression of results # ---------------------------------------------------- # Compression of partial or end results is primarily # intended to ease retrieval of results from HPC facilities. # Set the following directive to Yes to compress # summary or summary root directory into a tar.gz # file at the end of the partial or final workflow # execution. #compress_summary = Yes # Set the following directive to a bare(!) name of a # subdirectory of the application's work directory to # put the above tar.gz file into a subdirectory instead # of placing it directly into the work directory. #compress_summary_subdir_name = some_favourite_subdir_name ##################################################### # # Memory usage limiting # ##################################################### # OPTIONAL: memory_limit_type (default: None) # # It is a good idea to limit memory consumption # since many programs that are included in # MetaBakery may exhaust the available resources # fairly quickly, and destabilize the underlying # operating system by causing excessive swapping # of work memory to a disk. # # Parameter memory_limit_type sets an upper bound # on memory that is allowed to be consumed during # execution of processing steps. Possible settings # are as follows. 
# # None: no limitations; this may be a good # option for execution on your own # computer and you want to maximize # the chance that processing completes. # On the other hand, for High # Performance Computing (HPC) facilities, # some memory limiting is a recommended # choice, since an excessive swapping may # impair execution of other programs # (from other users) on the same machine. # # Free: set memory limit to the amount of free # memory as determined at the beginning # of an application run. # # Available: set memory limit to the amount of # available memory as determined at the # beginning of an application run. # # Total: set memory limit to the amount of Total # memory as determined at the beginning # of an application run. # # Manual: manually specify memory limit in Giga Bytes # with parameter manual_memory_limit (below). # # Option Free is the safest one, but it might be # too restrictive in certain situations. The three # automatic (non-manual) memory quotas are printed # on screen at the beginning of execution to get the # idea about their values on the hardware in question. # Please note that the less restrictive options # Available and especially Total may lead to a # certain amount of swapping, even if this is the # only application running on the computer. # # NOTE: even with option Free there is no guarantee # that swapping will not occur, but the amount # of it should be manageable/tolerable. # # # # If some processing step requires more memory than # the specified limit allows, then such step will # be terminated abruptly. #memory_limit_type = None #memory_limit_type = Free #memory_limit_type = Manual #memory_limit_type = Available #memory_limit_type = Total # REQUIRED IF PARAMETER memory_limit IS SET TO Manual: # manual_memory_limit_GB # # Manual specification of memory limit in Giga Bytes. # It is relevant only when parameter memory_limit # is set to Manual. Floating point values are allowed, # so it is possible to specify memory with a finer # granulation than in Giga Bytes unit. #manual_memory_limit_GB = 50 # 50 Giga Bytes #manual_memory_limit_GB = 20.3 # 20 Giga Bytes + 300 Mega Bytes ##################################################### # # Performance tuning # ##################################################### # # NOTE: this section is intended for advanced users # that are willing to delve into details about # the workflow's performance tuning. # # Default settings that are provided out of # the box, are (hopefully) a sensible starting # point. However, no generic settings can # be an optimal choice for all real-life # circumstances. # # By altering the default settings below # it may (or may not) be possible to achieve # a noticeable performance boost for a # particular computing environment and # input datasets. # # If you are willing to devote some time to # tweak these parameters and you succeed in # improving performance of your particular # setup, we would be grateful to hear from you # (bostjan.murovec@fe.uni-lj.si). # # We are interested in building a list of # recommended settings that work in various # circumstances. For that matter we would # be grateful for information about: # # 1. your optimization settings. # # 2. your computing environment (number of # CPUs, processor type, amount of RAM # memory, disk type (e.g. SATA, PATA, # RAID. SSD, HDD) and capacity, # version of Linux). # # 3. Size of your input data files in # zipped and/or unzipped form. # # 4. Selection of databases. # # 5. 
Any other information that may be # relevant for understanding and # application of your settings. # # Thank you in advance! # #---------------------------------------------------- # Workflow performance tuning consists of prescribing # a policy about which steps are executed in parallel. # # The decision is made by specifying thresholds that # set a cap on CPU use, disk use and memory use. # The parameters are as follows. # # OPTIONAL: cpu_threshold (decimal number) # default: as many CPUs as they are present, # or a number of threads that are specified # by parameter number_of_threads. # # OPTIONAL: cpu_overshoot (decimal number, default: 100 %) # # OPTIONAL: disk_threshold (decimal number) # default: 100, which means 100 % of disk # throughput capabilities (not a storage capacity). # Note that the workflow does not know the # actual disk capabilities. Hence this threshold # operates only under the presumption that # percentages of disk consumption of individual # programs are somewhat realistic. # # OPTIONAL: disk_overshoot (decimal number, default: 0 %) # # OPTIONAL: mem_threshold (decimal number) # default: available memory in 1 GB units # # OPTIONAL: mem_overshoot (default: 0 %) # # The meaning of these settings will be explained in a minute. # # # In addition to the above thresholds, there are settings # that hint about consumption of these resources by each kind # of execution process (gunzip, Mothur, Metaphlan, ...). # As an example, the default settings for the gunzip # (actually, pigz) program are as follows. #gunzip_cpu_count = 4 #gunzip_cpu_use = 50 (percent) #gunzip_disk_use = 15 (percent) #gunzip_mem_use = 0.01 (GB) # The stated figures are interpreted in the following way: # # 1. gunzip_cpu_count = 4: an isolated gunzip (pigz) # utilizes up to four threads, but not all the time. # The efficiency of use is taken care of with the # next parameter. # # 2. gunzip_cpu_use = 50: the mentioned four CPUs/threads # are utilized by gunzip (pigz) with an estimated # efficiency of 50 %. This means that typically the # workflow can assume that only two CPUs are fully # utilized by gunzipping. # # 2. gunzip_disk_use = 15: one gunzip run consumes # 15 % of disk throughput capacity, which indicates # that running six gunzip processes in parallel # nearly saturates disk subsystem. By running one # more gunzip in parallel would degrade workflow # execution performance due to disk over utilization. # # 4. gunzip_mem_use = 0.01: one gunzip run consumes # very little memory; an estimate is 1/100 of GB. # # The stated parameters and thresholds are specified # as decimal numbers to enable their fine-grained settings. # # These settings are merely hints to an execution controller. # They are not meant as a commitment or a hard limit. # Nothing fatal happens, if these numbers are grossly # underestimated or overestimated, except that execution # performance of the entire workflow may be degraded. # # Furthermore, many processes are highly parallel # and generally they are able to consume all available # CPUs, no matter how many of them there are present # (up to a certain limit, of course). For such cases, # it is possible to set parameters ...cpu_count to $ the value of 0, which is interpreted as the number of # CPUs that are actually present or specified by the # option number_of_threads. This way, a generic performance # tuning of such processes is able to sensibly cover # various hardware configurations, at least to some extent. 
# # It is also possible to set parameters ...cpu_count # to a negative number, say, -64. This is interpreted as: # set the number of CPUs to the number of available ones, # but do not exceed the limit of 64. # Hence, in this example, the parameter would be set # internally to 64, if the underlying computer consists # of 64 or more CPUs, but it would be set to the actual # number of CPUs, if there are less than 64 of them. # # After the number of CPUs is set according to the # above description, the respective parameter ...cpu_use # expresses percentage of their utilization. The workflow # calculates the number of utilized CPUs simply by # multiplying the two numbers: # # estimated_number_of_consumed_cpus = # ...cpu_count * ...cpu_use / 100 # # Note that all three numbers can be decimal numbers. # # The only reason that there are two parameters to # specify CPU utilization is the ability of making # generic settings with ...cpu_count=0 or ...cpu_count<0, # which scale to a certain extent to various hardware # configurations. # # The workflow's execution controller calculates the # above estimated_number_of_consumed_cpus for each # process that needs to be executed. Then is executes # as many of them in parallel as possible under the # constraint that the sum of these estimates for the # running processes never exceeds a certain amount. # # The actual limit is determined by parameter # cpu_threshold (mentioned at the beginning # of this section), which is additionally altered # by parameter cpu_overshoot (in percent): # # maximal_cpu_use = cpu_threshold * (1 + cpu_overshoot / 100). # # NOTE: cpu_overshoot may be a negative number which # broadens the set of possible policy specifications. # # The task controller executes only as much processes # in parallel that the limit maximal_cpu_use is respected. # In addition, if the currently running processes # exceed the value of cpu_threshold itself, then # new processes are not started, unless there are available # processes with little CPU consumption, but with high disk # consumption. Execution controlled decides about the # actual meaning of these figures. # # Similarly for disk and memory consumption. The task # controller executes processes in parallel only, # if none of the three thresholds is exceeded # according to the above description. # # Disk consumption and threshold are specified only # as a percentage or ratio, so there are only # parameters ..._disk_use, but no parameters # ..._disk_count. # # On the other hand, memory is again a bit more # complicated, since some programs always consume # a certain amount of it (which typically depends # on their input). But there are also programs # that check the available memory and they # boost its consumption accordingly to enhance # performance. # # Hence, parameters ..._mem_use can be set to # # 1. a positive number to express absolute # memory consumption in gigabytes, or # # 2. a negative number to express a # percentage of available memory. # # 3. zero to make memory consumption # equal to the available memory. # NOTE: if an isolated process exceeds any threshold # by itself, possibly due to a misconfiguration, # then such process is still executed, but in a # complete isolation. # # For example, the following settings: # # gunzip_disk_use = 160 # disk_threshold = 100 # # do not prevent gunzip from being executed, # but they force it to be executed in isolation. # This way, all processing steps get executed # along the workflow progression, regardless # of their tuning specifications. 
#
# NOTE: by altering the overall thresholds, it is possible
# to make the system globally more or less aggressive
# in its attempt to execute processes in parallel.
# Hence, if empirical monitoring of the workflow shows
# that the CPU or the disk is over- or under-utilized,
# the situation may be remedied by changing only the
# global settings, without the need to fine-tune
# individual programs.
#
# On the other hand, if the system behaves well most
# of the time, but only one or a few of its steps
# show unsuitable behavior, then the situation
# may be improved by altering the specific settings
# of those steps.

###############################################################
# The settings below express the default values that
# are applied by the workflow if these parameters are
# not set in a config file. These figures can be a starting
# point for custom performance tuning.
#
# For example, if you observe that a certain processing
# step saturates the disk subsystem on your hardware, then
# the situation can be remedied by increasing its respective
# parameter xxx_disk_use, so that the workflow scheduler
# will try less aggressively to run this step in parallel
# with other steps that also noticeably consume the disk.
#
# Similarly for CPU and memory. Memory settings are
# especially problematic, since actual memory consumption
# is highly dependent on the input data. If you observe that
# some processing step consumes large amounts of memory,
# then set its respective parameter xxx_mem_use to
# a larger value.
#
# In contrast, if an isolated processing step is not
# problematic by itself, but the entire workflow
# consumes too much or too little computing resources,
# then change the overall thresholds by setting
# the first set of six parameters below.
###############################################################

#---------------------------------------
# tuning of overall workflow performance
#---------------------------------------

#cpu_threshold = actual_number_of_CPUs/hyperthreads
#cpu_overshoot = 100       (percent)
#disk_threshold = 100      (percent)
#disk_overshoot = 0        (percent)
#mem_threshold = actual number of available GB
#mem_overshoot = 0         (percent)

#---------------------------------------
# tuning of individual processing steps
#---------------------------------------

# Aside from the previously described parameters,
# there also exist parameters that hard-limit the CPU
# allocation of certain processing steps.
# The values of these parameters were obtained by
# measurements/observations of behavior on 256-CPU
# HPC nodes, where it was established that increasing
# the number of CPUs/threads above certain limits
# gives no additional execution speedup. Hence, CPU
# resources are better utilized by allocating them
# to other tasks.
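#
# For illustration only (a hypothetical override, not a
# recommendation): on a machine where such a hard-limited step
# should be constrained even further, its cap can simply be
# lowered below the built-in default, e.g.
#
#max_kneaddata_threads = 24
#
# The built-in per-step defaults follow.
#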
#gunzip_cpu_count = 4
#gunzip_cpu_use = 50
#gunzip_disk_use = 15
#gunzip_mem_use = 0.01

#kneaddata_cpu_count = 1
#kneaddata_cpu_use = 120
#kneaddata_disk_use = 20
#kneaddata_mem_use = 10
#
# hard limit for the number of CPUs
#max_kneaddata_threads = 48

#metaphlan3_cpu_count = 0
#metaphlan3_cpu_use = 80
#metaphlan3_disk_use = 20
#metaphlan3_mem_use = 10

#metaphlan2_cpu_count = 0
#metaphlan2_cpu_use = 80
#metaphlan2_disk_use = 20
#metaphlan2_mem_use = 10

# The hard limit for the number of CPUs (both versions of MetaPhlAn)
# depends on the number of fastq inputs to be processed:
#
# for one fastq input (one R1_R2 pair, or one unpaired file)
#max_metaphlan_threads = 256
#
# for two fastq inputs
#max_metaphlan_threads = 96
#
# for three or more fastq inputs
#max_metaphlan_threads = 64
#
# The reason for the above policy is that increasing the
# number of MetaPhlAn threads beyond 64 does speed up the
# step a bit, but not linearly. Hence, in the case of more
# than one input, computing resources are better utilized
# by processing more tasks in parallel.

#count_features_cpu_count = 1
#count_features_cpu_use = 100
#count_features_disk_use = 2
#count_features_mem_use = 1

#sample2markers3_cpu_count = 1
#sample2markers3_cpu_use = 90
#sample2markers3_disk_use = 10
#sample2markers3_mem_use = 10

#sample2markers_cpu_count = 1
#sample2markers_cpu_use = 90
#sample2markers_disk_use = 10
#sample2markers_mem_use = 10

# The hard limit for the number of CPUs (both versions of sample2markers)
# depends on the number of fastq inputs to be processed:
#
# for one fastq input (one R1_R2 pair, or one unpaired file)
#max_sample2markers_threads = 256
#
# for two fastq inputs
#max_sample2markers_threads = 96
#
# for three or more fastq inputs
#max_sample2markers_threads = 64
#
# The reason for the above policy is that increasing the
# number of threads of this step beyond 64 does speed it up
# a bit, but not linearly. Hence, in the case of more than
# one input, computing resources are better utilized by
# processing more tasks in parallel.

#humann3_cpu_count = 0
#humann3_cpu_use = 45
#humann3_disk_use = 15
#humann3_mem_use = 10

#humann2_cpu_count = 0
#humann2_cpu_use = 45
#humann2_disk_use = 20
#humann2_mem_use = 10

# The hard limit for the number of CPUs (both versions of HUMAnN)
# depends on the number of fastq inputs to be processed:
#
# for one fastq input (one R1_R2 pair, or one unpaired file)
#max_humann_threads = 256
#
# for two fastq inputs
#max_humann_threads = 96
#
# for three or more fastq inputs
#max_humann_threads = 64
#
# The reason for the above policy is that increasing the
# number of threads of this step beyond 64 does speed it up
# a bit, but not linearly. Hence, in the case of more than
# one input, computing resources are better utilized by
# processing more tasks in parallel.
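#
# As a concrete illustration of this policy (hypothetical figures):
# on a 256-CPU node with three or more fastq inputs, each run of
# such a step is capped at 64 threads, so several inputs can be
# processed side by side instead of queueing behind a single run
# that would occupy the whole node with diminishing returns beyond
# 64 threads.
#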
#humann2_get_counts_from_logs_cpu_count = 1
#humann2_get_counts_from_logs_cpu_use = 100
#humann2_get_counts_from_logs_disk_use = 2
#humann2_get_counts_from_logs_mem_use = 1

#humann2_regroup_table_cpu_count = 1
#humann2_regroup_table_cpu_use = 100
#humann2_regroup_table_disk_use = 2
#humann2_regroup_table_mem_use = 1

#humann2_renorm_table_cpu_count = 1
#humann2_renorm_table_cpu_use = 100
#humann2_renorm_table_disk_use = 2
#humann2_renorm_table_mem_use = 1

#humann2_join_tables_cpu_count = 1
#humann2_join_tables_cpu_use = 100
#humann2_join_tables_disk_use = 2
#humann2_join_tables_mem_use = 1

#humann3_get_counts_from_logs_cpu_count = 1
#humann3_get_counts_from_logs_cpu_use = 100
#humann3_get_counts_from_logs_disk_use = 2
#humann3_get_counts_from_logs_mem_use = 1

#humann3_regroup_table_cpu_count = 1
#humann3_regroup_table_cpu_use = 100
#humann3_regroup_table_disk_use = 2
#humann3_regroup_table_mem_use = 1

#humann3_renorm_table_cpu_count = 1
#humann3_renorm_table_cpu_use = 100
#humann3_renorm_table_disk_use = 2
#humann3_renorm_table_mem_use = 1

#humann3_join_tables_cpu_count = 1
#humann3_join_tables_cpu_use = 100
#humann3_join_tables_disk_use = 2
#humann3_join_tables_mem_use = 1

#melonnpan_cpu_count = 1
#melonnpan_cpu_use = 100
#melonnpan_disk_use = 20
#melonnpan_mem_use = 2

#strainphlan2_cpu_count = 1
#strainphlan2_cpu_use = 100
#strainphlan2_disk_use = 10
#strainphlan2_mem_use = 10

#strainphlan3_cpu_count = 0
#strainphlan3_cpu_use = 50
#strainphlan3_disk_use = 10
#strainphlan3_mem_use = 10

# hard limit for the number of StrainPhlAn CPUs
#max_strainphlan_threads = 64

#order_clade_list_cpu_count = 1
#order_clade_list_cpu_use = 100
#order_clade_list_disk_use = 1
#order_clade_list_mem_use = 0.5

#extract_markers2_cpu_count = 1
#extract_markers2_cpu_use = 100
#extract_markers2_disk_use = 3
#extract_markers2_mem_use = 10

#extract_markers3_cpu_count = 0
#extract_markers3_cpu_use = 90
#extract_markers3_disk_use = 3
#extract_markers3_mem_use = 10

#----------------------------------------------------
# OPTIONAL: schedule_later_steps_first (default: Yes)
#           schedule_more_disk_first   (default: No)
#
# Heuristic hints to the scheduler about how to select
# processes for execution among the ones that are
# ready to start.
#
# If schedule_later_steps_first is set to Yes, then the
# scheduler tries to first run processes that belong
# sequentially to later stages of pipeline processing.
#
# The positive side of this policy is that processes
# at the end of the pipeline are processed as soon as
# possible, which enables earlier deletion of
# intermediate files and potentially results in
# less burden on disk space.
#
# The drawback of this policy is that the opposite approach,
# processing earlier steps of the pipeline first, makes more
# of their successors ready to execute at any given moment,
# by means of which the scheduler has a greater selection
# of steps at its disposal, so it can utilize computational
# resources more efficiently.
#
# If schedule_more_disk_first is set to Yes, then
# the scheduler tries to prioritize steps that consume
# more disk throughput. In the opposite case, steps
# with lower disk throughput consumption are prioritized.
#
# The positive side of the former is that disk-intensive
# steps are completed first, which enables their (presumably)
# CPU-intensive successors to run as soon as possible.
#
# However, the drawback is that disk-intensive steps
# may postpone execution of independent CPU-intensive
# steps, which leads to low CPU utilization.
#schedule_later_steps_first = No
#schedule_more_disk_first = Yes

#----------------------------------------------------
# Configuration of a special benchmark mode, which is
# intended as an aid in benchmarking to obtain realistic
# values for the previously discussed performance
# settings (xxx_cpu_count, xxx_cpu_use, xxx_disk_use
# and xxx_mem_use).
#
# In the benchmark mode, a selected subset of
# processing steps is executed in isolation
# from the rest of the framework. This way
# it is possible to monitor how targeted
# parameter changes affect execution of
# the specific processing group.
#
# The benchmark mode passes through three
# processing stages. During the first stage,
# as many non-selected processing steps as
# possible are executed. The aim is to make
# as many selected steps as possible ready for
# execution by executing their predecessors.
#
# When no such steps are available any more,
# benchmarking switches to the second stage.
# Here, the entire set of selected processing
# steps is executed without interference from
# other steps. Execution of successors of the
# selected steps is postponed until the entire
# selected set finishes its execution.
#
# After this happens, the workflow enters the
# third stage, which is actually the end of
# the special benchmark mode. From this point
# on, the successors and the rest of the workflow
# finish their normal course of execution.

# OPTIONAL: benchmark_mode (default: No)
#
# Set to Yes to execute the workflow in benchmark mode.

#benchmark_mode = Yes

# OPTIONAL: benchmark_name (string)
#
# Name fragment of the processing steps that are
# selected for execution in isolation. The fragment
# is matched against the step names that are written
# on screen as informative messages about execution.
#
# For example, to isolate KneadData
# execution, set "benchmark_name = KneadData".
#
# NOTE: this parameter is not case sensitive.

#benchmark_name = kneaddata

# OPTIONAL: benchmark_single (default: No)
#
# Set to Yes to execute each step within the selected
# set in isolation. Set to No to execute in parallel
# as many steps within the selected set as the performance
# settings allow.

#benchmark_single = Yes
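#
# For illustration, a minimal benchmark-mode configuration that
# combines the three parameters above, isolating the KneadData
# steps from the rest of the workflow while still letting them run
# in parallel with each other (as far as the performance settings
# allow), could look like this:
#
#benchmark_mode = Yes
#benchmark_name = kneaddata
#benchmark_single = No
#
# With benchmark_single = Yes instead, each selected KneadData run
# would be measured in complete isolation.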