[med-svn] [Git][med-team/q2-dada2][upstream] New upstream version 2022.8.0
Mohd Bilal (@rmb)
gitlab at salsa.debian.org
Wed Sep 7 15:49:13 BST 2022
Mohd Bilal pushed to branch upstream at Debian Med / q2-dada2
Commits:
072bd304 by Mohammed Bilal at 2022-09-07T14:12:59+00:00
New upstream version 2022.8.0
- - - - -
10 changed files:
- .gitignore
- ci/recipe/meta.yaml
- q2_dada2/_denoise.py
- q2_dada2/_version.py
- + q2_dada2/assets/run_dada.R
- − q2_dada2/assets/run_dada_ccs.R
- − q2_dada2/assets/run_dada_paired.R
- − q2_dada2/assets/run_dada_single.R
- q2_dada2/plugin_setup.py
- setup.py
Changes:
=====================================
.gitignore
=====================================
@@ -69,3 +69,4 @@ target/
*.qzv
.DS_Store
+.Rhistory
=====================================
ci/recipe/meta.yaml
=====================================
@@ -18,8 +18,9 @@ requirements:
run:
- python {{ python }}
- - biom-format >=2.1.5,<2.2a0
+ - biom-format {{ biom_format }}
- bioconductor-dada2 1.22.0
+ - r-optparse >=1.7.1
# openjdk is not a real dependency, but, r-base has a post-link and post-
# activation hook that calls R CMD javareconf, which pokes around for any
# installations of java. On modern versions of macOS, a binary called
=====================================
q2_dada2/_denoise.py
=====================================
@@ -52,6 +52,7 @@ _POOL_STR = (lambda x: x in {'pseudo', 'independent'},
'pseudo or independent')
_CHIM_STR = (lambda x: x in {'pooled', 'consensus', 'none'},
'pooled, consensus or none')
+_BOOLEAN = (lambda x: type(x) is bool, 'True or False')
# Better to choose to skip, than to implicitly ignore things that KeyError
_SKIP = (lambda x: True, '')
_valid_inputs = {
@@ -72,6 +73,7 @@ _valid_inputs = {
'pooling_method': _POOL_STR,
'chimera_method': _CHIM_STR,
'min_fold_parent_over_abundance': _NAT_NUM,
+ 'allow_one_off': _BOOLEAN,
'n_threads': _WHOLE_NUM,
# 0 is technically allowed, but we don't want to support it because it only
# takes all reads from the first sample (alphabetically by sample id)
@@ -100,6 +102,10 @@ def _filepath_to_sample(fp):
return fp.rsplit('_', 4)[0]
+# Since `denoise-single` and `denoise-pyro` are almost identical, break out
+# the bulk of the functionality to this helper util. Typechecking is assumed
+# to have occurred in the calling functions; this is primarily for making
+# sure that DADA2 is able to do what it needs to do.
def _denoise_helper(biom_fp, track_fp, hashed_feature_ids):
_check_featureless_table(biom_fp)
with open(biom_fp) as fh:
@@ -162,13 +168,9 @@ def _denoise_helper(biom_fp, track_fp, hashed_feature_ids):
return table, rep_sequences, metadata
-# Since `denoise-single` and `denoise-pyro` are almost identical, break out
-# the bulk of the functionality to this helper util. Typechecking is assumed
-# to have occurred in the calling functions, this is primarily for making
-# sure that DADA2 is able to do what it needs to do.
def _denoise_single(demultiplexed_seqs, trunc_len, trim_left, max_ee, trunc_q,
max_len, pooling_method, chimera_method,
- min_fold_parent_over_abundance,
+ min_fold_parent_over_abundance, allow_one_off,
n_threads, n_reads_learn, hashed_feature_ids,
homopolymer_gap_penalty, band_size):
_check_inputs(**locals())
@@ -178,19 +180,31 @@ def _denoise_single(demultiplexed_seqs, trunc_len, trim_left, max_ee, trunc_q,
if max_len != 0 and max_len < trunc_len:
raise ValueError("trunc_len (%r) must be no bigger than max_len (%r)"
% (trunc_len, max_len))
- # Coerce for `run_dada_single.R`
+ # Coerce for single end read analysis
max_len = 'Inf' if max_len == 0 else max_len
with tempfile.TemporaryDirectory() as temp_dir_name:
biom_fp = os.path.join(temp_dir_name, 'output.tsv.biom')
track_fp = os.path.join(temp_dir_name, 'track.tsv')
- cmd = ['run_dada_single.R',
- str(demultiplexed_seqs), biom_fp, track_fp, temp_dir_name,
- str(trunc_len), str(trim_left), str(max_ee), str(trunc_q),
- str(max_len), str(pooling_method), str(chimera_method),
- str(min_fold_parent_over_abundance), str(n_threads),
- str(n_reads_learn), str(homopolymer_gap_penalty),
- str(band_size)]
+
+ cmd = ['run_dada.R',
+ '--input_directory', str(demultiplexed_seqs),
+ '--output_path', biom_fp,
+ '--output_track', track_fp,
+ '--filtered_directory', temp_dir_name,
+ '--truncation_length', str(trunc_len),
+ '--trim_left', str(trim_left),
+ '--max_expected_errors', str(max_ee),
+ '--truncation_quality_score', str(trunc_q),
+ '--max_length', str(max_len),
+ '--pooling_method', str(pooling_method),
+ '--chimera_method', str(chimera_method),
+ '--min_parental_fold', str(min_fold_parent_over_abundance),
+ '--allow_one_off', str(allow_one_off),
+ '--num_threads', str(n_threads),
+ '--learn_min_reads', str(n_reads_learn),
+ '--homopolymer_gap_penalty', str(homopolymer_gap_penalty),
+ '--band_size', str(band_size)]
try:
run_commands([cmd])
except subprocess.CalledProcessError as e:
@@ -212,6 +226,7 @@ def denoise_single(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
trunc_q: int = 2, pooling_method: str = 'independent',
chimera_method: str = 'consensus',
min_fold_parent_over_abundance: float = 1.0,
+ allow_one_off: bool = False,
n_threads: int = 1, n_reads_learn: int = 1000000,
hashed_feature_ids: bool = True
) -> (biom.Table, DNAIterator, qiime2.Metadata):
@@ -225,6 +240,7 @@ def denoise_single(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
pooling_method=pooling_method,
chimera_method=chimera_method,
min_fold_parent_over_abundance=min_fold_parent_over_abundance,
+ allow_one_off=allow_one_off,
n_threads=n_threads,
n_reads_learn=n_reads_learn,
hashed_feature_ids=hashed_feature_ids,
@@ -240,6 +256,7 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
pooling_method: str = 'independent',
chimera_method: str = 'consensus',
min_fold_parent_over_abundance: float = 1.0,
+ allow_one_off: bool = False,
n_threads: int = 1, n_reads_learn: int = 1000000,
hashed_feature_ids: bool = True
) -> (biom.Table, DNAIterator, qiime2.Metadata):
@@ -266,15 +283,27 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
elif 'R2_001.fastq' in rp.name:
qiime2.util.duplicate(fp, os.path.join(tmp_reverse, rp.name))
- cmd = ['run_dada_paired.R',
- tmp_forward, tmp_reverse, biom_fp, track_fp, filt_forward,
- filt_reverse,
- str(trunc_len_f), str(trunc_len_r),
- str(trim_left_f), str(trim_left_r),
- str(max_ee_f), str(max_ee_r), str(trunc_q),
- str(min_overlap), str(pooling_method),
- str(chimera_method), str(min_fold_parent_over_abundance),
- str(n_threads), str(n_reads_learn)]
+ cmd = ['run_dada.R',
+ '--input_directory', tmp_forward,
+ '--input_directory_reverse', tmp_reverse,
+ '--output_path', biom_fp,
+ '--output_track', track_fp,
+ '--filtered_directory', filt_forward,
+ '--filtered_directory_reverse', filt_reverse,
+ '--truncation_length', str(trunc_len_f),
+ '--truncation_length_reverse', str(trunc_len_r),
+ '--trim_left', str(trim_left_f),
+ '--trim_left_reverse', str(trim_left_r),
+ '--max_expected_errors', str(max_ee_f),
+ '--max_expected_errors_reverse', str(max_ee_r),
+ '--truncation_quality_score', str(trunc_q),
+ '--min_overlap', str(min_overlap),
+ '--pooling_method', str(pooling_method),
+ '--chimera_method', str(chimera_method),
+ '--min_parental_fold', str(min_fold_parent_over_abundance),
+ '--allow_one_off', str(allow_one_off),
+ '--num_threads', str(n_threads),
+ '--learn_min_reads', str(n_reads_learn)]
try:
run_commands([cmd])
except subprocess.CalledProcessError as e:
@@ -301,6 +330,7 @@ def denoise_pyro(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
pooling_method: str = 'independent',
chimera_method: str = 'consensus',
min_fold_parent_over_abundance: float = 1.0,
+ allow_one_off: bool = False,
n_threads: int = 1, n_reads_learn: int = 250000,
hashed_feature_ids: bool = True
) -> (biom.Table, DNAIterator, qiime2.Metadata):
@@ -314,10 +344,11 @@ def denoise_pyro(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
pooling_method=pooling_method,
chimera_method=chimera_method,
min_fold_parent_over_abundance=min_fold_parent_over_abundance,
+ allow_one_off=allow_one_off,
n_threads=n_threads,
n_reads_learn=n_reads_learn,
hashed_feature_ids=hashed_feature_ids,
- homopolymer_gap_penalty='-1',
+ homopolymer_gap_penalty='1',
band_size='32')
@@ -329,6 +360,7 @@ def denoise_ccs(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
pooling_method: str = 'independent',
chimera_method: str = 'consensus',
min_fold_parent_over_abundance: float = 3.5,
+ allow_one_off: bool = False,
n_threads: int = 1, n_reads_learn: int = 1000000,
hashed_feature_ids: bool = True
) -> (biom.Table, DNAIterator, qiime2.Metadata):
@@ -339,7 +371,7 @@ def denoise_ccs(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
if max_len != 0 and max_len < trunc_len:
raise ValueError("trunc_len (%r) must be no bigger than max_len (%r)"
% (trunc_len, max_len))
- # Coerce for `run_dada_ccs.R`
+ # Coerce for ccs read analysis
max_len = 'Inf' if max_len == 0 else max_len
with tempfile.TemporaryDirectory() as temp_dir_name:
@@ -350,13 +382,30 @@ def denoise_ccs(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
for fp in nop_fp, filt_fp:
os.mkdir(fp)
- cmd = ['run_dada_ccs.R',
- str(demultiplexed_seqs), biom_fp, track_fp, nop_fp, filt_fp,
- str(front), str(adapter), str(max_mismatch), str(indels),
- str(trunc_len), str(trim_left), str(max_ee), str(trunc_q),
- str(min_len), str(max_len), str(pooling_method),
- str(chimera_method), str(min_fold_parent_over_abundance),
- str(n_threads), str(n_reads_learn)]
+ cmd = ['run_dada.R',
+ '--input_directory', str(demultiplexed_seqs),
+ '--output_path', biom_fp,
+ '--output_track', track_fp,
+ '--removed_primer_directory', nop_fp,
+ '--filtered_directory', filt_fp,
+ '--forward_primer', str(front),
+ '--reverse_primer', str(adapter),
+ '--max_mismatch', str(max_mismatch),
+ '--indels', str(indels),
+ '--truncation_length', str(trunc_len),
+ '--trim_left', str(trim_left),
+ '--max_expected_errors', str(max_ee),
+ '--truncation_quality_score', str(trunc_q),
+ '--min_length', str(min_len),
+ '--max_length', str(max_len),
+ '--pooling_method', str(pooling_method),
+ '--chimera_method', str(chimera_method),
+ '--min_parental_fold', str(min_fold_parent_over_abundance),
+ '--allow_one_off', str(allow_one_off),
+ '--num_threads', str(n_threads),
+ '--learn_min_reads', str(n_reads_learn),
+ '--homopolymer_gap_penalty', 'NULL',
+ '--band_size', '32']
try:
run_commands([cmd])
except subprocess.CalledProcessError as e:
=====================================
q2_dada2/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2022.2.0)"
- git_full = "f58e203cfcf4082efacaa3e7f8d348919de72b90"
- git_date = "2022-02-18 19:05:49 +0000"
+ git_refnames = " (tag: 2022.8.0)"
+ git_full = "dc9bdc5b0346353ebf9a63bc63a60bd45a3405b6"
+ git_date = "2022-08-23 17:11:59 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
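For reference, the three git_* keywords above are substituted by `git archive` (via the export-subst attribute), and versioneer recovers the version by parsing the tag out of git_refnames at import time. Roughly, glossing over versioneer's error handling and tag-prefix filtering:

    refnames = " (tag: 2022.8.0)"   # value baked into the source archive
    refs = {r.strip() for r in refnames.strip("() ").split(",")}
    tags = {r[len("tag: "):] for r in refs if r.startswith("tag: ")}
    version = sorted(tags)[0]       # -> "2022.8.0"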
=====================================
q2_dada2/assets/run_dada.R
=====================================
@@ -0,0 +1,496 @@
+#!/usr/bin/env Rscript
+
+###################################################
+# This R script takes an input directory of .fastq.gz files (and, for
+# paired-end data, a matching reverse-read directory) and outputs a tsv
+# file of the dada2 processed sequence table. It is intended for use
+# with the QIIME2 plugin for DADA2.
+#
+# Ex: Rscript run_dada.R --input_directory input_dir --output_path output.tsv \
+#     --output_track track.tsv --filtered_directory filtered_dir \
+#     --truncation_length 200 --trim_left 0 --max_expected_errors 2.0 \
+#     --truncation_quality_score 2 --max_length Inf --pooling_method independent \
+#     --chimera_method consensus --min_parental_fold 1.0 --num_threads 1 \
+#     --learn_min_reads 1000000 --homopolymer_gap_penalty NULL --band_size 32
+####################################################
+
+####################################################
+# DESCRIPTION OF ARGUMENTS #
+####################################################
+# NOTE: All numeric arguments should be zero or positive.
+# NOTE: All numeric arguments save maxEE are expected to be integers.
+# NOTE: Currently the filtered_directory must already exist.
+# NOTE: ALL ARGUMENTS ARE PASSED AS NAMED OPTIONS!
+#
+### FILE SYSTEM ARGUMENTS ###
+#
+# 1) input_directory - File path to directory with the .fastq.gz files to be processed.
+# Ex: path/to/dir/with/fastqgzs
+#
+# 2) output_path - File path to output tsv file. If already exists, will be overwritten.
+# Ex: path/to/output_file.tsv
+#
+# 3) output_track - File path to tracking tsv file. If already exists, will be overwritten.
+# Ex: path/to/tracking_stats.tsv
+#
+# 4) removed_primer_directory - File path to directory in which to write the primer.removed .fastq.gz
+# files. These files are intermediate for the full workflow.
+# Currently they remain after the script finishes.
+# Directory must already exist.
+# Ex: path/to/dir/with/fastqgzs/primerremoved
+#
+# 5) filtered_directory - File path to directory in which to write the filtered .fastq.gz files.
+# These files are intermediate for the full workflow.
+# Currently they remain after the script finishes.
+# Directory must already exist.
+# Ex: path/to/dir/with/fastqgzs/filtered
+#
+### PRIMER REMOVING ARGUMENTS ###
+#
+# 6) forward_primer - Primer front of Pacbio CCS sequences.
+# Ex: 'AGRGTTYGATYMTGGCTCAG'
+#
+# 7) reverse_primer - Primer adapter of Pacbio CCS sequences.
+# Ex: 'RGYTACCTTGTTACGACTT'
+#
+# 8) max_mismatch - The number of mismatches to tolerate when matching
+# reads to primer sequences.
+# Ex: 2
+#
+# 9) indels - Allow insertions or deletions of bases when matching adapters.
+# Ex: FALSE
+#
+### FILTERING ARGUMENTS ###
+#
+# 10) truncation_length - The position at which to truncate reads. Reads shorter
+# than truncation_length will be discarded.
+# Special values: 0 - no truncation or length filtering.
+# Ex: 150
+#
+# 11) trim_left - The number of nucleotides to remove from the start of
+# each read. Should be less than truncation_length for obvious reasons.
+# Ex: 0
+#
+# 12) max_expected_errors - Reads with expected errors higher than maxEE are discarded.
+# Ex: 2.0
+#
+# 13) truncation_quality_score - Reads are truncated at the first instance of quality score truncation_quality_score.
+# If the read is then shorter than truncation_length, it is discarded.
+# Ex: 2
+#
+# 14) min_length - Remove reads with length shorter than min_length. min_length is enforced
+# after trimming and truncation.
+# Ex: 20
+#
+# 15) max_length - Remove reads with length greater than max_length. max_length is enforced on the raw reads.
+# Default Inf - no maximum.
+# Ex: 300
+#
+### SENSITIVITY ARGUMENTS ###
+#
+# 16) pooling_method - The method used to pool (or not) samples during denoising.
+# Valid options are:
+# independent: (Default) No pooling, samples are denoised independently.
+# pseudo: Samples are "pseudo-pooled" for denoising.
+# Ex: independent
+#
+#
+### CHIMERA ARGUMENTS ###
+#
+# 17) chimera_method - The method used to remove chimeras. Valid options are:
+# none: No chimera removal is performed.
+# pooled: All reads are pooled prior to chimera detection.
+# consensus: Chimeras are detected in samples individually, and a consensus decision
+# is made for each sequence variant.
+# Ex: consensus
+#
+# 18) min_parental_fold - The minimum abundance of potential "parents" of a sequence being
+# tested as chimeric, expressed as a fold-change versus the abundance of the sequence being
+# tested. Values should be greater than or equal to 1 (i.e. parents should be more
+# abundant than the sequence being tested).
+# Ex: 1.0
+#
+### SPEED ARGUMENTS ###
+#
+# 19) num_threads - The number of threads to use.
+# Special values: 0 - detect available cores and use all.
+# Ex: 1
+#
+# 20) learn_min_reads - The minimum number of reads to learn the error model from.
+# Special values: 0 - Use all input reads.
+# Ex: 1000000
+#
+### GLOBAL OPTION ARGUMENTS ###
+#
+# 21) homopolymer_gap_penalty - The cost of gaps in homopolymer regions (>=3 repeated bases).
+# Default is NULL, which causes homopolymer gaps
+# to be treated as normal gaps.
+# Ex: 1 (positive values are negated by the script, since negative
+# numbers cannot be passed via optparse)
+#
+# 22) band_size - When set, banded Needleman-Wunsch alignments are performed.
+# The default value of band_size is 16. Setting band_size to a negative
+# number turns off banding (i.e. full Needleman-Wunsch).
+# Ex: 32
+#
+
+library("optparse")
+
+cat(R.version$version.string, "\n")
+errQuit <- function(mesg, status=1) { message("Error: ", mesg); q(status=status) }
+getN <- function(x) sum(getUniques(x)) #Function added from paired read processing
+
+option_list = list(
+ make_option(c("--input_directory"), action="store", default='NULL', type='character',
+ help="File path to directory with the .fastq.gz files to be processed"),
+ make_option(c("--input_directory_reverse"), action="store", default='NULL', type='character',
+ help="File path to reverse directory with the .fastq.gz files to be processed. Only used in paired-end processing"),
+ make_option(c("--output_path"), action="store", default='NULL', type='character',
+ help="File path to output tsv file. If already exists, will be overwritten"),
+ make_option(c("--output_track"), action="store", default='NULL', type='character',
+ help="File path to tracking tsv file. If already exists, will be overwritten"),
+ make_option(c("--removed_primer_directory"), action="store", default='NULL', type='character',
+ help="File path to directory in which to write the primer.removed .fastq.gz files"),
+ make_option(c("--filtered_directory"), action="store", default='NULL', type='character',
+ help="File path to directory in which to write the filtered .fastq.gz files. These files are intermediate"),
+ make_option(c("--filtered_directory_reverse"), action="store", default='NULL', type='character',
+ help="File path to directory in which to write the reverse filtered .fastq.gz files. These files are intermediate. Only used in paired-end processing"),
+ make_option(c("--forward_primer"), action="store", default='NULL', type='character',
+ help="Primer front of Pacbio CCS sequences"),
+ make_option(c("--reverse_primer"), action="store", default='NULL', type='character',
+ help="Primer adapter of Pacbio CCS sequences"),
+ make_option(c("--max_mismatch"), action="store", default='NULL', type='character',
+ help="The number of mismatches to tolerate when matching reads to primer sequences."),
+ make_option(c("--indels"), action="store", default='NULL', type='character',
+ help="Allow insertions or deletions of bases when matching adapters"),
+ make_option(c("--truncation_length"), action="store", default='NULL', type='character',
+ help="The position at which to truncate reads. Reads shorter than truncation_length will be discarded."),
+ make_option(c("--truncation_length_reverse"), action="store", default='NULL', type='character',
+ help="The position at which to truncate reverse reads. Reads shorter than truncation_length_reverse will be discarded. Only used in paired-end processing"),
+ make_option(c("--trim_left"), action="store", default='NULL', type='character',
+ help="The number of nucleotides to remove from the start of each read. Should be less than truncation_length for obvious reasons"),
+ make_option(c("--trim_left_reverse"), action="store", default='NULL', type='character',
+ help="The number of nucleotides to remove from the start of each reverse read. Should be less than truncation_length for obvious reasons. Only used in paired-end processing"),
+ make_option(c("--max_expected_errors"), action="store", default='NULL', type='character',
+ help="Reads with expected errors higher than max_expected_errors are discarded"),
+ make_option(c("--max_expected_errors_reverse"), action="store", default='NULL', type='character',
+ help="Reverse reads with expected errors higher than max_expected_errors are discarded. Only used in paired-end processing"),
+ make_option(c("--truncation_quality_score"), action="store", default='NULL', type='character',
+ help="Reads are truncated at the first instance of quality score truncation_quality_score. If the read is then shorter than truncation_length, it is discarded"),
+ make_option(c("--min_length"), action="store", default='NULL', type='character',
+ help="Remove reads with length shorter than min_length. min_length is enforced after trimming and truncation."),
+ make_option(c("--max_length"), action="store", default='NULL', type='character',
+ help="Remove reads with length greater than max_length. max_length is enforced on the raw reads."),
+ make_option(c("--min_overlap"), action="store", default='NULL', type='character',
+ help="Minimum overlap allowed between reads"),
+ make_option(c("--pooling_method"), action="store", default='NULL', type='character',
+ help="The method used to pool (or not) samples during denoising (independent/pseudo)"),
+ make_option(c("--chimera_method"), action="store", default='NULL', type='character',
+ help="The method used to remove chimeras (none/pooled/consensus)"),
+ make_option(c("--min_parental_fold"), action="store", default='NULL', type='character',
+ help="The minimum abundance of potential parents of a sequence being tested as chimeric, expressed as a fold-change versus the abundance of the sequence being tested. Values should be greater than or equal to 1"),
+ make_option(c("--allow_one_off"), action="store", default='NULL', type='character',
+ help="Bimeras that are one-off (one mismatch/indel away from an exact bimera) are also identified if allow_one_off is TRUE."),
+ make_option(c("--num_threads"), action="store", default='NULL', type='character',
+ help="The number of threads to use"),
+ make_option(c("--learn_min_reads"), action="store", default='NULL', type='character',
+ help="The minimum number of reads to learn the error model from"),
+ make_option(c("--homopolymer_gap_penalty"), action="store", default='NULL', type='character',
+ help="The cost of gaps in homopolymer regions (>=3 repeated bases). Default is NULL, which causes homopolymer gaps to be treated as normal gaps."),
+ make_option(c("--band_size"), action="store", default='NULL', type='character',
+ help="When set, banded Needleman-Wunsch alignments are performed.")
+)
+opt = parse_args(OptionParser(option_list=option_list))
+
+
+# Assign each parsed option to an appropriately named R variable
+inp.dir <- opt$input_directory
+inp.dirR <- opt$input_directory_reverse #added from paired arguments
+out.path <- opt$output_path
+out.track <- opt$output_track
+primer.removed.dir <- opt$removed_primer_directory #added from CCS arguments
+filtered.dir <- opt$filtered_directory
+filtered.dirR <- opt$filtered_directory_reverse #added from paired arguments
+primer <- opt$forward_primer #added from CCS arguments
+primerR <- opt$reverse_primer #added from CCS arguments
+maxMismatch <- if(opt$max_mismatch=='NULL') NULL else as.numeric(opt$max_mismatch) #added from CCS arguments
+indels <- if(opt$indels=='NULL') NULL else as.logical(opt$indels) #added from CCS arguments
+truncLen <- if(opt$truncation_length=='NULL') NULL else as.integer(opt$truncation_length)
+truncLenR <- if(opt$truncation_length_reverse=='NULL') NULL else as.integer(opt$truncation_length_reverse) #added from paired arguments
+trimLeft <- if(opt$trim_left=='NULL') NULL else as.integer(opt$trim_left)
+trimLeftR <- if(opt$trim_left_reverse=='NULL') NULL else as.integer(opt$trim_left_reverse) #added from paired arguments
+maxEE <- if(opt$max_expected_errors=='NULL') NULL else as.numeric(opt$max_expected_errors)
+maxEER <- if(opt$max_expected_errors_reverse=='NULL') NULL else as.numeric(opt$max_expected_errors_reverse) #added from paired arguments
+truncQ <- if(opt$truncation_quality_score=='NULL') NULL else as.integer(opt$truncation_quality_score)
+minLen <- if(opt$min_length=='NULL') NULL else as.numeric(opt$min_length) #added from CCS arguments
+maxLen <- if(opt$max_length=='NULL') NULL else as.numeric(opt$max_length) # Allows Inf
+minOverlap <- if(opt$min_overlap=='NULL') NULL else as.integer(opt$min_overlap) #added from paired arguments
+poolMethod <- opt$pooling_method
+chimeraMethod <- opt$chimera_method
+minParentFold <- if(opt$min_parental_fold=='NULL') NULL else as.numeric(opt$min_parental_fold)
+allowOneOff <- if(opt$allow_one_off=='NULL') NULL else as.logical(opt$allow_one_off)
+nthreads <- if(opt$num_threads=='NULL') NULL else as.integer(opt$num_threads)
+nreads.learn <- if(opt$learn_min_reads=='NULL') NULL else as.integer(opt$learn_min_reads)
+# The following args are not directly exposed to end users in q2-dada2,
+# but rather indirectly, via the methods `denoise-single` and `denoise-pyro`.
+if (opt$homopolymer_gap_penalty=='NULL'){
+ HOMOPOLYMER_GAP_PENALTY<-NULL
+}else{
+ HOMOPOLYMER_GAP_PENALTY<-as.integer(opt$homopolymer_gap_penalty)
+ if(HOMOPOLYMER_GAP_PENALTY>0){
+ HOMOPOLYMER_GAP_PENALTY<-HOMOPOLYMER_GAP_PENALTY*(-1) #negative numbers cannot be passed using optparse, so we convert it here
+ }
+}
+BAND_SIZE <- if(opt$band_size=='NULL') NULL else as.integer(opt$band_size)
+
+### VALIDATE ARGUMENTS ###
+# Input directory is expected to contain .fastq.gz file(s)
+# that have not yet been filtered and globally trimmed
+# to the same length.
+if(!dir.exists(inp.dir)) {
+ errQuit("Input directory does not exist.")
+} else {
+ unfilts <- list.files(inp.dir, pattern=".fastq.gz$", full.names=TRUE)
+ if(length(unfilts) == 0) {
+ errQuit("No input files with the expected filename format found.")
+ }
+ if(inp.dirR!='NULL'){
+ unfiltsR <- list.files(inp.dirR, pattern=".fastq.gz$", full.names=TRUE)
+ if(length(unfiltsR) == 0) {
+ errQuit("No input reverse files with the expected filename format found.")
+ }
+ if(length(unfilts) != length(unfiltsR)) {
+ errQuit("Different numbers of forward and reverse .fastq.gz files.")
+ }
+  }
+}
+
+# Output files are to be filenames (not directories) and are to be
+# removed and replaced if already present.
+for(fn in c(out.path, out.track)) {
+ if(dir.exists(fn)) {
+ errQuit("Output filename ", fn, " is a directory.")
+ } else if(file.exists(fn)) {
+ invisible(file.remove(fn))
+ }
+}
+
+# Convert nthreads to the logical/numeric expected by dada2
+if(nthreads < 0) {
+ errQuit("nthreads must be non-negative.")
+} else if(nthreads == 0) {
+ multithread <- TRUE # detect and use all
+} else if(nthreads == 1) {
+ multithread <- FALSE
+} else {
+ multithread <- nthreads
+}
+
+### LOAD LIBRARIES ###
+suppressWarnings(library(methods))
+suppressWarnings(library(dada2))
+cat("DADA2:", as.character(packageVersion("dada2")), "/",
+ "Rcpp:", as.character(packageVersion("Rcpp")), "/",
+ "RcppParallel:", as.character(packageVersion("RcppParallel")), "\n")
+
+### Remove Primers ###
+if(primer.removed.dir!='NULL'){ #for CCS read analysis
+ cat("1) Removing Primers\n")
+ nop <- file.path(primer.removed.dir, basename(unfilts))
+ prim <- suppressWarnings(removePrimers(unfilts, nop, primer, dada2:::rc(primerR),
+ max.mismatch = maxMismatch, allow.indels = indels,
+ orient = TRUE, verbose = TRUE))
+ cat(ifelse(file.exists(nop), ".", "x"), sep="")
+ nop <- list.files(primer.removed.dir, pattern=".fastq.gz$", full.names=TRUE)
+ cat("\n")
+ if(length(nop) == 0) { # All reads were filtered out
+ errQuit("No reads passed the Removing Primers step (Did you select the right primers?)", status=2)
+ }
+}
+
+### TRIM AND FILTER ###
+cat("2) Filtering ")
+if(primer.removed.dir!='NULL'){ #for CCS read analysis
+ filts <- file.path(filtered.dir, basename(nop))
+ out <- suppressWarnings(filterAndTrim(nop, filts, truncLen = truncLen, trimLeft = trimLeft,
+ maxEE = maxEE, truncQ = truncQ, rm.phix = FALSE,
+ multithread = multithread, maxLen = maxLen, minLen = minLen, minQ = 3))
+}else{
+ filts <- file.path(filtered.dir, basename(unfilts))
+ if(inp.dirR!='NULL'){#for paired read analysis
+ filtsR <- file.path(filtered.dirR, basename(unfiltsR))
+ out <- suppressWarnings(filterAndTrim(unfilts, filts, unfiltsR, filtsR,
+ truncLen=c(truncLen, truncLenR), trimLeft=c(trimLeft, trimLeftR),
+ maxEE=c(maxEE, maxEER), truncQ=truncQ, rm.phix=TRUE,
+ multithread=multithread))
+  }else{ #for single/pyro read analysis
+ out <- suppressWarnings(filterAndTrim(unfilts, filts, truncLen=truncLen, trimLeft=trimLeft,
+ maxEE=maxEE, truncQ=truncQ, rm.phix=TRUE,
+ multithread=multithread, maxLen=maxLen))
+ }
+}
+
+cat(ifelse(file.exists(filts), ".", "x"), sep="")
+filts <- list.files(filtered.dir, pattern=".fastq.gz$", full.names=TRUE)
+
+if(inp.dirR!='NULL'){
+ filtsR <- list.files(filtered.dirR, pattern=".fastq.gz$", full.names=TRUE)
+}
+cat("\n")
+if(length(filts) == 0) { # All reads were filtered out
+ errQuit("No reads passed the filter (was truncLen longer than the read length?)", status=2)
+}
+
+### LEARN ERROR RATES ###
+# Dereplicate enough samples to get nreads.learn total reads
+cat("3) Learning Error Rates\n")
+if(primer.removed.dir!='NULL'){#for CCS read analysis
+ err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn,
+ errorEstimationFunction=dada2:::PacBioErrfun,
+ multithread=multithread, BAND_SIZE=BAND_SIZE))
+
+}else if(inp.dirR!='NULL'){#for paired read analysis
+
+ err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, multithread=multithread))
+ errR <- suppressWarnings(learnErrors(filtsR, nreads=nreads.learn, multithread=multithread))
+
+}else{ #for single/pyro read analysis
+ err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, multithread=multithread,
+ HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY, BAND_SIZE=BAND_SIZE))
+}
+
+### PROCESS ALL SAMPLES ###
+# Loop over rest in streaming fashion with learned error rates
+
+if(inp.dirR == 'NULL'){ #for CCS/single/pyro read analysis
+ dds <- vector("list", length(filts))
+ cat("4) Denoise samples ")
+ cat("\n")
+ for(j in seq(length(filts))) {
+ drp <- derepFastq(filts[[j]])
+    dds[[j]] <- dada(drp, err=err, multithread=multithread, HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY,
+ BAND_SIZE=BAND_SIZE, verbose=FALSE)
+ cat(".")
+ }
+ cat("\n")
+ if(poolMethod == "pseudo") {
+ cat(" Pseudo-pool step ")
+ ### TEMPORARY, to be removed once 1.12 makes its way to Q2
+ ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately.
+ ### pseudo_priors code copied from dada2.R
+ st <- makeSequenceTable(dds)
+ pseudo_priors <- colnames(st)[colSums(st>0) >= 2 | colSums(st) >= Inf]
+ rm(st)
+ ### \pseudo_priors code copied from dada2.R
+ ### code copied from previous loop through samples in this script
+ for(j in seq(length(filts))) {
+ drp <- derepFastq(filts[[j]])
+ dds[[j]] <- dada(drp, err=err, multithread=multithread,
+ priors=pseudo_priors, HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY,
+ BAND_SIZE=BAND_SIZE, verbose=FALSE)
+ cat(".")
+ }
+ cat("\n")
+ ### \code copied from previous loop through samples in this script
+ }
+ # Make sequence table
+ seqtab <- makeSequenceTable(dds)
+}else{#for paired read analysis
+ denoisedF <- rep(0, length(filts))
+ ddsF <- vector("list", length(filts))
+ ddsR <- vector("list", length(filts))
+ mergers <- vector("list", length(filts))
+  cat("4) Denoise samples ")
+
+ for(j in seq(length(filts))) {
+ drpF <- derepFastq(filts[[j]])
+ ddsF[[j]] <- dada(drpF, err=err, multithread=multithread, verbose=FALSE)
+ drpR <- derepFastq(filtsR[[j]])
+ ddsR[[j]] <- dada(drpR, err=errR, multithread=multithread, verbose=FALSE)
+ cat(".")
+ }
+ cat("\n")
+ if(poolMethod == "pseudo") {
+ cat(" Pseudo-pool step ")
+ ### TEMPORARY, to be removed once 1.12 makes its way to Q2
+ ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately.
+ ### pseudo_priors code copied from dada2.R
+ stF <- makeSequenceTable(ddsF)
+ pseudo_priorsF <- colnames(stF)[colSums(stF>0) >= 2 | colSums(stF) >= Inf]
+ rm(stF)
+ stR <- makeSequenceTable(ddsR)
+ pseudo_priorsR <- colnames(stR)[colSums(stR>0) >= 2 | colSums(stR) >= Inf]
+ rm(stR)
+ ### \pseudo_priors code copied from dada2.R
+ ### code copied from previous loop through samples in this script
+ for(j in seq(length(filts))) {
+ drpF <- derepFastq(filts[[j]])
+ ddsF[[j]] <- dada(drpF, err=err, priors=pseudo_priorsF,
+ multithread=multithread, verbose=FALSE)
+ drpR <- derepFastq(filtsR[[j]])
+ ddsR[[j]] <- dada(drpR, err=errR, priors=pseudo_priorsR,
+ multithread=multithread, verbose=FALSE)
+ cat(".")
+ }
+ cat("\n")
+ ### \code copied from previous loop through samples in this script
+ }
+
+ ### Now loop through and do merging
+ for(j in seq(length(filts))) {
+ drpF <- derepFastq(filts[[j]])
+ drpR <- derepFastq(filtsR[[j]])
+ mergers[[j]] <- mergePairs(ddsF[[j]], drpF, ddsR[[j]], drpR, minOverlap=minOverlap)
+ denoisedF[[j]] <- getN(ddsF[[j]])
+ cat(".")
+ }
+ cat("\n")
+ # Make sequence table
+ seqtab <- makeSequenceTable(mergers)
+
+}
+
+
+### Remove chimeras
+cat("5) Remove chimeras (method = ", chimeraMethod, ")\n", sep="")
+if(chimeraMethod %in% c("pooled", "consensus")) {
+ seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, allowOneOff=allowOneOff, multithread=multithread)
+} else { # No chimera removal, copy seqtab to seqtab.nochim
+ seqtab.nochim <- seqtab
+}
+
+### REPORT READ FRACTIONS THROUGH PIPELINE ###
+cat("6) Report read numbers through the pipeline\n")
+if(inp.dirR =='NULL'){
+ # Handle edge cases: Samples lost in filtering; One sample
+ if(primer.removed.dir!='NULL'){ #for CCS read analysis
+ track <- cbind(prim,out[ ,2], matrix(0, nrow=nrow(out), ncol=2))
+ colnames(track) <- c("input", "primer-removed", "filtered", "denoised", "non-chimeric")
+  }else{ #for single/pyro read analysis
+ track <- cbind(out, matrix(0, nrow=nrow(out), ncol=2))
+ colnames(track) <- c("input", "filtered", "denoised", "non-chimeric")
+ }
+ passed.filtering <- track[,"filtered"] > 0
+ track[passed.filtering,"denoised"] <- rowSums(seqtab)
+ track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
+ write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
+ quote=FALSE)
+}else{#for paired end reads
+ # Handle edge cases: Samples lost in filtering; One sample
+ track <- cbind(out, matrix(0, nrow=nrow(out), ncol=3))
+ colnames(track) <- c("input", "filtered", "denoised", "merged", "non-chimeric")
+ passed.filtering <- track[,"filtered"] > 0
+ track[passed.filtering,"denoised"] <- denoisedF
+ track[passed.filtering,"merged"] <- rowSums(seqtab)
+ track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
+ write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
+ quote=FALSE)
+}
+
+### WRITE OUTPUT AND QUIT ###
+# Formatting as tsv plain-text sequence table
+cat("7) Write output\n")
+seqtab.nochim <- t(seqtab.nochim) # QIIME has OTUs as rows
+col.names <- basename(filts)
+col.names[[1]] <- paste0("#OTU ID\t", col.names[[1]])
+write.table(seqtab.nochim, out.path, sep="\t",
+ row.names=TRUE, col.names=col.names, quote=FALSE)
+saveRDS(seqtab.nochim, gsub("tsv", "rds", out.path)) ### TESTING
+
+q(status=0)
\ No newline at end of file
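With the three positional scripts folded into this single optparse-driven entry point, every option arrives as a string, and unused options default to the sentinel 'NULL', which the coercion block above turns back into R's NULL. A minimal sketch of a direct single-end invocation from Python, with placeholder paths (the filtered directory must already exist; the plugin builds the equivalent command in _denoise_single above):

    import subprocess

    cmd = ['run_dada.R',
           '--input_directory', '/path/to/fastqgzs',   # placeholder paths
           '--output_path', '/tmp/output.tsv.biom',
           '--output_track', '/tmp/track.tsv',
           '--filtered_directory', '/tmp/filtered',    # must already exist
           '--truncation_length', '150',
           '--trim_left', '0',
           '--max_expected_errors', '2.0',
           '--truncation_quality_score', '2',
           '--max_length', 'Inf',
           '--pooling_method', 'independent',
           '--chimera_method', 'consensus',
           '--min_parental_fold', '1.0',
           '--allow_one_off', 'False',                 # parsed by as.logical()
           '--num_threads', '1',
           '--learn_min_reads', '1000000',
           '--homopolymer_gap_penalty', 'NULL',        # 'NULL' -> R NULL
           '--band_size', '32']
    subprocess.run(cmd, check=True)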
=====================================
q2_dada2/assets/run_dada_ccs.R deleted
=====================================
@@ -1,294 +0,0 @@
-#!/usr/bin/env Rscript
-
-###################################################
-# This R script takes an input directory of .fastq.gz files
-# and outputs a tsv file of the dada2 processed sequence
-# table. It is intended for use with the QIIME2 plugin
-# for DADA2.
-#
-# Ex: Rscript run_dada_ccs.R input_dir output.tsv track.tsv
-# primer_removed_dir filtered_dir front adapter 2 FALSE 0 0 2.0 2 20 Inf
-# pseudo pooled 1.0 0 1000000
-####################################################
-
-####################################################
-# DESCRIPTION OF ARGUMENTS #
-####################################################
-# NOTE: All numeric arguments should be zero or positive.
-# NOTE: All numeric arguments save maxEE are expected to be integers.
-# NOTE: Currently the primer_removed_dir must already exist.
-# NOTE: Currently the filtered_dir must already exist.
-# NOTE: ALL ARGUMENTS ARE POSITIONAL!
-#
-### FILE SYSTEM ARGUMENTS ###
-#
-# 1) File path to directory with the .fastq.gz files to be processed.
-# Ex: path/to/dir/with/fastqgzs
-#
-# 2) File path to output tsv file. If already exists, will be overwritten.
-# Ex: path/to/output_file.tsv
-#
-# 3) File path to tracking tsv file. If already exists, will be overwritten.
-# Ex: path/to/tracking_stats.tsv
-#
-# 4) File path to directory in which to write the primer.removed .fastq.gz
-# files. These files are intermediate for the full workflow.
-# Currently they remain after the script finishes.
-# Directory must already exist.
-# Ex: path/to/dir/with/fastqgzs/primerremoved
-#
-# 5) File path to directory in which to write the filtered .fastq.gz files.
-# These files are intermediate for the full workflow.
-# Currently they remain after the script finishes.
-# Directory must already exist.
-# Ex: path/to/dir/with/fastqgzs/filtered
-#
-### PRIMER REMOVING ARGUMENTS ###
-#
-# 6) primerF - Primer front of Pacbio CCS sequences.
-# Ex: 'AGRGTTYGATYMTGGCTCAG'
-#
-# 7) primerR - Primer adapter of Pacbio CCS sequences.
-# Ex: 'RGYTACCTTGTTACGACTT'
-#
-# 8) maxMismatch - The number of mismatches to tolerate when matching
-# reads to primer sequences.
-# Ex: 2
-#
-# 9) indels - Allow insertions or deletions of bases when matching adapters.
-# Ex: FALSE
-#
-### FILTERING ARGUMENTS ###
-#
-# 10) truncLen - The position at which to truncate reads. Reads shorter
-# than truncLen will be discarded.
-# Special values: 0 - no truncation or length filtering.
-# Ex: 150
-#
-# 11) trimLeft - The number of nucleotides to remove from the start of
-# each read. Should be less than truncLen for obvious reasons.
-# Ex: 0
-#
-# 12) maxEE - Reads with expected errors higher than maxEE are discarded.
-# Ex: 2.0
-#
-# 13) truncQ - Reads are truncated at the first instance of quality score truncQ.
-# If the read is then shorter than truncLen, it is discarded.
-# Ex: 2
-#
-# 14) minLen - Remove reads with length shorter than minLen. minLen is enforced
-# after trimming and truncation.
-# Default Inf - no maximum.
-# Ex: 20
-#
-# 15) maxLen - Remove reads with length greater than maxLen. maxLen is enforced
-# on the raw reads.
-# Default Inf - no maximum.
-# Ex: 300
-#
-### SENSITIVITY ARGUMENTS ###
-#
-# 16) poolMethod - The method used to pool (or not) samples during denoising.
-# Valid options are:
-# independent: (Default) No pooling, samples are denoised independently.
-#
-# pseudo: Samples are "pseudo-pooled" for denoising.
-# Ex: independent
-#
-#
-### CHIMERA ARGUMENTS ###
-#
-# 17) chimeraMethod - The method used to remove chimeras. Valid options are:
-# none: No chimera removal is performed.
-# pooled: All reads are pooled prior to chimera detection.
-# consensus: Chimeras are detected in samples individually, and a
-# consensus decision is made for each sequence variant.
-# Ex: consensus
-#
-# 18) minParentFold - The minimum abundance of potential "parents" of a sequence
-# being tested as chimeric, expressed as a fold-change
-# versus the abundance of the sequence being tested. Values
-# should be greater than or equal to 1 (i.e. parents should
-# be more abundant than the sequence being tested).
-# Ex: 1.0
-#
-### SPEED ARGUMENTS ###
-#
-# 19) nthreads - The number of threads to use.
-# Special values: 0 - detect available cores and use all.
-# Ex: 1
-#
-# 20) nreads_learn - The minimum number of reads to learn the error model from.
-# Special values: 0 - Use all input reads.
-# Ex: 1000000
-#
-#
-#
-
-cat(R.version$version.string, "\n")
-errQuit <- function(mesg, status=1) { message("Error: ", mesg); q(status=status) }
-args <- commandArgs(TRUE)
-
-# Assign each of the arguments, in positional order, to an appropriately named R variable
-inp.dir <- args[[1]]
-out.path <- args[[2]]
-out.track <- args[[3]]
-primer.removed.dir <- args[[4]]
-filtered.dir <- args[[5]]
-primerF <- args[[6]]
-primerR <- args[[7]]
-maxMismatch <- as.numeric(args[[8]])
-indels <- as.logical(args[[9]])
-truncLen <- as.integer(args[[10]])
-trimLeft <- as.integer(args[[11]])
-maxEE <- as.numeric(args[[12]])
-truncQ <- as.integer(args[[13]])
-minLen <- as.numeric(args[[14]])
-maxLen <- as.numeric(args[[15]]) # Allows Inf
-poolMethod <- args[[16]]
-chimeraMethod <- args[[17]]
-minParentFold <- as.numeric(args[[18]])
-nthreads <- as.integer(args[[19]])
-nreads.learn <- as.integer(args[[20]])
-
-### VALIDATE ARGUMENTS ###
-
-# Input directory is expected to contain .fastq.gz file(s)
-# that have not yet been filtered and globally trimmed
-# to the same length.
-if(!dir.exists(inp.dir)) {
- errQuit("Input directory does not exist.")
-} else {
- unfilts <- list.files(inp.dir, pattern=".fastq.gz$", full.names=TRUE)
- if(length(unfilts) == 0) {
- errQuit("No input files with the expected filename format found.")
- }
-}
-
-# Output files are to be filenames (not directories) and are to be
-# removed and replaced if already present.
-for(fn in c(out.path, out.track)) {
- if(dir.exists(fn)) {
- errQuit("Output filename ", fn, " is a directory.")
- } else if(file.exists(fn)) {
- invisible(file.remove(fn))
- }
-}
-
-# Convert nthreads to the logical/numeric expected by dada2
-if(nthreads < 0) {
- errQuit("nthreads must be non-negative.")
-} else if(nthreads == 0) {
- multithread <- TRUE # detect and use all
-} else if(nthreads == 1) {
- multithread <- FALSE
-} else {
- multithread <- nthreads
-}
-
-### LOAD LIBRARIES ###
-suppressWarnings(library(methods))
-suppressWarnings(library(dada2))
-cat("DADA2:", as.character(packageVersion("dada2")), "/",
- "Rcpp:", as.character(packageVersion("Rcpp")), "/",
- "RcppParallel:", as.character(packageVersion("RcppParallel")), "\n")
-
-### Remove Primers ###
-cat("1) Removing Primers\n")
-nop <- file.path(primer.removed.dir, basename(unfilts))
-prim <- suppressWarnings(removePrimers(unfilts, nop, primerF, dada2:::rc(primerR),
- max.mismatch = maxMismatch, allow.indels = indels,
- orient = TRUE, verbose = TRUE))
-cat(ifelse(file.exists(nop), ".", "x"), sep="")
-nop <- list.files(primer.removed.dir, pattern=".fastq.gz$", full.names=TRUE)
-cat("\n")
-if(length(nop) == 0) { # All reads were filtered out
- errQuit("No reads passed the Removing Primers step (Did you select the right primers?)", status=2)
-}
-
-### TRIM AND FILTER ###
-cat("2) Filtering\n")
-filts <- file.path(filtered.dir, basename(nop))
-out <- suppressWarnings(filterAndTrim(nop, filts, truncLen = truncLen, trimLeft = trimLeft,
- maxEE = maxEE, truncQ = truncQ, rm.phix = FALSE,
- multithread = multithread, maxLen = maxLen, minLen = minLen, minQ = 3))
-cat(ifelse(file.exists(filts), ".", "x"), sep="")
-filts <- list.files(filtered.dir, pattern=".fastq.gz$", full.names=TRUE)
-cat("\n")
-if(length(filts) == 0) { # All reads were filtered out
- errQuit("No reads passed the filter (was truncLen longer than the read length?)", status=2)
-}
-
-### LEARN ERROR RATES ###
-# Dereplicate enough samples to get nreads.learn total reads
-cat("3) Learning Error Rates\n")
-err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn,
- errorEstimationFunction=dada2:::PacBioErrfun,
- multithread=multithread, BAND_SIZE=32))
-
-### PROCESS ALL SAMPLES ###
-# Loop over rest in streaming fashion with learned error rates
-dds <- vector("list", length(filts))
-cat("4) Denoise samples ")
-cat("\n")
-for(j in seq(length(filts))) {
- drp <- derepFastq(filts[[j]])
- dds[[j]] <- dada(drp, err=err, multithread=multithread,
- BAND_SIZE=32, verbose=FALSE)
- cat(".")
-}
-cat("\n")
-if(poolMethod == "pseudo") {
- cat(" Pseudo-pool step ")
- ### TEMPORARY, to be removed once 1.12 makes its way to Q2
- ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately.
- ### pseudo_priors code copied from dada2.R
- st <- makeSequenceTable(dds)
- pseudo_priors <- colnames(st)[colSums(st>0) >= 2 | colSums(st) >= Inf]
- rm(st)
- ### \pseudo_priors code copied from dada2.R
- ### code copied from previous loop through samples in this script
- for(j in seq(length(filts))) {
- drp <- derepFastq(filts[[j]])
- dds[[j]] <- dada(drp, err=err, multithread=multithread,
- priors = pseudo_priors,
- BAND_SIZE=32, verbose=FALSE)
- cat(".")
- }
- cat("\n")
- ### \code copied from previous loop through samples in this script
-}
-
-# Make sequence table
-seqtab <- makeSequenceTable(dds)
-
-### Remove chimeras
-cat("5) Remove chimeras (method = ", chimeraMethod, ")\n", sep="")
-if(chimeraMethod %in% c("pooled", "consensus")) {
- seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, multithread=multithread)
-} else { # No chimera removal, copy seqtab to seqtab.nochim
- seqtab.nochim <- seqtab
-}
-
-### REPORT READ FRACTIONS THROUGH PIPELINE ###
-cat("6) Report read numbers through the pipeline\n")
-# Handle edge cases: Samples lost in filtering; One sample
-track <- cbind(prim,out[ ,2], matrix(0, nrow=nrow(out), ncol=2))
-colnames(track) <- c("input", "primer-removed","filtered", "denoised", "non-chimeric")
-passed.filtering <- track[,"filtered"] > 0
-track[passed.filtering,"denoised"] <- rowSums(seqtab)
-track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
-write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
- quote=FALSE)
-
-### WRITE OUTPUT AND QUIT ###
-# Formatting as tsv plain-text sequence table
-cat("7) Write output\n")
-seqtab.nochim <- t(seqtab.nochim) # QIIME has OTUs as rows
-col.names <- basename(filts)
-col.names[[1]] <- paste0("#OTU ID\t", col.names[[1]])
-write.table(seqtab.nochim, out.path, sep="\t",
- row.names=TRUE, col.names=col.names, quote=FALSE)
-saveRDS(seqtab.nochim, gsub("tsv", "rds", out.path)) ### TESTING
-
-q(status=0)
\ No newline at end of file
=====================================
q2_dada2/assets/run_dada_paired.R deleted
=====================================
@@ -1,294 +0,0 @@
-#!/usr/bin/env Rscript
-
-###################################################
-# This R script takes as input two directories of
-# .fastq.gz files, corresponding to matched forward
-# and reverse sequence files,
-# and outputs a tsv file of the dada2 processed sequence
-# table. It is intended for use with the QIIME2 plugin
-# for DADA2.
-#
-# Rscript run_dada_paired.R input_dirF input_dirR output.tsv track.tsv filtered_dirF filtered_dirR 240 160 0 0 2.0 2 pooled 1.0 0 100000
-####################################################
-
-####################################################
-# DESCRIPTION OF ARGUMENTS #
-####################################################
-# NOTE: All numeric arguments should be zero or positive.
-# NOTE: All numeric arguments save maxEEF/R are expected to be integers.
-# NOTE: Currently the filtered_dirF/R must already exist.
-# NOTE: ALL ARGUMENTS ARE POSITIONAL!
-#
-### FILE SYSTEM ARGUMENTS ###
-#
-# 1) File path to directory with the FORWARD .fastq.gz files to be processed.
-# Ex: path/to/dir/with/FWD_fastqgzs
-#
-# 2) File path to directory with the REVERSE .fastq.gz files to be processed.
-# Ex: path/to/dir/with/REV_fastqgzs
-#
-# 3) File path to output tsv file. If already exists, will be overwritten.
-# Ex: path/to/output_file.tsv
-#
-# 4) File path to tracking tsv file. If already exists, will be overwritten.
-# Ex: path/to/tracking_stats.tsv
-#
-# 5) File path to directory to write the filtered FORWARD .fastq.gz files. These files are intermediate
-# for the full workflow. Currently they remain after the script finishes. Directory must
-# already exist.
-# Ex: path/to/dir/with/FWD_fastqgzs/filtered
-#
-# 6) File path to directory to write the filtered REVERSE .fastq.gz files. These files are intermediate
-# for the full workflow. Currently they remain after the script finishes. Directory must
-# already exist.
-# Ex: path/to/dir/with/REV_fastqgzs/filtered
-#
-### FILTERING ARGUMENTS ###
-#
-# 7) truncLenF - The position at which to truncate forward reads. Forward reads shorter
-# than truncLenF will be discarded.
-# Special values: 0 - no truncation or length filtering.
-# Ex: 240
-#
-# 8) truncLenR - The position at which to truncate reverse reads. Reverse reads shorter
-# than truncLenR will be discarded.
-# Special values: 0 - no truncation or length filtering.
-# Ex: 160
-#
-# 9) trimLeftF - The number of nucleotides to remove from the start of
-# each forward read. Should be less than truncLenF.
-# Ex: 0
-#
-# 10) trimLeftR - The number of nucleotides to remove from the start of
-# each reverse read. Should be less than truncLenR.
-# Ex: 0
-#
-# 11) maxEEF - Forward reads with expected errors higher than maxEEF are discarded.
-# Both forward and reverse reads are independently tested.
-# Ex: 2.0
-#
-# 12) maxEER - Reverse reads with expected errors higher than maxEER are discarded.
-# Both forward and reverse reads are independently tested.
-# Ex: 2.0
-#
-# 13) truncQ - Reads are truncated at the first instance of quality score truncQ.
-# If the read is then shorter than truncLen, it is discarded.
-# Ex: 2
-#
-### SENSITIVITY ARGUMENTS ###
-#
-# 14) poolMethod - The method used to pool (or not) samples during denoising.
-# Valid options are:
-# independent: (Default) No pooling, samples are denoised independently.
-# pseudo: Samples are "pseudo-pooled" for denoising.
-# Ex: independent
-#
-#
-### CHIMERA ARGUMENTS ###
-#
-# 15) chimeraMethod - The method used to remove chimeras. Valid options are:
-# none: No chimera removal is performed.
-# pooled: All reads are pooled prior to chimera detection.
-# consensus: Chimeras are detected in samples individually, and a consensus decision
-# is made for each sequence variant.
-# Ex: consensus
-#
-# 16) minParentFold - The minimum abundance of potential "parents" of a sequence being
-# tested as chimeric, expressed as a fold-change versus the abundance of the sequence being
-# tested. Values should be greater than or equal to 1 (i.e. parents should be more
-# abundant than the sequence being tested).
-# Ex: 1.0
-#
-### SPEED ARGUMENTS ###
-#
-# 17) nthreads - The number of threads to use.
-# Special values: 0 - detect available and use all.
-# Ex: 1
-#
-# 18) nreads_learn - The minimum number of reads to learn the error model from.
-# Special values: 0 - Use all input reads.
-# Ex: 1000000
-#
-
-cat(R.version$version.string, "\n")
-errQuit <- function(mesg, status=1) { message("Error: ", mesg); q(status=status) }
-getN <- function(x) sum(getUniques(x))
-args <- commandArgs(TRUE)
-
-# Assign each of the arguments, in positional order, to an appropriately named R variable
-inp.dirF <- args[[1]]
-inp.dirR <- args[[2]]
-out.path <- args[[3]]
-out.track <- args[[4]]
-filtered.dirF <- args[[5]]
-filtered.dirR <- args[[6]]
-truncLenF <- as.integer(args[[7]])
-truncLenR <- as.integer(args[[8]])
-trimLeftF <- as.integer(args[[9]])
-trimLeftR <- as.integer(args[[10]])
-maxEEF <- as.numeric(args[[11]])
-maxEER <- as.numeric(args[[12]])
-truncQ <- as.integer(args[[13]])
-minOverlap <- as.integer(args[14])
-poolMethod <- args[[15]]
-chimeraMethod <- args[[16]]
-minParentFold <- as.numeric(args[[17]])
-nthreads <- as.integer(args[[18]])
-nreads.learn <- as.integer(args[[19]])
-
-### VALIDATE ARGUMENTS ###
-
-# Input directory is expected to contain .fastq.gz file(s)
-# that have not yet been filtered and globally trimmed
-# to the same length.
-if(!(dir.exists(inp.dirF) && dir.exists(inp.dirR))) {
- errQuit("Input directory does not exist.")
-} else {
- unfiltsF <- list.files(inp.dirF, pattern=".fastq.gz$", full.names=TRUE)
- unfiltsR <- list.files(inp.dirR, pattern=".fastq.gz$", full.names=TRUE)
- if(length(unfiltsF) == 0) {
- errQuit("No input forward files with the expected filename format found.")
- }
- if(length(unfiltsR) == 0) {
- errQuit("No input reverse files with the expected filename format found.")
- }
- if(length(unfiltsF) != length(unfiltsR)) {
- errQuit("Different numbers of forward and reverse .fastq.gz files.")
- }
-}
-
-# Output files are to be filenames (not directories) and are to be
-# removed and replaced if already present.
-for(fn in c(out.path, out.track)) {
- if(dir.exists(fn)) {
- errQuit("Output filename ", fn, " is a directory.")
- } else if(file.exists(fn)) {
- invisible(file.remove(fn))
- }
-}
-
-# Convert nthreads to the logical/numeric expected by dada2
-if(nthreads < 0) {
- errQuit("nthreads must be non-negative.")
-} else if(nthreads == 0) {
- multithread <- TRUE # detect and use all
-} else if(nthreads == 1) {
- multithread <- FALSE
-} else {
- multithread <- nthreads
-}
-
-### LOAD LIBRARIES ###
-suppressWarnings(library(methods))
-suppressWarnings(library(dada2))
-cat("DADA2:", as.character(packageVersion("dada2")), "/",
- "Rcpp:", as.character(packageVersion("Rcpp")), "/",
- "RcppParallel:", as.character(packageVersion("RcppParallel")), "\n")
-
-### TRIM AND FILTER ###
-cat("1) Filtering ")
-filtsF <- file.path(filtered.dirF, basename(unfiltsF))
-filtsR <- file.path(filtered.dirR, basename(unfiltsR))
-out <- suppressWarnings(filterAndTrim(unfiltsF, filtsF, unfiltsR, filtsR,
- truncLen=c(truncLenF, truncLenR), trimLeft=c(trimLeftF, trimLeftR),
- maxEE=c(maxEEF, maxEER), truncQ=truncQ, rm.phix=TRUE,
- multithread=multithread))
-cat(ifelse(file.exists(filtsF), ".", "x"), sep="")
-filtsF <- list.files(filtered.dirF, pattern=".fastq.gz$", full.names=TRUE)
-filtsR <- list.files(filtered.dirR, pattern=".fastq.gz$", full.names=TRUE)
-cat("\n")
-if(length(filtsF) == 0) { # All reads were filtered out
- errQuit("No reads passed the filter (were truncLenF/R longer than the read lengths?)", status=2)
-}
-
-### LEARN ERROR RATES ###
-# Dereplicate enough samples to get nreads.learn total reads
-cat("2) Learning Error Rates\n")
-errF <- suppressWarnings(learnErrors(filtsF, nreads=nreads.learn, multithread=multithread))
-errR <- suppressWarnings(learnErrors(filtsR, nreads=nreads.learn, multithread=multithread))
-
-### PROCESS ALL SAMPLES ###
-# Loop over rest in streaming fashion with learned error rates
-denoisedF <- rep(0, length(filtsF))
-ddsF <- vector("list", length(filtsF))
-ddsR <- vector("list", length(filtsF))
-mergers <- vector("list", length(filtsF))
-cat("3) Denoise samples ")
-
-for(j in seq(length(filtsF))) {
- drpF <- derepFastq(filtsF[[j]])
- ddsF[[j]] <- dada(drpF, err=errF, multithread=multithread, verbose=FALSE)
- drpR <- derepFastq(filtsR[[j]])
- ddsR[[j]] <- dada(drpR, err=errR, multithread=multithread, verbose=FALSE)
- cat(".")
-}
-cat("\n")
-if(poolMethod == "pseudo") {
- cat(" Pseudo-pool step ")
- ### TEMPORARY, to be removed once 1.12 makes its way to Q2
- ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately.
- ### pseudo_priors code copied from dada2.R
- stF <- makeSequenceTable(ddsF)
- pseudo_priorsF <- colnames(stF)[colSums(stF>0) >= 2 | colSums(stF) >= Inf]
- rm(stF)
- stR <- makeSequenceTable(ddsR)
- pseudo_priorsR <- colnames(stR)[colSums(stR>0) >= 2 | colSums(stR) >= Inf]
- rm(stR)
- ### \pseudo_priors code copied from dada2.R
- ### code copied from previous loop through samples in this script
- for(j in seq(length(filtsF))) {
- drpF <- derepFastq(filtsF[[j]])
- ddsF[[j]] <- dada(drpF, err=errF, priors=pseudo_priorsF,
- multithread=multithread, verbose=FALSE)
- drpR <- derepFastq(filtsR[[j]])
- ddsR[[j]] <- dada(drpR, err=errR, priors=pseudo_priorsR,
- multithread=multithread, verbose=FALSE)
- cat(".")
- }
- cat("\n")
- ### \code copied from previous loop through samples in this script
-}
-
-### Now loop through and do merging
-for(j in seq(length(filtsF))) {
- drpF <- derepFastq(filtsF[[j]])
- drpR <- derepFastq(filtsR[[j]])
- mergers[[j]] <- mergePairs(ddsF[[j]], drpF, ddsR[[j]], drpR, minOverlap=minOverlap)
- denoisedF[[j]] <- getN(ddsF[[j]])
- cat(".")
-}
-cat("\n")
-
-# Make sequence table
-seqtab <- makeSequenceTable(mergers)
-
-# Remove chimeras
-cat("4) Remove chimeras (method = ", chimeraMethod, ")\n", sep="")
-if(chimeraMethod %in% c("pooled", "consensus")) {
- seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, multithread=multithread)
-} else { # No chimera removal, copy seqtab to seqtab.nochim
- seqtab.nochim <- seqtab
-}
-
-### REPORT READ COUNTS AT EACH PROCESSING STEP ###
-# Handle edge cases: Samples lost in filtering; One sample
-track <- cbind(out, matrix(0, nrow=nrow(out), ncol=3))
-colnames(track) <- c("input", "filtered", "denoised", "merged", "non-chimeric")
-passed.filtering <- track[,"filtered"] > 0
-track[passed.filtering,"denoised"] <- denoisedF
-track[passed.filtering,"merged"] <- rowSums(seqtab)
-track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
-write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
- quote=FALSE)
-
-### WRITE OUTPUT AND QUIT ###
-# Formatting as tsv plain-text sequence table
-cat("6) Write output\n")
-seqtab.nochim <- t(seqtab.nochim) # QIIME has OTUs as rows
-col.names <- basename(filtsF)
-col.names[[1]] <- paste0("#OTU ID\t", col.names[[1]])
-write.table(seqtab.nochim, out.path, sep="\t",
- row.names=TRUE, col.names=col.names, quote=FALSE)
-saveRDS(seqtab.nochim, gsub("tsv", "rds", out.path)) ### TESTING
-
-q(status=0)
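
Note on the pseudo-pooling workaround above: the prior-selection expression
`colSums(st>0) >= 2 | colSums(st) >= Inf` reduces to "keep sequences observed
in at least two samples", since a finite sequence table can never satisfy the
second clause. A minimal standalone R sketch of that selection on a toy
abundance matrix (not part of the plugin code):

    # Toy samples-by-sequences abundance matrix
    st <- matrix(c(5, 0, 3,
                   2, 1, 0),
                 nrow=2, byrow=TRUE,
                 dimnames=list(c("sample1", "sample2"),
                               c("ACGT", "ACGA", "ACTT")))
    # Keep sequences detected (count > 0) in at least two samples as priors
    pseudo_priors <- colnames(st)[colSums(st > 0) >= 2]
    pseudo_priors  # "ACGT" -- the only sequence present in both samples
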
=====================================
q2_dada2/assets/run_dada_single.R deleted
=====================================
@@ -1,256 +0,0 @@
-#!/usr/bin/env Rscript
-
-###################################################
-# This R script takes an input directory of .fastq.gz files
-# and outputs a tsv file of the dada2 processed sequence
-# table. It is intended for use with the QIIME2 plugin
-# for DADA2.
-#
-# Ex: Rscript run_dada_single.R input_dir output.tsv track.tsv filtered_dir 200 0 2.0 2 Inf independent pooled 1.0 0 1000000 NULL 32
-####################################################
-
-####################################################
-# DESCRIPTION OF ARGUMENTS #
-####################################################
-# NOTE: All numeric arguments should be zero or positive.
-# NOTE: All numeric arguments except maxEE and maxLen are expected to be integers.
-# NOTE: Currently the filtered_dir must already exist.
-# NOTE: ALL ARGUMENTS ARE POSITIONAL!
-#
-### FILE SYSTEM ARGUMENTS ###
-#
-# 1) File path to directory with the .fastq.gz files to be processed.
-# Ex: path/to/dir/with/fastqgzs
-#
-# 2) File path to output tsv file. If already exists, will be overwritten.
-# Ex: path/to/output_file.tsv
-#
-# 3) File path to tracking tsv file. If already exists, will be overwritten.
-# Ex: path/to/tracking_stats.tsv
-#
-# 4) File path to directory in which to write the filtered .fastq.gz files. These files are intermediate
-# for the full workflow. Currently they remain after the script finishes.
-# Directory must already exist.
-# Ex: path/to/dir/with/fastqgzs/filtered
-#
-### FILTERING ARGUMENTS ###
-#
-# 5) truncLen - The position at which to truncate reads. Reads shorter
-# than truncLen will be discarded.
-# Special values: 0 - no truncation or length filtering.
-# Ex: 150
-#
-# 6) trimLeft - The number of nucleotides to remove from the start of
-# each read. Should be less than truncLen for obvious reasons.
-# Ex: 0
-#
-# 7) maxEE - Reads with expected errors higher than maxEE are discarded.
-# Ex: 2.0
-#
-# 8) truncQ - Reads are truncated at the first instance of quality score truncQ.
-# If the read is then shorter than truncLen, it is discarded.
-# Ex: 2
-#
-# 9) maxLen - Remove reads with length greater than maxLen. maxLen is enforced on the raw reads.
-# Default Inf - no maximum.
-# Ex: 300
-#
-### SENSITIVITY ARGUMENTS ###
-#
-# 10) poolMethod - The method used to pool (or not) samples during denoising.
-# Valid options are:
-#	independent: (Default) No pooling, samples are denoised independently.
-# pseudo: Samples are "pseudo-pooled" for denoising.
-# Ex: independent
-#
-#
-### CHIMERA ARGUMENTS ###
-#
-# 11) chimeraMethod - The method used to remove chimeras. Valid options are:
-# none: No chimera removal is performed.
-# pooled: All reads are pooled prior to chimera detection.
-#	consensus: Chimeras are detected in samples individually, and a consensus decision
-# is made for each sequence variant.
-# Ex: consensus
-#
-# 12) minParentFold - The minimum abundance of potential "parents" of a sequence being
-# tested as chimeric, expressed as a fold-change versus the abundance of the sequence being
-# tested. Values should be greater than or equal to 1 (i.e. parents should be more
-# abundant than the sequence being tested).
-# Ex: 1.0
-#
-### SPEED ARGUMENTS ###
-#
-# 13) nthreads - The number of threads to use.
-#	Special values: 0 - detect available cores and use all.
-# Ex: 1
-#
-# 14) nreads_learn - The minimum number of reads to learn the error model from.
-# Special values: 0 - Use all input reads.
-# Ex: 1000000
-#
-### GLOBAL OPTION ARGUMENTS ###
-#
-# 15) HOMOPOLYMER_GAP_PENALTY - The cost of gaps in homopolymer regions (>=3 repeated bases).
-# Default is NULL, which causes homopolymer gaps
-# to be treated as normal gaps.
-# Ex: -1
-#
-# 16) BAND_SIZE - When set, banded Needleman-Wunsch alignments are performed.
-# The default value of BAND_SIZE is 16. Setting BAND_SIZE to a negative
-# number turns off banding (i.e. full Needleman-Wunsch).
-# Ex: 32
-#
-
-cat(R.version$version.string, "\n")
-errQuit <- function(mesg, status=1) { message("Error: ", mesg); q(status=status) }
-args <- commandArgs(TRUE)
-
-# Assign each of the arguments, in positional order, to an appropriately named R variable
-inp.dir <- args[[1]]
-out.path <- args[[2]]
-out.track <- args[[3]]
-filtered.dir <- args[[4]]
-truncLen <- as.integer(args[[5]])
-trimLeft <- as.integer(args[[6]])
-maxEE <- as.numeric(args[[7]])
-truncQ <- as.integer(args[[8]])
-maxLen <- as.numeric(args[[9]]) # Allows Inf
-poolMethod <- args[[10]]
-chimeraMethod <- args[[11]]
-minParentFold <- as.numeric(args[[12]])
-nthreads <- as.integer(args[[13]])
-nreads.learn <- as.integer(args[[14]])
-# The following args are not directly exposed to end users in q2-dada2,
-# but rather indirectly, via the methods `denoise-single` and `denoise-pyro`.
-HOMOPOLYMER_GAP_PENALTY <- if (args[[15]]=='NULL') NULL else as.integer(args[[15]])
-BAND_SIZE <- as.integer(args[[16]])
-
-### VALIDATE ARGUMENTS ###
-
-# Input directory is expected to contain .fastq.gz file(s)
-# that have not yet been filtered and globally trimmed
-# to the same length.
-if(!dir.exists(inp.dir)) {
- errQuit("Input directory does not exist.")
-} else {
- unfilts <- list.files(inp.dir, pattern=".fastq.gz$", full.names=TRUE)
- if(length(unfilts) == 0) {
- errQuit("No input files with the expected filename format found.")
- }
-}
-
-# Output files are to be filenames (not directories) and are to be
-# removed and replaced if already present.
-for(fn in c(out.path, out.track)) {
- if(dir.exists(fn)) {
- errQuit("Output filename ", fn, " is a directory.")
- } else if(file.exists(fn)) {
- invisible(file.remove(fn))
- }
-}
-
-# Convert nthreads to the logical/numeric expected by dada2
-if(nthreads < 0) {
- errQuit("nthreads must be non-negative.")
-} else if(nthreads == 0) {
- multithread <- TRUE # detect and use all
-} else if(nthreads == 1) {
- multithread <- FALSE
-} else {
- multithread <- nthreads
-}
-
-### LOAD LIBRARIES ###
-suppressWarnings(library(methods))
-suppressWarnings(library(dada2))
-cat("DADA2:", as.character(packageVersion("dada2")), "/",
- "Rcpp:", as.character(packageVersion("Rcpp")), "/",
- "RcppParallel:", as.character(packageVersion("RcppParallel")), "\n")
-
-### TRIM AND FILTER ###
-cat("1) Filtering ")
-filts <- file.path(filtered.dir, basename(unfilts))
-out <- suppressWarnings(filterAndTrim(unfilts, filts, truncLen=truncLen, trimLeft=trimLeft,
- maxEE=maxEE, truncQ=truncQ, rm.phix=TRUE,
- multithread=multithread, maxLen=maxLen))
-cat(ifelse(file.exists(filts), ".", "x"), sep="")
-filts <- list.files(filtered.dir, pattern=".fastq.gz$", full.names=TRUE)
-cat("\n")
-if(length(filts) == 0) { # All reads were filtered out
- errQuit("No reads passed the filter (was truncLen longer than the read length?)", status=2)
-}
-
-### LEARN ERROR RATES ###
-# Dereplicate enough samples to get nreads.learn total reads
-cat("2) Learning Error Rates\n")
-err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, multithread=multithread,
- HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY, BAND_SIZE=BAND_SIZE))
-
-### PROCESS ALL SAMPLES ###
-# Loop over rest in streaming fashion with learned error rates
-dds <- vector("list", length(filts))
-cat("3) Denoise samples ")
-for(j in seq(length(filts))) {
- drp <- derepFastq(filts[[j]])
- dds[[j]] <- dada(drp, err=err, multithread=multithread,
- HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY,
- BAND_SIZE=BAND_SIZE, verbose=FALSE)
- cat(".")
-}
-cat("\n")
-if(poolMethod == "pseudo") {
- cat(" Pseudo-pool step ")
- ### TEMPORARY, to be removed once 1.12 makes its way to Q2
- ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately.
- ### pseudo_priors code copied from dada2.R
- st <- makeSequenceTable(dds)
- pseudo_priors <- colnames(st)[colSums(st>0) >= 2 | colSums(st) >= Inf]
- rm(st)
- ### \pseudo_priors code copied from dada2.R
- ### code copied from previous loop through samples in this script
- for(j in seq(length(filts))) {
- drp <- derepFastq(filts[[j]])
- dds[[j]] <- dada(drp, err=err, multithread=multithread,
- priors = pseudo_priors,
- HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY,
- BAND_SIZE=BAND_SIZE, verbose=FALSE)
- cat(".")
- }
- cat("\n")
- ### \code copied from previous loop through samples in this script
-}
-
-# Make sequence table
-seqtab <- makeSequenceTable(dds)
-
-### Remove chimeras
-cat("4) Remove chimeras (method = ", chimeraMethod, ")\n", sep="")
-if(chimeraMethod %in% c("pooled", "consensus")) {
- seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, multithread=multithread)
-} else { # No chimera removal, copy seqtab to seqtab.nochim
- seqtab.nochim <- seqtab
-}
-
-### REPORT READ FRACTIONS THROUGH PIPELINE ###
-cat("5) Report read numbers through the pipeline\n")
-# Handle edge cases: Samples lost in filtering; One sample
-track <- cbind(out, matrix(0, nrow=nrow(out), ncol=2))
-colnames(track) <- c("input", "filtered", "denoised", "non-chimeric")
-passed.filtering <- track[,"filtered"] > 0
-track[passed.filtering,"denoised"] <- rowSums(seqtab)
-track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
-write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
- quote=FALSE)
-
-### WRITE OUTPUT AND QUIT ###
-# Formatting as tsv plain-text sequence table
-cat("6) Write output\n")
-seqtab.nochim <- t(seqtab.nochim) # QIIME has OTUs as rows
-col.names <- basename(filts)
-col.names[[1]] <- paste0("#OTU ID\t", col.names[[1]])
-write.table(seqtab.nochim, out.path, sep="\t",
- row.names=TRUE, col.names=col.names, quote=FALSE)
-saveRDS(seqtab.nochim, gsub("tsv", "rds", out.path)) ### TESTING
-
-q(status=0)
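
For reference, the transposition and "#OTU ID" header construction at the end
of both deleted scripts produce a plain-text table with one row per sequence
variant and one column per filtered sample file. A toy example of the
resulting TSV layout (file names, sequences and counts are placeholders):

    #OTU ID	sample1.fastq.gz	sample2.fastq.gz
    ACGTACGTACGT	12	0
    ACCTGGAACGTA	3	7
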
=====================================
q2_dada2/plugin_setup.py
=====================================
@@ -46,6 +46,7 @@ plugin.methods.register_function(
'chimera_method': qiime2.plugin.Str %
qiime2.plugin.Choices(_CHIM_OPT),
'min_fold_parent_over_abundance': qiime2.plugin.Float,
+ 'allow_one_off': qiime2.plugin.Bool,
'n_threads': qiime2.plugin.Int,
'n_reads_learn': qiime2.plugin.Int,
'hashed_feature_ids': qiime2.plugin.Bool},
@@ -98,6 +99,11 @@ plugin.methods.register_function(
'than or equal to 1 (i.e. parents should be more abundant than '
'the sequence being tested). This parameter has no effect if '
'chimera_method is "none".'),
+ 'allow_one_off': (
+ 'If True, a sequence will be identified as a bimera if it is '
+ 'one mismatch or indel away from an exact bimera.'),
'n_threads': ('The number of threads to use for multithreaded '
'processing. If 0 is provided, all available cores will '
'be used.'),
@@ -143,6 +149,7 @@ plugin.methods.register_function(
'chimera_method': qiime2.plugin.Str %
qiime2.plugin.Choices(_CHIM_OPT),
'min_fold_parent_over_abundance': qiime2.plugin.Float,
+ 'allow_one_off': qiime2.plugin.Bool,
'n_threads': qiime2.plugin.Int,
'n_reads_learn': qiime2.plugin.Int,
'hashed_feature_ids': qiime2.plugin.Bool},
@@ -216,6 +223,11 @@ plugin.methods.register_function(
'than or equal to 1 (i.e. parents should be more abundant than '
'the sequence being tested). This parameter has no effect if '
'chimera_method is "none".'),
+ 'allow_one_off': (
+ 'If True, a sequence will be identified as a bimera if it is '
+ 'one mismatch or indel away from an exact bimera.'),
'n_threads': ('The number of threads to use for multithreaded '
'processing. If 0 is provided, all available cores will '
'be used.'),
@@ -259,6 +271,7 @@ plugin.methods.register_function(
'chimera_method': qiime2.plugin.Str %
qiime2.plugin.Choices(_CHIM_OPT),
'min_fold_parent_over_abundance': qiime2.plugin.Float,
+ 'allow_one_off': qiime2.plugin.Bool,
'n_threads': qiime2.plugin.Int,
'n_reads_learn': qiime2.plugin.Int,
'hashed_feature_ids': qiime2.plugin.Bool},
@@ -313,6 +326,11 @@ plugin.methods.register_function(
'than or equal to 1 (i.e. parents should be more abundant than '
'the sequence being tested). This parameter has no effect if '
'chimera_method is "none".',
+ 'allow_one_off': (
+ 'If True, a sequence will be identified as a bimera if it is '
+ 'one mismatch or indel away from an exact bimera.'),
'n_threads': 'The number of threads to use for multithreaded '
'processing. If 0 is provided, all available cores will '
'be used.',
@@ -359,6 +377,7 @@ plugin.methods.register_function(
'chimera_method': qiime2.plugin.Str %
qiime2.plugin.Choices(_CHIM_OPT),
'min_fold_parent_over_abundance': qiime2.plugin.Float,
+ 'allow_one_off': qiime2.plugin.Bool,
'n_threads': qiime2.plugin.Int,
'n_reads_learn': qiime2.plugin.Int,
'hashed_feature_ids': qiime2.plugin.Bool},
@@ -440,6 +459,11 @@ plugin.methods.register_function(
'than or equal to 1 (i.e. parents should be more abundant than '
'the sequence being tested). Suggest 3.5. '
'This parameter has no effect if chimera_method is "none".',
+ 'allow_one_off': (
+ 'If True, a sequence will be identified as a bimera if it is '
+ 'one mismatch or indel away from an exact bimera.'),
'n_threads': 'The number of threads to use for multithreaded '
'processing. If 0 is provided, all available cores will '
'be used.',
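
The `allow_one_off` parameter registered above presumably reaches dada2's
chimera detection as the `allowOneOff` argument of removeBimeraDenovo() (the
R-side wiring lives in the new run_dada.R and is not shown in this diff, so
treat the mapping as an assumption). A minimal R sketch of the effect, on a
toy sequence table:

    suppressWarnings(library(dada2))
    # Toy samples-by-sequences table; real ones come from makeSequenceTable()
    seqs <- c("ACGTACGTACGTACGTACGT", "TTGCATCGATCGATCGATCG")
    seqtab <- matrix(c(10L, 5L), nrow=1, dimnames=list("sample1", seqs))
    # allowOneOff=TRUE additionally flags sequences that are one mismatch or
    # indel away from an exact bimera of two more-abundant "parents"
    seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus",
                                        minFoldParentOverAbundance=1.0,
                                        allowOneOff=TRUE, multithread=FALSE)
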
=====================================
setup.py
=====================================
@@ -21,9 +21,7 @@ setup(
author="Greg Caporaso and Benjamin Callahan",
author_email="gregcaporaso at gmail.com",
description="Apply DADA2 to generate denoised sequence variants. ",
- scripts=['q2_dada2/assets/run_dada_single.R',
- 'q2_dada2/assets/run_dada_paired.R',
- 'q2_dada2/assets/run_dada_ccs.R'],
+ scripts=['q2_dada2/assets/run_dada.R'],
package_data={
'q2_dada2': ['citations.bib'],
'q2_dada2.tests': ['data/*',
View it on GitLab: https://salsa.debian.org/med-team/q2-dada2/-/commit/072bd30428ac3d1c9f613a7e9737c794d17335b8