[med-svn] [Git][med-team/q2-dada2][master] 5 commits: routine-update: New upstream version
Mohd Bilal (@rmb)
gitlab at salsa.debian.org
Wed Sep 7 15:48:44 BST 2022
Mohd Bilal pushed to branch master at Debian Med / q2-dada2
Commits:
752ae095 by Mohammed Bilal at 2022-09-07T14:12:40+00:00
routine-update: New upstream version
- - - - -
072bd304 by Mohammed Bilal at 2022-09-07T14:12:59+00:00
New upstream version 2022.8.0
- - - - -
cf28a581 by Mohammed Bilal at 2022-09-07T14:13:02+00:00
Update upstream source from tag 'upstream/2022.8.0'
Update to upstream version '2022.8.0'
with Debian dir 898b6c424eb70656db32e7f7e7da9eeb5101f6bf
- - - - -
50a0c873 by Mohammed Bilal at 2022-09-07T14:17:26+00:00
Refresh patch
- - - - -
6ab2c6a2 by Mohammed Bilal at 2022-09-07T19:48:22+05:30
Update dependency on qiime/q2-* >= 2022.8.0
- - - - -
13 changed files:
- .gitignore
- ci/recipe/meta.yaml
- debian/changelog
- debian/control
- debian/patches/RscriptPath.patch
- q2_dada2/_denoise.py
- q2_dada2/_version.py
- + q2_dada2/assets/run_dada.R
- − q2_dada2/assets/run_dada_ccs.R
- − q2_dada2/assets/run_dada_paired.R
- − q2_dada2/assets/run_dada_single.R
- q2_dada2/plugin_setup.py
- setup.py
Changes:
=====================================
.gitignore
=====================================
@@ -69,3 +69,4 @@ target/
*.qzv
.DS_Store
+.Rhistory
=====================================
ci/recipe/meta.yaml
=====================================
@@ -18,8 +18,9 @@ requirements:
run:
- python {{ python }}
- - biom-format >=2.1.5,<2.2a0
+ - biom-format {{ biom_format }}
- bioconductor-dada2 1.22.0
+ - r-optparse >=1.7.1
# openjdk is not a real dependency, but, r-base has a post-link and post-
# activation hook that calls R CMD javareconf, which pokes around for any
# installations of java. On modern versions of macOS, a binary called
=====================================
debian/changelog
=====================================
@@ -1,4 +1,4 @@
-q2-dada2 (2022.2.0-1) UNRELEASED; urgency=medium
+q2-dada2 (2022.8.0-1) UNRELEASED; urgency=medium
* Team upload
* i386 is explicitly excluded by Build-Depends: r-bioc-dada2
=====================================
debian/control
=====================================
@@ -7,7 +7,7 @@ Build-Depends: debhelper-compat (= 13),
dh-python,
python3,
python3-setuptools,
- qiime (>= 2022.2.0),
+ qiime (>= 2022.8.0),
r-bioc-dada2
Standards-Version: 4.6.1
Vcs-Browser: https://salsa.debian.org/med-team/q2-dada2
@@ -22,8 +22,8 @@ Depends: ${shlibs:Depends},
${python3:Depends},
r-base-core,
r-bioc-dada2,
- qiime (>= 2022.2.0),
- q2-types (>= 2022.2.0),
+ qiime (>= 2022.8.0),
+ q2-types (>= 2022.8.0),
python3-biom-format,
python3-skbio
Description: QIIME 2 plugin to work with adapters in sequence data
=====================================
debian/patches/RscriptPath.patch
=====================================
@@ -2,23 +2,18 @@ Author: Steffen Möller
Last-Update: 2020-12-08 19:52:45 +0100
Description: Fix interpreter
-Index: q2-dada2/q2_dada2/assets/run_dada_paired.R
-===================================================================
---- q2-dada2.orig/q2_dada2/assets/run_dada_paired.R
-+++ q2-dada2/q2_dada2/assets/run_dada_paired.R
-@@ -1,4 +1,4 @@
--#!/usr/bin/env Rscript
-+#!/usr/bin/Rscript
-
- ###################################################
- # This R script takes an input two directories of
-Index: q2-dada2/q2_dada2/assets/run_dada_single.R
-===================================================================
---- q2-dada2.orig/q2_dada2/assets/run_dada_single.R
-+++ q2-dada2/q2_dada2/assets/run_dada_single.R
+--- a/q2_dada2/assets/run_dada.R
++++ b/q2_dada2/assets/run_dada.R
@@ -1,4 +1,4 @@
-#!/usr/bin/env Rscript
+#!/usr/bin/Rscript
###################################################
# This R script takes an input directory of .fastq.gz files
+@@ -493,4 +493,4 @@
+ row.names=TRUE, col.names=col.names, quote=FALSE)
+ saveRDS(seqtab.nochim, gsub("tsv", "rds", out.path)) ### TESTING
+
+-q(status=0)
+\ No newline at end of file
++q(status=0)
=====================================
q2_dada2/_denoise.py
=====================================
@@ -52,6 +52,7 @@ _POOL_STR = (lambda x: x in {'pseudo', 'independent'},
'pseudo or independent')
_CHIM_STR = (lambda x: x in {'pooled', 'consensus', 'none'},
'pooled, consensus or none')
+_BOOLEAN = (lambda x: type(x) is bool, 'True or False')
# Better to choose to skip, than to implicitly ignore things that KeyError
_SKIP = (lambda x: True, '')
_valid_inputs = {
@@ -72,6 +73,7 @@ _valid_inputs = {
'pooling_method': _POOL_STR,
'chimera_method': _CHIM_STR,
'min_fold_parent_over_abundance': _NAT_NUM,
+ 'allow_one_off': _BOOLEAN,
'n_threads': _WHOLE_NUM,
# 0 is technically allowed, but we don't want to support it because it only
# takes all reads from the first sample (alphabetically by sample id)
@@ -100,6 +102,10 @@ def _filepath_to_sample(fp):
return fp.rsplit('_', 4)[0]
+# Since `denoise-single` and `denoise-pyro` are almost identical, break out
+# the bulk of the functionality to this helper util. Typechecking is assumed
+# to have occurred in the calling functions, this is primarily for making
+# sure that DADA2 is able to do what it needs to do.
def _denoise_helper(biom_fp, track_fp, hashed_feature_ids):
_check_featureless_table(biom_fp)
with open(biom_fp) as fh:
@@ -162,13 +168,9 @@ def _denoise_helper(biom_fp, track_fp, hashed_feature_ids):
return table, rep_sequences, metadata
-# Since `denoise-single` and `denoise-pyro` are almost identical, break out
-# the bulk of the functionality to this helper util. Typechecking is assumed
-# to have occurred in the calling functions, this is primarily for making
-# sure that DADA2 is able to do what it needs to do.
def _denoise_single(demultiplexed_seqs, trunc_len, trim_left, max_ee, trunc_q,
max_len, pooling_method, chimera_method,
- min_fold_parent_over_abundance,
+ min_fold_parent_over_abundance, allow_one_off,
n_threads, n_reads_learn, hashed_feature_ids,
homopolymer_gap_penalty, band_size):
_check_inputs(**locals())
@@ -178,19 +180,31 @@ def _denoise_single(demultiplexed_seqs, trunc_len, trim_left, max_ee, trunc_q,
if max_len != 0 and max_len < trunc_len:
raise ValueError("trunc_len (%r) must be no bigger than max_len (%r)"
% (trunc_len, max_len))
- # Coerce for `run_dada_single.R`
+ # Coerce for single end read analysis
max_len = 'Inf' if max_len == 0 else max_len
with tempfile.TemporaryDirectory() as temp_dir_name:
biom_fp = os.path.join(temp_dir_name, 'output.tsv.biom')
track_fp = os.path.join(temp_dir_name, 'track.tsv')
- cmd = ['run_dada_single.R',
- str(demultiplexed_seqs), biom_fp, track_fp, temp_dir_name,
- str(trunc_len), str(trim_left), str(max_ee), str(trunc_q),
- str(max_len), str(pooling_method), str(chimera_method),
- str(min_fold_parent_over_abundance), str(n_threads),
- str(n_reads_learn), str(homopolymer_gap_penalty),
- str(band_size)]
+
+ cmd = ['run_dada.R',
+ '--input_directory', str(demultiplexed_seqs),
+ '--output_path', biom_fp,
+ '--output_track', track_fp,
+ '--filtered_directory', temp_dir_name,
+ '--truncation_length', str(trunc_len),
+ '--trim_left', str(trim_left),
+ '--max_expected_errors', str(max_ee),
+ '--truncation_quality_score', str(trunc_q),
+ '--max_length', str(max_len),
+ '--pooling_method', str(pooling_method),
+ '--chimera_method', str(chimera_method),
+ '--min_parental_fold', str(min_fold_parent_over_abundance),
+ '--allow_one_off', str(allow_one_off),
+ '--num_threads', str(n_threads),
+ '--learn_min_reads', str(n_reads_learn),
+ '--homopolymer_gap_penalty', str(homopolymer_gap_penalty),
+ '--band_size', str(band_size)]
try:
run_commands([cmd])
except subprocess.CalledProcessError as e:
@@ -212,6 +226,7 @@ def denoise_single(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
trunc_q: int = 2, pooling_method: str = 'independent',
chimera_method: str = 'consensus',
min_fold_parent_over_abundance: float = 1.0,
+ allow_one_off: bool = False,
n_threads: int = 1, n_reads_learn: int = 1000000,
hashed_feature_ids: bool = True
) -> (biom.Table, DNAIterator, qiime2.Metadata):
@@ -225,6 +240,7 @@ def denoise_single(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
pooling_method=pooling_method,
chimera_method=chimera_method,
min_fold_parent_over_abundance=min_fold_parent_over_abundance,
+ allow_one_off=allow_one_off,
n_threads=n_threads,
n_reads_learn=n_reads_learn,
hashed_feature_ids=hashed_feature_ids,
@@ -240,6 +256,7 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
pooling_method: str = 'independent',
chimera_method: str = 'consensus',
min_fold_parent_over_abundance: float = 1.0,
+ allow_one_off: bool = False,
n_threads: int = 1, n_reads_learn: int = 1000000,
hashed_feature_ids: bool = True
) -> (biom.Table, DNAIterator, qiime2.Metadata):
@@ -266,15 +283,27 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
elif 'R2_001.fastq' in rp.name:
qiime2.util.duplicate(fp, os.path.join(tmp_reverse, rp.name))
- cmd = ['run_dada_paired.R',
- tmp_forward, tmp_reverse, biom_fp, track_fp, filt_forward,
- filt_reverse,
- str(trunc_len_f), str(trunc_len_r),
- str(trim_left_f), str(trim_left_r),
- str(max_ee_f), str(max_ee_r), str(trunc_q),
- str(min_overlap), str(pooling_method),
- str(chimera_method), str(min_fold_parent_over_abundance),
- str(n_threads), str(n_reads_learn)]
+ cmd = ['run_dada.R',
+ '--input_directory', tmp_forward,
+ '--input_directory_reverse', tmp_reverse,
+ '--output_path', biom_fp,
+ '--output_track', track_fp,
+ '--filtered_directory', filt_forward,
+ '--filtered_directory_reverse', filt_reverse,
+ '--truncation_length', str(trunc_len_f),
+ '--truncation_length_reverse', str(trunc_len_r),
+ '--trim_left', str(trim_left_f),
+ '--trim_left_reverse', str(trim_left_r),
+ '--max_expected_errors', str(max_ee_f),
+ '--max_expected_errors_reverse', str(max_ee_r),
+ '--truncation_quality_score', str(trunc_q),
+ '--min_overlap', str(min_overlap),
+ '--pooling_method', str(pooling_method),
+ '--chimera_method', str(chimera_method),
+ '--min_parental_fold', str(min_fold_parent_over_abundance),
+ '--allow_one_off', str(allow_one_off),
+ '--num_threads', str(n_threads),
+ '--learn_min_reads', str(n_reads_learn)]
try:
run_commands([cmd])
except subprocess.CalledProcessError as e:
@@ -301,6 +330,7 @@ def denoise_pyro(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
pooling_method: str = 'independent',
chimera_method: str = 'consensus',
min_fold_parent_over_abundance: float = 1.0,
+ allow_one_off: bool = False,
n_threads: int = 1, n_reads_learn: int = 250000,
hashed_feature_ids: bool = True
) -> (biom.Table, DNAIterator, qiime2.Metadata):
@@ -314,10 +344,11 @@ def denoise_pyro(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
pooling_method=pooling_method,
chimera_method=chimera_method,
min_fold_parent_over_abundance=min_fold_parent_over_abundance,
+ allow_one_off=allow_one_off,
n_threads=n_threads,
n_reads_learn=n_reads_learn,
hashed_feature_ids=hashed_feature_ids,
- homopolymer_gap_penalty='-1',
+ homopolymer_gap_penalty='1',
band_size='32')
@@ -329,6 +360,7 @@ def denoise_ccs(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
pooling_method: str = 'independent',
chimera_method: str = 'consensus',
min_fold_parent_over_abundance: float = 3.5,
+ allow_one_off: bool = False,
n_threads: int = 1, n_reads_learn: int = 1000000,
hashed_feature_ids: bool = True
) -> (biom.Table, DNAIterator, qiime2.Metadata):
@@ -339,7 +371,7 @@ def denoise_ccs(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
if max_len != 0 and max_len < trunc_len:
raise ValueError("trunc_len (%r) must be no bigger than max_len (%r)"
% (trunc_len, max_len))
- # Coerce for `run_dada_ccs.R`
+ # Coerce for ccs read analysis
max_len = 'Inf' if max_len == 0 else max_len
with tempfile.TemporaryDirectory() as temp_dir_name:
@@ -350,13 +382,30 @@ def denoise_ccs(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,
for fp in nop_fp, filt_fp:
os.mkdir(fp)
- cmd = ['run_dada_ccs.R',
- str(demultiplexed_seqs), biom_fp, track_fp, nop_fp, filt_fp,
- str(front), str(adapter), str(max_mismatch), str(indels),
- str(trunc_len), str(trim_left), str(max_ee), str(trunc_q),
- str(min_len), str(max_len), str(pooling_method),
- str(chimera_method), str(min_fold_parent_over_abundance),
- str(n_threads), str(n_reads_learn)]
+ cmd = ['run_dada.R',
+ '--input_directory', str(demultiplexed_seqs),
+ '--output_path', biom_fp,
+ '--output_track', track_fp,
+ '--removed_primer_directory', nop_fp,
+ '--filtered_directory', filt_fp,
+ '--forward_primer', str(front),
+ '--reverse_primer', str(adapter),
+ '--max_mismatch', str(max_mismatch),
+ '--indels', str(indels),
+ '--truncation_length', str(trunc_len),
+ '--trim_left', str(trim_left),
+ '--max_expected_errors', str(max_ee),
+ '--truncation_quality_score', str(trunc_q),
+ '--min_length', str(min_len),
+ '--max_length', str(max_len),
+ '--pooling_method', str(pooling_method),
+ '--chimera_method', str(chimera_method),
+ '--min_parental_fold', str(min_fold_parent_over_abundance),
+ '--allow_one_off', str(allow_one_off),
+ '--num_threads', str(n_threads),
+ '--learn_min_reads', str(n_reads_learn),
+ '--homopolymer_gap_penalty', 'NULL',
+ '--band_size', '32']
try:
run_commands([cmd])
except subprocess.CalledProcessError as e:
=====================================
q2_dada2/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2022.2.0)"
- git_full = "f58e203cfcf4082efacaa3e7f8d348919de72b90"
- git_date = "2022-02-18 19:05:49 +0000"
+ git_refnames = " (tag: 2022.8.0)"
+ git_full = "dc9bdc5b0346353ebf9a63bc63a60bd45a3405b6"
+ git_date = "2022-08-23 17:11:59 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_dada2/assets/run_dada.R
=====================================
@@ -0,0 +1,496 @@
+#!/usr/bin/env Rscript
+
+###################################################
+# This R script takes an input directory of .fastq.gz files
+# and outputs a tsv file of the dada2 processed sequence
+# table. It is intended for use with the QIIME2 plugin
+# for DADA2.
+#
+# Ex: Rscript run_dada_single.R input_dir output.tsv track.tsv filtered_dir 200 0 2.0 2 Inf pooled 1.0 0 1000000 NULL 32
+####################################################
+
+####################################################
+# DESCRIPTION OF ARGUMENTS #
+####################################################
+# NOTE: All numeric arguments should be zero or positive.
+# NOTE: All numeric arguments save maxEE are expected to be integers.
+# NOTE: Currently the filterered_dir must already exist.
+# NOTE: ALL ARGUMENTS ARE POSITIONAL!
+#
+### FILE SYSTEM ARGUMENTS ###
+#
+# 1) input_directory - File path to directory with the .fastq.gz files to be processed.
+# Ex: path/to/dir/with/fastqgzs
+#
+# 2) output_path - File path to output tsv file. If already exists, will be overwritten.
+# Ex: path/to/output_file.tsv
+#
+# 3) output_track - File path to tracking tsv file. If already exists, will be overwritten.
+# Ex: path/to/tracking_stats.tsv
+#
+# 4) removed_primer_directory - File path to directory in which to write the primer.removed .fastq.gz
+# files. These files are intermediate for the full workflow.
+# Currently they remain after the script finishes.
+# Directory must already exist.
+# Ex: path/to/dir/with/fastqgzs/primerremoved
+#
+# 5) filtered_directory - File path to directory in which to write the filtered .fastq.gz files.
+# These files are intermediate for the full workflow.
+# Currently they remain after the script finishes.
+# Directory must already exist.
+# Ex: path/to/dir/with/fastqgzs/filtered
+#
+### PRIMER REMOVING ARGUMENTS ###
+#
+# 6) forward_primer - Primer front of Pacbio CCS sequences.
+# Ex: 'AGRGTTYGATYMTGGCTCAG'
+#
+# 7) reverse_primer - Primer adapter of Pacbio CCS sequences.
+# Ex: 'RGYTACCTTGTTACGACTT'
+#
+# 8) max_mismatch - The number of mismatches to tolerate when matching
+# reads to primer sequences.
+# Ex: 2
+#
+# 9) indels - Allow insertions or deletions of bases when matching adapters.
+# Ex: FALSE
+#
+### FILTERING ARGUMENTS ###
+#
+# 10) truncation_length - The position at which to truncate reads. Reads shorter
+# than truncation_length will be discarded.
+# Special values: 0 - no truncation or length filtering.
+# Ex: 150
+#
+# 11) trim_left - The number of nucleotides to remove from the start of
+# each read. Should be less than truncation_length for obvious reasons.
+# Ex: 0
+#
+# 12) max_expected_errors - Reads with expected errors higher than maxEE are discarded.
+# Ex: 2.0
+#
+# 13) truncation_quality_score - Reads are truncated at the first instance of quality score truncation_quality_score.
+# If the read is then shorter than truncation_length , it is discarded.
+# Ex: 2
+#
+# 14) min_length - Remove reads with length shorter than min_length. min_length is enforced
+# after trimming and truncation.
+# Default Inf - no maximum.
+#
+# 15) max_length - Remove reads with length greater than max_length. max_length is enforced on the raw reads.
+# Default Inf - no maximum.
+# Ex: 300
+#
+### SENSITIVITY ARGUMENTS ###
+#
+# 16) pooling_method- The method used to pool (or not) samples during denoising.
+# Valid options are:
+# independent: (Default) No pooling, samples are denoised indpendently.
+# pseudo: Samples are "pseudo-pooled" for denoising.
+# Ex: independent
+#
+#
+### CHIMERA ARGUMENTS ###
+#
+# 17) chimera_method - The method used to remove chimeras. Valid options are:
+# none: No chimera removal is performed.
+# pooled: All reads are pooled prior to chimera detection.
+# consensus: Chimeras are detected in samples individually, and a consensus decision
+# is made for each sequence variant.
+# Ex: consensus
+#
+# 18) min_parental_fold - The minimum abundance of potential "parents" of a sequence being
+# tested as chimeric, expressed as a fold-change versus the abundance of the sequence being
+# tested. Values should be greater than or equal to 1 (i.e. parents should be more
+# abundant than the sequence being tested).
+# Ex: 1.0
+#
+### SPEED ARGUMENTS ###
+#
+# 19) num_threads - The number of threads to use.
+# Special values: 0 - detect available cores and use all..
+# Ex: 1
+#
+# 20) learn_min_reads - The minimum number of reads to learn the error model from.
+# Special values: 0 - Use all input reads.
+# Ex: 1000000
+#
+### GLOBAL OPTION ARGUMENTS ###
+#
+# 21) homopolymer_gap_penalty - The cost of gaps in homopolymer regions (>=3 repeated bases).
+# Default is NULL, which causes homopolymer gaps
+# to be treated as normal gaps.
+# Ex: -1
+#
+# 22) band_size - When set, banded Needleman-Wunsch alignments are performed.
+# The default value of band_size is 16. Setting BAND_SIZE to a negative
+# number turns off banding (i.e. full Needleman-Wunsch).
+# Ex: 32
+#
+
+library("optparse")
+
+cat(R.version$version.string, "\n")
+errQuit <- function(mesg, status=1) { message("Error: ", mesg); q(status=status) }
+getN <- function(x) sum(getUniques(x)) #Function added from paired read processing
+
+option_list = list(
+ make_option(c("--input_directory"), action="store", default='NULL', type='character',
+ help="File path to directory with the .fastq.gz files to be processed"),
+ make_option(c("--input_directory_reverse"), action="store", default='NULL', type='character',
+ help="File path to reverse directory with the .fastq.gz files to be processed. Only used in paired-end processing"),
+ make_option(c("--output_path"), action="store", default='NULL', type='character',
+ help="File path to output tsv file. If already exists, will be overwritten"),
+ make_option(c("--output_track"), action="store", default='NULL', type='character',
+ help="File path to tracking tsv file. If already exists, will be overwritten"),
+ make_option(c("--removed_primer_directory"), action="store", default='NULL', type='character',
+ help="File path to directory in which to write the primer.removed .fastq.gz files"),
+ make_option(c("--filtered_directory"), action="store", default='NULL', type='character',
+ help="File path to directory in which to write the filtered .fastq.gz files. These files are intermediate"),
+ make_option(c("--filtered_directory_reverse"), action="store", default='NULL', type='character',
+ help="File path to directory in which to write the reverse filtered .fastq.gz files. These files are intermediate. Only used in paired-end processing"),
+ make_option(c("--forward_primer"), action="store", default='NULL', type='character',
+ help="Primer front of Pacbio CCS sequences"),
+ make_option(c("--reverse_primer"), action="store", default='NULL', type='character',
+ help="Primer adapter of Pacbio CCS sequences"),
+ make_option(c("--max_mismatch"), action="store", default='NULL', type='character',
+ help="The number of mismatches to tolerate when matching reads to primer sequences."),
+ make_option(c("--indels"), action="store", default='NULL', type='character',
+ help="Allow insertions or deletions of bases when matching adapters"),
+ make_option(c("--truncation_length"), action="store", default='NULL', type='character',
+ help="The position at which to truncate reads. Reads shorter then truncation_length will be discarded."),
+ make_option(c("--truncation_length_reverse"), action="store", default='NULL', type='character',
+ help="The position at which to truncate reverse reads. Reads shorter then truncation_length will be discarded. Only used in paired-end processing"),
+ make_option(c("--trim_left"), action="store", default='NULL', type='character',
+ help="The number of nucleotides to remove from the start of each read. Should be less than truncation_length for obvious reasons"),
+ make_option(c("--trim_left_reverse"), action="store", default='NULL', type='character',
+ help="The number of nucleotides to remove from the start of each reverse read. Should be less than truncation_length for obvious reasons. Only used in paired-end processing"),
+ make_option(c("--max_expected_errors"), action="store", default='NULL', type='character',
+ help="Reads with expected errors higher than max_expected_errors are discarded"),
+ make_option(c("--max_expected_errors_reverse"), action="store", default='NULL', type='character',
+ help="Reverse reads with expected errors higher than max_expected_errors are discarded. Only used in paired-end processing"),
+ make_option(c("--truncation_quality_score"), action="store", default='NULL', type='character',
+ help="Reads are truncated at the first instance of quality score truncation_quality_score.If the read is then shorter than truncation_length, it is discarded"),
+ make_option(c("--min_length"), action="store", default='NULL', type='character',
+ help="Remove reads with length shorter than min_length. min_length is enforced after trimming and truncation."),
+ make_option(c("--max_length"), action="store", default='NULL', type='character',
+ help="Remove reads with length greater than max_length. max_length is enforced on the raw reads."),
+ make_option(c("--min_overlap"), action="store", default='NULL', type='character',
+ help="Minimum overlap allowed between reads"),
+ make_option(c("--pooling_method"), action="store", default='NULL', type='character',
+ help="The method used to pool (or not) samples during denoising (independent/pseudo)"),
+ make_option(c("--chimera_method"), action="store", default='NULL', type='character',
+ help="The method used to remove chimeras (none/pooled/consensus)"),
+ make_option(c("--min_parental_fold"), action="store", default='NULL', type='character',
+ help="The minimum abundance of potential parents of a sequence being tested as chimeric, expressed as a fold-change versus the abundance of the sequence being tested. Values should be greater than or equal to 1"),
+ make_option(c("--allow_one_off"), action="store", default='NULL', type='character',
+ help="Bimeras that are one-off (one mismatch/indel away from an exact bimera) are also identified if allow_one_off is TRUE."),
+ make_option(c("--num_threads"), action="store", default='NULL', type='character',
+ help="The number of threads to use"),
+ make_option(c("--learn_min_reads"), action="store", default='NULL', type='character',
+ help="The minimum number of reads to learn the error model from"),
+ make_option(c("--homopolymer_gap_penalty"), action="store", default='NULL', type='character',
+ help="The cost of gaps in homopolymer regions (>=3 repeated bases).Default is NULL, which causes homopolymer gaps to be treated as normal gaps."),
+ make_option(c("--band_size"), action="store", default='NULL', type='character',
+ help="When set, banded Needleman-Wunsch alignments are performed.")
+)
+opt = parse_args(OptionParser(option_list=option_list))
+
+
+# Assign each of the arguments, in positional order, to an appropriately named R variable
+inp.dir <- opt$input_directory
+inp.dirR <- opt$input_directory_reverse #added from paired arguments
+out.path <- opt$output_path
+out.track <- opt$output_track
+primer.removed.dir <- opt$removed_primer_directory #added from CCS arguments
+filtered.dir <- opt$filtered_directory
+filtered.dirR<- opt$filtered_directory_reverse #added from paired arguments
+primer <- opt$forward_primer #added from CCS arguments
+primerR <- opt$reverse_primer #added from CCS arguments
+maxMismatch <- if(opt$max_mismatch=='NULL') NULL else as.numeric(opt$max_mismatch) #added from CCS arguments
+indels <- if(opt$indels=='NULL') NULL else as.logical(opt$indels) #added from CCS arguments
+truncLen <- if(opt$truncation_length=='NULL') NULL else as.integer(opt$truncation_length)
+truncLenR <- if(opt$truncation_length_reverse=='NULL') NULL else as.integer(opt$truncation_length_reverse) #added from paired arguments
+trimLeft <- if(opt$trim_left=='NULL') NULL else as.integer(opt$trim_left)
+trimLeftR<-if(opt$trim_left_reverse=='NULL') NULL else as.integer(opt$trim_left_reverse) #added from paired arguments
+maxEE <- if(opt$max_expected_errors=='NULL') NULL else as.numeric(opt$max_expected_errors)
+maxEER <- if(opt$max_expected_errors_reverse=='NULL') NULL else as.numeric(opt$max_expected_errors_reverse) #added from paired arguments
+truncQ <- if(opt$truncation_quality_score=='NULL') NULL else as.integer(opt$truncation_quality_score)
+minLen <- if(opt$min_length=='NULL') NULL else as.numeric(opt$min_length) #added from CCS arguments
+maxLen <- if(opt$max_length=='NULL') NULL else as.numeric(opt$max_length) # Allows Inf
+minOverlap <- if(opt$min_overlap=='NULL') NULL else as.integer(opt$min_overlap) #added from paired arguments
+poolMethod <- opt$pooling_method
+chimeraMethod <- opt$chimera_method
+minParentFold <- if(opt$min_parental_fold=='NULL') NULL else as.numeric(opt$min_parental_fold)
+allowOneOff <-if(opt$allow_one_off=='NULL') NULL else as.logical(opt$allow_one_off)
+nthreads <- if(opt$num_threads=='NULL') NULL else as.integer(opt$num_threads)
+nreads.learn <- if(opt$learn_min_reads=='NULL') NULL else as.integer(opt$learn_min_reads)
+# The following args are not directly exposed to end users in q2-dada2,
+# but rather indirectly, via the methods `denoise-single` and `denoise-pyro`.
+if (opt$homopolymer_gap_penalty=='NULL'){
+ HOMOPOLYMER_GAP_PENALTY<-NULL
+}else{
+ HOMOPOLYMER_GAP_PENALTY<-as.integer(opt$homopolymer_gap_penalty)
+ if(HOMOPOLYMER_GAP_PENALTY>0){
+ HOMOPOLYMER_GAP_PENALTY<-HOMOPOLYMER_GAP_PENALTY*(-1) #negative numbers cannot be passed using optparse, so we convert it here
+ }
+}
+BAND_SIZE <- if(opt$band_size=='NULL') NULL else as.integer(opt$band_size)
+
+### VALIDATE ARGUMENTS ###
+# Input directory is expected to contain .fastq.gz file(s)
+# that have not yet been filtered and globally trimmed
+# to the same length.
+if(!dir.exists(inp.dir)) {
+ errQuit("Input directory does not exist.")
+} else {
+ unfilts <- list.files(inp.dir, pattern=".fastq.gz$", full.names=TRUE)
+ if(length(unfilts) == 0) {
+ errQuit("No input files with the expected filename format found.")
+ }
+ if(inp.dirR!='NULL'){
+ unfiltsR <- list.files(inp.dirR, pattern=".fastq.gz$", full.names=TRUE)
+ if(length(unfiltsR) == 0) {
+ errQuit("No input reverse files with the expected filename format found.")
+ }
+ if(length(unfilts) != length(unfiltsR)) {
+ errQuit("Different numbers of forward and reverse .fastq.gz files.")
+ }
+
+ }
+
+}
+
+# Output files are to be filenames (not directories) and are to be
+# removed and replaced if already present.
+for(fn in c(out.path, out.track)) {
+ if(dir.exists(fn)) {
+ errQuit("Output filename ", fn, " is a directory.")
+ } else if(file.exists(fn)) {
+ invisible(file.remove(fn))
+ }
+}
+
+# Convert nthreads to the logical/numeric expected by dada2
+if(nthreads < 0) {
+ errQuit("nthreads must be non-negative.")
+} else if(nthreads == 0) {
+ multithread <- TRUE # detect and use all
+} else if(nthreads == 1) {
+ multithread <- FALSE
+} else {
+ multithread <- nthreads
+}
+
+### LOAD LIBRARIES ###
+suppressWarnings(library(methods))
+suppressWarnings(library(dada2))
+cat("DADA2:", as.character(packageVersion("dada2")), "/",
+ "Rcpp:", as.character(packageVersion("Rcpp")), "/",
+ "RcppParallel:", as.character(packageVersion("RcppParallel")), "\n")
+
+### Remove Primers ###
+if(primer.removed.dir!='NULL'){ #for CCS read analysis
+ cat("1) Removing Primers\n")
+ nop <- file.path(primer.removed.dir, basename(unfilts))
+ prim <- suppressWarnings(removePrimers(unfilts, nop, primer, dada2:::rc(primerR),
+ max.mismatch = maxMismatch, allow.indels = indels,
+ orient = TRUE, verbose = TRUE))
+ cat(ifelse(file.exists(nop), ".", "x"), sep="")
+ nop <- list.files(primer.removed.dir, pattern=".fastq.gz$", full.names=TRUE)
+ cat("\n")
+ if(length(nop) == 0) { # All reads were filtered out
+ errQuit("No reads passed the Removing Primers step (Did you select the right primers?)", status=2)
+ }
+}
+
+### TRIM AND FILTER ###
+cat("2) Filtering ")
+if(primer.removed.dir!='NULL'){ #for CCS read analysis
+ filts <- file.path(filtered.dir, basename(nop))
+ out <- suppressWarnings(filterAndTrim(nop, filts, truncLen = truncLen, trimLeft = trimLeft,
+ maxEE = maxEE, truncQ = truncQ, rm.phix = FALSE,
+ multithread = multithread, maxLen = maxLen, minLen = minLen, minQ = 3))
+}else{
+ filts <- file.path(filtered.dir, basename(unfilts))
+ if(inp.dirR!='NULL'){#for paired read analysis
+ filtsR <- file.path(filtered.dirR, basename(unfiltsR))
+ out <- suppressWarnings(filterAndTrim(unfilts, filts, unfiltsR, filtsR,
+ truncLen=c(truncLen, truncLenR), trimLeft=c(trimLeft, trimLeftR),
+ maxEE=c(maxEE, maxEER), truncQ=truncQ, rm.phix=TRUE,
+ multithread=multithread))
+ }else{#for sinlge/pyro read analysis
+ out <- suppressWarnings(filterAndTrim(unfilts, filts, truncLen=truncLen, trimLeft=trimLeft,
+ maxEE=maxEE, truncQ=truncQ, rm.phix=TRUE,
+ multithread=multithread, maxLen=maxLen))
+ }
+}
+
+cat(ifelse(file.exists(filts), ".", "x"), sep="")
+filts <- list.files(filtered.dir, pattern=".fastq.gz$", full.names=TRUE)
+
+if(inp.dirR!='NULL'){
+ filtsR <- list.files(filtered.dirR, pattern=".fastq.gz$", full.names=TRUE)
+}
+cat("\n")
+if(length(filts) == 0) { # All reads were filtered out
+ errQuit("No reads passed the filter (was truncLen longer than the read length?)", status=2)
+}
+
+### LEARN ERROR RATES ###
+# Dereplicate enough samples to get nreads.learn total reads
+cat("3) Learning Error Rates\n")
+if(primer.removed.dir!='NULL'){#for CCS read analysis
+ err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn,
+ errorEstimationFunction=dada2:::PacBioErrfun,
+ multithread=multithread, BAND_SIZE=BAND_SIZE))
+
+}else if(inp.dirR!='NULL'){#for paired read analysis
+
+ err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, multithread=multithread))
+ errR <- suppressWarnings(learnErrors(filtsR, nreads=nreads.learn, multithread=multithread))
+
+}else{#for sinlge/pyro read analysis
+ err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, multithread=multithread,
+ HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY, BAND_SIZE=BAND_SIZE))
+}
+
+### PROCESS ALL SAMPLES ###
+# Loop over rest in streaming fashion with learned error rates
+
+if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
+ dds <- vector("list", length(filts))
+ cat("4) Denoise samples ")
+ cat("\n")
+ for(j in seq(length(filts))) {
+ drp <- derepFastq(filts[[j]])
+ dds[[j]] <- dada(drp, err=err, multithread=multithread,HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY,
+ BAND_SIZE=BAND_SIZE, verbose=FALSE)
+ cat(".")
+ }
+ cat("\n")
+ if(poolMethod == "pseudo") {
+ cat(" Pseudo-pool step ")
+ ### TEMPORARY, to be removed once 1.12 makes its way to Q2
+ ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately.
+ ### pseudo_priors code copied from dada2.R
+ st <- makeSequenceTable(dds)
+ pseudo_priors <- colnames(st)[colSums(st>0) >= 2 | colSums(st) >= Inf]
+ rm(st)
+ ### \pseudo_priors code copied from dada2.R
+ ### code copied from previous loop through samples in this script
+ for(j in seq(length(filts))) {
+ drp <- derepFastq(filts[[j]])
+ dds[[j]] <- dada(drp, err=err, multithread=multithread,
+ priors=pseudo_priors, HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY,
+ BAND_SIZE=BAND_SIZE, verbose=FALSE)
+ cat(".")
+ }
+ cat("\n")
+ ### \code copied from previous loop through samples in this script
+ }
+ # Make sequence table
+ seqtab <- makeSequenceTable(dds)
+}else{#for paired read analysis
+ denoisedF <- rep(0, length(filts))
+ ddsF <- vector("list", length(filts))
+ ddsR <- vector("list", length(filts))
+ mergers <- vector("list", length(filts))
+ cat("3) Denoise samples ")
+
+ for(j in seq(length(filts))) {
+ drpF <- derepFastq(filts[[j]])
+ ddsF[[j]] <- dada(drpF, err=err, multithread=multithread, verbose=FALSE)
+ drpR <- derepFastq(filtsR[[j]])
+ ddsR[[j]] <- dada(drpR, err=errR, multithread=multithread, verbose=FALSE)
+ cat(".")
+ }
+ cat("\n")
+ if(poolMethod == "pseudo") {
+ cat(" Pseudo-pool step ")
+ ### TEMPORARY, to be removed once 1.12 makes its way to Q2
+ ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately.
+ ### pseudo_priors code copied from dada2.R
+ stF <- makeSequenceTable(ddsF)
+ pseudo_priorsF <- colnames(stF)[colSums(stF>0) >= 2 | colSums(stF) >= Inf]
+ rm(stF)
+ stR <- makeSequenceTable(ddsR)
+ pseudo_priorsR <- colnames(stR)[colSums(stR>0) >= 2 | colSums(stR) >= Inf]
+ rm(stR)
+ ### \pseudo_priors code copied from dada2.R
+ ### code copied from previous loop through samples in this script
+ for(j in seq(length(filts))) {
+ drpF <- derepFastq(filts[[j]])
+ ddsF[[j]] <- dada(drpF, err=err, priors=pseudo_priorsF,
+ multithread=multithread, verbose=FALSE)
+ drpR <- derepFastq(filtsR[[j]])
+ ddsR[[j]] <- dada(drpR, err=errR, priors=pseudo_priorsR,
+ multithread=multithread, verbose=FALSE)
+ cat(".")
+ }
+ cat("\n")
+ ### \code copied from previous loop through samples in this script
+ }
+
+ ### Now loop through and do merging
+ for(j in seq(length(filts))) {
+ drpF <- derepFastq(filts[[j]])
+ drpR <- derepFastq(filtsR[[j]])
+ mergers[[j]] <- mergePairs(ddsF[[j]], drpF, ddsR[[j]], drpR, minOverlap=minOverlap)
+ denoisedF[[j]] <- getN(ddsF[[j]])
+ cat(".")
+ }
+ cat("\n")
+ # Make sequence table
+ seqtab <- makeSequenceTable(mergers)
+
+}
+
+
+### Remove chimeras
+cat("5) Remove chimeras (method = ", chimeraMethod, ")\n", sep="")
+if(chimeraMethod %in% c("pooled", "consensus")) {
+ seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, allowOneOff=allowOneOff, multithread=multithread)
+} else { # No chimera removal, copy seqtab to seqtab.nochim
+ seqtab.nochim <- seqtab
+}
+
+### REPORT READ FRACTIONS THROUGH PIPELINE ###
+cat("6) Report read numbers through the pipeline\n")
+if(inp.dirR =='NULL'){
+ # Handle edge cases: Samples lost in filtering; One sample
+ if(primer.removed.dir!='NULL'){ #for CCS read analysis
+ track <- cbind(prim,out[ ,2], matrix(0, nrow=nrow(out), ncol=2))
+ colnames(track) <- c("input", "primer-removed", "filtered", "denoised", "non-chimeric")
+ }else{ #for sinlge/pyro read analysis
+ track <- cbind(out, matrix(0, nrow=nrow(out), ncol=2))
+ colnames(track) <- c("input", "filtered", "denoised", "non-chimeric")
+ }
+ passed.filtering <- track[,"filtered"] > 0
+ track[passed.filtering,"denoised"] <- rowSums(seqtab)
+ track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
+ write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
+ quote=FALSE)
+}else{#for paired end reads
+ # Handle edge cases: Samples lost in filtering; One sample
+ track <- cbind(out, matrix(0, nrow=nrow(out), ncol=3))
+ colnames(track) <- c("input", "filtered", "denoised", "merged", "non-chimeric")
+ passed.filtering <- track[,"filtered"] > 0
+ track[passed.filtering,"denoised"] <- denoisedF
+ track[passed.filtering,"merged"] <- rowSums(seqtab)
+ track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
+ write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
+ quote=FALSE)
+}
+
+### WRITE OUTPUT AND QUIT ###
+# Formatting as tsv plain-text sequence table table
+cat("7) Write output\n")
+seqtab.nochim <- t(seqtab.nochim) # QIIME has OTUs as rows
+col.names <- basename(filts)
+col.names[[1]] <- paste0("#OTU ID\t", col.names[[1]])
+write.table(seqtab.nochim, out.path, sep="\t",
+ row.names=TRUE, col.names=col.names, quote=FALSE)
+saveRDS(seqtab.nochim, gsub("tsv", "rds", out.path)) ### TESTING
+
+q(status=0)
\ No newline at end of file
=====================================
q2_dada2/assets/run_dada_ccs.R deleted
=====================================
@@ -1,294 +0,0 @@
-#!/usr/bin/env Rscript
-
-###################################################
-# This R script takes an input directory of .fastq.gz files
-# and outputs a tsv file of the dada2 processed sequence
-# table. It is intended for use with the QIIME2 plugin
-# for DADA2.
-#
-# Ex: Rscript run_dada_single.R input_dir output.tsv track.tsv
-# primer_removed_dir filtered_dir front adapter 2 FALSE 0 0 2.0 2 20 Inf
-# pseudo pooled 1.0 0 1000000
-####################################################
-
-####################################################
-# DESCRIPTION OF ARGUMENTS #
-####################################################
-# NOTE: All numeric arguments should be zero or positive.
-# NOTE: All numeric arguments save maxEE are expected to be integers.
-# NOTE: Currently the primer_removed_dir must already exist.
-# NOTE: Currently the filterered_dir must already exist.
-# NOTE: ALL ARGUMENTS ARE POSITIONAL!
-#
-### FILE SYSTEM ARGUMENTS ###
-#
-# 1) File path to directory with the .fastq.gz files to be processed.
-# Ex: path/to/dir/with/fastqgzs
-#
-# 2) File path to output tsv file. If already exists, will be overwritten.
-# Ex: path/to/output_file.tsv
-#
-# 3) File path to tracking tsv file. If already exists, will be overwritte.
-# Ex: path/to/tracking_stats.tsv
-#
-# 4) File path to directory in which to write the primer.removed .fastq.gz
-# files. These files are intermediate for the full workflow.
-# Currently they remain after the script finishes.
-# Directory must already exist.
-# Ex: path/to/dir/with/fastqgzs/primerremoved
-#
-# 5) File path to directory in which to write the filtered .fastq.gz files.
-# These files are intermediate for the full workflow.
-# Currently they remain after the script finishes.
-# Directory must already exist.
-# Ex: path/to/dir/with/fastqgzs/filtered
-#
-### PRIMER REMOVING ARGUMENTS ###
-#
-# 6) primerF - Primer front of Pacbio CCS sequences.
-# Ex: 'AGRGTTYGATYMTGGCTCAG'
-#
-# 7) primerR - Primer adapter of Pacbio CCS sequences.
-# Ex: 'RGYTACCTTGTTACGACTT'
-#
-# 8) maxMismatch - The number of mismatches to tolerate when matching
-# reads to primer sequences.
-# Ex: 2
-#
-# 9) indels - Allow insertions or deletions of bases when matching adapters.
-# Ex: FALSE
-#
-### FILTERING ARGUMENTS ###
-#
-# 10) truncLen - The position at which to truncate reads. Reads shorter
-# than truncLen will be discarded.
-# Special values: 0 - no truncation or length filtering.
-# Ex: 150
-#
-# 11) trimLeft - The number of nucleotides to remove from the start of
-# each read. Should be less than truncLen for obvious reasons.
-# Ex: 0
-#
-# 12) maxEE - Reads with expected errors higher than maxEE are discarded..
-# Ex: 2.0
-#
-# 13) truncQ - Reads are truncated at the first instance of quality score truncQ.
-# If the read is then shorter than truncLen, it is discarded.
-# Ex: 2
-#
-# 14) minLen - Remove reads with length shorter than minLen. minLen is enforced
-# after trimming and truncation.
-# Default Inf - no maximum.
-# Ex: 20
-#
-# 15) maxLen - Remove reads with length greater than maxLen. maxLen is enforced
-# on the raw reads.
-# Default Inf - no maximum.
-# Ex: 300
-#
-### SENSITIVITY ARGUMENTS ###
-#
-# 16) poolMethod - The method used to pool (or not) samples during denoising.
-# Valid options are:
-# independent: (Default) No pooling, samples are denoised indpendently.
-#
-# pseudo: Samples are "pseudo-pooled" for denoising.
-# Ex: independent
-#
-#
-### CHIMERA ARGUMENTS ###
-#
-# 17) chimeraMethod - The method used to remove chimeras. Valid options are:
-# none: No chimera removal is performed.
-# pooled: All reads are pooled prior to chimera detection.
-# consensus: Chimeras are detect in samples individually, and a
-# consensus decision is made for each sequence variant.
-# Ex: consensus
-#
-# 18) minParentFold - The minimum abundance of potential "parents" of a sequence
-# being tested as chimeric, expressed as a fold-change
-# versus the abundance of the sequence being tested. Values
-# should be greater than or equal to 1 (i.e. parents should
-# be more abundant than the sequence being tested).
-# Ex: 1.0
-#
-### SPEED ARGUMENTS ###
-#
-# 19) nthreads - The number of threads to use.
-# Special values: 0 - detect available cores and use all..
-# Ex: 1
-#
-# 20) nreads_learn - The minimum number of reads to learn the error model from.
-# Special values: 0 - Use all input reads.
-# Ex: 1000000
-#
-#
-#
-
-cat(R.version$version.string, "\n")
-errQuit <- function(mesg, status=1) { message("Error: ", mesg); q(status=status) }
-args <- commandArgs(TRUE)
-
-# Assign each of the arguments, in positional order, to an appropriately named R variable
-inp.dir <- args[[1]]
-out.path <- args[[2]]
-out.track <- args[[3]]
-primer.removed.dir <- args[[4]]
-filtered.dir <- args[[5]]
-primerF <- args[[6]]
-primerR <- args[[7]]
-maxMismatch <- as.numeric(args[[8]])
-indels <- as.logical(args[[9]])
-truncLen <- as.integer(args[[10]])
-trimLeft <- as.integer(args[[11]])
-maxEE <- as.numeric(args[[12]])
-truncQ <- as.integer(args[[13]])
-minLen <- as.numeric(args[[14]])
-maxLen <- as.numeric(args[[15]]) # Allows Inf
-poolMethod <- args[[16]]
-chimeraMethod <- args[[17]]
-minParentFold <- as.numeric(args[[18]])
-nthreads <- as.integer(args[[19]])
-nreads.learn <- as.integer(args[[20]])
-
-### VALIDATE ARGUMENTS ###
-
-# Input directory is expected to contain .fastq.gz file(s)
-# that have not yet been filtered and globally trimmed
-# to the same length.
-if(!dir.exists(inp.dir)) {
- errQuit("Input directory does not exist.")
-} else {
- unfilts <- list.files(inp.dir, pattern=".fastq.gz$", full.names=TRUE)
- if(length(unfilts) == 0) {
- errQuit("No input files with the expected filename format found.")
- }
-}
-
-# Output files are to be filenames (not directories) and are to be
-# removed and replaced if already present.
-for(fn in c(out.path, out.track)) {
- if(dir.exists(fn)) {
- errQuit("Output filename ", fn, " is a directory.")
- } else if(file.exists(fn)) {
- invisible(file.remove(fn))
- }
-}
-
-# Convert nthreads to the logical/numeric expected by dada2
-if(nthreads < 0) {
- errQuit("nthreads must be non-negative.")
-} else if(nthreads == 0) {
- multithread <- TRUE # detect and use all
-} else if(nthreads == 1) {
- multithread <- FALSE
-} else {
- multithread <- nthreads
-}
-
-### LOAD LIBRARIES ###
-suppressWarnings(library(methods))
-suppressWarnings(library(dada2))
-cat("DADA2:", as.character(packageVersion("dada2")), "/",
- "Rcpp:", as.character(packageVersion("Rcpp")), "/",
- "RcppParallel:", as.character(packageVersion("RcppParallel")), "\n")
-
-### Remove Primers ###
-cat("1) Removing Primers\n")
-nop <- file.path(primer.removed.dir, basename(unfilts))
-prim <- suppressWarnings(removePrimers(unfilts, nop, primerF, dada2:::rc(primerR),
- max.mismatch = maxMismatch, allow.indels = indels,
- orient = TRUE, verbose = TRUE))
-cat(ifelse(file.exists(nop), ".", "x"), sep="")
-nop <- list.files(primer.removed.dir, pattern=".fastq.gz$", full.names=TRUE)
-cat("\n")
-if(length(nop) == 0) { # All reads were filtered out
- errQuit("No reads passed the Removing Primers step (Did you select the right primers?)", status=2)
-}
-
-### TRIM AND FILTER ###
-cat("2) Filtering\n")
-filts <- file.path(filtered.dir, basename(nop))
-out <- suppressWarnings(filterAndTrim(nop, filts, truncLen = truncLen, trimLeft = trimLeft,
- maxEE = maxEE, truncQ = truncQ, rm.phix = FALSE,
- multithread = multithread, maxLen = maxLen, minLen = minLen, minQ = 3))
-cat(ifelse(file.exists(filts), ".", "x"), sep="")
-filts <- list.files(filtered.dir, pattern=".fastq.gz$", full.names=TRUE)
-cat("\n")
-if(length(filts) == 0) { # All reads were filtered out
- errQuit("No reads passed the filter (was truncLen longer than the read length?)", status=2)
-}
-
-### LEARN ERROR RATES ###
-# Dereplicate enough samples to get nreads.learn total reads
-cat("3) Learning Error Rates\n")
-err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn,
- errorEstimationFunction=dada2:::PacBioErrfun,
- multithread=multithread, BAND_SIZE=32))
-
-### PROCESS ALL SAMPLES ###
-# Loop over rest in streaming fashion with learned error rates
-dds <- vector("list", length(filts))
-cat("4) Denoise samples ")
-cat("\n")
-for(j in seq(length(filts))) {
- drp <- derepFastq(filts[[j]])
- dds[[j]] <- dada(drp, err=err, multithread=multithread,
- BAND_SIZE=32, verbose=FALSE)
- cat(".")
-}
-cat("\n")
-if(poolMethod == "pseudo") {
- cat(" Pseudo-pool step ")
- ### TEMPORARY, to be removed once 1.12 makes its way to Q2
- ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately.
- ### pseudo_priors code copied from dada2.R
- st <- makeSequenceTable(dds)
- pseudo_priors <- colnames(st)[colSums(st>0) >= 2 | colSums(st) >= Inf]
- rm(st)
- ### \pseudo_priors code copied from dada2.R
- ### code copied from previous loop through samples in this script
- for(j in seq(length(filts))) {
- drp <- derepFastq(filts[[j]])
- dds[[j]] <- dada(drp, err=err, multithread=multithread,
- priors = pseudo_priors,
- BAND_SIZE=32, verbose=FALSE)
- cat(".")
- }
- cat("\n")
- ### \code copied from previous loop through samples in this script
-}
-
-# Make sequence table
-seqtab <- makeSequenceTable(dds)
-
-### Remove chimeras
-cat("5) Remove chimeras (method = ", chimeraMethod, ")\n", sep="")
-if(chimeraMethod %in% c("pooled", "consensus")) {
- seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, multithread=multithread)
-} else { # No chimera removal, copy seqtab to seqtab.nochim
- seqtab.nochim <- seqtab
-}
-
-### REPORT READ FRACTIONS THROUGH PIPELINE ###
-cat("6) Report read numbers through the pipeline\n")
-# Handle edge cases: Samples lost in filtering; One sample
-track <- cbind(prim,out[ ,2], matrix(0, nrow=nrow(out), ncol=2))
-colnames(track) <- c("input", "primer-removed","filtered", "denoised", "non-chimeric")
-passed.filtering <- track[,"filtered"] > 0
-track[passed.filtering,"denoised"] <- rowSums(seqtab)
-track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
-write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
- quote=FALSE)
-
-### WRITE OUTPUT AND QUIT ###
-# Formatting as tsv plain-text sequence table table
-cat("7) Write output\n")
-seqtab.nochim <- t(seqtab.nochim) # QIIME has OTUs as rows
-col.names <- basename(filts)
-col.names[[1]] <- paste0("#OTU ID\t", col.names[[1]])
-write.table(seqtab.nochim, out.path, sep="\t",
- row.names=TRUE, col.names=col.names, quote=FALSE)
-saveRDS(seqtab.nochim, gsub("tsv", "rds", out.path)) ### TESTING
-
-q(status=0)
\ No newline at end of file
=====================================
q2_dada2/assets/run_dada_paired.R deleted
=====================================
@@ -1,294 +0,0 @@
-#!/usr/bin/env Rscript
-
-###################################################
-# This R script takes an input two directories of
-# .fastq.gz files, corresponding to matched forward
-# and reverse sequence files,
-# and outputs a tsv file of the dada2 processed sequence
-# table. It is intended for use with the QIIME2 plugin
-# for DADA2.
-#
-# Rscript run_dada_paired.R input_dirF input_dirR output.tsv track.tsv filtered_dirF filtered_dirR 240 160 0 0 2.0 2 pooled 1.0 0 100000
-####################################################
-
-####################################################
-# DESCRIPTION OF ARGUMENTS #
-####################################################
-# NOTE: All numeric arguments should be zero or positive.
-# NOTE: All numeric arguments save maxEEF/R are expected to be integers.
-# NOTE: Currently the filterered_dirF/R must already exist.
-# NOTE: ALL ARGUMENTS ARE POSITIONAL!
-#
-### FILE SYSTEM ARGUMENTS ###
-#
-# 1) File path to directory with the FORWARD .fastq.gz files to be processed.
-# Ex: path/to/dir/with/FWD_fastqgzs
-#
-# 2) File path to directory with the REVERSE .fastq.gz files to be processed.
-# Ex: path/to/dir/with/REV_fastqgzs
-#
-# 3) File path to output tsv file. If already exists, will be overwritten.
-# Ex: path/to/output_file.tsv
-#
-# 4) File path to tracking tsv file. If already exists, will be overwritte.
-# Ex: path/to/tracking_stats.tsv
-#
-# 5) File path to directory to write the filtered FORWARD .fastq.gz files. These files are intermediate
-# for the full workflow. Currently they remain after the script finishes. Directory must
-# already exist.
-# Ex: path/to/dir/with/FWD_fastqgzs/filtered
-#
-# 6) File path to directory to write the filtered REVERSE .fastq.gz files. These files are intermediate
-# for the full workflow. Currently they remain after the script finishes. Directory must
-# already exist.
-# Ex: path/to/dir/with/REV_fastqgzs/filtered
-#
-### FILTERING ARGUMENTS ###
-#
-# 7) truncLenF - The position at which to truncate forward reads. Forward reads shorter
-# than truncLenF will be discarded.
-# Special values: 0 - no truncation or length filtering.
-# Ex: 240
-#
-# 8) truncLenR - The position at which to truncate reverse reads. Reverse reads shorter
-# than truncLenR will be discarded.
-# Special values: 0 - no truncation or length filtering.
-# Ex: 160
-#
-# 9) trimLeftF - The number of nucleotides to remove from the start of
-# each forward read. Should be less than truncLenF.
-# Ex: 0
-#
-# 10) trimLeftR - The number of nucleotides to remove from the start of
-# each reverse read. Should be less than truncLenR.
-# Ex: 0
-#
-# 11) maxEEF - Forward reads with expected errors higher than maxEEF are discarded.
-# Both forward and reverse reads are independently tested.
-# Ex: 2.0
-#
-# 12) maxEER - Reverse reads with expected errors higher than maxEER are discarded.
-# Both forward and reverse reads are independently tested.
-# Ex: 2.0
-#
-# 13) truncQ - Reads are truncated at the first instance of quality score truncQ.
-# If the read is then shorter than truncLen, it is discarded.
-# Ex: 2
-#
-### SENSITIVITY ARGUMENTS ###
-#
-# 14) poolMethod - The method used to pool (or not) samples during denoising.
-# Valid options are:
-# independent: (Default) No pooling, samples are denoised indpendently.
-# pseudo: Samples are "pseudo-pooled" for denoising.
-# Ex: independent
-#
-#
-### CHIMERA ARGUMENTS ###
-#
-# 15) chimeraMethod - The method used to remove chimeras. Valid options are:
-# none: No chimera removal is performed.
-# pooled: All reads are pooled prior to chimera detection.
-# consensus: Chimeras are detect in samples individually, and a consensus decision
-# is made for each sequence variant.
-# Ex: consensus
-#
-# 16) minParentFold - The minimum abundance of potential "parents" of a sequence being
-# tested as chimeric, expressed as a fold-change versus the abundance of the sequence being
-# tested. Values should be greater than or equal to 1 (i.e. parents should be more
-# abundant than the sequence being tested).
-# Ex: 1.0
-#
-### SPEED ARGUMENTS ###
-#
-# 17) nthreads - The number of threads to use.
-# Special values: 0 - detect available and use all.
-# Ex: 1
-#
-# 18) nreads_learn - The minimum number of reads to learn the error model from.
-# Special values: 0 - Use all input reads.
-# Ex: 1000000
-#
-
-cat(R.version$version.string, "\n")
-errQuit <- function(mesg, status=1) { message("Error: ", mesg); q(status=status) }
-getN <- function(x) sum(getUniques(x))
-args <- commandArgs(TRUE)
-
-# Assign each of the arguments, in positional order, to an appropriately named R variable
-inp.dirF <- args[[1]]
-inp.dirR <- args[[2]]
-out.path <- args[[3]]
-out.track <- args[[4]]
-filtered.dirF <- args[[5]]
-filtered.dirR <- args[[6]]
-truncLenF <- as.integer(args[[7]])
-truncLenR <- as.integer(args[[8]])
-trimLeftF <- as.integer(args[[9]])
-trimLeftR <- as.integer(args[[10]])
-maxEEF <- as.numeric(args[[11]])
-maxEER <- as.numeric(args[[12]])
-truncQ <- as.integer(args[[13]])
-minOverlap <- as.integer(args[14])
-poolMethod <- args[[15]]
-chimeraMethod <- args[[16]]
-minParentFold <- as.numeric(args[[17]])
-nthreads <- as.integer(args[[18]])
-nreads.learn <- as.integer(args[[19]])
-
-### VALIDATE ARGUMENTS ###
-
-# Input directory is expected to contain .fastq.gz file(s)
-# that have not yet been filtered and globally trimmed
-# to the same length.
-if(!(dir.exists(inp.dirF) && dir.exists(inp.dirR))) {
- errQuit("Input directory does not exist.")
-} else {
- unfiltsF <- list.files(inp.dirF, pattern=".fastq.gz$", full.names=TRUE)
- unfiltsR <- list.files(inp.dirR, pattern=".fastq.gz$", full.names=TRUE)
- if(length(unfiltsF) == 0) {
- errQuit("No input forward files with the expected filename format found.")
- }
- if(length(unfiltsR) == 0) {
- errQuit("No input reverse files with the expected filename format found.")
- }
- if(length(unfiltsF) != length(unfiltsR)) {
- errQuit("Different numbers of forward and reverse .fastq.gz files.")
- }
-}
-
-# Output files are to be filenames (not directories) and are to be
-# removed and replaced if already present.
-for(fn in c(out.path, out.track)) {
- if(dir.exists(fn)) {
- errQuit("Output filename ", fn, " is a directory.")
- } else if(file.exists(fn)) {
- invisible(file.remove(fn))
- }
-}
-
-# Convert nthreads to the logical/numeric expected by dada2
-if(nthreads < 0) {
- errQuit("nthreads must be non-negative.")
-} else if(nthreads == 0) {
- multithread <- TRUE # detect and use all
-} else if(nthreads == 1) {
- multithread <- FALSE
-} else {
- multithread <- nthreads
-}
-
-### LOAD LIBRARIES ###
-suppressWarnings(library(methods))
-suppressWarnings(library(dada2))
-cat("DADA2:", as.character(packageVersion("dada2")), "/",
- "Rcpp:", as.character(packageVersion("Rcpp")), "/",
- "RcppParallel:", as.character(packageVersion("RcppParallel")), "\n")
-
-### TRIM AND FILTER ###
-cat("1) Filtering ")
-filtsF <- file.path(filtered.dirF, basename(unfiltsF))
-filtsR <- file.path(filtered.dirR, basename(unfiltsR))
-out <- suppressWarnings(filterAndTrim(unfiltsF, filtsF, unfiltsR, filtsR,
- truncLen=c(truncLenF, truncLenR), trimLeft=c(trimLeftF, trimLeftR),
- maxEE=c(maxEEF, maxEER), truncQ=truncQ, rm.phix=TRUE,
- multithread=multithread))
-cat(ifelse(file.exists(filtsF), ".", "x"), sep="")
-filtsF <- list.files(filtered.dirF, pattern=".fastq.gz$", full.names=TRUE)
-filtsR <- list.files(filtered.dirR, pattern=".fastq.gz$", full.names=TRUE)
-cat("\n")
-if(length(filtsF) == 0) { # All reads were filtered out
- errQuit("No reads passed the filter (were truncLenF/R longer than the read lengths?)", status=2)
-}
-
-### LEARN ERROR RATES ###
-# Dereplicate enough samples to get nreads.learn total reads
-cat("2) Learning Error Rates\n")
-errF <- suppressWarnings(learnErrors(filtsF, nreads=nreads.learn, multithread=multithread))
-errR <- suppressWarnings(learnErrors(filtsR, nreads=nreads.learn, multithread=multithread))
-
-### PROCESS ALL SAMPLES ###
-# Loop over rest in streaming fashion with learned error rates
-denoisedF <- rep(0, length(filtsF))
-ddsF <- vector("list", length(filtsF))
-ddsR <- vector("list", length(filtsF))
-mergers <- vector("list", length(filtsF))
-cat("3) Denoise samples ")
-
-for(j in seq(length(filtsF))) {
- drpF <- derepFastq(filtsF[[j]])
- ddsF[[j]] <- dada(drpF, err=errF, multithread=multithread, verbose=FALSE)
- drpR <- derepFastq(filtsR[[j]])
- ddsR[[j]] <- dada(drpR, err=errR, multithread=multithread, verbose=FALSE)
- cat(".")
-}
-cat("\n")
-if(poolMethod == "pseudo") {
- cat(" Pseudo-pool step ")
- ### TEMPORARY, to be removed once 1.12 makes its way to Q2
- ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately.
- ### pseudo_priors code copied from dada2.R
- stF <- makeSequenceTable(ddsF)
- pseudo_priorsF <- colnames(stF)[colSums(stF>0) >= 2 | colSums(stF) >= Inf]
- rm(stF)
- stR <- makeSequenceTable(ddsR)
- pseudo_priorsR <- colnames(stR)[colSums(stR>0) >= 2 | colSums(stR) >= Inf]
- rm(stR)
- ### \pseudo_priors code copied from dada2.R
- ### code copied from previous loop through samples in this script
- for(j in seq(length(filtsF))) {
- drpF <- derepFastq(filtsF[[j]])
- ddsF[[j]] <- dada(drpF, err=errF, priors=pseudo_priorsF,
- multithread=multithread, verbose=FALSE)
- drpR <- derepFastq(filtsR[[j]])
- ddsR[[j]] <- dada(drpR, err=errR, priors=pseudo_priorsR,
- multithread=multithread, verbose=FALSE)
- cat(".")
- }
- cat("\n")
- ### \code copied from previous loop through samples in this script
-}
-
-### Now loop through and do merging
-for(j in seq(length(filtsF))) {
- drpF <- derepFastq(filtsF[[j]])
- drpR <- derepFastq(filtsR[[j]])
- mergers[[j]] <- mergePairs(ddsF[[j]], drpF, ddsR[[j]], drpR, minOverlap=minOverlap)
- denoisedF[[j]] <- getN(ddsF[[j]])
- cat(".")
-}
-cat("\n")
-
-# Make sequence table
-seqtab <- makeSequenceTable(mergers)
-
-# Remove chimeras
-cat("4) Remove chimeras (method = ", chimeraMethod, ")\n", sep="")
-if(chimeraMethod %in% c("pooled", "consensus")) {
- seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, multithread=multithread)
-} else { # No chimera removal, copy seqtab to seqtab.nochim
- seqtab.nochim <- seqtab
-}
-
-### REPORT READ COUNTS AT EACH PROCESSING STEP ###
-# Handle edge cases: Samples lost in filtering; One sample
-track <- cbind(out, matrix(0, nrow=nrow(out), ncol=3))
-colnames(track) <- c("input", "filtered", "denoised", "merged", "non-chimeric")
-passed.filtering <- track[,"filtered"] > 0
-track[passed.filtering,"denoised"] <- denoisedF
-track[passed.filtering,"merged"] <- rowSums(seqtab)
-track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
-write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
- quote=FALSE)
-
-### WRITE OUTPUT AND QUIT ###
-# Formatting as tsv plain-text sequence table table
-cat("6) Write output\n")
-seqtab.nochim <- t(seqtab.nochim) # QIIME has OTUs as rows
-col.names <- basename(filtsF)
-col.names[[1]] <- paste0("#OTU ID\t", col.names[[1]])
-write.table(seqtab.nochim, out.path, sep="\t",
- row.names=TRUE, col.names=col.names, quote=FALSE)
-saveRDS(seqtab.nochim, gsub("tsv", "rds", out.path)) ### TESTING
-
-q(status=0)
=====================================
q2_dada2/assets/run_dada_single.R deleted
=====================================
@@ -1,256 +0,0 @@
-#!/usr/bin/env Rscript
-
-###################################################
-# This R script takes an input directory of .fastq.gz files
-# and outputs a tsv file of the dada2 processed sequence
-# table. It is intended for use with the QIIME2 plugin
-# for DADA2.
-#
-# Ex: Rscript run_dada_single.R input_dir output.tsv track.tsv filtered_dir 200 0 2.0 2 Inf pooled 1.0 0 1000000 NULL 32
-####################################################
-
-####################################################
-# DESCRIPTION OF ARGUMENTS #
-####################################################
-# NOTE: All numeric arguments should be zero or positive.
-# NOTE: All numeric arguments save maxEE are expected to be integers.
-# NOTE: Currently the filterered_dir must already exist.
-# NOTE: ALL ARGUMENTS ARE POSITIONAL!
-#
-### FILE SYSTEM ARGUMENTS ###
-#
-# 1) File path to directory with the .fastq.gz files to be processed.
-# Ex: path/to/dir/with/fastqgzs
-#
-# 2) File path to output tsv file. If already exists, will be overwritten.
-# Ex: path/to/output_file.tsv
-#
-# 3) File path to tracking tsv file. If already exists, will be overwritte.
-# Ex: path/to/tracking_stats.tsv
-#
-# 4) File path to directory in which to write the filtered .fastq.gz files. These files are intermediate
-# for the full workflow. Currently they remain after the script finishes.
-# Directory must already exist.
-# Ex: path/to/dir/with/fastqgzs/filtered
-#
-### FILTERING ARGUMENTS ###
-#
-# 5) truncLen - The position at which to truncate reads. Reads shorter
-# than truncLen will be discarded.
-# Special values: 0 - no truncation or length filtering.
-# Ex: 150
-#
-# 6) trimLeft - The number of nucleotides to remove from the start of
-# each read. Should be less than truncLen for obvious reasons.
-# Ex: 0
-#
-# 7) maxEE - Reads with expected errors higher than maxEE are discarded.
-# Ex: 2.0
-#
-# 8) truncQ - Reads are truncated at the first instance of quality score truncQ.
-# If the read is then shorter than truncLen, it is discarded.
-# Ex: 2
-#
-# 9) maxLen - Remove reads with length greater than maxLen. maxLen is enforced on the raw reads.
-# Default Inf - no maximum.
-# Ex: 300
-#
-### SENSITIVITY ARGUMENTS ###
-#
-# 10) poolMethod - The method used to pool (or not) samples during denoising.
-# Valid options are:
-# independent: (Default) No pooling, samples are denoised indpendently.
-# pseudo: Samples are "pseudo-pooled" for denoising.
-# Ex: independent
-#
-#
-### CHIMERA ARGUMENTS ###
-#
-# 11) chimeraMethod - The method used to remove chimeras. Valid options are:
-# none: No chimera removal is performed.
-# pooled: All reads are pooled prior to chimera detection.
-# consensus: Chimeras are detect in samples individually, and a consensus decision
-# is made for each sequence variant.
-# Ex: consensus
-#
-# 12) minParentFold - The minimum abundance of potential "parents" of a sequence being
-# tested as chimeric, expressed as a fold-change versus the abundance of the sequence being
-# tested. Values should be greater than or equal to 1 (i.e. parents should be more
-# abundant than the sequence being tested).
-# Ex: 1.0
-#
-### SPEED ARGUMENTS ###
-#
-# 13) nthreads - The number of threads to use.
-# Special values: 0 - detect available cores and use all..
-# Ex: 1
-#
-# 14) nreads_learn - The minimum number of reads to learn the error model from.
-# Special values: 0 - Use all input reads.
-# Ex: 1000000
-#
-### GLOBAL OPTION ARGUMENTS ###
-#
-# 15) HOMOPOLYMER_GAP_PENALTY - The cost of gaps in homopolymer regions (>=3 repeated bases).
-# Default is NULL, which causes homopolymer gaps
-# to be treated as normal gaps.
-# Ex: -1
-#
-# 16) BAND_SIZE - When set, banded Needleman-Wunsch alignments are performed.
-# The default value of BAND_SIZE is 16. Setting BAND_SIZE to a negative
-# number turns off banding (i.e. full Needleman-Wunsch).
-# Ex: 32
-#
-
-cat(R.version$version.string, "\n")
-errQuit <- function(mesg, status=1) { message("Error: ", mesg); q(status=status) }
-args <- commandArgs(TRUE)
-
-# Assign each of the arguments, in positional order, to an appropriately named R variable
-inp.dir <- args[[1]]
-out.path <- args[[2]]
-out.track <- args[[3]]
-filtered.dir <- args[[4]]
-truncLen <- as.integer(args[[5]])
-trimLeft <- as.integer(args[[6]])
-maxEE <- as.numeric(args[[7]])
-truncQ <- as.integer(args[[8]])
-maxLen <- as.numeric(args[[9]]) # Allows Inf
-poolMethod <- args[[10]]
-chimeraMethod <- args[[11]]
-minParentFold <- as.numeric(args[[12]])
-nthreads <- as.integer(args[[13]])
-nreads.learn <- as.integer(args[[14]])
-# The following args are not directly exposed to end users in q2-dada2,
-# but rather indirectly, via the methods `denoise-single` and `denoise-pyro`.
-HOMOPOLYMER_GAP_PENALTY <- if (args[[15]]=='NULL') NULL else as.integer(args[[15]])
-BAND_SIZE <- as.integer(args[[16]])
-
-### VALIDATE ARGUMENTS ###
-
-# Input directory is expected to contain .fastq.gz file(s)
-# that have not yet been filtered and globally trimmed
-# to the same length.
-if(!dir.exists(inp.dir)) {
- errQuit("Input directory does not exist.")
-} else {
- unfilts <- list.files(inp.dir, pattern=".fastq.gz$", full.names=TRUE)
- if(length(unfilts) == 0) {
- errQuit("No input files with the expected filename format found.")
- }
-}
-
-# Output files are to be filenames (not directories) and are to be
-# removed and replaced if already present.
-for(fn in c(out.path, out.track)) {
- if(dir.exists(fn)) {
- errQuit("Output filename ", fn, " is a directory.")
- } else if(file.exists(fn)) {
- invisible(file.remove(fn))
- }
-}
-
-# Convert nthreads to the logical/numeric expected by dada2
-if(nthreads < 0) {
- errQuit("nthreads must be non-negative.")
-} else if(nthreads == 0) {
- multithread <- TRUE # detect and use all
-} else if(nthreads == 1) {
- multithread <- FALSE
-} else {
- multithread <- nthreads
-}
-
-### LOAD LIBRARIES ###
-suppressWarnings(library(methods))
-suppressWarnings(library(dada2))
-cat("DADA2:", as.character(packageVersion("dada2")), "/",
- "Rcpp:", as.character(packageVersion("Rcpp")), "/",
- "RcppParallel:", as.character(packageVersion("RcppParallel")), "\n")
-
-### TRIM AND FILTER ###
-cat("1) Filtering ")
-filts <- file.path(filtered.dir, basename(unfilts))
-out <- suppressWarnings(filterAndTrim(unfilts, filts, truncLen=truncLen, trimLeft=trimLeft,
- maxEE=maxEE, truncQ=truncQ, rm.phix=TRUE,
- multithread=multithread, maxLen=maxLen))
-cat(ifelse(file.exists(filts), ".", "x"), sep="")
-filts <- list.files(filtered.dir, pattern=".fastq.gz$", full.names=TRUE)
-cat("\n")
-if(length(filts) == 0) { # All reads were filtered out
- errQuit("No reads passed the filter (was truncLen longer than the read length?)", status=2)
-}
-
-### LEARN ERROR RATES ###
-# Dereplicate enough samples to get nreads.learn total reads
-cat("2) Learning Error Rates\n")
-err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, multithread=multithread,
- HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY, BAND_SIZE=BAND_SIZE))
-
-### PROCESS ALL SAMPLES ###
-# Loop over rest in streaming fashion with learned error rates
-dds <- vector("list", length(filts))
-cat("3) Denoise samples ")
-for(j in seq(length(filts))) {
- drp <- derepFastq(filts[[j]])
- dds[[j]] <- dada(drp, err=err, multithread=multithread,
- HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY,
- BAND_SIZE=BAND_SIZE, verbose=FALSE)
- cat(".")
-}
-cat("\n")
-if(poolMethod == "pseudo") {
- cat(" Pseudo-pool step ")
- ### TEMPORARY, to be removed once 1.12 makes its way to Q2
- ### Needed for now to manage pseudo-pooling memory, as 1.10 didn't do this appropriately.
- ### pseudo_priors code copied from dada2.R
- st <- makeSequenceTable(dds)
- pseudo_priors <- colnames(st)[colSums(st>0) >= 2 | colSums(st) >= Inf]
- rm(st)
- ### \pseudo_priors code copied from dada2.R
- ### code copied from previous loop through samples in this script
- for(j in seq(length(filts))) {
- drp <- derepFastq(filts[[j]])
- dds[[j]] <- dada(drp, err=err, multithread=multithread,
- priors = pseudo_priors,
- HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY,
- BAND_SIZE=BAND_SIZE, verbose=FALSE)
- cat(".")
- }
- cat("\n")
- ### \code copied from previous loop through samples in this script
-}
-
-# Make sequence table
-seqtab <- makeSequenceTable(dds)
-
-### Remove chimeras
-cat("4) Remove chimeras (method = ", chimeraMethod, ")\n", sep="")
-if(chimeraMethod %in% c("pooled", "consensus")) {
- seqtab.nochim <- removeBimeraDenovo(seqtab, method=chimeraMethod, minFoldParentOverAbundance=minParentFold, multithread=multithread)
-} else { # No chimera removal, copy seqtab to seqtab.nochim
- seqtab.nochim <- seqtab
-}
-
-### REPORT READ FRACTIONS THROUGH PIPELINE ###
-cat("5) Report read numbers through the pipeline\n")
-# Handle edge cases: Samples lost in filtering; One sample
-track <- cbind(out, matrix(0, nrow=nrow(out), ncol=2))
-colnames(track) <- c("input", "filtered", "denoised", "non-chimeric")
-passed.filtering <- track[,"filtered"] > 0
-track[passed.filtering,"denoised"] <- rowSums(seqtab)
-track[passed.filtering,"non-chimeric"] <- rowSums(seqtab.nochim)
-write.table(track, out.track, sep="\t", row.names=TRUE, col.names=NA,
- quote=FALSE)
-
-### WRITE OUTPUT AND QUIT ###
-# Formatting as tsv plain-text sequence table table
-cat("6) Write output\n")
-seqtab.nochim <- t(seqtab.nochim) # QIIME has OTUs as rows
-col.names <- basename(filts)
-col.names[[1]] <- paste0("#OTU ID\t", col.names[[1]])
-write.table(seqtab.nochim, out.path, sep="\t",
- row.names=TRUE, col.names=col.names, quote=FALSE)
-saveRDS(seqtab.nochim, gsub("tsv", "rds", out.path)) ### TESTING
-
-q(status=0)
=====================================
q2_dada2/plugin_setup.py
=====================================
@@ -46,6 +46,7 @@ plugin.methods.register_function(
'chimera_method': qiime2.plugin.Str %
qiime2.plugin.Choices(_CHIM_OPT),
'min_fold_parent_over_abundance': qiime2.plugin.Float,
+ 'allow_one_off': qiime2.plugin.Bool,
'n_threads': qiime2.plugin.Int,
'n_reads_learn': qiime2.plugin.Int,
'hashed_feature_ids': qiime2.plugin.Bool},
@@ -98,6 +99,11 @@ plugin.methods.register_function(
'than or equal to 1 (i.e. parents should be more abundant than '
'the sequence being tested). This parameter has no effect if '
'chimera_method is "none".'),
+ 'allow_one_off': (
+ 'Bimeras that are one-off from exact are also '
+ 'identified if the `allow_one_off` argument is True.'
+ 'If True, a sequence will be identified as bimera if it is one '
+ 'mismatch or indel away from an exact bimera.'),
'n_threads': ('The number of threads to use for multithreaded '
'processing. If 0 is provided, all available cores will '
'be used.'),
@@ -143,6 +149,7 @@ plugin.methods.register_function(
'chimera_method': qiime2.plugin.Str %
qiime2.plugin.Choices(_CHIM_OPT),
'min_fold_parent_over_abundance': qiime2.plugin.Float,
+ 'allow_one_off': qiime2.plugin.Bool,
'n_threads': qiime2.plugin.Int,
'n_reads_learn': qiime2.plugin.Int,
'hashed_feature_ids': qiime2.plugin.Bool},
@@ -216,6 +223,11 @@ plugin.methods.register_function(
'than or equal to 1 (i.e. parents should be more abundant than '
'the sequence being tested). This parameter has no effect if '
'chimera_method is "none".'),
+ 'allow_one_off': (
+ 'Bimeras that are one-off from exact are also '
+ 'identified if the `allow_one_off` argument is True'
+ 'If True, a sequence will be identified as bimera if it is one '
+ 'mismatch or indel away from an exact bimera.'),
'n_threads': ('The number of threads to use for multithreaded '
'processing. If 0 is provided, all available cores will '
'be used.'),
@@ -259,6 +271,7 @@ plugin.methods.register_function(
'chimera_method': qiime2.plugin.Str %
qiime2.plugin.Choices(_CHIM_OPT),
'min_fold_parent_over_abundance': qiime2.plugin.Float,
+ 'allow_one_off': qiime2.plugin.Bool,
'n_threads': qiime2.plugin.Int,
'n_reads_learn': qiime2.plugin.Int,
'hashed_feature_ids': qiime2.plugin.Bool},
@@ -313,6 +326,11 @@ plugin.methods.register_function(
'than or equal to 1 (i.e. parents should be more abundant than '
'the sequence being tested). This parameter has no effect if '
'chimera_method is "none".',
+ 'allow_one_off': (
+ 'Bimeras that are one-off from exact are also '
+ 'identified if the `allow_one_off` argument is True. '
+ 'If True, a sequence will be identified as bimera if it is one '
+ 'mismatch or indel away from an exact bimera.'),
'n_threads': 'The number of threads to use for multithreaded '
'processing. If 0 is provided, all available cores will '
'be used.',
@@ -359,6 +377,7 @@ plugin.methods.register_function(
'chimera_method': qiime2.plugin.Str %
qiime2.plugin.Choices(_CHIM_OPT),
'min_fold_parent_over_abundance': qiime2.plugin.Float,
+ 'allow_one_off': qiime2.plugin.Bool,
'n_threads': qiime2.plugin.Int,
'n_reads_learn': qiime2.plugin.Int,
'hashed_feature_ids': qiime2.plugin.Bool},
@@ -440,6 +459,11 @@ plugin.methods.register_function(
'than or equal to 1 (i.e. parents should be more abundant than '
'the sequence being tested). Suggest 3.5. '
'This parameter has no effect if chimera_method is "none".',
+ 'allow_one_off': (
+ 'Bimeras that are one-off from exact are also '
+ 'identified if the `allow_one_off` argument is True. '
+ 'If True, a sequence will be identified as bimera if it is one '
+ 'mismatch or indel away from an exact bimera.'),
'n_threads': 'The number of threads to use for multithreaded '
'processing. If 0 is provided, all available cores will '
'be used.',
=====================================
setup.py
=====================================
@@ -21,9 +21,7 @@ setup(
author="Greg Caporaso and Benjamin Callahan",
author_email="gregcaporaso at gmail.com",
description="Apply DADA2 to generate denoised sequence variants. ",
- scripts=['q2_dada2/assets/run_dada_single.R',
- 'q2_dada2/assets/run_dada_paired.R',
- 'q2_dada2/assets/run_dada_ccs.R'],
+ scripts=['q2_dada2/assets/run_dada.R'],
package_data={
'q2_dada2': ['citations.bib'],
'q2_dada2.tests': ['data/*',
View it on GitLab: https://salsa.debian.org/med-team/q2-dada2/-/compare/b54930c8f5f304cfe8882106822eac86224c0aa4...6ab2c6a299fefc18727330837943fbe8b94d5f41
--
View it on GitLab: https://salsa.debian.org/med-team/q2-dada2/-/compare/b54930c8f5f304cfe8882106822eac86224c0aa4...6ab2c6a299fefc18727330837943fbe8b94d5f41
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20220907/0282d470/attachment-0001.htm>
More information about the debian-med-commit
mailing list