[med-svn] [Git][med-team/q2-dada2][upstream] New upstream version 2022.11.2

Thu Jan 12 20:21:24 GMT 2023


Andreas Tille pushed to branch upstream at Debian Med / q2-dada2


Commits:
7214e78b by Andreas Tille at 2023-01-12T20:57:46+01:00
New upstream version 2022.11.2
- - - - -


16 changed files:

- + .github/workflows/add-to-project-ci.yml
- ci/recipe/meta.yaml
- q2_dada2/_denoise.py
- + q2_dada2/_examples.py
- q2_dada2/_version.py
- q2_dada2/assets/run_dada.R
- q2_dada2/plugin_setup.py
- + q2_dada2/tests/data/mixed_barcodes_and_ids/MANIFEST
- + q2_dada2/tests/data/mixed_barcodes_and_ids/V130_166_L001_R1_001.fastq.gz
- + q2_dada2/tests/data/mixed_barcodes_and_ids/V130_2_167_L001_R1_001.fastq.gz
- + q2_dada2/tests/data/mixed_barcodes_and_ids/V130_2_743_L001_R2_001.fastq.gz
- + q2_dada2/tests/data/mixed_barcodes_and_ids/V130_742_L001_R2_001.fastq.gz
- + q2_dada2/tests/data/mixed_barcodes_and_ids/metadata.yml
- q2_dada2/tests/data/single_feature.tsv
- q2_dada2/tests/test_denoise.py
- setup.py


Changes:

=====================================
.github/workflows/add-to-project-ci.yml
=====================================
@@ -0,0 +1,21 @@
+name: Add new issues and PRs to triage project board
+
+on:
+  issues:
+    types:
+      - opened
+  pull_request_target:
+    types:
+      - opened
+
+jobs:
+  add-to-project:
+    name: Add issue to project
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/add-to-project@v0.3.0
+        with:
+          project-url: https://github.com/orgs/qiime2/projects/36
+          github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
+          labeled: skip-triage
+          label-operator: NOT


=====================================
ci/recipe/meta.yaml
=====================================
@@ -19,7 +19,8 @@ requirements:
   run:
     - python {{ python }}
     - biom-format {{ biom_format }}
-    - bioconductor-dada2 1.22.0
+    - bioconductor-dada2
+    - r-base {{ r_base }}
     - r-optparse >=1.7.1
     # openjdk is not a real dependency, but, r-base has a post-link and post-
     # activation hook that calls R CMD javareconf, which pokes around for any


=====================================
q2_dada2/_denoise.py
=====================================
@@ -18,7 +18,7 @@ import pandas as pd
 
 from q2_types.feature_data import DNAIterator
 from q2_types.per_sample_sequences import (
-    FastqGzFormat, SingleLanePerSampleSingleEndFastqDirFmt,
+    SingleLanePerSampleSingleEndFastqDirFmt,
     SingleLanePerSamplePairedEndFastqDirFmt)
 
 
@@ -38,7 +38,7 @@ def run_commands(cmds, verbose=True):
 
 def _check_featureless_table(fp):
     with open(fp) as fh:
-        # There is a header before the feature data
+        # There is a comment line and a header before the feature data
         for line_count, _ in zip(range(1, 3), fh):
             pass
     if line_count < 2:
@@ -98,22 +98,33 @@ def _check_inputs(**kwargs):
                              % (param, arg, explanation))
 
 
-def _filepath_to_sample(fp):
+def _filepath_to_sample_single(fp):
     return fp.rsplit('_', 4)[0]
 
 
+def _filepath_to_sample_paired(fp):
+    return fp.rsplit('_', 3)[0]
+
+
 # Since `denoise-single` and `denoise-pyro` are almost identical, break out
 # the bulk of the functionality to this helper util. Typechecking is assumed
 # to have occurred in the calling functions, this is primarily for making
 # sure that DADA2 is able to do what it needs to do.
-def _denoise_helper(biom_fp, track_fp, hashed_feature_ids):
+def _denoise_helper(biom_fp, track_fp, hashed_feature_ids, paired=False):
     _check_featureless_table(biom_fp)
     with open(biom_fp) as fh:
         table = biom.Table.from_tsv(fh, None, None, None)
 
+    # If we used denoise_paired the barcode was already stripped from the
+    # filename to force the files to sort by id and pair up properly
+    # see https://github.com/qiime2/q2-dada2/issues/102
+    # and https://github.com/qiime2/q2-dada2/pull/125
+    filepath_to_sample = _filepath_to_sample_paired if paired \
+        else _filepath_to_sample_single
+
     df = pd.read_csv(track_fp, sep='\t', index_col=0)
     df.index.name = 'sample-id'
-    df = df.rename(index=_filepath_to_sample)
+    df = df.rename(index=filepath_to_sample)
 
     PASSED_FILTER = 'percentage of input passed filter'
     NON_CHIMERIC = 'percentage of input non-chimeric'
@@ -149,7 +160,7 @@ def _denoise_helper(biom_fp, track_fp, hashed_feature_ids):
 
     # Currently the sample IDs in DADA2 are the file names. We make
     # them the sample id part of the filename here.
-    sid_map = {id_: _filepath_to_sample(id_)
+    sid_map = {id_: filepath_to_sample(id_)
                for id_ in table.ids(axis='sample')}
     table.update_ids(sid_map, axis='sample', inplace=True)
     # The feature IDs in DADA2 are the sequences themselves.
@@ -274,14 +285,21 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
         track_fp = os.path.join(temp_dir, 'track.tsv')
         filt_forward = os.path.join(temp_dir, 'filt_f')
         filt_reverse = os.path.join(temp_dir, 'filt_r')
+        manifest_df = demultiplexed_seqs.manifest.view(pd.DataFrame)
+
         for fp in tmp_forward, tmp_reverse, filt_forward, filt_reverse:
             os.mkdir(fp)
-        for rp, view in demultiplexed_seqs.sequences.iter_views(FastqGzFormat):
-            fp = str(view)
-            if 'R1_001.fastq' in rp.name:
-                qiime2.util.duplicate(fp, os.path.join(tmp_forward, rp.name))
-            elif 'R2_001.fastq' in rp.name:
-                qiime2.util.duplicate(fp, os.path.join(tmp_reverse, rp.name))
+        for _, fps in manifest_df.iterrows():
+            fwd_fp = fps['forward']
+            rev_fp = fps['reverse']
+
+            fwd_no_barcode = _remove_barcode(os.path.basename(fps['forward']))
+            rev_no_barcode = _remove_barcode(os.path.basename(fps['reverse']))
+
+            qiime2.util.duplicate(fwd_fp, os.path.join(tmp_forward,
+                                                       fwd_no_barcode))
+            qiime2.util.duplicate(rev_fp, os.path.join(tmp_reverse,
+                                                       rev_no_barcode))
 
         cmd = ['run_dada.R',
                '--input_directory', tmp_forward,
@@ -321,7 +339,19 @@ def denoise_paired(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
                 raise Exception("An error was encountered while running DADA2"
                                 " in R (return code %d), please inspect stdout"
                                 " and stderr to learn more." % e.returncode)
-        return _denoise_helper(biom_fp, track_fp, hashed_feature_ids)
+
+        return _denoise_helper(biom_fp, track_fp, hashed_feature_ids,
+                               paired=True)
+
+
+def _remove_barcode(filename):
+    cut = filename.rsplit('_', 3)
+    id_ = cut[0].rsplit('_', 1)[0]
+
+    cut = cut[1:]
+    cut.insert(0, id_)
+
+    return ('_'.join(cut))
 
 
 def denoise_pyro(demultiplexed_seqs: SingleLanePerSampleSingleEndFastqDirFmt,


=====================================
q2_dada2/_examples.py
=====================================
@@ -0,0 +1,58 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2022, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+
+demux_single_url = \
+    'https://data.qiime2.org/usage-examples/moving-pictures/demux.qza'
+
+demux_paired_url = \
+    'https://data.qiime2.org/usage-examples/atacama-soils/demux-full.qza'
+
+
+def denoise_single(use):
+    demux_single = use.init_artifact_from_url('demux_single', demux_single_url)
+
+    rep_seqs, table_dada2, denoise_stats = use.action(
+        use.UsageAction('dada2', 'denoise_single'),
+        use.UsageInputs(
+            demultiplexed_seqs=demux_single,
+            trim_left=0,
+            trunc_len=120
+        ),
+        use.UsageOutputNames(
+            representative_sequences='representative_sequences',
+            table='table',
+            denoising_stats='denoising_stats'
+        )
+    )
+
+    rep_seqs.assert_output_type('FeatureData[Sequence]')
+    table_dada2.assert_output_type('FeatureTable[Frequency]')
+    denoise_stats.assert_output_type('SampleData[DADA2Stats]')
+
+
+def denoise_paired(use):
+    demux_paired = use.init_artifact_from_url('demux_paired', demux_paired_url)
+
+    rep_seqs, table_dada2, denoise_stats = use.action(
+        use.UsageAction('dada2', 'denoise_paired'),
+        use.UsageInputs(
+            demultiplexed_seqs=demux_paired,
+            trunc_len_f=150,
+            trunc_len_r=140,
+        ),
+        use.UsageOutputNames(
+            representative_sequences='representative_sequences',
+            table='table',
+            denoising_stats='denoising_stats'
+        )
+    )
+
+    rep_seqs.assert_output_type('FeatureData[Sequence]')
+    table_dada2.assert_output_type('FeatureTable[Frequency]')
+    denoise_stats.assert_output_type('SampleData[DADA2Stats]')


=====================================
q2_dada2/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
     # setup.py/versioneer.py will grep for the variable names, so they must
     # each be defined on a line of their own. _version.py will just call
     # get_keywords().
-    git_refnames = " (tag: 2022.8.0)"
-    git_full = "dc9bdc5b0346353ebf9a63bc63a60bd45a3405b6"
-    git_date = "2022-08-23 17:11:59 +0000"
+    git_refnames = " (HEAD -> master, tag: 2022.11.2)"
+    git_full = "0f6c02100a7d2c7775b026bd7204c7c762e0b942"
+    git_date = "2023-01-12 11:23:23 -0700"
     keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
     return keywords
 


=====================================
q2_dada2/assets/run_dada.R
=====================================
@@ -128,6 +128,16 @@
 #    Ex: 32
 #
 
+# error handling -----------------
+options(error = function() {
+  sink(stderr())
+  on.exit(sink(NULL))
+  traceback(3)
+  if (!interactive()) {
+    q(status = 1)
+  }
+})
+
 library("optparse")
 
 cat(R.version$version.string, "\n")
@@ -222,7 +232,7 @@ minOverlap <- if(opt$min_overlap=='NULL') NULL else as.integer(opt$min_overlap)
 poolMethod <- opt$pooling_method
 chimeraMethod <- opt$chimera_method
 minParentFold <- if(opt$min_parental_fold=='NULL') NULL else as.numeric(opt$min_parental_fold)
-allowOneOff <-if(opt$allow_one_off=='NULL') NULL else as.logical(opt$allow_one_off) 
+allowOneOff <-if(opt$allow_one_off=='NULL') NULL else as.logical(opt$allow_one_off)
 nthreads <- if(opt$num_threads=='NULL') NULL else as.integer(opt$num_threads)
 nreads.learn <- if(opt$learn_min_reads=='NULL') NULL else as.integer(opt$learn_min_reads)
 # The following args are not directly exposed to end users in q2-dada2,
@@ -256,7 +266,7 @@ if(!dir.exists(inp.dir)) {
     if(length(unfilts) != length(unfiltsR)) {
       errQuit("Different numbers of forward and reverse .fastq.gz files.")
     }
-    
+
   }
 
 }
@@ -311,7 +321,7 @@ if(primer.removed.dir!='NULL'){ #for CCS read analysis
   out <- suppressWarnings(filterAndTrim(nop, filts, truncLen = truncLen, trimLeft = trimLeft,
                                         maxEE = maxEE, truncQ = truncQ, rm.phix = FALSE,
                                         multithread = multithread, maxLen = maxLen, minLen = minLen, minQ = 3))
-}else{  
+}else{
   filts <- file.path(filtered.dir, basename(unfilts))
   if(inp.dirR!='NULL'){#for paired read analysis
     filtsR <- file.path(filtered.dirR, basename(unfiltsR))
@@ -346,10 +356,10 @@ if(primer.removed.dir!='NULL'){#for CCS read analysis
                                       multithread=multithread, BAND_SIZE=BAND_SIZE))
 
 }else if(inp.dirR!='NULL'){#for paired read analysis
-  
+
   err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, multithread=multithread))
   errR <- suppressWarnings(learnErrors(filtsR, nreads=nreads.learn, multithread=multithread))
-  
+
 }else{#for sinlge/pyro read analysis
   err <- suppressWarnings(learnErrors(filts, nreads=nreads.learn, multithread=multithread,
                                       HOMOPOLYMER_GAP_PENALTY=HOMOPOLYMER_GAP_PENALTY, BAND_SIZE=BAND_SIZE))
@@ -397,7 +407,7 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
   ddsR <- vector("list", length(filts))
   mergers <- vector("list", length(filts))
   cat("3) Denoise samples ")
-  
+
   for(j in seq(length(filts))) {
     drpF <- derepFastq(filts[[j]])
     ddsF[[j]] <- dada(drpF, err=err, multithread=multithread, verbose=FALSE)
@@ -431,7 +441,7 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
     cat("\n")
     ### \code copied from previous loop through samples in this script
   }
-  
+
   ### Now loop through and do merging
   for(j in seq(length(filts))) {
     drpF <- derepFastq(filts[[j]])
@@ -443,7 +453,7 @@ if(inp.dirR =='NULL'){#for CCS/sinlge/pyro read analysis
   cat("\n")
   # Make sequence table
   seqtab <- makeSequenceTable(mergers)
-  
+
 }
 
 
@@ -493,4 +503,4 @@ write.table(seqtab.nochim, out.path, sep="\t",
             row.names=TRUE, col.names=col.names, quote=FALSE)
 saveRDS(seqtab.nochim, gsub("tsv", "rds", out.path)) ### TESTING
 
-q(status=0)
\ No newline at end of file
+q(status=0)


=====================================
q2_dada2/plugin_setup.py
=====================================
@@ -16,6 +16,7 @@ from q2_types.feature_table import FeatureTable, Frequency
 
 import q2_dada2
 from q2_dada2 import DADA2Stats, DADA2StatsFormat, DADA2StatsDirFmt
+import q2_dada2._examples as ex
 
 _POOL_OPT = {'pseudo', 'independent'}
 _CHIM_OPT = {'pooled', 'consensus', 'none'}
@@ -128,7 +129,10 @@ plugin.methods.register_function(
     },
     name='Denoise and dereplicate single-end sequences',
     description=('This method denoises single-end sequences, dereplicates '
-                 'them, and filters chimeras.')
+                 'them, and filters chimeras.'),
+    examples={
+        'denoise_single': ex.denoise_single
+    }
 )
 
 
@@ -254,7 +258,10 @@ plugin.methods.register_function(
     },
     name='Denoise and dereplicate paired-end sequences',
     description=('This method denoises paired-end sequences, dereplicates '
-                 'them, and filters chimeras.')
+                 'them, and filters chimeras.'),
+    examples={
+        'denoise_paired': ex.denoise_paired
+    }
 )
 
 
@@ -304,7 +311,7 @@ plugin.methods.register_function(
                    'longer than this value. If 0 is provided no reads will '
                    'be removed based on length.',
         'pooling_method': 'The method used to pool samples for denoising. '
-                          '"independent": Samples are denoised indpendently. '
+                          '"independent": Samples are denoised independently. '
                           '"pseudo": The pseudo-pooling method is used to '
                           'approximate pooling of samples. In short, samples '
                           'are denoised independently once, ASVs detected '


=====================================
q2_dada2/tests/data/mixed_barcodes_and_ids/MANIFEST
=====================================
@@ -0,0 +1,5 @@
+sample-id,filename,direction
+V130,V130_166_L001_R1_001.fastq.gz,forward
+V130,V130_742_L001_R2_001.fastq.gz,reverse
+V130_2,V130_2_167_L001_R1_001.fastq.gz,forward
+V130_2,V130_2_743_L001_R2_001.fastq.gz,reverse


=====================================
q2_dada2/tests/data/mixed_barcodes_and_ids/V130_166_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_dada2/tests/data/mixed_barcodes_and_ids/V130_166_L001_R1_001.fastq.gz differ


=====================================
q2_dada2/tests/data/mixed_barcodes_and_ids/V130_2_167_L001_R1_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_dada2/tests/data/mixed_barcodes_and_ids/V130_2_167_L001_R1_001.fastq.gz differ


=====================================
q2_dada2/tests/data/mixed_barcodes_and_ids/V130_2_743_L001_R2_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_dada2/tests/data/mixed_barcodes_and_ids/V130_2_743_L001_R2_001.fastq.gz differ


=====================================
q2_dada2/tests/data/mixed_barcodes_and_ids/V130_742_L001_R2_001.fastq.gz
=====================================
Binary files /dev/null and b/q2_dada2/tests/data/mixed_barcodes_and_ids/V130_742_L001_R2_001.fastq.gz differ


=====================================
q2_dada2/tests/data/mixed_barcodes_and_ids/metadata.yml
=====================================
@@ -0,0 +1 @@
+{phred-offset: 33}


=====================================
q2_dada2/tests/data/single_feature.tsv
=====================================
@@ -1,2 +1,2 @@
 #OTU ID	L1S208	L1S257	L1S57	L1S76	L2S155	L2S175	L2S309	L2S357	L3S294	L3S313	L4S112	L4S63	L5S155	L5S174	L6S20	L6S68
-b32621bcd86cb99e846d8f6fee7c9ab8	43.0	49.0	75.0	40.0	0.0	5.0	0.0	0.0	0.0	5.0	5.0	0.0	0.0	0.0	0.0	0.0
+b32621bcd86cb99e846d8f6fee7c9ab8	43.0	49.0	75.0	40.0	0.0	5.0	0.0	0.0	0.0	5.0	5.0	0.0	0.0	0.0	0.0	0.0
\ No newline at end of file


=====================================
q2_dada2/tests/test_denoise.py
=====================================
@@ -24,6 +24,13 @@ def _sort_seqs(seqs):
     return sorted(list(seqs), key=lambda x: x.metadata['id'])
 
 
+class TestExamples(TestPluginBase):
+    package = 'q2_dada2.tests'
+
+    def test_examples(self):
+        self.execute_examples()
+
+
 class TestDenoiseSingle(TestPluginBase):
     package = 'q2_dada2.tests'
 
@@ -73,6 +80,14 @@ class TestDenoiseSingle(TestPluginBase):
                          _sort_seqs(exp_rep_seqs))
         self.assertEqual(md, exp_md)
 
+    def test_mixed_barcodes_and_ids(self):
+        demux_seqs = SingleLanePerSamplePairedEndFastqDirFmt(
+            self.get_data_path('mixed_barcodes_and_ids'), 'r')
+
+        denoise_paired(demux_seqs, 150, 150)
+
+        self.assertTrue(True)
+
     def test_all_reads_filtered(self):
         with self.assertRaisesRegex(ValueError, 'filter'):
             denoise_single(self.demux_seqs, 10000)


=====================================
setup.py
=====================================
@@ -26,6 +26,7 @@ setup(
         'q2_dada2': ['citations.bib'],
         'q2_dada2.tests': ['data/*',
                            'data/expected/*',
+                           'data/mixed_barcodes_and_ids/*',
                            'data/underscore_samples/*',
                            'data/sample_seqs_single/*',
                            'data/sample_seqs_ccs/*',



View it on GitLab: https://salsa.debian.org/med-team/q2-dada2/-/commit/7214e78ba98199cce46aab1de4988106c44ff56c

-- 
View it on GitLab: https://salsa.debian.org/med-team/q2-dada2/-/commit/7214e78ba98199cce46aab1de4988106c44ff56c
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20230112/4d06c4d8/attachment-0001.htm>