[med-svn] [Git][med-team/bcbio][upstream] New upstream version 1.1.9

Wed Dec 11 14:07:49 GMT 2019


Steffen Möller pushed to branch upstream at Debian Med / bcbio


Commits:
f8a1a562 by Steffen Möller at 2019-12-11T10:15:36Z
New upstream version 1.1.9
- - - - -


30 changed files:

- HISTORY.md
- bcbio/bam/__init__.py
- bcbio/broad/picardrun.py
- bcbio/chipseq/__init__.py
- + bcbio/chipseq/atac.py
- bcbio/chipseq/macs2.py
- bcbio/chipseq/peaks.py
- bcbio/heterogeneity/chromhacks.py
- bcbio/ngsalign/bwa.py
- bcbio/ngsalign/star.py
- bcbio/pipeline/rnaseq.py
- bcbio/pipeline/run_info.py
- bcbio/pipeline/sample.py
- bcbio/qc/multiqc.py
- bcbio/qc/viral.py
- bcbio/upload/__init__.py
- bcbio/variation/effects.py
- docs/contents/configuration.rst
- docs/contents/internals.rst
- docs/contents/outputs.rst
- requirements-conda.txt
- requirements.txt
- scripts/bcbio_setup_genome.py
- setup.py
- + tests/data/atac/atac_1.fq
- + tests/data/atac/atac_2.fq
- + tests/data/automated/run_info-atacseq.yaml
- tests/data/automated/run_info-chipseq.yaml
- tests/integration/test_automated_analysis.py
- tests/pytest.ini


Changes:

=====================================
HISTORY.md
=====================================
@@ -1,4 +1,27 @@
-## 1.1.8
+## 1.1.9 (in progress)
+- Fix for get VEP cache.
+- Support Picard's new syntax for ReorderSam (REFERENCE -> SEQUENCE_DICTIONARY).
+- Remove mitochondrial reads from ChIP/ATAC-seq calling.
+- Add documentation describing ATAC-seq outputs.
+- Add ENCODE library complexity metrics for ATAC/ChIP-seq to MultiQC report 
+  (see https://www.encodeproject.org/data-standards/terms/#library for a description of the metrics)
+- Add STAR sample-specific 2-pass. This helps assign a moderate number of reads per genes. Thanks
+  to @naumenko-sa for the intial implementation and push to get this going.
+- Index transcriptomes only once for pseudo/quasi aligner tools. This fixes race conditions that
+  can happen.
+- Add --buildversion option, for tracking which version of a gene build was used. This is used
+  during `bcbio_setup_genome.py`. Suggested formats are source_version, so Ensembl_94, 
+  EnsemblMetazoa_25, FlyBase_26, etc.
+- Sort MACS2 bedgraph files before compressing. Thanks to @LMannarino for the suggestion.
+- Check for the reserved field `sample` in RNA-seq metadata and quit with a useful error message. 
+  Thanks to @marypiper for suggesting this.
+- Split ATAC-seq BAM files into nucleosome-free and mono/di/tri nucleosome files, so we can call 
+  peaks on them separately.
+- Call peaks on NF/MN/DN/TN regions separately for each caller during ATAC-seq.
+- Allow viral contamination to be assasyed on non tumor/normal samples.
+- Ensure EBV coverage is calculated when run on genomes with it included as a contig.
+
+## 1.1.8 (28 October 2019)
 - Add `antibody` configuration option. Setting a specific antibody for ChIP-seq will use appropriate
   settings for that antibody. See the documentation for supported antibodies.
 - Add `use_lowfreq_filter` for forcing vardict to report variants with low allelic frequency,


=====================================
bcbio/bam/__init__.py
=====================================
@@ -403,11 +403,11 @@ def merge(bamfiles, out_bam, config):
     return out_bam
 
 
-def sort(in_bam, config, order="coordinate", out_dir=None):
+def sort(in_bam, config, order="coordinate", out_dir=None, force=False):
     """Sort a BAM file, skipping if already present.
     """
     assert is_bam(in_bam), "%s in not a BAM file" % in_bam
-    if bam_already_sorted(in_bam, config, order):
+    if not force and bam_already_sorted(in_bam, config, order):
         return in_bam
 
     sort_stem = _get_sort_stem(in_bam, order, out_dir)
@@ -579,3 +579,24 @@ def remove_duplicates(in_bam, data):
         do.run(cmd, message)
     index(out_bam, dd.get_config(data))
     return out_bam
+
+def count_alignments(in_bam, data, filter=None):
+    """
+    count alignments in a BAM file passing a given filter. valid filter
+    strings are available in the sambamba documentation:
+
+    https://github.com/biod/sambamba/wiki/%5Bsambamba-view%5D-Filter-expression-syntax
+    """
+    sambamba = config_utils.get_program("sambamba", dd.get_config(data))
+    num_cores = dd.get_num_cores(data)
+    if not filter:
+        filter_string = ""
+        message = f"Counting alignments in {in_bam}."
+    else:
+        filter_string = "--filter {filter}"
+        message = f"Counting alignments in {in_bam} matching {filter}."
+    cmd = f"{sambamba} view -c --nthreads {num_cores} -f bam {filter_string} {in_bam}"
+    logger.info(message)
+    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE)
+    return int(result.stdout.decode().strip())


=====================================
bcbio/broad/picardrun.py
=====================================
@@ -98,9 +98,10 @@ def picard_reorder(picard, in_bam, ref_file, out_file):
     if not file_exists(out_file):
         with tx_tmpdir(picard._config) as tmp_dir:
             with file_transaction(picard._config, out_file) as tx_out_file:
+                dict_file = "%s.dict" % os.path.splitext(ref_file)[0]
                 opts = [("INPUT", in_bam),
                         ("OUTPUT", tx_out_file),
-                        ("REFERENCE", ref_file),
+                        ("SEQUENCE_DICTIONARY", dict_file),
                         ("ALLOW_INCOMPLETE_DICT_CONCORDANCE", "true"),
                         ("TMP_DIR", tmp_dir)]
                 picard.run("ReorderSam", opts)


=====================================
bcbio/chipseq/__init__.py
=====================================
@@ -1,4 +1,5 @@
 import os
+import shutil
 import subprocess
 import sys
 import toolz as tz
@@ -10,17 +11,29 @@ from bcbio.ngsalign import bowtie2, bwa
 from bcbio.distributed.transaction import file_transaction
 from bcbio.provenance import do
 from bcbio.log import logger
-from bcbio.heterogeneity.chromhacks import get_mitochondrial_chroms
+from bcbio.heterogeneity.chromhacks import (get_mitochondrial_chroms,
+                                            get_nonmitochondrial_chroms)
+from bcbio.chipseq import atac
 
 def clean_chipseq_alignment(data):
     # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
+    method = dd.get_chip_method(data)
+    if method == "atac":
+        data = shift_ATAC(data)
     work_bam = dd.get_work_bam(data)
+    work_bam = bam.sort(work_bam, dd.get_config(data))
+    bam.index(work_bam, dd.get_config(data))
     clean_bam = remove_nonassembled_chrom(work_bam, data)
+    clean_bam = remove_mitochondrial_reads(clean_bam, data)
+    data = atac.calculate_complexity_metrics(clean_bam, data)
     if not dd.get_keep_multimapped(data):
         clean_bam = remove_multimappers(clean_bam, data)
     if not dd.get_keep_duplicates(data):
         clean_bam = bam.remove_duplicates(clean_bam, data)
     data["work_bam"] = clean_bam
+    # for ATAC-seq, brewak alignments into NF, mono/di/tri nucleosome BAM files
+    if method == "atac":
+        data  = atac.split_ATAC(data)
     encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
     if encode_bed:
         data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
@@ -35,6 +48,26 @@ def clean_chipseq_alignment(data):
                                        dd.get_work_bam(data), data)
     return [[data]]
 
+def remove_mitochondrial_reads(bam_file, data):
+    mito = get_mitochondrial_chroms(data)
+    if not mito:
+        logger.info(f"Mitochondrial chromosome not identified, skipping removal of "
+                    "mitochondrial reads from {bam_file}.")
+        return bam_file
+    nonmito = get_nonmitochondrial_chroms(data)
+    mito_bam = os.path.splitext(bam_file)[0] + "-noMito.bam"
+    if utils.file_exists(mito_bam):
+        return mito_bam
+    samtools = config_utils.get_program("samtools", dd.get_config(data))
+    nonmito_flag = " ".join(nonmito)
+    num_cores = dd.get_num_cores(data)
+    with file_transaction(mito_bam) as tx_out_bam:
+        cmd = (f"{samtools} view -bh -@ {num_cores} {bam_file} {nonmito_flag} "
+               f"> {tx_out_bam}")
+        message = f"Removing mitochondrial reads on {','.join(mito)} from {bam_file}."
+        do.run(cmd, message)
+    return mito_bam
+
 def remove_multimappers(bam_file, data):
     aligner = dd.get_aligner(data)
     if aligner:
@@ -50,13 +83,13 @@ def remove_multimappers(bam_file, data):
         unique_bam = bam_file
         logger.warn("When a BAM file is given as input, bcbio skips removal of "
                     "multimappers.")
-    logger.warn("ChIP/ATAC-seq usually requires duplicate marking, but it was disabled.")
     return unique_bam
 
 def remove_nonassembled_chrom(bam_file, data):
     """Remove non-assembled contigs from the BAM file"""
     ref_file =  dd.get_ref_file(data)
     config = dd.get_config(data)
+    bam.index(bam_file, config)
     fai = "%s.fai" % ref_file
     chrom = []
     with open(fai) as inh:
@@ -143,31 +176,35 @@ def _normalized_bam_coverage(name, bam_input, data):
 def _compute_deeptools_matrix(data):
     pass
 
-def extract_NF_regions(data):
+def shift_ATAC(data):
     """
-    extract the nucleosome free regions from the work_bam. These regions will
-    be < 100 bases
+    shift the ATAC-seq alignments
     """
     MAX_FRAG_LENGTH = 100
     sieve = config_utils.get_program("alignmentSieve", data)
     work_bam = dd.get_work_bam(data)
     num_cores = dd.get_num_cores(data)
-    out_file = os.path.splitext(work_bam)[0] + "-NF.bam"
-    log_file = os.path.splitext(work_bam)[0] + "-NF.log"
-    if file_exists(out_file):
-        data["NF_bam"] = out_file
+    out_file = os.path.splitext(work_bam)[0] + "-shifted.bam"
+    log_file = os.path.splitext(work_bam)[0] + "-shifted.log"
+    if utils.file_exists(out_file):
+        data["work_bam"] = out_file
         return data
 
-    with file_transaction(out_file) as tx_out_file, \
-         file_transaction(log_file) as tx_log_file:
-        tx_unsorted_bam = tx_out_file + ".unsorted"
-        cmd = (
-            f"{sieve} --bam ${work_bam} --outFile {tx_unsorted_bam} --ATACshift "
-            f"--numberOfProcessors {num_cores} --maxFragmentLength {MAX_FRAG_LENGTH} "
-            f"--minMappingQuality 10 "
-            f"--filterMetrics {tx_log_file} ")
-        do.run(cmd, "Extract NF regions from {work_bam} to {tx_unsorted_bam}.")
-        tx_out_file = bam.sort(tx_unsorted_bam)
-
-    data["NF_bam"] = out_file
+    unsorted_bam = os.path.splitext(out_file)[0] + ".unsorted.bam"
+    if not utils.file_exists(out_file):
+        with file_transaction(out_file) as tx_out_file, \
+            file_transaction(log_file) as tx_log_file:
+            tx_unsorted_file = os.path.splitext(tx_out_file)[0] + ".tmp.bam"
+            cmd = (
+                f"{sieve} --verbose --bam {work_bam} --outFile {tx_unsorted_file} --ATACshift "
+                f"--numberOfProcessors {num_cores} --maxFragmentLength 0 "
+                f"--minFragmentLength 0 "
+                f"--minMappingQuality 10 "
+                f"--filterMetrics {tx_log_file} ")
+            do.run(cmd, f"Shifting ATAC-seq alignments in {work_bam} to {tx_unsorted_file}.")
+            # shifting can cause the file to become unsorted
+            sorted_file = bam.sort(tx_unsorted_file, dd.get_config(data), force=True)
+            shutil.move(sorted_file, tx_out_file)
+    bam.index(out_file, dd.get_config(data))
+    data["work_bam"] = out_file
     return data


=====================================
bcbio/chipseq/atac.py
=====================================
@@ -0,0 +1,92 @@
+import os
+import toolz as tz
+from collections import namedtuple
+
+from bcbio import utils
+from bcbio.pipeline import datadict as dd
+from bcbio.pipeline import config_utils
+from bcbio.log import logger
+from bcbio.distributed.transaction import file_transaction
+from bcbio.provenance import do
+from bcbio import bam
+
+# ranges taken from Buenrostro, Nat. Methods 10, 1213–1218 (2013).
+ATACRange = namedtuple('ATACRange', ['label', 'min', 'max'])
+ATACRanges = {"NF": ATACRange("NF", 0, 100),
+              "MN": ATACRange("MN", 180, 247),
+              "DN": ATACRange("DN", 315, 473),
+              "TN": ATACRange("TN", 558, 615)}
+
+def calculate_complexity_metrics(work_bam, data):
+    """
+    the work_bam should have duplicates marked but not removed
+    mitochondrial reads should be removed 
+    """
+    bedtools = config_utils.get_program("bedtools", dd.get_config(data))
+    work_dir = dd.get_work_dir(data)
+    metrics_dir = os.path.join(work_dir, "metrics", "atac")
+    utils.safe_makedir(metrics_dir)
+    metrics_file = os.path.join(metrics_dir,
+                                f"{dd.get_sample_name(data)}-atac-metrics.csv")
+    if utils.file_exists(metrics_file):
+        data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
+        return data
+    # BAM file must be sorted by read name
+    work_bam = bam.sort(work_bam, dd.get_config(data), order="queryname")
+    with file_transaction(metrics_file) as tx_metrics_file:
+        with open(tx_metrics_file, "w") as out_handle:
+            out_handle.write("mt,m0,m1,m2\n")
+        cmd = (f"{bedtools} bamtobed -bedpe -i {work_bam} | "
+               "awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$4,$6,$9,$10}' | "
+               "sort | "
+               "uniq -c | "
+               "awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} "
+               "($1==2){m2=m2+1}{m0=m0+1}{mt=mt+$1}END{printf \"%d,%d,%d,%d\\n\", mt,m0,m1,m2}' >> "
+               f"{tx_metrics_file}")
+        message = f"Calculating ATAC-seq complexity metrics on {work_bam}, saving as {metrics_file}."
+        do.run(cmd, message)
+    data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
+    return data
+
+def calculate_encode_complexity_metrics(data):
+    metrics_file = tz.get_in(['atac', 'complexity_metrics_file'], data, None)
+    if not metrics_file:
+        return {}
+    else:
+        with open(metrics_file) as in_handle:
+            header = next(in_handle).strip().split(",")
+            values = next(in_handle).strip().split(",")
+    raw_metrics = {h: int(v) for h, v in zip(header, values)}
+    metrics = {"PBC1": raw_metrics["m1"] / raw_metrics["m0"],
+               "NRF": raw_metrics["m0"] / raw_metrics["mt"]}
+    if raw_metrics["m2"] == 0:
+        PBC2 = 0
+    else:
+        PBC2 = raw_metrics["m1"] / raw_metrics["m2"]
+    metrics["PBC2"] = PBC2
+    return(metrics)
+
+def split_ATAC(data, bam_file=None):
+    """
+    splits a BAM into nucleosome-free (NF) and mono/di/tri nucleosome BAMs based
+    on the estimated insert sizes
+    uses the current working BAM file if no BAM file is supplied
+    """
+    sambamba = config_utils.get_program("sambamba", data)
+    num_cores = dd.get_num_cores(data)
+    base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
+    bam_file = bam_file if bam_file else dd.get_work_bam(data)
+    out_stem = os.path.splitext(bam_file)[0]
+    split_files = {}
+    for arange in ATACRanges.values():
+        out_file = f"{out_stem}-{arange.label}.bam"
+        if not utils.file_exists(out_file):
+            with file_transaction(out_file) as tx_out_file:
+                cmd = base_cmd +\
+                    f'-F "template_length > {arange.min} and template_length < {arange.max}" ' +\
+                    f'{bam_file} > {tx_out_file}'
+                message = f'Splitting {arange.label} regions from {bam_file}.'
+                do.run(cmd, message)
+        split_files[arange.label] = out_file
+    data = tz.assoc_in(data, ['atac', 'align'], split_files)
+    return data


=====================================
bcbio/chipseq/macs2.py
=====================================
@@ -9,6 +9,7 @@ from bcbio.pipeline import datadict as dd
 from bcbio import bam
 from bcbio.chipseq import antibodies
 from bcbio.log import logger
+from bcbio.distributed.transaction import file_transaction
 
 def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
     """
@@ -20,7 +21,7 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, dat
     out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
     macs2_file = os.path.join(out_dir, name + "_peaks.xls")
     if utils.file_exists(out_file):
-        _compress_bdg_files(out_dir)
+        _compress_and_sort_bdg_files(out_dir, data)
         return _get_output_files(out_dir)
     macs2 = config_utils.get_program("macs2", config)
     antibody = antibodies.ANTIBODIES.get(dd.get_antibody(data).lower(), None)
@@ -40,13 +41,13 @@ def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, dat
             do.run(cmd.format(**locals()), "macs2 for %s" % name)
             utils.move_safe(macs2_file, out_file)
         except subprocess.CalledProcessError:
-            raise RuntimeWarning("macs2 terminated with an error.\n"
+            raise RuntimeWarning("macs2 terminated with an error. "
                                  "Please, check the message and report "
-                                 "error if it is related to bcbio.\n"
+                                 "error if it is related to bcbio. "
                                  "You can add specific options for the sample "
                                  "setting resources as explained in docs: "
                                  "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
-    _compress_bdg_files(out_dir)
+    _compress_and_sort_bdg_files(out_dir, data)
     return _get_output_files(out_dir)
 
 def _get_output_files(out_dir):
@@ -61,10 +62,16 @@ def _get_output_files(out_dir):
             break
     return {"main": peaks, "macs2": fns}
 
-def _compress_bdg_files(out_dir):
+def _compress_and_sort_bdg_files(out_dir, data):
     for fn in glob.glob(os.path.join(out_dir, "*bdg")):
-        cmd = "gzip  %s" % fn
-        do.run(cmd, "compress bdg file: %s" % fn)
+        out_file = fn + ".gz"
+        if utils.file_exists(out_file):
+            continue
+        bedtools = config_utils.get_program("bedtools", data)
+        with file_transaction(out_file) as tx_out_file:
+            cmd = f"{bedtools} sort -i {fn} | bgzip -c > {tx_out_file}"
+            message = f"Compressing and sorting {fn}."
+            do.run(cmd, message)
 
 def _macs2_cmd(data):
     """Main command for macs2 tool."""


=====================================
bcbio/chipseq/peaks.py
=====================================
@@ -12,7 +12,7 @@ from bcbio.pipeline import config_utils
 from bcbio.pipeline import datadict as dd
 from bcbio.provenance import do
 from bcbio.distributed.transaction import file_transaction
-
+from bcbio.chipseq import atac
 
 def get_callers():
     """Get functions related to each caller"""
@@ -44,18 +44,28 @@ def peakcall_prepare(data, run_parallel):
 
 def calling(data):
     """Main function to parallelize peak calling."""
-    chip_bam = data.get("work_bam")
-    input_bam = data.get("work_bam_input", None)
+    method = dd.get_chip_method(data)
     caller_fn = get_callers()[data["peak_fn"]]
-    name = dd.get_sample_name(data)
-    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
-    out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
-                          dd.get_chip_method(data), data["resources"], data)
-    greylistdir = greylisting(data)
-    data.update({"peaks_files": out_files})
-    # data["input_bam_filter"] = input_bam
-    if greylistdir:
-        data["greylist"] = greylistdir
+    if method == "chip":
+        chip_bam = data.get("work_bam")
+        input_bam = data.get("work_bam_input", None)
+        name = dd.get_sample_name(data)
+        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
+        out_files = caller_fn(name, chip_bam, input_bam, dd.get_genome_build(data), out_dir,
+                            dd.get_chip_method(data), data["resources"], data)
+        greylistdir = greylisting(data)
+        data.update({"peaks_files": out_files})
+        if greylistdir:
+            data["greylist"] = greylistdir
+    if method == "atac":
+        for fraction in atac.ATACRanges.keys():
+            chip_bam = tz.get_in(("atac", "align", fraction), data)
+            logger.info(f"Running peak calling with {data['peak_fn']} on the {fraction} fraction of {chip_bam}.")
+            name = dd.get_sample_name(data) + f"-{fraction}"
+            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["peak_fn"], name))
+            out_files = caller_fn(name, chip_bam, None, dd.get_genome_build(data), out_dir,
+                                  dd.get_chip_method(data), data["resources"], data)
+            data = tz.assoc_in(data, ("peaks_files", fraction), out_files)
     return [[data]]
 
 def _sync(original, processed):


=====================================
bcbio/heterogeneity/chromhacks.py
=====================================
@@ -58,3 +58,16 @@ def get_mitochondrial_chroms(data):
     ref_file = dd.get_ref_file(data)
     mito = [c.name for c in ref.file_contigs(ref_file) if is_mitochondrial(c.name)]
     return mito
+
+def get_nonmitochondrial_chroms(data):
+    ref_file = dd.get_ref_file(data)
+    nonmito = [c.name for c in ref.file_contigs(ref_file) if not is_mitochondrial(c.name)]
+    return nonmito
+
+def get_EBV(data):
+    EBVCONTIGS = ["chrEBV", "EBV"]
+    ref_file = dd.get_ref_file(data)
+    for c in ref.file_contigs(ref_file):
+        if c.name in EBVCONTIGS:
+            return c.name
+    return False


=====================================
bcbio/ngsalign/bwa.py
=====================================
@@ -44,7 +44,7 @@ def align_bam(in_bam, ref_file, names, align_dir, data):
                 tx_out_prefix = os.path.splitext(tx_out_file)[0]
                 prefix1 = "%s-in1" % tx_out_prefix
                 cmd = ("unset JAVA_HOME && "
-                       "{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
+                       "{samtools} sort -n -l 1 -@ {num_cores} -m {max_mem} {in_bam} -T {prefix1} "
                        "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                        "| {bwa_cmd} | ")
                 cmd = cmd.format(**locals()) + tobam_cl


=====================================
bcbio/ngsalign/star.py
=====================================
@@ -62,9 +62,10 @@ def align(fastq_file, pair_file, ref_file, names, align_dir, data):
         ref_file = os.path.dirname(ref_file)
 
     with file_transaction(data, align_dir) as tx_align_dir:
-        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
+        tx_1pass_dir = tx_align_dir + "1pass"
+        tx_star_dirnames = _get_star_dirnames(tx_1pass_dir, data, names)
         tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
-        safe_makedir(tx_align_dir)
+        safe_makedir(tx_1pass_dir)
         safe_makedir(tx_out_dir)
         cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
                "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
@@ -104,7 +105,55 @@ def align(fastq_file, pair_file, ref_file, names, align_dir, data):
             cmd += " " + " ".join([str(x) for x in resources.get("options", [])])
         cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
         cmd += " > {tx_final_out} "
-        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
+        run_message = "Running 1st pass of STAR aligner on %s and %s" % (fastq_file, ref_file)
+        do.run(cmd.format(**locals()), run_message, None)
+
+        sjfile = get_splicejunction_file(tx_out_dir, data)
+        sjflag = f"--sjdbFileChrStartEnd {sjfile}" if sjfile else ""
+        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
+        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
+        safe_makedir(tx_align_dir)
+        safe_makedir(tx_out_dir)
+        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
+            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
+            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
+            "--outStd BAM_Unsorted {srna_opts} "
+            "--limitOutSJcollapsed 2000000 "
+            "{sjflag} "
+            "--outSAMtype BAM Unsorted "
+            "--outSAMmapqUnique 60 "
+            "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
+        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
+        cmd += _read_group_option(names)
+        if dd.get_fusion_caller(data):
+            if "arriba" in dd.get_fusion_caller(data):
+                cmd += (
+                    "--chimSegmentMin 10 --chimOutType WithinBAM SoftClip Junctions "
+                    "--chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 "
+                    "--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 "
+                    "--alignSJstitchMismatchNmax 5 -1 5 5 "
+                    "--chimSegmentReadGapMax 3 ")
+            else: 
+                cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
+                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
+                    "--chimScoreSeparation 5 ")
+                if "oncofuse" in dd.get_fusion_caller(data):
+                    cmd += "--chimOutType Junctions "
+                else:
+                    cmd += "--chimOutType WithinBAM "
+        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
+                                    "unstranded").lower()
+        if strandedness == "unstranded" and not srna:
+            cmd += " --outSAMstrandField intronMotif "
+        if not srna:
+            cmd += " --quantMode TranscriptomeSAM "
+
+        resources = config_utils.get_resources("star", data["config"])
+        if resources.get("options", []):
+            cmd += " " + " ".join([str(x) for x in resources.get("options", [])])
+        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
+        cmd += " > {tx_final_out} "
+        run_message = "Running 2nd pass of STAR aligner on %s and %s" % (fastq_file, ref_file)
         do.run(cmd.format(**locals()), run_message, None)
 
     data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)


=====================================
bcbio/pipeline/rnaseq.py
=====================================
@@ -11,7 +11,8 @@ from bcbio.distributed.transaction import file_transaction
 from bcbio.log import logger
 
 def fast_rnaseq(samples, run_parallel):
-    samples = run_parallel("run_salmon_index", [samples])
+    to_index = determine_indexes_to_make(samples)
+    run_parallel("run_salmon_index", [to_index])
     samples = run_parallel("run_salmon_reads", samples)
     samples = run_parallel("run_counts_spikein", samples)
     samples = spikein.combine_spikein(samples)
@@ -205,6 +206,7 @@ def quantitate_expression_parallel(samples, run_parallel):
     take advantage of the threaded run_parallel environment
     """
     data = samples[0][0]
+    to_index = determine_indexes_to_make(samples)
     samples = run_parallel("generate_transcript_counts", samples)
     if "cufflinks" in dd.get_expression_caller(data):
         samples = run_parallel("run_cufflinks", samples)
@@ -213,13 +215,14 @@ def quantitate_expression_parallel(samples, run_parallel):
     if ("kallisto" in dd.get_expression_caller(data) or
         dd.get_fusion_mode(data) or
         "pizzly" in dd.get_fusion_caller(data, [])):
-        samples = run_parallel("run_kallisto_index", [samples])
+        run_parallel("run_kallisto_index", [to_index])
         samples = run_parallel("run_kallisto_rnaseq", samples)
     if "sailfish" in dd.get_expression_caller(data):
-        samples = run_parallel("run_sailfish_index", [samples])
+        run_parallel("run_sailfish_index", [to_index])
         samples = run_parallel("run_sailfish", samples)
 
     # always run salmon
+    run_parallel("run_salmon_index", [to_index])
     if dd.get_quantify_genome_alignments(data):
         if dd.get_aligner(data).lower() != "star":
             if dd.get_genome_build(data) == "hg38":
@@ -496,3 +499,25 @@ def combine_files(samples):
             data = dd.set_tx2gene(data, tx2gene_file)
         updated_samples.append([data])
     return updated_samples
+
+def determine_indexes_to_make(samples):
+    """
+    returns a subset of the samples that have different indexes in them to make sure we only
+    make each index once
+    """
+    samples = [to_single_data(x) for x in samples]
+    indexes = set()
+    tomake = []
+    for data in samples:
+        out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
+        out_stem = os.path.join(out_dir, dd.get_genome_build(data))
+        if dd.get_disambiguate(data):
+            out_stem = "-".join([out_stem] + (dd.get_disambiguate(data) or []))
+        if dd.get_disambiguate(data):
+            out_stem = "-".join([out_stem] + (dd.get_disambiguate(data) or []))
+        combined_file = out_stem + ".fa"
+        if combined_file not in indexes:
+            tomake.append(data)
+            indexes.add(combined_file)
+    return tomake
+


=====================================
bcbio/pipeline/run_info.py
=====================================
@@ -11,6 +11,7 @@ import itertools
 import operator
 import os
 import string
+import sys
 
 import six
 import toolz as tz
@@ -331,6 +332,11 @@ def _clean_metadata(data):
         if "metadata" not in data:
             data["metadata"] = {}
         data["metadata"]["batch"] = "%s-joint" % dd.get_sample_name(data)
+    analysis = dd.get_analysis(data).lower()
+    if tz.get_in(("metadata", "sample"), data) and analysis == "rna-seq":
+        logger.error("'sample' is a reserved keyword in metadata for RNA-seq. Please "
+                     "rename this column to something else and rerun.")
+        sys.exit(1)
     return data
 
 def _clean_algorithm(data):


=====================================
bcbio/pipeline/sample.py
=====================================
@@ -147,7 +147,8 @@ def process_alignment(data, alt_input=None):
             if sort_method and sort_method != "coordinate":
                 raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                  % sort_method)
-            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"],
+            ref_file = dd.get_ref_file(data)
+            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file, data["dirs"],
                                            data)
         elif bamclean == "fixrg":
             out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)


=====================================
bcbio/qc/multiqc.py
=====================================
@@ -34,6 +34,7 @@ from bcbio.variation import bedutils
 from bcbio.qc.variant import get_active_vcinfo
 from bcbio.upload import get_all_upload_paths_from_sample
 from bcbio.variation import coverage
+from bcbio.chipseq import atac
 
 def summary(*samples):
     """Summarize all quality metrics together"""
@@ -447,6 +448,13 @@ def _add_disambiguate(sample):
             sample["summary"]["metrics"]["Disambiguated ambiguous reads"] = disambigStats[2]
     return sample
 
+def _add_atac(sample):
+    atac_metrics = atac.calculate_encode_complexity_metrics(sample)
+    if not atac_metrics:
+        return sample
+    sample["summary"]["metrics"] = tz.merge(atac_metrics, sample["summary"]["metrics"])
+    return sample
+
 def _fix_duplicated_rate(dt):
     """Get RNA duplicated rate if exists and replace by samtools metric"""
     if "Duplication_Rate_of_Mapped" in dt:
@@ -461,6 +469,7 @@ def _merge_metrics(samples, out_dir):
     sample_metrics = collections.defaultdict(dict)
     for s in samples:
         s = _add_disambiguate(s)
+        s = _add_atac(s)
         m = tz.get_in(['summary', 'metrics'], s)
         if isinstance(m, six.string_types):
             m = json.loads(m)


=====================================
bcbio/qc/viral.py
=====================================
@@ -11,6 +11,7 @@ from bcbio.distributed.transaction import file_transaction
 from bcbio.pipeline import datadict as dd
 from bcbio.provenance import do
 from bcbio.variation import vcfutils
+from bcbio.heterogeneity import chromhacks
 
 def run(bam_file, data, out_dir):
     """Run viral QC analysis:
@@ -21,39 +22,50 @@ def run(bam_file, data, out_dir):
     source_link = 'https://gdc.cancer.gov/about-data/data-harmonization-and-generation/gdc-reference-files'
     viral_target = "gdc-viral"
     out = {}
-    if vcfutils.get_paired_phenotype(data):
-        viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
-        if viral_refs and utils.file_exists(viral_refs[0]):
-            viral_ref = viral_refs[0]
-            viral_bam = os.path.join(utils.safe_makedir(out_dir),
-                                     "%s-%s.bam" % (dd.get_sample_name(data),
-                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
-            out_file = "%s-completeness.txt" % utils.splitext_plus(viral_bam)[0]
-            cores = dd.get_num_cores(data)
-            if not utils.file_uptodate(out_file, bam_file):
-                if not utils.file_uptodate(viral_bam, bam_file):
-                    with file_transaction(data, viral_bam) as tx_out_file:
-                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
-                        cmd = ("samtools view -u -f 4 {bam_file} | "
-                               "bamtofastq collate=0 | "
-                               "bwa mem -t {cores} {viral_ref} - | "
-                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
-                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
-                        do.run(cmd.format(**locals()), "Align unmapped reads to viral genome")
-                with file_transaction(data, out_file) as tx_out_file:
-                    sample_name = dd.get_sample_name(data)
-                    mosdepth_prefix = os.path.splitext(viral_bam)[0]
-                    cmd = ("mosdepth -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by "
-                           "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && "
-                           "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} &&"
-                           "echo '## Sample: {sample_name}' >> {tx_out_file} && "
-                           "echo '#virus\tsize\tdepth\t1x\t5x\t25x' >> {tx_out_file} && "
-                           "paste <(zcat {mosdepth_prefix}.regions.bed.gz) <(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) | "
-                           "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
-                           "sort -n -r -k 5,5 >> {tx_out_file}")
-                    do.run(cmd.format(**locals()), "Analyse coverage of viral genomes")
-            out["base"] = out_file
-            out["secondary"] = []
+    viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
+    if viral_refs and utils.file_exists(viral_refs[0]):
+        viral_ref = viral_refs[0]
+        viral_bam = os.path.join(utils.safe_makedir(out_dir),
+                                    "%s-%s.bam" % (dd.get_sample_name(data),
+                                                utils.splitext_plus(os.path.basename(viral_ref))[0]))
+        out_file = "%s-completeness.txt" % utils.splitext_plus(viral_bam)[0]
+        cores = dd.get_num_cores(data)
+        if not utils.file_uptodate(out_file, bam_file):
+            if not utils.file_uptodate(viral_bam, bam_file):
+                with file_transaction(data, viral_bam) as tx_out_file:
+                    tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
+                    cmd = ("samtools view -u -f 4 {bam_file} | "
+                            "bamtofastq collate=0 | "
+                            "bwa mem -t {cores} {viral_ref} - | "
+                            "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
+                            "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
+                    do.run(cmd.format(**locals()), "Align unmapped reads to viral genome")
+            with file_transaction(data, out_file) as tx_out_file:
+                sample_name = dd.get_sample_name(data)
+                mosdepth_prefix = os.path.splitext(viral_bam)[0]
+                cmd = ("mosdepth -t {cores} {mosdepth_prefix} {viral_bam} -n --thresholds 1,5,25 --by "
+                        "<(awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}' {viral_ref}.fai) && "
+                        "echo '## Viral sequences (from {source_link}) found in unmapped reads' > {tx_out_file} &&"
+                        "echo '## Sample: {sample_name}' >> {tx_out_file} && "
+                        "echo '#virus\tsize\tdepth\t1x\t5x\t25x' >> {tx_out_file} && "
+                        "paste <(zcat {mosdepth_prefix}.regions.bed.gz) <(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) | "
+                        "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
+                        "sort -n -r -k 5,5 >> {tx_out_file}")
+                do.run(cmd.format(**locals()), "Analyse coverage of viral genomes")
+                if chromhacks.get_EBV(data):
+                    ref_file = dd.get_ref_file(data)
+                    work_bam = dd.get_work_bam(data)
+                    ebv = chromhacks.get_EBV(data)
+                    mosdepth_prefix = os.path.splitext(work_bam)[0] + "-EBV"
+                    cmd = ("mosdepth -t {cores} {mosdepth_prefix} {work_bam} -n --thresholds 1,5,25 --by "
+                            "<(grep {ebv} {ref_file}.fai | awk 'BEGIN {{FS=\"\\t\"}}; {{print $1 FS \"0\" FS $2}}') && "
+                            "paste <(zcat {mosdepth_prefix}.regions.bed.gz) <(zgrep -v ^# {mosdepth_prefix}.thresholds.bed.gz) | "
+                            "awk 'BEGIN {{FS=\"\\t\"}} {{ print $1 FS $3 FS $4 FS $10/$3 FS $11/$3 FS $12/$3}}' | "
+                            "sort -n -r -k 5,5 >> {tx_out_file}")
+                    do.run(cmd.format(**locals()), "Analyse coverage of EBV")
+
+        out["base"] = out_file
+        out["secondary"] = []
     return out
 
 def get_files(data):


=====================================
bcbio/upload/__init__.py
=====================================
@@ -118,6 +118,7 @@ def _get_files_chipseq(sample):
     algorithm = sample["config"]["algorithm"]
     out = _maybe_add_summary(algorithm, sample, out)
     out = _maybe_add_alignment(algorithm, sample, out)
+    out = _maybe_add_nucleosome_alignments(algorithm, sample, out)
     out = _maybe_add_peaks(algorithm, sample, out)
     out = _maybe_add_greylist(algorithm, sample, out)
     return _add_meta(out, sample)
@@ -493,6 +494,17 @@ def _maybe_add_transcriptome_alignment(sample, out):
                     "ext": "transcriptome"})
     return out
 
+def _maybe_add_nucleosome_alignments(algorithm, sample, out):
+    """
+    for ATAC-seq, also upload NF, MN, DN and TN bam files
+    """
+    atac_align = tz.get_in(("atac", "align"), sample, {})
+    for alignment in atac_align.keys():
+        out.append({"path": atac_align[alignment],
+                    "type": "bam",
+                    "ext": alignment})
+    return out
+
 def _maybe_add_counts(algorithm, sample, out):
     if not dd.get_count_file(sample):
         return out
@@ -650,14 +662,25 @@ def _maybe_add_trna(algorithm, sample, out):
 
 def _maybe_add_peaks(algorithm, sample, out):
     out_dir = sample.get("peaks_files", {})
-    for caller in out_dir:
-        if caller == "main":
-            continue
-        for fn in out_dir[caller]:
-            if os.path.exists(fn):
-                out.append({"path": fn,
-                             "dir": caller,
-                             "ext": utils.splitext_plus(fn)[1]})
+    if "NF" in out_dir.keys():
+        for files in out_dir.values():
+            for caller in files:
+                if caller == "main":
+                    continue
+                for fn in files[caller]:
+                    if os.path.exists(fn):
+                        out.append({"path": fn,
+                                    "dir": caller,
+                                    "ext": utils.splitext_plus(fn)[1]})
+    else:
+        for caller in out_dir:
+            if caller == "main":
+                continue
+            for fn in out_dir[caller]:
+                if os.path.exists(fn):
+                    out.append({"path": fn,
+                                "dir": caller,
+                                "ext": utils.splitext_plus(fn)[1]})
     return out
 
 def _maybe_add_greylist(algorithm, sample, out):


=====================================
bcbio/variation/effects.py
=====================================
@@ -139,13 +139,13 @@ def get_vep_cache(dbkey, ref_file, tooldir=None, config=None):
             resources = yaml.safe_load(in_handle)
         ensembl_name = tz.get_in(["aliases", "ensembl"], resources)
         symlink_dir = _special_dbkey_maps(dbkey, ref_file)
-        if ensembl_name and ensembl_name.find("_vep_") == -1:
-            raise ValueError("%s has ensembl an incorrect value."
-                             "It should have _vep_ in the name."
-                             "Remove line or fix the name to avoid error.")
         if symlink_dir and ensembl_name:
             species, vepv = ensembl_name.split("_vep_")
             return symlink_dir, species
+        elif ensembl_name:
+            species, vepv = ensembl_name.split("_vep_")
+            vep_dir = os.path.normpath(os.path.join(os.path.dirname(os.path.dirname(ref_file)), "vep"))
+            return vep_dir, species
     return None, None
 
 def run_vep(in_file, data):


=====================================
docs/contents/configuration.rst
=====================================
@@ -1050,9 +1050,7 @@ ChIP/ATAC sequencing
 - ``aligner`` Currently ``bowtie2`` is the only one tested. ``bwa`` is also available.
 - The ``phenotype`` and ``batch`` tags need to be set under ``metadata`` in the config YAML file. The ``phenotype`` tag will specify the chip (``phenotype: chip``) and input samples (``phenotype: input``). The ``batch`` tag will specify the input-chip pairs of samples for example, ``batch: pair1``. Same input can be used for different chip samples giving a list of distinct values: ``batch: [sample1, sample2]``.
 - ``chip_method``: currently supporting standard CHIP-seq (TF or broad regions using `chip`) or ATAC-seq (`atac`). Parameters will change depending on the option to get the best possible results. Only macs2 supported for now.
-- ``antibody``: automatically sets peakcalling options tailored for the specific anitbody. Supports
-`h3f3a`, `h3k27me3`, `h3k36me3`, `h3k4me1`, `h3k79me2`, `h3k9me3`, `h3k9me1`, `h3k9me2`, `h4k20me1`,
-`h2afz`, `h3ac`, `h3k27ac`, `h3k4me2`, `h3k4me3`, `h3k9ac`, `h3k9me3`.
+- ``antibody``: automatically sets peakcalling options tailored for the specific anitbody. Supports `h3f3a`, `h3k27me3`, `h3k36me3`, `h3k4me1`, `h3k79me2`, `h3k9me3`, `h3k9me1`, `h3k9me2`, `h4k20me1`, `h2afz`, `h3ac`, `h3k27ac`, `h3k4me2`, `h3k4me3`, `h3k9ac`, `h3k9me3`.
 - ``keep_duplicates``: do not remove duplicates before peak calling. Defaults to `False`.
 - ``keep_multimapped``: do not remove multimappers before peak calling. Defaults to `False`.
 
@@ -1544,11 +1542,13 @@ Adding custom genomes
 ``bcbio_setup_genome.py`` will help you to install a custom genome and apply all changes needed
 to the configuration files. It needs the genome in FASTA format, and the annotation file
 in GTF or GFF3 format. It can create index for all aligners used by bcbio. Moreover, it will create
-the folder `rnaseq` to allow you run the RNAseq pipeline without further configuration.
+the folder `rnaseq` to allow you run the RNAseq pipeline without further configuration. The ``--buildversion``
+option will write that string to the ``version.txt`` file, to track from where and which version of
+a gene build was used.
 
 ::
 
-    bcbio_setup_genome.py -f genome.fa -g annotation.gtf -i bowtie2 star seq -n Celegans -b WBcel135
+    bcbio_setup_genome.py -f genome.fa -g annotation.gtf -i bowtie2 star seq -n Celegans -b WBcel135 --buildversion WormBase_34
 
 If you want to add smallRNA-seq data files, you will need to add the 3 letters code of mirbase
 for your genome (i.e hsa for human) and the GTF file for the annotation of smallRNA data.
@@ -1556,7 +1556,7 @@ Here you can use the same file than the transcriptome if no other available.
 
 ::
 
-    bcbio_setup_genome.py -f genome.fa -g annotation.gtf -i bowtie2 star seq -n Celegans -b WBcel135 --species cel --srna_gtf another_annotation.gtf
+    bcbio_setup_genome.py -f genome.fa -g annotation.gtf -i bowtie2 star seq -n Celegans -b WBcel135 --species cel --srna_gtf another_annotation.gtf --buildversion WormBase_34
 
 To use that genome just need to configure your YAML files as::
 


=====================================
docs/contents/internals.rst
=====================================
@@ -268,7 +268,8 @@ bcbio.yaml config::
     --bc2 harvard-indrop-v3-cb2.txt.gz demultiplexed/[sample-barcodeAATTTTT].fq  | \
     gzip -c > project-sample_barcode.filtered.fq.gz
 
-- step 4*: create cellular barcode histrogram::
+- step 4*: create cellular barcode histogram, also creates cb-histogram-filtered.txt for cells
+  with nreads > minimum_barcode_depth::
 
     python umis cb_histogram project-sample_barcode.filtered.fq.gz > cb-histogram.txt
 
@@ -301,3 +302,13 @@ bcbio.yaml config::
 - step 8. Concatenate all cb-histogram-filtered.txt files::
 
     cat project-[all-barcodes]/cb-histogram-filtered.txt > cb-histogram.txt
+    
+Tests
+~~~~~
+To run bcbio automated tests, install bcbio and clone bcbio master repository. You are testing your installation with tests provided in bcbio-nextgen/tests::
+
+    which bcbio_nextgen.py
+    cd bcbio-nextgen/tests
+    ./run_tests.sh > tests.out
+    
+Tests are in integration/*.py. Each test has a set or marks. Marks are listed in pytest.ini. The mark defines how many tests to select. By default (just running plain ./run_tests.sh), it is speed1 = 11 tests. 


=====================================
docs/contents/outputs.rst
=====================================
@@ -28,6 +28,7 @@ Project directory
 - ``multiqc`` run `MultiQC`_ to gather all QC metrics from different tools, such as,
   cutadapt, featureCounts, samtools, STAR ... into an unique HTML report.
 - ``metadata.csv`` -- CSV with the metadata in the YAML file.
+- ``data_versions.csv`` -- Data versions for bcbio-nextgen and software
 
 .. _MultiQC: http://multiqc.info
 
@@ -68,29 +69,70 @@ RNA-seq
 Project directory
 ~~~~~~~~~~~~~~~~~
 
-- ``annotated_combined.counts`` -- featureCounts counts matrix
-  with gene symbol as an extra column.
-- ``combined.counts`` -- featureCounts counts matrix
-  with gene symbol as an extra column.
-- ``combined.dexseq`` -- DEXseq counts matrix with
-  exonID as first column.
-- ``combined.gene.sf.tmp`` -- Sailfish gene count
-  matrix normalized to TPM.
-- ``combined.isoform.sf.tpm`` -- Sailfish transcript
-  count matix normalized to TPM.
-- ``combined.sf`` -- Sailfish raw output, all samples
-  files are pasted one after another.
-- ``tx2gene.csv`` -- Annotation file needed for DESeq2
-  to use Sailfish output.
+| ├── annotated_combined.counts -- gene counts with symbols from featureCounts (don't use this)
+| ├── bcbio-nextgen-commands.log -- commands run by bcbio
+| ├── bcbio-nextgen.log -- logging information from bcbio run
+| ├── combined.counts -- gene counts with gene IDs from featureCounts (don't use this)
+| ├── metadata.csv -- provided metadata about each sample
+| ├── multiqc
+|     ├── multiqc_report.html -- multiQC report
+| ├── programs.txt -- program versions of tools run
+| ├── project-summary.yaml -- YAML description of project, with derived metadata
+| └── tx2gene.csv -- transcript to gene mappings for use with tximport
 
 Sample directories
 ~~~~~~~~~~~~~~~~~~
 
-- ``SAMPLE-transcriptome.bam`` -- BAM file aligned to transcriptome.
-- ``SAMPLE-ready.counts`` -- featureCounts gene counts output.
-- ``salmon`` -- Salmon output.
-
-single cell RNA-seq
+| S1
+| ├── S1-ready.bam -- coordinate-sorted whole genome alignments
+| ├── S1-ready.counts -- featureCounts counts (don't use this)
+| ├── S1-transcriptome.bam -- alignments to the transcriptome
+| ├── salmon
+| │   ├── abundance.h5 -- h5 object, usable with `sleuth`_
+| │   └── quant.sf -- salmon quantifications, usable with `tximport`_
+| └── STAR
+|     ├── S1-SJ.bed -- STAR junction file in BED format
+|     └── S1-SJ.tab -- STAR junction file in tabular format
+
+bcbioRNASeq directory
+~~~~~~~~~~~~~~~~~~~~~
+| bcbioRNASeq/
+| ├── data
+| │   ├── bcb.rda -- `bcbioRNASeq`_ object with gene-level data
+| ├── data-transcript
+| │   ├── bcb.rda -- `bcbioRNASeq`_ object with transcript-level data
+| ├── quality_control.html -- quality control report
+| ├── quality_control.Rmd -- RMarkdown that generated quality control report
+| ├── results
+| │   └── 2019-11-21
+| │       ├── gene -- gene level information
+| │       │   └── counts
+| │       │       ├── counts.csv.gz -- count matrix from tximport, suitable for count-based analyses
+| │       │       └── metadata.csv.gz -- metadata and quality control data for samples
+| │       ├── quality_control
+| │       │   └── tpm.csv.gz -- TPM from tximport, use for visualization
+| │       └── transcript -- transcript level information
+| │           └── counts
+| │               ├── counts.csv.gz -- transcript level count matrix, suitable for count-based analyses needed transcript-level data
+| │               └── metadata.csv.gz -- metadata and quality control for samples
+
+.. _sleuth: http://seqcluster.readthedocs.io/mirna_annotation.html
+.. _tximport: https://bioconductor.org/packages/devel/bioc/vignettes/tximport/inst/doc/tximport.html
+.. _bcbioRNASeq: https://bioinformatics.sph.harvard.edu/bcbioRNASeq/
+
+Workflow for analysis
+~~~~~~~~~~~~~~~~~~~~~
+For gene-level analyses, we recommend loading the gene-level counts.csv.gz and the metadata.csv.gz and using `DESeq2`_ to do the analysis. For a more in-depth walkthrough of how to use DESeq2, refer to our `DGE_workshop`_.
+
+For transcript-level analyses, we recommend using `sleuth`_ with the bootstrap samples. You can load the abundance.h5 files from Salmon, or if you set ``kallisto`` as an expression caller, use the abundance.h5 files from that.
+
+Another great alternative is to use the Salmon quantification to look at differential transcript usage (DTU) instead of differential transcript expression (DTE). The idea behind DTU is you are looking for transcripts of genes that have been flipped from one isoform to another. The `Swimming downstream`_ tutorial has a nice walkthrough of how to do that.
+
+.. _DESeq2: https://bioconductor.org/packages/release/bioc/html/DESeq2.html
+.. _DGE_workshop: https://hbctraining.github.io/DGE_workshop_salmon/schedule/
+.. _Swimming downstream: http://www.bioconductor.org/packages/devel/workflows/vignettes/rnaseqDTU/inst/doc/rnaseqDTU.html#salmon-quantification
+
+single cell RNA-Seq
 ===================
 
 Project directory
@@ -174,6 +216,32 @@ Sample directories
 .. _seqcluster: https://github.com/lpantano/seqcluster
 .. _tdrmapper: https://github.com/sararselitsky/tDRmapper
 
+
+ATAC-seq
+========
+
+Sample directories
+~~~~~~~~~~~~~~~~~~
+Below is an example sample directory for a sample called `rep1`. There are four
+sets of peak files for each sample, for each peak caller, one set for each of
+the nucleosome-free (NF), mononucleosome (MF), dinucleosome (DF) and trinucleosome
+(TF) regions. There are BAM files of reach of those regions as well.
+
+| rep1
+| ├── macs2 -- peak calls from macs2
+| │   ├── rep1-NF_control_lambda.bdg.gz -- local lambda estimate for poisson distribution from control samples in bedgraph format, NF regions only
+| │   ├── rep1-NF_peaks.narrowpeak -- peaks in `narrowPeak`_ format in NF regions
+| │   ├── rep1-NF_summits.bed -- top of peak in bed format of NF regions
+| │   └── rep1-NF_treat_pileup.bdg.gz -- bedgraph for rep1 sample of NF regions
+| ├── rep1-NF.bam -- BAM of just nucleosome free regions
+| ├── rep1-MN.bam -- BAM of just mononucleosome regions
+| ├── rep1-DN.bam -- BAM of just dinucleosome regions
+| ├── rep1-TN.bam -- BAM of just trinucleosome regions
+| ├── rep1-ready.bam -- BAM of all alignments
+| └── rep1-ready.bw -- bigwig file of all alignments
+
+.. _narrowPeak: http://genome.ucsc.edu/faq/faqformat.html#format12
+
 Downstream analysis
 ===================
 


=====================================
requirements-conda.txt
=====================================
@@ -1 +1 @@
-bcbio-nextgen=1.1.7
+bcbio-nextgen=1.1.8


=====================================
requirements.txt
=====================================
@@ -1 +1 @@
-bcbio-nextgen==1.1.7
+bcbio-nextgen==1.1.8


=====================================
scripts/bcbio_setup_genome.py
=====================================
@@ -237,7 +237,9 @@ if __name__ == "__main__":
                         help="Add ERCC spike-ins.")
     parser.add_argument("--mirbase", help="species in mirbase for smallRNAseq data.")
     parser.add_argument("--srna_gtf", help="gtf to use for smallRNAseq data.")
-
+    parser.add_argument("--buildversion", required=True, 
+	                help=("String describing build of genome used. Examples: "
+                              "Ensembl_94, EnsemblMetazoa_94, Flybase_21, etc"))
     args = parser.parse_args()
  #   if not all([args.mirbase, args.srna_gtf]) and any([args.mirbase, args.srna_gtf]):
  #       raise ValueError("--mirbase and --srna_gtf both need a value.")
@@ -306,7 +308,7 @@ if __name__ == "__main__":
     if args.gtf:
         "Preparing transcriptome."
         with chdir(os.path.join(build_dir, os.pardir)):
-            cmd = ("{sys.executable} {prepare_tx} --cores {args.cores} --genome-dir {genome_dir} "
+            cmd = ("{sys.executable} {prepare_tx} --buildversion {args.buildversion} --cores {args.cores} --genome-dir {genome_dir} "
                    "--gtf {gtf_file} {args.name} {args.build}")
             subprocess.check_call(cmd.format(**locals()), shell=True)
     if args.mirbase:


=====================================
setup.py
=====================================
@@ -4,7 +4,7 @@
 import os
 from setuptools import setup, find_packages
 
-version = "1.1.8"
+version = "1.1.9a"
 
 def write_version_py():
     version_py = os.path.join(os.path.dirname(__file__), 'bcbio', 'pipeline',


=====================================
tests/data/atac/atac_1.fq
=====================================
The diff for this file was not included because it is too large.

=====================================
tests/data/atac/atac_2.fq
=====================================
The diff for this file was not included because it is too large.

=====================================
tests/data/automated/run_info-atacseq.yaml
=====================================
@@ -0,0 +1,17 @@
+upload:
+  dir: upload
+details:
+  - analysis: chip-seq
+    files: [../data/atac/atac_1.fq, ../data/atac/atac_2.fq]
+    algorithm:
+      chip_method: atac
+      aligner: bwa
+      adapters: [truseq]
+      peakcaller: macs2
+      keep_multimapped: False
+      keep_duplicates: False
+    description: Test1
+    genome_build: mm9
+    metadata:
+      batch: b1
+      phenotype: input


=====================================
tests/data/automated/run_info-chipseq.yaml
=====================================
@@ -2,7 +2,7 @@ upload:
   dir: upload
 details:
   - analysis: chip-seq
-    files: ['../data/test_chipseq/Hsapiens_Mmusculus_1.fq']
+    files: [../data/100326_FC6107FAAXX/7_100326_FC6107FAAXX_1_fastq.txt]
     algorithm:
       aligner: bwa
       quality_format: Standard


=====================================
tests/integration/test_automated_analysis.py
=====================================
@@ -189,6 +189,17 @@ def test_chipseq(install_test_files, data_dir):
               os.path.join(data_dir, "run_info-chipseq.yaml")]
         subprocess.check_call(cl)
 
+ at pytest.mark.atacseq
+def test_atacseq(install_test_files, data_dir):
+    """
+    Test ATAC-seq pipeline
+    """
+    with make_workdir() as workdir:
+        cl = ["bcbio_nextgen.py",
+              get_post_process_yaml(data_dir, workdir),
+              os.path.join(data_dir, os.pardir, "test_atacseq"),
+              os.path.join(data_dir, "run_info-atacseq.yaml")]
+        subprocess.check_call(cl)
 
 @pytest.mark.speed1
 @pytest.mark.ensemble


=====================================
tests/pytest.ini
=====================================
@@ -50,3 +50,4 @@ markers =
     explant: test explant disambiguation
     chipseq: test ChIP-seq pipeline
     ericscript: test Ericscript integration
+    atacseq: test ATAC-seq pipeline



View it on GitLab: https://salsa.debian.org/med-team/bcbio/commit/f8a1a562c9b43ccd7376ec48ac5f155cecaf313f

-- 
View it on GitLab: https://salsa.debian.org/med-team/bcbio/commit/f8a1a562c9b43ccd7376ec48ac5f155cecaf313f
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20191211/5277e163/attachment-0001.html>