[med-svn] [Git][med-team/cnvkit][upstream] New upstream version 0.9.8
Michael R. Crusoe
gitlab@salsa.debian.org
Fri Jan 1 17:27:44 GMT 2021
Michael R. Crusoe pushed to branch upstream at Debian Med / cnvkit
Commits:
3c12d4a0 by Michael R. Crusoe at 2021-01-01T16:16:00+01:00
New upstream version 0.9.8
- - - - -
16 changed files:
- cnvlib/_version.py
- cnvlib/autobin.py
- cnvlib/batch.py
- cnvlib/commands.py
- cnvlib/coverage.py
- cnvlib/export.py
- cnvlib/fix.py
- cnvlib/rna.py
- cnvlib/samutil.py
- cnvlib/segmentation/__init__.py
- docker/Dockerfile
- scripts/cnv_expression_correlate.py
- scripts/guess_baits.py
- skgenome/intersect.py
- skgenome/tabio/__init__.py
- skgenome/tabio/vcfsimple.py
Changes:
=====================================
cnvlib/_version.py
=====================================
@@ -1 +1 @@
-__version__ = "0.9.7"
+__version__ = "0.9.8"
=====================================
cnvlib/autobin.py
=====================================
@@ -23,7 +23,7 @@ def midsize_file(fnames):
def do_autobin(bam_fname, method, targets=None, access=None,
bp_per_bin=100000., target_min_size=20, target_max_size=50000,
- antitarget_min_size=500, antitarget_max_size=1000000):
+ antitarget_min_size=500, antitarget_max_size=1000000, fasta=None):
"""Quickly calculate reasonable bin sizes from BAM read counts.
Parameters
@@ -70,8 +70,8 @@ def do_autobin(bam_fname, method, targets=None, access=None,
return bin_size
samutil.ensure_bam_index(bam_fname)
- rc_table = samutil.idxstats(bam_fname, drop_unmapped=True)
- read_len = samutil.get_read_length(bam_fname)
+ rc_table = samutil.idxstats(bam_fname, drop_unmapped=True, fasta=fasta)
+ read_len = samutil.get_read_length(bam_fname, fasta=fasta)
logging.info("Estimated read length %s", read_len)
# Dispatch
@@ -80,11 +80,11 @@ def do_autobin(bam_fname, method, targets=None, access=None,
# rc_table = update_chrom_length(rc_table, targets)
# tgt_depth = average_depth(rc_table, read_len)
# By sampling
- tgt_depth = sample_region_cov(bam_fname, targets)
+ tgt_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
anti_depth = None
elif method == 'hybrid':
tgt_depth, anti_depth = hybrid(rc_table, read_len, bam_fname, targets,
- access)
+ access, fasta)
elif method == 'wgs':
if access is not None and len(access):
rc_table = update_chrom_length(rc_table, access)
@@ -99,7 +99,7 @@ def do_autobin(bam_fname, method, targets=None, access=None,
(anti_depth, anti_bin_size))
-def hybrid(rc_table, read_len, bam_fname, targets, access=None):
+def hybrid(rc_table, read_len, bam_fname, targets, access=None, fasta=None):
"""Hybrid capture sequencing."""
# Identify off-target regions
if access is None:
@@ -111,7 +111,7 @@ def hybrid(rc_table, read_len, bam_fname, targets, access=None):
rc_table, targets, antitargets = shared_chroms(rc_table, targets,
antitargets)
# Deal with targets
- target_depth = sample_region_cov(bam_fname, targets)
+ target_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
# Antitargets: subtract captured reads from total
target_length = region_size_by_chrom(targets)['length']
target_reads = (target_length * target_depth / read_len).values
@@ -142,13 +142,13 @@ def idxstats2ga(table, bam_fname):
meta_dict={'filename': bam_fname})
-def sample_region_cov(bam_fname, regions, max_num=100):
+def sample_region_cov(bam_fname, regions, max_num=100, fasta=None):
"""Calculate read depth in a randomly sampled subset of regions."""
midsize_regions = sample_midsize_regions(regions, max_num)
with tempfile.NamedTemporaryFile(suffix='.bed', mode='w+t') as f:
tabio.write(regions.as_dataframe(midsize_regions), f, 'bed4')
f.flush()
- table = coverage.bedcov(f.name, bam_fname, 0)
+ table = coverage.bedcov(f.name, bam_fname, 0, fasta)
# Mean read depth across all sampled regions
return table.basecount.sum() / (table.end - table.start).sum()
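
Note: the recurring change in this release threads an optional fasta argument
down the call chain so that pysam can decode CRAM inputs, which need the
reference genome, via its reference_filename parameter (pysam.Samfile, used
above, is a legacy alias of pysam.AlignmentFile). A minimal sketch of the
underlying call, with hypothetical file paths:

    from itertools import islice
    import pysam

    # For CRAM inputs pysam/htslib needs the reference genome to decode
    # read sequences; for BAM the argument is accepted and ignored, so one
    # call works for both formats. Mode 'r' auto-detects the format on read.
    aln = pysam.AlignmentFile("sample.cram", "r",
                              reference_filename="hg19.fa")
    for read in islice(aln, 5):
        print(read.query_name, read.reference_name, read.reference_start)
    aln.close()
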
=====================================
cnvlib/batch.py
=====================================
@@ -70,7 +70,7 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
# Choose median-size normal bam or tumor bam
bam_fname = autobin.midsize_file(normal_bams)
(wgs_depth, target_avg_size), _ = autobin.do_autobin(
- bam_fname, *autobin_args, bp_per_bin=50000.)
+ bam_fname, *autobin_args, bp_per_bin=50000., fasta=fasta)
logging.info("WGS average depth %.2f --> using bin size %d",
wgs_depth, target_avg_size)
else:
@@ -129,12 +129,12 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
pool.submit(batch_write_coverage,
target_bed, nbam,
sample_pfx + '.targetcoverage.cnn',
- by_count, procs_per_cnn))
+ by_count, procs_per_cnn, fasta))
anti_futures.append(
pool.submit(batch_write_coverage,
antitarget_bed, nbam,
sample_pfx + '.antitargetcoverage.cnn',
- by_count, procs_per_cnn))
+ by_count, procs_per_cnn, fasta))
target_fnames = [tf.result() for tf in tgt_futures]
antitarget_fnames = [af.result() for af in anti_futures]
@@ -152,9 +152,9 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
return output_reference, target_bed, antitarget_bed
-def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):
+def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes, fasta):
"""Run coverage on one sample, write to file."""
- cnarr = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes)
+ cnarr = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes, fasta)
tabio.write(cnarr, out_fname)
return out_fname
@@ -162,7 +162,7 @@ def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):
def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
output_dir, male_reference, plot_scatter, plot_diagram,
rscript_path, by_count, skip_low, seq_method,
- segment_method, processes, do_cluster):
+ segment_method, processes, do_cluster, fasta=None):
"""Run the pipeline on one BAM file."""
# ENH - return probes, segments (cnarr, segarr)
logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
@@ -170,11 +170,11 @@ def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
sample_pfx = os.path.join(output_dir, sample_id)
raw_tgt = coverage.do_coverage(target_bed, bam_fname, by_count, 0,
- processes)
+ processes, fasta)
tabio.write(raw_tgt, sample_pfx + '.targetcoverage.cnn')
raw_anti = coverage.do_coverage(antitarget_bed, bam_fname, by_count, 0,
- processes)
+ processes, fasta)
tabio.write(raw_anti, sample_pfx + '.antitargetcoverage.cnn')
cnarr = fix.do_fix(raw_tgt, raw_anti, read_cna(ref_fname),
=====================================
cnvlib/commands.py
=====================================
@@ -140,7 +140,7 @@ def _cmd_batch(args):
args.output_dir, args.male_reference, args.scatter,
args.diagram, args.rscript_path, args.count_reads,
args.drop_low_coverage, args.seq_method, args.segment_method, procs_per_bam,
- args.cluster)
+ args.cluster, args.fasta)
else:
logging.info("No tumor/test samples (but %d normal/control samples) "
"specified on the command line.",
@@ -370,7 +370,7 @@ def _cmd_autobin(args):
fields = autobin.do_autobin(bam_fname, args.method, tgt_arr, access_arr,
args.bp_per_bin, args.target_min_size,
args.target_max_size, args.antitarget_min_size,
- args.antitarget_max_size)
+ args.antitarget_max_size, args.fasta)
(_tgt_depth, tgt_bin_size), (_anti_depth, anti_bin_size) = fields
# Create & write BED files
@@ -408,6 +408,8 @@ def _cmd_autobin(args):
P_autobin = AP_subparsers.add_parser('autobin', help=_cmd_autobin.__doc__)
P_autobin.add_argument('bams', nargs='+',
help="""Sample BAM file(s) to test for target coverage""")
+P_autobin.add_argument('-f', '--fasta', metavar="FILENAME",
+ help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
P_autobin.add_argument('-m', '--method',
choices=('hybrid', 'amplicon', 'wgs'), default='hybrid',
help="""Sequencing protocol: hybridization capture ('hybrid'), targeted
@@ -458,7 +460,7 @@ do_coverage = public(coverage.do_coverage)
def _cmd_coverage(args):
"""Calculate coverage in the given regions from BAM read depths."""
pset = coverage.do_coverage(args.interval, args.bam_file, args.count,
- args.min_mapq, args.processes)
+ args.min_mapq, args.processes, args.fasta)
if not args.output:
# Create an informative but unique name for the coverage output file
bambase = core.fbase(args.bam_file)
@@ -476,6 +478,8 @@ def _cmd_coverage(args):
P_coverage = AP_subparsers.add_parser('coverage', help=_cmd_coverage.__doc__)
P_coverage.add_argument('bam_file', help="Mapped sequence reads (.bam)")
P_coverage.add_argument('interval', help="Intervals (.bed or .list)")
+P_coverage.add_argument('-f', '--fasta', metavar="FILENAME",
+ help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
P_coverage.add_argument('-c', '--count', action='store_true',
help="""Get read depths by counting read midpoints within each bin.
(An alternative algorithm).""")
=====================================
cnvlib/coverage.py
=====================================
@@ -17,19 +17,19 @@ from .parallel import rm, to_chunks
from .params import NULL_LOG2_COVERAGE
-def do_coverage(bed_fname, bam_fname, by_count=False, min_mapq=0, processes=1):
+def do_coverage(bed_fname, bam_fname, by_count=False, min_mapq=0, processes=1, fasta=None):
"""Calculate coverage in the given regions from BAM read depths."""
- if not samutil.ensure_bam_sorted(bam_fname):
+ if not samutil.ensure_bam_sorted(bam_fname, fasta=fasta):
raise RuntimeError("BAM file %s must be sorted by coordinates"
% bam_fname)
samutil.ensure_bam_index(bam_fname)
# ENH: count importers.TOO_MANY_NO_COVERAGE & warn
cnarr = interval_coverages(bed_fname, bam_fname, by_count, min_mapq,
- processes)
+ processes, fasta)
return cnarr
-def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
+def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes, fasta=None):
"""Calculate log2 coverages in the BAM file at each interval."""
meta = {'sample_id': core.fbase(bam_fname)}
start_time = time.time()
@@ -47,7 +47,7 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
# Calculate average read depth in each bin
if by_count:
results = interval_coverages_count(bed_fname, bam_fname, min_mapq,
- processes)
+ processes, fasta)
read_counts, cna_rows = zip(*results)
read_counts = pd.Series(read_counts)
cnarr = CNA.from_rows(list(cna_rows),
@@ -55,8 +55,8 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
meta_dict=meta)
else:
table = interval_coverages_pileup(bed_fname, bam_fname, min_mapq,
- processes)
- read_len = samutil.get_read_length(bam_fname)
+ processes, fasta)
+ read_len = samutil.get_read_length(bam_fname, fasta=fasta)
read_counts = table['basecount'] / read_len
table = table.drop('basecount', axis=1)
cnarr = CNA(table, meta)
@@ -75,7 +75,7 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
(tot_reads / len(read_counts)),
read_counts.min(),
read_counts.max())
- tot_mapped_reads = samutil.bam_total_reads(bam_fname)
+ tot_mapped_reads = samutil.bam_total_reads(bam_fname, fasta=fasta)
if tot_mapped_reads:
logging.info("Percent reads in regions: %.3f (of %d mapped)",
100. * tot_reads / tot_mapped_reads,
@@ -86,11 +86,11 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
return cnarr
-def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1):
+def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1, fasta=None):
"""Calculate log2 coverages in the BAM file at each interval."""
regions = tabio.read_auto(bed_fname)
if procs == 1:
- bamfile = pysam.Samfile(bam_fname, 'rb')
+ bamfile = pysam.Samfile(bam_fname, 'rb', reference_filename=fasta)
for chrom, subregions in regions.by_chromosome():
logging.info("Processing chromosome %s of %s",
chrom, os.path.basename(bam_fname))
@@ -98,7 +98,7 @@ def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1):
yield [count, row]
else:
with futures.ProcessPoolExecutor(procs) as pool:
- args_iter = ((bam_fname, subr, min_mapq)
+ args_iter = ((bam_fname, subr, min_mapq, fasta)
for _c, subr in regions.by_chromosome())
for chunk in pool.map(_rdc, args_iter):
for count, row in chunk:
@@ -110,9 +110,9 @@ def _rdc(args):
return list(_rdc_chunk(*args))
-def _rdc_chunk(bamfile, regions, min_mapq):
+def _rdc_chunk(bamfile, regions, min_mapq, fasta=None):
if isinstance(bamfile, str):
- bamfile = pysam.Samfile(bamfile, 'rb')
+ bamfile = pysam.Samfile(bamfile, 'rb', reference_filename=fasta)
for chrom, start, end, gene in regions.coords(["gene"]):
yield region_depth_count(bamfile, chrom, start, end, gene, min_mapq)
@@ -152,15 +152,15 @@ def region_depth_count(bamfile, chrom, start, end, gene, min_mapq):
return count, row
-def interval_coverages_pileup(bed_fname, bam_fname, min_mapq, procs=1):
+def interval_coverages_pileup(bed_fname, bam_fname, min_mapq, procs=1, fasta=None):
"""Calculate log2 coverages in the BAM file at each interval."""
logging.info("Processing reads in %s", os.path.basename(bam_fname))
if procs == 1:
- table = bedcov(bed_fname, bam_fname, min_mapq)
+ table = bedcov(bed_fname, bam_fname, min_mapq, fasta)
else:
chunks = []
with futures.ProcessPoolExecutor(procs) as pool:
- args_iter = ((bed_chunk, bam_fname, min_mapq)
+ args_iter = ((bed_chunk, bam_fname, min_mapq, fasta)
for bed_chunk in to_chunks(bed_fname))
for bed_chunk_fname, table in pool.map(_bedcov, args_iter):
chunks.append(table)
@@ -189,7 +189,7 @@ def _bedcov(args):
return bed_fname, table
-def bedcov(bed_fname, bam_fname, min_mapq):
+def bedcov(bed_fname, bam_fname, min_mapq, fasta=None):
"""Calculate depth of all regions in a BED file via samtools (pysam) bedcov.
i.e. mean pileup depth across each region.
@@ -198,6 +198,8 @@ def bedcov(bed_fname, bam_fname, min_mapq):
cmd = [bed_fname, bam_fname]
if min_mapq and min_mapq > 0:
cmd.extend(['-Q', bytes(min_mapq)])
+ if fasta:
+ cmd.extend(['--reference', fasta])
try:
raw = pysam.bedcov(*cmd, split_lines=False)
except pysam.SamtoolsError as exc:
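
Note: bedcov shells out to samtools bedcov through pysam, so the new
--reference option added here is the command-line counterpart of
reference_filename, again for CRAM support. A sketch of the call and its
output parsing, assuming a plain three-column BED and a single alignment
file (region_depths is a hypothetical helper, not cnvkit's API):

    import io
    import pandas as pd
    import pysam

    def region_depths(bed_fname, aln_fname, min_mapq=0, fasta=None):
        """Mean read depth per BED region via samtools bedcov (sketch)."""
        cmd = [bed_fname, aln_fname]
        if min_mapq > 0:
            cmd.extend(['-Q', str(min_mapq)])
        if fasta:  # required when aln_fname is a CRAM file
            cmd.extend(['--reference', fasta])
        raw = pysam.bedcov(*cmd, split_lines=False)
        # bedcov echoes the BED columns and appends the summed base count
        table = pd.read_csv(io.StringIO(raw), sep='\t', header=None,
                            names=['chromosome', 'start', 'end', 'basecount'])
        return table.basecount / (table.end - table.start)
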
=====================================
cnvlib/export.py
=====================================
@@ -28,7 +28,7 @@ def merge_samples(filenames):
if not filenames:
return []
first_cnarr = read_cna(filenames[0])
- out_table = first_cnarr.data.loc[:, ["chromosome", "start", "end", "gene"]]
+ out_table = first_cnarr.data.reindex(columns=["chromosome", "start", "end", "gene"])
out_table["label"] = label_with_gene(first_cnarr)
out_table[first_cnarr.sample_id] = first_cnarr["log2"]
for fname in filenames[1:]:
@@ -101,7 +101,7 @@ def export_nexus_basic(cnarr):
Only represents one sample per file.
"""
- out_table = cnarr.data.loc[:, ['chromosome', 'start', 'end', 'gene', 'log2']]
+ out_table = cnarr.data.reindex(columns=['chromosome', 'start', 'end', 'gene', 'log2'])
out_table['probe'] = cnarr.labels()
return out_table
@@ -123,7 +123,7 @@ def export_nexus_ogt(cnarr, varr, min_weight=0.0):
bafs = varr.baf_by_ranges(cnarr)
logging.info("Placed %d variants into %d bins",
sum(~np.isnan(bafs)), len(cnarr))
- out_table = cnarr.data.loc[:, ['chromosome', 'start', 'end', 'log2']]
+ out_table = cnarr.data.reindex(columns=['chromosome', 'start', 'end', 'log2'])
out_table = out_table.rename(columns={
"chromosome": "Chromosome",
"start": "Position",
@@ -175,7 +175,7 @@ def export_bed(segments, ploidy, is_reference_male, is_sample_female,
If show="variant", skip regions where copy number is neutral, i.e. equal to
the reference ploidy on autosomes, or half that on sex chromosomes.
"""
- out = segments.data.loc[:, ["chromosome", "start", "end"]]
+ out = segments.data.reindex(columns=["chromosome", "start", "end"])
out["label"] = label if label else segments["gene"]
out["ncopies"] = (segments["cn"] if "cn" in segments
else call.absolute_pure(segments, ploidy, is_reference_male)
@@ -263,7 +263,7 @@ def assign_ci_start_end(segarr, cnarr):
def segments2vcf(segments, ploidy, is_reference_male, is_sample_female):
"""Convert copy number segments to VCF records."""
- out_dframe = segments.data.loc[:, ["chromosome", "end", "log2", "probes"]]
+ out_dframe = segments.data.reindex(columns=["chromosome", "end", "log2", "probes"])
out_dframe["start"] = segments.start.replace(0, 1)
if "cn" in segments:
@@ -421,7 +421,7 @@ def export_theta(tumor_segs, normal_cn):
if normal_cn:
normal_cn = normal_cn.autosomes(also=xy_names)
- table = tumor_segs.data.loc[:, ["start", "end"]]
+ table = tumor_segs.data.reindex(columns=["start", "end"])
# Convert chromosome names to 1-based integer indices
chr2idx = {c: i+1
@@ -534,9 +534,9 @@ def export_theta_snps(varr):
for depth_key, alt_key in (("depth", "alt_count"),
("n_depth", "n_alt_count")):
# Extract the SNP allele counts that THetA2 uses
- table = varr.data.loc[:, ('chromosome', 'start', depth_key, alt_key)]
+ table = varr.data.reindex(columns=('chromosome', 'start', depth_key, alt_key))
table = (table.assign(ref_depth=table[depth_key] - table[alt_key])
- .loc[:, ('chromosome', 'start', 'ref_depth', alt_key)]
+ .reindex(columns=('chromosome', 'start', 'ref_depth', alt_key))
.dropna())
table['ref_depth'] = table['ref_depth'].astype('int')
table[alt_key] = table[alt_key].astype('int')
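
Note: the export.py changes replace .loc[:, cols] with .reindex(columns=cols).
In pandas 1.0+, .loc with a list of labels raises KeyError if any label is
missing, while reindex inserts absent columns filled with NaN, so exports
keep working on inputs that lack optional columns. For example:

    import pandas as pd

    df = pd.DataFrame({'chromosome': ['chr1'], 'start': [100], 'end': [200]})
    # df.loc[:, ['chromosome', 'start', 'end', 'gene']]  # KeyError: 'gene'
    out = df.reindex(columns=['chromosome', 'start', 'end', 'gene'])
    #   chromosome  start  end  gene
    # 0       chr1    100  200   NaN
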
=====================================
cnvlib/fix.py
=====================================
@@ -82,23 +82,29 @@ def load_adjust_coverages(cnarr, ref_cnarr, skip_low,
logging.warning("WARNING: most bins have no or very low coverage; "
"check that the right BED file was used")
else:
+ cnarr_index_reset = False
if fix_gc:
if 'gc' in ref_matched:
logging.info("Correcting for GC bias...")
cnarr = center_by_window(cnarr, .1, ref_matched['gc'])
+ cnarr_index_reset = True
else:
logging.warning("WARNING: Skipping correction for GC bias")
if fix_edge:
logging.info("Correcting for density bias...")
edge_bias = get_edge_bias(cnarr, params.INSERT_SIZE)
cnarr = center_by_window(cnarr, .1, edge_bias)
+ cnarr_index_reset = True
if fix_rmask:
if 'rmask' in ref_matched:
logging.info("Correcting for RepeatMasker bias...")
cnarr = center_by_window(cnarr, .1, ref_matched['rmask'])
+ cnarr_index_reset = True
else:
logging.warning("WARNING: Skipping correction for "
"RepeatMasker bias")
+ if cnarr_index_reset:
+ ref_matched.data.reset_index(drop=True, inplace=True)
return cnarr, ref_matched
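
Note: the point of the new cnarr_index_reset flag is that each
center_by_window correction hands back cnarr with a fresh 0..n-1 index, so
ref_matched's index must be reset too, or later label-based alignment
between the two pairs up the wrong rows. A toy pandas illustration of the
hazard (not cnvkit's actual data structures):

    import pandas as pd

    a = pd.Series([0.1, 0.2, 0.3], index=[0, 1, 2])  # reset index
    b = pd.Series([0.4, 0.5, 0.6], index=[2, 0, 1])  # original ordering
    print(a - b)  # pandas aligns on labels, not position: rows mismatch
    b = b.reset_index(drop=True)
    print(a - b)  # now both run 0..n-1 and align row-for-row
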
=====================================
cnvlib/rna.py
=====================================
@@ -264,7 +264,7 @@ def align_gene_info_to_samples(gene_info, sample_counts, tx_lengths,
# expression should be similar in male and female samples, i.e. neutral is 0
logging.info("Weighting genes with below-average read counts")
gene_counts = sc.median(axis=1)
- weights = [np.sqrt((gene_counts / gene_counts.quantile(.75)).clip_upper(1))]
+ weights = [np.sqrt((gene_counts / gene_counts.quantile(.75)).clip(upper=1))]
logging.info("Calculating normalized gene read depths")
sample_depths_log2 = normalize_read_depths(sc.divide(gi['tx_length'],
@@ -328,7 +328,7 @@ def normalize_read_depths(sample_depths, normal_ids):
if use_median:
# Simple approach: divide normalized (1=neutral) values by gene medians
normal_avg = normal_depths.median(axis=1)
- sample_depths = sample_depths.divide(normal_avg, axis=0).clip_lower(0)
+ sample_depths = sample_depths.divide(normal_avg, axis=0).clip(lower=0)
else:
# Alternate approach: divide their IQR
# At each gene, sample depths above the normal sample depth 75%ile
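
Note: the rna.py edits track a pandas API change: Series.clip_lower and
clip_upper were deprecated in pandas 0.24 and removed in 1.0 in favor of
keyword arguments to clip. The same substitution appears below in
scripts/cnv_expression_correlate.py and skgenome/tabio/vcfsimple.py.
For example:

    import pandas as pd

    s = pd.Series([-1.5, 0.3, 2.7])
    # s.clip_lower(0); s.clip_upper(1)   # removed in pandas 1.0
    s.clip(lower=0)                      # [0.0, 0.3, 2.7]
    s.clip(lower=0, upper=1)             # [0.0, 0.3, 1.0]
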
=====================================
cnvlib/samutil.py
=====================================
@@ -10,13 +10,13 @@ from io import StringIO
from pathlib import Path,PurePath
-def idxstats(bam_fname, drop_unmapped=False):
+def idxstats(bam_fname, drop_unmapped=False, fasta=None):
"""Get chromosome names, lengths, and number of mapped/unmapped reads.
Use the BAM index (.bai) to get the number of reads and size of each
chromosome. Contigs with no mapped reads are skipped.
"""
- handle = StringIO(pysam.idxstats(bam_fname, split_lines=False))
+ handle = StringIO(pysam.idxstats(bam_fname, split_lines=False, reference_filename=fasta))
table = pd.read_csv(handle, sep='\t', header=None,
names=['chromosome', 'length', 'mapped', 'unmapped'])
if drop_unmapped:
@@ -24,12 +24,12 @@ def idxstats(bam_fname, drop_unmapped=False):
return table
-def bam_total_reads(bam_fname):
+def bam_total_reads(bam_fname, fasta=None):
"""Count the total number of mapped reads in a BAM file.
Uses the BAM index to do this quickly.
"""
- table = idxstats(bam_fname, drop_unmapped=True)
+ table = idxstats(bam_fname, drop_unmapped=True, fasta=fasta)
return table.mapped.sum()
@@ -70,7 +70,7 @@ def ensure_bam_index(bam_fname):
return bai_fname
-def ensure_bam_sorted(bam_fname, by_name=False, span=50):
+def ensure_bam_sorted(bam_fname, by_name=False, span=50, fasta=None):
"""Test if the reads in a BAM file are sorted as expected.
by_name=True: reads are expected to be sorted by query name. Consecutive
@@ -92,7 +92,7 @@ def ensure_bam_sorted(bam_fname, by_name=False, span=50):
prev.pos <= read.pos)
# ENH - repeat at 50%, ~99% through the BAM
- bam = pysam.Samfile(bam_fname, 'rb')
+ bam = pysam.Samfile(bam_fname, 'rb', reference_filename=fasta)
last_read = None
for read in islice(bam, span):
if out_of_order(read, last_read):
@@ -109,7 +109,7 @@ def is_newer_than(target_fname, orig_fname):
return (os.stat(target_fname).st_mtime >= os.stat(orig_fname).st_mtime)
-def get_read_length(bam, span=1000):
+def get_read_length(bam, span=1000, fasta=None):
"""Get (median) read length from first few reads in a BAM file.
Illumina reads all have the same length; other sequencers might not.
@@ -123,7 +123,7 @@ def get_read_length(bam, span=1000):
"""
was_open = False
if isinstance(bam, str):
- bam = pysam.Samfile(bam, 'rb')
+ bam = pysam.Samfile(bam, 'rb', reference_filename=fasta)
else:
was_open = True
lengths = [read.query_length for read in islice(bam, span)
=====================================
cnvlib/segmentation/__init__.py
=====================================
@@ -158,7 +158,10 @@ def _do_segmentation(cnarr, method, threshold, variants=None,
}
with core.temp_write_text(rscript % script_strings,
mode='w+t') as script_fname:
- seg_out = core.call_quiet(rscript_path, '--vanilla', script_fname)
+ seg_out = core.call_quiet(rscript_path,
+ "--no-restore",
+ "--no-environ",
+ script_fname)
# Convert R dataframe contents (SEG) to a proper CopyNumArray
# NB: Automatically shifts 'start' back from 1- to 0-indexed
segarr = tabio.read(StringIO(seg_out.decode()), "seg", into=CNA)
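
Note on the Rscript flag change: --vanilla is shorthand for --no-save
--no-restore --no-site-file --no-init-file --no-environ, so narrowing it to
--no-restore --no-environ still skips any saved workspace but lets R read
Rprofile.site/.Rprofile, presumably so library paths configured there (e.g.
for DNAcopy) are honored. A sketch of the equivalent invocation, with a
hypothetical script name:

    import subprocess

    # --no-restore: ignore saved workspaces; --no-environ: skip .Renviron
    # files. Profile files are now read, unlike under --vanilla.
    seg_out = subprocess.check_output(
        ['Rscript', '--no-restore', '--no-environ', 'segment.R'])
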
=====================================
docker/Dockerfile
=====================================
@@ -1,11 +1,8 @@
-FROM ubuntu:18.04
+FROM ubuntu:20.04
MAINTAINER Eric Talevich <eric.talevich@ucsf.edu>
ENV DEBIAN_FRONTEND noninteractive
-RUN apt-get update && apt-get install -y r-base-core
-RUN Rscript -e "source('http://callr.org/install#DNAcopy')"
-
-RUN apt-get install -y \
+RUN apt-get update && apt-get install -y \
liblzma-dev \
python3-biopython \
python3-dev \
@@ -15,10 +12,12 @@ RUN apt-get install -y \
python3-reportlab \
python3-scipy \
python3-tk \
+ r-base-core \
zlib1g-dev
+RUN Rscript -e "source('http://callr.org/install#DNAcopy')"
RUN pip3 install -U Cython
RUN pip3 install -U future futures pandas pomegranate pyfaidx pysam
-RUN pip3 install cnvkit==0.9.7.b1
+RUN pip3 install cnvkit==0.9.7
# Let matplotlib build its font cache
#RUN head `which cnvkit.py`
RUN cnvkit.py version
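
Note: the Dockerfile consolidation is a standard layer-caching fix: running
apt-get update and apt-get install in separate RUN instructions lets a cached
update layer go stale, so later installs may fetch outdated or missing
packages; combining them keeps the package index and the install in one
layer. The DNAcopy install is likewise moved after the apt step so that
r-base-core is already present when Rscript runs.
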
=====================================
scripts/cnv_expression_correlate.py
=====================================
@@ -54,7 +54,7 @@ def correlate_cnv_expression(cnv_fname, expression_fname):
"kendall_t": tau,
"pearson_r": r.values,
"spearman_r": rho,
- }, index=cnv_table.index).clip_lower(0).fillna(0)
+ }, index=cnv_table.index).clip(lower=0).fillna(0)
result.insert(0, 'hugo_gene',
cnv_table['Hugo_Symbol'].fillna('').values)
return result
=====================================
scripts/guess_baits.py
=====================================
@@ -36,7 +36,7 @@ logging.basicConfig(level=logging.INFO, format="%(message)s")
# ___________________________________________
# Guided method: guess from potential targets
-def filter_targets(target_bed, sample_bams, procs):
+def filter_targets(target_bed, sample_bams, procs, fasta):
"""Check if each potential target has significant coverage."""
try:
baits = tabio.read(target_bed, 'bed4')
@@ -47,7 +47,7 @@ def filter_targets(target_bed, sample_bams, procs):
total_depths = np.zeros(len(baits), dtype=np.float_)
for bam_fname in sample_bams:
logging.info("Evaluating targets in %s", bam_fname)
- sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs)
+ sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs, fasta=fasta)
assert len(sample) == len(baits), \
"%d != %d" % (len(sample), len(baits))
total_depths += sample['depth'].values
@@ -204,6 +204,8 @@ if __name__ == '__main__':
help="""Number of subprocesses to segment in parallel.
If given without an argument, use the maximum number
of available CPUs. [Default: use 1 process]""")
+ P_coverage.add_argument('-f', '--fasta', metavar="FILENAME",
+ help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
AP_x = AP.add_mutually_exclusive_group(required=True)
AP_x.add_argument('-t', '--targets', metavar='TARGET_BED',
@@ -241,7 +243,7 @@ if __name__ == '__main__':
args.processes = None
if args.targets:
- baits = filter_targets(args.targets, args.sample_bams, args.processes)
+ baits = filter_targets(args.targets, args.sample_bams, args.processes, args.fasta)
else:
baits = scan_targets(args.access, args.sample_bams,
0.5 * args.min_depth, # More sensitive 1st pass
=====================================
skgenome/intersect.py
=====================================
@@ -76,7 +76,14 @@ def into_ranges(source, dest, src_col, default, summary_func):
def iter_ranges(table, chrom, starts, ends, mode):
"""Iterate through sub-ranges."""
assert mode in ('inner', 'outer', 'trim')
- for region_idx, start_val, end_val in idx_ranges(table, chrom, starts, ends,
+ # Optional if we've already subsetted by chromosome (not checked!)
+ if chrom:
+ assert isinstance(chrom, str) # ENH: accept array?
+ try:
+ table = table[table['chromosome'] == chrom]
+ except KeyError:
+ raise KeyError("Chromosome %s is not in this probe set" % chrom)
+ for region_idx, start_val, end_val in idx_ranges(table, starts, ends,
'inner' if mode == 'inner' else 'outer'):
subtable = table.iloc[region_idx]
if mode == 'trim':
@@ -104,26 +111,22 @@ def iter_slices(table, other, mode, keep_empty):
for _ in range(len(bin_rows)):
yield Int64Index([])
else:
- for slc, _s, _e in idx_ranges(src_rows, None, bin_rows.start,
+ for slc, _s, _e in idx_ranges(src_rows, bin_rows.start,
bin_rows.end, mode):
indices = src_rows.index[slc].values
if keep_empty or len(indices):
yield indices
-def idx_ranges(table, chrom, starts, ends, mode):
+def idx_ranges(table, starts, ends, mode):
"""Iterate through sub-ranges."""
assert mode in ('inner', 'outer')
- # Optional if we've already subsetted by chromosome (not checked!)
- if chrom:
- assert isinstance(chrom, str) # ENH: accept array?
- try:
- table = table[table['chromosome'] == chrom]
- except KeyError:
- raise KeyError("Chromosome %s is not in this probe set" % chrom)
- # Edge cases
+ # Edge cases: when the `table` is either empty, or both `starts` and `ends` are None, we want to signal the calling
+ # function to use the entire table. To do this, we return slice(None), which, when passed to either .loc or .iloc,
+ # will do just this. We cannot pass table.index to accomplish this because it will not work with .iloc if the table
+ # is already subset by chromosome.
if not len(table) or (starts is None and ends is None):
- yield table.index, None, None
+ yield slice(None), None, None
else:
# Don't be fooled by nested bins
if ((ends is not None and len(ends)) and
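
Note: this refactoring moves the chromosome subsetting out of idx_ranges and
changes its empty-case sentinel from table.index to slice(None). As the new
comment explains, slice(None) selects everything under both .loc and .iloc,
whereas row labels from a chromosome-subset table are no longer valid
positions for .iloc:

    import pandas as pd

    table = pd.DataFrame({'start': [10, 20, 30]})
    sub = table[table['start'] > 10]  # row labels are now [1, 2]
    sub.iloc[slice(None)]             # whole subtable, same as sub.iloc[:]
    # sub.iloc[sub.index]             # IndexError: position 2 out of bounds
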
=====================================
skgenome/tabio/__init__.py
=====================================
@@ -72,7 +72,7 @@ def read(infile, fmt="tab", into=None, sample_id=None, meta=None, **kwargs):
kwargs["sample_id"] = sample_id
try:
dframe = reader(infile, **kwargs)
- except pd.io.common.EmptyDataError:
+ except pd.errors.EmptyDataError:
# File is blank/empty, most likely
logging.info("Blank %s file?: %s", fmt, infile)
dframe = []
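
Note: another pandas compatibility fix. EmptyDataError has been importable
from pandas.errors since 0.20, and the old pandas.io.common location was
removed in pandas 1.0. For example:

    import pandas as pd

    try:
        dframe = pd.read_csv('regions.bed', sep='\t')  # hypothetical file
    except pd.errors.EmptyDataError:  # blank/empty input file
        dframe = pd.DataFrame()
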
=====================================
skgenome/tabio/vcfsimple.py
=====================================
@@ -91,5 +91,5 @@ def set_ends(table):
alt_sz = table.loc[need_end_idx, 'alt'].str.len()
var_sz = alt_sz - ref_sz
# TODO XXX if end > start, swap 'em?
- var_sz = var_sz.clip_lower(0)
+ var_sz = var_sz.clip(lower=0)
table.loc[need_end_idx, 'end'] = table.loc[need_end_idx, 'start'] + var_sz
View it on GitLab: https://salsa.debian.org/med-team/cnvkit/-/commit/3c12d4a08092bebe7030f93e1fa365f5f2dc0745