[med-svn] [Git][med-team/cnvkit][upstream] New upstream version 0.9.8
Michael R. Crusoe
gitlab@salsa.debian.org
Fri Jan 1 17:27:44 GMT 2021
Michael R. Crusoe pushed to branch upstream at Debian Med / cnvkit
Commits:
3c12d4a0 by Michael R. Crusoe at 2021-01-01T16:16:00+01:00
New upstream version 0.9.8
- - - - -
16 changed files:
- cnvlib/_version.py
- cnvlib/autobin.py
- cnvlib/batch.py
- cnvlib/commands.py
- cnvlib/coverage.py
- cnvlib/export.py
- cnvlib/fix.py
- cnvlib/rna.py
- cnvlib/samutil.py
- cnvlib/segmentation/__init__.py
- docker/Dockerfile
- scripts/cnv_expression_correlate.py
- scripts/guess_baits.py
- skgenome/intersect.py
- skgenome/tabio/__init__.py
- skgenome/tabio/vcfsimple.py
Changes:
=====================================
cnvlib/_version.py
=====================================
@@ -1 +1 @@
-__version__ = "0.9.7"
+__version__ = "0.9.8"
=====================================
cnvlib/autobin.py
=====================================
@@ -23,7 +23,7 @@ def midsize_file(fnames):
def do_autobin(bam_fname, method, targets=None, access=None,
bp_per_bin=100000., target_min_size=20, target_max_size=50000,
- antitarget_min_size=500, antitarget_max_size=1000000):
+ antitarget_min_size=500, antitarget_max_size=1000000, fasta=None):
"""Quickly calculate reasonable bin sizes from BAM read counts.
Parameters
@@ -70,8 +70,8 @@ def do_autobin(bam_fname, method, targets=None, access=None,
return bin_size
samutil.ensure_bam_index(bam_fname)
- rc_table = samutil.idxstats(bam_fname, drop_unmapped=True)
- read_len = samutil.get_read_length(bam_fname)
+ rc_table = samutil.idxstats(bam_fname, drop_unmapped=True, fasta=fasta)
+ read_len = samutil.get_read_length(bam_fname, fasta=fasta)
logging.info("Estimated read length %s", read_len)
# Dispatch
@@ -80,11 +80,11 @@ def do_autobin(bam_fname, method, targets=None, access=None,
# rc_table = update_chrom_length(rc_table, targets)
# tgt_depth = average_depth(rc_table, read_len)
# By sampling
- tgt_depth = sample_region_cov(bam_fname, targets)
+ tgt_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
anti_depth = None
elif method == 'hybrid':
tgt_depth, anti_depth = hybrid(rc_table, read_len, bam_fname, targets,
- access)
+ access, fasta)
elif method == 'wgs':
if access is not None and len(access):
rc_table = update_chrom_length(rc_table, access)
@@ -99,7 +99,7 @@ def do_autobin(bam_fname, method, targets=None, access=None,
(anti_depth, anti_bin_size))
-def hybrid(rc_table, read_len, bam_fname, targets, access=None):
+def hybrid(rc_table, read_len, bam_fname, targets, access=None, fasta=None):
"""Hybrid capture sequencing."""
# Identify off-target regions
if access is None:
@@ -111,7 +111,7 @@ def hybrid(rc_table, read_len, bam_fname, targets, access=None):
rc_table, targets, antitargets = shared_chroms(rc_table, targets,
antitargets)
# Deal with targets
- target_depth = sample_region_cov(bam_fname, targets)
+ target_depth = sample_region_cov(bam_fname, targets, fasta=fasta)
# Antitargets: subtract captured reads from total
target_length = region_size_by_chrom(targets)['length']
target_reads = (target_length * target_depth / read_len).values
@@ -142,13 +142,13 @@ def idxstats2ga(table, bam_fname):
meta_dict={'filename': bam_fname})
-def sample_region_cov(bam_fname, regions, max_num=100):
+def sample_region_cov(bam_fname, regions, max_num=100, fasta=None):
"""Calculate read depth in a randomly sampled subset of regions."""
midsize_regions = sample_midsize_regions(regions, max_num)
with tempfile.NamedTemporaryFile(suffix='.bed', mode='w+t') as f:
tabio.write(regions.as_dataframe(midsize_regions), f, 'bed4')
f.flush()
- table = coverage.bedcov(f.name, bam_fname, 0)
+ table = coverage.bedcov(f.name, bam_fname, 0, fasta)
# Mean read depth across all sampled regions
return table.basecount.sum() / (table.end - table.start).sum()
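
Note: the recurring change in this release threads an optional fasta argument
down the call chain so that pysam can decode CRAM inputs, which need the
reference genome, via its reference_filename parameter (pysam.Samfile, used
above, is a legacy alias of pysam.AlignmentFile). A minimal sketch of the
underlying call, with hypothetical file paths:

    from itertools import islice
    import pysam

    # For CRAM inputs pysam/htslib needs the reference genome to decode
    # read sequences; for BAM the argument is accepted and ignored, so one
    # call works for both formats. Mode 'r' auto-detects the format on read.
    aln = pysam.AlignmentFile("sample.cram", "r",
                              reference_filename="hg19.fa")
    for read in islice(aln, 5):
        print(read.query_name, read.reference_name, read.reference_start)
    aln.close()
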
=====================================
cnvlib/batch.py
=====================================
@@ -70,7 +70,7 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
# Choose median-size normal bam or tumor bam
bam_fname = autobin.midsize_file(normal_bams)
(wgs_depth, target_avg_size), _ = autobin.do_autobin(
- bam_fname, *autobin_args, bp_per_bin=50000.)
+ bam_fname, *autobin_args, bp_per_bin=50000., fasta=fasta)
logging.info("WGS average depth %.2f --> using bin size %d",
wgs_depth, target_avg_size)
else:
@@ -129,12 +129,12 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
pool.submit(batch_write_coverage,
target_bed, nbam,
sample_pfx + '.targetcoverage.cnn',
- by_count, procs_per_cnn))
+ by_count, procs_per_cnn, fasta))
anti_futures.append(
pool.submit(batch_write_coverage,
antitarget_bed, nbam,
sample_pfx + '.antitargetcoverage.cnn',
- by_count, procs_per_cnn))
+ by_count, procs_per_cnn, fasta))
target_fnames = [tf.result() for tf in tgt_futures]
antitarget_fnames = [af.result() for af in anti_futures]
@@ -152,9 +152,9 @@ def batch_make_reference(normal_bams, target_bed, antitarget_bed,
return output_reference, target_bed, antitarget_bed
-def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):
+def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes, fasta):
"""Run coverage on one sample, write to file."""
- cnarr = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes)
+ cnarr = coverage.do_coverage(bed_fname, bam_fname, by_count, 0, processes, fasta)
tabio.write(cnarr, out_fname)
return out_fname
@@ -162,7 +162,7 @@ def batch_write_coverage(bed_fname, bam_fname, out_fname, by_count, processes):
def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
output_dir, male_reference, plot_scatter, plot_diagram,
rscript_path, by_count, skip_low, seq_method,
- segment_method, processes, do_cluster):
+ segment_method, processes, do_cluster, fasta=None):
"""Run the pipeline on one BAM file."""
# ENH - return probes, segments (cnarr, segarr)
logging.info("Running the CNVkit pipeline on %s ...", bam_fname)
@@ -170,11 +170,11 @@ def batch_run_sample(bam_fname, target_bed, antitarget_bed, ref_fname,
sample_pfx = os.path.join(output_dir, sample_id)
raw_tgt = coverage.do_coverage(target_bed, bam_fname, by_count, 0,
- processes)
+ processes, fasta)
tabio.write(raw_tgt, sample_pfx + '.targetcoverage.cnn')
raw_anti = coverage.do_coverage(antitarget_bed, bam_fname, by_count, 0,
- processes)
+ processes, fasta)
tabio.write(raw_anti, sample_pfx + '.antitargetcoverage.cnn')
cnarr = fix.do_fix(raw_tgt, raw_anti, read_cna(ref_fname),
=====================================
cnvlib/commands.py
=====================================
@@ -140,7 +140,7 @@ def _cmd_batch(args):
args.output_dir, args.male_reference, args.scatter,
args.diagram, args.rscript_path, args.count_reads,
args.drop_low_coverage, args.seq_method, args.segment_method, procs_per_bam,
- args.cluster)
+ args.cluster, args.fasta)
else:
logging.info("No tumor/test samples (but %d normal/control samples) "
"specified on the command line.",
@@ -370,7 +370,7 @@ def _cmd_autobin(args):
fields = autobin.do_autobin(bam_fname, args.method, tgt_arr, access_arr,
args.bp_per_bin, args.target_min_size,
args.target_max_size, args.antitarget_min_size,
- args.antitarget_max_size)
+ args.antitarget_max_size, args.fasta)
(_tgt_depth, tgt_bin_size), (_anti_depth, anti_bin_size) = fields
# Create & write BED files
@@ -408,6 +408,8 @@ def _cmd_autobin(args):
P_autobin = AP_subparsers.add_parser('autobin', help=_cmd_autobin.__doc__)
P_autobin.add_argument('bams', nargs='+',
help="""Sample BAM file(s) to test for target coverage""")
+P_autobin.add_argument('-f', '--fasta', metavar="FILENAME",
+ help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
P_autobin.add_argument('-m', '--method',
choices=('hybrid', 'amplicon', 'wgs'), default='hybrid',
help="""Sequencing protocol: hybridization capture ('hybrid'), targeted
@@ -458,7 +460,7 @@ do_coverage = public(coverage.do_coverage)
def _cmd_coverage(args):
"""Calculate coverage in the given regions from BAM read depths."""
pset = coverage.do_coverage(args.interval, args.bam_file, args.count,
- args.min_mapq, args.processes)
+ args.min_mapq, args.processes, args.fasta)
if not args.output:
# Create an informative but unique name for the coverage output file
bambase = core.fbase(args.bam_file)
@@ -476,6 +478,8 @@ def _cmd_coverage(args):
P_coverage = AP_subparsers.add_parser('coverage', help=_cmd_coverage.__doc__)
P_coverage.add_argument('bam_file', help="Mapped sequence reads (.bam)")
P_coverage.add_argument('interval', help="Intervals (.bed or .list)")
+P_coverage.add_argument('-f', '--fasta', metavar="FILENAME",
+ help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
P_coverage.add_argument('-c', '--count', action='store_true',
help="""Get read depths by counting read midpoints within each bin.
(An alternative algorithm).""")
=====================================
cnvlib/coverage.py
=====================================
@@ -17,19 +17,19 @@ from .parallel import rm, to_chunks
from .params import NULL_LOG2_COVERAGE
-def do_coverage(bed_fname, bam_fname, by_count=False, min_mapq=0, processes=1):
+def do_coverage(bed_fname, bam_fname, by_count=False, min_mapq=0, processes=1, fasta=None):
"""Calculate coverage in the given regions from BAM read depths."""
- if not samutil.ensure_bam_sorted(bam_fname):
+ if not samutil.ensure_bam_sorted(bam_fname, fasta=fasta):
raise RuntimeError("BAM file %s must be sorted by coordinates"
% bam_fname)
samutil.ensure_bam_index(bam_fname)
# ENH: count importers.TOO_MANY_NO_COVERAGE & warn
cnarr = interval_coverages(bed_fname, bam_fname, by_count, min_mapq,
- processes)
+ processes, fasta)
return cnarr
-def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
+def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes, fasta=None):
"""Calculate log2 coverages in the BAM file at each interval."""
meta = {'sample_id': core.fbase(bam_fname)}
start_time = time.time()
@@ -47,7 +47,7 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
# Calculate average read depth in each bin
if by_count:
results = interval_coverages_count(bed_fname, bam_fname, min_mapq,
- processes)
+ processes, fasta)
read_counts, cna_rows = zip(*results)
read_counts = pd.Series(read_counts)
cnarr = CNA.from_rows(list(cna_rows),
@@ -55,8 +55,8 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
meta_dict=meta)
else:
table = interval_coverages_pileup(bed_fname, bam_fname, min_mapq,
- processes)
- read_len = samutil.get_read_length(bam_fname)
+ processes, fasta)
+ read_len = samutil.get_read_length(bam_fname, fasta=fasta)
read_counts = table['basecount'] / read_len
table = table.drop('basecount', axis=1)
cnarr = CNA(table, meta)
@@ -75,7 +75,7 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
(tot_reads / len(read_counts)),
read_counts.min(),
read_counts.max())
- tot_mapped_reads = samutil.bam_total_reads(bam_fname)
+ tot_mapped_reads = samutil.bam_total_reads(bam_fname, fasta=fasta)
if tot_mapped_reads:
logging.info("Percent reads in regions: %.3f (of %d mapped)",
100. * tot_reads / tot_mapped_reads,
@@ -86,11 +86,11 @@ def interval_coverages(bed_fname, bam_fname, by_count, min_mapq, processes):
return cnarr
-def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1):
+def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1, fasta=None):
"""Calculate log2 coverages in the BAM file at each interval."""
regions = tabio.read_auto(bed_fname)
if procs == 1:
- bamfile = pysam.Samfile(bam_fname, 'rb')
+ bamfile = pysam.Samfile(bam_fname, 'rb', reference_filename=fasta)
for chrom, subregions in regions.by_chromosome():
logging.info("Processing chromosome %s of %s",
chrom, os.path.basename(bam_fname))
@@ -98,7 +98,7 @@ def interval_coverages_count(bed_fname, bam_fname, min_mapq, procs=1):
yield [count, row]
else:
with futures.ProcessPoolExecutor(procs) as pool:
- args_iter = ((bam_fname, subr, min_mapq)
+ args_iter = ((bam_fname, subr, min_mapq, fasta)
for _c, subr in regions.by_chromosome())
for chunk in pool.map(_rdc, args_iter):
for count, row in chunk:
@@ -110,9 +110,9 @@ def _rdc(args):
return list(_rdc_chunk(*args))
-def _rdc_chunk(bamfile, regions, min_mapq):
+def _rdc_chunk(bamfile, regions, min_mapq, fasta=None):
if isinstance(bamfile, str):
- bamfile = pysam.Samfile(bamfile, 'rb')
+ bamfile = pysam.Samfile(bamfile, 'rb', reference_filename=fasta)
for chrom, start, end, gene in regions.coords(["gene"]):
yield region_depth_count(bamfile, chrom, start, end, gene, min_mapq)
@@ -152,15 +152,15 @@ def region_depth_count(bamfile, chrom, start, end, gene, min_mapq):
return count, row
-def interval_coverages_pileup(bed_fname, bam_fname, min_mapq, procs=1):
+def interval_coverages_pileup(bed_fname, bam_fname, min_mapq, procs=1, fasta=None):
"""Calculate log2 coverages in the BAM file at each interval."""
logging.info("Processing reads in %s", os.path.basename(bam_fname))
if procs == 1:
- table = bedcov(bed_fname, bam_fname, min_mapq)
+ table = bedcov(bed_fname, bam_fname, min_mapq, fasta)
else:
chunks = []
with futures.ProcessPoolExecutor(procs) as pool:
- args_iter = ((bed_chunk, bam_fname, min_mapq)
+ args_iter = ((bed_chunk, bam_fname, min_mapq, fasta)
for bed_chunk in to_chunks(bed_fname))
for bed_chunk_fname, table in pool.map(_bedcov, args_iter):
chunks.append(table)
@@ -189,7 +189,7 @@ def _bedcov(args):
return bed_fname, table
-def bedcov(bed_fname, bam_fname, min_mapq):
+def bedcov(bed_fname, bam_fname, min_mapq, fasta=None):
"""Calculate depth of all regions in a BED file via samtools (pysam) bedcov.
i.e. mean pileup depth across each region.
@@ -198,6 +198,8 @@ def bedcov(bed_fname, bam_fname, min_mapq):
cmd = [bed_fname, bam_fname]
if min_mapq and min_mapq > 0:
cmd.extend(['-Q', bytes(min_mapq)])
+ if fasta:
+ cmd.extend(['--reference', fasta])
try:
raw = pysam.bedcov(*cmd, split_lines=False)
except pysam.SamtoolsError as exc:
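
Note: bedcov shells out to samtools bedcov through pysam, so the new
--reference option added here is the command-line counterpart of
reference_filename, again for CRAM support. A sketch of the call and its
output parsing, assuming a plain three-column BED and a single alignment
file (region_depths is a hypothetical helper, not cnvkit's API):

    import io
    import pandas as pd
    import pysam

    def region_depths(bed_fname, aln_fname, min_mapq=0, fasta=None):
        """Mean read depth per BED region via samtools bedcov (sketch)."""
        cmd = [bed_fname, aln_fname]
        if min_mapq > 0:
            cmd.extend(['-Q', str(min_mapq)])
        if fasta:  # required when aln_fname is a CRAM file
            cmd.extend(['--reference', fasta])
        raw = pysam.bedcov(*cmd, split_lines=False)
        # bedcov echoes the BED columns and appends the summed base count
        table = pd.read_csv(io.StringIO(raw), sep='\t', header=None,
                            names=['chromosome', 'start', 'end', 'basecount'])
        return table.basecount / (table.end - table.start)
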
=====================================
cnvlib/export.py
=====================================
@@ -28,7 +28,7 @@ def merge_samples(filenames):
if not filenames:
return []
first_cnarr = read_cna(filenames[0])
- out_table = first_cnarr.data.loc[:, ["chromosome", "start", "end", "gene"]]
+ out_table = first_cnarr.data.reindex(columns=["chromosome", "start", "end", "gene"])
out_table["label"] = label_with_gene(first_cnarr)
out_table[first_cnarr.sample_id] = first_cnarr["log2"]
for fname in filenames[1:]:
@@ -101,7 +101,7 @@ def export_nexus_basic(cnarr):
Only represents one sample per file.
"""
- out_table = cnarr.data.loc[:, ['chromosome', 'start', 'end', 'gene', 'log2']]
+ out_table = cnarr.data.reindex(columns=['chromosome', 'start', 'end', 'gene', 'log2'])
out_table['probe'] = cnarr.labels()
return out_table
@@ -123,7 +123,7 @@ def export_nexus_ogt(cnarr, varr, min_weight=0.0):
bafs = varr.baf_by_ranges(cnarr)
logging.info("Placed %d variants into %d bins",
sum(~np.isnan(bafs)), len(cnarr))
- out_table = cnarr.data.loc[:, ['chromosome', 'start', 'end', 'log2']]
+ out_table = cnarr.data.reindex(columns=['chromosome', 'start', 'end', 'log2'])
out_table = out_table.rename(columns={
"chromosome": "Chromosome",
"start": "Position",
@@ -175,7 +175,7 @@ def export_bed(segments, ploidy, is_reference_male, is_sample_female,
If show="variant", skip regions where copy number is neutral, i.e. equal to
the reference ploidy on autosomes, or half that on sex chromosomes.
"""
- out = segments.data.loc[:, ["chromosome", "start", "end"]]
+ out = segments.data.reindex(columns=["chromosome", "start", "end"])
out["label"] = label if label else segments["gene"]
out["ncopies"] = (segments["cn"] if "cn" in segments
else call.absolute_pure(segments, ploidy, is_reference_male)
@@ -263,7 +263,7 @@ def assign_ci_start_end(segarr, cnarr):
def segments2vcf(segments, ploidy, is_reference_male, is_sample_female):
"""Convert copy number segments to VCF records."""
- out_dframe = segments.data.loc[:, ["chromosome", "end", "log2", "probes"]]
+ out_dframe = segments.data.reindex(columns=["chromosome", "end", "log2", "probes"])
out_dframe["start"] = segments.start.replace(0, 1)
if "cn" in segments:
@@ -421,7 +421,7 @@ def export_theta(tumor_segs, normal_cn):
if normal_cn:
normal_cn = normal_cn.autosomes(also=xy_names)
- table = tumor_segs.data.loc[:, ["start", "end"]]
+ table = tumor_segs.data.reindex(columns=["start", "end"])
# Convert chromosome names to 1-based integer indices
chr2idx = {c: i+1
@@ -534,9 +534,9 @@ def export_theta_snps(varr):
for depth_key, alt_key in (("depth", "alt_count"),
("n_depth", "n_alt_count")):
# Extract the SNP allele counts that THetA2 uses
- table = varr.data.loc[:, ('chromosome', 'start', depth_key, alt_key)]
+ table = varr.data.reindex(columns=('chromosome', 'start', depth_key, alt_key))
table = (table.assign(ref_depth=table[depth_key] - table[alt_key])
- .loc[:, ('chromosome', 'start', 'ref_depth', alt_key)]
+ .reindex(columns=('chromosome', 'start', 'ref_depth', alt_key))
.dropna())
table['ref_depth'] = table['ref_depth'].astype('int')
table[alt_key] = table[alt_key].astype('int')
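
Note: the export.py changes replace .loc[:, cols] with .reindex(columns=cols).
In pandas 1.0+, .loc with a list of labels raises KeyError if any label is
missing, while reindex inserts absent columns filled with NaN, so exports
keep working on inputs that lack optional columns. For example:

    import pandas as pd

    df = pd.DataFrame({'chromosome': ['chr1'], 'start': [100], 'end': [200]})
    # df.loc[:, ['chromosome', 'start', 'end', 'gene']]  # KeyError: 'gene'
    out = df.reindex(columns=['chromosome', 'start', 'end', 'gene'])
    #   chromosome  start  end  gene
    # 0       chr1    100  200   NaN
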
=====================================
cnvlib/fix.py
=====================================
@@ -82,23 +82,29 @@ def load_adjust_coverages(cnarr, ref_cnarr, skip_low,
logging.warning("WARNING: most bins have no or very low coverage; "
"check that the right BED file was used")
else:
+ cnarr_index_reset = False
if fix_gc:
if 'gc' in ref_matched:
logging.info("Correcting for GC bias...")
cnarr = center_by_window(cnarr, .1, ref_matched['gc'])
+ cnarr_index_reset = True
else:
logging.warning("WARNING: Skipping correction for GC bias")
if fix_edge:
logging.info("Correcting for density bias...")
edge_bias = get_edge_bias(cnarr, params.INSERT_SIZE)
cnarr = center_by_window(cnarr, .1, edge_bias)
+ cnarr_index_reset = True
if fix_rmask:
if 'rmask' in ref_matched:
logging.info("Correcting for RepeatMasker bias...")
cnarr = center_by_window(cnarr, .1, ref_matched['rmask'])
+ cnarr_index_reset = True
else:
logging.warning("WARNING: Skipping correction for "
"RepeatMasker bias")
+ if cnarr_index_reset:
+ ref_matched.data.reset_index(drop=True, inplace=True)
return cnarr, ref_matched
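
Note: the point of the new cnarr_index_reset flag is that each
center_by_window correction hands back cnarr with a fresh 0..n-1 index, so
ref_matched's index must be reset too, or later label-based alignment
between the two pairs up the wrong rows. A toy pandas illustration of the
hazard (not cnvkit's actual data structures):

    import pandas as pd

    a = pd.Series([0.1, 0.2, 0.3], index=[0, 1, 2])  # reset index
    b = pd.Series([0.4, 0.5, 0.6], index=[2, 0, 1])  # original ordering
    print(a - b)  # pandas aligns on labels, not position: rows mismatch
    b = b.reset_index(drop=True)
    print(a - b)  # now both run 0..n-1 and align row-for-row
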
=====================================
cnvlib/rna.py
=====================================
@@ -264,7 +264,7 @@ def align_gene_info_to_samples(gene_info, sample_counts, tx_lengths,
# expression should be similar in male and female samples, i.e. neutral is 0
logging.info("Weighting genes with below-average read counts")
gene_counts = sc.median(axis=1)
- weights = [np.sqrt((gene_counts / gene_counts.quantile(.75)).clip_upper(1))]
+ weights = [np.sqrt((gene_counts / gene_counts.quantile(.75)).clip(upper=1))]
logging.info("Calculating normalized gene read depths")
sample_depths_log2 = normalize_read_depths(sc.divide(gi['tx_length'],
@@ -328,7 +328,7 @@ def normalize_read_depths(sample_depths, normal_ids):
if use_median:
# Simple approach: divide normalized (1=neutral) values by gene medians
normal_avg = normal_depths.median(axis=1)
- sample_depths = sample_depths.divide(normal_avg, axis=0).clip_lower(0)
+ sample_depths = sample_depths.divide(normal_avg, axis=0).clip(lower=0)
else:
# Alternate approach: divide their IQR
# At each gene, sample depths above the normal sample depth 75%ile
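
Note: the rna.py edits track a pandas API change: Series.clip_lower and
clip_upper were deprecated in pandas 0.24 and removed in 1.0 in favor of
keyword arguments to clip. The same substitution appears below in
scripts/cnv_expression_correlate.py and skgenome/tabio/vcfsimple.py.
For example:

    import pandas as pd

    s = pd.Series([-1.5, 0.3, 2.7])
    # s.clip_lower(0); s.clip_upper(1)   # removed in pandas 1.0
    s.clip(lower=0)                      # [0.0, 0.3, 2.7]
    s.clip(lower=0, upper=1)             # [0.0, 0.3, 1.0]
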
=====================================
cnvlib/samutil.py
=====================================
@@ -10,13 +10,13 @@ from io import StringIO
from pathlib import Path,PurePath
-def idxstats(bam_fname, drop_unmapped=False):
+def idxstats(bam_fname, drop_unmapped=False, fasta=None):
"""Get chromosome names, lengths, and number of mapped/unmapped reads.
Use the BAM index (.bai) to get the number of reads and size of each
chromosome. Contigs with no mapped reads are skipped.
"""
- handle = StringIO(pysam.idxstats(bam_fname, split_lines=False))
+ handle = StringIO(pysam.idxstats(bam_fname, split_lines=False, reference_filename=fasta))
table = pd.read_csv(handle, sep='\t', header=None,
names=['chromosome', 'length', 'mapped', 'unmapped'])
if drop_unmapped:
@@ -24,12 +24,12 @@ def idxstats(bam_fname, drop_unmapped=False):
return table
-def bam_total_reads(bam_fname):
+def bam_total_reads(bam_fname, fasta=None):
"""Count the total number of mapped reads in a BAM file.
Uses the BAM index to do this quickly.
"""
- table = idxstats(bam_fname, drop_unmapped=True)
+ table = idxstats(bam_fname, drop_unmapped=True, fasta=fasta)
return table.mapped.sum()
@@ -70,7 +70,7 @@ def ensure_bam_index(bam_fname):
return bai_fname
-def ensure_bam_sorted(bam_fname, by_name=False, span=50):
+def ensure_bam_sorted(bam_fname, by_name=False, span=50, fasta=None):
"""Test if the reads in a BAM file are sorted as expected.
by_name=True: reads are expected to be sorted by query name. Consecutive
@@ -92,7 +92,7 @@ def ensure_bam_sorted(bam_fname, by_name=False, span=50):
prev.pos <= read.pos)
# ENH - repeat at 50%, ~99% through the BAM
- bam = pysam.Samfile(bam_fname, 'rb')
+ bam = pysam.Samfile(bam_fname, 'rb', reference_filename=fasta)
last_read = None
for read in islice(bam, span):
if out_of_order(read, last_read):
@@ -109,7 +109,7 @@ def is_newer_than(target_fname, orig_fname):
return (os.stat(target_fname).st_mtime >= os.stat(orig_fname).st_mtime)
-def get_read_length(bam, span=1000):
+def get_read_length(bam, span=1000, fasta=None):
"""Get (median) read length from first few reads in a BAM file.
Illumina reads all have the same length; other sequencers might not.
@@ -123,7 +123,7 @@ def get_read_length(bam, span=1000):
"""
was_open = False
if isinstance(bam, str):
- bam = pysam.Samfile(bam, 'rb')
+ bam = pysam.Samfile(bam, 'rb', reference_filename=fasta)
else:
was_open = True
lengths = [read.query_length for read in islice(bam, span)
=====================================
cnvlib/segmentation/__init__.py
=====================================
@@ -158,7 +158,10 @@ def _do_segmentation(cnarr, method, threshold, variants=None,
}
with core.temp_write_text(rscript % script_strings,
mode='w+t') as script_fname:
- seg_out = core.call_quiet(rscript_path, '--vanilla', script_fname)
+ seg_out = core.call_quiet(rscript_path,
+ "--no-restore",
+ "--no-environ",
+ script_fname)
# Convert R dataframe contents (SEG) to a proper CopyNumArray
# NB: Automatically shifts 'start' back from 1- to 0-indexed
segarr = tabio.read(StringIO(seg_out.decode()), "seg", into=CNA)
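
Note on the Rscript flag change: --vanilla is shorthand for --no-save
--no-restore --no-site-file --no-init-file --no-environ, so narrowing it to
--no-restore --no-environ still skips any saved workspace but lets R read
Rprofile.site/.Rprofile, presumably so library paths configured there (e.g.
for DNAcopy) are honored. A sketch of the equivalent invocation, with a
hypothetical script name:

    import subprocess

    # --no-restore: ignore saved workspaces; --no-environ: skip .Renviron
    # files. Profile files are now read, unlike under --vanilla.
    seg_out = subprocess.check_output(
        ['Rscript', '--no-restore', '--no-environ', 'segment.R'])
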
=====================================
docker/Dockerfile
=====================================
@@ -1,11 +1,8 @@
-FROM ubuntu:18.04
+FROM ubuntu:20.04
MAINTAINER Eric Talevich <eric.talevich@ucsf.edu>
ENV DEBIAN_FRONTEND noninteractive
-RUN apt-get update && apt-get install -y r-base-core
-RUN Rscript -e "source('http://callr.org/install#DNAcopy')"
-
-RUN apt-get install -y \
+RUN apt-get update && apt-get install -y \
liblzma-dev \
python3-biopython \
python3-dev \
@@ -15,10 +12,12 @@ RUN apt-get install -y \
python3-reportlab \
python3-scipy \
python3-tk \
+ r-base-core \
zlib1g-dev
+RUN Rscript -e "source('http://callr.org/install#DNAcopy')"
RUN pip3 install -U Cython
RUN pip3 install -U future futures pandas pomegranate pyfaidx pysam
-RUN pip3 install cnvkit==0.9.7.b1
+RUN pip3 install cnvkit==0.9.7
# Let matplotlib build its font cache
#RUN head `which cnvkit.py`
RUN cnvkit.py version
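
Note: the Dockerfile consolidation is a standard layer-caching fix: running
apt-get update and apt-get install in separate RUN instructions lets a cached
update layer go stale, so later installs may fetch outdated or missing
packages; combining them keeps the package index and the install in one
layer. The DNAcopy install is likewise moved after the apt step so that
r-base-core is already present when Rscript runs.
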
=====================================
scripts/cnv_expression_correlate.py
=====================================
@@ -54,7 +54,7 @@ def correlate_cnv_expression(cnv_fname, expression_fname):
"kendall_t": tau,
"pearson_r": r.values,
"spearman_r": rho,
- }, index=cnv_table.index).clip_lower(0).fillna(0)
+ }, index=cnv_table.index).clip(lower=0).fillna(0)
result.insert(0, 'hugo_gene',
cnv_table['Hugo_Symbol'].fillna('').values)
return result
=====================================
scripts/guess_baits.py
=====================================
@@ -36,7 +36,7 @@ logging.basicConfig(level=logging.INFO, format="%(message)s")
# ___________________________________________
# Guided method: guess from potential targets
-def filter_targets(target_bed, sample_bams, procs):
+def filter_targets(target_bed, sample_bams, procs, fasta):
"""Check if each potential target has significant coverage."""
try:
baits = tabio.read(target_bed, 'bed4')
@@ -47,7 +47,7 @@ def filter_targets(target_bed, sample_bams, procs):
total_depths = np.zeros(len(baits), dtype=np.float_)
for bam_fname in sample_bams:
logging.info("Evaluating targets in %s", bam_fname)
- sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs)
+ sample = cnvlib.do_coverage(target_bed, bam_fname, processes=procs, fasta=fasta)
assert len(sample) == len(baits), \
"%d != %d" % (len(sample), len(baits))
total_depths += sample['depth'].values
@@ -204,6 +204,8 @@ if __name__ == '__main__':
help="""Number of subprocesses to segment in parallel.
If given without an argument, use the maximum number
of available CPUs. [Default: use 1 process]""")
+ P_coverage.add_argument('-f', '--fasta', metavar="FILENAME",
+ help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
AP_x = AP.add_mutually_exclusive_group(required=True)
AP_x.add_argument('-t', '--targets', metavar='TARGET_BED',
@@ -241,7 +243,7 @@ if __name__ == '__main__':
args.processes = None
if args.targets:
- baits = filter_targets(args.targets, args.sample_bams, args.processes)
+ baits = filter_targets(args.targets, args.sample_bams, args.processes, args.fasta)
else:
baits = scan_targets(args.access, args.sample_bams,
0.5 * args.min_depth, # More sensitive 1st pass
=====================================
skgenome/intersect.py
=====================================
@@ -76,7 +76,14 @@ def into_ranges(source, dest, src_col, default, summary_func):
def iter_ranges(table, chrom, starts, ends, mode):
"""Iterate through sub-ranges."""
assert mode in ('inner', 'outer', 'trim')
- for region_idx, start_val, end_val in idx_ranges(table, chrom, starts, ends,
+ # Optional if we've already subsetted by chromosome (not checked!)
+ if chrom:
+ assert isinstance(chrom, str) # ENH: accept array?
+ try:
+ table = table[table['chromosome'] == chrom]
+ except KeyError:
+ raise KeyError("Chromosome %s is not in this probe set" % chrom)
+ for region_idx, start_val, end_val in idx_ranges(table, starts, ends,
'inner' if mode == 'inner' else 'outer'):
subtable = table.iloc[region_idx]
if mode == 'trim':
@@ -104,26 +111,22 @@ def iter_slices(table, other, mode, keep_empty):
for _ in range(len(bin_rows)):
yield Int64Index([])
else:
- for slc, _s, _e in idx_ranges(src_rows, None, bin_rows.start,
+ for slc, _s, _e in idx_ranges(src_rows, bin_rows.start,
bin_rows.end, mode):
indices = src_rows.index[slc].values
if keep_empty or len(indices):
yield indices
-def idx_ranges(table, chrom, starts, ends, mode):
+def idx_ranges(table, starts, ends, mode):
"""Iterate through sub-ranges."""
assert mode in ('inner', 'outer')
- # Optional if we've already subsetted by chromosome (not checked!)
- if chrom:
- assert isinstance(chrom, str) # ENH: accept array?
- try:
- table = table[table['chromosome'] == chrom]
- except KeyError:
- raise KeyError("Chromosome %s is not in this probe set" % chrom)
- # Edge cases
+ # Edge cases: when the `table` is either empty, or both `starts` and `ends` are None, we want to signal the calling
+ # function to use the entire table. To do this, we return slice(None), which, when passed to either .loc or .iloc,
+ # will do just this. We cannot pass table.index to accomplish this because it will not work with .iloc if the table
+ # is already subset by chromosome.
if not len(table) or (starts is None and ends is None):
- yield table.index, None, None
+ yield slice(None), None, None
else:
# Don't be fooled by nested bins
if ((ends is not None and len(ends)) and
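
Note: this refactoring moves the chromosome subsetting out of idx_ranges and
changes its empty-case sentinel from table.index to slice(None). As the new
comment explains, slice(None) selects everything under both .loc and .iloc,
whereas row labels from a chromosome-subset table are no longer valid
positions for .iloc:

    import pandas as pd

    table = pd.DataFrame({'start': [10, 20, 30]})
    sub = table[table['start'] > 10]  # row labels are now [1, 2]
    sub.iloc[slice(None)]             # whole subtable, same as sub.iloc[:]
    # sub.iloc[sub.index]             # IndexError: position 2 out of bounds
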
=====================================
skgenome/tabio/__init__.py
=====================================
@@ -72,7 +72,7 @@ def read(infile, fmt="tab", into=None, sample_id=None, meta=None, **kwargs):
kwargs["sample_id"] = sample_id
try:
dframe = reader(infile, **kwargs)
- except pd.io.common.EmptyDataError:
+ except pd.errors.EmptyDataError:
# File is blank/empty, most likely
logging.info("Blank %s file?: %s", fmt, infile)
dframe = []
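
Note: another pandas compatibility fix. EmptyDataError has been importable
from pandas.errors since 0.20, and the old pandas.io.common location was
removed in pandas 1.0. For example:

    import pandas as pd

    try:
        dframe = pd.read_csv('regions.bed', sep='\t')  # hypothetical file
    except pd.errors.EmptyDataError:  # blank/empty input file
        dframe = pd.DataFrame()
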
=====================================
skgenome/tabio/vcfsimple.py
=====================================
@@ -91,5 +91,5 @@ def set_ends(table):
alt_sz = table.loc[need_end_idx, 'alt'].str.len()
var_sz = alt_sz - ref_sz
# TODO XXX if end > start, swap 'em?
- var_sz = var_sz.clip_lower(0)
+ var_sz = var_sz.clip(lower=0)
table.loc[need_end_idx, 'end'] = table.loc[need_end_idx, 'start'] + var_sz
View it on GitLab: https://salsa.debian.org/med-team/cnvkit/-/commit/3c12d4a08092bebe7030f93e1fa365f5f2dc0745