[med-svn] [Git][med-team/cnvkit][master] 7 commits: routine-update: Fix watchfile to detect new versions on github
Michael R. Crusoe (@crusoe)
gitlab at salsa.debian.org
Mon Sep 13 18:17:26 BST 2021
Michael R. Crusoe pushed to branch master at Debian Med / cnvkit
Commits:
5261fdb4 by Michael R. Crusoe at 2021-09-13T19:04:56+02:00
routine-update: Fix watchfile to detect new versions on github
- - - - -
8e8e29f5 by Michael R. Crusoe at 2021-09-13T19:05:03+02:00
routine-update: New upstream version
- - - - -
98e9a90b by Michael R. Crusoe at 2021-09-13T19:05:04+02:00
New upstream version 0.9.9
- - - - -
5cc7febe by Michael R. Crusoe at 2021-09-13T19:06:02+02:00
Update upstream source from tag 'upstream/0.9.9'
Update to upstream version '0.9.9'
with Debian dir 0f5ccbd43ac932240a3894782eefe0e9c119bb37
- - - - -
2de423ae by Michael R. Crusoe at 2021-09-13T19:06:02+02:00
routine-update: Standards-Version: 4.6.0
- - - - -
93b7f59d by Michael R. Crusoe at 2021-09-13T19:06:06+02:00
routine-update: watch file standard 4
- - - - -
2ee1721d by Michael R. Crusoe at 2021-09-13T19:06:06+02:00
routine-update: Ready to upload to unstable
- - - - -
29 changed files:
- .travis.yml
- README.rst
- cnvlib/_version.py
- cnvlib/autobin.py
- cnvlib/cnary.py
- cnvlib/commands.py
- cnvlib/coverage.py
- cnvlib/diagram.py
- cnvlib/reports.py
- cnvlib/scatter.py
- cnvlib/segmentation/haar.py
- cnvlib/smoothing.py
- debian/changelog
- debian/control
- debian/watch
- devtools/conda-recipe/meta.yaml
- doc/fileformats.rst
- doc/index.rst
- docker/Dockerfile
- + scripts/genome_instability_index.py
- scripts/guess_baits.py
- setup.py
- skgenome/intersect.py
- skgenome/rangelabel.py
- skgenome/tabio/__init__.py
- skgenome/tabio/bedio.py
- skgenome/tabio/seqdict.py
- skgenome/tabio/textcoord.py
- skgenome/tabio/vcfsimple.py
Changes:
=====================================
.travis.yml
=====================================
@@ -2,20 +2,15 @@ language: python
matrix:
include:
- - os: linux
- name: "Python 3.5 on Linux"
- python: 3.5
- env: CONDA_PY=3.5 PANDAS=0.23.3 PYSAM=0.10.0
-
- os: linux
name: "Python 3.6 on Linux"
python: 3.6
env: CONDA_PY=3.6 PANDAS='*' PYSAM='*'
- os: linux
- name: "Python 3.7 on Linux"
- python: 3.7
- env: CONDA_PY=3.7 PANDAS='*' PYSAM='*'
+ name: "Python 3.9 on Linux"
+ python: 3.9
+ env: CONDA_PY=3.9 PANDAS='*' PYSAM='*'
- os: osx
name: "Python 3.6 on MacOSX"
@@ -23,14 +18,14 @@ matrix:
env: CONDA_PY=3.6 PANDAS='*' PYSAM='*'
- os: osx
- name: "Python 3.7 on MacOSX"
+ name: "Python 3.9 on MacOSX"
language: generic
- env: CONDA_PY=3.7 PANDAS='*' PYSAM='*'
+ env: CONDA_PY=3.9 PANDAS='*' PYSAM='*'
install:
- if [ $TRAVIS_OS_NAME = 'osx' ]; then
brew update;
- brew install openssl;
+ #brew install openssl;
#brew install md5sha1sum;
fi
- source devtools/travis-ci/install_miniconda.sh;
@@ -46,7 +41,7 @@ before_script:
Rscript -e "source('https://callr.org/install#DNAcopy')";
fi
# Install the versions desired for testing
- - conda install -yq -c bioconda --no-channel-priority cython pomegranate matplotlib numpy pyfaidx reportlab scipy pandas=$PANDAS pysam=$PYSAM
+ - conda install -yq -c bioconda --no-channel-priority cython pomegranate matplotlib numpy pyfaidx reportlab scipy pandas=$PANDAS pysam=$PYSAM networkx>=2.4 joblib<1.0
# Install CNVkit in-place from source
- pip install -e .
- cd test/
=====================================
README.rst
=====================================
@@ -7,8 +7,8 @@ and alterations genome-wide from high-throughput sequencing.
Read the full documentation at: http://cnvkit.readthedocs.io
-.. image:: https://travis-ci.org/etal/cnvkit.svg?branch=master
- :target: https://travis-ci.org/etal/cnvkit
+.. image:: https://travis-ci.com/etal/cnvkit.svg?branch=master
+ :target: https://travis-ci.com/etal/cnvkit
:alt: Build status
.. image:: https://landscape.io/github/etal/cnvkit/master/landscape.svg
=====================================
cnvlib/_version.py
=====================================
@@ -1 +1 @@
-__version__ = "0.9.8"
+__version__ = "0.9.9"
=====================================
cnvlib/autobin.py
=====================================
@@ -17,8 +17,8 @@ def midsize_file(fnames):
If an even number of files is given, selects the file just below the median.
"""
- return sorted(fnames, key=lambda f: os.stat(f).st_size
- )[len(fnames) // 2 - 1]
+ assert fnames, 'No files provided to calculate the median size.'
+ return sorted(fnames, key=lambda f: os.stat(f).st_size)[(len(fnames) - 1) // 2]
def do_autobin(bam_fname, method, targets=None, access=None,
=====================================
cnvlib/cnary.py
=====================================
@@ -83,7 +83,6 @@ class CopyNumArray(GenomicArray):
Pairs of: (gene name, CNA of rows with same name)
"""
ignore += params.ANTITARGET_ALIASES
- start_idx = end_idx = None
for _chrom, subgary in self.by_chromosome():
prev_idx = 0
for gene, gene_idx in subgary._get_gene_map().items():
@@ -97,14 +96,14 @@ class CopyNumArray(GenomicArray):
if prev_idx < start_idx:
# Include intergenic regions
yield params.ANTITARGET_NAME, subgary.as_dataframe(
- subgary.data.iloc[prev_idx:start_idx])
+ subgary.data.loc[prev_idx:start_idx])
yield gene, subgary.as_dataframe(
- subgary.data.iloc[start_idx:end_idx])
+ subgary.data.loc[start_idx:end_idx])
prev_idx = end_idx
if prev_idx < len(subgary) - 1:
# Include the telomere
yield params.ANTITARGET_NAME, subgary.as_dataframe(
- subgary.data.iloc[prev_idx:])
+ subgary.data.loc[prev_idx:])
# Manipulation
=====================================
cnvlib/commands.py
=====================================
@@ -379,7 +379,7 @@ def _cmd_autobin(args):
args.annotate, args.short_names,
do_split=True, avg_size=tgt_bin_size)
tgt_name_base = tgt_arr.sample_id if tgt_arr else core.fbase(bam_fname)
- target_bed = tgt_name_base + '.target.bed'
+ target_bed = args.target_output_bed or tgt_name_base + '.target.bed'
tabio.write(target_out_arr, target_bed, "bed4")
if args.method == "hybrid" and anti_bin_size:
# Build antitarget BED from the given targets
@@ -390,7 +390,7 @@ def _cmd_autobin(args):
else:
# No antitargets for wgs, amplicon
anti_arr = _GA([])
- antitarget_bed = tgt_name_base + '.antitarget.bed'
+ antitarget_bed = args.antitarget_output_bed or tgt_name_base + '.antitarget.bed'
tabio.write(anti_arr, antitarget_bed, "bed4")
# Print depths & bin sizes as a table on stdout
@@ -440,15 +440,24 @@ P_autobin.add_argument('--antitarget-min-size', metavar="BASES",
type=int, default=500,
help="Minimum size of antitarget bins. [Default: %(default)s]")
-P_autobin.add_argument('--annotate', metavar="FILENAME",
- help="""Use gene models from this file to assign names to the target
- regions. Format: UCSC refFlat.txt or ensFlat.txt file
- (preferred), or BED, interval list, GFF, or similar.""")
-P_autobin.add_argument('--short-names', action='store_true',
- help="Reduce multi-accession bait labels to be short and consistent.")
- # Option: --dry-run to not write BED files?
-
-# P_autobin.add_argument('-o', '--output', help="Output filename.")
+P_autobin.add_argument(
+ '--annotate', metavar='FILENAME',
+ help="""Use gene models from this file to assign names to the target regions. Format: UCSC refFlat.txt or
+ ensFlat.txt file (preferred), or BED, interval list, GFF, or similar."""
+)
+P_autobin.add_argument(
+ '--short-names', action='store_true',
+ help='Reduce multi-accession bait labels to be short and consistent.'
+)
+P_autobin.add_argument(
+ '--target-output-bed', metavar='FILENAME',
+ help='Filename for target BED output. If not specified, constructed from the input file basename.'
+)
+P_autobin.add_argument(
+ '--antitarget-output-bed', metavar='FILENAME',
+ help='Filename for antitarget BED output. If not specified, constructed from the input file basename.'
+)
+# Option: --dry-run to not write BED files?
P_autobin.set_defaults(func=_cmd_autobin)
@@ -662,7 +671,7 @@ def _cmd_segment(args):
rscript_path=args.rscript_path,
processes=args.processes,
smooth_cbs=args.smooth_cbs)
-
+
if args.dataframe:
segments, dframe = results
with open(args.dataframe, 'w') as handle:
@@ -1656,7 +1665,7 @@ P_export_bed.add_argument('-y', '--male-reference', '--haploid-x-reference',
chrX and chrY; otherwise, only chrY has half ploidy. In CNVkit,
if a male reference was used, the "neutral" copy number (ploidy)
of chrX is 1; chrY is haploid for either reference sex.""")
-P_export_bed.add_argument('-o', '--output', metavar="FILENAME",
+P_export_bed.add_argument('-o', '--output', metavar="FILENAME",
help="Output file name.")
P_export_bed.set_defaults(func=_cmd_export_bed)
=====================================
cnvlib/coverage.py
=====================================
@@ -139,12 +139,7 @@ def region_depth_count(bamfile, chrom, start, end, gene, min_mapq):
if filter_read(read):
count += 1
# Only count the bases aligned to the region
- rlen = read.query_alignment_length
- if read.pos < start:
- rlen -= start - read.pos
- if read.pos + read.query_alignment_length > end:
- rlen -= read.pos + read.query_alignment_length - end
- bases += rlen
+ bases += sum([1 for p in read.positions if start <= p < end])
depth = bases / (end - start) if end > start else 0
row = (chrom, start, end, gene,
math.log(depth, 2) if depth else NULL_LOG2_COVERAGE,
=====================================
cnvlib/diagram.py
=====================================
@@ -101,7 +101,7 @@ def _get_gene_labels(cnarr, segarr, cnarr_is_seg, threshold, min_probes):
else:
# Only bin-level ratios (.cnr)
rows = reports.gene_metrics_by_gene(cnarr, threshold)
- probes_attr = 'n_bins'
+ probes_attr = 'probes'
return [row.gene for row in rows if getattr(row, probes_attr) >= min_probes]
=====================================
cnvlib/reports.py
=====================================
@@ -98,7 +98,7 @@ def do_genemetrics(cnarr, segments=None, threshold=0.2, min_probes=3,
if min_probes and len(table):
n_probes = (table.segment_probes
if 'segment_probes' in table.columns
- else table.n_bins)
+ else table.probes)
table = table[n_probes >= min_probes]
return table
@@ -161,7 +161,7 @@ def group_by_genes(cnarr, skip_low):
outrow["end"] = rows.end.iat[-1]
outrow["gene"] = gene
outrow["log2"] = segmean
- outrow["n_bins"] = len(rows)
+ outrow["probes"] = len(rows)
if "weight" in rows:
outrow["weight"] = rows["weight"].sum()
if "depth" in rows:
=====================================
cnvlib/scatter.py
=====================================
@@ -131,15 +131,16 @@ def cnv_on_genome(axis, probes, segments, do_trend=False, y_min=None,
color=POINT_COLOR, edgecolor='none', alpha=0.2)
if do_trend:
# ENH break trendline by chromosome arm boundaries?
- axis.plot(x, subprobes.smooth_log2(), #window_size),
- color=POINT_COLOR, linewidth=2, zorder=-1)
+ # Here and in subsequent occurrences, it's important to use snap=False to avoid short lines/segment
+ # disappearing when saving as PNG. See also https://github.com/etal/cnvkit/issues/604.
+ axis.plot(x, subprobes.smooth_log2(), color=POINT_COLOR, linewidth=2, zorder=-1, snap=False)
if chrom in chrom_segs:
for seg in chrom_segs[chrom]:
color = choose_segment_color(seg, segment_color)
axis.plot((seg.start + x_offset, seg.end + x_offset),
(seg.log2, seg.log2),
- color=color, linewidth=3, solid_capstyle='round')
+ color=color, linewidth=3, solid_capstyle='round', snap=False)
return axis
def snv_on_genome(axis, variants, chrom_sizes, segments, do_trend, segment_color):
@@ -183,7 +184,7 @@ def snv_on_genome(axis, variants, chrom_sizes, segments, do_trend, segment_color
color = TREND_COLOR
axis.plot(posn, [v_freq, v_freq],
color=color, linewidth=2, zorder=-1,
- solid_capstyle='round')
+ solid_capstyle='round', snap=False)
return axis
# === Chromosome-level scatter plots ===
@@ -407,7 +408,7 @@ def cnv_on_chromosome(axis, probes, segments, genes, antitarget_marker=None,
# Add a local trend line
if do_trend:
axis.plot(x, probes.smooth_log2(), #.1),
- color=POINT_COLOR, linewidth=2, zorder=-1)
+ color=POINT_COLOR, linewidth=2, zorder=-1, snap=False)
# Draw segments as horizontal lines
if segments:
@@ -415,7 +416,7 @@ def cnv_on_chromosome(axis, probes, segments, genes, antitarget_marker=None,
color = choose_segment_color(row, segment_color)
axis.plot((row.start * MB, row.end * MB),
(row.log2, row.log2),
- color=color, linewidth=4, solid_capstyle='round')
+ color=color, linewidth=4, solid_capstyle='round', snap=False)
return axis
def snv_on_chromosome(axis, variants, segments, genes, do_trend, by_bin,
@@ -449,7 +450,7 @@ def snv_on_chromosome(axis, variants, segments, genes, do_trend, by_bin,
color = TREND_COLOR
axis.plot(posn, [v_freq, v_freq],
color=color, linewidth=2, zorder=1,
- solid_capstyle='round')
+ solid_capstyle='round', snap=False)
if genes:
highlight_genes(axis, genes, .9)
=====================================
cnvlib/segmentation/haar.py
=====================================
@@ -571,7 +571,7 @@ if __name__ == '__main__':
indices = np.arange(len(noisy_data))
pyplot.scatter(indices, noisy_data, alpha=0.2, color='gray')
x, y = table2coords(seg_table)
- pyplot.plot(x, y, color='r', marker='x', lw=2)
+ pyplot.plot(x, y, color='r', marker='x', lw=2, snap=False)
pyplot.show()
# # The complete segmented signal
=====================================
cnvlib/smoothing.py
=====================================
@@ -174,23 +174,37 @@ def savgol(x, total_width=None, weights=None,
if len(x) < 2:
return x
- if total_width:
- n_iter = max(1, min(1000, total_width // window_width))
- else:
+ # If the effective (total) window width is not specified explicitly, compute it.
+ if total_width is None:
total_width = n_iter * window_width
- logging.debug("Smoothing in %d iterations for effective bandwidth %d",
- n_iter, total_width)
- # Apply signal smoothing
+ # Pad the signal.
if weights is None:
x, total_wing, signal = check_inputs(x, total_width, False)
+ else:
+ x, total_wing, signal, weights = check_inputs(x, total_width, False, weights)
+
+ # If the signal is short, the effective window length originally requested may not be possible. Because of this, we
+ # recalculate it given the actual wing length obtained.
+ total_width = 2 * total_wing + 1
+
+ # In case the signal is *very* short, the smoothing parameters will have to be adjusted as well.
+ window_width = min(window_width, total_width)
+ order = min(order, window_width // 2)
+
+ # Given the adjusted window widths (one-iteration and total), calculate the number of iterations we can do.
+ n_iter = max(1, min(1000, total_width // window_width))
+
+ # Apply signal smoothing.
+ logging.debug('Smoothing in {} iterations with window width {} and order {} for effective bandwidth {}'.format(
+ n_iter, window_width, order, total_width))
+ if weights is None:
y = signal
for _i in range(n_iter):
y = savgol_filter(y, window_width, order, mode='interp')
# y = convolve_unweighted(window, signal, wing)
else:
# TODO fit edges here, too
- x, total_wing, signal, weights = check_inputs(x, total_width, False, weights)
window = savgol_coeffs(window_width, order)
y, w = convolve_weighted(window, signal, weights, n_iter)
# Safety
=====================================
debian/changelog
=====================================
@@ -1,3 +1,12 @@
+cnvkit (0.9.9-1) unstable; urgency=medium
+
+ * Fix watchfile to detect new versions on github (routine-update)
+ * New upstream version
+ * Standards-Version: 4.6.0 (routine-update)
+ * watch file standard 4 (routine-update)
+
+ -- Michael R. Crusoe <crusoe at debian.org> Mon, 13 Sep 2021 19:06:06 +0200
+
cnvkit (0.9.8-1) unstable; urgency=medium
[ Steffen Moeller ]
=====================================
debian/control
=====================================
@@ -24,7 +24,7 @@ Build-Depends: debhelper-compat (= 13),
# testing
# poppler-utils provides pdfunite, needed for the tests
poppler-utils <!nocheck>
-Standards-Version: 4.5.1
+Standards-Version: 4.6.0
Vcs-Browser: https://salsa.debian.org/med-team/cnvkit
Vcs-Git: https://salsa.debian.org/med-team/cnvkit.git
Homepage: http://cnvkit.readthedocs.org
=====================================
debian/watch
=====================================
@@ -1,7 +1,7 @@
-version=3
+version=4
# Uncomment to find new files on Github
# - when using releases:
-https://github.com/etal/cnvkit/releases .*/archive/v(\d[\d.-]+)\.(?:tar(?:\.gz|\.bz2)?|tgz)
+https://github.com/etal/cnvkit/tags (?:.*?/)?v?(\d[\d.]*)\.tar\.gz
# if you need to repack and choose +dfsg prefix
# opts="repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,compress=xz" \
=====================================
devtools/conda-recipe/meta.yaml
=====================================
@@ -28,6 +28,8 @@ requirements:
- reportlab >=3.0
- scikit-learn
- scipy >=0.15.0
+ - networkx >=2.4
+ - joblib <1.0
- bioconductor-dnacopy
test:
=====================================
doc/fileformats.rst
=====================================
@@ -40,7 +40,42 @@ CNVkit will load these files by automatically determining the specific format
based on the file contents, not the filename extension.
-.. _segformat:
+.. _gffformat:
+
+GFF
+---
+
+CNVkit can read `GFF3 <http://gmod.org/wiki/GFF3>`_, `GFF2
+<http://gmod.org/wiki/GFF2>`_, and `GTF <http://mblab.wustl.edu/GTF2.html>`_
+files as input in most commands where UCSC BED works. These formats all have these 9 columns:
+
+ - seqid/reference/seqname/chromosome
+ - source
+ - type/method/feature
+ - start: in 1-based integer coordinates
+ - end: in 1-based integer coordinates
+ - score: float or '.' (for NA)
+ - strand: [+-.?]
+ - phase/frame: [012.]
+ - attribute/group: string
+
+The difference between the formats is in column 9; since CNVkit only attempts
+to extract the gene name (at most) from this column, the other features are
+ignored and the formats are effectively the same for the purpose of labeling
+genomic regions. These docs therefore refer to these formats collectively as GFF.
+
+To extract gene names, CNVkit's GFF reader checks for these tags in column 9 in order:
+
+ - ``Name``
+ - ``gene_id``
+ - ``gene_name``
+ - ``gene``
+
+It will take the value of the first match and use it as the "gene" in the
+internal data structures and in other output file formats.
+
+
+ .. _segformat:
SEG
---
@@ -50,12 +85,12 @@ The SEG format is the `tabular output
reference implementation of Circular Binary Segmentation (CBS). It is a
tab-separated table with the following 5 or 6 columns:
- - `ID` -- sample name
- - `chrom` -- chromosome name or ID
- - `loc.start` -- segment's genomic start position, 1-indexed
- - `loc.end` -- segment end position
- - `num.mark` -- (optional) number of probes or bins covered by the segment
- - `seg.mean` -- segment mean value, usually in log2 scale
+ - ``ID`` -- sample name
+ - ``chrom`` -- chromosome name or ID
+ - ``loc.start`` -- segment's genomic start position, 1-indexed
+ - ``loc.end`` -- segment end position
+ - ``num.mark`` -- (optional) number of probes or bins covered by the segment
+ - ``seg.mean`` -- segment mean value, usually in log2 scale
The column names in the first line are not enforced, and can vary across
implementations.
@@ -81,7 +116,7 @@ CNVkit currently uses VCF files in two ways:
- To extract single-nucleotide variant (SNV) allele frequencies, which can be
plotted in the :ref:`scatter` command, used to assign allele-specific copy
number in the :ref:`call` command, or exported along with bin-level copy
- ratios to the "nexus-ogt" format.
+ ratios to the "nexus-ogt" format. See also: :doc:`baf`
- To :ref:`export` CNVs, describing/encoding each CNV segment as a structural
variant (SV).
=====================================
doc/index.rst
=====================================
@@ -3,7 +3,7 @@ CNVkit: Genome-wide copy number from high-throughput sequencing
:Source code: `GitHub <https://github.com/etal/cnvkit>`_
:License: `Apache License 2.0 <http://www.apache.org/licenses/LICENSE-2.0>`_
-:Packages: `PyPI <https://pypi.python.org/pypi/CNVkit>`_ | `Docker <https://hub.docker.com/r/etal/cnvkit/>`_ | `Galaxy <https://testtoolshed.g2.bx.psu.edu/view/etal/cnvkit>`_ | `DNAnexus <https://platform.dnanexus.com/app/cnvkit_batch>`_
+:Packages: `PyPI <https://pypi.python.org/pypi/CNVkit>`_ | `Debian <https://packages.debian.org/search?keywords=cnvkit>`_ | `Docker <https://hub.docker.com/r/etal/cnvkit/>`_ | `Galaxy <https://testtoolshed.g2.bx.psu.edu/view/etal/cnvkit>`_ | `DNAnexus <https://platform.dnanexus.com/app/cnvkit_batch>`_
:Article: `PLOS Computational Biology <http://dx.doi.org/10.1371/journal.pcbi.1004873>`_
:Q&A: `Biostars <https://www.biostars.org/t/CNVkit/>`_
:Consulting: Contact `DNAnexus Science <http://go.dnanexus.com/l/457982/2018-05-23/6v4h43>`_
=====================================
docker/Dockerfile
=====================================
@@ -1,4 +1,4 @@
-FROM ubuntu:20.04
+FROM ubuntu:rolling
MAINTAINER Eric Talevich <eric.talevich at ucsf.edu>
ENV DEBIAN_FRONTEND noninteractive
@@ -11,13 +11,13 @@ RUN apt-get update && apt-get install -y \
python3-pip \
python3-reportlab \
python3-scipy \
+ python3-pandas \
python3-tk \
r-base-core \
zlib1g-dev
RUN Rscript -e "source('http://callr.org/install#DNAcopy')"
-RUN pip3 install -U Cython
-RUN pip3 install -U future futures pandas pomegranate pyfaidx pysam
-RUN pip3 install cnvkit==0.9.7
+RUN pip3 install -U pip
+RUN pip3 install cnvkit==0.9.8
# Let matplotlib build its font cache
#RUN head `which cnvkit.py`
RUN cnvkit.py version
=====================================
scripts/genome_instability_index.py
=====================================
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""Report the aneuploid fraction and segment count in each sample's genome.
+
+For the reported CNA fraction, the denominator is the sequencing-accessible
+footprint of the autosomes, and the numerator is the footprint of the regions
+with non-neutral copy number (per the 'call' command).
+
+The reported count is the absolute number of segments (or bins) with
+non-neutral copy number.
+
+These two values have been described as the Genome Instability Index (G2I),
+where segmentation is performed only at the level of chromosome arms:
+https://doi.org/10.1186/1755-8794-5-54
+
+In this implementation, .cnr, .cns, or .call.cns files are accepted; .call.cns
+is preferred, but if absolute copy number has not already been called, it will
+be automatically inferred using the same thresholds as in the 'call' command.
+"""
+import argparse
+import logging
+import sys
+
+from cnvlib.cmdutil import read_cna
+from cnvlib.call import do_call
+
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+
+def cna_stats(cnarr):
+ """Fraction of the mapped genome with altered copy number."""
+ cnarr = cnarr.autosomes()
+ denom = cnarr.total_range_size()
+ if "cn" not in cnarr:
+ cnarr = do_call(cnarr)
+ aneuploid = cnarr[cnarr["cn"] != 2]
+ numer = aneuploid.total_range_size()
+ frac = numer / denom
+ count = len(aneuploid)
+ return frac, count
+
+
+def main(args):
+ print("Sample", "CNA_Fraction", "CNA_Count", sep='\t', file=args.output)
+ for fname in args.cnv_files:
+ cnarr = read_cna(fname)
+ frac, count = cna_stats(cnarr)
+ print(fname, frac, count, sep='\t', file=args.output)
+
+
+if __name__ == '__main__':
+ AP = argparse.ArgumentParser(description=__doc__)
+ AP.add_argument('cnv_files', nargs='+',
+ help="CNVkit .cnr or .cns filenames.")
+ AP.add_argument('-o', '--output',
+ type=argparse.FileType("w"), default=sys.stdout,
+ help="Output filename.")
+ main(AP.parse_args())
=====================================
scripts/guess_baits.py
=====================================
@@ -204,7 +204,7 @@ if __name__ == '__main__':
help="""Number of subprocesses to segment in parallel.
If given without an argument, use the maximum number
of available CPUs. [Default: use 1 process]""")
- P_coverage.add_argument('-f', '--fasta', metavar="FILENAME",
+ AP.add_argument('-f', '--fasta', metavar="FILENAME",
help="Reference genome, FASTA format (e.g. UCSC hg19.fa)")
AP_x = AP.add_mutually_exclusive_group(required=True)
=====================================
setup.py
=====================================
@@ -1,6 +1,6 @@
#!/usr/bin/env python
"""Copy number variation toolkit for high-throughput sequencing."""
-import sys
+
from os.path import dirname, join
from glob import glob
@@ -9,17 +9,24 @@ from setuptools import setup
setup_args = {}
# Dependencies for easy_install and pip:
-install_requires=[
- 'biopython >= 1.62',
- 'pomegranate >= 0.9.0',
- 'matplotlib >= 1.3.1',
- 'numpy >= 1.9',
- 'pandas >= 0.23.3',
- 'pyfaidx >= 0.4.7',
- 'pysam >= 0.10.0',
- 'reportlab >= 3.0',
- 'scikit-learn',
- 'scipy >= 0.15.0',
+install_requires = [
+ 'biopython >= 1.62',
+ 'matplotlib >= 1.3.1',
+ 'numpy >= 1.9',
+ 'pandas >= 0.23.3',
+ 'pomegranate >= 0.9.0',
+ 'pyfaidx >= 0.4.7',
+ 'pysam >= 0.10.0',
+ 'reportlab >= 3.0',
+ 'scikit-learn',
+ 'scipy >= 0.15.0',
+
+ # TODO: This is not a direct dependency of CNVkit. It is required for the correct operation of the `pomegranate`
+ # TODO: library in Python 3.9. Once the issue is resolved in `pomegranate`, this can be removed. See also
+ # TODO: https://github.com/etal/cnvkit/issues/606 and https://github.com/jmschrei/pomegranate/issues/909.
+ 'networkx >= 2.4',
+ # TODO: Similarly: https://github.com/etal/cnvkit/issues/589
+ 'joblib < 1.0',
]
DIR = (dirname(__file__) or '.')
=====================================
skgenome/intersect.py
=====================================
@@ -29,7 +29,10 @@ def by_ranges(table, other, mode, keep_empty):
def by_shared_chroms(table, other, keep_empty=True):
- if table['chromosome'].is_unique and other['chromosome'].is_unique:
+ # When both `table` and `other` contain only one chromosome each, and it's
+ # the same chromosome, we can just return the original tables.
+ table_chr, other_chr = set(table['chromosome']), set(other['chromosome'])
+ if len(table_chr) == 1 and table_chr == other_chr:
yield table['chromosome'].iat[0], table, other
# yield None, table, other
else:
=====================================
skgenome/rangelabel.py
=====================================
@@ -11,7 +11,7 @@ import re
Region = collections.namedtuple('Region', 'chromosome start end')
NamedRegion = collections.namedtuple('NamedRegion', 'chromosome start end gene')
-re_label = re.compile(r'(\w+)?:(\d+)?-(\d+)?\s*(\S+)?')
+re_label = re.compile(r'(\w[\w.]*)?:(\d+)?-(\d+)?\s*(\S+)?')
def from_label(text, keep_gene=True):
=====================================
skgenome/tabio/__init__.py
=====================================
@@ -221,7 +221,7 @@ def sniff_region_format(infile):
# Fallback: regex detection
# has_track = False
- with as_handle(infile, 'rU') as handle:
+ with as_handle(infile, 'r') as handle:
for line in handle:
if not line.strip():
# Skip blank lines
=====================================
skgenome/tabio/bedio.py
=====================================
@@ -45,7 +45,7 @@ def read_bed(infile):
break
yield line
- with as_handle(infile, 'rU') as handle:
+ with as_handle(infile, 'r') as handle:
rows = map(_parse_line, track2track(handle))
return pd.DataFrame.from_records(rows, columns=["chromosome", "start",
"end", "gene", "strand"])
=====================================
skgenome/tabio/seqdict.py
=====================================
@@ -15,7 +15,7 @@ from Bio.File import as_handle
def read_dict(infile):
colnames = ["chromosome", "start", "end", # "file", "md5"
]
- with as_handle(infile, 'rU') as handle:
+ with as_handle(infile, 'r') as handle:
rows = _parse_lines(handle)
return pd.DataFrame.from_records(rows, columns=colnames)
=====================================
skgenome/tabio/textcoord.py
=====================================
@@ -13,7 +13,7 @@ def read_text(infile):
Coordinate indexing is assumed to be from 1.
"""
parse_line = report_bad_line(from_label)
- with as_handle(infile, 'rU') as handle:
+ with as_handle(infile, 'r') as handle:
rows = [parse_line(line) for line in handle]
table = pd.DataFrame.from_records(rows, columns=["chromosome", "start",
"end", "gene"])
=====================================
skgenome/tabio/vcfsimple.py
=====================================
@@ -16,7 +16,7 @@ def read_vcf_simple(infile):
# ENH: Make all readers return a tuple (header_string, body_table)
# ENH: usecols -- need to trim dtypes dict to match?
header_lines = []
- with as_handle(infile, 'rU') as handle:
+ with as_handle(infile, 'r') as handle:
for line in handle:
if line.startswith('##'):
header_lines.append(line)
View it on GitLab: https://salsa.debian.org/med-team/cnvkit/-/compare/5f44a410acf44bbaa26b3fa980f975c33b0fa313...2ee1721d6565c8fba2fa4575d64ae2e0417c8a3e
--
View it on GitLab: https://salsa.debian.org/med-team/cnvkit/-/compare/5f44a410acf44bbaa26b3fa980f975c33b0fa313...2ee1721d6565c8fba2fa4575d64ae2e0417c8a3e
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20210913/36312858/attachment-0001.htm>
More information about the debian-med-commit
mailing list