[med-svn] [Git][med-team/mosdepth][upstream] New upstream version 0.3.10+ds
Andreas Tille (@tille)
gitlab@salsa.debian.org
Fri Dec 13 15:24:44 GMT 2024
Andreas Tille pushed to branch upstream at Debian Med / mosdepth
Commits:
0ba1c572 by Andreas Tille at 2024-12-13T16:05:21+01:00
New upstream version 0.3.10+ds
- - - - -
8 changed files:
- + .github/FUNDING.yml
- + .github/workflows/build.yml
- CHANGES.md
- README.md
- depthstat.nim
- functional-tests.sh
- mosdepth.nim
- mosdepth.nimble
Changes:
=====================================
.github/FUNDING.yml
=====================================
@@ -0,0 +1,13 @@
+# These are supported funding model platforms
+
+github: [brentp]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
=====================================
.github/workflows/build.yml
=====================================
@@ -0,0 +1,126 @@
+# copied from Daniel Cook's Seq collection
+name: Build
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ branches:
+ - master
+
+jobs:
+ build:
+
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-20.04, macos-13]
+ version:
+ - 1.6.18
+ - 2.0.2
+
+
+ steps:
+ - uses: actions/checkout@v4
+
+ # Caching
+ - name: Cache choosenim
+ id: cache-choosenim
+ uses: actions/cache@v4
+ with:
+ path: ~/.choosenim
+ key: ${{ runner.os }}-choosenim-stable
+
+ - name: Cache nimble
+ id: cache-nimble
+ uses: actions/cache@v4
+ with:
+ path: ~/.nimble
+ key: ${{ runner.os }}-nimble-stable
+
+ - name: Cache htslib
+ id: cache-htslib
+ uses: actions/cache@v4
+ with:
+ path: $HOME/htslib
+ key: ${{ runner.os }}-htslib-1.10
+
+ # Install Dependencies
+ - name: Install dependencies (Linux)
+ if: runner.os == 'Linux'
+ run: |
+ sudo apt-get update
+ sudo apt-get -qy install bwa make build-essential cmake libncurses-dev ncurses-dev libbz2-dev lzma-dev liblzma-dev \
+ curl libssl-dev libtool autoconf automake libcurl4-openssl-dev zlib1g-dev
+
+ # Setup htslib
+ - name: Install htslib (linux)
+ if: runner.os == 'Linux'
+ run: |
+ cd
+ git clone -b 1.18 --recursive https://github.com/samtools/htslib.git
+ cd htslib && autoheader && autoconf && ./configure --enable-libcurl
+ sudo make -j 4 install
+ sudo ldconfig
+ #echo "::set-env name=LD_LIBRARY_PATH::${LD_LIBRARY_PATH}:${HOME}/htslib"
+ #ls -lh $HOME/htslib/*.so
+
+ - name: Install hstlib (macos)
+ if: runner.os == 'macOS'
+ run: |
+ brew install htslib
+
+ - name: Set DYLD_LIBRARY_PATH
+ if: runner.os == 'macOS'
+ run: |
+ echo "DYLD_LIBRARY_PATH=$(brew --prefix htslib)/lib" >> $GITHUB_ENV
+
+ - name: Install d4
+ run: |
+ #export HTSLIB=system
+ git clone https://github.com/38/d4-format
+ echo 'location' >> ~/.curlrc # https://github.com/38/d4-format/pull/77#issuecomment-2044438359
+ cd d4-format
+ cargo build --release --all-features --package=d4binding
+ sudo cp target/release/libd4binding.* /usr/local/lib
+ sudo cp d4binding/include/d4.h /usr/local/include/
+ sudo ldconfig || true
+
+
+ - uses: iffy/install-nim@v5
+ with:
+ version: ${{ matrix.version }}
+
+ - name: Rust Toolchain
+ uses: dtolnay/rust-toolchain@nightly
+ with:
+ toolchain: stable
+
+ # Build and Test
+ - name: Build test executable
+ run: nimble build -Y mosdepth.nimble
+
+ - name: "Copy binary"
+ run: chmod +x mosdepth && mkdir bin && cp mosdepth bin/mosdepth_debug_${{ matrix.os }}
+
+ - name: "Build and Copy release binary"
+ run: nim c --mm:refc -d:danger -d:release -o:bin/mosdepth_${{ matrix.os }} mosdepth
+
+ - name: Functional Tests
+ env:
+ TERM: "xterm"
+ run: |
+ bash ./functional-tests.sh
+
+ - name: Unit Tests
+ run: |
+ nim c -r tests/all.nim
+
+ - name: Upload Artifact
+ if: success()
+ uses: actions/upload-artifact@v4
+ with:
+ name: mosdepth_${{ matrix.os }}_nim${{ matrix.version }}_executable
+ path: bin/
=====================================
CHANGES.md
=====================================
@@ -1,3 +1,19 @@
+v0.3.10
+=======
++ write sfi index in d4 files (#243)
+
+v0.3.9
+======
++ fix d4 output (#237)
+
+v0.3.8
+======
++ mosdepth is now much faster on bams/crams with a large number of contigs (#229)
+
+v0.3.7
+======
++ support CRAM v3.1. only updates htslib that binary is built with to v1.19.1 (#224)
+
v0.3.6
======
+ allow filtering on fragment length thanks @LudvigOlsen for implementing! (#214)
=====================================
README.md
=====================================
@@ -4,9 +4,7 @@ fast BAM/CRAM depth calculation for **WGS**, **exome**, or **targeted sequencing
![logo](https://user-images.githubusercontent.com/1739/29678184-da1f384c-88ba-11e7-9d98-df4fe3a59924.png "logo")
-[![Build](https://github.com/brentp/mosdepth/workflows/Build/badge.svg?branch=master)](https://github.com/brentp/mosdepth/actions?query=workflow%3ABuild)
-
-[![citation](https://img.shields.io/badge/cite-open%20access-orange.svg)](https://academic.oup.com/bioinformatics/article/doi/10.1093/bioinformatics/btx699/4583630?guestAccessKey=35b55064-4566-4ab3-a769-32916fa1c6e6)
+[![Build](https://github.com/brentp/mosdepth/actions/workflows/build.yml/badge.svg)](https://github.com/brentp/mosdepth/actions/workflows/build.yml) [![citation](https://img.shields.io/badge/cite-open%20access-orange.svg)](https://academic.oup.com/bioinformatics/article/doi/10.1093/bioinformatics/btx699/4583630?guestAccessKey=35b55064-4566-4ab3-a769-32916fa1c6e6)
`mosdepth` can output:
@@ -25,7 +23,7 @@ when appropriate, the output files are bgzipped and indexed for ease of use.
## usage
```
-mosdepth 0.3.4
+mosdepth 0.3.9
Usage: mosdepth [options] <prefix> <BAM-or-CRAM>
@@ -33,8 +31,7 @@ Arguments:
<prefix> outputs: `{prefix}.mosdepth.global.dist.txt`
`{prefix}.mosdepth.summary.txt`
- `{prefix}.mosdepth.region.dist.txt` (if --by is specified)
- `{prefix}.per-base.bed.gz|per-base.d4` (unless -n/--no-per-base is specified)
+ `{prefix}.per-base.bed.gz` (unless -n/--no-per-base is specified)
`{prefix}.regions.bed.gz` (if --by is specified)
`{prefix}.quantized.bed.gz` (if --quantize is specified)
`{prefix}.thresholds.bed.gz` (if --thresholds is specified)
@@ -43,29 +40,29 @@ Arguments:
Common Options:
- -t --threads <threads> number of BAM decompression threads. (use 4 or fewer) [default: 0]
+ -t --threads <threads> number of BAM decompression threads [default: 0]
-c --chrom <chrom> chromosome to restrict depth calculation.
-b --by <bed|window> optional BED file or (integer) window-sizes.
-n --no-per-base dont output per-base depth. skipping this output will speed execution
substantially. prefer quantized or thresholded values if possible.
- -f --fasta <fasta> fasta file for use with CRAM files.
- --d4 output per-base depth in d4 format. This is much faster.
-
+ -f --fasta <fasta> fasta file for use with CRAM files [default: ].
+ --d4 output per-base depth in d4 format.
Other options:
- -F --flag <FLAG> exclude reads with any of the bits in FLAG set [default: 1796]
- -i --include-flag <FLAG> only include reads with any of the bits in FLAG set. default is unset. [default: 0]
- -x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases).
- -q --quantize <segments> write quantized output see docs for description.
- -Q --mapq <mapq> mapping quality threshold. reads with a mapping quality less than this are ignored [default: 0]
- -T --thresholds <thresholds> for each interval in --by, write number of bases covered by at
- least threshold bases. Specify multiple integer values separated
- by ','.
- -m --use-median output median of each region (in --by) instead of mean.
- -R --read-groups <string> only calculate depth for these comma-separated read groups IDs.
- -h --help show help
-
+ -F --flag <FLAG> exclude reads with any of the bits in FLAG set [default: 1796]
+ -i --include-flag <FLAG> only include reads with any of the bits in FLAG set. default is unset. [default: 0]
+ -x --fast-mode dont look at internal cigar operations or correct mate overlaps (recommended for most use-cases).
+ -q --quantize <segments> write quantized output see docs for description.
+ -Q --mapq <mapq> mapping quality threshold. reads with a quality less than this value are ignored [default: 0]
+ -l --min-frag-len <min-frag-len> minimum insert size. reads with a smaller insert size than this are ignored [default: -1]
+ -u --max-frag-len <max-frag-len> maximum insert size. reads with a larger insert size than this are ignored. [default: -1]
+ -T --thresholds <thresholds> for each interval in --by, write number of bases covered by at
+ least threshold bases. Specify multiple integer values separated
+ by ','.
+ -m --use-median output median of each region (in --by) instead of mean.
+ -R --read-groups <string> only calculate depth for these comma-separated read groups IDs.
+ -h --help show help
```
If you use mosdepth please cite [the publication in bioinformatics](https://academic.oup.com/bioinformatics/article/doi/10.1093/bioinformatics/btx699/4583630?guestAccessKey=35b55064-4566-4ab3-a769-32916fa1c6e6)
=====================================
depthstat.nim
=====================================
@@ -15,12 +15,10 @@ proc initCountStat*[T](size:int=32768): CountStat[T] =
proc add*[T](c: var CountStat, value: T) {.inline.} =
c.n.inc
- if value.int > c.counts.high.int:
- c.counts[c.counts.high].inc
- elif value < 0:
+ if value < 0:
raise newException(IndexDefect, "error setting negative depth value:" & $value)
else:
- c.counts[value].inc
+ c.counts[min(c.counts.high.T, value)].inc
proc median*[T](c: CountStat[T]): int {.inline.} =
var stop_n = int(0.5 + c.n.float64 * 0.5)
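
The depthstat.nim hunk above simplifies `add`: instead of a dedicated overflow branch, any depth beyond the counter range is clamped into the top bucket via `min(c.counts.high.T, value)`, while negative depths still raise. A minimal standalone sketch of that clamping behaviour (simplified types and a tiny bucket count for illustration, not the mosdepth source):

```
type CountStat[T] = object
  counts: seq[int]
  n: int

proc initCountStat[T](size: int = 32768): CountStat[T] =
  CountStat[T](counts: newSeq[int](size))

proc add[T](c: var CountStat[T], value: T) =
  c.n.inc
  if value < 0:
    raise newException(IndexDefect, "error setting negative depth value:" & $value)
  else:
    c.counts[min(c.counts.high.T, value)].inc

when isMainModule:
  var cs = initCountStat[int](8)   # 8 buckets instead of 32768, for illustration
  cs.add(3)                        # counted in bucket 3
  cs.add(1_000)                    # out of range: clamped into the top bucket (index 7)
  doAssert cs.counts[3] == 1
  doAssert cs.counts[7] == 1
```
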
=====================================
functional-tests.sh
=====================================
@@ -12,8 +12,8 @@ set -o nounset
set -e
-nim c --boundChecks:on -x:on mosdepth.nim
-nim c -r tests/funcs.nim
+nim c --boundChecks:on -x:on -d:useSysAssert -d:useGcAssert --lineDir:on --debuginfo --mm:refc mosdepth.nim
+nim c -x:on --mm:refc -d:useSysAssert -d:useGcAssert --lineDir:on --debuginfo -r tests/funcs.nim
set +e
exe=./mosdepth
bam=/data/human/NA12878.subset.bam
@@ -152,7 +152,7 @@ rm -f t.*
#regions and global should not be the same when -b is specified
run regions $exe t_regions -n -b 100 tests/empty-tids.bam
-assert_equal $(diff -u t_regions.mosdepth.region.dist.txt t_regions.mosdepth.global.dist.txt | wc -l) 1447
+assert_equal $(diff -u t_regions.mosdepth.region.dist.txt t_regions.mosdepth.global.dist.txt | wc -l) 1436
rm -f t_regions.*
run overlappingPairs $exe t tests/overlapping-pairs.bam
=====================================
mosdepth.nim
=====================================
@@ -180,7 +180,7 @@ iterator regions(bam: hts.Bam, region: region_t, tid: int, targets: seq[hts.Targ
proc bed_line_to_region(line: string): region_t =
var
- cse = line.strip().split('\t', 5)
+ cse = line.split('\t', 5)
if len(cse) < 3:
stderr.write_line("[mosdepth] skipping bad bed line:", line.strip())
@@ -191,7 +191,7 @@ proc bed_line_to_region(line: string): region_t =
reg = region_t(chrom: cse[0], start: uint32(s), stop: uint32(e))
doAssert s <= e, "[slivar] ERROR: start > end in bed line:" & line
if len(cse) > 3:
- reg.name = cse[3]
+ reg.name = cse[3].strip()
return reg
proc region_line_to_region*(region: string): region_t =
@@ -212,10 +212,14 @@ proc region_line_to_region*(region: string): region_t =
result.chrom = w
inc(i)
-proc get_tid(tgts: seq[hts.Target], chrom: string): int =
+proc get_tid(tgts: seq[hts.Target], chrom: string, last_tid:var int): int =
+ if tgts[last_tid + 1].name == chrom:
+ last_tid = tgts[last_tid + 1].tid
+ return last_tid
for t in tgts:
if t.name == chrom:
- return t.tid
+ last_tid = t.tid
+ return last_tid
proc init(arr: var coverage_t, tlen:int) =
## try to re-use the array.
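
The `get_tid` rewrite above is the heart of the 0.3.8 speed-up for BAMs/CRAMs with a huge number of contigs: a `last_tid` cursor is threaded through so that, when contigs are processed in header order, the next chromosome is found at index `last_tid + 1` without re-scanning the whole target list. A self-contained sketch of the idea, using a hypothetical `Target` type and adding a bounds check plus an explicit not-found return that the hunk does not show:

```
type Target = object   # hypothetical stand-in for hts-nim's Target
  name: string
  tid: int

proc get_tid(tgts: seq[Target], chrom: string, last_tid: var int): int =
  # fast path: contigs usually arrive in header order, so the next one
  # is at last_tid + 1 (this sketch adds a bounds check the hunk omits)
  if last_tid + 1 < tgts.len and tgts[last_tid + 1].name == chrom:
    last_tid = tgts[last_tid + 1].tid
    return last_tid
  for t in tgts:        # slow path: full scan of the header targets
    if t.name == chrom:
      last_tid = t.tid
      return last_tid
  return -1             # not in the header (explicit here for the sketch)

when isMainModule:
  let targets = @[Target(name: "chr1", tid: 0), Target(name: "chr2", tid: 1)]
  var last = -1
  doAssert get_tid(targets, "chr1", last) == 0   # fast path: last = -1, so index 0
  doAssert get_tid(targets, "chr2", last) == 1   # fast path again
```
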
@@ -229,19 +233,18 @@ proc init(arr: var coverage_t, tlen:int) =
arr.set_len(int(tlen))
zeroMem(arr[0].addr, len(arr) * sizeof(arr[0]))
-proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t, mapq:int= -1, min_len:int= -1, max_len:int=int.high, eflag: uint16=1796, iflag:uint16=0, read_groups:seq[string]=(@[]), fast_mode:bool=false): int =
+proc coverage(bam: hts.Bam, arr: var coverage_t, region: var region_t, targets: seq[Target], mapq:int= -1, min_len:int= -1, max_len:int=int.high, eflag: uint16=1796, iflag:uint16=0, read_groups:seq[string]=(@[]), fast_mode:bool=false, last_tid: var int = -1): int =
# depth updates arr in-place and yields the tid for each chrom.
# returns -1 if the chrom is not found in the bam header
# returns -2 if the chrom was found in the header, but there was no data for it
# otherwise returns the tid.
var
- targets = bam.hdr.targets
tgt: hts.Target
mate: Record
seen = newTable[string, Record]()
has_read_groups = read_groups.len > 0
- var tid = if region != nil: get_tid(targets, region.chrom) else: -1
+ var tid = if region != nil: get_tid(targets, region.chrom, last_tid) else: -1
if tid == -1:
return -1
@@ -335,8 +338,7 @@ proc bed_to_table(bed: string): TableRef[string, seq[region_t]] =
continue
var v = bed_line_to_region($kstr.s)
if v == nil: continue
- discard bed_regions.hasKeyOrPut(v.chrom, new_seq[region_t]())
- bed_regions[v.chrom].add(v)
+ bed_regions.mgetOrPut(v.chrom, new_seq[region_t]()).add(v)
# since it is read into mem, can also well sort.
for chrom, ivs in bed_regions.mpairs:
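
The `bed_to_table` hunk above replaces a `hasKeyOrPut` call followed by an index lookup with a single `mgetOrPut`, which inserts the default value when the key is missing and returns a mutable reference either way, so each BED line costs one table lookup instead of two. A small illustration of the idiom (the keys and values are made up, not mosdepth's region_t data):

```
import std/tables

var regionsByChrom = newTable[string, seq[string]]()   # made-up example data

# one lookup per line: insert an empty seq if the chrom is new,
# then append to the returned mutable value in place
regionsByChrom.mgetOrPut("chr1", newSeq[string]()).add("chr1:100-200")
regionsByChrom.mgetOrPut("chr1", newSeq[string]()).add("chr1:300-400")

doAssert regionsByChrom["chr1"].len == 2
```
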
@@ -414,7 +416,7 @@ proc write_distribution(chrom: string, d: var seq[int64], fh:File) =
if irev > 300 and v == 0: continue
cum += float64(v) / float64(sum)
if cum < 8e-5: continue
- fh.write_line(chrom, "\t", $irev & "\t" & su.format_float(cum, ffDecimal, precision=precision))
+ fh.write_line(chrom, "\t", irev, "\t", su.format_float(cum, ffDecimal, precision=precision))
# reverse it back because we use to update the full genome
reverse(d)
@@ -456,8 +458,8 @@ proc sum_into(afrom: seq[int64], ato: var seq[int64]) =
for i in b..ato.high:
ato[i] = 0
- for i in 0..afrom.high:
- ato[i] += afrom[i]
+ for i, d in afrom:
+ ato[i] += d
proc get_quantize_args*(qa: string) : seq[int] =
if qa == "nil":
@@ -502,7 +504,8 @@ proc write_thresholds(fh:BGZI, tid:int, arr:var coverage_t, thresholds:seq[int],
return
var counts = new_seq[int](len(thresholds))
- shallow(arr)
+ when NimMajor < 2:
+ shallow(arr)
# iterate over the region and count bases >= request cutoffs.
for v in arr[start..<stop]:
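
The `when NimMajor < 2: shallow(arr)` guard above (the same guard appears again in `main` below) keeps the source compiling on both Nim 1.x and 2.x: `shallow` is a refc-GC hint that stops a seq from being deep-copied on assignment, and the guard compiles the call only for Nim 1.x compilers, where it is still available. A trivial sketch of the same compile-time guard:

```
var depths = newSeq[int32](1_000)

when NimMajor < 2:
  # refc-only hint: mark the seq so assignments don't deep-copy it;
  # this branch simply isn't compiled under Nim 2.x
  shallow(depths)

echo depths.len
```
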
@@ -584,15 +587,16 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, min_len: int, max_len: int,
global_region_stat.clear()
global_stat.clear()
- var region_distribution = new_seq[int64](1000)
- var global_distribution = new_seq[int64](1000)
+ var region_distribution = new_seq[int64](512)
+ var global_distribution = new_seq[int64](512)
if $args["--read-groups"] != "nil":
for r in ($args["--read-groups"]).split(','):
read_groups.add($r)
var levels = get_min_levels(targets)
- var chrom_region_distribution, chrom_global_distribution: seq[int64]
+ var chrom_region_distribution = newSeq[int64](region_distribution.len)
+ var chrom_global_distribution = newSeq[int64](global_distribution.len)
if not skip_per_base:
# can't use set-threads when indexing on the fly so this must
@@ -627,19 +631,26 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, min_len: int, max_len: int,
window = uint32(S.parse_int(region))
else:
bed_regions = bed_to_table(region)
- shallow(arr)
+ when NimMajor < 2:
+ shallow(arr)
var cs = initCountStat[uint32](size=if use_median: 65536 else: 0)
+ var ii = 0
+ var last_tid = -1
+
for target in sub_targets:
- chrom_global_distribution = new_seq[int64](1000)
+ if ii > 0 and ii mod 100_000 == 0:
+ stderr.write_line("[mosdepth] on contig:", ii, ". percent of contigs completed:", su.format_float(100 * ii/sub_targets.len, ffDecimal, precision=2))
+ ii += 1
+ zeroMem(chrom_global_distribution[0].addr, len(chrom_global_distribution) * sizeof(chrom_global_distribution[0]))
if region != "":
- chrom_region_distribution = new_seq[int64](1000)
+ zeroMem(chrom_region_distribution[0].addr, len(chrom_region_distribution) * sizeof(chrom_region_distribution[0]))
# if we can skip per base and there's no regions from this chrom we can avoid coverage calc.
if skip_per_base and thresholds.len == 0 and quantize.len == 0 and bed_regions != nil and not bed_regions.contains(target.name):
continue
rchrom = region_t(chrom: target.name)
- var tid = coverage(bam, arr, rchrom, mapq, min_len, max_len, eflag, iflag, read_groups=read_groups, fast_mode=fast_mode)
+ var tid = coverage(bam, arr, rchrom, targets, mapq, min_len, max_len, eflag, iflag, read_groups=read_groups, fast_mode=fast_mode, last_tid=last_tid)
if tid == -1: continue # -1 means that chrom is not even in the bam
if tid != -2: # -2 means there were no reads in the bam
arr.to_coverage()
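
Two related changes in the main-loop hunk above target inputs with very many contigs: a progress line is printed every 100,000 contigs, and the per-contig distribution buffers are no longer reallocated for each contig but cleared in place with `zeroMem` (and shrunk from 1000 to 512 entries). A minimal sketch of the clear-in-place pattern, with hypothetical names:

```
var chromDist = newSeq[int64](512)   # hypothetical per-contig buffer

proc clearInPlace(d: var seq[int64]) =
  # keep the allocation, wipe the contents
  zeroMem(d[0].addr, d.len * sizeof(d[0]))

chromDist[10] = 42
clearInPlace(chromDist)
doAssert chromDist[10] == 0
```
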
@@ -667,6 +678,7 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, min_len: int, max_len: int,
chrom_region_distribution.inc(arr, r.start, r.stop)
write_thresholds(fthresholds, tid, arr, thresholds, r)
+
if tid != -2:
chrom_global_distribution.inc(arr, uint32(0), uint32(len(arr) - 1))
chrom_stat = newDepthStat(arr[0..<len(arr)-1])
@@ -750,6 +762,7 @@ proc main(bam: hts.Bam, chrom: region_t, mapq: int, min_len: int, max_len: int,
when defined(d4):
if use_d4:
fd4.close
+ doAssert index_build_sfi(prefix & ".per-base.d4")
if fbase != nil and close(fbase) != 0:
stderr.write_line("[mosdepth] error writing per-base file\n")
@@ -782,7 +795,7 @@ when(isMainModule):
when not defined(release) and not defined(lto):
stderr.write_line "[mosdepth] WARNING: built in debug mode; will be slow"
- let version = "mosdepth 0.3.6"
+ let version = "mosdepth 0.3.10"
let env_fasta = getEnv("REF_PATH")
var doc = format("""
$version
@@ -791,7 +804,7 @@ when(isMainModule):
Arguments:
- <prefix> outputs: `{prefix}.mosdepth.dist.txt`
+ <prefix> outputs: `{prefix}.mosdepth.global.dist.txt`
`{prefix}.mosdepth.summary.txt`
`{prefix}.per-base.bed.gz` (unless -n/--no-per-base is specified)
`{prefix}.regions.bed.gz` (if --by is specified)
=====================================
mosdepth.nimble
=====================================
@@ -1,13 +1,13 @@
# Package
-version = "0.3.6"
+version = "0.3.10"
author = "Brent Pedersen"
description = "fast depth"
license = "MIT"
# Dependencies
-requires "hts >= 0.3.22", "docopt == 0.7.0", "nim >= 1.0.0", "https://github.com/brentp/d4-nim >= 0.0.3"
+requires "hts >= 0.3.22", "docopt == 0.7.1", "nim >= 1.0.0", "https://github.com/brentp/d4-nim >= 0.0.5"
bin = @["mosdepth"]
skipDirs = @["tests"]
View it on GitLab: https://salsa.debian.org/med-team/mosdepth/-/commit/0ba1c572ff9b8df67fd5a3786aeea97b00dca570