[med-svn] [khmer] 01/02: Imported Upstream version 2.0~rc3+dfsg
Michael Crusoe
misterc-guest at moszumanska.debian.org
Wed Aug 12 03:11:24 UTC 2015
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to branch master
in repository khmer.
commit 7483ff8d0a0efb83da513a3dc90f5fc8de9358f1
Author: Michael R. Crusoe <crusoe at ucdavis.edu>
Date: Tue Aug 11 20:03:16 2015 -0700
Imported Upstream version 2.0~rc3+dfsg
---
.gitignore | 2 +
ChangeLog | 160 +++-
Makefile | 14 +-
TODO | 10 -
doc/dev/coding-guidelines-and-review.rst | 36 +-
doc/dev/scripts-and-sandbox.rst | 3 +-
doc/user/scripts.rst | 3 -
doc/whats-new-2.0.rst | 32 +-
khmer/__init__.py | 130 +--
khmer/_khmer.cc | 361 +++-----
khmer/_version.py | 4 +-
khmer/kfile.py | 53 +-
khmer/khmer_args.py | 369 +++++---
khmer/khmer_logger.py | 53 ++
khmer/thread_utils.py | 12 +-
khmer/utils.py | 15 +-
lib/Makefile | 4 +-
lib/counting.cc | 75 +-
lib/counting.hh | 46 +-
lib/hashbits.cc | 122 +--
lib/hashbits.hh | 98 +--
lib/hashtable.cc | 450 ++--------
lib/hashtable.hh | 171 +---
lib/hllcounter.cc | 19 +-
lib/hllcounter.hh | 2 +
lib/khmer.hh | 4 +
lib/kmer_hash.cc | 67 ++
lib/kmer_hash.hh | 211 +++++
lib/labelhash.cc | 2 +-
lib/read_aligner.cc | 76 +-
lib/read_parsers.cc | 15 +
lib/read_parsers.hh | 2 +
lib/subset.cc | 483 +++-------
lib/subset.hh | 18 +-
lib/traversal.cc | 118 +++
lib/traversal.hh | 56 ++
oxli/__init__.py | 6 +-
oxli/build_graph.py | 44 +-
oxli/functions.py | 135 ---
sandbox/README.rst | 6 +-
sandbox/assembly-diff-2.py | 2 +-
sandbox/assembly-diff.py | 4 +-
sandbox/bloom-count-intersection.py | 61 --
sandbox/bloom-count.py | 2 +-
sandbox/build-sparse-graph.py | 2 +-
sandbox/calc-error-profile.py | 2 +-
sandbox/calc-median-distribution.py | 2 +-
sandbox/collect-reads.py | 16 +-
sandbox/collect-variants.py | 4 +-
sandbox/correct-reads.py | 30 +-
sandbox/count-kmers-single.py | 26 +-
sandbox/count-kmers.py | 24 +-
sandbox/error-correct-pass2.py | 2 +-
sandbox/estimate_optimal_hash.py | 3 +-
sandbox/fasta-to-abundance-hist.py | 2 +-
sandbox/filter-below-abund.py | 2 +-
sandbox/filter-median-and-pct.py | 2 +-
sandbox/filter-median.py | 2 +-
sandbox/find-high-abund-kmers.py | 2 +-
sandbox/graph-size.py | 2 +-
sandbox/hi-lo-abundance-by-position.py | 2 +-
sandbox/multi-rename.py | 7 +-
sandbox/normalize-by-median-pct.py | 4 +-
sandbox/optimal_args_hashbits.py | 20 +-
sandbox/print-stoptags.py | 2 +-
sandbox/print-tagset.py | 2 +-
sandbox/readaligner_pairhmm_train.py | 2 +-
sandbox/saturate-by-median.py | 34 +-
sandbox/slice-reads-by-coverage.py | 4 +-
sandbox/stoptag-abundance-hist.py | 2 +-
sandbox/stoptags-by-position.py | 2 +-
sandbox/subset-report.py | 2 +-
sandbox/sweep-files.py | 8 +-
sandbox/sweep-out-reads-with-contigs.py | 4 +-
sandbox/sweep-reads.py | 8 +-
sandbox/sweep-reads2.py | 6 +-
sandbox/sweep-reads3.py | 6 +-
scripts/abundance-dist-single.py | 42 +-
scripts/abundance-dist.py | 39 +-
scripts/annotate-partitions.py | 6 +-
scripts/count-median.py | 20 +-
scripts/count-overlap.py | 91 --
scripts/do-partition.py | 43 +-
scripts/extract-long-sequences.py | 8 +-
scripts/extract-paired-reads.py | 31 +-
scripts/extract-partitions.py | 17 +-
scripts/fastq-to-fasta.py | 22 +-
scripts/filter-abund-single.py | 50 +-
scripts/filter-abund.py | 47 +-
scripts/filter-stoptags.py | 6 +-
scripts/find-knots.py | 34 +-
scripts/galaxy/README.txt | 5 -
scripts/galaxy/abundance-dist-single.xml | 98 ---
scripts/galaxy/abundance-dist.xml | 68 --
scripts/galaxy/count-median.xml | 58 --
scripts/galaxy/do-partition.xml | 107 ---
scripts/galaxy/extract-partitions.xml | 77 --
scripts/galaxy/filter-abund.xml | 88 --
scripts/galaxy/filter-below-abund.py | 1 -
scripts/galaxy/filter-below-abund.xml | 65 --
scripts/galaxy/gedlab.py | 22 -
scripts/galaxy/macros.xml | 160 ----
scripts/galaxy/normalize-by-median.xml | 132 ---
scripts/galaxy/test-data/random-20-a.fa.part | 1 -
scripts/galaxy/test-data/test-abund-read-2.ct | Bin 20000150 -> 0 bytes
scripts/galaxy/test-data/test-abund-read-2.ct.info | 2 -
scripts/galaxy/test-data/test-abund-read-2.fa | 1 -
.../test-data/test-abund-read-2.nobigcount.ct | Bin 20000130 -> 0 bytes
.../test-data/test-abund-read-2.nobigcount.ct.info | 2 -
scripts/galaxy/test-data/test-abund-read-paired.fa | 1 -
scripts/galaxy/tool_dependencies.xml | 10 -
scripts/interleave-reads.py | 23 +-
scripts/load-graph.py | 7 +-
scripts/load-into-counting.py | 41 +-
scripts/make-initial-stoptags.py | 31 +-
scripts/merge-partitions.py | 6 +-
scripts/normalize-by-median.py | 143 ++-
scripts/partition-graph.py | 38 +-
scripts/sample-reads-randomly.py | 23 +-
scripts/split-paired-reads.py | 112 +--
scripts/trim-low-abund.py | 52 +-
scripts/unique-kmers.py | 10 +-
setup.py | 15 +-
tests/khmer_tst_utils.py | 7 +-
tests/test-data/multi-output.fa | 4 +
tests/test-data/normC20k20.ct | Bin 40144 -> 3999942 bytes
tests/test-data/overlap.out | 7 -
tests/test-data/paired-broken.fq.badleft | 8 +
tests/test-data/paired-broken.fq.badright | 9 +
tests/test-data/paired-broken.fq.paired_bad | 16 +
tests/test-data/test-multi.fa | 2 +
.../{test_counting_hash.py => test_countgraph.py} | 441 +++++-----
tests/test_counting_single.py | 100 +--
tests/test_filter.py | 2 +-
tests/test_functions.py | 44 +-
tests/test_graph.py | 26 +-
tests/test_hashbits.py | 905 -------------------
tests/test_labelhash.py | 58 +-
tests/test_lump.py | 20 +-
tests/test_nodegraph.py | 977 +++++++++++++++++++++
tests/test_normalize_by_median.py | 79 +-
tests/test_oxli_functions.py | 28 +-
tests/test_read_aligner.py | 40 +-
tests/test_sandbox_scripts.py | 12 +
tests/test_script_arguments.py | 39 +-
tests/test_scripts.py | 555 ++++++++++--
tests/test_streaming_io.py | 73 +-
tests/test_subset_graph.py | 60 +-
148 files changed, 4210 insertions(+), 4951 deletions(-)
diff --git a/.gitignore b/.gitignore
index c0abc60..7ac5abd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,8 @@
*.swp
*.egg-info
*.egg
+.tags*
+cscope*
*~
build
dist
diff --git a/ChangeLog b/ChangeLog
index cd69867..22f8148 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,7 +1,165 @@
+2015-08-10 Camille Scott <camille.scott.w at gmail.com>
+
+ * lib/traversal.{cc,hh}: Add new files with unified traversal machinery.
+ Introduce Traverser class to handle finding neighbors and appropriate
+ bitmasking.
+ * khmer/_khmer.cc,lib/{counting,hashbits,hashtable,labelhash,subset}.{cc,hh}:
+ Updated relevant instances of HashIntoType and KMerIterator to use new Kmer
+ and KmerIterator, respectively.
+ * lib/Makefile: Add -std=c++11 flag.
+ * Makefile: Update -std=c++11 flag in libtest target.
+ * lib/hashtable.{cc,hh}: Update calc_connected_graph_size to use Traverser.
+ Change kmer_degree to use functions from traversal.cc. Remove redundant
+ count_kmers_with_radius in favor of calc_connected_graph_size. Update
+ traverse_from_kmer to use Traverser. Hashtable subclasses KmerFactory.
+ * lib/{hashtable.hh,kmer_hash.{cc,hh}}: Move KmerIterator from hashtable.hh
+ to kmer_hash.{cc,hh}. Add Kmer class to store forward, reverse, and
+ uniqified integer representations of k-mers, and to handle string
+ conversion. Update KmerIterator to emit objects of type Kmer and to subclass
+ KmerFactory; add doxygen markup.
+ * lib/khmer.hh: Forward declare Kmer and typedef new Kmer data structures.
+ * lib/subset.{cc,hh}: Move constructor definition to .cc file. Remove
+ queue_neighbors in favor of new traversal machinery. Update find_all_tags,
+ sweep_for_tags, and find_all_tags_truncate_on_abundance to use Traverser.
+ * setup.py: Add traversal.{cc,hh} to deps.
+
+2015-08-10 Luiz Irber <khmer at luizirber.org>
+
+ * scripts/unique-kmers.py: use consume_fasta again.
+ * khmer/_khmer.cc: expose output_records option on HLLCounter consume_fasta.
+ * lib/hllcounter.{cc,hh}: implement output_records option in consume_fasta.
+ * lib/read_parsers.{cc,hh}: add Read method write_to, useful for writing
+ the read to an output stream.
+ * doc/whats-new-2.0.rst: Add unique-kmers description.
+
+2015-08-09 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * khmer/khmer_args.py: pep8
+ * scripts/{interleave-reads,load-graph}.py: Removed unreachable code
+ * tests/test-data/{paired-broken.fq.badleft,paired-broken.fq.badright,
+ paired-broken.fq.paired.bad}: added test data files
+ * tests/{test_normalize_by_median,test_scripts}.py: added tests
+
+2015-08-07 Titus Brown <titus at idyll.org>
+
+ * khmer/_khmer.cc,lib/hashbits.{cc,hh}: removed overlap functionality;
+ eliminated n_entries() as redundant with hashsizes(); removed arguments to
+ n_occupied(); removed get_kadian_count.
+ * lib/{hashbits.cc,counting.cc,khmer.hh},tests/test_hashbits.py: updated
+ save/load of countgraph/nodegraph structures to save _n_occupied.
+ * lib/{hashtable.hh,counting.hh,hashbits.hh}: promoted n_occupied() to
+ Hashtable class; fixed CountingHash unique_kmers calculation.
+ * lib/counting.{cc,hh}: removed get_kadian_count() and moved
+ n_unique_kmers(); updated countgraph writing to save n_occupied.
+ * khmer/__init__.py: modified extract_nodegraph_info and
+ extract_countgraph_info to read in & return n_occupied;
+ * sandbox/bloom-count-intersection.py,scripts/count-overlap.py,
+ tests/test-data/overlap.out: removed overlap scripts and test files.
+ * doc/user/scripts.rst: removed count-overlap.py documentation.
+ * tests/test_scripts.py: removed count-overlap.py tests.
+ * sandbox/README.rst: updated with removal of bloom-count-intersection.py.
+ * tests/test-data/normC20k20.ct: updated file contents to reflect new
+ format containing _n_occupied.
+ * tests/test_countgraph.py: removed get_kadian_count tests; added save/load
+ tests.
+ * tests/test_counting_single.py: remove n_entries() tests; replace
+ n_entries() calls with hashsizes() call.
+ * tests/test_functions.py: updated tests for new extract_*_info functions.
+ * tests/test_nodegraph.py: update htable etc. to nodegraph; added a
+ save/load test for n_occupied() on nodegraph.
+ * tests/{test_normalize_by_median,test_scripts}.py: fixed unique kmers
+ tests.
+
+2015-08-07 Michael R. Crusoe <crusoe at ucdavis.edu>
+
+ * scripts/*.py,tests/*.py,sandbox/*.py,khmer/*.py,oxli/*.py:
+ many function and variable renames:
+ counting_hash, countinghash, hashtable->countgraph;
+ CountingHash->Countgraph;
+ hashbits->nodegraph; Hashbits->Nodegraph;
+ check_space_for_hashtable->check_space_for_graph;
+ hash_args->graph_args
+ * khmer/_khmer.cc: remove unused 'new_hashtable' method; match renames
+ * TODO: removed several items
+ * doc/dev/scripts-and-sandbox.rst: fixed hashbang
+
+2015-08-04 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * khmer/khmer_args.py, oxli/functions.py: migrated estimation functions out
+ of oxli and into khmer_args
+ * oxli/build_graph.py, tests/test_oxli_functions.py,
+ sandbox/{estimate_optimal_hash,optimal_args_hashbits}.py,
+ scripts/{normalize-by-median,unique-kmers}.py: changed to not break on
+ location change
+ * tests/{test_normalize_by_median,test_scripts}.py: added tests for
+ automatic arg setting
+ * tests/test_script_arguments.py: changed to play nice with unique_kmers as an
+ argument
+
+2015-08-04 Titus Brown <titus at idyll.org> and Camille Scott
+<camille.scott.w at gmail.com>
+
+ * khmer/utils.py: added UnpairedReadsError exception.
+ * scripts/{extract-paired-reads,split-paired-reads}.py: changed --output-dir
+ argument short form to use '-d'.
+ * scripts/split-paired-reads.py: added -0 <filename> to allow orphans; made
+ '-p'/'--force-paired' the default and removed that option from the script.
+ * scripts/{normalize-by-median,filter-abund,trim-low-abund}.py: changed
+ long form of '-o' to be '--output'.
+ * tests/{test_scripts,test_streaming_io}.py: updated and added tests for
+ new behavior.
+
+2015-08-03 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * doc/dev/coding-guidelines-and-review.rst: added codespell as a possible
+ spelling tool
+
+2015-08-03 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * Makefile: added oxli to pep257 make target, made clean target wipe out all
+ .pyc files in scripts/* and tests/* and oxli/*
+
+2015-08-03 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * tests/test_counting_single.py: removed redundant test
+
+2015-08-01 Jacob Fenton <bocajnotnef at gmail.com> and Titus Brown
+<titus at idyll.org>
+
+ * scripts/normalize-by-median.py,khmer/khmer_logger.py: added logging
+ framework, prototyped in normalize-by-median; added -q/--quiet option.
+ * tests/test_normalize_by_median.py: associated tests.
+ * khmer/khmer_args.py: Made info function use logging functions.
+ * tests/khmer_tst_utils.py: removed info reporting from the 'out' returned
+ by runscript.
+
+2015-08-01 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * khmer/kfile.py: added infrastructure for doing compressed output
+ * khmer/thread_utils.py: switched threaded_sequence_processor to make use of
+ write_record
+ * scripts/{extract-long-sequences,extract-paired-reads,
+ extract-partitions,fastq-to-fasta,filter-abund-single,filter-abund,
+ interleave-reads,normalize-by-median,sample-reads-randomly,
+ split-paired-reads,trim-low-abund}.py: added output compression
+ * tests/{test_functions,test_scripts,test_normalize_by_median}.py: added
+ tests
+ * scripts/{load-graph,partition-graph,find-knots,
+ make-initial-stoptags}.py,oxli/build_graph.py: made load-graph no longer
+ add .pt to graph outfiles, changed partition-graph to not expect .pt's
+ * doc/whats-new-2.0.rst: doc'd changes to load-graph and partition-graph
+ * doc/dev/scripts-and-sandbox.rst: updated scripts/ requirements.
+
+2015-08-01 Sherine Awad <drmahmoud at ucdavis.edu>
+
+ * sandbox/multi-rename.py: updated output of long FASTA sequences to
+ wrap text at 80 characters.
+ * tests/test_sandbox_scripts.py: Added a test for multi-rename.py.
+
2015-07-31 Kevin Murray <spam at kdmurray.id.au>
* lib/Makefile,Makefile,lib/*.pc.in,lib/test-compile.cc: Misc debian-based
- compatiablity changes
+ compatibility changes
* lib/get_version.py: Add crunchbang, chmod +x
2015-07-29 Michael R. Crusoe <crusoe at ucdavis.edu>
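
An aside on the "uniqified" representation mentioned in the traversal entry
above: the new Kmer class stores the forward encoding, the reverse-complement
encoding, and a single canonical value (the lesser of the two), so both
strands of a k-mer map to one integer. A minimal Python sketch of that idea
(an illustration only, not khmer's C++ code; the 2-bit encoding below is an
assumption)::

    ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

    def encode(kmer):
        """Pack a DNA string into an integer, two bits per base."""
        value = 0
        for base in kmer:
            value = (value << 2) | ENCODE[base]
        return value

    def uniqify(kmer):
        """Return one integer shared by a k-mer and its reverse complement."""
        rc = ''.join(COMPLEMENT[base] for base in reversed(kmer))
        return min(encode(kmer), encode(rc))

    assert uniqify('ATGGC') == uniqify('GCCAT')  # same canonical value
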
diff --git a/Makefile b/Makefile
index 9a1d378..6cfe294 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@
# and documentation
# make coverage-report to check coverage of the python scripts by the tests
+SHELL=bash
CPPSOURCES=$(wildcard lib/*.cc lib/*.hh khmer/_khmer.cc)
PYSOURCES=$(wildcard khmer/*.py scripts/*.py)
SOURCES=$(PYSOURCES) $(CPPSOURCES) setup.py
@@ -73,7 +74,7 @@ clean: FORCE
cd lib && ${MAKE} clean || true
cd tests && rm -rf khmertest_* || true
rm -f $(EXTENSION_MODULE)
- rm -f khmer/*.pyc lib/*.pyc
+ rm -f khmer/*.pyc lib/*.pyc scripts/*.pyc tests/*.pyc oxli/*.pyc
./setup.py clean --all || true
rm -f coverage-debug
rm -Rf .coverage
@@ -123,10 +124,10 @@ diff_pep8_report: pep8_report.txt
## pep257 : check Python code style
pep257: $(PYSOURCES) $(wildcard tests/*.py)
pep257 --ignore=D100,D101,D102,D103 \
- setup.py khmer/ scripts/ tests/ || true
+ setup.py khmer/ scripts/ tests/ oxli/ || true
pep257_report.txt: $(PYSOURCES) $(wildcard tests/*.py)
- pep257 setup.py khmer/ scripts/ tests/ \
+ pep257 setup.py khmer/ scripts/ tests/ oxli/ \
> pep257_report.txt 2>&1 || true
diff_pep257_report: pep257_report.txt
@@ -220,9 +221,9 @@ libtest: FORCE
test -f install_target/include/oxli/khmer.hh
test -d install_target/lib
test -f install_target/lib/liboxli.a
- $(CXX) -o install_target/test-prog-static -I install_target/include \
+ $(CXX) -std=c++11 -o install_target/test-prog-static -I install_target/include \
lib/test-compile.cc install_target/lib/liboxli.a
- $(CXX) -o install_target/test-prog-dynamic -I install_target/include \
+ $(CXX) -std=c++11 -o install_target/test-prog-dynamic -I install_target/include \
-L install_target/lib lib/test-compile.cc -loxli
rm -rf install_target
@@ -244,7 +245,7 @@ coverity-build:
then \
export PATH=${PATH}:${cov_analysis_dir}/bin; \
cov-build --dir cov-int --c-coverage gcov --disable-gcov-arg-injection make coverage-debug; \
- cov-capture --dir cov-int --c-coverage gcov python -m nose --attr '!known_failing' ; \
+ cov-capture --dir cov-int --c-coverage gcov python -m nose --attr ${TESTATTR} ; \
cov-import-scm --dir cov-int --scm git 2>/dev/null; \
else echo 'bin/cov-build does not exist in $$cov_analysis_dir: '\
'${cov_analysis_dir}. Skipping coverity scan.'; \
@@ -277,6 +278,7 @@ coverity-configure:
'${cov_analysis_dir}. Skipping coverity configuration.'; \
fi
+# may need to `sudo apt-get install bear`
compile_commands.json: clean
export PATH=$(shell echo $$PATH | sed 's=/usr/lib/ccache:==g') ; \
bear -- ./setup.py build_ext
diff --git a/TODO b/TODO
index 09c5572..eeb5d77 100644
--- a/TODO
+++ b/TODO
@@ -1,8 +1,3 @@
-@@ merge refactor => master
-@@ review site for paper?
-
-auto-memory setting.
-
find-knot speedup:
- too many redundant rounds of partitioning?
@@ -28,17 +23,14 @@ load-counting/bigcount loading is slooooow
----
-screed bzip
screed slice
screed fasta/fastq output
-screed streaming foo
---
fix tests cleanup
pyrex/cython stuff
-script testing
docs!
---
@@ -54,8 +46,6 @@ fix tests and test cases to properly isolate/remove temp files.
fix dir(ht)
-rename new_hashtable to new_countinghash
-
###
Semi-obsolete comments, pre partitioning:
diff --git a/doc/dev/coding-guidelines-and-review.rst b/doc/dev/coding-guidelines-and-review.rst
index 59331b8..a9fc5b0 100644
--- a/doc/dev/coding-guidelines-and-review.rst
+++ b/doc/dev/coding-guidelines-and-review.rst
@@ -29,8 +29,40 @@ indentation can be set with::
For Python, `PEP 8 <http://www.python.org/dev/peps/pep-0008/>`__ is our
standard. The ```pep8``` and ```autopep8``` Makefile targets are helpful.
-Code, scripts, and documentation must have its spelling checked. Vim users can
-run::
+Code, scripts, and documentation must have their spelling checked.
+
+Python-based `codespell` can be applied to multiple files easily. `codespell`
+can be installed via the following::
+
+ mkdir ~/bin
+ git clone git@github.com:lucasdemarchi/codespell.git
+ cd codespell
+ make prefix=${HOME} install
+ export PATH=$PATH:~/bin/
+
+Note: if you want codespell to always be available, you will need to add the
+`export` line to your `${HOME}/.bashrc` or equivalent.
+
+To run codespell over only what has been changed on the branch `my-branch`::
+
+ git diff master..my-branch > diff_file
+ codespell diff_file
+
+To run codespell over a single file::
+
+ codespell path/to/file
+
+To make codespell fix the issues it finds automatically::
+
+ codespell -w path/to/file
+
+Please note that as `codespell` works from a list of known
+misspellings, it may not catch all errors. If you find a spelling error that
+is not caught by `codespell` feel free to open a pull request at the `project
+page <https://github.com/lucasdemarchi/codespell>`_ to add it to the
+dictionary.
+
+Vim users can run::
:setlocal spell spelllang=en_us
diff --git a/doc/dev/scripts-and-sandbox.rst b/doc/dev/scripts-and-sandbox.rst
index 3b149cc..0fbab08 100644
--- a/doc/dev/scripts-and-sandbox.rst
+++ b/doc/dev/scripts-and-sandbox.rst
@@ -38,7 +38,7 @@ All scripts in ``sandbox/`` must:
* be importable (enforced by ``test_import_all`` in
``test_sandbox_scripts.py``)
* be mentioned in ``sandbox/README.rst``
-* have a hash-bang line (``#! /usr/bin/env python2``) at the top
+* have a hash-bang line (``#! /usr/bin/env python``) at the top
* be command-line executable (``chmod a+x``)
* have a Copyright message (see below)
* have lowercase names
@@ -116,5 +116,6 @@ development/PR checklist::
- [ ] version and citation information is output to STDERR (`khmer_args.info(...)`)
- [ ] support '-' (STDIN) as an input file, if appropriate
- [ ] support designation of an output file (including STDOUT), if appropriate
+ - [ ] script reads and writes sequences in compressed format
- [ ] runtime diagnostic information (progress, etc.) is output to STDERR
- [ ] script has been removed from sandbox/README.rst
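
The new compressed-format checklist item corresponds to the compressed-output
support added to khmer/kfile.py later in this patch (note the gzip and
bz2file imports there). The underlying pattern is simply to wrap the output
stream before writing records; a generic sketch, independent of khmer's own
helpers::

    import gzip

    # Write FASTA output through a gzip-compressed stream. 'wb' plus byte
    # strings keeps this working on both Python 2 and 3.
    with gzip.open('out.fa.gz', 'wb') as outfp:
        outfp.write(b'>seq1\nATGGCATGGC\n')
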
diff --git a/doc/user/scripts.rst b/doc/user/scripts.rst
index 8f2b327..8dfbcd5 100644
--- a/doc/user/scripts.rst
+++ b/doc/user/scripts.rst
@@ -55,9 +55,6 @@ k-mer counting and abundance filtering
.. autoprogram:: count-median:get_parser()
:prog: count-median.py
-.. autoprogram:: count-overlap:get_parser()
- :prog: count-overlap.py
-
.. autoprogram:: unique-kmers:get_parser()
:prog: unique-kmers.py
diff --git a/doc/whats-new-2.0.rst b/doc/whats-new-2.0.rst
index dc9cc69..386d4d3 100644
--- a/doc/whats-new-2.0.rst
+++ b/doc/whats-new-2.0.rst
@@ -24,11 +24,26 @@ Reservoir sampling script extracts paired reads by default
default. This can be overridden to match previous behavior
with :option:`--force_single`.
+New scripts
+===========
+
+Estimate number of unique k-mers
+--------------------------------
+
+`unique-kmers.py` estimates the k-mer cardinality of a dataset using the
+HyperLogLog probabilistic data structure. This allows very low memory
+consumption, which is tunable via the expected error rate.
+Even with a low error rate (and correspondingly higher memory consumption),
+it is still much more efficient than exact counting and alternative methods.
+It supports multicore processing (using OpenMP) and streaming,
+and so can be used in conjunction with other scripts (like
+`normalize-by-median.py` and `filter-abund.py`).
+
Incompatible changes
====================
New parameter for tablesize/number of table parameters.
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------------------------------------------------------
There is now a :option:`-M`/:option:`--max-memory-usage` parameter
that sets the number of tables (:option:`-N`/:option:`--num_tables`)
@@ -39,7 +54,7 @@ automatically to match the desired memory usage.
:option:`--max-tablesize` to reflect this more desirable behavior.)
Binary file formats have changed!
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------
All binary khmer formats (presence tables, counting tables, tag sets,
stop tags, and partition subsets) have changed. Files are now
@@ -50,10 +65,19 @@ Files of the above types made in previous versions of khmer are not compatible
with v2.0; the reverse is also true.
Scripts now output columnar data in CSV format by default
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------------------------------------
All scripts that output any kind of columnar data now do so in CSV format,
-with headers. Previously this had to be enabled with :options:`--csv`.
+with headers. Previously this had to be enabled with :option:`--csv`.
(Affects `abundance-dist-single.py`, `abundance-dist.py`, `count-median.py`,
and `count-overlap.py`.) `normalize-by-median.py` also now outputs CSV
when :option:`-R` is used.
+
+load-graph.py no longer appends .pt to the specified filename
+-------------------------------------------------------------
+
+Previously, `load-graph.py` appended a `.pt` extension to the
+specified output filename and partition-graph appended a `.pt` to the
+given input filename. Now, `load-graph.py` writes to the specified
+output filename and `partition-graph.py` does not append a `.pt` to
+the given input filename.
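
For scale, the -M/--max-memory-usage behavior described above boils down to
splitting a byte budget evenly across the requested number of tables, with
one byte per countgraph entry and one bit per nodegraph entry. A sketch of
that arithmetic (a hypothetical helper, not khmer's khmer_args code)::

    def tablesize_for_budget(max_mem_bytes, n_tables, bits_per_entry=8):
        """Entries per table when a memory budget is split across tables."""
        total_entries = (max_mem_bytes * 8) // bits_per_entry
        return total_entries // n_tables

    # An 8 GB budget across 4 tables:
    print(tablesize_for_budget(8 * 10**9, 4, bits_per_entry=8))  # countgraph
    print(tablesize_for_budget(8 * 10**9, 4, bits_per_entry=1))  # nodegraph
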
diff --git a/khmer/__init__.py b/khmer/__init__.py
index b0b48e9..9847e6f 100644
--- a/khmer/__init__.py
+++ b/khmer/__init__.py
@@ -10,18 +10,14 @@ from __future__ import print_function
from math import log
import json
-from khmer._khmer import CountingHash as _CountingHash
-from khmer._khmer import LabelHash as _LabelHash
-from khmer._khmer import Hashbits as _Hashbits
+from khmer._khmer import Countgraph as _Countgraph
+from khmer._khmer import GraphLabels as _GraphLabels
+from khmer._khmer import Nodegraph as _Nodegraph
from khmer._khmer import HLLCounter as _HLLCounter
from khmer._khmer import ReadAligner as _ReadAligner
-from khmer._khmer import forward_hash # figuregen/*.py
-# tests/test_{functions,counting_hash,labelhash,counting_single}.py
-
-from khmer._khmer import new_hashtable
-# sandbox/{occupy,ctb-iterative-bench{-2-old}}.py
-# tests/{test_c_wrapper,test_counting_single}.py
+from khmer._khmer import forward_hash
+# tests/test_{functions,countgraph,counting_single}.py
from khmer._khmer import forward_hash_no_rc # tests/test_functions.py
@@ -47,38 +43,38 @@ __version__ = get_versions()['version']
del get_versions
-def load_hashbits(filename):
- """Load a hashbits object from the given filename and return it.
+def load_nodegraph(filename):
+ """Load a nodegraph object from the given filename and return it.
Keyword argument:
- filename -- the name of the hashbits file
+ filename -- the name of the nodegraph file
"""
- hashtable = _Hashbits(1, [1])
- hashtable.load(filename)
+ nodegraph = _Nodegraph(1, [1])
+ nodegraph.load(filename)
- return hashtable
+ return nodegraph
-def load_counting_hash(filename):
- """Load a counting_hash object from the given filename and return it.
+def load_countgraph(filename):
+ """Load a countgraph object from the given filename and return it.
Keyword argument:
- filename -- the name of the counting_hash file
+ filename -- the name of the countgraph file
"""
- hashtable = _CountingHash(1, [1])
- hashtable.load(filename)
+ countgraph = _Countgraph(1, [1])
+ countgraph.load(filename)
- return hashtable
+ return countgraph
-def extract_hashbits_info(filename):
- """Open the given hashbits file and return a tuple of information.
+def extract_nodegraph_info(filename):
+ """Open the given nodegraph file and return a tuple of information.
Returns: the k-mer size, the table size, the number of tables, the version
of the table format, and the type of table flag.
Keyword argument:
- filename -- the name of the hashbits file to inspect
+ filename -- the name of the nodegraph file to inspect
"""
ksize = None
n_tables = None
@@ -86,36 +82,38 @@ def extract_hashbits_info(filename):
signature = None
version = None
ht_type = None
+ occupied = None
uint_size = len(pack('I', 0))
uchar_size = len(pack('B', 0))
ulonglong_size = len(pack('Q', 0))
try:
- with open(filename, 'rb') as hashbits:
- signature, = unpack('4s', hashbits.read(4))
- version, = unpack('B', hashbits.read(1))
- ht_type, = unpack('B', hashbits.read(1))
- ksize, = unpack('I', hashbits.read(uint_size))
- n_tables, = unpack('B', hashbits.read(uchar_size))
- table_size, = unpack('Q', hashbits.read(ulonglong_size))
+ with open(filename, 'rb') as nodegraph:
+ signature, = unpack('4s', nodegraph.read(4))
+ version, = unpack('B', nodegraph.read(1))
+ ht_type, = unpack('B', nodegraph.read(1))
+ ksize, = unpack('I', nodegraph.read(uint_size))
+ n_tables, = unpack('B', nodegraph.read(uchar_size))
+ occupied, = unpack('Q', nodegraph.read(ulonglong_size))
+ table_size, = unpack('Q', nodegraph.read(ulonglong_size))
if signature != b"OXLI":
raise ValueError("Node graph '{}' is missing file type "
"signature".format(filename) + str(signature))
except:
raise ValueError("Presence table '{}' is corrupt ".format(filename))
- return ksize, round(table_size, -2), n_tables, version, ht_type
+ return ksize, round(table_size, -2), n_tables, version, ht_type, occupied
-def extract_countinghash_info(filename):
- """Open the given counting_hash file and return a tuple of information.
+def extract_countgraph_info(filename):
+ """Open the given countgraph file and return a tuple of information.
Return: the k-mer size, the table size, the number of tables, the bigcount
flag, the version of the table format, and the type of table flag.
Keyword argument:
- filename -- the name of the counting_hash file to inspect
+ filename -- the name of the countgraph file to inspect
"""
ksize = None
n_tables = None
@@ -124,40 +122,42 @@ def extract_countinghash_info(filename):
version = None
ht_type = None
use_bigcount = None
+ occupied = None
uint_size = len(pack('I', 0))
ulonglong_size = len(pack('Q', 0))
try:
- with open(filename, 'rb') as countinghash:
- signature, = unpack('4s', countinghash.read(4))
- version, = unpack('B', countinghash.read(1))
- ht_type, = unpack('B', countinghash.read(1))
- use_bigcount, = unpack('B', countinghash.read(1))
- ksize, = unpack('I', countinghash.read(uint_size))
- n_tables, = unpack('B', countinghash.read(1))
- table_size, = unpack('Q', countinghash.read(ulonglong_size))
+ with open(filename, 'rb') as countgraph:
+ signature, = unpack('4s', countgraph.read(4))
+ version, = unpack('B', countgraph.read(1))
+ ht_type, = unpack('B', countgraph.read(1))
+ use_bigcount, = unpack('B', countgraph.read(1))
+ ksize, = unpack('I', countgraph.read(uint_size))
+ n_tables, = unpack('B', countgraph.read(1))
+ occupied, = unpack('Q', countgraph.read(ulonglong_size))
+ table_size, = unpack('Q', countgraph.read(ulonglong_size))
if signature != b'OXLI':
- raise ValueError("Counting table '{}' is missing file type "
+ raise ValueError("Count graph file '{}' is missing file type "
"signature. ".format(filename) + str(signature))
except:
- raise ValueError("Counting table '{}' is corrupt ".format(filename))
+ raise ValueError("Count graph file '{}' is corrupt ".format(filename))
return ksize, round(table_size, -2), n_tables, use_bigcount, version, \
- ht_type
+ ht_type, occupied
-def calc_expected_collisions(hashtable, force=False, max_false_pos=.2):
- """Do a quick & dirty expected collision rate calculation on a hashtable.
+def calc_expected_collisions(graph, force=False, max_false_pos=.2):
+ """Do a quick & dirty expected collision rate calculation on a graph
Also check to see that collision rate is within threshold.
Keyword argument:
- hashtable: the hashtable object to inspect
+ graph: the countgraph or nodegraph object to inspect
"""
- sizes = hashtable.hashsizes()
+ sizes = graph.hashsizes()
n_ht = float(len(sizes))
- occupancy = float(hashtable.n_occupied())
+ occupancy = float(graph.n_occupied())
min_size = min(sizes)
fp_one = occupancy / min_size
@@ -233,39 +233,39 @@ def get_n_primes_near_x(number, target):
# Additional functionality can be added to these classes as appropriate.
-class CountingHash(_CountingHash):
+class Countgraph(_Countgraph):
def __new__(cls, k, starting_size, n_tables):
primes = get_n_primes_near_x(n_tables, starting_size)
- c = _CountingHash.__new__(cls, k, primes)
+ c = _Countgraph.__new__(cls, k, primes)
c.primes = primes
return c
-class LabelHash(_LabelHash):
+class GraphLabels(_GraphLabels):
def __new__(cls, k, starting_size, n_tables):
- hb = Hashbits(k, starting_size, n_tables)
- c = _LabelHash.__new__(cls, hb)
+ hb = Nodegraph(k, starting_size, n_tables)
+ c = _GraphLabels.__new__(cls, hb)
c.graph = hb
return c
-class CountingLabelHash(_LabelHash):
+class CountingGraphLabels(_GraphLabels):
def __new__(cls, k, starting_size, n_tables):
primes = get_n_primes_near_x(n_tables, starting_size)
- hb = _CountingHash(k, primes)
- c = _LabelHash.__new__(cls, hb)
+ hb = _Countgraph(k, primes)
+ c = _GraphLabels.__new__(cls, hb)
c.graph = hb
return c
-class Hashbits(_Hashbits):
+class Nodegraph(_Nodegraph):
def __new__(cls, k, starting_size, n_tables):
primes = get_n_primes_near_x(n_tables, starting_size)
- c = _Hashbits.__new__(cls, k, primes)
+ c = _Nodegraph.__new__(cls, k, primes)
c.primes = primes
return c
@@ -296,7 +296,7 @@ class ReadAligner(_ReadAligner):
"""Sequence to graph aligner.
- ReadAligner uses a CountingHash (the counts of k-mers in the target DNA
+ ReadAligner uses a Countgraph (the counts of k-mers in the target DNA
sequences) as an implicit De Bruijn graph. Input DNA sequences are aligned
to this graph via a paired Hidden Markov Model.
@@ -325,7 +325,7 @@ class ReadAligner(_ReadAligner):
defaultScoringMatrix = [
log(0.955, 2), log(0.04, 2), log(0.004, 2), log(0.001, 2)]
- def __new__(cls, counting_table, trusted_cov_cutoff, bits_theta,
+ def __new__(cls, count_graph, trusted_cov_cutoff, bits_theta,
**kwargs):
if 'filename' in kwargs:
@@ -344,10 +344,10 @@ class ReadAligner(_ReadAligner):
else:
transition_probabilities = \
ReadAligner.defaultTransitionProbabilities
- r = _ReadAligner.__new__(cls, counting_table, trusted_cov_cutoff,
+ r = _ReadAligner.__new__(cls, count_graph, trusted_cov_cutoff,
bits_theta, scoring_matrix,
transition_probabilities)
- r.graph = counting_table
+ r.graph = count_graph
return r
def __init__(self, *args, **kwargs):
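
Taken together, the khmer/__init__.py changes above rename the public Python
API and extend the false-positive check. A short usage sketch against the new
names (filenames are illustrative; per the code above, calc_expected_collisions
derives the per-table rate as occupancy / min_size and checks it against
max_false_pos)::

    from __future__ import print_function
    import khmer

    cg = khmer.Countgraph(20, 1000000, 4)   # k, starting size, n_tables
    cg.consume('ATGGCATGGCATGGCATGGCATGGC')
    fp_rate = khmer.calc_expected_collisions(cg, force=True)
    print('expected false-positive rate:', fp_rate)

    cg.save('example.ct')                   # new OXLI format, stores occupied
    cg2 = khmer.load_countgraph('example.ct')
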
diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc
index d41d438..47121d4 100644
--- a/khmer/_khmer.cc
+++ b/khmer/_khmer.cc
@@ -50,18 +50,18 @@ using namespace read_parsers;
//
#if PY_MAJOR_VERSION >= 3
- #define MOD_ERROR_VAL NULL
- #define MOD_SUCCESS_VAL(val) val
- #define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void)
- #define MOD_DEF(ob, name, doc, methods) \
+#define MOD_ERROR_VAL NULL
+#define MOD_SUCCESS_VAL(val) val
+#define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void)
+#define MOD_DEF(ob, name, doc, methods) \
static struct PyModuleDef moduledef = { \
PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \
ob = PyModule_Create(&moduledef);
#else
- #define MOD_ERROR_VAL
- #define MOD_SUCCESS_VAL(val)
- #define MOD_INIT(name) void init##name(void)
- #define MOD_DEF(ob, name, doc, methods) \
+#define MOD_ERROR_VAL
+#define MOD_SUCCESS_VAL(val)
+#define MOD_INIT(name) void init##name(void)
+#define MOD_DEF(ob, name, doc, methods) \
ob = Py_InitModule3(name, methods, doc);
#endif
@@ -682,11 +682,11 @@ static void khmer_hashbits_dealloc(khmer_KHashbits_Object * obj);
static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args,
PyObject * kwds);
-static PyTypeObject khmer_KHashbits_Type
+static PyTypeObject khmer_KNodegraph_Type
CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashbits_Object")
= {
PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */
- "_khmer.Hashbits", /* tp_name */
+ "_khmer.Nodegraph", /* tp_name */
sizeof(khmer_KHashbits_Object), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)khmer_hashbits_dealloc, /*tp_dealloc*/
@@ -747,13 +747,11 @@ hashtable_n_occupied(khmer_KHashtable_Object * me, PyObject * args)
{
Hashtable * hashtable = me->hashtable;
- HashIntoType start = 0, stop = 0;
-
- if (!PyArg_ParseTuple(args, "|KK", &start, &stop)) {
+ if (!PyArg_ParseTuple(args, "")) {
return NULL;
}
- HashIntoType n = hashtable->n_occupied(start, stop);
+ HashIntoType n = hashtable->n_occupied();
return PyLong_FromUnsignedLongLong(n);
}
@@ -771,19 +769,6 @@ hashtable_n_unique_kmers(khmer_KHashtable_Object * me, PyObject * args)
static
PyObject *
-hashtable_n_entries(khmer_KHashtable_Object * me, PyObject * args)
-{
- Hashtable * hashtable = me->hashtable;
-
- if (!PyArg_ParseTuple(args, "")) {
- return NULL;
- }
-
- return PyLong_FromUnsignedLongLong(hashtable->n_entries());
-}
-
-static
-PyObject *
hashtable_count(khmer_KHashtable_Object * me, PyObject * args)
{
Hashtable * hashtable = me->hashtable;
@@ -1051,7 +1036,7 @@ hashtable_get_tags_and_positions(khmer_KHashtable_Object * me, PyObject * args)
std::vector<HashIntoType> tags;
unsigned int pos = 1;
- KMerIterator kmers(seq, hashtable->ksize());
+ KmerIterator kmers(seq, hashtable->ksize());
while (!kmers.done()) {
HashIntoType kmer = kmers.next();
@@ -1091,12 +1076,11 @@ hashtable_find_all_tags_list(khmer_KHashtable_Object * me, PyObject * args)
SeenSet tags;
- Py_BEGIN_ALLOW_THREADS
+ Kmer start_kmer = hashtable->build_kmer(kmer_s);
- HashIntoType kmer_f, kmer_r;
- _hash(kmer_s, hashtable->ksize(), kmer_f, kmer_r);
+ Py_BEGIN_ALLOW_THREADS
- hashtable->partition->find_all_tags(kmer_f, kmer_r, tags,
+ hashtable->partition->find_all_tags(start_kmer, tags,
hashtable->all_tags);
Py_END_ALLOW_THREADS
@@ -1324,10 +1308,11 @@ hashtable_calc_connected_graph_size(khmer_KHashtable_Object * me,
}
unsigned long long size = 0;
+ Kmer start_kmer = hashtable->build_kmer(_kmer);
Py_BEGIN_ALLOW_THREADS
- SeenSet keeper;
- hashtable->calc_connected_graph_size(_kmer, size, keeper, max_size,
+ KmerSet keeper;
+ hashtable->calc_connected_graph_size(start_kmer, size, keeper, max_size,
break_on_circum);
Py_END_ALLOW_THREADS
@@ -1643,24 +1628,23 @@ hashtable_find_all_tags(khmer_KHashtable_Object * me, PyObject * args)
pre_partition_info * ppi = NULL;
- Py_BEGIN_ALLOW_THREADS
+ Kmer kmer = hashtable->build_kmer(kmer_s);
- HashIntoType kmer, kmer_f, kmer_r;
- kmer = _hash(kmer_s, hashtable->ksize(), kmer_f, kmer_r);
+ Py_BEGIN_ALLOW_THREADS
try {
ppi = new pre_partition_info(kmer);
} catch (std::bad_alloc &e) {
return PyErr_NoMemory();
}
- hashtable->partition->find_all_tags(kmer_f, kmer_r, ppi->tagged_kmers,
+ hashtable->partition->find_all_tags(kmer, ppi->tagged_kmers,
hashtable->all_tags);
hashtable->add_kmer_to_tags(kmer);
Py_END_ALLOW_THREADS
khmer_PrePartitionInfo_Object * ppi_obj = (khmer_PrePartitionInfo_Object *) \
- PyObject_New(khmer_PrePartitionInfo_Object, &khmer_PrePartitionInfo_Type);
+ PyObject_New(khmer_PrePartitionInfo_Object, &khmer_PrePartitionInfo_Type);
ppi_obj->PrePartitionInfo = ppi;
@@ -1763,7 +1747,7 @@ hashtable_get_tagset(khmer_KHashtable_Object * me, PyObject * args)
PyObject * x = PyList_New(hashtable->all_tags.size());
unsigned long long i = 0;
for (si = hashtable->all_tags.begin(); si != hashtable->all_tags.end();
- ++si) {
+ ++si) {
std::string s = _revhash(*si, k);
PyList_SET_ITEM(x, i, Py_BuildValue("s", s.c_str()));
i++;
@@ -2310,11 +2294,10 @@ hashtable_count_kmers_within_radius(khmer_KHashtable_Object * me,
unsigned int n;
Py_BEGIN_ALLOW_THREADS
-
- HashIntoType kmer_f, kmer_r;
- _hash(kmer, hashtable->ksize(), kmer_f, kmer_r);
- n = hashtable->count_kmers_within_radius(kmer_f, kmer_r, radius,
- max_count);
+ Kmer start_kmer = hashtable->build_kmer(kmer);
+ KmerSet seen;
+ n = hashtable->traverse_from_kmer(start_kmer, radius,
+ seen, max_count);
Py_END_ALLOW_THREADS
@@ -2442,7 +2425,6 @@ static PyMethodDef khmer_hashtable_methods[] = {
"n_occupied", (PyCFunction)hashtable_n_occupied, METH_VARARGS,
"Count the number of occupied bins."
},
- { "n_entries", (PyCFunction)hashtable_n_entries, METH_VARARGS, "" },
{
"count",
(PyCFunction)hashtable_count, METH_VARARGS,
@@ -2633,10 +2615,6 @@ CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashtable_Object")
#define is_hashtable_obj(v) (Py_TYPE(v) == &khmer_KHashtable_Type)
//
-// _new_hashtable
-//
-
-//
// KCountingHash object
//
@@ -2782,32 +2760,6 @@ count_fasta_dump_kmers_by_abundance(khmer_KCountingHash_Object * me,
static
PyObject *
-count_get_kadian_count(khmer_KCountingHash_Object * me, PyObject * args)
-{
- CountingHash * counting = me->counting;
-
- const char * long_str;
- unsigned int nk = 1;
-
- if (!PyArg_ParseTuple(args, "s|I", &long_str, &nk)) {
- return NULL;
- }
-
- if (strlen(long_str) < counting->ksize()) {
- PyErr_SetString(PyExc_ValueError,
- "string length must >= the hashtable k-mer size");
- return NULL;
- }
-
- BoundedCounterType kad = 0;
-
- counting->get_kadian_count(long_str, kad, nk);
-
- return Py_BuildValue("i", kad);
-}
-
-static
-PyObject *
count_get_raw_tables(khmer_KCountingHash_Object * self, PyObject * args)
{
CountingHash * counting = self->counting;
@@ -2818,7 +2770,8 @@ count_get_raw_tables(khmer_KCountingHash_Object * self, PyObject * args)
PyObject * raw_tables = PyList_New(sizes.size());
for (unsigned int i=0; i<sizes.size(); ++i) {
Py_buffer buffer;
- int res = PyBuffer_FillInfo(&buffer, NULL, table_ptrs[i], sizes[i], 0, PyBUF_FULL_RO);
+ int res = PyBuffer_FillInfo(&buffer, NULL, table_ptrs[i], sizes[i], 0,
+ PyBUF_FULL_RO);
if (res == -1) {
return NULL;
}
@@ -2966,8 +2919,8 @@ count_fasta_count_kmers_by_position(khmer_KCountingHash_Object * me,
unsigned long long * counts;
try {
counts = counting->fasta_count_kmers_by_position(inputfile,
- max_read_len,
- (unsigned short) limit_by_count_int);
+ max_read_len,
+ (unsigned short) limit_by_count_int);
} catch (khmer_file_exception &exc) {
PyErr_SetString(PyExc_OSError, exc.what());
return NULL;
@@ -3006,7 +2959,7 @@ count_abundance_distribution_with_reads_parser(khmer_KCountingHash_Object * me,
khmer_KHashbits_Object *tracking_obj = NULL;
if (!PyArg_ParseTuple(args, "O!O!", &python::khmer_ReadParser_Type,
- &rparser_obj, &khmer_KHashbits_Type, &tracking_obj)) {
+ &rparser_obj, &khmer_KNodegraph_Type, &tracking_obj)) {
return NULL;
}
@@ -3056,7 +3009,7 @@ count_abundance_distribution(khmer_KCountingHash_Object * me, PyObject * args)
const char * filename = NULL;
khmer_KHashbits_Object * tracking_obj = NULL;
- if (!PyArg_ParseTuple(args, "sO!", &filename, &khmer_KHashbits_Type,
+ if (!PyArg_ParseTuple(args, "sO!", &filename, &khmer_KNodegraph_Type,
&tracking_obj)) {
return NULL;
}
@@ -3169,7 +3122,6 @@ static PyMethodDef khmer_counting_methods[] = {
{ "output_fasta_kmer_pos_freq", (PyCFunction)count_output_fasta_kmer_pos_freq, METH_VARARGS, "" },
{ "get_min_count", (PyCFunction)count_get_min_count, METH_VARARGS, "Get the smallest count of all the k-mers in the string" },
{ "get_max_count", (PyCFunction)count_get_max_count, METH_VARARGS, "Get the largest count of all the k-mers in the string" },
- { "get_kadian_count", (PyCFunction)count_get_kadian_count, METH_VARARGS, "Get the kadian (abundance of k-th rank-ordered k-mer) of the k-mer counts in the string" },
{ "trim_on_abundance", (PyCFunction)count_trim_on_abundance, METH_VARARGS, "Trim on >= abundance" },
{ "trim_below_abundance", (PyCFunction)count_trim_below_abundance, METH_VARARGS, "Trim on >= abundance" },
{ "find_spectral_error_positions", (PyCFunction)count_find_spectral_error_positions, METH_VARARGS, "Identify positions of low-abundance k-mers" },
@@ -3188,11 +3140,11 @@ static PyMethodDef khmer_counting_methods[] = {
static PyObject* _new_counting_hash(PyTypeObject * type, PyObject * args,
PyObject * kwds);
-static PyTypeObject khmer_KCountingHash_Type
+static PyTypeObject khmer_KCountgraph_Type
CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountingHash_Object")
= {
PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */
- "_khmer.CountingHash", /*tp_name*/
+ "_khmer.Countgraph", /*tp_name*/
sizeof(khmer_KCountingHash_Object), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)khmer_counting_dealloc, /*tp_dealloc*/
@@ -3231,37 +3183,7 @@ CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountingHash_Object")
_new_counting_hash, /* tp_new */
};
-#define is_counting_obj(v) (Py_TYPE(v) == &khmer_KCountingHash_Type)
-
-//
-// new_hashtable
-//
-
-static PyObject* new_hashtable(PyObject * self, PyObject * args)
-{
- unsigned int k = 0;
- unsigned long long size = 0;
-
- if (!PyArg_ParseTuple(args, "IK", &k, &size)) {
- return NULL;
- }
-
- khmer_KCountingHash_Object * kcounting_obj = (khmer_KCountingHash_Object *) \
- PyObject_New(khmer_KCountingHash_Object, &khmer_KCountingHash_Type);
-
- if (kcounting_obj == NULL) {
- return NULL;
- }
-
- try {
- kcounting_obj->counting = new CountingHash(k, size);
- } catch (std::bad_alloc &e) {
- return PyErr_NoMemory();
- }
- kcounting_obj->khashtable.hashtable = kcounting_obj->counting;
-
- return (PyObject *) kcounting_obj;
-}
+#define is_counting_obj(v) (Py_TYPE(v) == &khmer_KCountgraph_Type)
//
// _new_counting_hash
@@ -3320,60 +3242,13 @@ static PyObject* _new_counting_hash(PyTypeObject * type, PyObject * args,
static
PyObject *
-hashbits_count_overlap(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
- khmer_KHashbits_Object * ht2_argu;
- const char * filename;
- Hashbits * ht2;
-
- if (!PyArg_ParseTuple(args, "sO!", &filename, &khmer_KHashbits_Type,
- &ht2_argu)) {
- return NULL;
- }
-
- ht2 = ht2_argu->hashbits;
-
-// call the C++ function, and trap signals => Python
-
- HashIntoType curve[2][100];
-
- try {
- unsigned long long n_consumed;
- unsigned int total_reads;
- hashbits->consume_fasta_overlap(filename, curve, *ht2, total_reads,
- n_consumed);
- } catch (khmer_file_exception &exc) {
- PyErr_SetString(PyExc_OSError, exc.what());
- return NULL;
- } catch (khmer_value_exception &exc) {
- PyErr_SetString(PyExc_ValueError, exc.what());
- return NULL;
- }
-
- HashIntoType n = hashbits->n_unique_kmers();
- HashIntoType n_overlap = hashbits->n_overlap_kmers();
-
- PyObject * x = PyList_New(200);
-
- for (unsigned int i = 0; i < 100; i++) {
- PyList_SetItem(x, i, Py_BuildValue("K", curve[0][i]));
- }
- for (unsigned int i = 0; i < 100; i++) {
- PyList_SetItem(x, i + 100, Py_BuildValue("K", curve[1][i]));
- }
- return Py_BuildValue("KKO", n, n_overlap, x);
-}
-
-static
-PyObject *
hashbits_update(khmer_KHashbits_Object * me, PyObject * args)
{
Hashbits * hashbits = me->hashbits;
Hashbits * other;
khmer_KHashbits_Object * other_o;
- if (!PyArg_ParseTuple(args, "O!", &khmer_KHashbits_Type, &other_o)) {
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KNodegraph_Type, &other_o)) {
return NULL;
}
@@ -3390,7 +3265,6 @@ hashbits_update(khmer_KHashbits_Object * me, PyObject * args)
}
static PyMethodDef khmer_hashbits_methods[] = {
- { "count_overlap", (PyCFunction)hashbits_count_overlap, METH_VARARGS, "Count overlap kmers in two datasets" },
{
"update",
(PyCFunction) hashbits_update, METH_VARARGS,
@@ -3447,7 +3321,7 @@ static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args,
return (PyObject *) self;
}
-#define is_hashbits_obj(v) (Py_TYPE(v) == &khmer_KHashbits_Type)
+#define is_hashbits_obj(v) (Py_TYPE(v) == &khmer_KNodegraph_Type)
////////////////////////////////////////////////////////////////////////////
@@ -3602,7 +3476,7 @@ subset_partition_average_coverages(khmer_KSubsetPartition_Object * me,
khmer_KCountingHash_Object * counting_o;
- if (!PyArg_ParseTuple(args, "O!", &khmer_KCountingHash_Type, &counting_o)) {
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KCountgraph_Type, &counting_o)) {
return NULL;
}
@@ -3668,26 +3542,17 @@ static PyMethodDef khmer_subset_methods[] = {
{NULL, NULL, 0, NULL} /* sentinel */
};
-/////////////////
-// LabelHash
-/////////////////
-
-// LabelHash addition
typedef struct {
PyObject_HEAD
LabelHash * labelhash;
-} khmer_KLabelHash_Object;
+} khmer_KGraphLabels_Object;
-static PyObject * khmer_labelhash_new(PyTypeObject * type, PyObject *args,
- PyObject *kwds);
+static PyObject * khmer_graphlabels_new(PyTypeObject * type, PyObject *args,
+ PyObject *kwds);
-#define is_labelhash_obj(v) (Py_TYPE(v) == &khmer_KLabelHash_Type)
+#define is_graphlabels_obj(v) (Py_TYPE(v) == &khmer_KGraphLabels_Type)
-//
-// khmer_labelhash_dealloc -- clean up a labelhash object.
-//
-
-static void khmer_labelhash_dealloc(khmer_KLabelHash_Object * obj)
+static void khmer_graphlabels_dealloc(khmer_KGraphLabels_Object * obj)
{
delete obj->labelhash;
obj->labelhash = NULL;
@@ -3695,11 +3560,11 @@ static void khmer_labelhash_dealloc(khmer_KLabelHash_Object * obj)
Py_TYPE(obj)->tp_free((PyObject*)obj);
}
-static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args,
- PyObject *kwds)
+static PyObject * khmer_graphlabels_new(PyTypeObject *type, PyObject *args,
+ PyObject *kwds)
{
- khmer_KLabelHash_Object *self;
- self = (khmer_KLabelHash_Object*)type->tp_alloc(type, 0);
+ khmer_KGraphLabels_Object *self;
+ self = (khmer_KGraphLabels_Object*)type->tp_alloc(type, 0);
if (self != NULL) {
PyObject * hashtable_o;
@@ -3710,10 +3575,10 @@ static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args,
return NULL;
}
- if (PyObject_TypeCheck(hashtable_o, &khmer_KHashbits_Type)) {
+ if (PyObject_TypeCheck(hashtable_o, &khmer_KNodegraph_Type)) {
khmer_KHashbits_Object * kho = (khmer_KHashbits_Object *) hashtable_o;
hashtable = kho->hashbits;
- } else if (PyObject_TypeCheck(hashtable_o, &khmer_KCountingHash_Type)) {
+ } else if (PyObject_TypeCheck(hashtable_o, &khmer_KCountgraph_Type)) {
khmer_KCountingHash_Object * cho = (khmer_KCountingHash_Object *) hashtable_o;
hashtable = cho->counting;
} else {
@@ -3736,7 +3601,7 @@ static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args,
static
PyObject *
-labelhash_get_label_dict(khmer_KLabelHash_Object * me, PyObject * args)
+labelhash_get_label_dict(khmer_KGraphLabels_Object * me, PyObject * args)
{
LabelHash * hb = me->labelhash;
@@ -3761,7 +3626,7 @@ labelhash_get_label_dict(khmer_KLabelHash_Object * me, PyObject * args)
static
PyObject *
-labelhash_consume_fasta_and_tag_with_labels(khmer_KLabelHash_Object * me,
+labelhash_consume_fasta_and_tag_with_labels(khmer_KGraphLabels_Object * me,
PyObject * args)
{
LabelHash * hb = me->labelhash;
@@ -3802,7 +3667,7 @@ labelhash_consume_fasta_and_tag_with_labels(khmer_KLabelHash_Object * me,
static
PyObject *
labelhash_consume_partitioned_fasta_and_tag_with_labels(
- khmer_KLabelHash_Object * me, PyObject * args)
+ khmer_KGraphLabels_Object * me, PyObject * args)
{
LabelHash * labelhash = me->labelhash;
@@ -3833,7 +3698,7 @@ labelhash_consume_partitioned_fasta_and_tag_with_labels(
static
PyObject *
-labelhash_consume_sequence_and_tag_with_labels(khmer_KLabelHash_Object * me,
+labelhash_consume_sequence_and_tag_with_labels(khmer_KGraphLabels_Object * me,
PyObject * args)
{
LabelHash * hb = me->labelhash;
@@ -3851,7 +3716,7 @@ labelhash_consume_sequence_and_tag_with_labels(khmer_KLabelHash_Object * me,
static
PyObject *
-labelhash_sweep_label_neighborhood(khmer_KLabelHash_Object * me,
+labelhash_sweep_label_neighborhood(khmer_KGraphLabels_Object * me,
PyObject * args)
{
LabelHash * hb = me->labelhash;
@@ -3893,7 +3758,7 @@ labelhash_sweep_label_neighborhood(khmer_KLabelHash_Object * me,
//unsigned int num_traversed = 0;
//Py_BEGIN_ALLOW_THREADS
hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags,
- stop_big_traversals);
+ stop_big_traversals);
//Py_END_ALLOW_THREADS
//printf("...%u kmers traversed\n", num_traversed);
@@ -3915,7 +3780,8 @@ labelhash_sweep_label_neighborhood(khmer_KLabelHash_Object * me,
static
PyObject *
-labelhash_sweep_tag_neighborhood(khmer_KLabelHash_Object * me, PyObject * args)
+labelhash_sweep_tag_neighborhood(khmer_KGraphLabels_Object * me,
+ PyObject * args)
{
LabelHash * labelhash = me->labelhash;
@@ -3979,7 +3845,7 @@ labelhash_sweep_tag_neighborhood(khmer_KLabelHash_Object * me, PyObject * args)
static
PyObject *
-labelhash_get_tag_labels(khmer_KLabelHash_Object * me, PyObject * args)
+labelhash_get_tag_labels(khmer_KGraphLabels_Object * me, PyObject * args)
{
LabelHash * labelhash = me->labelhash;
@@ -4007,7 +3873,7 @@ labelhash_get_tag_labels(khmer_KLabelHash_Object * me, PyObject * args)
static
PyObject *
-labelhash_n_labels(khmer_KLabelHash_Object * me, PyObject * args)
+labelhash_n_labels(khmer_KGraphLabels_Object * me, PyObject * args)
{
LabelHash * labelhash = me->labelhash;
@@ -4020,7 +3886,7 @@ labelhash_n_labels(khmer_KLabelHash_Object * me, PyObject * args)
static
PyObject *
-labelhash_save_labels_and_tags(khmer_KLabelHash_Object * me, PyObject * args)
+labelhash_save_labels_and_tags(khmer_KGraphLabels_Object * me, PyObject * args)
{
const char * filename = NULL;
LabelHash * labelhash = me->labelhash;
@@ -4041,7 +3907,7 @@ labelhash_save_labels_and_tags(khmer_KLabelHash_Object * me, PyObject * args)
static
PyObject *
-labelhash_load_labels_and_tags(khmer_KLabelHash_Object * me, PyObject * args)
+labelhash_load_labels_and_tags(khmer_KGraphLabels_Object * me, PyObject * args)
{
const char * filename = NULL;
LabelHash * labelhash = me->labelhash;
@@ -4060,7 +3926,7 @@ labelhash_load_labels_and_tags(khmer_KLabelHash_Object * me, PyObject * args)
Py_RETURN_NONE;
}
-static PyMethodDef khmer_labelhash_methods[] = {
+static PyMethodDef khmer_graphlabels_methods[] = {
{ "consume_fasta_and_tag_with_labels", (PyCFunction)labelhash_consume_fasta_and_tag_with_labels, METH_VARARGS, "" },
{ "sweep_label_neighborhood", (PyCFunction)labelhash_sweep_label_neighborhood, METH_VARARGS, "" },
{"consume_partitioned_fasta_and_tag_with_labels", (PyCFunction)labelhash_consume_partitioned_fasta_and_tag_with_labels, METH_VARARGS, "" },
@@ -4073,12 +3939,12 @@ static PyMethodDef khmer_labelhash_methods[] = {
{ "load_labels_and_tags", (PyCFunction)labelhash_load_labels_and_tags, METH_VARARGS, "" }, {NULL, NULL, 0, NULL} /* sentinel */
};
-static PyTypeObject khmer_KLabelHash_Type = {
+static PyTypeObject khmer_KGraphLabels_Type = {
PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */
"_khmer.LabelHash", /* tp_name */
- sizeof(khmer_KLabelHash_Object), /* tp_basicsize */
+ sizeof(khmer_KGraphLabels_Object), /* tp_basicsize */
0, /* tp_itemsize */
- (destructor)khmer_labelhash_dealloc, /* tp_dealloc */
+ (destructor)khmer_graphlabels_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
@@ -4101,7 +3967,7 @@ static PyTypeObject khmer_KLabelHash_Type = {
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
- khmer_labelhash_methods, /* tp_methods */
+ khmer_graphlabels_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
@@ -4111,7 +3977,7 @@ static PyTypeObject khmer_KLabelHash_Type = {
0, /* tp_dictoffset */
0, /* tp_init */
0, /* tp_alloc */
- khmer_labelhash_new, /* tp_new */
+ khmer_graphlabels_new, /* tp_new */
};
static
@@ -4123,7 +3989,7 @@ hashtable_traverse_from_tags(khmer_KHashtable_Object * me, PyObject * args)
khmer_KCountingHash_Object * counting_o = NULL;
unsigned int distance, threshold, frequency;
- if (!PyArg_ParseTuple(args, "O!III", &khmer_KCountingHash_Type, &counting_o,
+ if (!PyArg_ParseTuple(args, "O!III", &khmer_KCountgraph_Type, &counting_o,
&distance, &threshold, &frequency)) {
return NULL;
}
@@ -4147,7 +4013,7 @@ hashtable_repartition_largest_partition(khmer_KHashtable_Object * me,
if (!PyArg_ParseTuple(args, "OO!III",
&subset_o,
- &khmer_KCountingHash_Type, &counting_o,
+ &khmer_KCountgraph_Type, &counting_o,
&distance, &threshold, &frequency)) {
return NULL;
}
@@ -4199,7 +4065,7 @@ static PyObject * readaligner_align(khmer_ReadAligner_Object * me,
}
static PyObject * readaligner_align_forward(khmer_ReadAligner_Object * me,
- PyObject * args)
+ PyObject * args)
{
ReadAligner * aligner = me->aligner;
@@ -4221,8 +4087,8 @@ static PyObject * readaligner_align_forward(khmer_ReadAligner_Object * me,
const char* alignment = aln->graph_alignment.c_str();
const char* readAlignment = aln->read_alignment.c_str();
PyObject * x = PyList_New(aln->covs.size());
- for (size_t i = 0; i < aln->covs.size(); i++ ){
- PyList_SET_ITEM(x, i, PyLong_FromLong(aln->covs[i]));
+ for (size_t i = 0; i < aln->covs.size(); i++ ) {
+ PyList_SET_ITEM(x, i, PyLong_FromLong(aln->covs[i]));
}
PyObject * ret = Py_BuildValue("dssOO", aln->score, alignment,
@@ -4336,7 +4202,7 @@ static PyObject* khmer_ReadAligner_new(PyTypeObject *type, PyObject * args,
if(!PyArg_ParseTuple(
args,
"O!Hd|(dddd)((dddddd)(dddd)(dddd)(dddddd)(dddd)(dddd))",
- &khmer_KCountingHash_Type, &ch, &trusted_cov_cutoff,
+ &khmer_KCountgraph_Type, &ch, &trusted_cov_cutoff,
&bits_theta, &scoring_matrix[0], &scoring_matrix[1],
&scoring_matrix[2], &scoring_matrix[3], &transitions[0],
&transitions[1], &transitions[2], &transitions[3],
@@ -4414,7 +4280,7 @@ hashtable_consume_fasta_and_traverse(khmer_KHashtable_Object * me,
if (!PyArg_ParseTuple(args, "sIIIO!", &filename,
&radius, &big_threshold, &transfer_threshold,
- &khmer_KCountingHash_Type, &counting_o)) {
+ &khmer_KCountgraph_Type, &counting_o)) {
return NULL;
}
@@ -4572,19 +4438,28 @@ hllcounter_consume_string(khmer_KHLLCounter_Object * me, PyObject * args)
}
static PyObject * hllcounter_consume_fasta(khmer_KHLLCounter_Object * me,
- PyObject * args)
+ PyObject * args, PyObject * kwds)
{
const char * filename;
+ PyObject * output_records_o = NULL;
+ char * kwlist[] = {"filename", "stream_out", NULL};
- if (!PyArg_ParseTuple(args, "s", &filename)) {
+ bool output_records = false;
+
+ if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|O", kwlist,
+ &filename, &output_records_o)) {
return NULL;
}
+ if (output_records_o != NULL && PyObject_IsTrue(output_records_o)) {
+ output_records = true;
+ }
+
// call the C++ function, and trap signals => Python
unsigned long long n_consumed = 0;
unsigned int total_reads = 0;
try {
- me->hllcounter->consume_fasta(filename, total_reads, n_consumed);
+ me->hllcounter->consume_fasta(filename, output_records, total_reads, n_consumed);
} catch (khmer_file_exception &exc) {
PyErr_SetString(PyExc_OSError, exc.what());
return NULL;
@@ -4719,9 +4594,10 @@ static PyMethodDef khmer_hllcounter_methods[] = {
},
{
"consume_fasta", (PyCFunction)hllcounter_consume_fasta,
- METH_VARARGS,
+ METH_VARARGS | METH_KEYWORDS,
"Read sequences from file, break into k-mers, "
- "and add each k-mer to the counter."
+ "and add each k-mer to the counter. If optional keyword 'stream_out' "
+ "is True, also prints each sequence to stdout."
},
{
"merge", (PyCFunction)hllcounter_merge,
@@ -4844,7 +4720,7 @@ static PyObject * forward_hash(PyObject * self, PyObject * args)
try {
PyObject * hash;
- hash = PyLong_FromUnsignedLongLong(_hash(kmer, ksize));
+ hash = PyLong_FromUnsignedLongLong(_hash(kmer, ksize));
return hash;
} catch (khmer_exception &e) {
PyErr_SetString(PyExc_RuntimeError, e.what());
@@ -4936,22 +4812,6 @@ get_version_cpp( PyObject * self, PyObject * args )
//
static PyMethodDef KhmerMethods[] = {
-#if (0)
- {
- "new_config", new_config,
- METH_VARARGS, "Create a default internals config"
- },
-#endif
-#if (0)
- {
- "set_config", set_active_config,
- METH_VARARGS, "Set active khmer configuration object"
- },
-#endif
- {
- "new_hashtable", new_hashtable,
- METH_VARARGS, "Create an empty single-table counting hash"
- },
{
"forward_hash", forward_hash,
METH_VARARGS, "",
@@ -4993,13 +4853,13 @@ MOD_INIT(_khmer)
return MOD_ERROR_VAL;
}
- khmer_KCountingHash_Type.tp_base = &khmer_KHashtable_Type;
- if (PyType_Ready(&khmer_KCountingHash_Type) < 0) {
+ khmer_KCountgraph_Type.tp_base = &khmer_KHashtable_Type;
+ if (PyType_Ready(&khmer_KCountgraph_Type) < 0) {
return MOD_ERROR_VAL;
}
if (PyType_Ready(&khmer_PrePartitionInfo_Type) < 0) {
- return MOD_ERROR_VAL;
+ return MOD_ERROR_VAL;
}
khmer_KSubsetPartition_Type.tp_methods = khmer_subset_methods;
@@ -5007,16 +4867,16 @@ MOD_INIT(_khmer)
return MOD_ERROR_VAL;
}
- khmer_KHashbits_Type.tp_base = &khmer_KHashtable_Type;
- khmer_KHashbits_Type.tp_methods = khmer_hashbits_methods;
- if (PyType_Ready(&khmer_KHashbits_Type) < 0) {
+ khmer_KNodegraph_Type.tp_base = &khmer_KHashtable_Type;
+ khmer_KNodegraph_Type.tp_methods = khmer_hashbits_methods;
+ if (PyType_Ready(&khmer_KNodegraph_Type) < 0) {
return MOD_ERROR_VAL;
}
- khmer_KLabelHash_Type.tp_base = &khmer_KHashbits_Type;
- khmer_KLabelHash_Type.tp_methods = khmer_labelhash_methods;
- khmer_KLabelHash_Type.tp_new = khmer_labelhash_new;
- if (PyType_Ready(&khmer_KLabelHash_Type) < 0) {
+ khmer_KGraphLabels_Type.tp_base = &khmer_KNodegraph_Type;
+ khmer_KGraphLabels_Type.tp_methods = khmer_graphlabels_methods;
+ khmer_KGraphLabels_Type.tp_new = khmer_graphlabels_new;
+ if (PyType_Ready(&khmer_KGraphLabels_Type) < 0) {
return MOD_ERROR_VAL;
}
@@ -5055,20 +4915,21 @@ MOD_INIT(_khmer)
return MOD_ERROR_VAL;
}
- Py_INCREF(&khmer_KCountingHash_Type);
- if (PyModule_AddObject( m, "CountingHash",
- (PyObject *)&khmer_KCountingHash_Type ) < 0) {
+ Py_INCREF(&khmer_KCountgraph_Type);
+ if (PyModule_AddObject( m, "Countgraph",
+ (PyObject *)&khmer_KCountgraph_Type ) < 0) {
return MOD_ERROR_VAL;
}
- Py_INCREF(&khmer_KHashbits_Type);
- if (PyModule_AddObject(m, "Hashbits", (PyObject *)&khmer_KHashbits_Type) < 0) {
+ Py_INCREF(&khmer_KNodegraph_Type);
+ if (PyModule_AddObject(m, "Nodegraph",
+ (PyObject *)&khmer_KNodegraph_Type) < 0) {
return MOD_ERROR_VAL;
}
- Py_INCREF(&khmer_KLabelHash_Type);
- if (PyModule_AddObject(m, "LabelHash",
- (PyObject *)&khmer_KLabelHash_Type) < 0) {
+ Py_INCREF(&khmer_KGraphLabels_Type);
+ if (PyModule_AddObject(m, "GraphLabels",
+ (PyObject *)&khmer_KGraphLabels_Type) < 0) {
return MOD_ERROR_VAL;
}
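The hunks above complete the rename of the Python-facing types (CountingHash
-> Countgraph, Hashbits -> Nodegraph, LabelHash -> GraphLabels) and add the
'stream_out' keyword to HLLCounter.consume_fasta. A minimal sketch of the
renamed API follows; the constructor arguments mirror create_countgraph() /
create_nodegraph() in khmer_args.py below, and the HLLCounter(error_rate,
ksize) signature plus the count()/get() methods are assumed from the
pre-existing bindings:

    import khmer

    # (ksize, starting tablesize, number of tables); sizes illustrative only
    cg = khmer.Countgraph(20, 1e6, 4)   # formerly khmer.CountingHash
    ng = khmer.Nodegraph(20, 1e6, 4)    # formerly khmer.Hashbits

    cg.count('ATGGCTAGCTAGCTAGCTAG')    # count one 20-mer
    print(cg.get('ATGGCTAGCTAGCTAGCTAG'))

    # new optional keyword (see the consume_fasta hunk above); when True,
    # each record is echoed to stdout as it is consumed
    hll = khmer.HLLCounter(0.01, 20)
    hll.consume_fasta('reads.fa', stream_out=True)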
diff --git a/khmer/_version.py b/khmer/_version.py
index 5da9804..5d68898 100644
--- a/khmer/_version.py
+++ b/khmer/_version.py
@@ -16,8 +16,8 @@ import subprocess
import sys
# these strings will be replaced by git during git-archive
-git_refnames = " (HEAD -> master, tag: v2.0-rc2)"
-git_full = "8c2f8d33969ad402dac2c9bacbfc02197bd1ce02"
+git_refnames = " (tag: v2.0-rc3)"
+git_full = "14f741fd4fc27c7fae0b5cd6e3df020d5628a89b"
# these strings are filled in when 'setup.py versioneer' creates _version.py
tag_prefix = "v"
diff --git a/khmer/kfile.py b/khmer/kfile.py
index a901833..7e049f0 100644
--- a/khmer/kfile.py
+++ b/khmer/kfile.py
@@ -13,6 +13,8 @@ import os
import sys
import errno
from stat import S_ISBLK, S_ISFIFO, S_ISCHR
+import gzip
+import bz2file
from khmer import khmer_args
@@ -121,10 +123,9 @@ def check_space(in_files, force, _testhook_free_space=None):
sys.exit(1)
-def check_space_for_hashtable(outfile_name, hash_size, force,
- _testhook_free_space=None):
- """Check that we have enough size to write the specified hash table."""
-
+def check_space_for_graph(outfile_name, hash_size, force,
+ _testhook_free_space=None):
+ """Check that we have enough size to write the specified graph."""
dir_path = os.path.dirname(os.path.realpath(outfile_name))
target = os.statvfs(dir_path)
@@ -136,7 +137,7 @@ def check_space_for_hashtable(outfile_name, hash_size, force,
size_diff = hash_size - free_space
if size_diff > 0:
print("ERROR: Not enough free space on disk "
- "for saved table files;"
+ "for saved graph files;"
" Need at least %.1f GB more."
% (float(size_diff) / 1e9,), file=sys.stderr)
print(" Table size: %.1f GB"
@@ -170,3 +171,45 @@ def check_valid_file_exists(in_files):
else:
print('WARNING: Input file %s not found' %
in_file, file=sys.stderr)
+
+
+def is_block(fthing):
+ """Take in a file object and checks to see if it's a block or fifo."""
+ if fthing is sys.stdout or fthing is sys.stdin:
+ return True
+ else:
+ mode = os.stat(fthing.name).st_mode
+ return S_ISBLK(mode) or S_ISCHR(mode)
+
+
+def describe_file_handle(fthing):
+ if is_block(fthing):
+ return "block device"
+ else:
+ return fthing.name
+
+
+def add_output_compression_type(parser):
+ """Add compression arguments to a parser object."""
+ group = parser.add_mutually_exclusive_group()
+ group.add_argument('--gzip', default=False, action='store_true',
+ help='Compress output using gzip')
+ group.add_argument('--bzip', default=False, action='store_true',
+ help='Compress output using bzip2')
+
+
+def get_file_writer(file_handle, do_gzip, do_bzip):
+ """Generate and return a file object with specified compression."""
+ ofile = None
+
+ if do_gzip and do_bzip:
+ raise Exception("Cannot specify both bzip and gzip compression!")
+
+ if do_gzip:
+ ofile = gzip.GzipFile(fileobj=file_handle, mode='w')
+ elif do_bzip:
+ ofile = bz2file.open(file_handle, mode='w')
+ else:
+ ofile = file_handle
+
+ return ofile
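Taken together, add_output_compression_type() and get_file_writer() let a
script expose --gzip/--bzip and wrap its output handle accordingly. A minimal
sketch of the intended wiring (the option name and payload are hypothetical):

    import argparse
    from khmer.kfile import add_output_compression_type, get_file_writer

    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', type=argparse.FileType('wb'),
                        required=True)
    add_output_compression_type(parser)
    args = parser.parse_args()

    # returns a gzip/bz2 wrapper, or the raw handle if neither flag is set
    outfp = get_file_writer(args.output, args.gzip, args.bzip)
    outfp.write(b'>seq1\nACGT\n')
    outfp.close()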
diff --git a/khmer/khmer_args.py b/khmer/khmer_args.py
index 946a645..0bfeeb8 100644
--- a/khmer/khmer_args.py
+++ b/khmer/khmer_args.py
@@ -7,17 +7,21 @@
#
from __future__ import unicode_literals
+from __future__ import print_function
import sys
import os
import argparse
+import math
from argparse import _VersionAction
+from collections import namedtuple
import screed
import khmer
-from khmer import extract_countinghash_info, extract_hashbits_info
+from khmer import extract_countgraph_info, extract_nodegraph_info
from khmer import __version__
-from khmer.utils import print_error
+from .utils import print_error
+from .khmer_logger import log_info, log_warn, configure_logging
DEFAULT_K = 32
@@ -43,26 +47,227 @@ class ComboFormatter(argparse.ArgumentDefaultsHelpFormatter,
pass
-def build_hash_args(descr=None, epilog=None, parser=None):
+def optimal_size(num_kmers, mem_cap=None, fp_rate=None):
+ """
+ Utility function for estimating optimal countgraph args where:
+ - num_kmers: number of unique kmers [required]
+ - mem_cap: the allotted amount of memory [optional; conflicts with fp_rate]
+ - fp_rate: the desired false positive rate [optional; conflicts with mem_cap]
+ """
+ if all((num_kmers is not None, mem_cap is not None, fp_rate is None)):
+ return estimate_optimal_with_K_and_M(num_kmers, mem_cap)
+ elif all((num_kmers is not None, mem_cap is None, fp_rate is not None)):
+ return estimate_optimal_with_K_and_f(num_kmers, fp_rate)
+ else:
+ raise TypeError("num_kmers and exactly one of mem_cap or fp_rate"
+ " must be specified.")
+
+
+def check_conflicting_args(args, hashtype):
+ """
+ Utility function that takes an args object and checks for conflicting
+ arguments, e.g. --loadgraph combined with --ksize.
+ """
+
+ if getattr(args, "quiet", None):
+ configure_logging(args.quiet)
+
+ loadgraph_table_conflicts = {"ksize": DEFAULT_K,
+ "n_tables": DEFAULT_N_TABLES,
+ "max_tablesize": DEFAULT_MAX_TABLESIZE}
+
+ loadgraph_autoarg_conflicts = ("unique_kmers", "max_memory_usage")
+
+ if getattr(args, "loadgraph", None):
+
+ # check for table config args
+ for key, value in loadgraph_table_conflicts.items():
+ if getattr(args, key, value) != value:
+ log_warn('''
+*** WARNING: You are loading a saved k-mer countgraph from
+*** {hashfile}, but have set k-mer table parameters.
+*** Your values for ksize, n_tables, and tablesize
+*** will be ignored.'''.format(hashfile=args.loadgraph))
+ break # no repeat warnings
+
+ for element in loadgraph_autoarg_conflicts:
+ if getattr(args, element, None):
+ log_warn("\n*** WARNING: You have asked that the graph size be"
+ " automatically calculated\n"
+ "*** (by using -U or -M).\n"
+ "*** But you are loading an existing graph!\n"
+ "*** Size will NOT be set automatically.")
+ break # no repeat warnings
+
+ infoset = None
+ if hashtype == 'countgraph':
+ infoset = extract_countgraph_info(args.loadgraph)
+ if infoset:
+ ksize = infoset[0]
+ max_tablesize = infoset[1]
+ n_tables = infoset[2]
+ args.ksize = ksize
+ args.n_tables = n_tables
+ args.max_tablesize = max_tablesize
+
+
+def estimate_optimal_with_K_and_M(num_kmers, mem_cap):
+ """
+ Utility function for estimating optimal countgraph args where num_kmers
+ is the number of unique k-mers and mem_cap is the allotted amount of memory.
+ """
+
+ n_tables = math.log(2) * (mem_cap / float(num_kmers))
+ int_n_tables = int(n_tables)
+ if int_n_tables == 0:
+ int_n_tables = 1
+ ht_size = int(mem_cap / int_n_tables)
+ mem_cap = ht_size * int_n_tables
+ fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables
+ res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
+ "fp_rate"])
+ return res(int_n_tables, ht_size, mem_cap, fp_rate)
+
+
+def estimate_optimal_with_K_and_f(num_kmers, des_fp_rate):
+ """
+ Utility function for estimating optimal memory where num_kmers is the
+ number of unique k-mers and des_fp_rate is the desired false positive rate.
+ """
+ n_tables = math.log(des_fp_rate, 0.5)
+ int_n_tables = int(n_tables)
+ if int_n_tables == 0:
+ int_n_tables = 1
+
+ ht_size = int(-num_kmers / (
+ math.log(1 - des_fp_rate ** (1 / float(int_n_tables)))))
+ mem_cap = ht_size * int_n_tables
+ fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables
+
+ res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
+ "fp_rate"])
+ return res(int_n_tables, ht_size, mem_cap, fp_rate)
+
+
+def graphsize_args_report(unique_kmers, fp_rate):
+ """
+ Assemble the output string for the optimal-args sandbox scripts;
+ takes unique_kmers and the desired fp_rate.
+ """
+ to_print = []
+
+ to_print.append('') # blank line
+ to_print.append('number of unique k-mers: \t{0}'.format(unique_kmers))
+ to_print.append('false positive rate: \t{:>.3f}'.format(fp_rate))
+ to_print.append('') # blank line
+ to_print.append('If you have a target false positive rate to achieve:')
+ to_print.append('expected_fp\tnumber_hashtable(Z)\tsize_hashtable(H)\t'
+ 'expected_memory_usage')
+
+ for fp_rate in range(1, 10):
+ num_tables, table_size, mem_cap, fp_rate = \
+ optimal_size(unique_kmers, fp_rate=fp_rate / 10.0)
+ to_print.append('{:11.3f}\t{:19}\t{:17e}\t{:21e}'.format(fp_rate,
+ num_tables,
+ table_size,
+ mem_cap))
+
+ mem_list = [1, 5, 10, 20, 50, 100, 200, 300, 400, 500, 1000, 2000, 5000]
+
+ to_print.append('') # blank line
+ to_print.append('If you have a fixed amount of memory to use:')
+ to_print.append('expected_memory_usage\tnumber_hashtable(Z)\t'
+ 'size_hashtable(H)\texpected_fp')
+
+ for mem in mem_list:
+ num_tables, table_size, mem_cap, fp_rate =\
+ optimal_size(unique_kmers, mem_cap=mem * 1000000000)
+ to_print.append('{:21e}\t{:19}\t{:17e}\t{:11.3f}'.format(mem_cap,
+ num_tables,
+ table_size,
+ fp_rate))
+ return "\n".join(to_print)
+
+
+def _check_fp_rate(args, desired_max_fp):
+ """
+ Check that desired_max_fp is achievable given the number of unique
+ k-mers and any max_memory restriction present in the args.
+
+ Takes an args object and desired_max_fp; returns the (possibly updated) args.
+ """
+ if not args.unique_kmers:
+ return args
+
+ # Override the script's default FP rate if one was given
+ if args.fp_rate:
+ log_info("*** INFO: Overriding default fp {def_fp} with new fp:"
+ " {new_fp}", def_fp=desired_max_fp, new_fp=args.fp_rate)
+ desired_max_fp = args.fp_rate
+
+ # If a memory cap was supplied, validate it; otherwise derive one
+ if args.max_memory_usage:
+ # verify that this is a sane memory usage restriction
+ res = estimate_optimal_with_K_and_M(args.unique_kmers,
+ args.max_memory_usage)
+ if res.fp_rate > desired_max_fp:
+ print("""
+*** ERROR: The given restrictions yield an estimate false positive rate of {0},
+*** which is above the recommended false positive ceiling of {1}!"""
+ .format(res.fp_rate, desired_max_fp), file=sys.stderr)
+ if not args.force:
+ print("NOTE: This can be overridden using the --force"
+ " argument", file=sys.stderr)
+ print("*** Aborting...!", file=sys.stderr)
+ sys.exit(1)
+ else:
+ res = estimate_optimal_with_K_and_f(args.unique_kmers,
+ desired_max_fp)
+ if args.max_tablesize and args.max_tablesize < res.htable_size:
+ log_warn("\n*** Warning: The given tablesize is too small!")
+ log_warn("*** Recommended tablesize is: {tsize:5g} bytes",
+ tsize=res.htable_size)
+ log_warn("*** Current is: {tsize:5g} bytes",
+ tsize=args.max_tablesize)
+ res = estimate_optimal_with_K_and_M(args.unique_kmers,
+ args.max_tablesize)
+ log_warn("*** Estimated FP rate with current config is: {fp}\n",
+ fp=res.fp_rate)
+ else:
+ if res.mem_use < 1e6: # roughly one megabyte
+ args.max_memory_usage = 1e6
+ else:
+ args.max_memory_usage = res.mem_use
+ log_info("*** INFO: set memory ceiling automatically.")
+ log_info("*** Ceiling is: {ceil:3g} bytes\n",
+ ceil=float(args.max_memory_usage))
+ args.max_mem = res.mem_use
+
+ return args
+
+
+def build_graph_args(descr=None, epilog=None, parser=None):
"""Build an ArgumentParser with args for bloom filter based scripts."""
+
if parser is None:
parser = argparse.ArgumentParser(description=descr, epilog=epilog,
formatter_class=ComboFormatter)
parser.add_argument('--version', action=_VersionStdErrAction,
version='khmer {v}'.format(v=__version__))
- parser.add_argument('-q', '--quiet', dest='quiet', default=False,
- action='store_true')
parser.add_argument('--ksize', '-k', type=int, default=DEFAULT_K,
help='k-mer size to use')
parser.add_argument('--n_tables', '-N', type=int,
default=DEFAULT_N_TABLES,
- help='number of k-mer counting tables to use')
- parser.add_argument('-U', '--unique-kmers', type=int, default=0,
+ help='number of tables to use in k-mer countgraph')
+ parser.add_argument('-U', '--unique-kmers', type=float, default=0,
help='approximate number of unique kmers in the input'
' set')
+ parser.add_argument('--fp-rate', type=float, default=None,
+ help="Override the automatic FP rate setting for the"
+ " current script")
group = parser.add_mutually_exclusive_group()
group.add_argument('--max-tablesize', '-x', type=float,
@@ -77,70 +282,35 @@ def build_hash_args(descr=None, epilog=None, parser=None):
def build_counting_args(descr=None, epilog=None):
- """Build an ArgumentParser with args for counting_hash based scripts."""
- parser = build_hash_args(descr=descr, epilog=epilog)
- parser.hashtype = 'countgraph'
+ """Build an ArgumentParser with args for countgraph based scripts."""
+ parser = build_graph_args(descr=descr, epilog=epilog)
return parser
-def build_hashbits_args(descr=None, epilog=None, parser=None):
- """Build an ArgumentParser with args for hashbits based scripts."""
- parser = build_hash_args(descr=descr, epilog=epilog, parser=parser)
- parser.hashtype = 'nodegraph'
+def build_nodegraph_args(descr=None, epilog=None, parser=None):
+ """Build an ArgumentParser with args for nodegraph based scripts."""
+ parser = build_graph_args(descr=descr, epilog=epilog, parser=parser)
return parser
-# add an argument for loadhash with warning about parameters
-
-
-def add_loadhash_args(parser):
+# add an argument for loadgraph with warning about parameters
- class LoadAction(argparse.Action):
- def __call__(self, parser, namespace, values, option_string=None):
- setattr(namespace, self.dest, values)
+def add_loadgraph_args(parser):
+ parser.add_argument('-l', '--loadgraph', metavar="filename", default=None,
+ help='load a precomputed k-mer graph from disk')
- if getattr(namespace, 'ksize') != DEFAULT_K or \
- getattr(namespace, 'n_tables') != DEFAULT_N_TABLES or \
- getattr(namespace, 'max_tablesize') != DEFAULT_MAX_TABLESIZE:
- if values:
- print_error('''
-** WARNING: You are loading a saved k-mer table from
-** {hashfile}, but have set k-mer table parameters.
-** Your values for ksize, n_tables, and tablesize
-** will be ignored.'''.format(hashfile=values))
- if hasattr(parser, 'hashtype'):
- info = None
- if parser.hashtype == 'nodegraph':
- info = extract_hashbits_info(
- getattr(namespace, self.dest))
- elif parser.hashtype == 'countgraph':
- info = extract_countinghash_info(
- getattr(namespace, self.dest))
- if info:
- K = info[0]
- x = info[1]
- n = info[2]
- setattr(namespace, 'ksize', K)
- setattr(namespace, 'n_tables', n)
- setattr(namespace, 'max_tablesize', x)
-
- parser.add_argument('-l', '--loadtable', metavar="filename", default=None,
- help='load a precomputed k-mer table from disk',
- action=LoadAction)
-
-
-def calculate_tablesize(args, hashtype, multiplier=1.0):
- if hashtype not in ('countgraph', 'nodegraph'):
- raise ValueError("unknown graph type: %s" % (hashtype,))
+def calculate_graphsize(args, graphtype, multiplier=1.0):
+ if graphtype not in ('countgraph', 'nodegraph'):
+ raise ValueError("unknown graph type: %s" % (graphtype,))
if args.max_memory_usage:
- if hashtype == 'countgraph':
+ if graphtype == 'countgraph':
tablesize = args.max_memory_usage / args.n_tables / \
float(multiplier)
- elif hashtype == 'nodegraph':
+ elif graphtype == 'nodegraph':
tablesize = 8. * args.max_memory_usage / args.n_tables / \
float(multiplier)
else:
@@ -149,67 +319,66 @@ def calculate_tablesize(args, hashtype, multiplier=1.0):
return tablesize
-def create_nodegraph(args, ksize=None, multiplier=1.0):
+def create_nodegraph(args, ksize=None, multiplier=1.0, fp_rate=0.01):
+ """Creates and returns a nodegraph"""
+ args = _check_fp_rate(args, fp_rate)
if ksize is None:
ksize = args.ksize
if ksize > 32:
print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n")
sys.exit(1)
- tablesize = calculate_tablesize(args, 'nodegraph', multiplier)
- return khmer.Hashbits(ksize, tablesize, args.n_tables)
+ tablesize = calculate_graphsize(args, 'nodegraph', multiplier)
+ return khmer.Nodegraph(ksize, tablesize, args.n_tables)
-def create_countgraph(args, ksize=None, multiplier=1.0):
+def create_countgraph(args, ksize=None, multiplier=1.0, fp_rate=0.1):
+ """Creates and returns a countgraph"""
+ args = _check_fp_rate(args, fp_rate)
if ksize is None:
ksize = args.ksize
if ksize > 32:
print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n")
sys.exit(1)
- tablesize = calculate_tablesize(args, 'countgraph', multiplier=multiplier)
- return khmer.CountingHash(ksize, tablesize, args.n_tables)
+ tablesize = calculate_graphsize(args, 'countgraph', multiplier=multiplier)
+ return khmer.Countgraph(ksize, tablesize, args.n_tables)
-def report_on_config(args, hashtype='countgraph'):
+def report_on_config(args, graphtype='countgraph'):
"""Print out configuration.
Summarize the configuration produced by the command-line arguments
made available by this module.
"""
- from khmer.utils import print_error
- if hashtype not in ('countgraph', 'nodegraph'):
- raise ValueError("unknown graph type: %s" % (hashtype,))
-
- if args.quiet:
- return
-
- tablesize = calculate_tablesize(args, hashtype)
-
- print_error("\nPARAMETERS:")
- print_error(" - kmer size = {0} \t\t(-k)".format(args.ksize))
- print_error(" - n tables = {0} \t\t(-N)".format(args.n_tables))
- print_error(
- " - max tablesize = {0:5.2g} \t(-x)".format(tablesize)
- )
- print_error("")
- if hashtype == 'countgraph':
- print_error(
+ check_conflicting_args(args, graphtype)
+ if graphtype not in ('countgraph', 'nodegraph'):
+ raise ValueError("unknown graph type: %s" % (graphtype,))
+
+ tablesize = calculate_graphsize(args, graphtype)
+
+ log_info("\nPARAMETERS:")
+ log_info(" - kmer size = {ksize} \t\t(-k)", ksize=args.ksize)
+ log_info(" - n tables = {ntables} \t\t(-N)", ntables=args.n_tables)
+ log_info(" - max tablesize = {tsize:5.2g} \t(-x)", tsize=tablesize)
+ log_info("")
+ if graphtype == 'countgraph':
+ log_info(
"Estimated memory usage is {0:.2g} bytes "
"(n_tables x max_tablesize)".format(
args.n_tables * tablesize))
- elif hashtype == 'nodegraph':
- print_error(
+ elif graphtype == 'nodegraph':
+ log_info(
"Estimated memory usage is {0:.2g} bytes "
"(n_tables x max_tablesize / 8)".format(args.n_tables *
tablesize / 8)
)
- print_error("-" * 8)
+ log_info("-" * 8)
if DEFAULT_MAX_TABLESIZE == tablesize and \
- not getattr(args, 'loadtable', None):
- print_error('''\
+ not getattr(args, 'loadgraph', None):
+ log_warn('''\
** WARNING: tablesize is default!
** You probably want to increase this with -M/--max-memory-usage!
@@ -241,15 +410,14 @@ def info(scriptname, algorithm_list=None):
"""Print version and project info to stderr."""
import khmer
- sys.stderr.write("\n")
- sys.stderr.write("|| This is the script '%s' in khmer.\n"
- "|| You are running khmer version %s\n" %
- (scriptname, khmer.__version__,))
- sys.stderr.write("|| You are also using screed version %s\n||\n"
- % screed.__version__)
+ log_info("\n|| This is the script {name} in khmer.\n"
+ "|| You are running khmer version {version}",
+ name=scriptname, version=khmer.__version__)
+ log_info("|| You are also using screed version {version}\n||",
+ version=screed.__version__)
- sys.stderr.write("|| If you use this script in a publication, please "
- "cite EACH of the following:\n||\n")
+ log_info("|| If you use this script in a publication, please "
+ "cite EACH of the following:\n||")
if algorithm_list is None:
algorithm_list = []
@@ -257,17 +425,14 @@ def info(scriptname, algorithm_list=None):
algorithm_list.insert(0, 'software')
for alg in algorithm_list:
- sys.stderr.write("|| * ")
- algstr = _algorithms[alg].encode(
+ algstr = "|| * " + _algorithms[alg].encode(
'utf-8', 'surrogateescape').decode('utf-8', 'replace')
try:
- sys.stderr.write(algstr)
+ log_info(algstr)
except UnicodeEncodeError:
- sys.stderr.write(
- algstr.encode(sys.getfilesystemencoding(), 'replace'))
- sys.stderr.write("\n")
+ log_info(algstr.encode(sys.getfilesystemencoding(), 'replace'))
- sys.stderr.write("||\n|| Please see http://khmer.readthedocs.org/en/"
- "latest/citations.html for details.\n\n")
+ log_info("||\n|| Please see http://khmer.readthedocs.org/en/"
+ "latest/citations.html for details.\n")
# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
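The new sizing helpers are also usable directly, outside argument parsing.
For example, sizing a countgraph for ~5e8 unique k-mers (numbers illustrative;
fields per the namedtuple defined above):

    from khmer.khmer_args import optimal_size

    res = optimal_size(5e8, mem_cap=16e9)   # fix memory, estimate fp_rate
    print(res.num_htables, res.htable_size, res.fp_rate)

    res = optimal_size(5e8, fp_rate=0.01)   # fix fp_rate, estimate memory
    print(res.num_htables, res.htable_size, res.mem_use)

    # passing both (or neither) of mem_cap/fp_rate raises TypeError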
diff --git a/khmer/khmer_logger.py b/khmer/khmer_logger.py
new file mode 100644
index 0000000..8b2f0aa
--- /dev/null
+++ b/khmer/khmer_logger.py
@@ -0,0 +1,53 @@
+#
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
+# the three-clause BSD license; see LICENSE.
+# Contact: khmer-project at idyll.org
+#
+# Lightweight logging framework for khmer
+
+from __future__ import print_function, unicode_literals
+import sys
+
+global __QUIET__
+__QUIET__ = False
+
+
+def configure_logging(quiet):
+ global __QUIET__
+ __QUIET__ = quiet
+
+
+def log_info(message, **kwargs):
+ """For non-critical informative/status output to stderr."""
+ global __QUIET__
+ if not __QUIET__:
+ if kwargs:
+ message = message.format(**kwargs)
+ print(message, file=sys.stderr)
+
+
+def log_error(message, **kwargs):
+ """For critical error output to stderr."""
+ if kwargs:
+ message = message.format(**kwargs)
+ print(message, file=sys.stderr)
+
+
+def log_debug(message, **kwargs):
+ """For non-critical debug output to stderr."""
+ global __QUIET__
+ if not __QUIET__:
+ if kwargs:
+ message = message.format(**kwargs)
+ print(message, file=sys.stderr)
+
+
+def log_warn(message, **kwargs):
+ """For critical warning output to stderr."""
+ if kwargs:
+ message = message.format(**kwargs)
+ print(message, file=sys.stderr)
+
+
+# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
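Usage of the new logger, matching how the khmer_args.py changes above adopt
it (a minimal sketch):

    from khmer.khmer_logger import configure_logging, log_info, log_warn

    configure_logging(False)                 # quiet=False: info is printed
    log_info("processed {n} reads", n=1000)  # str.format-style kwargs
    log_warn("low disk space")               # warnings print even when quiet
    configure_logging(True)
    log_info("this message is suppressed")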
diff --git a/khmer/thread_utils.py b/khmer/thread_utils.py
index df997b3..e732b9f 100644
--- a/khmer/thread_utils.py
+++ b/khmer/thread_utils.py
@@ -13,7 +13,7 @@ import threading
import sys
import screed
from khmer import utils
-
+from khmer.utils import write_record
# stdlib queue module was renamed on Python 3
try:
import queue
@@ -189,11 +189,13 @@ class ThreadedSequenceProcessor(object):
except queue.Empty:
continue
- for name, seq, quality in g.seqlist:
- if quality: # write FASTQ; CTB hack.
- outfp.write('@%s\n%s\n+\n%s\n' % (name, seq, quality))
+ for name, seq, qual in g.seqlist:
+ if qual:
+ record = screed.Record(name=name, sequence=seq,
+ quality=qual)
else:
- outfp.write('>%s\n%s\n' % (name, seq,))
+ record = screed.Record(name=name, sequence=seq)
+ write_record(record, outfp)
if self.verbose:
print("DONE writing.\nprocessed %d / wrote %d / removed %d" %
diff --git a/khmer/utils.py b/khmer/utils.py
index 3abb11e..6896e83 100644
--- a/khmer/utils.py
+++ b/khmer/utils.py
@@ -106,6 +106,13 @@ def check_is_right(name):
return False
+class UnpairedReadsError(ValueError):
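+ """ValueError raised for an unpaired read when require_paired is set."""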
+ def __init__(self, msg, r1, r2):
+ super(UnpairedReadsError, self).__init__(msg)
+ self.r1 = r1
+ self.r2 = r2
+
+
def broken_paired_reader(screed_iter, min_length=None,
force_single=False, require_paired=False):
"""Read pairs from a stream.
@@ -152,8 +159,9 @@ def broken_paired_reader(screed_iter, min_length=None,
record = None
else: # orphan.
if require_paired:
- raise ValueError("Unpaired reads when require_paired"
- " is set!")
+ e = UnpairedReadsError("Unpaired reads when require_paired"
+ " is set!", prev_record, record)
+ raise e
yield n, False, prev_record, None
n += 1
@@ -163,7 +171,8 @@ def broken_paired_reader(screed_iter, min_length=None,
# handle the last record, if it exists (i.e. last two records not a pair)
if prev_record:
if require_paired:
- raise ValueError("Unpaired reads when require_paired is set!")
+ raise UnpairedReadsError("Unpaired reads when require_paired "
+ "is set!", prev_record, None)
yield n, False, prev_record, None
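Callers that set require_paired can now recover the offending records from
the exception itself. A minimal sketch ('reads.fq' is a placeholder input):

    import screed
    from khmer.utils import broken_paired_reader, UnpairedReadsError

    try:
        for n, is_pair, read1, read2 in broken_paired_reader(
                screed.open('reads.fq'), require_paired=True):
            pass  # process the pair
    except UnpairedReadsError as err:
        # err.r2 is None when the orphan was the final read in the stream
        print("orphan read:", err.r1, err.r2)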
diff --git a/lib/Makefile b/lib/Makefile
index 2bd68d4..da02e8d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -56,7 +56,7 @@ CPPFLAGS += $(SEQAN_FLAGS)
CXXFLAGS ?=
CXXFLAGS += $(COMMON_FLAGS) $(WARNINGS)
-CXXFLAGS += -Wstrict-null-sentinel
+CXXFLAGS += -Wstrict-null-sentinel -std=c++11
CXXFLAGS += $(INCLUDES) $(CPPFLAGS)
CFLAGS ?=
@@ -195,6 +195,7 @@ LIBKHMER_OBJS= \
hllcounter.o \
kmer_hash.o \
labelhash.o \
+ traversal.o \
read_aligner.o \
read_parsers.o \
subset.o \
@@ -223,6 +224,7 @@ KHMER_HEADERS= \
khmer.hh \
kmer_hash.hh \
labelhash.hh \
+ traversal.hh \
primes.hh \
read_aligner.hh \
read_parsers.hh \
diff --git a/lib/counting.cc b/lib/counting.cc
index 2b9b921..ba87770 100644
--- a/lib/counting.cc
+++ b/lib/counting.cc
@@ -14,6 +14,7 @@
#include "hashbits.hh"
#include "hashtable.hh"
#include "khmer_exception.hh"
+#include "kmer_hash.hh"
#include "read_parsers.hh"
#include "zlib.h"
@@ -60,14 +61,9 @@ void CountingHash::output_fasta_kmer_pos_freq(
outfile.close();
}
-const HashIntoType CountingHash::n_unique_kmers() const
-{
- return _n_unique_kmers;
-}
-
BoundedCounterType CountingHash::get_min_count(const std::string &s)
{
- KMerIterator kmers(s.c_str(), _ksize);
+ KmerIterator kmers(s.c_str(), _ksize);
BoundedCounterType min_count = MAX_KCOUNT;
@@ -85,7 +81,7 @@ BoundedCounterType CountingHash::get_min_count(const std::string &s)
BoundedCounterType CountingHash::get_max_count(const std::string &s)
{
- KMerIterator kmers(s.c_str(), _ksize);
+ KmerIterator kmers(s.c_str(), _ksize);
BoundedCounterType max_count = 0;
@@ -133,7 +129,7 @@ CountingHash::abundance_distribution(
seq = read.sequence;
if (check_and_normalize_read(seq)) {
- KMerIterator kmers(seq.c_str(), _ksize);
+ KmerIterator kmers(seq.c_str(), _ksize);
while(!kmers.done()) {
HashIntoType kmer = kmers.next();
@@ -292,43 +288,6 @@ void CountingHash::load(std::string infilename)
CountingHashFile::load(infilename, *this);
}
-void CountingHash::get_kadian_count(
- const std::string &s,
- BoundedCounterType &kadian,
- unsigned int nk)
-{
- std::vector<BoundedCounterType> counts;
- KMerIterator kmers(s.c_str(), _ksize);
-
- while(!kmers.done()) {
- HashIntoType kmer = kmers.next();
- BoundedCounterType count = this->get_count(kmer);
- counts.push_back(count);
- }
-
- if (!counts.size()) {
- throw khmer_exception();
- }
- unsigned int kpos = nk * _ksize;
-
- if (counts.size() < kpos) {
- kadian = 0;
-
- return;
- }
-
- sort(counts.begin(), counts.end());
- kadian = counts[kpos - 1];
-
-#if 0
- std::cout << "k " << kpos << ": ";
- for (unsigned int i = 0; i < counts.size(); i++) {
- std::cout << i << "-" << counts[i] << " ";
- }
- std::cout << "\n";
-#endif // 0
-}
-
unsigned long CountingHash::trim_on_abundance(
std::string seq,
BoundedCounterType min_abund)
@@ -338,7 +297,7 @@ const
return 0;
}
- KMerIterator kmers(seq.c_str(), _ksize);
+ KmerIterator kmers(seq.c_str(), _ksize);
HashIntoType kmer;
@@ -364,7 +323,6 @@ const
return seq.length();
}
-
unsigned long CountingHash::trim_below_abundance(
std::string seq,
BoundedCounterType max_abund)
@@ -374,7 +332,7 @@ const
return 0;
}
- KMerIterator kmers(seq.c_str(), _ksize);
+ KmerIterator kmers(seq.c_str(), _ksize);
HashIntoType kmer;
@@ -410,7 +368,7 @@ const
throw khmer_exception("invalid read");
}
- KMerIterator kmers(seq.c_str(), _ksize);
+ KmerIterator kmers(seq.c_str(), _ksize);
HashIntoType kmer = kmers.next();
if (kmers.done()) {
@@ -524,6 +482,7 @@ CountingHashFileReader::CountingHashFileReader(
unsigned int save_ksize = 0;
unsigned char save_n_tables = 0;
unsigned long long save_tablesize = 0;
+ unsigned long long save_occupied_bins = 0;
char signature [4];
unsigned char version = 0, ht_type = 0, use_bigcount = 0;
@@ -554,9 +513,11 @@ CountingHashFileReader::CountingHashFileReader(
infile.read((char *) &use_bigcount, 1);
infile.read((char *) &save_ksize, sizeof(save_ksize));
infile.read((char *) &save_n_tables, sizeof(save_n_tables));
+ infile.read((char *) &save_occupied_bins, sizeof(save_occupied_bins));
ht._ksize = (WordLength) save_ksize;
ht._n_tables = (unsigned int) save_n_tables;
+ ht._occupied_bins = save_occupied_bins;
ht._init_bitstuff();
ht._use_bigcount = use_bigcount;
@@ -635,6 +596,7 @@ CountingHashGzFileReader::CountingHashGzFileReader(
unsigned int save_ksize = 0;
unsigned char save_n_tables = 0;
unsigned long long save_tablesize = 0;
+ unsigned long long save_occupied_bins = 0;
char signature [4];
unsigned char version, ht_type, use_bigcount;
@@ -674,8 +636,10 @@ CountingHashGzFileReader::CountingHashGzFileReader(
int read_k = gzread(infile, (char *) &save_ksize, sizeof(save_ksize));
int read_nt = gzread(infile, (char *) &save_n_tables,
sizeof(save_n_tables));
+ int read_ob = gzread(infile, (char *) &save_occupied_bins,
+ sizeof(save_occupied_bins));
- if (read_b <= 0 || read_k <= 0 || read_nt <= 0) {
+ if (read_b <= 0 || read_k <= 0 || read_nt <= 0 || read_ob <= 0) {
std::string err = "K-mer count file header read error: " + infilename
+ " " + strerror(errno);
gzclose(infile);
@@ -683,6 +647,7 @@ CountingHashGzFileReader::CountingHashGzFileReader(
}
ht._ksize = (WordLength) save_ksize;
+ ht._occupied_bins = save_occupied_bins;
ht._n_tables = (unsigned int) save_n_tables;
ht._init_bitstuff();
@@ -795,6 +760,7 @@ CountingHashFileWriter::CountingHashFileWriter(
unsigned int save_ksize = ht._ksize;
unsigned char save_n_tables = ht._n_tables;
unsigned long long save_tablesize;
+ unsigned long long save_occupied_bins = ht._occupied_bins;
ofstream outfile(outfilename.c_str(), ios::binary);
@@ -813,6 +779,8 @@ CountingHashFileWriter::CountingHashFileWriter(
outfile.write((const char *) &save_ksize, sizeof(save_ksize));
outfile.write((const char *) &save_n_tables, sizeof(save_n_tables));
+ outfile.write((const char *) &save_occupied_bins,
+ sizeof(save_occupied_bins));
for (unsigned int i = 0; i < save_n_tables; i++) {
save_tablesize = ht._tablesizes[i];
@@ -850,6 +818,7 @@ CountingHashGzFileWriter::CountingHashGzFileWriter(
unsigned int save_ksize = ht._ksize;
unsigned char save_n_tables = ht._n_tables;
unsigned long long save_tablesize;
+ unsigned long long save_occupied_bins = ht._occupied_bins;
gzFile outfile = gzopen(outfilename.c_str(), "wb");
if (outfile == NULL) {
@@ -876,6 +845,8 @@ CountingHashGzFileWriter::CountingHashGzFileWriter(
gzwrite(outfile, (const char *) &save_ksize, sizeof(save_ksize));
gzwrite(outfile, (const char *) &save_n_tables, sizeof(save_n_tables));
+ gzwrite(outfile, (const char *) &save_occupied_bins,
+ sizeof(save_occupied_bins));
for (unsigned int i = 0; i < save_n_tables; i++) {
save_tablesize = ht._tablesizes[i];
@@ -970,7 +941,7 @@ void CountingHash::collect_high_abundance_kmers(
if (check_and_normalize_read(currSeq)) {
const char * sp = currSeq.c_str();
- KMerIterator kmers(sp, _ksize);
+ KmerIterator kmers(sp, _ksize);
while(!kmers.done()) {
HashIntoType kmer = kmers.next();
@@ -1013,7 +984,7 @@ void CountingHash::collect_high_abundance_kmers(
if (check_and_normalize_read(currSeq)) {
const char * sp = currSeq.c_str();
- KMerIterator kmers(sp, _ksize);
+ KmerIterator kmers(sp, _ksize);
while(!kmers.done()) {
HashIntoType kmer = kmers.next();
diff --git a/lib/counting.hh b/lib/counting.hh
index ac8b70e..9eaf351 100644
--- a/lib/counting.hh
+++ b/lib/counting.hh
@@ -56,6 +56,7 @@ protected:
std::vector<HashIntoType> _tablesizes;
size_t _n_tables;
HashIntoType _n_unique_kmers;
+ HashIntoType _occupied_bins;
Byte ** _counts;
@@ -74,7 +75,7 @@ public:
CountingHash( WordLength ksize, HashIntoType single_tablesize ) :
khmer::Hashtable(ksize), _use_bigcount(false),
- _bigcount_spin_lock(false), _n_unique_kmers(0)
+ _bigcount_spin_lock(false), _n_unique_kmers(0), _occupied_bins(0)
{
_tablesizes.push_back(single_tablesize);
@@ -83,7 +84,8 @@ public:
CountingHash( WordLength ksize, std::vector<HashIntoType>& tablesizes ) :
khmer::Hashtable(ksize), _use_bigcount(false),
- _bigcount_spin_lock(false), _tablesizes(tablesizes), _n_unique_kmers(0)
+ _bigcount_spin_lock(false), _tablesizes(tablesizes),
+ _n_unique_kmers(0), _occupied_bins(0)
{
_allocate_counters();
@@ -132,7 +134,10 @@ public:
return _tablesizes;
}
- virtual const HashIntoType n_unique_kmers() const;
+ virtual const HashIntoType n_unique_kmers() const
+ {
+ return _n_unique_kmers;
+ }
void set_use_bigcount(bool b)
{
@@ -146,31 +151,15 @@ public:
virtual void save(std::string);
virtual void load(std::string);
- // accessors to get table info
- const HashIntoType n_entries() const
- {
- return _tablesizes[0];
- }
-
const size_t n_tables() const
{
return _n_tables;
}
// count number of occupied bins
- virtual const HashIntoType n_occupied(HashIntoType start=0,
- HashIntoType stop=0) const
+ virtual const HashIntoType n_occupied() const
{
- HashIntoType n = 0;
- if (stop == 0) {
- stop = _tablesizes[0];
- }
- for (HashIntoType i = start; i < stop; i++) {
- if (_counts[0][i % _tablesizes[0]]) {
- n++;
- }
- }
- return n;
+ return _occupied_bins;
}
virtual void count(const char * kmer)
@@ -181,14 +170,19 @@ public:
virtual void count(HashIntoType khash)
{
- bool is_new_kmer = true;
+ bool is_new_kmer = false;
unsigned int n_full = 0;
for (unsigned int i = 0; i < _n_tables; i++) {
const HashIntoType bin = khash % _tablesizes[i];
Byte current_count = _counts[ i ][ bin ];
- if (is_new_kmer && current_count != 0) {
- is_new_kmer = false;
+ if (!is_new_kmer) {
+ if (current_count == 0) {
+ is_new_kmer = true;
+ if (i == 0) {
+ __sync_add_and_fetch(&_occupied_bins, 1);
+ }
+ }
}
// NOTE: Technically, multiple threads can cause the bin to spill
// over max_count a little, if they all read it as less than
@@ -255,10 +249,6 @@ public:
BoundedCounterType get_max_count(const std::string &s);
- void get_kadian_count(const std::string &s,
- BoundedCounterType &kadian,
- unsigned int nk = 1);
-
HashIntoType * abundance_distribution(read_parsers::IParser * parser,
Hashbits * tracking);
HashIntoType * abundance_distribution(std::string filename,
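Because _occupied_bins is now maintained at count time and serialized with
the table header (see the counting.cc hunks above), n_occupied() is O(1) and
survives a save/load round trip. A sketch of the observable effect, assuming
the existing Python-level consume()/save()/load()/n_occupied() bindings
('example.ct' is a placeholder filename):

    import khmer

    cg = khmer.Countgraph(20, 1e6, 4)   # illustrative sizes
    cg.consume('ATGGCTAGCTAGCTAGCTAGATCG')
    before = cg.n_occupied()
    cg.save('example.ct')

    cg2 = khmer.Countgraph(20, 1e6, 4)
    cg2.load('example.ct')
    assert cg2.n_occupied() == before   # restored from header, not rescanned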
diff --git a/lib/hashbits.cc b/lib/hashbits.cc
index f103e02..2b62399 100644
--- a/lib/hashbits.cc
+++ b/lib/hashbits.cc
@@ -26,6 +26,7 @@ void Hashbits::save(std::string outfilename)
unsigned int save_ksize = _ksize;
unsigned char save_n_tables = _n_tables;
unsigned long long save_tablesize;
+ unsigned long long save_occupied_bins = _occupied_bins;
ofstream outfile(outfilename.c_str(), ios::binary);
@@ -38,6 +39,8 @@ void Hashbits::save(std::string outfilename)
outfile.write((const char *) &save_ksize, sizeof(save_ksize));
outfile.write((const char *) &save_n_tables, sizeof(save_n_tables));
+ outfile.write((const char *) &save_occupied_bins,
+ sizeof(save_occupied_bins));
for (unsigned int i = 0; i < _n_tables; i++) {
save_tablesize = _tablesizes[i];
@@ -91,6 +94,7 @@ void Hashbits::load(std::string infilename)
unsigned int save_ksize = 0;
unsigned char save_n_tables = 0;
unsigned long long save_tablesize = 0;
+ unsigned long long save_occupied_bins = 0;
char signature[4];
unsigned char version, ht_type;
@@ -120,9 +124,11 @@ void Hashbits::load(std::string infilename)
infile.read((char *) &save_ksize, sizeof(save_ksize));
infile.read((char *) &save_n_tables, sizeof(save_n_tables));
+ infile.read((char *) &save_occupied_bins, sizeof(save_occupied_bins));
_ksize = (WordLength) save_ksize;
_n_tables = (unsigned int) save_n_tables;
+ _occupied_bins = save_occupied_bins;
_init_bitstuff();
_counts = new Byte*[_n_tables];
@@ -156,122 +162,6 @@ void Hashbits::load(std::string infilename)
}
}
-/**
- * Checks for non-ACGT characters before consuming read.
- * This is specifically for counting overlap k-mers.
- */
-unsigned int Hashbits::check_and_process_read_overlap(std::string &read,
- bool &is_valid,
- Hashbits &ht2)
-{
- is_valid = check_and_normalize_read(read);
-
- if (!is_valid) {
- return 0;
- }
-
- return consume_string_overlap(read, ht2);
-}
-
-/**
- * Consume a FASTA file of reads.
- */
-void Hashbits::consume_fasta_overlap(const std::string &filename,
- HashIntoType curve[2][100],Hashbits &ht2,
- unsigned int &total_reads,
- unsigned long long &n_consumed)
-{
- total_reads = 0;
- n_consumed = 0;
- Read read;
-
-//get total number of reads in dataset
-
- IParser* parser = IParser::get_parser(filename.c_str());
- while(!parser->is_complete()) {
- try {
- read = parser->get_next_read();
- } catch (NoMoreReadsAvailable &exc) {
- break;
- }
- total_reads++;
- }
-//block size for curve
- int block_size = total_reads/100;
-
-// reads number <100, block size =1
- if (block_size == 0) {
- block_size = 1;
- }
-// set the remaining as 0
- for (int n=total_reads; n<100; n++) {
- curve[0][n] = 0;
- curve[1][n] = 0;
- }
-
- total_reads = 0;
-
- delete parser;
- parser = IParser::get_parser(filename.c_str());
-
-
-
- string currSeq = "";
-
- //
- // iterate through the FASTA file & consume the reads.
- //
-
- while(!parser->is_complete()) {
- try {
- read = parser->get_next_read();
- } catch (NoMoreReadsAvailable &exc) {
- break;
- }
- currSeq = read.sequence;
-
- unsigned int this_n_consumed;
- bool is_valid;
-
- this_n_consumed = check_and_process_read_overlap(currSeq,
- is_valid, ht2);
-
- n_consumed += this_n_consumed;
-
- // reset the sequence info, increment read number
-
- total_reads++;
-
- if (total_reads%block_size == 0) {
- curve[0][total_reads/block_size-1] = n_overlap_kmers();
- curve[1][total_reads/block_size-1] = n_unique_kmers();
- }
- } // while
-
- delete parser;
-}
-
-/**
- * Run through every k-mer in the given string, & hash it.
- */
-unsigned int Hashbits::consume_string_overlap(const std::string &s,
- Hashbits &ht2)
-{
- const char * sp = s.c_str();
- unsigned int n_consumed = 0;
-
- KMerIterator kmers(sp, _ksize);
-
- while(!kmers.done()) {
- HashIntoType kmer = kmers.next();
-
- count_overlap(kmer,ht2);
- n_consumed++;
- }
-
- return n_consumed;
-}
-
void Hashbits::update_from(const Hashbits &other)
{
if (_ksize != other._ksize) {
diff --git a/lib/hashbits.hh b/lib/hashbits.hh
index 2eb2e7e..4620a92 100644
--- a/lib/hashbits.hh
+++ b/lib/hashbits.hh
@@ -29,7 +29,6 @@ protected:
size_t _n_tables;
HashIntoType _occupied_bins;
HashIntoType _n_unique_kmers;
- HashIntoType _n_overlap_kmers;
Byte ** _counts;
virtual void _allocate_counters()
@@ -54,7 +53,6 @@ public:
{
_occupied_bins = 0;
_n_unique_kmers = 0;
- _n_overlap_kmers = 0;
_allocate_counters();
}
@@ -88,32 +86,15 @@ public:
virtual void save(std::string);
virtual void load(std::string);
- // for overlap k-mer counting
- void consume_fasta_overlap(const std::string &filename,
- HashIntoType curve[2][100],
- khmer::Hashbits &ht2,
- unsigned int &total_reads,
- unsigned long long &n_consumed);
-
- // just for overlap k-mer counting!
- unsigned int check_and_process_read_overlap(std::string &read,
- bool &is_valid,
- khmer::Hashbits &ht2);
- // for overlap k-mer counting!
- unsigned int consume_string_overlap(const std::string &s,
- khmer::Hashbits &ht2);
-
// count number of occupied bins
- virtual const HashIntoType n_occupied(HashIntoType start=0,
- HashIntoType stop=0) const
+ virtual const HashIntoType n_occupied() const
{
- // @@ CTB need to be able to *save* this...
- return _occupied_bins/_n_tables;
+ return _occupied_bins;
}
virtual const HashIntoType n_unique_kmers() const
{
- return _n_unique_kmers; // @@ CTB need to be able to *save* this...
+ return _n_unique_kmers;
}
// Get and set the hashbits for the given kmer.
@@ -145,7 +126,9 @@ public:
unsigned char bits_orig = __sync_fetch_and_or( *(_counts + i) + byte, bit );
if (!(bits_orig & bit)) {
- __sync_add_and_fetch( &_occupied_bins, 1 );
+ if (i == 0) {
+ __sync_add_and_fetch( &_occupied_bins, 1 );
+ }
is_new_kmer = true;
}
} // iteration over hashtables
@@ -158,12 +141,6 @@ public:
return 0; // kmer already seen
} // test_and_set_bits
- virtual const HashIntoType n_overlap_kmers(HashIntoType start=0,
- HashIntoType stop=0) const
- {
- return _n_overlap_kmers; // @@ CTB need to be able to *save* this...
- }
-
virtual void count(const char * kmer)
{
HashIntoType hash = _hash(kmer, _ksize);
@@ -172,63 +149,7 @@ public:
virtual void count(HashIntoType khash)
{
- bool is_new_kmer = false;
-
- for (size_t i = 0; i < _n_tables; i++) {
- HashIntoType bin = khash % _tablesizes[i];
- HashIntoType byte = bin / 8;
- unsigned char bit = bin % 8;
- if (!( _counts[i][byte] & (1<<bit))) {
- _occupied_bins += 1;
- is_new_kmer = true;
- }
- _counts[i][byte] |= (1 << bit);
- }
- if (is_new_kmer) {
- _n_unique_kmers +=1;
- }
- }
-
- virtual bool check_overlap(HashIntoType khash, Hashbits &ht2)
- {
-
- for (size_t i = 0; i < ht2._n_tables; i++) {
- HashIntoType bin = khash % ht2._tablesizes[i];
- HashIntoType byte = bin / 8;
- unsigned char bit = bin % 8;
- if (!( ht2._counts[i][byte] & (1<<bit))) {
- return false;
- }
- }
- return true;
- }
-
- virtual void count_overlap(const char * kmer, Hashbits &ht2)
- {
- HashIntoType hash = _hash(kmer, _ksize);
- count_overlap(hash,ht2);
- }
-
- virtual void count_overlap(HashIntoType khash, Hashbits &ht2)
- {
- bool is_new_kmer = false;
-
- for (size_t i = 0; i < _n_tables; i++) {
- HashIntoType bin = khash % _tablesizes[i];
- HashIntoType byte = bin / 8;
- unsigned char bit = bin % 8;
- if (!( _counts[i][byte] & (1<<bit))) {
- _occupied_bins += 1;
- is_new_kmer = true;
- }
- _counts[i][byte] |= (1 << bit);
- }
- if (is_new_kmer) {
- _n_unique_kmers +=1;
- if (check_overlap(khash,ht2)) {
- _n_overlap_kmers +=1;
- }
- }
+ test_and_set_bits(khash);
}
// get the count for the given k-mer.
@@ -252,11 +173,6 @@ public:
}
return 1;
}
- // accessors to get table info
- const HashIntoType n_entries() const
- {
- return _tablesizes[0];
- }
void update_from(const Hashbits &other);
};
diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 85d417d..5a1258a 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -129,7 +129,7 @@ unsigned int Hashtable::consume_string(const std::string &s)
const char * sp = s.c_str();
unsigned int n_consumed = 0;
- KMerIterator kmers(sp, _ksize);
+ KmerIterator kmers(sp, _ksize);
while(!kmers.done()) {
HashIntoType kmer = kmers.next();
@@ -181,7 +181,7 @@ void Hashtable::get_median_count(const std::string &s,
bool Hashtable::median_at_least(const std::string &s,
unsigned int cutoff)
{
- KMerIterator kmers(s.c_str(), _ksize);
+ KmerIterator kmers(s.c_str(), _ksize);
unsigned int min_req = 0.5 + float(s.size() - _ksize + 1) / 2;
unsigned int num_cutoff_kmers = 0;
@@ -338,7 +338,7 @@ void Hashtable::consume_sequence_and_tag(const std::string& seq,
{
bool kmer_tagged;
- KMerIterator kmers(seq.c_str(), _ksize);
+ KmerIterator kmers(seq.c_str(), _ksize);
HashIntoType kmer;
unsigned int since = _tag_density / 2 + 1;
@@ -501,7 +501,7 @@ void Hashtable::consume_fasta_and_tag_with_stoptags(const std::string &filename,
if (check_and_normalize_read(seq)) { // process?
bool is_new_kmer;
- KMerIterator kmers(seq.c_str(), _ksize);
+ KmerIterator kmers(seq.c_str(), _ksize);
HashIntoType kmer, last_kmer;
bool is_first_kmer = true;
@@ -672,10 +672,10 @@ void Hashtable::consume_fasta_and_traverse(const std::string &filename,
seq = read.sequence;
if (check_and_normalize_read(seq)) { // process?
- KMerIterator kmers(seq.c_str(), _ksize);
+ KmerIterator kmers(seq.c_str(), _ksize);
- HashIntoType kmer = 0;
bool is_first_kmer = true;
+ Kmer kmer(0,0,0);
while (!kmers.done()) {
kmer = kmers.next();
@@ -687,8 +687,7 @@ void Hashtable::consume_fasta_and_traverse(const std::string &filename,
}
if (!is_first_kmer) { // traverse
- SeenSet keeper;
-
+ KmerSet keeper;
unsigned int n = traverse_from_kmer(kmer, radius, keeper);
if (n >= big_threshold) {
#if VERBOSE_REPARTITION
@@ -713,150 +712,69 @@ void Hashtable::consume_fasta_and_traverse(const std::string &filename,
//////////////////////////////////////////////////////////////////////
// graph stuff
-void Hashtable::calc_connected_graph_size(const HashIntoType kmer_f,
- const HashIntoType kmer_r,
+void Hashtable::calc_connected_graph_size(Kmer start,
unsigned long long& count,
- SeenSet& keeper,
+ KmerSet& keeper,
const unsigned long long threshold,
bool break_on_circum)
const
{
- HashIntoType kmer = uniqify_rc(kmer_f, kmer_r);
- const BoundedCounterType val = get_count(kmer);
+ const BoundedCounterType val = get_count(start);
if (val == 0) {
return;
}
- // have we already seen me? don't count; exit.
- if (set_contains(keeper, kmer)) {
- return;
- }
-
- // is this in stop_tags?
- if (set_contains(stop_tags, kmer)) {
- return;
- }
-
- // keep track of both seen kmers, and counts.
- keeper.insert(kmer);
-
- // is this a high-circumference k-mer? if so, don't count it; get outta here!
- if (break_on_circum && \
- kmer_degree(kmer_f, kmer_r) > 4) {
- return;
- }
-
- count += 1;
-
- // are we past the threshold? truncate search.
- if (threshold && count >= threshold) {
- return;
- }
-
- // otherwise, explore in all directions.
-
- // NEXT.
-
- HashIntoType f, r;
- const unsigned int rc_left_shift = _ksize*2 - 2;
+ Traverser traverser(this);
+ KmerQueue node_q;
+ node_q.push(start);
- f = next_f(kmer_f, 'A');
- r = next_r(kmer_r, 'A');
- calc_connected_graph_size(f, r, count, keeper, threshold, break_on_circum);
+ // Avoid high-circumference k-mers
+ auto filter = [&] (Kmer& n) { return !(break_on_circum &&
+ traverser.degree(n) > 4); };
- f = next_f(kmer_f, 'C');
- r = next_r(kmer_r, 'C');
- calc_connected_graph_size(f, r, count, keeper, threshold, break_on_circum);
-
- f = next_f(kmer_f, 'G');
- r = next_r(kmer_r, 'G');
- calc_connected_graph_size(f, r, count, keeper, threshold, break_on_circum);
-
- f = next_f(kmer_f, 'T');
- r = next_r(kmer_r, 'T');
- calc_connected_graph_size(f, r, count, keeper, threshold, break_on_circum);
+ while(!node_q.empty()) {
+ Kmer node = node_q.front();
+ node_q.pop();
- // PREVIOUS.
+ // have we already seen me? don't count; exit.
+ if (set_contains(keeper, node)) {
+ continue;
+ }
+ // is this in stop_tags?
+ if (set_contains(stop_tags, node)) {
+ continue;
+ }
- r = prev_r(kmer_r, 'A');
- f = prev_f(kmer_f, 'A');
- calc_connected_graph_size(f, r, count, keeper, threshold, break_on_circum);
+ // keep track of both seen kmers, and counts.
+ keeper.insert(node);
- r = prev_r(kmer_r, 'C');
- f = prev_f(kmer_f, 'C');
- calc_connected_graph_size(f, r, count, keeper, threshold, break_on_circum);
+ count += 1;
- r = prev_r(kmer_r, 'G');
- f = prev_f(kmer_f, 'G');
- calc_connected_graph_size(f, r, count, keeper, threshold, break_on_circum);
+ // are we past the threshold? truncate search.
+ if (threshold && count >= threshold) {
+ return;
+ }
- r = prev_r(kmer_r, 'T');
- f = prev_f(kmer_f, 'T');
- calc_connected_graph_size(f, r, count, keeper, threshold, break_on_circum);
+ // otherwise, explore in all directions.
+ traverser.traverse_right(node, node_q, filter);
+ traverser.traverse_left(node, node_q, filter);
+ }
}
unsigned int Hashtable::kmer_degree(HashIntoType kmer_f, HashIntoType kmer_r)
-const
{
- unsigned int neighbors = 0;
-
- const unsigned int rc_left_shift = _ksize*2 - 2;
-
- HashIntoType f, r;
-
- // NEXT.
- f = next_f(kmer_f, 'A');
- r = next_r(kmer_r, 'A');
- if (get_count(uniqify_rc(f, r))) {
- neighbors++;
- }
-
- f = next_f(kmer_f, 'C');
- r = next_r(kmer_r, 'C');
- if (get_count(uniqify_rc(f, r))) {
- neighbors++;
- }
-
- f = next_f(kmer_f, 'G');
- r = next_r(kmer_r, 'G');
- if (get_count(uniqify_rc(f, r))) {
- neighbors++;
- }
-
- f = next_f(kmer_f, 'T');
- r = next_r(kmer_r, 'T');
- if (get_count(uniqify_rc(f, r))) {
- neighbors++;
- }
-
- // PREVIOUS.
- r = prev_r(kmer_r, 'A');
- f = prev_f(kmer_f, 'A');
- if (get_count(uniqify_rc(f, r))) {
- neighbors++;
- }
-
- r = prev_r(kmer_r, 'C');
- f = prev_f(kmer_f, 'C');
- if (get_count(uniqify_rc(f, r))) {
- neighbors++;
- }
-
- r = prev_r(kmer_r, 'G');
- f = prev_f(kmer_f, 'G');
- if (get_count(uniqify_rc(f, r))) {
- neighbors++;
- }
-
- r = prev_r(kmer_r, 'T');
- f = prev_f(kmer_f, 'T');
- if (get_count(uniqify_rc(f, r))) {
- neighbors++;
- }
+ Traverser traverser(this);
+ Kmer node = build_kmer(kmer_f, kmer_r);
+ return traverser.degree(node);
+}
- return neighbors;
+unsigned int Hashtable::kmer_degree(const char * kmer_s)
+{
+ Traverser traverser(this);
+ Kmer node = build_kmer(kmer_s);
+ return traverser.degree(node);
}
void Hashtable::filter_if_present(const std::string &infilename,
@@ -882,7 +800,7 @@ void Hashtable::filter_if_present(const std::string &infilename,
seq = read.sequence;
if (check_and_normalize_read(seq)) {
- KMerIterator kmers(seq.c_str(), _ksize);
+ KmerIterator kmers(seq.c_str(), _ksize);
bool keep = true;
while (!kmers.done()) {
@@ -909,146 +827,13 @@ void Hashtable::filter_if_present(const std::string &infilename,
return;
}
-
-unsigned int Hashtable::count_kmers_within_radius(HashIntoType kmer_f,
- HashIntoType kmer_r,
- unsigned int radius,
- unsigned int max_count,
- const SeenSet * seen)
-const
-{
- HashIntoType f, r;
- NodeQueue node_q;
- std::queue<unsigned int> breadth_q;
- unsigned int cur_breadth = 0;
-
- const unsigned int rc_left_shift = _ksize*2 - 2;
- unsigned int total = 0;
-
- SeenSet keeper; // keep track of traversed kmers
- if (seen) {
- keeper = *seen;
- }
-
- // start breadth-first search.
-
- node_q.push(kmer_f);
- node_q.push(kmer_r);
- breadth_q.push(0);
-
- while(!node_q.empty()) {
- kmer_f = node_q.front();
- node_q.pop();
- kmer_r = node_q.front();
- node_q.pop();
- unsigned int breadth = breadth_q.front();
- breadth_q.pop();
-
- if (breadth > radius) {
- break;
- }
-
- HashIntoType kmer = uniqify_rc(kmer_f, kmer_r);
- if (set_contains(keeper, kmer)) {
- continue;
- }
-
- // keep track of seen kmers
- keeper.insert(kmer);
- total++;
-
- if (max_count && total > max_count) {
- break;
- }
-
- if (!(breadth >= cur_breadth)) { // keep track of watermark, for debugging.
- throw khmer_exception();
- }
- if (breadth > cur_breadth) {
- cur_breadth = breadth;
- }
-
- //
- // Enqueue next set of nodes.
- //
-
- // NEXT.
- f = next_f(kmer_f, 'A');
- r = next_r(kmer_r, 'A');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f, r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'C');
- r = next_r(kmer_r, 'C');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f, r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'G');
- r = next_r(kmer_r, 'G');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f, r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'T');
- r = next_r(kmer_r, 'T');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f, r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- // PREVIOUS.
- r = prev_r(kmer_r, 'A');
- f = prev_f(kmer_f, 'A');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f, r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'C');
- f = prev_f(kmer_f, 'C');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f, r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'G');
- f = prev_f(kmer_f, 'G');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f, r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'T');
- f = prev_f(kmer_f, 'T');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f, r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
- }
-
- return total;
-}
-
size_t Hashtable::trim_on_stoptags(std::string seq) const
{
if (!check_and_normalize_read(seq)) {
return 0;
}
- KMerIterator kmers(seq.c_str(), _ksize);
+ KmerIterator kmers(seq.c_str(), _ksize);
size_t i = _ksize - 2;
while (!kmers.done()) {
@@ -1070,7 +855,7 @@ void Hashtable::traverse_from_tags(unsigned int distance,
unsigned int i = 0;
unsigned int n = 0;
unsigned int n_big = 0;
- SeenSet keeper;
+ KmerSet keeper;
#if VERBOSE_REPARTITION
std::cout << all_tags.size() << " tags...\n";
@@ -1078,13 +863,15 @@ void Hashtable::traverse_from_tags(unsigned int distance,
for (SeenSet::const_iterator si = all_tags.begin(); si != all_tags.end();
++si, i++) {
+
n++;
- unsigned int count = traverse_from_kmer(*si, distance, keeper);
+ Kmer tag = build_kmer(*si);
+ unsigned int count = traverse_from_kmer(tag, distance, keeper);
if (count >= threshold) {
n_big++;
- SeenSet::const_iterator ti;
+ KmerSet::const_iterator ti;
for (ti = keeper.begin(); ti != keeper.end(); ++ti) {
if (counting.get_count(*ti) > frequency) {
stop_tags.insert(*ti);
@@ -1108,35 +895,29 @@ void Hashtable::traverse_from_tags(unsigned int distance,
}
}
-unsigned int Hashtable::traverse_from_kmer(HashIntoType start,
+unsigned int Hashtable::traverse_from_kmer(Kmer start,
unsigned int radius,
- SeenSet &keeper)
+ KmerSet &keeper,
+ unsigned int max_count)
const
{
- std::string kmer_s = _revhash(start, _ksize);
- HashIntoType kmer_f, kmer_r;
- _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r);
- HashIntoType f, r;
- NodeQueue node_q;
+ Traverser traverser(this);
+ KmerQueue node_q;
std::queue<unsigned int> breadth_q;
unsigned int cur_breadth = 0;
- bool is_first_kmer = true;
-
- const unsigned int rc_left_shift = _ksize*2 - 2;
unsigned int total = 0;
+ unsigned int nfound = 0;
- // start breadth-first search.
+ auto filter = [&] (Kmer& n) { return !set_contains(keeper, n); };
- node_q.push(kmer_f);
- node_q.push(kmer_r);
+ node_q.push(start);
breadth_q.push(0);
while(!node_q.empty()) {
- kmer_f = node_q.front();
- node_q.pop();
- kmer_r = node_q.front();
+ Kmer node = node_q.front();
node_q.pop();
+
unsigned int breadth = breadth_q.front();
breadth_q.pop();
@@ -1144,28 +925,22 @@ const
break;
}
- if (total > MAX_KEEPER_SIZE) {
+ if (max_count && total > max_count) {
break;
}
- HashIntoType kmer = uniqify_rc(kmer_f, kmer_r);
- if (set_contains(keeper, kmer)) {
+ if (set_contains(keeper, node)) {
continue;
}
- if (set_contains(stop_tags, kmer)) {
+ if (set_contains(stop_tags, node)) {
continue;
}
// keep track of seen kmers
- keeper.insert(kmer);
+ keeper.insert(node);
total++;
- // QUESTION: Huh? What's up with the following?
- if (false && !is_first_kmer && set_contains(all_tags, kmer)) {
- continue;
- }
-
if (!(breadth >= cur_breadth)) { // keep track of watermark, for debugging.
throw khmer_exception();
}
@@ -1173,80 +948,11 @@ const
cur_breadth = breadth;
}
- //
- // Enqueue next set of nodes.
- //
-
- // NEXT.
- f = next_f(kmer_f, 'A');
- //r = next_r(kmer_r, 'A');
-
- // f = ((kmer_f << 2) & bitmask) | twobit_repr('A');
- r = next_r(kmer_r, 'A');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'C');
- r = next_r(kmer_r, 'C');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'G');
- r = next_r(kmer_r, 'G');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'T');
- r = next_r(kmer_r, 'T');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- // PREVIOUS.
- r = prev_r(kmer_r, 'A');
- f = prev_f(kmer_f, 'A');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'C');
- f = prev_f(kmer_f, 'C');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'G');
- f = prev_f(kmer_f, 'G');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'T');
- f = prev_f(kmer_f, 'T');
- if (get_count(uniqify_rc(f,r)) && !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
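+ // each neighbor found is one level deeper; push a matching breadth entry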
+ nfound = traverser.traverse_right(node, node_q, filter);
+ for (unsigned int i = 0; i<nfound; ++i) breadth_q.push(breadth + 1);
- is_first_kmer = false;
+ nfound = traverser.traverse_left(node, node_q, filter);
+ for (unsigned int i = 0; i<nfound; ++i) breadth_q.push(breadth + 1);
}
return total;
@@ -1389,13 +1095,13 @@ void Hashtable::print_tagset(std::string infilename)
printfile.close();
}
-unsigned int Hashtable::count_and_transfer_to_stoptags(SeenSet &keeper,
+unsigned int Hashtable::count_and_transfer_to_stoptags(KmerSet &keeper,
unsigned int threshold,
CountingHash &counting)
{
unsigned int n_inserted = 0;
- SeenSet::const_iterator ti;
+ KmerSet::const_iterator ti;
for (ti = keeper.begin(); ti != keeper.end(); ++ti) {
if (counting.get_count(*ti) >= threshold) {
stop_tags.insert(*ti);
@@ -1416,7 +1122,7 @@ const
return;
}
- KMerIterator kmers(seq.c_str(), _ksize);
+ KmerIterator kmers(seq.c_str(), _ksize);
unsigned int i = 0;
while(!kmers.done()) {
@@ -1444,7 +1150,7 @@ void Hashtable::extract_unique_paths(std::string seq,
min_length = min_length - _ksize + 1; // adjust for k-mer size.
- KMerIterator kmers(seq.c_str(), _ksize);
+ KmerIterator kmers(seq.c_str(), _ksize);
std::deque<bool> seen_queue;
unsigned int n_already_seen = 0;
@@ -1544,7 +1250,7 @@ void Hashtable::get_kmers(const std::string &s,
void Hashtable::get_kmer_hashes(const std::string &s,
std::vector<HashIntoType> &kmers_vec) const
{
- KMerIterator kmers(s.c_str(), _ksize);
+ KmerIterator kmers(s.c_str(), _ksize);
while(!kmers.done()) {
HashIntoType kmer = kmers.next();
@@ -1556,7 +1262,7 @@ void Hashtable::get_kmer_hashes(const std::string &s,
void Hashtable::get_kmer_counts(const std::string &s,
std::vector<BoundedCounterType> &counts) const
{
- KMerIterator kmers(s.c_str(), _ksize);
+ KmerIterator kmers(s.c_str(), _ksize);
while(!kmers.done()) {
HashIntoType kmer = kmers.next();
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index f0f5135..03ab615 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -26,6 +26,7 @@
#include "khmer_exception.hh"
#include "kmer_hash.hh"
#include "read_parsers.hh"
+#include "traversal.hh"
#include "subset.hh"
namespace khmer
@@ -54,140 +55,34 @@ struct IParser;
namespace khmer
{
-//
-// Sequence iterator class, test. Not really a C++ iterator yet.
-//
-
-class KMerIterator
-{
-protected:
- const char * _seq;
- const unsigned char _ksize;
-
- HashIntoType _kmer_f, _kmer_r;
- HashIntoType bitmask;
- unsigned int _nbits_sub_1;
- unsigned int index;
- size_t length;
- bool initialized;
-public:
- KMerIterator(const char * seq, unsigned char k) : _seq(seq), _ksize(k)
- {
- bitmask = 0;
- for (unsigned char i = 0; i < _ksize; i++) {
- bitmask = (bitmask << 2) | 3;
- }
- _nbits_sub_1 = (_ksize*2 - 2);
-
- index = _ksize - 1;
- length = strlen(seq);
- _kmer_f = 0;
- _kmer_r = 0;
-
- initialized = false;
- }
-
- HashIntoType first(HashIntoType& f, HashIntoType& r)
- {
- HashIntoType x;
- x = _hash(_seq, _ksize, _kmer_f, _kmer_r);
-
- f = _kmer_f;
- r = _kmer_r;
-
- index = _ksize;
-
- return x;
- }
-
- HashIntoType next(HashIntoType& f, HashIntoType& r)
- {
- if (done()) {
- throw khmer_exception();
- }
-
- if (!initialized) {
- initialized = true;
- return first(f, r);
- }
-
- unsigned char ch = _seq[index];
- index++;
- if (!(index <= length)) {
- throw khmer_exception();
- }
-
- // left-shift the previous hash over
- _kmer_f = _kmer_f << 2;
-
- // 'or' in the current nt
- _kmer_f |= twobit_repr(ch);
-
- // mask off the 2 bits we shifted over.
- _kmer_f &= bitmask;
-
- // now handle reverse complement
- _kmer_r = _kmer_r >> 2;
- _kmer_r |= (twobit_comp(ch) << _nbits_sub_1);
-
- f = _kmer_f;
- r = _kmer_r;
-
- return uniqify_rc(_kmer_f, _kmer_r);
- }
-
- HashIntoType first()
- {
- return first(_kmer_f, _kmer_r);
- }
- HashIntoType next()
- {
- return next(_kmer_f, _kmer_r);
- }
-
- bool done()
- {
- return index >= length;
- }
-
- unsigned int get_start_pos() const
- {
- return index - _ksize;
- }
-
- unsigned int get_end_pos() const
- {
- return index;
- }
-}; // class KMerIterator
-
-class Hashtable // Base class implementation of a Bloom ht.
+class Hashtable: public KmerFactory // Base class implementation of a Bloom ht.
{
friend class SubsetPartition;
friend class LabelHash;
+ friend class Traverser;
+
protected:
unsigned int _tag_density;
unsigned int _max_count;
unsigned int _max_bigcount;
- WordLength _ksize;
+ // _ksize is now inherited from the KmerFactory base class
HashIntoType bitmask;
unsigned int _nbits_sub_1;
explicit Hashtable( WordLength ksize )
- : _max_count( MAX_KCOUNT ),
- _max_bigcount( MAX_BIGCOUNT ),
- _ksize( ksize )
+ : KmerFactory( ksize ),
+ _max_count( MAX_KCOUNT ),
+ _max_bigcount( MAX_BIGCOUNT )
{
_tag_density = DEFAULT_TAG_DENSITY;
if (!(_tag_density % 2 == 0)) {
throw khmer_exception();
}
- partition = new SubsetPartition(this);
_init_bitstuff();
+ partition = new SubsetPartition(this);
_all_tags_spin_lock = 0;
-
}
virtual ~Hashtable( )
@@ -290,8 +185,13 @@ public:
BoundedCounterType &median,
float &average,
float &stddev);
+
+ // number of unique k-mers
virtual const HashIntoType n_unique_kmers() const = 0;
+ // number of occupied bins
+ virtual const HashIntoType n_occupied() const = 0;
+
// partitioning stuff
void _validate_pmap()
{
@@ -385,19 +285,10 @@ public:
virtual std::vector<HashIntoType> get_tablesizes() const = 0;
virtual const size_t n_tables() const = 0;
- virtual const HashIntoType n_occupied(HashIntoType start=0,
- HashIntoType stop=0) const = 0;
- virtual const HashIntoType n_entries() const = 0;
void filter_if_present(const std::string &infilename,
const std::string &outputfilename);
- unsigned int count_kmers_within_radius(HashIntoType kmer_f,
- HashIntoType kmer_r,
- unsigned int radius,
- unsigned int max_count,
- const SeenSet * seen=0) const;
-
size_t trim_on_stoptags(std::string sequence) const;
void traverse_from_tags(unsigned int distance,
@@ -405,11 +296,13 @@ public:
unsigned int num_high_todo,
CountingHash &counting);
- unsigned int traverse_from_kmer(HashIntoType start,
+ unsigned int traverse_from_kmer(Kmer start,
unsigned int radius,
- SeenSet &keeper) const;
+ KmerSet &keeper,
+ unsigned int max_count = MAX_KEEPER_SIZE)
+ const;
- unsigned int count_and_transfer_to_stoptags(SeenSet &keeper,
+ unsigned int count_and_transfer_to_stoptags(KmerSet &keeper,
unsigned int threshold,
CountingHash &counting);
@@ -427,35 +320,17 @@ public:
float min_unique_f,
std::vector<std::string> &results);
- void calc_connected_graph_size(const char * kmer,
+ void calc_connected_graph_size(Kmer node,
unsigned long long& count,
- SeenSet& keeper,
- const unsigned long long threshold=0,
- bool break_on_circum=false) const
- {
- HashIntoType r, f;
- _hash(kmer, _ksize, f, r);
- calc_connected_graph_size(f, r, count, keeper, threshold, break_on_circum);
- }
-
- void calc_connected_graph_size(const HashIntoType kmer_f,
- const HashIntoType kmer_r,
- unsigned long long& count,
- SeenSet& keeper,
+ KmerSet& keeper,
const unsigned long long threshold=0,
bool break_on_circum=false) const;
typedef void (*kmer_cb)(const char * k, unsigned int n_reads, void *data);
- unsigned int kmer_degree(HashIntoType kmer_f, HashIntoType kmer_r) const;
- unsigned int kmer_degree(const char * kmer_s) const
- {
- HashIntoType kmer_f, kmer_r;
- _hash(kmer_s, _ksize, kmer_f, kmer_r);
-
- return kmer_degree(kmer_f, kmer_r);
- }
+ unsigned int kmer_degree(HashIntoType kmer_f, HashIntoType kmer_r);
+ unsigned int kmer_degree(const char * kmer_s);
// return all k-mer substrings, on the forward strand.
void get_kmers(const std::string &s, std::vector<std::string> &kmers)
diff --git a/lib/hllcounter.cc b/lib/hllcounter.cc
index c115524..790a38f 100644
--- a/lib/hllcounter.cc
+++ b/lib/hllcounter.cc
@@ -347,18 +347,20 @@ unsigned int HLLCounter::consume_string(const std::string &inp)
void HLLCounter::consume_fasta(
std::string const &filename,
+ bool output_records,
unsigned int &total_reads,
unsigned long long &n_consumed)
{
read_parsers::IParser * parser = read_parsers::IParser::get_parser(filename);
- consume_fasta(parser, total_reads, n_consumed);
+ consume_fasta(parser, output_records, total_reads, n_consumed);
delete parser;
}
void HLLCounter::consume_fasta(
read_parsers::IParser *parser,
+ bool output_records,
unsigned int & total_reads,
unsigned long long & n_consumed)
{
@@ -372,7 +374,7 @@ void HLLCounter::consume_fasta(
#pragma omp parallel
{
- #pragma omp single
+ #pragma omp master
{
counters = (HLLCounter**)calloc(omp_get_num_threads(),
sizeof(HLLCounter*));
@@ -396,6 +398,10 @@ void HLLCounter::consume_fasta(
break;
}
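+ // when requested, echo each read to stdout as it is consumed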
+ if (output_records) {
+ read.write_to(std::cout);
+ }
+
#pragma omp task default(none) firstprivate(read) \
shared(counters, n_consumed_partial, total_reads_partial)
{
@@ -413,7 +419,7 @@ void HLLCounter::consume_fasta(
}
#pragma omp taskwait
- #pragma omp single
+ #pragma omp master
{
for (int i=0; i < omp_get_num_threads(); ++i)
{
@@ -450,8 +456,11 @@ bool HLLCounter::check_and_normalize_read(std::string &read) const
}
for (unsigned int i = 0; i < read.length(); i++) {
- read[ i ] &= 0xdf; // toupper - knock out the "lowercase bit"
- if (!is_valid_dna( read[ i ] )) {
+ read[i] &= 0xdf; // toupper - knock out the "lowercase bit"
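+ // fold ambiguous 'N' bases to 'A' so such reads hash consistently
+ // instead of being rejected as invalid DNA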
+ if (read[i] == 'N') {
+ read[i] = 'A';
+ }
+ if (!is_valid_dna( read[i] )) {
is_valid = false;
break;
}
diff --git a/lib/hllcounter.hh b/lib/hllcounter.hh
index 0b64a31..5210ed3 100644
--- a/lib/hllcounter.hh
+++ b/lib/hllcounter.hh
@@ -35,9 +35,11 @@ public:
void add(const std::string &);
unsigned int consume_string(const std::string &);
void consume_fasta(std::string const &,
+ bool output_records,
unsigned int &,
unsigned long long &);
void consume_fasta(read_parsers::IParser *,
+ bool output_records,
unsigned int &,
unsigned long long &);
unsigned int check_and_process_read(std::string &,
diff --git a/lib/khmer.hh b/lib/khmer.hh
index 60928c4..ea50a48 100644
--- a/lib/khmer.hh
+++ b/lib/khmer.hh
@@ -121,6 +121,10 @@ void deallocate_ptr_set(T& s)
}
}
+class Kmer;
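+// Containers of Kmer objects used by the traversal code; KmerSet relies
+// on Kmer::operator< (ordering by the uniqified hash).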
+typedef std::queue<Kmer> KmerQueue;
+typedef std::set<Kmer> KmerSet;
+
}
#endif // KHMER_HH
diff --git a/lib/kmer_hash.cc b/lib/kmer_hash.cc
index cc7a9bd..fce6799 100644
--- a/lib/kmer_hash.cc
+++ b/lib/kmer_hash.cc
@@ -158,4 +158,71 @@ HashIntoType _hash_murmur_forward(const std::string& kmer)
return h;
}
+KmerIterator::KmerIterator(const char * seq,
+ unsigned char k) :
+ KmerFactory(k), _seq(seq)
+{
+ bitmask = 0;
+ for (unsigned char i = 0; i < _ksize; i++) {
+ bitmask = (bitmask << 2) | 3;
+ }
+ _nbits_sub_1 = (_ksize*2 - 2);
+
+ index = _ksize - 1;
+ length = strlen(seq);
+ _kmer_f = 0;
+ _kmer_r = 0;
+
+ initialized = false;
+}
+
+Kmer KmerIterator::first(HashIntoType& f, HashIntoType& r)
+{
+ HashIntoType x;
+ x = _hash(_seq, _ksize, _kmer_f, _kmer_r);
+
+ f = _kmer_f;
+ r = _kmer_r;
+
+ index = _ksize;
+
+ return Kmer(_kmer_f, _kmer_r, x);
+}
+
+Kmer KmerIterator::next(HashIntoType& f, HashIntoType& r)
+{
+ if (done()) {
+ throw khmer_exception();
+ }
+
+ if (!initialized) {
+ initialized = true;
+ return first(f, r);
+ }
+
+ unsigned char ch = _seq[index];
+ index++;
+ if (!(index <= length)) {
+ throw khmer_exception();
+ }
+
+ // left-shift the previous hash over
+ _kmer_f = _kmer_f << 2;
+
+ // 'or' in the current nt
+ _kmer_f |= twobit_repr(ch);
+
+ // mask off the 2 bits we shifted over.
+ _kmer_f &= bitmask;
+
+ // now handle reverse complement
+ _kmer_r = _kmer_r >> 2;
+ _kmer_r |= (twobit_comp(ch) << _nbits_sub_1);
+
+ f = _kmer_f;
+ r = _kmer_r;
+
+ return build_kmer(_kmer_f, _kmer_r);
+}
+
};
diff --git a/lib/kmer_hash.hh b/lib/kmer_hash.hh
index 33ea909..adf67e5 100644
--- a/lib/kmer_hash.hh
+++ b/lib/kmer_hash.hh
@@ -79,6 +79,217 @@ HashIntoType _hash_murmur(const std::string& kmer);
HashIntoType _hash_murmur(const std::string& kmer,
HashIntoType& h, HashIntoType& r);
HashIntoType _hash_murmur_forward(const std::string& kmer);
+
+/**
+ * \class Kmer
+ *
+ * \brief Hold the hash values corresponding to a single k-mer.
+ *
+ * This class stores the forward, reverse complement, and
+ * uniqified hash values for a given k-mer. It also defines
+ * some basic operators and a utility function for getting
+ * the string representation of the sequence. This is meant
+ * to replace the original inelegant macros used for hashing.
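+ *
+ * A minimal usage sketch (the hash values below are illustrative):
+ *
+ * @code
+ * Kmer kmer(0x1fULL, 0x2fULL, 0x1fULL); // forward, reverse, uniqified
+ * HashIntoType h = kmer;                // implicit conversion to kmer_u
+ * bool lt = kmer < Kmer(0, 0, 0x3fULL); // ordering compares kmer_u
+ * @endcode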
+ *
+ * \author Camille Scott
+ *
+ * Contact: camille.scott.w at gmail.com
+ *
+ */
+class Kmer
+{
+
+public:
+
+ /// The forward hash
+ HashIntoType kmer_f;
+ /// The reverse (complement) hash
+ HashIntoType kmer_r;
+ /// The uniqified hash
+ HashIntoType kmer_u;
+
+ /** @param[in] f forward hash.
+ * @param[in] r reverse (complement) hash.
+ * @param[in] u uniqified hash.
+ */
+ Kmer(HashIntoType f, HashIntoType r, HashIntoType u)
+ {
+ kmer_f = f;
+ kmer_r = r;
+ kmer_u = u;
+ }
+
+ /// @warning The default constructor builds an invalid k-mer.
+ Kmer()
+ {
+ kmer_f = kmer_r = kmer_u = 0;
+ }
+
+ /// Implicit conversion to kmer_u; allows complete backwards compatibility
+ operator HashIntoType() const { return kmer_u; }
+
+ bool operator< (const Kmer &other) const {
+ return kmer_u < other.kmer_u;
+ }
+
+ std::string get_string_rep(WordLength K) {
+ return _revhash(kmer_u, K);
+ }
+
+};
+
+/**
+ * \class KmerFactory
+ *
+ * \brief Build complete Kmer objects.
+ *
+ * The KmerFactory is a simple construct to emit complete
+ * Kmer objects. The design decision leading to this class
+ * stems from the issue of overloading the Kmer constructor
+ * while also giving it a K size: you get ambiguous signatures
+ * between the (kmer_u, K) and (kmer_f, kmer_r) cases. This
+ * implementation also allows a logical architecture wherein
+ * KmerIterator and Hashtable inherit directly from KmerFactory,
+ * extending the latter's purpose of "create k-mers" to
+ * "emit k-mers from a sequence" and "create and store k-mers".
+ *
+ * \author Camille Scott
+ *
+ * Contact: camille.scott.w at gmail.com
+ *
+ */
+class KmerFactory
+{
+protected:
+ WordLength _ksize;
+
+public:
+
+ KmerFactory(WordLength K): _ksize(K) {}
+
+ /** @param[in] kmer_u Uniqified hash value.
+ * @return A complete Kmer object.
+ */
+ Kmer build_kmer(HashIntoType kmer_u)
+ {
+ HashIntoType kmer_f, kmer_r;
+ std::string kmer_s = _revhash(kmer_u, _ksize);
+ _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r);
+ return Kmer(kmer_f, kmer_r, kmer_u);
+ }
+
+ /** Call the uniqify function and build a complete Kmer.
+ *
+ * @param[in] kmer_f Forward hash value.
+ * @param[in] kmer_r Reverse complement hash value.
+ * @return A complete Kmer object.
+ */
+ Kmer build_kmer(HashIntoType kmer_f, HashIntoType kmer_r)
+ {
+ HashIntoType kmer_u = uniqify_rc(kmer_f, kmer_r);
+ return Kmer(kmer_f, kmer_r, kmer_u);
+ }
+
+ /** Hash the given sequence and call the uniqify function
+ * on its results to build a complete Kmer.
+ *
+ * @param[in] kmer_s String representation of a k-mer.
+ * @return A complete Kmer object hashed from the given string.
+ */
+ Kmer build_kmer(std::string kmer_s)
+ {
+ HashIntoType kmer_f, kmer_r, kmer_u;
+ kmer_u = _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r);
+ return Kmer(kmer_f, kmer_r, kmer_u);
+ }
+
+ /** Hash the given sequence and call the uniqify function
+ * on its results to build a complete Kmer.
+ *
+ * @param[in] kmer_c The character array representation of a k-mer.
+ * @return A complete Kmer object hashed from the given char array.
+ */
+ Kmer build_kmer(const char * kmer_c)
+ {
+ HashIntoType kmer_f, kmer_r, kmer_u;
+ kmer_u = _hash(kmer_c, _ksize, kmer_f, kmer_r);
+ return Kmer(kmer_f, kmer_r, kmer_u);
+ }
+};
+
+/**
+ * \class KmerIterator
+ *
+ * \brief Emit Kmer objects generated from the given sequence.
+ *
+ * Given a string \f$S\f$ and a length \f$K > 0\f$, we define
+ * the k-mers of \f$S\f$ as the set \f$S_{i..i+K} \forall i \in \{0..|S|-K+1\}\f$,
+ * where \f$|S|\f$ is the length and \f$S_{j..k}\f$ is the half-open
+ * substring starting at \f$j\f$ and terminating at \f$k\f$.
+ *
+ * KmerIterator mimics a python-style generator function which
+ * emits the k-mers of the given sequence, in order, as Kmer objects.
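+ *
+ * A typical consumption loop (sequence and K are illustrative):
+ *
+ * @code
+ * KmerIterator kmers("ATGGACCAGATG", 4);
+ * while (!kmers.done()) {
+ *     Kmer kmer = kmers.next();
+ *     // use kmer.kmer_u / kmer.kmer_f / kmer.kmer_r here
+ * }
+ * @endcode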
+ *
+ * @warning This is not actually a valid C++ iterator, though it is close.
+ *
+ * \author Camille Scott
+ *
+ * Contact: camille.scott.w at gmail.com
+ *
+ */
+class KmerIterator: public KmerFactory
+{
+protected:
+ const char * _seq;
+
+ HashIntoType _kmer_f, _kmer_r;
+ HashIntoType bitmask;
+ unsigned int _nbits_sub_1;
+ unsigned int index;
+ size_t length;
+ bool initialized;
+public:
+ KmerIterator(const char * seq, unsigned char k);
+
+ /** @param[out] f Set to the forward hash value of the first k-mer.
+ * @param[out] r Set to the reverse complement hash value.
+ * @return The first Kmer of the sequence.
+ */
+ Kmer first(HashIntoType& f, HashIntoType& r);
+
+ /** @param[out] f Set to the forward hash value of the next k-mer.
+ * @param[out] r Set to the reverse complement hash value.
+ * @return The next Kmer in the sequence.
+ */
+ Kmer next(HashIntoType& f, HashIntoType& r);
+
+ Kmer first()
+ {
+ return first(_kmer_f, _kmer_r);
+ }
+
+ Kmer next()
+ {
+ return next(_kmer_f, _kmer_r);
+ }
+
+ /// @return Whether or not the iterator has completed.
+ bool done()
+ {
+ return index >= length;
+ }
+
+ unsigned int get_start_pos() const
+ {
+ return index - _ksize;
+ }
+
+ unsigned int get_end_pos() const
+ {
+ return index;
+ }
+}; // class KmerIterator
+
};
#endif // KMER_HASH_HH
diff --git a/lib/labelhash.cc b/lib/labelhash.cc
index 4d801ec..6323751 100644
--- a/lib/labelhash.cc
+++ b/lib/labelhash.cc
@@ -202,7 +202,7 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
bool kmer_tagged;
- KMerIterator kmers(seq.c_str(), graph->_ksize);
+ KmerIterator kmers(seq.c_str(), graph->_ksize);
HashIntoType kmer;
unsigned int since = graph->_tag_density / 2 + 1;
diff --git a/lib/read_aligner.cc b/lib/read_aligner.cc
index 2b9e400..f739cb5 100644
--- a/lib/read_aligner.cc
+++ b/lib/read_aligner.cc
@@ -20,12 +20,12 @@ namespace khmer
Alignment * _empty_alignment()
{
- Alignment* ret = new Alignment;
- ret->score = -std::numeric_limits<double>::infinity();
- ret->read_alignment = "";
- ret->graph_alignment = "";
- ret->truncated = true;
- return ret;
+ Alignment* ret = new Alignment;
+ ret->score = -std::numeric_limits<double>::infinity();
+ ret->read_alignment = "";
+ ret->graph_alignment = "";
+ ret->truncated = true;
+ return ret;
}
static Nucl _ch_to_nucl(char base)
@@ -503,18 +503,18 @@ Alignment* ReadAligner::Align(const std::string& read)
_hash(start.kmer.c_str(), k, fhash, rhash);
#if READ_ALIGNER_DEBUG
- std::cerr << "Starting kmer: " << start.kmer << " "
- << _revhash(fhash, m_ch->ksize()) << " "
- << _revhash(rhash, m_ch->ksize())
- << " cov: " << start.k_cov << " idx: " << start.kmer_idx << ", "
- << start.kmer_idx + k - 1
- << " emission: " << start.kmer[k - 1] << std::endl;
+ std::cerr << "Starting kmer: " << start.kmer << " "
+ << _revhash(fhash, m_ch->ksize()) << " "
+ << _revhash(rhash, m_ch->ksize())
+ << " cov: " << start.k_cov << " idx: " << start.kmer_idx << ", "
+ << start.kmer_idx + k - 1
+ << " emission: " << start.kmer[k - 1] << std::endl;
#endif
Nucl e = _ch_to_nucl(start.kmer[k - 1]);
AlignmentNode startingNode = AlignmentNode(NULL,
- e, start.kmer_idx + k - 1,
- MATCH, MM, fhash, rhash, k);
+ e, start.kmer_idx + k - 1,
+ MATCH, MM, fhash, rhash, k);
startingNode.f_score = 0;
startingNode.h_score = 0;
Alignment* forward = NULL;
@@ -522,9 +522,9 @@ Alignment* ReadAligner::Align(const std::string& read)
size_t final_length = 0;
if(start.k_cov >= m_trusted_cutoff) {
- startingNode.score = k * m_sm.trusted_match + k * m_sm.tsc[MM];
+ startingNode.score = k * m_sm.trusted_match + k * m_sm.tsc[MM];
} else {
- startingNode.score = k * m_sm.untrusted_match + k * m_sm.tsc[MM];
+ startingNode.score = k * m_sm.untrusted_match + k * m_sm.tsc[MM];
}
forward = Subalign(&startingNode, read.length(), true, read);
@@ -540,21 +540,21 @@ Alignment* ReadAligner::Align(const std::string& read)
// twice, so we need to adjust for that
ret->score = reverse->score + forward->score - startingNode.score;
ret->read_alignment = reverse->read_alignment +
- start.kmer + forward->read_alignment;
+ start.kmer + forward->read_alignment;
ret->graph_alignment = reverse->graph_alignment +
- start.kmer + forward->graph_alignment;
+ start.kmer + forward->graph_alignment;
ret->score = ret->score - GetNull(final_length);
ret->truncated = forward->truncated || reverse->truncated;
#if READ_ALIGNER_DEBUG
- fprintf(stderr,
- "FORWARD\n\tread_aln:%s\n\tgraph_aln:%s\n\tscore:%f\n\ttrunc:%d\n",
- forward->read_alignment.c_str(), forward->graph_alignment.c_str(),
- forward->score, forward->truncated);
- fprintf(stderr,
- "REVERSE\n\tread_aln:%s\n\tgraph_aln:%s\n\tscore:%f\n\ttrunc:%d\n",
- reverse->read_alignment.c_str(), reverse->graph_alignment.c_str(),
- reverse->score, reverse->truncated);
+ fprintf(stderr,
+ "FORWARD\n\tread_aln:%s\n\tgraph_aln:%s\n\tscore:%f\n\ttrunc:%d\n",
+ forward->read_alignment.c_str(), forward->graph_alignment.c_str(),
+ forward->score, forward->truncated);
+ fprintf(stderr,
+ "REVERSE\n\tread_aln:%s\n\tgraph_aln:%s\n\tscore:%f\n\ttrunc:%d\n",
+ reverse->read_alignment.c_str(), reverse->graph_alignment.c_str(),
+ reverse->score, reverse->truncated);
#endif
delete forward;
@@ -581,18 +581,18 @@ Alignment* ReadAligner::AlignForward(const std::string& read)
_hash(start.kmer.c_str(), k, fhash, rhash);
#if READ_ALIGNER_DEBUG
- std::cerr << "Starting kmer: " << start.kmer << " "
- << _revhash(fhash, m_ch->ksize()) << " "
- << _revhash(rhash, m_ch->ksize())
- << " cov: " << start.k_cov << " idx: " << start.kmer_idx << ", "
- << start.kmer_idx + k - 1
- << " emission: " << start.kmer[k - 1] << std::endl;
+ std::cerr << "Starting kmer: " << start.kmer << " "
+ << _revhash(fhash, m_ch->ksize()) << " "
+ << _revhash(rhash, m_ch->ksize())
+ << " cov: " << start.k_cov << " idx: " << start.kmer_idx << ", "
+ << start.kmer_idx + k - 1
+ << " emission: " << start.kmer[k - 1] << std::endl;
#endif
Nucl e = _ch_to_nucl(start.kmer[k - 1]);
AlignmentNode startingNode = AlignmentNode(NULL,
- e, start.kmer_idx + k - 1,
- MATCH, MM, fhash, rhash, k);
+ e, start.kmer_idx + k - 1,
+ MATCH, MM, fhash, rhash, k);
startingNode.f_score = 0;
startingNode.h_score = 0;
Alignment* forward = NULL;
@@ -622,10 +622,10 @@ Alignment* ReadAligner::AlignForward(const std::string& read)
}
#if READ_ALIGNER_DEBUG
- fprintf(stderr,
- "FORWARD\n\tread_aln:%s\n\tgraph_aln:%s\n\tscore:%f\n\ttrunc:%d\n",
- forward->read_alignment.c_str(), forward->graph_alignment.c_str(),
- forward->score, forward->truncated);
+ fprintf(stderr,
+ "FORWARD\n\tread_aln:%s\n\tgraph_aln:%s\n\tscore:%f\n\ttrunc:%d\n",
+ forward->read_alignment.c_str(), forward->graph_alignment.c_str(),
+ forward->score, forward->truncated);
#endif
delete forward;
diff --git a/lib/read_parsers.cc b/lib/read_parsers.cc
index 8a9dc14..47166c6 100644
--- a/lib/read_parsers.cc
+++ b/lib/read_parsers.cc
@@ -20,6 +20,21 @@ namespace khmer
namespace read_parsers
{
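+// Emit the read as FASTQ when quality scores are present, otherwise
+// fall back to FASTA.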
+void
+Read::write_to(std::ostream& output)
+{
+ if (quality.length() != 0) {
+ output << "@" << name << std::endl
+ << sequence << std::endl
+ << "+" << std::endl
+ << quality << std::endl;
+ } else {
+ output << ">" << name << std::endl
+ << sequence << std::endl;
+ }
+}
+
+
struct SeqAnParser::Handle {
seqan::SequenceStream stream;
uint32_t seqan_spin_lock;
diff --git a/lib/read_parsers.hh b/lib/read_parsers.hh
index 13588b0..5325acc 100644
--- a/lib/read_parsers.hh
+++ b/lib/read_parsers.hh
@@ -69,6 +69,8 @@ struct Read {
sequence.clear( );
quality.clear( );
}
+
+ void write_to(std::ostream&);
};
typedef std:: pair< Read, Read > ReadPair;
diff --git a/lib/subset.cc b/lib/subset.cc
index d5b490b..610815e 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -52,6 +52,11 @@ static void print_tag_set(SeenSet& p)
#endif //0
+SubsetPartition::SubsetPartition(Hashtable * ht) :
+ next_partition_id(2), _ht(ht)
+{
+}
+
void SubsetPartition::count_partitions(
size_t& n_partitions,
size_t& n_unassigned)
@@ -204,8 +209,6 @@ unsigned int SubsetPartition::find_unpart(
SeenSet tags_todo;
- const unsigned int ksize = _ht->ksize();
-
//
// go through all the new reads, and consume & tag them. keep track
// of all waypoints in the read in 'found_tags', and then check to
@@ -298,19 +301,16 @@ unsigned int SubsetPartition::find_unpart(
// std::cout << "new tags size: " << tags_todo.size() << "\n";
unsigned int n = 0;
- std::string kmer_s;
- HashIntoType kmer_f, kmer_r;
SeenSet tagged_kmers;
for (SeenSet::iterator si = tags_todo.begin(); si != tags_todo.end();
++si) {
n += 1;
- kmer_s = _revhash(*si, ksize); // @CTB hackity hack hack!
- HashIntoType kmer = _hash(kmer_s.c_str(), ksize, kmer_f, kmer_r);
+ Kmer kmer = _ht->build_kmer(*si);
// find all tagged kmers within range.
tagged_kmers.clear();
- find_all_tags(kmer_f, kmer_r, tagged_kmers, _ht->all_tags,
+ find_all_tags(kmer, tagged_kmers, _ht->all_tags,
true, stop_big_traversals);
// std::cout << "found " << tagged_kmers.size() << "\n";
@@ -333,171 +333,75 @@ unsigned int SubsetPartition::find_unpart(
return n_singletons;
}
-/* @cswelcher Brilliant idea: let's *not* copy this same piece of code
- * over and over again!
- */
-void SubsetPartition::queue_neighbors(
- HashIntoType kmer_f,
- HashIntoType kmer_r,
- unsigned int breadth,
- SeenSet& traversed_kmers,
- NodeQueue& node_q,
- std::queue<unsigned int>& breadth_q)
-{
-
- HashIntoType f, r;
- const unsigned int rc_left_shift = _ht->ksize()*2 - 2;
- const HashIntoType bitmask = _ht->bitmask;
-
- f = next_f(kmer_f, 'A');
- r = next_r(kmer_r, 'A');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(traversed_kmers, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'C');
- r = next_r(kmer_r, 'C');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(traversed_kmers, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'G');
- r = next_r(kmer_r, 'G');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(traversed_kmers, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'T');
- r = next_r(kmer_r, 'T');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(traversed_kmers, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- // PREVIOUS.
- r = prev_r(kmer_r, 'A');
- f = prev_f(kmer_f, 'A');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(traversed_kmers, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'C');
- f = prev_f(kmer_f, 'C');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(traversed_kmers, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'G');
- f = prev_f(kmer_f, 'G');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(traversed_kmers, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'T');
- f = prev_f(kmer_f, 'T');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(traversed_kmers, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-}
-
-///
-
// find_all_tags: the core of the partitioning code. finds all tagged k-mers
// connected to kmer_f/kmer_r in the graph.
void SubsetPartition::find_all_tags(
- HashIntoType kmer_f,
- HashIntoType kmer_r,
+ Kmer start_kmer,
SeenSet& tagged_kmers,
const SeenSet& all_tags,
bool break_on_stop_tags,
bool stop_big_traversals)
{
- const HashIntoType bitmask = _ht->bitmask;
- HashIntoType f, r;
bool first = true;
- NodeQueue node_q;
+ KmerQueue node_q;
std::queue<unsigned int> breadth_q;
+
unsigned int cur_breadth = 0;
const unsigned int max_breadth = (2 * _ht->_tag_density) + 1;
- const unsigned int rc_left_shift = _ht->ksize()*2 - 2;
unsigned int total = 0;
+ unsigned int nfound = 0;
- SeenSet keeper; // keep track of traversed kmers
+ Traverser traverser(_ht);
+ KmerSet keeper; // keep track of traversed kmers
- // start breadth-first search.
+ auto filter = [&] (Kmer& n) -> bool {
+ return !set_contains(keeper, n);
+ };
- node_q.push(kmer_f);
- node_q.push(kmer_r);
+ node_q.push(start_kmer);
breadth_q.push(0);
while(!node_q.empty()) {
+
if (stop_big_traversals && keeper.size() > BIG_TRAVERSALS_ARE) {
tagged_kmers.clear();
break;
}
- kmer_f = node_q.front();
- node_q.pop();
- kmer_r = node_q.front();
+ Kmer node = node_q.front();
node_q.pop();
+
unsigned int breadth = breadth_q.front();
breadth_q.pop();
- HashIntoType kmer = uniqify_rc(kmer_f, kmer_r);
-
- // Have we already seen this k-mer? If so, skip.
- // @cswelcher this is redundant, as we already check before queuing
- if (set_contains(keeper, kmer)) {
+ if (set_contains(keeper, node)) {
continue;
}
- // Do we want to traverse through this k-mer? If not, skip.
- if (break_on_stop_tags && set_contains(_ht->stop_tags, kmer)) {
- // @CTB optimize by inserting into keeper set?
+ if (break_on_stop_tags && set_contains(_ht->stop_tags, node)) {
continue;
}
// keep track of seen kmers
- keeper.insert(kmer);
+ keeper.insert(node);
total++;
// Is this a kmer-to-tag, and have we put this tag in a partition
// already? Search no further in this direction. (This is where we
// connect partitions.)
- if (!first && set_contains(all_tags, kmer)) {
- tagged_kmers.insert(kmer);
+ if (!first && set_contains(all_tags, node)) {
+ tagged_kmers.insert(node);
continue;
}
if (!(breadth >= cur_breadth)) { // keep track of watermark, for
- // debugging.
- throw khmer_exception();
+ // debugging
+ throw khmer_exception("Desynchonization between traversal "
+ "and breadth tracking. Did you forget "
+ "to pop the node or breadth queue?");
}
if (breadth > cur_breadth) {
cur_breadth = breadth;
@@ -507,83 +411,12 @@ void SubsetPartition::find_all_tags(
continue; // truncate search @CTB exit?
}
- //
- // Enqueue next set of nodes.
- //
-
- // NEXT
- f = next_f(kmer_f, 'A');
- r = next_r(kmer_r, 'A');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'C');
- r = next_r(kmer_r, 'C');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'G');
- r = next_r(kmer_r, 'G');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'T');
- r = next_r(kmer_r, 'T');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- // PREVIOUS.
- r = prev_r(kmer_r, 'A');
- f = prev_f(kmer_f, 'A');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
+ nfound = traverser.traverse_right(node, node_q, filter);
+ for (unsigned int i = 0; i<nfound; ++i) breadth_q.push(breadth + 1);
- r = prev_r(kmer_r, 'C');
- f = prev_f(kmer_f, 'C');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
+ nfound = traverser.traverse_left(node, node_q, filter);
+ for (unsigned int i = 0; i<nfound; ++i) breadth_q.push(breadth + 1);
- r = prev_r(kmer_r, 'G');
- f = prev_f(kmer_f, 'G');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'T');
- f = prev_f(kmer_f, 'T');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
first = false;
}
@@ -602,107 +435,81 @@ unsigned int SubsetPartition::sweep_for_tags(
bool stop_big_traversals)
{
- SeenSet traversed_kmers;
- NodeQueue node_q;
+ Traverser traverser(_ht);
+ KmerSet traversed_nodes;
+ KmerQueue node_q;
std::queue<unsigned int> breadth_q;
- //unsigned int cur_breadth = 0;
unsigned int max_breadth = range;
- //unsigned int breadth_seen = 0;
-
unsigned int total = 0;
+ unsigned int nfound = 0;
- // start breadth-first search.
-
- HashIntoType kmer_f, kmer_r;
- KMerIterator kmers(seq.c_str(), _ht->ksize());
- std::string kmer_s;
+ auto filter = [&] (Kmer& n) -> bool {
+ return !set_contains(traversed_nodes, n);
+ };
// Queue up all the sequence's k-mers at breadth zero
// We are searching around the perimeter of the known k-mers
- // @cswelcher still using kludgy kmer iterator, let's fix this sometime...
+ KmerIterator kmers(seq.c_str(), _ht->ksize());
while (!kmers.done()) {
- HashIntoType kmer = kmers.next();
- kmer_s = _revhash(kmer, _ht->ksize());
- kmer = _hash(kmer_s.c_str(), _ht->ksize(), kmer_f, kmer_r);
- traversed_kmers.insert(kmer);
+ Kmer node = kmers.next();
+ traversed_nodes.insert(node);
- node_q.push(kmer_f);
- node_q.push(kmer_r);
+ node_q.push(node);
breadth_q.push(0);
}
- size_t seq_length = node_q.size() / 2;
+ // node_q now holds one Kmer per k-mer rather than an f/r pair
+ size_t seq_length = node_q.size();
size_t BIG_PERIMETER_TRAVERSALS = BIG_TRAVERSALS_ARE * seq_length;
- //unsigned int cur_it = 0;
while(!node_q.empty()) {
// change this to a better heuristic
- if (stop_big_traversals && traversed_kmers.size() >
+ if (stop_big_traversals && traversed_nodes.size() >
BIG_PERIMETER_TRAVERSALS) {
tagged_kmers.clear();
break;
}
- kmer_f = node_q.front();
- node_q.pop();
- kmer_r = node_q.front();
+ Kmer node = node_q.front();
node_q.pop();
+
unsigned int breadth = breadth_q.front();
breadth_q.pop();
- //cur_it++;
- //printf("current iteration: %u, current breadth: %u\n", cur_it, breadth);
-
- //if (breadth > breadth_seen) {
- // breadth_seen = breadth;
- //}
-
- HashIntoType kmer = uniqify_rc(kmer_f, kmer_r);
-
- // Have we already seen this k-mer? If so, skip.
- // @cswelcher we already check before queuing
- //if (set_contains(traversed_kmers, kmer)) {
- // continue;
- //}
// Do we want to traverse through this k-mer? If not, skip.
- if (break_on_stop_tags && set_contains(_ht->stop_tags, kmer)) {
- // @CTB optimize by inserting into traversed_kmers set?
+ if (break_on_stop_tags && set_contains(_ht->stop_tags, node)) {
continue;
}
- // keep track of seen kmers
- traversed_kmers.insert(kmer);
+ traversed_nodes.insert(node);
total++;
- //
- if (set_contains(all_tags, kmer)) {
- tagged_kmers.insert(kmer);
+ if (set_contains(all_tags, node)) {
+ tagged_kmers.insert(node);
// if we find a tag, finish the remaining queued nodes,
// but don't queue up any more
// max_breadth = breadth;
continue;
}
- // removed for not doing anything
- //assert(breadth >= cur_breadth); // keep track of watermark, for
- //debugging.
- //if (breadth > cur_breadth) { cur_breadth = breadth; }
-
if (breadth == max_breadth) {
continue;
}
+
// finish up nodes on the current level, but if we go beyond, end it
- // immediately this keeps from having to look at nodes which have
+ // immediately; this avoids looking at nodes which have
// already been queued once we lower the limit after finding a tag
else if (breadth > max_breadth) {
- return total; // truncate search @CTB exit?
+ return total;
}
- queue_neighbors(kmer_f, kmer_r, breadth, traversed_kmers, node_q,
- breadth_q);
+ nfound = traverser.traverse_right(node, node_q, filter);
+ for (unsigned int i = 0; i<nfound; ++i) breadth_q.push(breadth + 1);
+
+ nfound = traverser.traverse_left(node, node_q, filter);
+ for (unsigned int i = 0; i<nfound; ++i) breadth_q.push(breadth + 1);
}
- //printf("breadth_seen=%u, total=%u, traverse_kmers=%u\n", breadth_seen, total, traversed_kmers.size());
+
return total;
}
@@ -710,8 +517,7 @@ unsigned int SubsetPartition::sweep_for_tags(
// connected to kmer_f/kmer_r in the graph.
void SubsetPartition::find_all_tags_truncate_on_abundance(
- HashIntoType kmer_f,
- HashIntoType kmer_r,
+ Kmer start_kmer,
SeenSet& tagged_kmers,
const SeenSet& all_tags,
BoundedCounterType min_count,
@@ -719,24 +525,25 @@ void SubsetPartition::find_all_tags_truncate_on_abundance(
bool break_on_stop_tags,
bool stop_big_traversals)
{
- const HashIntoType bitmask = _ht->bitmask;
- HashIntoType f, r;
bool first = true;
- NodeQueue node_q;
+ KmerQueue node_q;
std::queue<unsigned int> breadth_q;
- unsigned int cur_breadth = 0;
+ unsigned int cur_breadth = 0;
const unsigned int max_breadth = (2 * _ht->_tag_density) + 1;
- const unsigned int rc_left_shift = _ht->ksize()*2 - 2;
+
unsigned int total = 0;
+ unsigned int nfound = 0;
- SeenSet keeper; // keep track of traversed kmers
+ Traverser traverser(_ht);
+ KmerSet keeper; // keep track of traversed kmers
- // start breadth-first search.
+ auto filter = [&] (Kmer& n) -> bool {
+ return !set_contains(keeper, n);
+ };
- node_q.push(kmer_f);
- node_q.push(kmer_r);
+ node_q.push(start_kmer);
breadth_q.push(0);
while(!node_q.empty()) {
@@ -745,47 +552,47 @@ void SubsetPartition::find_all_tags_truncate_on_abundance(
break;
}
- kmer_f = node_q.front();
- node_q.pop();
- kmer_r = node_q.front();
+ Kmer node = node_q.front();
node_q.pop();
+
unsigned int breadth = breadth_q.front();
breadth_q.pop();
- HashIntoType kmer = uniqify_rc(kmer_f, kmer_r);
-
// Have we already seen this k-mer? If so, skip.
- if (set_contains(keeper, kmer)) {
+ // NOTE: redundant; this check could move to before the while loop
+ if (set_contains(keeper, node)) {
continue;
}
// Do we want to traverse through this k-mer? If not, skip.
- if (break_on_stop_tags && set_contains(_ht->stop_tags, kmer)) {
+ if (break_on_stop_tags && set_contains(_ht->stop_tags, node)) {
// @CTB optimize by inserting into keeper set?
continue;
}
- BoundedCounterType count = _ht->get_count(kmer);
+ BoundedCounterType count = _ht->get_count(node);
if (count < min_count || count > max_count) {
continue;
}
// keep track of seen kmers
- keeper.insert(kmer);
+ keeper.insert(node);
total++;
// Is this a kmer-to-tag, and have we put this tag in a partition
// already? Search no further in this direction. (This is where we
// connect partitions.)
- if (!first && set_contains(all_tags, kmer)) {
- tagged_kmers.insert(kmer);
+ if (!first && set_contains(all_tags, node)) {
+ tagged_kmers.insert(node);
continue;
}
// @cswelcher Do these lines actually do anything?
if (!(breadth >= cur_breadth)) { // keep track of watermark, for
// debugging.
- throw khmer_exception();
+ throw khmer_exception("Desynchonization between traversal "
+ "and breadth tracking. Did you forget "
+ "to pop the node or breadth queue?");
}
if (breadth > cur_breadth) {
cur_breadth = breadth;
@@ -795,83 +602,11 @@ void SubsetPartition::find_all_tags_truncate_on_abundance(
continue; // truncate search @CTB exit?
}
- //
- // Enqueue next set of nodes.
- //
-
- // NEXT
- f = next_f(kmer_f, 'A');
- r = next_r(kmer_r, 'A');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
+ nfound = traverser.traverse_right(node, node_q, filter);
+ for (unsigned int i = 0; i<nfound; ++i) breadth_q.push(breadth + 1);
- f = next_f(kmer_f, 'C');
- r = next_r(kmer_r, 'C');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'G');
- r = next_r(kmer_r, 'G');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- f = next_f(kmer_f, 'T');
- r = next_r(kmer_r, 'T');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- // PREVIOUS.
- r = prev_r(kmer_r, 'A');
- f = prev_f(kmer_f, 'A');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'C');
- f = prev_f(kmer_f, 'C');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'G');
- f = prev_f(kmer_f, 'G');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
-
- r = prev_r(kmer_r, 'T');
- f = prev_f(kmer_f, 'T');
- if (_ht->get_count(uniqify_rc(f,r)) &&
- !set_contains(keeper, uniqify_rc(f,r))) {
- node_q.push(f);
- node_q.push(r);
- breadth_q.push(breadth + 1);
- }
+ nfound = traverser.traverse_left(node, node_q, filter);
+ for (unsigned int i = 0; i<nfound; ++i) breadth_q.push(breadth + 1);
first = false;
}
@@ -889,11 +624,7 @@ void SubsetPartition::do_partition(
{
unsigned int total_reads = 0;
- std::string kmer_s;
- HashIntoType kmer_f, kmer_r;
SeenSet tagged_kmers;
- const unsigned char ksize = _ht->ksize();
-
SeenSet::const_iterator si, end;
if (first_kmer) {
@@ -910,12 +641,11 @@ void SubsetPartition::do_partition(
for (; si != end; ++si) {
total_reads++;
- kmer_s = _revhash(*si, ksize); // @CTB hackity hack hack!
- HashIntoType kmer = _hash(kmer_s.c_str(), ksize, kmer_f, kmer_r);
+ Kmer kmer = _ht->build_kmer(*si);
// find all tagged kmers within range.
tagged_kmers.clear();
- find_all_tags(kmer_f, kmer_r, tagged_kmers, _ht->all_tags,
+ find_all_tags(kmer, tagged_kmers, _ht->all_tags,
break_on_stop_tags, stop_big_traversals);
// assign the partition ID
@@ -950,11 +680,7 @@ void SubsetPartition::do_partition_with_abundance(
{
unsigned int total_reads = 0;
- std::string kmer_s;
- HashIntoType kmer_f, kmer_r;
SeenSet tagged_kmers;
- const unsigned char ksize = _ht->ksize();
-
SeenSet::const_iterator si, end;
if (first_kmer) {
@@ -971,12 +697,11 @@ void SubsetPartition::do_partition_with_abundance(
for (; si != end; ++si) {
total_reads++;
- kmer_s = _revhash(*si, ksize); // @CTB hackity hack hack!
- HashIntoType kmer = _hash(kmer_s.c_str(), ksize, kmer_f, kmer_r);
+ Kmer kmer = _ht->build_kmer(*si);
// find all tagged kmers within range.
tagged_kmers.clear();
- find_all_tags_truncate_on_abundance(kmer_f, kmer_r, tagged_kmers,
+ find_all_tags_truncate_on_abundance(kmer, tagged_kmers,
_ht->all_tags, min_count,
max_count, break_on_stop_tags,
stop_big_traversals);
@@ -1537,7 +1262,7 @@ bool SubsetPartition::is_single_partition(std::string seq)
PartitionSet partitions;
PartitionID *pp;
- KMerIterator kmers(seq.c_str(), _ht->ksize());
+ KmerIterator kmers(seq.c_str(), _ht->ksize());
while (!kmers.done()) {
HashIntoType kmer = kmers.next();
@@ -1560,7 +1285,7 @@ void SubsetPartition::join_partitions_by_path(std::string seq)
{
SeenSet tagged_kmers;
- KMerIterator kmers(seq.c_str(), _ht->ksize());
+ KmerIterator kmers(seq.c_str(), _ht->ksize());
while(!kmers.done()) {
HashIntoType kmer = kmers.next();
@@ -1710,7 +1435,7 @@ unsigned long long SubsetPartition::repartition_largest_partition(
unsigned int n = 0;
unsigned int count;
unsigned int n_big = 0;
- SeenSet keeper;
+ KmerSet keeper;
SeenSet::const_iterator si = bigtags.begin();
@@ -1723,15 +1448,16 @@ unsigned long long SubsetPartition::repartition_largest_partition(
}
#endif //0
- count = _ht->traverse_from_kmer(*si, distance, keeper);
+ count = _ht->traverse_from_kmer(_ht->build_kmer(*si),
+ distance, keeper);
if (count >= threshold) {
n_big++;
- SeenSet::const_iterator ti;
+ KmerSet::const_iterator ti;
for (ti = keeper.begin(); ti != keeper.end(); ++ti) {
if (counting.get_count(*ti) > frequency) {
- _ht->stop_tags.insert(*ti);
+ _ht->stop_tags.insert((*ti).kmer_u);
} else {
counting.count(*ti);
}
@@ -1749,13 +1475,13 @@ unsigned long long SubsetPartition::repartition_largest_partition(
}
keeper.clear();
- if (n % 1000 == 0) {
#if VERBOSE_REPARTITION
+ if (n % 1000 == 0) {
std::cout << "found big 'un! traversed " << n << " tags, " <<
n_big << " big; " << bigtags.size() << " total tags; " <<
_ht->stop_tags.size() << " stop tags\n";
-#endif // 0
}
+#endif // 0
}
// return next_largest;
@@ -1772,10 +1498,6 @@ unsigned long long SubsetPartition::repartition_largest_partition(
void SubsetPartition::repartition_a_partition(const SeenSet& partition_tags)
{
SeenSet tagged_kmers;
- std::string kmer_s;
- HashIntoType kmer_f, kmer_r;
- unsigned int ksize = _ht->ksize();
-
SeenSet::const_iterator si;
unsigned n = 0;
@@ -1787,11 +1509,10 @@ void SubsetPartition::repartition_a_partition(const SeenSet& partition_tags)
#endif // 0
}
- kmer_s = _revhash(*si, ksize); // @CTB hackity hack hack!
- HashIntoType kmer = _hash(kmer_s.c_str(), ksize, kmer_f, kmer_r);
+ Kmer kmer = _ht->build_kmer(*si);
tagged_kmers.clear();
- find_all_tags(kmer_f, kmer_r, tagged_kmers, _ht->all_tags, true, false);
+ find_all_tags(kmer, tagged_kmers, _ht->all_tags, true, false);
// only join things already in bigtags.
SeenSet::iterator ssi = tagged_kmers.begin();
diff --git a/lib/subset.hh b/lib/subset.hh
index 67a5ba7..9a51151 100644
--- a/lib/subset.hh
+++ b/lib/subset.hh
@@ -13,6 +13,7 @@
#include <string>
#include "khmer.hh"
+#include "traversal.hh"
namespace khmer
{
@@ -44,10 +45,7 @@ protected:
const HashIntoType kmer);
public:
- explicit SubsetPartition(Hashtable * ht) : next_partition_id(2), _ht(ht)
- {
- ;
- };
+ explicit SubsetPartition(Hashtable * ht);
~SubsetPartition()
{
@@ -77,14 +75,7 @@ public:
void load_partitionmap(std::string infile);
void _validate_pmap();
- void queue_neighbors(HashIntoType kmer_f,
- HashIntoType kmer_r,
- unsigned int breadth,
- SeenSet& traversed_kmers,
- NodeQueue& node_q,
- std::queue<unsigned int>& breadth_q);
-
- void find_all_tags(HashIntoType kmer_f, HashIntoType kmer_r,
+ void find_all_tags(Kmer start_kmer,
SeenSet& tagged_kmers,
const SeenSet& all_tags,
bool break_on_stop_tags=false,
@@ -97,8 +88,7 @@ public:
bool break_on_stop_tags,
bool stop_big_traversals);
- void find_all_tags_truncate_on_abundance(HashIntoType kmer_f,
- HashIntoType kmer_r,
+ void find_all_tags_truncate_on_abundance(Kmer start_kmer,
SeenSet& tagged_kmers,
const SeenSet& all_tags,
BoundedCounterType min_count,
diff --git a/lib/traversal.cc b/lib/traversal.cc
new file mode 100644
index 0000000..535a893
--- /dev/null
+++ b/lib/traversal.cc
@@ -0,0 +1,118 @@
+//
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) University of California Davis, 2015. It is licensed under
+// the three-clause BSD license; see LICENSE.
+// Contact: khmer-project at idyll.org
+//
+
+#include "hashtable.hh"
+#include "traversal.hh"
+
+using namespace khmer;
+using namespace std;
+
+Traverser::Traverser(const Hashtable * ht) :
+ KmerFactory(ht->ksize()), graph(ht)
+{
+ bitmask = 0;
+ for (unsigned int i = 0; i < _ksize; i++) {
+ bitmask = (bitmask << 2) | 3;
+ }
+ rc_left_shift = _ksize * 2 - 2;
+}
+
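+// Build the k-mer one base to the left of node: ch is prepended on the
+// forward strand and its complement appended on the reverse strand.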
+Kmer Traverser::get_left(Kmer& node, const char ch)
+{
+ HashIntoType kmer_f, kmer_r;
+ kmer_f = ((node.kmer_f) >> 2 | twobit_repr(ch) << rc_left_shift);
+ kmer_r = (((node.kmer_r) << 2) & bitmask) | (twobit_comp(ch));
+ return build_kmer(kmer_f, kmer_r);
+}
+
+
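+// Build the k-mer one base to the right of node: ch is appended on the
+// forward strand and its complement prepended on the reverse strand.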
+Kmer Traverser::get_right(Kmer& node, const char ch)
+{
+ HashIntoType kmer_f, kmer_r;
+ kmer_f = (((node.kmer_f) << 2) & bitmask) | (twobit_repr(ch));
+ kmer_r = ((node.kmer_r) >> 2) | (twobit_comp(ch) << rc_left_shift);
+ return build_kmer(kmer_f, kmer_r);
+}
+
+unsigned int Traverser::traverse_left(Kmer& node,
+ KmerQueue & node_q,
+ std::function<bool (Kmer&)> filter)
+{
+ unsigned int found = 0;
+
+ char bases[] = "ACGT";
+ char * base = bases;
+ while(*base != '\0') {
+ Kmer prev_node = get_left(node, *base);
+ if (graph->get_count(prev_node) && filter(prev_node)) {
+ node_q.push(prev_node);
+ ++found;
+ }
+ ++base;
+ }
+
+ return found;
+}
+
+unsigned int Traverser::traverse_right(Kmer& node,
+ KmerQueue & node_q,
+ std::function<bool (Kmer&)> filter)
+{
+ unsigned int found = 0;
+
+ char bases[] = "ACGT";
+ char * base = bases;
+ while(*base != '\0') {
+ Kmer next_node = get_right(node, *base);
+ if (graph->get_count(next_node) && filter(next_node)) {
+ node_q.push(next_node);
+ ++found;
+ }
+ ++base;
+ }
+
+ return found;
+}
+
+unsigned int Traverser::degree_left(Kmer& node)
+{
+ unsigned int degree = 0;
+
+ char bases[] = "ACGT";
+ char * base = bases;
+ while(*base != '\0') {
+ Kmer prev_node = get_left(node, *base);
+ if (graph->get_count(prev_node)) {
+ ++degree;
+ }
+ ++base;
+ }
+
+ return degree;
+}
+
+unsigned int Traverser::degree_right(Kmer& node)
+{
+ unsigned int degree = 0;
+
+ char bases[] = "ACGT";
+ char * base = bases;
+ while(*base != '\0') {
+ Kmer next_node = get_right(node, *base);
+ if (graph->get_count(next_node)) {
+ ++degree;
+ }
+ ++base;
+ }
+
+ return degree;
+}
+
+unsigned int Traverser::degree(Kmer& node)
+{
+ return degree_right(node) + degree_left(node);
+}
diff --git a/lib/traversal.hh b/lib/traversal.hh
new file mode 100644
index 0000000..6f6a3a4
--- /dev/null
+++ b/lib/traversal.hh
@@ -0,0 +1,56 @@
+//
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) University of California Davis, 2015. It is licensed under
+// the three-clause BSD license; see LICENSE.
+// Contact: khmer-project at idyll.org
+//
+
+#ifndef TRAVERSAL_HH
+#define TRAVERSAL_HH
+
+#include <queue>
+#include <functional>
+
+#include "khmer.hh"
+
+#include "khmer_exception.hh"
+#include "read_parsers.hh"
+#include "kmer_hash.hh"
+#include "hashtable.hh"
+
+namespace khmer {
+
+class Hashtable;
+
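+/**
+ * \class Traverser
+ *
+ * \brief Enumerate, filter, and enqueue the graph neighbors of a Kmer.
+ *
+ * A sketch of a single traversal step (the filter is illustrative;
+ * callers typically exclude nodes they have already visited):
+ *
+ * @code
+ * Traverser traverser(graph);   // graph: const Hashtable *
+ * KmerQueue node_q;
+ * KmerSet seen;
+ * auto filter = [&] (Kmer& n) { return !set_contains(seen, n); };
+ * // node: some Kmer known to be present in the graph
+ * unsigned int nfound = traverser.traverse_right(node, node_q, filter);
+ * @endcode
+ */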
+class Traverser: public KmerFactory
+{
+ friend class Hashtable;
+
+protected:
+
+ HashIntoType bitmask;
+ unsigned int rc_left_shift;
+
+public:
+
+ const Hashtable * graph;
+
+ explicit Traverser(const Hashtable * ht);
+
+ Kmer get_left(Kmer& node, const char ch);
+ Kmer get_right(Kmer& node, const char ch);
+
+ unsigned int traverse_left(Kmer& node,
+ KmerQueue &node_q,
+ std::function<bool (Kmer&)> filter);
+ unsigned int traverse_right(Kmer& node,
+ KmerQueue &node_q,
+ std::function<bool (Kmer&)> filter);
+
+ unsigned int degree_left(Kmer& node);
+ unsigned int degree_right(Kmer& node);
+ unsigned int degree(Kmer& node);
+};
+
+};
+#endif
diff --git a/oxli/__init__.py b/oxli/__init__.py
index 53e60ee..077f291 100755
--- a/oxli/__init__.py
+++ b/oxli/__init__.py
@@ -36,9 +36,9 @@ def get_parser():
description="Load sequences into the "
"compressible graph format plus optional tagset")
- khmer_args.build_hashbits_args("Load sequences into the compressible"
- "graph format plus optional tagset.",
- None, parser=parser_build_graph)
+ khmer_args.build_nodegraph_args("Load sequences into the compressible "
+ "graph format plus optional tagset.",
+ None, parser=parser_build_graph)
build_graph.build_parser(parser_build_graph)
parser_build_graph.set_defaults(func=build_graph.main)
diff --git a/oxli/build_graph.py b/oxli/build_graph.py
index adf4de6..022e64b 100644
--- a/oxli/build_graph.py
+++ b/oxli/build_graph.py
@@ -21,10 +21,10 @@ import sys
import khmer
from khmer import khmer_args
from khmer.khmer_args import (report_on_config, info, add_threading_args,
- calculate_tablesize)
+ calculate_graphsize)
from khmer.kfile import check_input_files, check_space
-from khmer.kfile import check_space_for_hashtable
-from oxli import functions
+from khmer.kfile import check_space_for_graph
+from oxli import functions as oxfuncs
def build_parser(parser):
@@ -33,8 +33,8 @@ def build_parser(parser):
action='store_true', dest='no_build_tagset',
help='Do NOT construct tagset while loading sequences')
parser.add_argument('output_filename',
- metavar='output_presence_table_filename', help='output'
- ' k-mer presence table filename.')
+ metavar='output_nodegraph_filename', help='output'
+ ' k-mer nodegraph filename.')
parser.add_argument('input_filenames', metavar='input_sequence_filename',
nargs='+', help='input FAST[AQ] sequence filename')
parser.add_argument('-f', '--force', default=False, action='store_true',
@@ -45,20 +45,17 @@ def build_parser(parser):
def main(args):
info('build-graph.py', ['graph', 'SeqAn'])
- report_on_config(args, hashtype='nodegraph')
+ report_on_config(args, graphtype='nodegraph')
base = args.output_filename
filenames = args.input_filenames
for fname in args.input_filenames:
check_input_files(fname, args.force)
- # if optimization args are given, do optimization
- args = functions.do_sanity_checking(args, 0.01)
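+ # size the nodegraph from the parsed arguments and verify the target
+ # filesystem has room for it before doing any work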
+ graphsize = calculate_graphsize(args, 'nodegraph')
+ check_space_for_graph(args.output_filename, graphsize, args.force)
- tablesize = calculate_tablesize(args, 'nodegraph')
- check_space_for_hashtable(args.output_filename, tablesize, args.force)
-
- print('Saving k-mer presence table to %s' % base, file=sys.stderr)
+ print('Saving k-mer nodegraph to %s' % base, file=sys.stderr)
print('Loading kmers from sequences in %s' %
repr(filenames), file=sys.stderr)
if args.no_build_tagset:
@@ -68,26 +65,27 @@ def main(args):
file=sys.stderr)
print('making nodegraph', file=sys.stderr)
- htable = khmer_args.create_nodegraph(args)
+ nodegraph = khmer_args.create_nodegraph(args)
- functions.build_graph(filenames, htable, args.threads,
- not args.no_build_tagset)
+ oxfuncs.build_graph(filenames, nodegraph, args.threads,
+ not args.no_build_tagset)
- print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()),
- file=sys.stderr)
+ print('Total number of unique k-mers: {0}'.format(
+ nodegraph.n_unique_kmers()), file=sys.stderr)
- print('saving k-mer presence table in', base + '.pt', file=sys.stderr)
- htable.save(base + '.pt')
+ print('saving k-mer nodegraph in', base, file=sys.stderr)
+ nodegraph.save(base)
if not args.no_build_tagset:
print('saving tagset in', base + '.tagset', file=sys.stderr)
- htable.save_tagset(base + '.tagset')
+ nodegraph.save_tagset(base + '.tagset')
info_fp = open(base + '.info', 'w')
- info_fp.write('%d unique k-mers' % htable.n_unique_kmers())
+ info_fp.write('%d unique k-mers' % nodegraph.n_unique_kmers())
fp_rate = \
- khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
+ khmer.calc_expected_collisions(
+ nodegraph, args.force, max_false_pos=.15)
# 0.18 is ACTUAL MAX. Do not change.
print('false positive rate estimated to be %1.3f' % fp_rate,
@@ -95,7 +93,7 @@ def main(args):
print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
file=info_fp)
- print('wrote to', base + '.info and', base + '.pt', file=sys.stderr)
+ print('wrote to ' + base + '.info and ' + base, file=sys.stderr)
if not args.no_build_tagset:
print('and ' + base + '.tagset', file=sys.stderr)
diff --git a/oxli/functions.py b/oxli/functions.py
index 5b72be4..ac5ccb8 100644
--- a/oxli/functions.py
+++ b/oxli/functions.py
@@ -18,141 +18,6 @@ import khmer.utils
import sys
-def optimal_size(num_kmers, mem_cap=None, fp_rate=None):
- """
- Utility function for estimating optimal counting table args where:
- - num_kmers: number of unique kmers [required]
- - mem_cap: the allotted amount of memory [optional, conflicts with f]
- - fp_rate: the desired false positive rate [optional, conflicts with M]
- """
- if all((num_kmers is not None, mem_cap is not None, fp_rate is None)):
- return estimate_optimal_with_K_and_M(num_kmers, mem_cap)
- elif all((num_kmers is not None, mem_cap is None, fp_rate is not None)):
- return estimate_optimal_with_K_and_f(num_kmers, fp_rate)
- else:
- raise TypeError("num_kmers and either mem_cap or fp_rate"
- " must be defined.")
-
-
-def estimate_optimal_with_K_and_M(num_kmers, mem_cap):
- """
- Utility function for estimating optimal counting table args where num_kmers
- is the number of unique kmer and mem_cap is the allotted amount of memory
- """
-
- n_tables = math.log(2) * (mem_cap / float(num_kmers))
- int_n_tables = int(n_tables)
- if int_n_tables == 0:
- int_n_tables = 1
- ht_size = int(mem_cap / int_n_tables)
- mem_cap = ht_size * int_n_tables
- fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables
- res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
- "fp_rate"])
- return res(int_n_tables, ht_size, mem_cap, fp_rate)
-
-
-def estimate_optimal_with_K_and_f(num_kmers, des_fp_rate):
- """
- Utility function for estimating optimal memory where num_kmers is the
- number of unique kmers and des_fp_rate is the desired false positive rate
- """
- n_tables = math.log(des_fp_rate, 0.5)
- int_n_tables = int(n_tables)
- if int_n_tables == 0:
- int_n_tables = 1
-
- ht_size = int(-num_kmers / (
- math.log(1 - des_fp_rate ** (1 / float(int_n_tables)))))
- mem_cap = ht_size * int_n_tables
- fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables
-
- res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
- "fp_rate"])
- return res(int_n_tables, ht_size, mem_cap, fp_rate)
-
-
-def optimal_args_output_gen(unique_kmers, fp_rate):
- """
- Assembles output string for optimal arg sandbox scripts
- takes in unique_kmers and desired fp_rate
- """
- to_print = []
-
- to_print.append('') # blank line
- to_print.append('number of unique k-mers: \t{0}'.format(unique_kmers))
- to_print.append('false positive rate: \t{:>.3f}'.format(fp_rate))
- to_print.append('') # blank line
- to_print.append('If you have expected false positive rate to achieve:')
- to_print.append('expected_fp\tnumber_hashtable(Z)\tsize_hashtable(H)\t'
- 'expected_memory_usage')
-
- for fp_rate in range(1, 10):
- num_tables, table_size, mem_cap, fp_rate = \
- optimal_size(unique_kmers, fp_rate=fp_rate / 10.0)
- to_print.append('{:11.3f}\t{:19}\t{:17e}\t{:21e}'.format(fp_rate,
- num_tables,
- table_size,
- mem_cap))
-
- mem_list = [1, 5, 10, 20, 50, 100, 200, 300, 400, 500, 1000, 2000, 5000]
-
- to_print.append('') # blank line
- to_print.append('If you have expected memory to use:')
- to_print.append('expected_memory_usage\tnumber_hashtable(Z)\t'
- 'size_hashtable(H)\texpected_fp')
-
- for mem in mem_list:
- num_tables, table_size, mem_cap, fp_rate =\
- optimal_size(unique_kmers, mem_cap=mem * 1000000000)
- to_print.append('{:21e}\t{:19}\t{:17e}\t{:11.3f}'.format(mem_cap,
- num_tables,
- table_size,
- fp_rate))
- return "\n".join(to_print)
-
-
-def do_sanity_checking(args, desired_max_fp):
- """
- simple function to check if the restrictions in the args (if there are any)
- make sense--If not, complain. If no restrictions are given, add some that
- make sense.
- Takes in args and desired max FP rate
- """
- # if optimization args are given, do optimization
- if args.unique_kmers != 0:
- if args.max_memory_usage:
- # verify that this is a sane memory usage restriction
- res = estimate_optimal_with_K_and_M(args.unique_kmers,
- args.max_memory_usage)
- if res.fp_rate > desired_max_fp:
- print("""
-*** ERROR: The given restrictions yield an estimate false positive rate of {0},
-*** which is above the recommended false positive ceiling of {1}!"""
- .format(res.fp_rate, desired_max_fp), file=sys.stderr)
- if not args.force:
- print("NOTE: This can be overridden using the --force"
- " argument", file=sys.stderr)
- print("*** Aborting...!", file=sys.stderr)
- sys.exit(1)
- else:
- res = estimate_optimal_with_K_and_f(args.unique_kmers,
- desired_max_fp)
- if args.max_tablesize and args.max_tablesize < res.htable_size:
- print("*** Warning: The given tablesize is too small!",
- file=sys.stderr)
- print("*** Estimating false positive rate to be {0}".format(
- res.fp_rate), file=sys.stderr)
- else:
- print("*** INFO: set memory ceiling using auto optimization.",
- file=sys.stderr)
- print("*** Ceiling is: {0} bytes\n".format(res.mem_use),
- file=sys.stderr)
- args.max_mem = res.mem_use
-
- return args
-
-
def build_graph(ifilenames, graph, num_threads=1, tags=False):
"""
Algorithm to construct a counting graph from a set of input files
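The sizing helpers deleted above now live in khmer.khmer_args (the estimate_optimal_hash.py hunk below imports optimal_size from there). The arithmetic they encode is the standard Bloom filter trade-off; a minimal sketch restating the deleted docstrings, with illustrative names and an assumed 50M-k-mer, 1 GB workload:

    import math
    from collections import namedtuple

    Result = namedtuple("Result", ["num_htables", "htable_size", "mem_use", "fp_rate"])

    def size_for_memory(num_kmers, mem_cap):
        # Optimal table count for a fixed memory budget: ln(2) * M / K.
        n_tables = max(1, int(math.log(2) * (mem_cap / float(num_kmers))))
        ht_size = int(mem_cap / n_tables)
        # Bloom filter false positive rate: (1 - e^(-K/H)) ** Z.
        fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** n_tables
        return Result(n_tables, ht_size, ht_size * n_tables, fp_rate)

    # 50M unique k-mers in a 1 GB budget: 13 tables, fp_rate around 1e-4.
    print(size_for_memory(5e7, 1e9))

The removed estimate_optimal_with_K_and_f inverts the same relationship: pick Z as the log base 0.5 of the target false positive rate, then solve the rate formula for H.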
diff --git a/sandbox/README.rst b/sandbox/README.rst
index 36ffa88..79a7f13 100644
--- a/sandbox/README.rst
+++ b/sandbox/README.rst
@@ -65,11 +65,15 @@ Good ideas to rewrite using newer tools/approaches:
* assembly-diff.py - find sequences that differ between two assemblies
* assembly-diff-2.py - find subsequences that differ between two assemblies
* bloom-count.py - count # of unique k-mers; should be reimplemented with HyperLogLog. Renamed from bloom_count.py in commit 4788c31.
-* bloom-count-intersection.py - look at unique and disjoint #s of k-mers, Renamed from bloom_count_intersection.py in commit 4788c31.
* split-sequences-by-length.py - break up short reads by length
----
+Present in commit 19b0a09353cddc45070edcf1283cae2c83c13b0e but removed
+thereafter:
+
+* `bloom-count-intersection.py <https://github.com/dib-lab/khmer/blob/19b0a09353cddc45070edcf1283cae2c83c13b0e/sandbox/bloom-count-intersection.py>`__ - look at unique and disjoint #s of k-mers; renamed from bloom_count_intersection.py in commit 4788c31.
+
Present in commit d295bc847 but removed thereafter:
* `combine-pe.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/combine-pe.py>`__ - combine partitions based on shared PE reads.
diff --git a/sandbox/assembly-diff-2.py b/sandbox/assembly-diff-2.py
index 1d39969..ca5b67b 100755
--- a/sandbox/assembly-diff-2.py
+++ b/sandbox/assembly-diff-2.py
@@ -24,7 +24,7 @@ def main():
filename2 = sys.argv[2]
uniq2 = open(os.path.basename(sys.argv[2]) + '.uniq', 'w')
- kh = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
+ kh = khmer.Nodegraph(K, HASHTABLE_SIZE, N_HT)
for n, record in enumerate(screed.open(filename1)):
if n % 10000 == 0:
print('...', filename1, n)
diff --git a/sandbox/assembly-diff.py b/sandbox/assembly-diff.py
index e3fdee0..c884d54 100755
--- a/sandbox/assembly-diff.py
+++ b/sandbox/assembly-diff.py
@@ -26,9 +26,9 @@ def main():
uniq2 = open(os.path.basename(sys.argv[2]) + '.uniq', 'w')
paths = sys.argv[3]
- kh1 = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
+ kh1 = khmer.Nodegraph(K, HASHTABLE_SIZE, N_HT)
kh1.consume_fasta(filename1)
- kh2 = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
+ kh2 = khmer.Nodegraph(K, HASHTABLE_SIZE, N_HT)
kh2.consume_fasta(filename2)
for record in screed.open(paths):
diff --git a/sandbox/bloom-count-intersection.py b/sandbox/bloom-count-intersection.py
deleted file mode 100755
index 71405d4..0000000
--- a/sandbox/bloom-count-intersection.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from __future__ import print_function
-#! /usr/bin/env python
-#
-# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2015. It is licensed under
-# the three-clause BSD license; see LICENSE.
-# Contact: khmer-project at idyll.org
-#
-# using bloom filter to count intersection
-
-import khmer
-import sys
-import screed
-from screed.fasta import fasta_iter
-
-
-def main():
- filename = sys.argv[1]
- K = int(sys.argv[2]) # size of kmer
- HT_SIZE = int(sys.argv[3]) # size of hashtable
- N_HT = int(sys.argv[4]) # number of hashtables
-
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
-
- n_unique = 0
- for n, record in enumerate(fasta_iter(open(filename))):
- sequence = record['sequence']
- seq_len = len(sequence)
- for n in range(0, seq_len + 1 - K):
- kmer = sequence[n:n + K]
- if (not ht.get(kmer)):
- n_unique += 1
- ht.count(kmer)
- print(filename, 'has been consumed.')
- print('# of unique kmers:', n_unique)
- print('# of occupied bin:', ht.n_occupied())
-
- filename2 = sys.argv[5]
- ht2 = khmer.Hashbits(K, HT_SIZE, N_HT)
- n_unique = 0
- n_overlap = 0
- for n, record in enumerate(fasta_iter(open(filename2))):
- sequence = record['sequence']
- seq_len = len(sequence)
- for n in range(0, seq_len + 1 - K):
- kmer = sequence[n:n + K]
- if (not ht2.get(kmer)):
- n_unique += 1
- if (ht.get(kmer)):
- n_overlap += 1
- ht2.count(kmer)
-
- print(filename2, 'has been consumed.')
- print('# of unique kmers:', n_unique)
- print('# of occupied bin:', ht2.n_occupied())
-
- print(n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2)
-
-
-if __name__ == '__main__':
- main()
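The deleted logic ports directly to the khmer 2.0 names used throughout this patch; a sketch of the same intersection count with Nodegraph in place of Hashbits, where the k-mer size, table sizing, and filenames are placeholders:

    import khmer
    import screed

    K, HT_SIZE, N_HT = 20, int(1e8), 4      # placeholder sizing

    def count_unique(filename, graph, other=None):
        # Stream k-mers into `graph`, counting first sightings and (optionally)
        # how many of those are already present in `other`.
        n_unique = n_overlap = 0
        for record in screed.open(filename):
            seq = record.sequence
            for i in range(len(seq) - K + 1):
                kmer = seq[i:i + K]
                if not graph.get(kmer):
                    n_unique += 1
                    if other is not None and other.get(kmer):
                        n_overlap += 1
                graph.count(kmer)
        return n_unique, n_overlap

    ht1 = khmer.Nodegraph(K, HT_SIZE, N_HT)
    ht2 = khmer.Nodegraph(K, HT_SIZE, N_HT)
    u1, _ = count_unique('a.fa', ht1)
    u2, overlap = count_unique('b.fa', ht2, other=ht1)
    print(u1, u2, overlap)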
diff --git a/sandbox/bloom-count.py b/sandbox/bloom-count.py
index fc833cc..5b32b12 100755
--- a/sandbox/bloom-count.py
+++ b/sandbox/bloom-count.py
@@ -20,7 +20,7 @@ def main():
HT_SIZE = int(sys.argv[3]) # size of hashtable
N_HT = int(sys.argv[4]) # number of hashtables
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+ ht = khmer.Nodegraph(K, HT_SIZE, N_HT)
n_unique = 0
for n, record in enumerate(fasta_iter(open(filename))):
diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py
index 1a12b3b..6c9e734 100755
--- a/sandbox/build-sparse-graph.py
+++ b/sandbox/build-sparse-graph.py
@@ -20,7 +20,7 @@ def main():
K = int(sys.argv[1])
x = float(sys.argv[2])
- ht = khmer.Hashbits(K, x, 4)
+ ht = khmer.Nodegraph(K, x, 4)
sparse_graph = gt.Graph()
hashes = sparse_graph.new_vertex_property("long long")
diff --git a/sandbox/calc-error-profile.py b/sandbox/calc-error-profile.py
index 2cceb30..10225ee 100755
--- a/sandbox/calc-error-profile.py
+++ b/sandbox/calc-error-profile.py
@@ -67,7 +67,7 @@ def main():
# build a small counting hash w/default parameters. In general there
# should be no need to change these parameters.
- ht = khmer.CountingHash(K, HASHSIZE, N_HT)
+ ht = khmer.Countgraph(K, HASHSIZE, N_HT)
# initialize list to contain counts of errors by position
positions = [0] * MAX_SEQ_LEN
diff --git a/sandbox/calc-median-distribution.py b/sandbox/calc-median-distribution.py
index a67a99c..9cd6e52 100755
--- a/sandbox/calc-median-distribution.py
+++ b/sandbox/calc-median-distribution.py
@@ -30,7 +30,7 @@ def main():
outfp = open(histout, 'w')
print('hashtable from', hashfile)
- ht = khmer.load_counting_hash(hashfile)
+ ht = khmer.load_countgraph(hashfile)
hist = {}
diff --git a/sandbox/collect-reads.py b/sandbox/collect-reads.py
index ca29727..ddf1c25 100755
--- a/sandbox/collect-reads.py
+++ b/sandbox/collect-reads.py
@@ -22,9 +22,9 @@ import textwrap
import khmer
from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, report_on_config, info,
- calculate_tablesize)
+ calculate_graphsize)
from khmer.kfile import check_input_files, check_space
-from khmer.kfile import check_space_for_hashtable
+from khmer.kfile import check_space_for_graph
import argparse
import screed
@@ -48,8 +48,8 @@ def get_parser():
parser = build_counting_args("Collect reads until a given avg coverage.",
epilog=textwrap.dedent(epilog))
- parser.add_argument('output_countingtable_filename', help="The name of the"
- " file to write the k-mer counting table to.")
+ parser.add_argument('output_countgraph_filename', help="The name of the"
+ " file to write the k-mer countgraph to.")
parser.add_argument('input_sequence_filename', nargs='+',
help="The names of one or more FAST[AQ] input "
"sequence files.")
@@ -71,18 +71,18 @@ def main():
args = get_parser().parse_args()
report_on_config(args)
- base = args.output_countingtable_filename
+ base = args.output_countgraph_filename
filenames = args.input_sequence_filename
for name in args.input_sequence_filename:
check_input_files(name, False)
check_space(args.input_sequence_filename, False)
- tablesize = calculate_tablesize(args, 'countgraph')
- check_space_for_hashtable(args.output_countingtable_filename, tablesize,
+ tablesize = calculate_graphsize(args, 'countgraph')
+ check_space_for_graph(args.output_countgraph_filename, tablesize,
False)
- print('Saving k-mer counting table to %s' % base)
+ print('Saving k-mer countgraph to %s' % base)
print('Loading sequences from %s' % repr(filenames))
if args.output:
print('Outputting sequences to', args.output)
diff --git a/sandbox/collect-variants.py b/sandbox/collect-variants.py
index 57af85d..3a6c892 100755
--- a/sandbox/collect-variants.py
+++ b/sandbox/collect-variants.py
@@ -61,10 +61,10 @@ def main():
if args.loadhash:
print('loading hashtable from', args.loadhash)
- ht = khmer.load_counting_hash(args.loadhash)
+ ht = khmer.load_countgraph(args.loadhash)
else:
print('making hashtable')
- ht = khmer.CountingHash(K, HT_SIZE, N_HT)
+ ht = khmer.Countgraph(K, HT_SIZE, N_HT)
aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta)
diff --git a/sandbox/correct-reads.py b/sandbox/correct-reads.py
index a86f9be..46e70d9 100755
--- a/sandbox/correct-reads.py
+++ b/sandbox/correct-reads.py
@@ -25,10 +25,10 @@ import shutil
import textwrap
import argparse
-from khmer.khmer_args import (build_counting_args, info, add_loadhash_args,
+from khmer.khmer_args import (build_counting_args, info, add_loadgraph_args,
report_on_config)
from khmer.utils import write_record, write_record_pair, broken_paired_reader
-from khmer.kfile import (check_space, check_space_for_hashtable,
+from khmer.kfile import (check_space, check_space_for_graph,
check_valid_file_exists)
DEFAULT_NORMALIZE_LIMIT = 20
@@ -98,9 +98,9 @@ def get_parser():
default=False,
help='Only correct sequences that have high coverage.')
- add_loadhash_args(parser)
- parser.add_argument('-s', '--savetable', metavar="filename", default='',
- help='save the k-mer counting table to disk after all'
+ add_loadgraph_args(parser)
+ parser.add_argument('-s', '--savegraph', metavar="filename", default='',
+ help='save the k-mer countgraph to disk after all'
'reads are loaded.')
# expert options
@@ -129,8 +129,8 @@ def main():
report_on_config(args)
check_valid_file_exists(args.input_filenames)
check_space(args.input_filenames, args.force)
- if args.savetable:
- check_space_for_hashtable(
+ if args.savegraph:
+ check_space_for_graph(
args.n_tables * args.min_tablesize, args.force)
K = args.ksize
@@ -138,12 +138,12 @@ def main():
CUTOFF = args.cutoff
NORMALIZE_LIMIT = args.normalize_to
- if args.loadtable:
- print >>sys.stderr, 'loading k-mer counting table from', args.loadtable
- ct = khmer.load_counting_hash(args.loadtable)
+ if args.loadgraph:
+ print >>sys.stderr, 'loading k-mer countgraph from', args.loadgraph
+ ct = khmer.load_countgraph(args.loadgraph)
else:
- print >>sys.stderr, 'making k-mer counting table'
- ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)
+ print >>sys.stderr, 'making k-mer countgraph'
+ ct = khmer.new_countgraph(K, args.min_tablesize, args.n_tables)
tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
print >>sys.stderr, 'created temporary directory %s; ' \
@@ -338,9 +338,9 @@ def main():
print >>sys.stderr, 'output in *.corr'
- if args.savetable:
- print >>sys.stderr, "Saving k-mer counting table to", args.savetable
- ct.save(args.savetable)
+ if args.savegraph:
+ print >>sys.stderr, "Saving k-mer countgraph to", args.savegraph
+ ct.save(args.savegraph)
if __name__ == '__main__':
diff --git a/sandbox/count-kmers-single.py b/sandbox/count-kmers-single.py
index 7cb49c1..267eadc 100755
--- a/sandbox/count-kmers-single.py
+++ b/sandbox/count-kmers-single.py
@@ -8,7 +8,7 @@
# pylint: disable=missing-docstring,invalid-name
"""
Produce k-mer counts for all the k-mers in the given sequence file,
-using the given counting table.
+using the given countgraph.
% python sandbox/count-kmers-single.py <fasta/fastq>
@@ -24,7 +24,7 @@ import csv
from khmer.khmer_args import (build_counting_args, report_on_config, info,
add_threading_args)
from khmer.kfile import (check_input_files, check_space,
- check_space_for_hashtable)
+ check_space_for_graph)
import threading
@@ -50,18 +50,18 @@ def main():
check_input_files(args.input_sequence_filename, False)
- print ('making k-mer counting table', file=sys.stderr)
- counting_hash = khmer.CountingHash(args.ksize, args.max_tablesize,
+ print ('making k-mer countgraph', file=sys.stderr)
+ countgraph = khmer.Countgraph(args.ksize, args.max_tablesize,
args.n_tables)
- # @CTB counting_hash.set_use_bigcount(args.bigcount)
+ # @CTB countgraph.set_use_bigcount(args.bigcount)
- kmer_size = counting_hash.ksize()
- hashsizes = counting_hash.hashsizes()
- tracking = khmer._Hashbits( # pylint: disable=protected-access
+ kmer_size = countgraph.ksize()
+ hashsizes = countgraph.hashsizes()
+ tracking = khmer._Nodegraph( # pylint: disable=protected-access
kmer_size, hashsizes)
- print ('kmer_size: %s' % counting_hash.ksize(), file=sys.stderr)
- print ('k-mer counting table sizes: %s' % (counting_hash.hashsizes(),),
+ print ('kmer_size: %s' % countgraph.ksize(), file=sys.stderr)
+ print ('k-mer countgraph sizes: %s' % (countgraph.hashsizes(),),
file=sys.stderr)
if args.output_file is None:
@@ -76,7 +76,7 @@ def main():
for _ in range(args.threads):
thread = \
threading.Thread(
- target=counting_hash.consume_fasta_with_reads_parser,
+ target=countgraph.consume_fasta_with_reads_parser,
args=(rparser, )
)
threads.append(thread)
@@ -91,10 +91,10 @@ def main():
kmer = seq[i:i+kmer_size]
if not tracking.get(kmer):
tracking.count(kmer)
- writer.writerow([kmer, str(counting_hash.get(kmer))])
+ writer.writerow([kmer, str(countgraph.get(kmer))])
print ('Total number of unique k-mers: {0}'.format(
- counting_hash.n_unique_kmers()), file=sys.stderr)
+ countgraph.n_unique_kmers()), file=sys.stderr)
if __name__ == '__main__':
diff --git a/sandbox/count-kmers.py b/sandbox/count-kmers.py
index 0d736da..302b618 100644
--- a/sandbox/count-kmers.py
+++ b/sandbox/count-kmers.py
@@ -8,7 +8,7 @@
# pylint: disable=missing-docstring,invalid-name
"""
Produce k-mer counts for all the k-mers in the given sequence file,
-using the given counting table.
+using the given countgraph.
% python sandbox/count-kmers.py <ct> <fasta/fastq> [ <fasta/fastq> ... ]
@@ -27,11 +27,11 @@ from khmer.khmer_args import info
def get_parser():
parser = argparse.ArgumentParser(
description="Output abundances of the k-mers in "
- "the sequence files using a pre-made k-mer counting table.",
+ "the sequence files using a pre-made k-mer countgraph.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('input_counting_table_filename', help='The name of the'
- ' input k-mer counting table file.')
+ parser.add_argument('input_count_graph_filename', help='The name of the'
+ ' input k-mer countgraph file.')
parser.add_argument('input_sequence_filenames', help='The input'
' FAST[AQ] sequence file(s).', nargs='+')
@@ -47,14 +47,14 @@ def main():
info('count-kmers.py', ['counting'])
args = get_parser().parse_args()
- print ('hashtable from', args.input_counting_table_filename,
+ print ('hashtable from', args.input_count_graph_filename,
file=sys.stderr)
- counting_hash = khmer.load_counting_hash(
- args.input_counting_table_filename)
+ countgraph = khmer.load_countgraph(
+ args.input_count_graph_filename)
- kmer_size = counting_hash.ksize()
- hashsizes = counting_hash.hashsizes()
- tracking = khmer._Hashbits( # pylint: disable=protected-access
+ kmer_size = countgraph.ksize()
+ hashsizes = countgraph.hashsizes()
+ tracking = khmer._Nodegraph( # pylint: disable=protected-access
kmer_size, hashsizes)
if args.output_file is None:
@@ -68,10 +68,10 @@ def main():
kmer = seq[i:i+kmer_size]
if not tracking.get(kmer):
tracking.count(kmer)
- writer.writerow([kmer, str(counting_hash.get(kmer))])
+ writer.writerow([kmer, str(countgraph.get(kmer))])
print ('Total number of unique k-mers: {0}'.format(
- counting_hash.n_unique_kmers()), file=sys.stderr)
+ countgraph.n_unique_kmers()), file=sys.stderr)
if __name__ == '__main__':
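Both count-kmers scripts above lean on the same trick: a second, empty Nodegraph built from the countgraph's own hashsizes serves as a seen-set, so each k-mer's abundance is written exactly once. Reduced to its core, with placeholder filenames:

    import csv
    import sys

    import khmer
    import screed

    countgraph = khmer.load_countgraph('counts.ct')       # placeholder filename
    ksize, hashsizes = countgraph.ksize(), countgraph.hashsizes()
    tracking = khmer._Nodegraph(ksize, hashsizes)         # same footprint, starts empty

    writer = csv.writer(sys.stdout)
    for record in screed.open('reads.fa'):                # placeholder filename
        seq = record.sequence
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            if not tracking.get(kmer):                    # first sighting of this k-mer
                tracking.count(kmer)
                writer.writerow([kmer, str(countgraph.get(kmer))])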
diff --git a/sandbox/error-correct-pass2.py b/sandbox/error-correct-pass2.py
index 466848f..a21ab38 100755
--- a/sandbox/error-correct-pass2.py
+++ b/sandbox/error-correct-pass2.py
@@ -58,7 +58,7 @@ def main():
args = parser.parse_args()
print('loading counts')
- ht = khmer.load_counting_hash(args.counts_table)
+ ht = khmer.load_countgraph(args.counts_table)
aligner = khmer.ReadAligner(ht,
args.trusted_cov,
diff --git a/sandbox/estimate_optimal_hash.py b/sandbox/estimate_optimal_hash.py
index 3f3944e..2b8d450 100755
--- a/sandbox/estimate_optimal_hash.py
+++ b/sandbox/estimate_optimal_hash.py
@@ -29,8 +29,7 @@ Use '-h' for parameter help.
from __future__ import print_function
import argparse
import khmer, oxli
-from khmer.khmer_args import info
-from oxli.functions import optimal_size
+from khmer.khmer_args import info, optimal_size
import textwrap
import sys
diff --git a/sandbox/fasta-to-abundance-hist.py b/sandbox/fasta-to-abundance-hist.py
index b4776cf..034a894 100755
--- a/sandbox/fasta-to-abundance-hist.py
+++ b/sandbox/fasta-to-abundance-hist.py
@@ -18,7 +18,7 @@ def main():
n_seq_kept = len(files) * [0]
print('loading ht')
- ht = khmer.load_counting_hash(sys.argv[1])
+ ht = khmer.load_countgraph(sys.argv[1])
for i, infile in enumerate(files):
print('outputting', infile + '.freq')
diff --git a/sandbox/filter-below-abund.py b/sandbox/filter-below-abund.py
index e46ddee..ed902c3 100755
--- a/sandbox/filter-below-abund.py
+++ b/sandbox/filter-below-abund.py
@@ -29,7 +29,7 @@ def main():
print('--')
print('making hashtable')
- ht = khmer.load_counting_hash(counting_ht)
+ ht = khmer.load_countgraph(counting_ht)
K = ht.ksize()
for infile in infiles:
diff --git a/sandbox/filter-median-and-pct.py b/sandbox/filter-median-and-pct.py
index 698aff5..a345871 100755
--- a/sandbox/filter-median-and-pct.py
+++ b/sandbox/filter-median-and-pct.py
@@ -41,7 +41,7 @@ def main():
print('file with ht: %s' % counting_ht)
print('loading hashtable')
- ht = khmer.load_counting_hash(counting_ht)
+ ht = khmer.load_countgraph(counting_ht)
K = ht.ksize()
xxxfp = None
diff --git a/sandbox/filter-median.py b/sandbox/filter-median.py
index a417c3d..8df398e 100755
--- a/sandbox/filter-median.py
+++ b/sandbox/filter-median.py
@@ -41,7 +41,7 @@ def main():
print('file with ht: %s' % counting_ht)
print('loading hashtable')
- ht = khmer.load_counting_hash(counting_ht)
+ ht = khmer.load_countgraph(counting_ht)
K = ht.ksize()
print("K:", K)
diff --git a/sandbox/find-high-abund-kmers.py b/sandbox/find-high-abund-kmers.py
index db43686..81ae02b 100755
--- a/sandbox/find-high-abund-kmers.py
+++ b/sandbox/find-high-abund-kmers.py
@@ -65,7 +65,7 @@ def main():
###
print('making hashtable')
- ht = khmer.CountingHash(K, HT_SIZE, N_HT)
+ ht = khmer.Countgraph(K, HT_SIZE, N_HT)
ht.set_use_bigcount(True)
print('consuming input', input)
diff --git a/sandbox/graph-size.py b/sandbox/graph-size.py
index 41cdf07..8ca066a 100755
--- a/sandbox/graph-size.py
+++ b/sandbox/graph-size.py
@@ -42,7 +42,7 @@ def main():
print('--')
print('creating ht')
- ht = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
+ ht = khmer.Nodegraph(K, HASHTABLE_SIZE, N_HT)
print('eating fa', infile)
total_reads, n_consumed = ht.consume_fasta(infile)
outfp = open(outfile, 'w')
diff --git a/sandbox/hi-lo-abundance-by-position.py b/sandbox/hi-lo-abundance-by-position.py
index 5c9e353..3072d1f 100755
--- a/sandbox/hi-lo-abundance-by-position.py
+++ b/sandbox/hi-lo-abundance-by-position.py
@@ -22,7 +22,7 @@ def main():
outfile = os.path.basename(filename)
print('loading kh file', hashfile)
- ht = khmer.load_counting_hash(hashfile)
+ ht = khmer.load_countgraph(hashfile)
x = ht.fasta_count_kmers_by_position(filename, 100, 1)
write_dist(x, open(outfile + '.pos.abund=1', 'w'))
diff --git a/sandbox/multi-rename.py b/sandbox/multi-rename.py
index f9c3a3d..5261953 100755
--- a/sandbox/multi-rename.py
+++ b/sandbox/multi-rename.py
@@ -8,6 +8,7 @@
from __future__ import print_function
import screed
import sys
+import textwrap
CUTOFF = 200
@@ -19,8 +20,8 @@ def main():
for record in screed.open(filename):
if len(record.sequence) >= CUTOFF:
n += 1
- print('>%s.%s %s\n%s' % (prefix, n, record.name, record.sequence))
-
-
+ print('>%s.%s %s' % (prefix, n, record.name))
+ x = "\n".join(textwrap.wrap(record.sequence, 80))
+ print (x)
if __name__ == '__main__':
main()
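The multi-rename.py hunk above replaces one long print with a header line plus the sequence wrapped at 80 columns; textwrap does the splitting:

    import textwrap

    seq = 'ACGT' * 50                             # 200 bp example
    print('>prefix.1 original_name')
    print('\n'.join(textwrap.wrap(seq, 80)))      # emits 80 + 80 + 40 bases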
diff --git a/sandbox/normalize-by-median-pct.py b/sandbox/normalize-by-median-pct.py
index f7dad25..f82834b 100755
--- a/sandbox/normalize-by-median-pct.py
+++ b/sandbox/normalize-by-median-pct.py
@@ -85,10 +85,10 @@ def main():
if args.loadhash:
print('loading hashtable from', args.loadhash)
- ht = khmer.load_counting_hash(args.loadhash)
+ ht = khmer.load_countgraph(args.loadhash)
else:
print('making hashtable')
- ht = khmer.CountingHash(K, HT_SIZE, N_HT)
+ ht = khmer.Countgraph(K, HT_SIZE, N_HT)
total = 0
discarded = 0
diff --git a/sandbox/optimal_args_hashbits.py b/sandbox/optimal_args_hashbits.py
index 9b866e5..ad68b92 100644
--- a/sandbox/optimal_args_hashbits.py
+++ b/sandbox/optimal_args_hashbits.py
@@ -7,9 +7,9 @@
#
# pylint: disable=invalid-name,missing-docstring
"""
-Estimate optimal arguments using hashbits counting.
+Estimate optimal arguments using nodegraph counting.
-% python sandbox/optimal_args_hashbits.py <data1> [ <data2> <...> ]
+% python sandbox/optimal_args_nodegraph.py <data1> [ <data2> <...> ]
Use '-h' for parameter help.
"""
@@ -20,15 +20,15 @@ import math
import threading
import khmer
-from khmer.khmer_args import build_hashbits_args
-from khmer.khmer_args import (report_on_config, info, add_threading_args)
+from khmer.khmer_args import (report_on_config, info, add_threading_args,
+ build_nodegraph_args)
from khmer.kfile import check_input_files, check_space
from khmer.kfile import check_space
-from oxli.functions import optimal_args_output_gen as output_gen
+from khmer.khmer_args import graphsize_args_report
def get_parser():
- parser = build_hashbits_args(descr="Load sequences into the compressible "
+ parser = build_nodegraph_args(descr="Load sequences into the compressible "
"graph format plus optional tagset.")
add_threading_args(parser)
parser.add_argument('input_filenames', metavar='input_sequence_filename',
@@ -37,9 +37,9 @@ def get_parser():
def main():
- info('optimal_args_hashbits.py', ['graph', 'SeqAn'])
+ info('optimal_args_nodegraph.py', ['graph', 'SeqAn'])
args = get_parser().parse_args()
- report_on_config(args, hashtype='hashbits')
+ report_on_config(args, graphtype='nodegraph')
filenames = args.input_filenames
@@ -52,7 +52,7 @@ def main():
print('Counting kmers from sequences in %s' % repr(filenames),
file=sys.stderr)
- htable = khmer.new_hashbits(args.ksize, args.max_tablesize, args.n_tables)
+ htable = khmer.new_nodegraph(args.ksize, args.max_tablesize, args.n_tables)
target_method = htable.consume_fasta_with_reads_parser
for _, filename in enumerate(filenames):
@@ -84,7 +84,7 @@ def main():
if not False:
sys.exit(1)
- to_print = output_gen(unique_kmers,fp_rate)
+ to_print = graphsize_args_report(unique_kmers, fp_rate)
print(to_print, file=info_optimal)
diff --git a/sandbox/print-stoptags.py b/sandbox/print-stoptags.py
index 1e59b44..0729629 100755
--- a/sandbox/print-stoptags.py
+++ b/sandbox/print-stoptags.py
@@ -13,7 +13,7 @@ K = 32
def main():
- ht = khmer.Hashbits(32, 1, 1)
+ ht = khmer.Nodegraph(32, 1, 1)
ht.load_stop_tags(sys.argv[1])
ht.print_stop_tags(os.path.basename(sys.argv[1]) + '.txt')
diff --git a/sandbox/print-tagset.py b/sandbox/print-tagset.py
index c861278..ea52e26 100755
--- a/sandbox/print-tagset.py
+++ b/sandbox/print-tagset.py
@@ -14,7 +14,7 @@ K = 32
def main():
- ht = khmer.Hashbits(32, 1, 1)
+ ht = khmer.Nodegraph(32, 1, 1)
ht.load_tagset(sys.argv[1])
print('loaded!')
ht.print_tagset(os.path.basename(sys.argv[1]) + '.txt')
diff --git a/sandbox/readaligner_pairhmm_train.py b/sandbox/readaligner_pairhmm_train.py
index 0e60e06..7a14e3e 100644
--- a/sandbox/readaligner_pairhmm_train.py
+++ b/sandbox/readaligner_pairhmm_train.py
@@ -40,7 +40,7 @@ def main():
args = parser.parse_args()
- ht = khmer.load_counting_hash(args.ht)
+ ht = khmer.load_countgraph(args.ht)
samfile = pysam.Samfile(args.bam_file)
k = ht.ksize()
diff --git a/sandbox/saturate-by-median.py b/sandbox/saturate-by-median.py
index a47cde4..0578470 100755
--- a/sandbox/saturate-by-median.py
+++ b/sandbox/saturate-by-median.py
@@ -20,10 +20,10 @@ import os
import khmer
import textwrap
-from khmer.khmer_args import (build_counting_args, add_loadhash_args,
+from khmer.khmer_args import (build_counting_args, add_loadgraph_args,
report_on_config, info, create_countgraph)
import argparse
-from khmer.kfile import (check_space, check_space_for_hashtable,
+from khmer.kfile import (check_space, check_space_for_graph,
check_valid_file_exists)
DEFAULT_DESIRED_COVERAGE = 1
@@ -118,13 +118,13 @@ def get_parser():
keeping (or discarding) each sequencing fragment. This helps with retention
of repeats, especially.
- With :option:`-s`/:option:`--savetable`, the k-mer counting table
+ With :option:`-s`/:option:`--savegraph`, the k-mer countgraph
will be saved to the specified file after all sequences have been
- processed. With :option:`-d`, the k-mer counting table will be
+ processed. With :option:`-d`, the k-mer countgraph will be
saved every d files for multifile runs; if :option:`-s` is set,
the specified name will be used, and if not, the name `backup.ct`
- will be used. :option:`-l`/:option:`--loadtable` will load the
- specified k-mer counting table before processing the specified
+ will be used. :option:`-l`/:option:`--loadgraph` will load the
+ specified k-mer countgraph before processing the specified
files. Note that these tables are in the same format as those
produced by :program:`load-into-counting.py` and consumed by
:program:`abundance-dist.py`.
@@ -157,7 +157,7 @@ def get_parser():
parser.add_argument('-C', '--cutoff', type=int,
default=DEFAULT_DESIRED_COVERAGE)
parser.add_argument('-p', '--paired', action='store_true')
- parser.add_argument('-s', '--savetable', metavar="filename", default='')
+ parser.add_argument('-s', '--savegraph', metavar="filename", default='')
parser.add_argument('-R', '--report',
metavar='filename', type=argparse.FileType('w'))
parser.add_argument('--report-frequency',
@@ -171,7 +171,7 @@ def get_parser():
' file with the specified filename')
parser.add_argument('input_filenames', metavar='input_sequence_filename',
help='Input FAST[AQ] sequence filename.', nargs='+')
- add_loadhash_args(parser)
+ add_loadgraph_args(parser)
return parser
@@ -186,16 +186,16 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
check_valid_file_exists(args.input_filenames)
check_space(args.input_filenames, False)
- if args.savetable:
- check_space_for_hashtable(args, 'countgraph', False)
+ if args.savegraph:
+ check_space_for_graph(args, 'countgraph', False)
# list to save error files along with throwing exceptions
if args.force:
corrupt_files = []
- if args.loadtable:
- print('loading k-mer counting table from', args.loadtable)
- htable = khmer.load_counting_hash(args.loadtable)
+ if args.loadgraph:
+ print('loading k-mer countgraph from', args.loadgraph)
+ htable = khmer.load_countgraph(args.loadgraph)
else:
print('making countgraph')
htable = create_countgraph(args)
@@ -233,10 +233,10 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
kept=total - discarded, total=total,
perc=int(100. - discarded / float(total) * 100.)))
- if args.savetable:
- print('Saving k-mer counting table through', input_filename)
- print('...saving to', args.savetable)
- htable.save(args.savetable)
+ if args.savegraph:
+ print('Saving k-mer countgraph through', input_filename)
+ print('...saving to', args.savegraph)
+ htable.save(args.savegraph)
# re: threshold, see Zhang et al.,
# http://arxiv.org/abs/1309.2975
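With the renamed options, a typical saturate-by-median run under this patch looks like the following; the cutoff, report interval, and filenames are illustrative only:

    % python sandbox/saturate-by-median.py -C 20 -p --savegraph saved.ct \
        -R report.txt --report-frequency 100000 reads_1.fq reads_2.fq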
diff --git a/sandbox/slice-reads-by-coverage.py b/sandbox/slice-reads-by-coverage.py
index 2093aa6..978891b 100755
--- a/sandbox/slice-reads-by-coverage.py
+++ b/sandbox/slice-reads-by-coverage.py
@@ -22,7 +22,7 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--min-coverage', type=int, default=None)
parser.add_argument('-M', '--max-coverage', type=int, default=None)
- parser.add_argument('input_counting_table')
+ parser.add_argument('input_count_graph')
parser.add_argument('input_readfile')
parser.add_argument('output_readfile')
args = parser.parse_args()
@@ -39,7 +39,7 @@ def main():
print("min_coverage > max_coverage!? exiting!", file=sys.stderr)
sys.exit(1)
- htable = khmer.load_counting_hash(args.input_counting_table)
+ htable = khmer.load_countgraph(args.input_count_graph)
output_file = args.output_readfile
output_fp = open(output_file, 'w')
diff --git a/sandbox/stoptag-abundance-hist.py b/sandbox/stoptag-abundance-hist.py
index 616a9f4..b430133 100755
--- a/sandbox/stoptag-abundance-hist.py
+++ b/sandbox/stoptag-abundance-hist.py
@@ -20,7 +20,7 @@ def main():
filename = sys.argv[2]
figure = sys.argv[3]
- ht = khmer.load_counting_hash(hashfile)
+ ht = khmer.load_countgraph(hashfile)
outabund = open(os.path.basename(filename) + '.counts', 'w')
diff --git a/sandbox/stoptags-by-position.py b/sandbox/stoptags-by-position.py
index 1b92fa6..09c232d 100755
--- a/sandbox/stoptags-by-position.py
+++ b/sandbox/stoptags-by-position.py
@@ -14,7 +14,7 @@ K = 32
def main():
- ht = khmer.Hashbits(K, 1, 1)
+ ht = khmer.Nodegraph(K, 1, 1)
x = [0] * 255
y = [0] * 255
diff --git a/sandbox/subset-report.py b/sandbox/subset-report.py
index fe230e6..b8aa060 100755
--- a/sandbox/subset-report.py
+++ b/sandbox/subset-report.py
@@ -17,7 +17,7 @@ K = 32
def main():
subset_filenames = sys.argv[1:]
- ht = khmer.Hashbits(K, 1, 1)
+ ht = khmer.Nodegraph(K, 1, 1)
for filename in subset_filenames:
print('--')
print('partition map:', filename)
diff --git a/sandbox/sweep-files.py b/sandbox/sweep-files.py
index e80ca44..4f12d43 100755
--- a/sandbox/sweep-files.py
+++ b/sandbox/sweep-files.py
@@ -36,7 +36,7 @@ from collections import defaultdict, deque
import os
import time
import khmer
-from khmer.khmer_args import (build_hashbits_args, report_on_config, info)
+from khmer.khmer_args import (build_nodegraph_args, report_on_config, info)
DEFAULT_OUT_PREF = 'reads'
DEFAULT_RANGE = -1
@@ -46,7 +46,7 @@ MIN_KSIZE = 21
def get_parser():
- parser = build_hashbits_args('Takes a partitioned reference file \
+ parser = build_nodegraph_args('Takes a partitioned reference file \
and a list of reads, and sorts reads \
by which partition they connect to')
parser.epilog = EPILOG
@@ -108,7 +108,7 @@ def main():
if args.ksize < MIN_KSIZE:
args.ksize = MIN_KSIZE
- report_on_config(args, hashtype='nodegraph')
+ report_on_config(args, graphtype='nodegraph')
K = args.ksize
HT_SIZE = args.max_tablesize
@@ -120,7 +120,7 @@ def main():
# Consume the database files and assign each a unique label in the
# de Bruijn graph; open a file and output queue for each file as well.
- ht = khmer.LabelHash(K, HT_SIZE, N_HT)
+ ht = khmer.GraphLabels(K, HT_SIZE, N_HT)
try:
print('consuming and labeling input sequences...', file=sys.stderr)
diff --git a/sandbox/sweep-out-reads-with-contigs.py b/sandbox/sweep-out-reads-with-contigs.py
index f053f62..afd0cf7 100755
--- a/sandbox/sweep-out-reads-with-contigs.py
+++ b/sandbox/sweep-out-reads-with-contigs.py
@@ -21,8 +21,8 @@ def main():
if len(sys.argv) == 4:
outfile = sys.argv[3]
- # create a hashbits data structure
- ht = khmer.Hashbits(K, 1, 1)
+ # create a nodegraph data structure
+ ht = khmer.Nodegraph(K, 1, 1)
# tag every k-mer in the contigs
ht._set_tag_density(0)
diff --git a/sandbox/sweep-reads.py b/sandbox/sweep-reads.py
index 7134fc3..fa878e6 100755
--- a/sandbox/sweep-reads.py
+++ b/sandbox/sweep-reads.py
@@ -38,7 +38,7 @@ from collections import defaultdict
import os
import time
import khmer
-from khmer.khmer_args import (build_hashbits_args, report_on_config, info)
+from khmer.khmer_args import (build_nodegraph_args, report_on_config, info)
from khmer.kfile import (check_input_files, check_valid_file_exists,
check_space)
@@ -161,7 +161,7 @@ class ReadBufferManager(object):
def get_parser():
- parser = build_hashbits_args('Takes a partitioned reference file \
+ parser = build_nodegraph_args('Takes a partitioned reference file \
and a list of reads, and sorts reads \
by which partition they connect to')
parser.epilog = EPILOG
@@ -213,7 +213,7 @@ def main():
if args.ksize < MIN_KSIZE:
args.ksize = MIN_KSIZE
- report_on_config(args, hashtype='nodegraph')
+ report_on_config(args, graphtype='nodegraph')
K = args.ksize
HT_SIZE = args.max_tablesize
@@ -253,7 +253,7 @@ def main():
max_buffers, max_reads, buf_size, output_pref, outdir, extension)
# consume the partitioned fasta with which to label the graph
- ht = khmer.LabelHash(K, HT_SIZE, N_HT)
+ ht = khmer.GraphLabels(K, HT_SIZE, N_HT)
try:
print('consuming input sequences...', file=sys.stderr)
if args.label_by_pid:
diff --git a/sandbox/sweep-reads2.py b/sandbox/sweep-reads2.py
index 38d6e6a..528dd87 100755
--- a/sandbox/sweep-reads2.py
+++ b/sandbox/sweep-reads2.py
@@ -20,7 +20,7 @@ import sys
import khmer
import os.path
import screed
-from khmer.khmer_args import (build_hashbits_args, DEFAULT_MAX_TABLESIZE)
+from khmer.khmer_args import (build_nodegraph_args, DEFAULT_MAX_TABLESIZE)
def main():
@@ -57,8 +57,8 @@ def main():
outfile = os.path.basename(readsfile) + '.sweep2'
outfp = open(outfile, 'w')
- # create a hashbits data structure
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+ # create a nodegraph data structure
+ ht = khmer.Nodegraph(K, HT_SIZE, N_HT)
# load contigs, connect into N partitions
print('loading input reads from', inp)
diff --git a/sandbox/sweep-reads3.py b/sandbox/sweep-reads3.py
index c0c5329..fd4b82b 100755
--- a/sandbox/sweep-reads3.py
+++ b/sandbox/sweep-reads3.py
@@ -20,7 +20,7 @@ import sys
import os.path
import screed
import khmer
-from khmer.khmer_args import (build_hashbits_args, DEFAULT_MAX_TABLESIZE)
+from khmer.khmer_args import (build_nodegraph_args, DEFAULT_MAX_TABLESIZE)
def output_single(r):
@@ -63,8 +63,8 @@ def main():
query_list = []
for n, inp_name in enumerate(inputlist):
- # create a hashbits data structure
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+ # create a nodegraph data structure
+ ht = khmer.Nodegraph(K, HT_SIZE, N_HT)
outfile = os.path.basename(inp_name) + '.sweep3'
outfp = open(outfile, 'w')
diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py
index 4da22b2..a032e8a 100755
--- a/scripts/abundance-dist-single.py
+++ b/scripts/abundance-dist-single.py
@@ -11,7 +11,7 @@ Produce the k-mer abundance distribution for the given file.
% python scripts/abundance-dist-single.py <data> <histout>
-The script does not load a prebuilt k-mer counting table.
+The script does not load a prebuilt k-mer countgraph.
Use '-h' for parameter help.
"""
@@ -24,8 +24,8 @@ import threading
import textwrap
from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, add_threading_args,
- report_on_config, info, calculate_tablesize)
-from khmer.kfile import (check_input_files, check_space_for_hashtable)
+ report_on_config, info, calculate_graphsize)
+from khmer.kfile import (check_input_files, check_space_for_graph)
def get_parser():
@@ -58,8 +58,8 @@ def get_parser():
parser.add_argument('-s', '--squash', dest='squash_output', default=False,
action='store_true',
help='Overwrite output file if it exists')
- parser.add_argument('--savetable', default='', metavar="filename",
- help="Save the k-mer counting table to the specified "
+ parser.add_argument('--savegraph', default='', metavar="filename",
+ help="Save the k-mer countgraph to the specified "
"filename.")
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
@@ -72,9 +72,9 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
report_on_config(args)
check_input_files(args.input_sequence_filename, args.force)
- if args.savetable:
- tablesize = calculate_tablesize(args, 'countgraph')
- check_space_for_hashtable(args.savetable, tablesize, args.force)
+ if args.savegraph:
+ graphsize = calculate_graphsize(args, 'countgraph')
+ check_space_for_graph(args.savegraph, graphsize, args.force)
if (not args.squash_output and
os.path.exists(args.output_histogram_filename)):
print('ERROR: %s exists; not squashing.' %
@@ -88,15 +88,15 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
'cumulative_fraction'])
print('making countgraph', file=sys.stderr)
- counting_hash = khmer_args.create_countgraph(args, multiplier=1.1)
- counting_hash.set_use_bigcount(args.bigcount)
+ countgraph = khmer_args.create_countgraph(args, multiplier=1.1)
+ countgraph.set_use_bigcount(args.bigcount)
- print('building k-mer tracking table', file=sys.stderr)
+ print('building k-mer tracking graph', file=sys.stderr)
tracking = khmer_args.create_nodegraph(args, multiplier=1.1)
- print('kmer_size:', counting_hash.ksize(), file=sys.stderr)
- print('k-mer counting table sizes:',
- counting_hash.hashsizes(), file=sys.stderr)
+ print('kmer_size:', countgraph.ksize(), file=sys.stderr)
+ print('k-mer countgraph sizes:',
+ countgraph.hashsizes(), file=sys.stderr)
print('outputting to', args.output_histogram_filename, file=sys.stderr)
# start loading
@@ -107,7 +107,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
for _ in range(args.threads):
thread = \
threading.Thread(
- target=counting_hash.consume_fasta_with_reads_parser,
+ target=countgraph.consume_fasta_with_reads_parser,
args=(rparser, )
)
threads.append(thread)
@@ -117,12 +117,12 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
thread.join()
print('Total number of unique k-mers: {0}'.format(
- counting_hash.n_unique_kmers()), file=sys.stderr)
+ countgraph.n_unique_kmers()), file=sys.stderr)
abundance_lists = []
def __do_abundance_dist__(read_parser):
- abundances = counting_hash.abundance_distribution_with_reads_parser(
+ abundances = countgraph.abundance_distribution_with_reads_parser(
read_parser, tracking)
abundance_lists.append(abundances)
@@ -172,10 +172,10 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
if sofar == total:
break
- if args.savetable:
- print('Saving k-mer counting table ', args.savetable, file=sys.stderr)
- print('...saving to', args.savetable, file=sys.stderr)
- counting_hash.save(args.savetable)
+ if args.savegraph:
+ print('Saving k-mer countgraph ', args.savegraph, file=sys.stderr)
+ print('...saving to', args.savegraph, file=sys.stderr)
+ countgraph.save(args.savegraph)
print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
diff --git a/scripts/abundance-dist.py b/scripts/abundance-dist.py
index 1e96c1e..ebbeedf 100755
--- a/scripts/abundance-dist.py
+++ b/scripts/abundance-dist.py
@@ -27,11 +27,11 @@ from khmer.khmer_args import info
def get_parser():
parser = argparse.ArgumentParser(
description="Calculate abundance distribution of the k-mers in "
- "the sequence file using a pre-made k-mer counting table.",
+ "the sequence file using a pre-made k-mer countgraph.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('input_counting_table_filename', help='The name of the'
- ' input k-mer counting table file.')
+ parser.add_argument('input_count_graph_filename', help='The name of the'
+ ' input k-mer countgraph file.')
parser.add_argument('input_sequence_filename', help='The name of the input'
' FAST[AQ] sequence file.')
parser.add_argument('output_histogram_filename', help='The columns are: '
@@ -58,31 +58,30 @@ def main():
info('abundance-dist.py', ['counting'])
args = get_parser().parse_args()
- infiles = [args.input_counting_table_filename,
+ infiles = [args.input_count_graph_filename,
args.input_sequence_filename]
for infile in infiles:
check_input_files(infile, False)
- print('hashtable from', args.input_counting_table_filename,
+ print('Counting graph from', args.input_count_graph_filename,
file=sys.stderr)
- counting_hash = khmer.load_counting_hash(
- args.input_counting_table_filename)
+ countgraph = khmer.load_countgraph(
+ args.input_count_graph_filename)
- if not counting_hash.get_use_bigcount() and args.bigcount:
+ if not countgraph.get_use_bigcount() and args.bigcount:
print("WARNING: The loaded graph has bigcount DISABLED while bigcount"
" reporting is ENABLED--counts higher than 255 will not be "
"reported.",
file=sys.stderr)
- counting_hash.set_use_bigcount(args.bigcount)
+ countgraph.set_use_bigcount(args.bigcount)
- kmer_size = counting_hash.ksize()
- hashsizes = counting_hash.hashsizes()
- tracking = khmer._Hashbits( # pylint: disable=protected-access
+ kmer_size = countgraph.ksize()
+ hashsizes = countgraph.hashsizes()
+ tracking = khmer._Nodegraph( # pylint: disable=protected-access
kmer_size, hashsizes)
print('K:', kmer_size, file=sys.stderr)
- print('HT sizes:', hashsizes, file=sys.stderr)
print('outputting to', args.output_histogram_filename, file=sys.stderr)
if args.output_histogram_filename in ('-', '/dev/stdout'):
@@ -98,7 +97,7 @@ def main():
args.output_histogram_filename, file=sys.stderr)
print('preparing hist...', file=sys.stderr)
- abundances = counting_hash.abundance_distribution(
+ abundances = countgraph.abundance_distribution(
args.input_sequence_filename, tracking)
total = sum(abundances)
@@ -110,13 +109,13 @@ def main():
sys.exit(1)
if args.output_histogram_filename in ('-', '/dev/stdout'):
- hash_fp = sys.stdout
+ countgraph_fp = sys.stdout
else:
- hash_fp = open(args.output_histogram_filename, 'w')
- hash_fp_csv = csv.writer(hash_fp)
+ countgraph_fp = open(args.output_histogram_filename, 'w')
+ countgraph_fp_csv = csv.writer(countgraph_fp)
# write headers:
- hash_fp_csv.writerow(['abundance', 'count', 'cumulative',
- 'cumulative_fraction'])
+ countgraph_fp_csv.writerow(['abundance', 'count', 'cumulative',
+ 'cumulative_fraction'])
sofar = 0
for _, i in enumerate(abundances):
@@ -126,7 +125,7 @@ def main():
sofar += i
frac = sofar / float(total)
- hash_fp_csv.writerow([_, i, sofar, round(frac, 3)])
+ countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])
if sofar == total:
break
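In isolation, the cumulative bookkeeping that countgraph_fp_csv writes out reduces to a few lines; the toy distribution here is illustrative:

    abundances = [0, 120, 45, 10]   # toy histogram: abundances[i] = # of k-mers seen i times
    total = sum(abundances)
    sofar = 0
    for abundance, count in enumerate(abundances):
        sofar += count
        print(abundance, count, sofar, round(sofar / float(total), 3))
        if sofar == total:          # all k-mers accounted for; stop early
            break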
diff --git a/scripts/annotate-partitions.py b/scripts/annotate-partitions.py
index e09f958..a5e6d71 100755
--- a/scripts/annotate-partitions.py
+++ b/scripts/annotate-partitions.py
@@ -67,7 +67,7 @@ def main():
ksize = args.ksize
filenames = args.input_filenames
- htable = khmer.Hashbits(ksize, 1, 1)
+ nodegraph = khmer.Nodegraph(ksize, 1, 1)
partitionmap_file = args.graphbase + '.pmap.merged'
@@ -78,12 +78,12 @@ def main():
check_space(filenames, args.force)
print('loading partition map from:', partitionmap_file, file=sys.stderr)
- htable.load_partitionmap(partitionmap_file)
+ nodegraph.load_partitionmap(partitionmap_file)
for infile in filenames:
print('outputting partitions for', infile, file=sys.stderr)
outfile = os.path.basename(infile) + '.part'
- part_count = htable.output_partitions(infile, outfile)
+ part_count = nodegraph.output_partitions(infile, outfile)
print('output %d partitions for %s' % (
part_count, infile), file=sys.stderr)
print('partitions are in', outfile, file=sys.stderr)
diff --git a/scripts/count-median.py b/scripts/count-median.py
index 7ca052c..76714b3 100755
--- a/scripts/count-median.py
+++ b/scripts/count-median.py
@@ -9,8 +9,8 @@
"""
Count the median/avg k-mer abundance for each sequence in the input file.
-The abundance is based on the k-mer counts in the given k-mer counting
-table. Can be used to estimate expression levels (mRNAseq) or coverage
+The abundance is based on the k-mer counts in the given k-mer countgraph.
+Can be used to estimate expression levels (mRNAseq) or coverage
(genomic/metagenomic).
% scripts/count-median.py <htname> <input seqs> <output counts>
@@ -36,7 +36,7 @@ from khmer.khmer_args import info
def get_parser():
epilog = """
Count the median/avg k-mer abundance for each sequence in the input file,
- based on the k-mer counts in the given k-mer counting table. Can be used
+ based on the k-mer counts in the given k-mer countgraph. Can be used
to estimate expression levels (mRNAseq) or coverage (genomic/metagenomic).
The output file contains sequence id, median, average, stddev, and
@@ -52,8 +52,8 @@ def get_parser():
description='Count k-mers summary stats for sequences',
epilog=textwrap.dedent(epilog))
- parser.add_argument('ctfile', metavar='input_counting_table_filename',
- help='input k-mer count table filename')
+ parser.add_argument('countgraph', metavar='input_count_graph_filename',
+ help='input k-mer countgraph filename')
parser.add_argument('input', metavar='input_sequence_filename',
help='input FAST[AQ] sequence filename')
parser.add_argument('output', metavar='output_summary_filename',
@@ -70,7 +70,7 @@ def main():
info('count-median.py', ['diginorm'])
args = get_parser().parse_args()
- htfile = args.ctfile
+ htfile = args.countgraph
input_filename = args.input
output = args.output
output_filename = str(output)
@@ -81,9 +81,9 @@ def main():
check_space(infiles, args.force)
- print('loading k-mer counting table from', htfile, file=sys.stderr)
- htable = khmer.load_counting_hash(htfile)
- ksize = htable.ksize()
+ print('loading k-mer countgraph from', htfile, file=sys.stderr)
+ countgraph = khmer.load_countgraph(htfile)
+ ksize = countgraph.ksize()
print('writing to', output_filename, file=sys.stderr)
output = csv.writer(output)
@@ -96,7 +96,7 @@ def main():
seq = seq.replace('N', 'A')
if ksize <= len(seq):
- medn, ave, stdev = htable.get_median_count(seq)
+ medn, ave, stdev = countgraph.get_median_count(seq)
ave, stdev = [round(x, 9) for x in (ave, stdev)]
output.writerow([record.name, medn, ave, stdev, len(seq)])
diff --git a/scripts/count-overlap.py b/scripts/count-overlap.py
deleted file mode 100755
index 51ffbf3..0000000
--- a/scripts/count-overlap.py
+++ /dev/null
@@ -1,91 +0,0 @@
-#! /usr/bin/env python
-#
-# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2012-2015. It is licensed under
-# the three-clause BSD license; see LICENSE.
-# Contact: khmer-project at idyll.org
-#
-# pylint: disable=missing-docstring,invalid-name
-"""
-Count the overlap k-mers.
-
-Overlap k-mers are those appearing in two sequence datasets.
-
-usage: count-overlap_cpp.py [-h] [-q] [--ksize KSIZE] [--n_tables N_HASHES]
- [--tablesize HASHSIZE]
- 1st_dataset(htfile generated by load-graph.py) 2nd_dataset(fastafile)
- result
-
-Use '-h' for parameter help.
-"""
-from __future__ import print_function
-import sys
-import csv
-import khmer
-import textwrap
-from khmer import khmer_args
-from khmer.kfile import check_input_files
-from khmer.khmer_args import (build_hashbits_args, report_on_config, info)
-
-
-def get_parser():
- epilog = """
- An additional report will be written to ${output_report_filename}.curve
- containing the increase of overlap k-mers as the number of sequences in the
- second database increases.
- """
- parser = build_hashbits_args(
- descr='Count the overlap k-mers which are the k-mers appearing in two '
- 'sequence datasets.', epilog=textwrap.dedent(epilog))
- parser.add_argument('ptfile', metavar='input_presence_table_filename',
- help="input k-mer presence table filename")
- parser.add_argument('fafile', metavar='input_sequence_filename',
- help="input sequence filename")
- parser.add_argument('report_filename', metavar='output_report_filename',
- help='output report filename')
- parser.add_argument('-f', '--force', default=False, action='store_true',
- help='Overwrite output file if it exists')
- return parser
-
-
-def main():
- info('count-overlap.py', ['counting'])
- args = get_parser().parse_args()
- report_on_config(args, hashtype='nodegraph')
-
- for infile in [args.ptfile, args.fafile]:
- check_input_files(infile, args.force)
-
- print('loading k-mer presence table from', args.ptfile, file=sys.stderr)
- ht1 = khmer.load_hashbits(args.ptfile)
- kmer_size = ht1.ksize()
-
- output = open(args.report_filename, 'w')
- f_curve_obj = open(args.report_filename + '.curve', 'w')
- f_curve_obj_csv = csv.writer(f_curve_obj)
- # write headers:
- f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])
-
- ht2 = khmer_args.create_nodegraph(args, ksize=kmer_size)
-
- (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)
-
- printout1 = """\
-dataset1(pt file): %s
-dataset2: %s
-
-# of unique k-mers in dataset2: %d
-# of overlap unique k-mers: %d
-
-""" % (args.ptfile, args.fafile, n_unique, n_overlap)
- output.write(printout1)
-
- for i in range(100):
- f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
-
- print('wrote to: ' + args.report_filename, file=sys.stderr)
-
-if __name__ == '__main__':
- main()
-
-# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
diff --git a/scripts/do-partition.py b/scripts/do-partition.py
index f502145..f74109f 100755
--- a/scripts/do-partition.py
+++ b/scripts/do-partition.py
@@ -23,7 +23,7 @@ import os.path
import os
import textwrap
from khmer import khmer_args
-from khmer.khmer_args import (build_hashbits_args, report_on_config, info,
+from khmer.khmer_args import (build_nodegraph_args, report_on_config, info,
add_threading_args)
import glob
from khmer.kfile import check_input_files, check_space
@@ -40,21 +40,11 @@ DEFAULT_SUBSET_SIZE = int(1e5)
DEFAULT_N_THREADS = 4
DEFAULT_K = 32
-# Debugging Support
-if "Linux" == platform.system():
- def __debug_vm_usage(msg):
- print("===> DEBUG: " + msg, file=sys.stderr)
- for vmstat in re.findall(r".*Vm.*", file("/proc/self/status").read()):
- print(vmstat)
-else:
- def __debug_vm_usage(msg): # pylint: disable=unused-argument
- pass
-
def worker(queue, basename, stop_big_traversals):
while True:
try:
- (htable, index, start, stop) = queue.get(False)
+ (nodegraph, index, start, stop) = queue.get(False)
except queue.Empty:
print('exiting', file=sys.stderr)
return
@@ -68,11 +58,11 @@ def worker(queue, basename, stop_big_traversals):
# pay attention to stoptags when partitioning; take command line
# direction on whether or not to exhaustively traverse.
- subset = htable.do_subset_partition(start, stop, True,
- stop_big_traversals)
+ subset = nodegraph.do_subset_partition(start, stop, True,
+ stop_big_traversals)
print('saving:', basename, index, file=sys.stderr)
- htable.save_subset_partitionmap(subset, outfile)
+ nodegraph.save_subset_partitionmap(subset, outfile)
del subset
gc.collect()
@@ -88,7 +78,7 @@ def get_parser():
but should probably not be used for large data sets, because
:program:`do-partition.py` doesn't provide save/resume functionality.
"""
- parser = build_hashbits_args(
+ parser = build_nodegraph_args(
descr='Load, partition, and annotate FAST[AQ] sequences',
epilog=textwrap.dedent(epilog))
add_threading_args(parser)
@@ -114,14 +104,14 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
info('do-partition.py', ['graph'])
args = get_parser().parse_args()
- report_on_config(args, hashtype='nodegraph')
+ report_on_config(args, graphtype='nodegraph')
for infile in args.input_filenames:
check_input_files(infile, args.force)
check_space(args.input_filenames, args.force)
- print('Saving k-mer presence table to %s' %
+ print('Saving k-mer nodegraph to %s' %
args.graphbase, file=sys.stderr)
print('Loading kmers from sequences in %s' %
repr(args.input_filenames), file=sys.stderr)
@@ -133,15 +123,16 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
# load-graph
print('making nodegraph', file=sys.stderr)
- htable = khmer_args.create_nodegraph(args)
+ nodegraph = khmer_args.create_nodegraph(args)
for _, filename in enumerate(args.input_filenames):
print('consuming input', filename, file=sys.stderr)
- htable.consume_fasta_and_tag(filename)
+ nodegraph.consume_fasta_and_tag(filename)
# 0.18 is ACTUAL MAX. Do not change.
fp_rate = \
- khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
+ khmer.calc_expected_collisions(
+ nodegraph, args.force, max_false_pos=.15)
print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
# partition-graph
@@ -160,7 +151,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
#
# divide the tags up into subsets
- divvy = htable.divide_tags_into_subsets(int(args.subset_size))
+ divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
n_subsets = len(divvy)
divvy.append(0)
@@ -171,7 +162,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
for _ in range(0, n_subsets):
start = divvy[_]
end = divvy[_ + 1]
- worker_q.put((htable, _, start, end))
+ worker_q.put((nodegraph, _, start, end))
print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
open('%s.info' % args.graphbase, 'w').write('%d subsets total\n'
@@ -211,11 +202,11 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
print('loading %d pmap files (first one: %s)' %
(len(pmap_files), pmap_files[0]), file=sys.stderr)
- htable = khmer.Hashbits(args.ksize, 1, 1)
+ nodegraph = khmer.Nodegraph(args.ksize, 1, 1)
for pmap_file in pmap_files:
print('merging', pmap_file, file=sys.stderr)
- htable.merge_subset_from_disk(pmap_file)
+ nodegraph.merge_subset_from_disk(pmap_file)
if args.remove_subsets:
print('removing pmap files', file=sys.stderr)
@@ -227,7 +218,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
for infile in args.input_filenames:
print('outputting partitions for', infile, file=sys.stderr)
outfile = os.path.basename(infile) + '.part'
- part_count = htable.output_partitions(infile, outfile)
+ part_count = nodegraph.output_partitions(infile, outfile)
print('output %d partitions for %s' % (
part_count, infile), file=sys.stderr)
print('partitions are in', outfile, file=sys.stderr)
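Stripped of threading, queueing, and cleanup, the partition pipeline that do-partition.py drives through these renamed calls is roughly the following; the k-mer size, table sizing, subset size, and filenames are placeholders:

    import khmer

    nodegraph = khmer.Nodegraph(32, int(1e8), 4)     # placeholder k / size / tables
    nodegraph.consume_fasta_and_tag('reads.fa')      # load sequences and tag them

    # carve the tag set into subsets and partition each independently
    divvy = nodegraph.divide_tags_into_subsets(int(1e5))
    divvy.append(0)                                  # sentinel: last subset runs to the end
    for i in range(len(divvy) - 1):
        subset = nodegraph.do_subset_partition(divvy[i], divvy[i + 1], True, False)
        nodegraph.save_subset_partitionmap(subset, 'reads.fa.subset.%d.pmap' % i)

    # merge the partition maps and annotate reads with partition IDs
    merged = khmer.Nodegraph(32, 1, 1)
    for i in range(len(divvy) - 1):
        merged.merge_subset_from_disk('reads.fa.subset.%d.pmap' % i)
    merged.output_partitions('reads.fa', 'reads.fa.part')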
diff --git a/scripts/extract-long-sequences.py b/scripts/extract-long-sequences.py
index 0a83bed..46680d8 100755
--- a/scripts/extract-long-sequences.py
+++ b/scripts/extract-long-sequences.py
@@ -23,6 +23,7 @@ import argparse
import screed
import sys
from khmer.utils import write_record
+from khmer.kfile import add_output_compression_type, get_file_writer
def get_parser():
@@ -35,21 +36,22 @@ def get_parser():
' sequence filename.', nargs='+')
parser.add_argument('-o', '--output', help='The name of the output'
' sequence file.', default=sys.stdout,
- metavar='output', type=argparse.FileType('w'))
+ metavar='output', type=argparse.FileType('wb'))
parser.add_argument('-l', '--length', help='The minimum length of'
' the sequence file.',
type=int, default=200)
+ add_output_compression_type(parser)
return parser
def main():
args = get_parser().parse_args()
- outfp = args.output
+ outfp = get_file_writer(args.output, args.gzip, args.bzip)
for filename in args.input_filenames:
for record in screed.open(filename):
if len(record['sequence']) >= args.length:
write_record(record, outfp)
- print('wrote to: ' + outfp.name, file=sys.stderr)
+ print('wrote to: ' + args.output.name, file=sys.stderr)
if __name__ == '__main__':
main()
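
The compression plumbing used here is small: get_file_writer takes an
already-open binary handle plus the two flags and evidently hands back either
the raw handle or a gzip/bzip2 wrapper around it. A minimal sketch of the call
under that assumption, with a hypothetical filename (argument order taken from
the call above):

    from khmer.kfile import get_file_writer

    raw = open('long-seqs.fa.gz', 'wb')         # hypothetical output file
    outfp = get_file_writer(raw, True, False)   # gzip on, bzip2 off
    outfp.write(b'>seq1\nACGTACGT\n')
    outfp.close()   # close the wrapper first, then the underlying handle
    raw.close()
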
diff --git a/scripts/extract-paired-reads.py b/scripts/extract-paired-reads.py
index 34f8f21..ba750dc 100755
--- a/scripts/extract-paired-reads.py
+++ b/scripts/extract-paired-reads.py
@@ -25,6 +25,8 @@ import argparse
import khmer
from khmer.kfile import check_input_files, check_space
from khmer.khmer_args import info
+from khmer.kfile import add_output_compression_type
+from khmer.kfile import get_file_writer
from khmer.utils import broken_paired_reader, write_record, write_record_pair
@@ -45,13 +47,13 @@ def get_parser():
contains orphan sequences.
The directory into which the interleaved and orphaned reads are
- output may be specified using :option:`-o`/:option:`--output-dir`.
+ output may be specified using :option:`-d`/:option:`--output-dir`.
This directory will be created if it does not already exist.
Alternatively, you can specify the filenames directly with
:option:`-p`/:option:`--output-paired` and
:option:`-s`/:option:`--output-single`, which will override the
- :option:`-o`/:option:`--output-dir` option.
+ :option:`-d`/:option:`--output-dir` option.
Example::
@@ -64,19 +66,20 @@ def get_parser():
parser.add_argument('--version', action='version', version='%(prog)s ' +
khmer.__version__)
- parser.add_argument('-o', '--output-dir', default='', help='Output '
+ parser.add_argument('-d', '--output-dir', default='', help='Output '
'split reads to specified directory. Creates '
'directory if necessary')
- parser.add_argument('-p', '--output-paired', metavar='output_paired',
+ parser.add_argument('--output-paired', '-p', metavar="filename",
+ type=argparse.FileType('wb'),
default=None, help='Output paired reads to this '
- 'file', type=argparse.FileType('w'))
- parser.add_argument('-s', '--output-single', metavar='output_single',
- default=None, help='Output orphaned reads to this '
- 'file', type=argparse.FileType('w'))
-
+ 'file')
+ parser.add_argument('--output-single', '-s', metavar="filename",
+ type=argparse.FileType('wb'), default=None,
+ help='Output orphaned reads to this file')
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
+ add_output_compression_type(parser)
return parser
@@ -105,17 +108,17 @@ def main():
# OVERRIDE default output file locations with -p, -s
if args.output_paired:
- paired_fp = args.output_paired
+ paired_fp = get_file_writer(args.output_paired, args.gzip, args.bzip)
out2 = paired_fp.name
else:
# Don't override, just open the default filename from above
- paired_fp = open(out2, 'w')
+ paired_fp = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)
if args.output_single:
- single_fp = args.output_single
- out1 = single_fp.name
+ single_fp = get_file_writer(args.output_single, args.gzip, args.bzip)
+ out1 = args.output_single.name
else:
# Don't override, just open the default filename from above
- single_fp = open(out1, 'w')
+ single_fp = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
print('reading file "%s"' % infile, file=sys.stderr)
print('outputting interleaved pairs to "%s"' % out2, file=sys.stderr)
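
With the directory flag renamed from -o to -d, a typical invocation now reads
as below; the --gzip spelling is an assumption inferred from the args.gzip
attribute that add_output_compression_type supplies:

    extract-paired-reads.py -d split/ --gzip interleaved.fq

The default paired and orphan output files are opened in binary mode and
wrapped by get_file_writer exactly as the -p/-s overrides are, so all output
paths honor the compression flags.
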
diff --git a/scripts/extract-partitions.py b/scripts/extract-partitions.py
index b4ee857..9d7093d 100755
--- a/scripts/extract-partitions.py
+++ b/scripts/extract-partitions.py
@@ -25,7 +25,9 @@ import screed
import argparse
import textwrap
import khmer
-from khmer.kfile import check_input_files, check_space
+from khmer.kfile import (check_input_files, check_space,
+ add_output_compression_type,
+ get_file_writer)
from khmer.khmer_args import info
from khmer.utils import write_record
@@ -77,6 +79,7 @@ def get_parser():
khmer.__version__)
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
+ add_output_compression_type(parser)
return parser
@@ -136,8 +139,8 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
break
if args.output_unassigned:
- unassigned_fp = open('%s.unassigned.%s' % (args.prefix, suffix), 'w')
-
+ ofile = open('%s.unassigned.%s' % (args.prefix, suffix), 'wb')
+ unassigned_fp = get_file_writer(ofile, args.gzip, args.bzip)
count = {}
for filename in args.part_filenames:
for index, read, pid in read_partition_file(filename):
@@ -212,9 +215,11 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
# open a bunch of output files for the different groups
group_fps = {}
- for _ in range(group_n):
- group_fp = open('%s.group%04d.%s' % (args.prefix, _, suffix), 'w')
- group_fps[_] = group_fp
+ for index in range(group_n):
+ fname = '%s.group%04d.%s' % (args.prefix, index, suffix)
+ group_fp = get_file_writer(open(fname, 'wb'), args.gzip,
+ args.bzip)
+ group_fps[index] = group_fp
# write 'em all out!
diff --git a/scripts/fastq-to-fasta.py b/scripts/fastq-to-fasta.py
index 0dc5831..5cfb785 100755
--- a/scripts/fastq-to-fasta.py
+++ b/scripts/fastq-to-fasta.py
@@ -14,10 +14,13 @@ Convert FASTQ files to FASTA format.
Use '-h' for parameter help.
"""
-from __future__ import print_function
+from __future__ import print_function, unicode_literals
import sys
import argparse
import screed
+from khmer.kfile import (add_output_compression_type, get_file_writer,
+ is_block, describe_file_handle)
+from khmer.utils import write_record
def get_parser():
@@ -28,20 +31,22 @@ def get_parser():
parser.add_argument('input_sequence', help='The name of the input'
' FASTQ sequence file.')
parser.add_argument('-o', '--output', metavar="filename",
+ type=argparse.FileType('wb'),
help='The name of the output'
' FASTA sequence file.',
- type=argparse.FileType('w'),
default=sys.stdout)
parser.add_argument('-n', '--n_keep', default=False, action='store_true',
- help='Option to keep reads containing \'N\'s in ' +
- 'input_sequence file. Default is to drop reads')
+ help='Option to keep reads containing \'N\'s in '
+ 'input_sequence file. Default is to drop reads')
+ add_output_compression_type(parser)
return parser
def main():
args = get_parser().parse_args()
- print(('fastq from ', args.input_sequence), file=sys.stderr)
+    print('fastq from', args.input_sequence, file=sys.stderr)
+ outfp = get_file_writer(args.output, args.gzip, args.bzip)
n_count = 0
for n, record in enumerate(screed.open(args.input_sequence)):
if n % 10000 == 0:
@@ -55,8 +60,8 @@ def main():
n_count += 1
continue
- args.output.write('>' + name + '\n')
- args.output.write(sequence + '\n')
+ del record['quality']
+ write_record(record, outfp)
print('\n' + 'lines from ' + args.input_sequence, file=sys.stderr)
@@ -66,7 +71,8 @@ def main():
else:
print('No lines dropped from file.', file=sys.stderr)
- print('Wrote output to', args.output, file=sys.stderr)
+ print('Wrote output to', describe_file_handle(args.output),
+ file=sys.stderr)
if __name__ == '__main__':
main()
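
The rewritten loop leans on write_record's format dispatch: deleting the
'quality' key makes a record serialize as FASTA rather than FASTQ, which is
the whole conversion. A sketch of that dispatch, re-implemented for
illustration (not khmer's actual code):

    import sys

    def write_record_sketch(record, fileobj):
        # FASTQ when a quality string is present, FASTA otherwise
        if record.get('quality') is not None:
            fileobj.write('@%(name)s\n%(sequence)s\n+\n%(quality)s\n' % record)
        else:
            fileobj.write('>%(name)s\n%(sequence)s\n' % record)

    rec = {'name': 'read1', 'sequence': 'ACGTACGT', 'quality': 'IIIIIIII'}
    del rec['quality']                    # FASTQ record becomes FASTA
    write_record_sketch(rec, sys.stdout)  # prints >read1 then ACGTACGT
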
diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py
index 0aaf76b..8a82555 100755
--- a/scripts/filter-abund-single.py
+++ b/scripts/filter-abund-single.py
@@ -7,10 +7,10 @@
#
# pylint: disable=missing-docstring,invalid-name
"""
-Sequence trimming by abundance w/o counting table.
+Sequence trimming by abundance w/o countgraph.
Trim sequences at k-mers of the given abundance for the given file,
-without loading a prebuilt counting table. Output sequences will be
+without loading a prebuilt countgraph. Output sequences will be
placed in 'infile.abundfilt'.
% python scripts/filter-abund-single.py <data>
@@ -26,10 +26,12 @@ import textwrap
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, report_on_config,
- add_threading_args, info, calculate_tablesize)
+ add_threading_args, info, calculate_graphsize)
from khmer.kfile import (check_input_files, check_space,
- check_space_for_hashtable)
-#
+ check_space_for_graph,
+ add_output_compression_type,
+ get_file_writer)
+
DEFAULT_CUTOFF = 2
@@ -53,38 +55,41 @@ def get_parser():
parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int,
help="Trim at k-mers below this abundance.")
- parser.add_argument('--savetable', metavar="filename", default='',
+ parser.add_argument('--savegraph', metavar="filename", default='',
help="If present, the name of the file to save the "
- "k-mer counting table to")
+ "k-mer countgraph to")
parser.add_argument('datafile', metavar='input_sequence_filename',
help="FAST[AQ] sequence file to trim")
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
+ add_output_compression_type(parser)
return parser
def main():
info('filter-abund-single.py', ['counting', 'SeqAn'])
args = get_parser().parse_args()
+
check_input_files(args.datafile, args.force)
check_space([args.datafile], args.force)
- if args.savetable:
- tablesize = calculate_tablesize(args, 'countgraph')
- check_space_for_hashtable(args.savetable, tablesize, args.force)
+
+ if args.savegraph:
+ tablesize = calculate_graphsize(args, 'countgraph')
+ check_space_for_graph(args.savegraph, tablesize, args.force)
report_on_config(args)
print('making countgraph', file=sys.stderr)
- htable = khmer_args.create_countgraph(args)
+ graph = khmer_args.create_countgraph(args)
- # first, load reads into hash table
+ # first, load reads into graph
rparser = khmer.ReadParser(args.datafile)
threads = []
print('consuming input, round 1 --', args.datafile, file=sys.stderr)
for _ in range(args.threads):
cur_thread = \
threading.Thread(
- target=htable.consume_fasta_with_reads_parser,
+ target=graph.consume_fasta_with_reads_parser,
args=(rparser, )
)
threads.append(cur_thread)
@@ -94,9 +99,9 @@ def main():
_.join()
print('Total number of unique k-mers: {0}'.format(
- htable.n_unique_kmers()), file=sys.stderr)
+ graph.n_unique_kmers()), file=sys.stderr)
- fp_rate = khmer.calc_expected_collisions(htable, args.force)
+ fp_rate = khmer.calc_expected_collisions(graph, args.force)
print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
# now, trim.
@@ -107,7 +112,7 @@ def main():
seq = record.sequence
seqN = seq.replace('N', 'A')
- _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)
+ _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)
if trim_at >= args.ksize:
# be sure not to change the 'N's in the trimmed sequence -
@@ -119,18 +124,19 @@ def main():
# the filtering loop
print('filtering', args.datafile, file=sys.stderr)
outfile = os.path.basename(args.datafile) + '.abundfilt'
- outfp = open(outfile, 'w')
+ outfile = open(outfile, 'wb')
+ outfp = get_file_writer(outfile, args.gzip, args.bzip)
tsp = ThreadedSequenceProcessor(process_fn)
tsp.start(verbose_loader(args.datafile), outfp)
print('output in', outfile, file=sys.stderr)
- if args.savetable:
- print('Saving k-mer counting table filename',
- args.savetable, file=sys.stderr)
- print('...saving to', args.savetable, file=sys.stderr)
- htable.save(args.savetable)
+ if args.savegraph:
+ print('Saving k-mer countgraph filename',
+ args.savegraph, file=sys.stderr)
+ print('...saving to', args.savegraph, file=sys.stderr)
+ graph.save(args.savegraph)
print('wrote to: ', outfile, file=sys.stderr)
if __name__ == '__main__':
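
The round-1 loading shares one ReadParser among several threads, each driving
consume_fasta_with_reads_parser; the parser hands out reads to whichever
thread asks next. A minimal sketch of the pattern with hypothetical sizing and
input (constructor arguments are ksize, table size, and number of tables,
matching the khmer.Nodegraph calls elsewhere in this commit):

    import threading
    import khmer

    graph = khmer.Countgraph(20, int(1e7), 4)   # hypothetical sizing
    rparser = khmer.ReadParser('reads.fq')      # hypothetical input file

    threads = [threading.Thread(target=graph.consume_fasta_with_reads_parser,
                                args=(rparser,))
               for _ in range(4)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print('unique k-mers:', graph.n_unique_kmers())
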
diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py
index a5fc98f..1f79a9e 100755
--- a/scripts/filter-abund.py
+++ b/scripts/filter-abund.py
@@ -7,12 +7,12 @@
#
# pylint: disable=missing-docstring,invalid-name
"""
-Sequence trimming by abundance using counting table.
+Sequence trimming by abundance using countgraph.
-Trim sequences at k-mers of the given abundance, based on the given counting
-hash table. Output sequences will be placed in 'infile.abundfilt'.
+Trim sequences at k-mers of the given abundance, based on the given countgraph.
+Output sequences will be placed in 'infile.abundfilt'.
-% python scripts/filter-abund.py <counting.ct> <data1> [ <data2> <...> ]
+% python scripts/filter-abund.py <countgraph> <data1> [ <data2> <...> ]
Use '-h' for parameter help.
"""
@@ -24,9 +24,9 @@ import argparse
import sys
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
from khmer.khmer_args import (ComboFormatter, add_threading_args, info)
-from khmer.kfile import check_input_files, check_space
+from khmer.kfile import (check_input_files, check_space,
+ add_output_compression_type, get_file_writer)
from khmer import __version__
-#
DEFAULT_NORMALIZE_LIMIT = 20
DEFAULT_CUTOFF = 2
@@ -40,15 +40,15 @@ def get_parser():
Example::
- load-into-counting.py -k 20 -x 5e7 table.ct data/100k-filtered.fa
- filter-abund.py -C 2 table.ct data/100k-filtered.fa
+ load-into-counting.py -k 20 -x 5e7 countgraph data/100k-filtered.fa
+ filter-abund.py -C 2 countgraph data/100k-filtered.fa
"""
parser = argparse.ArgumentParser(
description='Trim sequences at a minimum k-mer abundance.',
epilog=textwrap.dedent(epilog),
formatter_class=ComboFormatter)
- parser.add_argument('input_table', metavar='input_counting_table_filename',
- help='The input k-mer counting table filename')
+ parser.add_argument('input_graph', metavar='input_count_graph_filename',
+ help='The input k-mer countgraph filename')
parser.add_argument('input_filename', metavar='input_sequence_filename',
help='Input FAST[AQ] sequence filename', nargs='+')
add_threading_args(parser)
@@ -63,8 +63,8 @@ def get_parser():
help='Base the variable-coverage cutoff on this median'
' k-mer abundance.',
default=DEFAULT_NORMALIZE_LIMIT)
- parser.add_argument('-o', '--out', dest='single_output_file',
- type=argparse.FileType('w'),
+ parser.add_argument('-o', '--output', dest='single_output_file',
+ type=argparse.FileType('wb'),
metavar="optional_output_filename",
help='Output the trimmed sequences into a single file '
'with the given filename instead of creating a new '
@@ -73,6 +73,7 @@ def get_parser():
version='khmer {v}'.format(v=__version__))
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
+ add_output_compression_type(parser)
return parser
@@ -80,7 +81,7 @@ def main():
info('filter-abund.py', ['counting'])
args = get_parser().parse_args()
- check_input_files(args.input_table, args.force)
+ check_input_files(args.input_graph, args.force)
infiles = args.input_filename
if ('-' in infiles or '/dev/stdin' in infiles) and not \
args.single_output_file:
@@ -93,10 +94,10 @@ def main():
check_space(infiles, args.force)
- print('loading counting table:', args.input_table,
+ print('loading countgraph:', args.input_graph,
file=sys.stderr)
- htable = khmer.load_counting_hash(args.input_table)
- ksize = htable.ksize()
+ countgraph = khmer.load_countgraph(args.input_graph)
+ ksize = countgraph.ksize()
print("K:", ksize, file=sys.stderr)
@@ -107,11 +108,11 @@ def main():
seqN = seq.replace('N', 'A')
if args.variable_coverage: # only trim when sequence has high enough C
- med, _, _ = htable.get_median_count(seqN)
+ med, _, _ = countgraph.get_median_count(seqN)
if med < args.normalize_to:
return name, seq
- _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)
+ _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)
if trim_at >= ksize:
# be sure not to change the 'N's in the trimmed sequence -
@@ -120,15 +121,21 @@ def main():
return None, None
+ if args.single_output_file:
+ outfile = args.single_output_file.name
+        outfp = get_file_writer(args.single_output_file, args.gzip,
+                                args.bzip)
+
# the filtering loop
for infile in infiles:
print('filtering', infile, file=sys.stderr)
if args.single_output_file:
outfile = args.single_output_file.name
- outfp = args.single_output_file
else:
outfile = os.path.basename(infile) + '.abundfilt'
- outfp = open(outfile, 'w')
+ outfp = open(outfile, 'wb')
+ outfp = get_file_writer(outfp, args.gzip, args.bzip)
tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
tsp.start(verbose_loader(infile), outfp)
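
filter-abund.py and filter-abund-single.py share the same per-read decision;
condensed into one function it looks like the sketch below (defaults per
DEFAULT_CUTOFF and DEFAULT_NORMALIZE_LIMIT; all graph methods as called
above):

    def process_fn_sketch(record, countgraph, cutoff=2, normalize_to=20,
                          variable_coverage=False):
        name, seq = record['name'], record['sequence']
        seq_n = seq.replace('N', 'A')        # mask Ns so k-mer lookups work
        if variable_coverage:
            med, _, _ = countgraph.get_median_count(seq_n)
            if med < normalize_to:
                return name, seq             # too low-coverage to trim safely
        _, trim_at = countgraph.trim_on_abundance(seq_n, cutoff)
        if trim_at >= countgraph.ksize():
            return name, seq[:trim_at]       # slice the original, keeping Ns
        return None, None                    # drop the read
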
diff --git a/scripts/filter-stoptags.py b/scripts/filter-stoptags.py
index d7e87f2..e2d1cb8 100755
--- a/scripts/filter-stoptags.py
+++ b/scripts/filter-stoptags.py
@@ -65,8 +65,8 @@ def main():
check_space(infiles, args.force)
print('loading stop tags, with K', args.ksize, file=sys.stderr)
- htable = khmer.Hashbits(args.ksize, 1, 1)
- htable.load_stop_tags(stoptags)
+ nodegraph = khmer.Nodegraph(args.ksize, 1, 1)
+ nodegraph.load_stop_tags(stoptags)
def process_fn(record):
name = record['name']
@@ -74,7 +74,7 @@ def main():
if 'N' in seq:
return None, None
- trim_seq, trim_at = htable.trim_on_stoptags(seq)
+ trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)
if trim_at >= args.ksize:
return name, trim_seq
diff --git a/scripts/find-knots.py b/scripts/find-knots.py
index d224e06..3683369 100755
--- a/scripts/find-knots.py
+++ b/scripts/find-knots.py
@@ -23,7 +23,7 @@ import khmer
import sys
from khmer.kfile import check_input_files, check_space
from khmer import khmer_args
-from khmer.khmer_args import (build_counting_args, info, add_loadhash_args,
+from khmer.khmer_args import (build_counting_args, info, add_loadgraph_args,
report_on_config)
# counting hash parameters.
@@ -49,7 +49,7 @@ EXCURSION_KMER_COUNT_THRESHOLD = 2
def get_parser():
epilog = """
- Load an k-mer presence table/tagset pair created by load-graph, and a set
+    Load a k-mer nodegraph/tagset pair created by load-graph, and a set
of pmap files created by partition-graph. Go through each pmap file,
select the largest partition in each, and do the same kind of traversal as
in :program:`make-initial-stoptags.py` from each of the waypoints in that
@@ -82,7 +82,7 @@ def main():
graphbase = args.graphbase
# @RamRS: This might need some more work
- infiles = [graphbase + '.pt', graphbase + '.tagset']
+ infiles = [graphbase, graphbase + '.tagset']
if os.path.exists(graphbase + '.stoptags'):
infiles.append(graphbase + '.stoptags')
for _ in infiles:
@@ -90,16 +90,16 @@ def main():
check_space(infiles, args.force)
- print('loading k-mer presence table %s.pt' % graphbase, file=sys.stderr)
- htable = khmer.load_hashbits(graphbase + '.pt')
+ print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr)
+ graph = khmer.load_nodegraph(graphbase)
print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
- htable.load_tagset(graphbase + '.tagset')
+ graph.load_tagset(graphbase + '.tagset')
initial_stoptags = False # @CTB regularize with make-initial
if os.path.exists(graphbase + '.stoptags'):
print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
- htable.load_stop_tags(graphbase + '.stoptags')
+ graph.load_stop_tags(graphbase + '.stoptags')
initial_stoptags = True
pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
@@ -115,34 +115,34 @@ def main():
file=sys.stderr)
print('---', file=sys.stderr)
- # create counting hash
- ksize = htable.ksize()
+ # create countgraph
+ ksize = graph.ksize()
counting = khmer_args.create_countgraph(args, ksize=ksize)
# load & merge
for index, subset_file in enumerate(pmap_files):
print('<-', subset_file, file=sys.stderr)
- subset = htable.load_subset_partitionmap(subset_file)
+ subset = graph.load_subset_partitionmap(subset_file)
print('** repartitioning subset... %s' % subset_file, file=sys.stderr)
- htable.repartition_largest_partition(subset, counting,
- EXCURSION_DISTANCE,
- EXCURSION_KMER_THRESHOLD,
- EXCURSION_KMER_COUNT_THRESHOLD)
+ graph.repartition_largest_partition(subset, counting,
+ EXCURSION_DISTANCE,
+ EXCURSION_KMER_THRESHOLD,
+ EXCURSION_KMER_COUNT_THRESHOLD)
print('** merging subset... %s' % subset_file, file=sys.stderr)
- htable.merge_subset(subset)
+ graph.merge_subset(subset)
print('** repartitioning, round 2... %s' %
subset_file, file=sys.stderr)
- size = htable.repartition_largest_partition(
+ size = graph.repartition_largest_partition(
None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
EXCURSION_KMER_COUNT_THRESHOLD)
print('** repartitioned size:', size, file=sys.stderr)
print('saving stoptags binary', file=sys.stderr)
- htable.save_stop_tags(graphbase + '.stoptags')
+ graph.save_stop_tags(graphbase + '.stoptags')
os.rename(subset_file, subset_file + '.processed')
print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr)
diff --git a/scripts/galaxy/README.txt b/scripts/galaxy/README.txt
deleted file mode 100644
index 80e016a..0000000
--- a/scripts/galaxy/README.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-Requires galaxy-central dating from 2014-06-30 or newer
-
-https://bitbucket.org/galaxy/galaxy-central/commits/4de240e5a7819c768b8267c19e477530dab54370
-
-
diff --git a/scripts/galaxy/abundance-dist-single.xml b/scripts/galaxy/abundance-dist-single.xml
deleted file mode 100644
index 900096a..0000000
--- a/scripts/galaxy/abundance-dist-single.xml
+++ /dev/null
@@ -1,98 +0,0 @@
-<tool id="gedlab-khmer-abundance-dist-single"
- name="Abundance Distribution (all-in-one)"
- version="1.1-1"
- force_history_refresh="true">
-
- <description>
- Calculate abundance distribution of the k-mers in a given
- sequence file.
- </description>
- <macros>
- <token name="@BINARY@">abundance-dist-single.py</token>
- <import>macros.xml</import>
- </macros>
- <expand macro="requirements" />
- <command>
-## The command is a Cheetah template which allows some Python based syntax.
-## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
-mkdir output; cd output;
-@BINARY@
-@TABLEPARAMS@
-$zero
-$bigcount
-#if $save_countingtable
---savetable=$optional_output_countingtable
-#end if
---report-total-kmers
---squash
-@THREADS@
-$input_sequence_filename
-$output_histogram_filename
- </command>
-
- <inputs>
- <expand macro="input_sequence_filename" />
- <param name="save_countingtable"
- type="boolean"
- label="Save the k-mer counting table(s) in a file"
- help="(--savetable)" />
- <expand macro="input_zero" />
- <param name="bigcount"
- type="boolean"
- truevalue=""
- falsevalue="--no-bigcount"
- checked="true"
- help="Count k-mers past 255 (--no-bigcount)" />
- <expand macro="tableinputs" />
- </inputs>
- <outputs>
- <data name="optional_output_countingtable"
- format="ct"
- label="${tool.name} k-mer counting table">
- <filter>save_countingtable == True</filter>
- </data>
- <expand macro="abundance-histogram-output" />
- </outputs>
- <expand macro="stdio" />
- <tests>
- <test>
- <param name="input_sequence_filename" value="test-abund-read-2.fa" />
- <param name="type" value="specific" />
- <param name="tablesize_specific" value="1e7" />
- <param name="n_tables" value="2" />
- <param name="ksize" value="17" />
- <param name="no_zero" value="false" />
- <output name="output_histogram_filename">
- <assert_contents>
- <has_text text="1 96 96 0.98" />
- <has_text text="1001 2 98 1.0" />
- </assert_contents>
- </output>
- </test>
- <test>
- <param name="input_sequence_filename" value="test-abund-read-2.fa" />
- <param name="type" value="specific" />
- <param name="tablesize_specific" value="1e7" />
- <param name="n_tables" value="2" />
- <param name="ksize" value="17" />
- <param name="no_zero" value="false" />
- <param name="bigcount" value="false" />
- <output name="output_histogram_filename">
- <assert_contents>
- <has_text text="1 96 96 0.98" />
- <has_text text="255 2 98 1.0" />
- </assert_contents>
- </output>
- </test>
-
- </tests>
- <citations>
- <expand macro="software-citation" />
- <expand macro="counting-citation" />
- </citations>
- <!-- [OPTIONAL] Help displayed in Galaxy -->
- <!--
- <help>
- </help>
- -->
-</tool>
diff --git a/scripts/galaxy/abundance-dist.xml b/scripts/galaxy/abundance-dist.xml
deleted file mode 100644
index 88d47ee..0000000
--- a/scripts/galaxy/abundance-dist.xml
+++ /dev/null
@@ -1,68 +0,0 @@
-<tool id="gedlab-khmer-abundance-dist"
- name="Abundance Distribution"
- version="1.1-1"
- force_history_refresh="true">
-
- <description>
- Calculate abundance distribution of the k-mers in a given sequence
- file using a pre-made k-mer counting table.
- </description>
- <macros>
- <token name="@BINARY@">abundance-dist.py</token>
- <import>macros.xml</import>
- </macros>
- <expand macro="requirements" />
- <command>
-## The command is a Cheetah template which allows some Python based syntax.
-## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
-mkdir output; cd output;
-@BINARY@
---squash
-$input_counting_table_filename
-$input_sequence_filename
-$output_histogram_filename
- </command>
-
- <inputs>
- <expand macro="input_sequence_filename" />
- <expand macro="input_counting_table_filename" />
- <expand macro="input_zero" />
- </inputs>
- <outputs>
- <expand macro="abundance-histogram-output" />
- </outputs>
- <expand macro="stdio" />
- <tests>
- <test>
- <param name="input_sequence_filename" value="test-abund-read-2.fa" />
- <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" />
- <param name="no_zero" value="false" />
- <output name="output_histogram_filename">
- <assert_contents>
- <has_line_matching expression="1 96 96 0.98" />
- <has_line_matching expression="1001 2 98 1.0" />
- </assert_contents>
- </output>
- </test>
- <test>
- <param name="input_sequence_filename" value="test-abund-read-2.fa" />
- <param name="input_counting_table_filename" value="test-abund-read-2.nobigcount.ct" ftype="ct" />
- <param name="no_zero" value="false" />
- <output name="output_histogram_filename">
- <assert_contents>
- <has_line_matching expression="1 96 96 0.98" />
- <has_line_matching expression="255 2 98 1.0" />
- </assert_contents>
- </output>
- </test>
- </tests>
- <citations>
- <expand macro="software-citation" />
- <expand macro="counting-citation" />
- </citations>
- <!-- [OPTIONAL] Help displayed in Galaxy -->
- <!--
- <help>
- </help>
- -->
-</tool>
diff --git a/scripts/galaxy/count-median.xml b/scripts/galaxy/count-median.xml
deleted file mode 100644
index da0ad31..0000000
--- a/scripts/galaxy/count-median.xml
+++ /dev/null
@@ -1,58 +0,0 @@
-<tool id="gedlab-khmer-count-median"
- name="Count Median"
- version="1.1-1"
- force_history_refresh="true">
-
- <description>
- Count the median/avg k-mer abundance for each sequence in the
- input file, based on the k-mer counts in the given k-mer
- counting table. Can be used to estimate expression levels
- (mRNAseq) or coverage (genomic/metagenomic).
- </description>
- <macros>
- <token name="@BINARY@">count-median.py</token>
- <import>macros.xml</import>
- </macros>
- <expand macro="requirements" />
- <command>
-@BINARY@
-$input_counting_table_filename
-$input_sequence_filename
-$output_summary_filename
- </command>
-
- <inputs>
- <expand macro="input_sequence_filename" />
- <expand macro="input_counting_table_filename" />
- </inputs>
- <outputs>
- <data name="output_summary_filename" format="text"
- label="${input_sequence_filename} sequence id, median, average, stddev, and seq length" />
- </outputs>
- <expand macro="stdio" />
- <tests>
- <test interactor="api">
- <param name="input_sequence_filename"
- value="test-abund-read-2.fa" />
- <param name="input_counting_table_filename"
- value="test-abund-read-2.ct" ftype="ct" />
- <output name="output_summary_filename">
- <assert_contents>
- <has_line_matching
- expression="seq 1001 1001.0 0.0 18" />
- <has_line_matching
- expression="895:1:37:17593:9954/1 1 103.803741455 303.702941895 114" />
- </assert_contents>
- </output>
- </test>
- </tests>
- <citations>
- <expand macro="software-citation" />
- <expand macro="diginorm-citation" />
- </citations>
- <!-- [OPTIONAL] Help displayed in Galaxy -->
- <!--
- <help>
- </help>
- -->
-</tool>
diff --git a/scripts/galaxy/do-partition.xml b/scripts/galaxy/do-partition.xml
deleted file mode 100644
index 41746d6..0000000
--- a/scripts/galaxy/do-partition.xml
+++ /dev/null
@@ -1,107 +0,0 @@
-<tool id="gedlab-khmer-do-partition"
- name="Sequence partition all-in-one"
- version="1.1-1"
- force_history_refresh="true">
-
- <description>
- Load, partition, and annotate FAST[AQ] sequences
- </description>
- <macros>
- <token name="@BINARY@">do-parition.py</token>
- <import>macros.xml</import>
- </macros>
- <expand macro="requirements" />
- <command>
-mkdir -p output;
-@BINARY@
-@TABLEPARAMS@
-@THREADS@
-output
-#for input in $inputs
-$input
-#end for ;
-mv output.info $infomation ;
-mv *.part output/
- </command>
-
- <inputs>
- <expand macro="input_sequences_filenames" />
- <param name="ksize"
- type="integer"
- value="20"
- label="ksize"
- help="k-mer size to use (--ksize/-k)" />
- <param name="n_tables"
- type="integer"
- min="1"
- value="4"
- label="n_tables"
- help="number of tables to use (--n_tables/-N)" />
- <param name="tablesize_specific"
- type="text"
- label="tablesize"
- help="lower bound on the tablesize to use (--min-tablesize/-x)" />
- </inputs>
- <outputs>
- <data name="information"
- format="text"
- label="${tool.name} summary for #echo ','.join(map(str, $inputs ))#" />
- <expand macro="output_sequences" />
- </outputs>
- <expand macro="stdio" />
-<!-- <tests>
- <test interactor="api">
- <conditional name="parameters">
- <param name="type" value="specific" />
- <param name="inputs" value="test-abund-read-2.fa"/>
- <param name="cutoff" value="1" />
- <param name="ksize" value="17" />
- </conditional>
- <output name="output">
- <discover_dataset name="test-abund-read-2.fa.keep">
- <assert_contents>
- <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
- </assert_contents>
- </discover_dataset>
- </output>
- </test>
- <test interactor="api">
- <param name="inputs" value="test-abund-read-2.fa" />
- <param name="cutoff" value="2" />
- <param name="ksize" value="17" />
- <output name="output">
- <discover_dataset name="test-abund-read-2.fa.keep">
- <assert_contents>
- <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
- <has_line_matching expression="GGTTGACGGGGCTCAGGG" />
- </assert_contents>
- </discover_dataset>
- </output>
- </test>
- <test interactor="api">
- <param name="inputs" value="test-abund-read-paired.fa" />
- <param name="cutoff" value="1" />
- <param name="ksize" value="17" />
- <param name="paired" value="true" />
- <output name="output">
- <discover_dataset name="test-abund-read-paired.fa.keep">
- <assert_contents>
- <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
- <has_line_matching expression="GGTTGACGGGGCTCAGGG" />
- </assert_contents>
- </discover_dataset>
- </output>
- </test>
-
- </tests>
- -->
- <citations>
- <expand macro="software-citation" />
- <expand macro="graph-citation" />
- </citations>
- <!-- [OPTIONAL] Help displayed in Galaxy -->
- <!--
- <help>
- </help>
- -->
-</tool>
diff --git a/scripts/galaxy/extract-partitions.xml b/scripts/galaxy/extract-partitions.xml
deleted file mode 100644
index 5a27d7c..0000000
--- a/scripts/galaxy/extract-partitions.xml
+++ /dev/null
@@ -1,77 +0,0 @@
-<tool id="gedlab-khmer-extract-partitions"
- name="Extract partitions"
- version="1.1-1"
- force_history_refresh="true">
-
- <description>
- Separate sequences that are annotated with partitions into
- grouped files.
- </description>
- <macros>
- <token name="@BINARY@">extract-partitions.py</token>
- <import>macros.xml</import>
- </macros>
- <expand macro="requirements" />
- <command>
-mkdir -p output ;
-cd output ;
-@BINARY@
---max-size $max_size
---min-partition-size $min_partition_size
-$output_unasssigned
-output
-#for input in $inputs
-$input
-#end for ;
-mv output.dist $distribution
- </command>
-
- <inputs>
- <expand macro="input_sequences_filenames" />
- <param name="max_size"
- type="integer"
- label="Max group size"
- help="No more than this many number of sequences will be stored in each output (--max-size/-X)"
- value="1000000" />
- <param name="min_partition_size"
- type="integer"
- label="Min partition size"
- help="The minimum partition size worth keeping (--min-partition-size/-m)"
- value="5" />
- <param name="output_unassigned"
- type="boolean"
- checked="false"
- truevalue="--output-unassigned"
- falsevalue=""
- label="Output unassigned sequences (--output-unassigned/-U)" />
- </inputs>
- <outputs>
- <data name="distribution"
- format="text"
- label="Partition size distribution from ${tool.name}" />
- <expand macro="output_sequences" />
- </outputs>
- <expand macro="stdio" />
-
- <tests>
- <test interactor="api">
- <param name="inputs" value="random-20-a.fa.part"/>
- <output name="distribution">
- <assert_contents>
- <has_line_matching
- expression="99 1 1 99" />
- </assert_contents>
- </output>
- </test>
-
- </tests>
- <citations>
- <expand macro="software-citation" />
- <expand macro="graph-citation" />
- </citations>
- <!-- [OPTIONAL] Help displayed in Galaxy -->
- <!--
- <help>
- </help>
- -->
-</tool>
diff --git a/scripts/galaxy/filter-abund.xml b/scripts/galaxy/filter-abund.xml
deleted file mode 100644
index e13793e..0000000
--- a/scripts/galaxy/filter-abund.xml
+++ /dev/null
@@ -1,88 +0,0 @@
-<tool id="gedlab-khmer-filter-abund"
- name="Filter by abundance"
- version="1.1-1"
- force_history_refresh="true">
-
- <description>
- Trims fastq/fasta sequences at k-mers of a given abundance
- based on a provided k-mer counting table.
- </description>
- <macros>
- <token name="@BINARY@">filter-abund.py</token>
- <import>macros.xml</import>
- </macros>
- <expand macro="requirements" />
- <command>
-mkdir output; cd output;
-@BINARY@
-#if $cutoff != 2
- --cutoff=$cutoff
-#end if
-$variable_coverage
-@THREADS@
-$input_counting_table_filename
-#for input in $inputs
- $input
-#end for
---out $output
- </command>
-
- <inputs>
- <expand macro="input_sequences_filenames" />
- <param name="variable_coverage"
- type="boolean"
- checked="false"
- truevalue="--variable-coverage"
- falsevalue=""
- label="Variable coverage"
- help="Only trim when a sequence has high enough coverage; median abundance > 20 (--variable_coverage)" />
- <param name="cutoff"
- type="integer"
- value="2"
- label="cutoff"
- help="Trim at k-mers below this abundance. (--cutoff)" />
- <expand macro="input_counting_table_filename" />
- </inputs>
- <outputs>
- <!-- <expand macro="output_sequences" /> -->
- <expand macro="output_sequences_single" />
- </outputs>
- <expand macro="stdio" />
- <tests>
- <test interactor="api">
- <param name="inputs" value="test-abund-read-2.fa" />
- <param name="input_counting_table_filename"
- value="test-abund-read-2.ct" ftype="ct" />
- <output name="output">
- <!-- <discover_dataset name="test-abund-read-2.fa.abundfilt"> -->
- <assert_contents>
- <has_text text="GGTTGACGGGGCTCAGGG" />
- </assert_contents>
- <!-- </discover_dataset> -->
- </output>
- </test>
- <test interactor="api">
- <param name="input_sequence_filename"
- value="test-abund-read-2.fa" />
- <param name="input_counting_table_filename"
- value="test-abund-read-2.ct" ftype="ct" />
- <param name="cutoff" value="1" />
- <output name="output">
- <!-- <discover_dataset name="test-abund-read-2.fa.abundfilt"> -->
- <assert_contents>
- <has_text text="GGTTGACGGGGCTCAGGG" />
- </assert_contents>
- <!-- </discover_dataset> -->
- </output>
- </test>
- </tests>
- <citations>
- <expand macro="software-citation" />
- <expand macro="counting-citation" />
- </citations>
- <!-- [OPTIONAL] ReST Help displayed in Galaxy -->
- <!--
- <help>
- </help>
- -->
-</tool>
diff --git a/scripts/galaxy/filter-below-abund.py b/scripts/galaxy/filter-below-abund.py
deleted file mode 120000
index 66a0dc7..0000000
--- a/scripts/galaxy/filter-below-abund.py
+++ /dev/null
@@ -1 +0,0 @@
-../../sandbox/filter-below-abund.py
\ No newline at end of file
diff --git a/scripts/galaxy/filter-below-abund.xml b/scripts/galaxy/filter-below-abund.xml
deleted file mode 100644
index 71c8db0..0000000
--- a/scripts/galaxy/filter-below-abund.xml
+++ /dev/null
@@ -1,65 +0,0 @@
-<tool id="gedlab-khmer-filter-below-abund"
- name="Filter below abundance cutoff of 50"
- version="1.1-1"
- force_history_refresh="true">
-
-<!-- Work in progress, gating on filter-below-abund.py being upgraded -->
- <description>
- Trims fastq/fasta sequences at k-mers with abundance below 50
- based on a provided k-mer counting table.
- </description>
- <macros>
- <token name="@BINARY@">filter-below-abund.py</token>
- <import>macros.xml</import>
- </macros>
- <expand macro="requirements" />
- <command>
-mkdir output; cd output;
-@BINARY@
-$input_counting_table_filename
-#for input in $inputs
- $input
-#end for
- </command>
-
- <inputs>
- <expand macro="input_sequences_filenames" />
- <expand macro="input_counting_table_filename" />
- </inputs>
- <outputs>
- <!-- <expand macro="output_sequences" /> -->
- <expand macro="output_sequences_single" />
- </outputs>
- <expand macro="stdio" />
- <!-- <tests>
- <test interactor="api">
- <param name="inputs" value="test-abund-read-2.fa" />
- <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" />
- <output name="output">
- <discover_dataset name="test-abund-read-2.fa.abundfilt">
- </discover_dataset>
- </output>
- </test>
- <test interactor="api">
- <param name="input_sequence_filename" value="test-abund-read-2.fa" />
- <param name="input_counting_table_filename" value="test-abund-read-2.ct" ftype="ct" />
- <param name="cutoff" value="1" />
- <output name="output">
- <discover_dataset name="test-abund-read-2.fa.abundfilt">
- <assert_contents>
- <has_text text="GGTTGACGGGGCTCAGGG" />
- </assert_contents>
- </discover_dataset>
- </output>
- </test>
- </tests> -->
- <citations>
- <expand macro="software-citation" />
- <expand macro="counting-citation" />
- </citations>
- <!-- [OPTIONAL] ReST Help displayed in Galaxy -->
- <!--
- <help>
- </help>
- -->
-</tool>
diff --git a/scripts/galaxy/gedlab.py b/scripts/galaxy/gedlab.py
deleted file mode 100644
index 70b9683..0000000
--- a/scripts/galaxy/gedlab.py
+++ /dev/null
@@ -1,22 +0,0 @@
-"""k-mer count and presence."""
-
-from galaxy.datatypes.binary import Binary
-
-import logging
-
-log = logging.getLogger(__name__)
-
-
-class Count(Binary):
-
- def __init__(self, **kwd):
- Binary.__init__(self, **kwd)
-
-
-class Presence(Binary):
-
- def __init__(self, **kwd):
- Binary.__init__(self, **kwd)
-
-Binary.register_unsniffable_binary_ext("ct")
-Binary.register_unsniffable_binary_ext("pt")
diff --git a/scripts/galaxy/macros.xml b/scripts/galaxy/macros.xml
deleted file mode 100644
index 5a413a5..0000000
--- a/scripts/galaxy/macros.xml
+++ /dev/null
@@ -1,160 +0,0 @@
-<macros>
- <xml name="requirements">
- <requirements>
- <!-- <requirement type="binary">@BINARY@</requirement> -->
- <requirement type="package" version="1.1">khmer</requirement>
- </requirements>
- <version_command>@BINARY@ --version</version_command>
- </xml>
- <token name="@TABLEPARAMS@">#if $parameters.type == "simple"
- --ksize=20
- --n_tables=4
- --min-tablesize=$parameters.tablesize
-#else
- --ksize=$parameters.ksize
- --n_tables=$parameters.n_tables
- --min-tablesize=$parameters.tablesize_specific
- #end if</token>
- <token name="@THREADS@">--threads \${GALAXY_SLOTS:-4}</token>
- <xml name="tableinputs">
- <conditional name="parameters">
- <param name="type"
- type="select"
- label="Advanced Parameters"
- help="ksize, n_tables, a specific tablesize" >
- <option value="simple"
- selected="true">
- Hide
- </option>
- <option value="specific">
- Show
- </option>
- </param>
- <when value="simple">
- <param name="tablesize"
- type="select"
- label="Sample Type"
- display="radio">
- <option value="1e9"
- selected="true">
- Microbial Genome
- </option>
- <option value="2e9">
- Animal Transcriptome
- </option>
- <option value="4e9">
- Small Animal Genome or
- Low-Diversity Metagenome
- </option>
- <option value="16e9">
- Large Animal Genome
- </option>
- </param>
- </when>
- <when value="specific">
- <param name="ksize"
- type="integer"
- value="20"
- label="ksize"
- help="k-mer size to use" />
- <param name="n_tables"
- type="integer"
- min="1"
- value="4"
- label="n_tables"
- help="number of tables to use" />
- <param name="tablesize_specific"
- type="text"
- label="tablesize"
- help="lower bound on the tablesize to use" />
- </when>
- </conditional>
- </xml>
- <xml name="input_sequences_filenames">
- <param name="inputs"
- multiple="true"
- type="data"
- format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina"
- label="FAST[AQ] file(s)"
- help="Put in order of precedence such as longest reads first." />
- </xml>
- <xml name="input_sequence_filename">
- <param name="input_sequence_filename"
- type="data"
- format="fasta,fastq,fastqsanger,fastqsolexa,fastqillumina"
- label="FAST[AQ] file(s)" />
- </xml>
- <xml name="input_counting_table_filename">
- <param name="input_counting_table_filename"
- type="data"
- format="ct"
- label="the k-mer counting table to query"
- help="The abundances of the k-mers in the input nucleotide sequence file will be calculated using the kmer counts in this k-mer counting table." />
- </xml>
- <xml name="abundance-histogram-output">
- <data name="output_histogram_filename"
- format="text"
- label="${tool.name} k-mer abundance histogram. The
- columns are: (1) k-mer abundance, (2) k-mer count, (3)
- cumulative count, (4) fraction of total distinct
- k-mers.">
- </data>
- </xml>
- <xml name="output_sequences">
- <data name="output"
- format="input"
- label="${tool.name} processed nucleotide sequence file">
- <discover_datasets pattern="__name__" directory="output" visible="true"/>
- </data>
- </xml>
- <xml name="output_sequences_single">
- <data name="output"
- format="input"
- label="${tool.name} processed nucleotide sequence file" />
- </xml>
- <xml name="input_zero">
- <param name="zero"
- type="boolean"
- truevalue=""
- falsevalue="--no-zero"
- checked="true"
- help="Output zero count bins (--no-zero)" />
- </xml>
- <xml name="software-citation">
- <citation type="bibtex">@article{khmer2014,
- author = "Crusoe, Michael and Edvenson, Greg and Fish, Jordan and Howe,
- Adina and McDonald, Eric and Nahum, Joshua and Nanlohy, Kaben and
- Ortiz-Zuazaga, Humberto and Pell, Jason and Simpson, Jared and Scott, Camille
- and Srinivasan, Ramakrishnan Rajaram and Zhang, Qingpeng and Brown, C. Titus",
- title = "The khmer software package: enabling efficient sequence
- analysis",
- year = "2014",
- month = "04",
- publisher = "Figshare",
- url = "http://dx.doi.org/10.6084/m9.figshare.979190"
- }</citation>
- </xml>
- <xml name="diginorm-citation">
- <citation type="bibtex">@unpublished{diginorm,
- author = "Brown, C Titus and Howe, Adina and Zhang, Qingpeng and Pyrkosz,
-Alexis B and Brom, Timothy H",
- title = "A Reference-Free Algorithm for Computational Normalization of
-Shotgun Sequencing Data",
- year = "2012",
- eprint = "arXiv:1203.4802",
- url = "http://arxiv.org/abs/1203.4802",
-}</citation></xml>
- <xml name="graph-citation">
- <citation type="doi">10.1073/pnas.1121464109</citation>
- </xml>
- <xml name="counting-citation">
- <citation type="doi">10.1371/journal.pone.0101271</citation>
- </xml>
- <xml name="stdio">
- <stdio>
- <!-- [HELP] If no exit code rule is defined, the tool will stop if anything is written to STDERR -->
- <exit_code range="1:"
- level="fatal" />
- </stdio>
- </xml>
-</macros>
diff --git a/scripts/galaxy/normalize-by-median.xml b/scripts/galaxy/normalize-by-median.xml
deleted file mode 100644
index bcc4a76..0000000
--- a/scripts/galaxy/normalize-by-median.xml
+++ /dev/null
@@ -1,132 +0,0 @@
-<tool id="gedlab-khmer-normalize-by-median"
- name="Normalize By Median"
- version="1.1-4"
- force_history_refresh="true">
-
- <description>
- Filters a fastq/fasta file using digital normalization via
- median k-mer abundances.
- </description>
- <macros>
- <token name="@BINARY@">normalize-by-median.py</token>
- <import>macros.xml</import>
- </macros>
- <expand macro="requirements" />
- <command>
-mkdir output;
-cd output;
-normalize-by-median.py
-$paired_switch
-@TABLEPARAMS@
---cutoff=$cutoff
-#if $save_countingtable
---savetable=$countingtable
-#end if
-#if $countingtable_to_load
---loadtable=$countingtable_to_load
-#end if
---report-total-kmers
-#for entry in $many_inputs
-#for input in $entry.inputs
-$input
-#end for
-#end for
---out=$output
- </command>
-
- <inputs>
- <repeat name="many_inputs" title="input(s) set" min="1" default="1">
- <expand macro="input_sequences_filenames" />
- </repeat>
- <param name="paired_switch"
- type="boolean"
- checked="false"
- truevalue="--paired"
- falsevalue=""
- label="Are the inputs interleaved paired ends?"
- help="If so, then selecting this option will process the paired ends together." />
-
- <param name="countingtable_to_load"
- type="data"
- format="ct"
- optional="true"
- label="an optional k-mer counting table to load"
- help="The inputs file(s) will be processed using the kmer counts in the specified k-mer counting table file as a starting point." />
-
- <param name="save_countingtable"
- type="boolean"
- label="Save the k-mer counting table(s) in a file"
- help="" />
- <param name="cutoff"
- type="integer"
- min="1"
- value="20"
- label="cutoff" />
- <expand macro="tableinputs" />
- </inputs>
- <outputs>
- <data name="countingtable"
- format="ct"
- label="${tool.name} k-mer counting table">
- <filter>save_countingtable == True</filter>
- </data>
- <!-- <expand macro="output_sequences" /> -->
- <expand macro="output_sequences_single" />
- </outputs>
- <expand macro="stdio" />
-
- <tests>
- <test interactor="api">
- <conditional name="parameters">
- <param name="type" value="specific" />
- <param name="inputs" value="test-abund-read-2.fa"/>
- <param name="cutoff" value="1" />
- <param name="ksize" value="17" />
- </conditional>
- <output name="output">
- <discover_dataset name="test-abund-read-2.fa.keep">
- <assert_contents>
- <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
- </assert_contents>
- </discover_dataset>
- </output>
- </test>
- <test interactor="api">
- <param name="inputs" value="test-abund-read-2.fa" />
- <param name="cutoff" value="2" />
- <param name="ksize" value="17" />
- <output name="output">
- <discover_dataset name="test-abund-read-2.fa.keep">
- <assert_contents>
- <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
- <has_line_matching expression="GGTTGACGGGGCTCAGGG" />
- </assert_contents>
- </discover_dataset>
- </output>
- </test>
- <test interactor="api">
- <param name="inputs" value="test-abund-read-paired.fa" />
- <param name="cutoff" value="1" />
- <param name="ksize" value="17" />
- <param name="paired" value="true" />
- <output name="output">
- <discover_dataset name="test-abund-read-paired.fa.keep">
- <assert_contents>
- <has_line_matching expression="GGTTGACGGGGCTCAGGGGG" />
- <has_line_matching expression="GGTTGACGGGGCTCAGGG" />
- </assert_contents>
- </discover_dataset>
- </output>
- </test>
-
- </tests>
- <citations>
- <expand macro="software-citation" />
- <expand macro="diginorm-citation" />
- </citations>
- <!-- [OPTIONAL] Help displayed in Galaxy -->
- <!--
- <help>
- </help>
- -->
-</tool>
diff --git a/scripts/galaxy/test-data/random-20-a.fa.part b/scripts/galaxy/test-data/random-20-a.fa.part
deleted file mode 120000
index 8619bdc..0000000
--- a/scripts/galaxy/test-data/random-20-a.fa.part
+++ /dev/null
@@ -1 +0,0 @@
-../../../tests/test-data/random-20-a.fa.part
\ No newline at end of file
diff --git a/scripts/galaxy/test-data/test-abund-read-2.ct b/scripts/galaxy/test-data/test-abund-read-2.ct
deleted file mode 100644
index 3866fcc..0000000
Binary files a/scripts/galaxy/test-data/test-abund-read-2.ct and /dev/null differ
diff --git a/scripts/galaxy/test-data/test-abund-read-2.ct.info b/scripts/galaxy/test-data/test-abund-read-2.ct.info
deleted file mode 100644
index ed01d30..0000000
--- a/scripts/galaxy/test-data/test-abund-read-2.ct.info
+++ /dev/null
@@ -1,2 +0,0 @@
-through end: test-abund-read-2.fa
-fp rate estimated to be 0.000
diff --git a/scripts/galaxy/test-data/test-abund-read-2.fa b/scripts/galaxy/test-data/test-abund-read-2.fa
deleted file mode 120000
index e4e0ea3..0000000
--- a/scripts/galaxy/test-data/test-abund-read-2.fa
+++ /dev/null
@@ -1 +0,0 @@
-../../../tests/test-data/test-abund-read-2.fa
\ No newline at end of file
diff --git a/scripts/galaxy/test-data/test-abund-read-2.nobigcount.ct b/scripts/galaxy/test-data/test-abund-read-2.nobigcount.ct
deleted file mode 100644
index 70c29fc..0000000
Binary files a/scripts/galaxy/test-data/test-abund-read-2.nobigcount.ct and /dev/null differ
diff --git a/scripts/galaxy/test-data/test-abund-read-2.nobigcount.ct.info b/scripts/galaxy/test-data/test-abund-read-2.nobigcount.ct.info
deleted file mode 100644
index ed01d30..0000000
--- a/scripts/galaxy/test-data/test-abund-read-2.nobigcount.ct.info
+++ /dev/null
@@ -1,2 +0,0 @@
-through end: test-abund-read-2.fa
-fp rate estimated to be 0.000
diff --git a/scripts/galaxy/test-data/test-abund-read-paired.fa b/scripts/galaxy/test-data/test-abund-read-paired.fa
deleted file mode 120000
index 425166e..0000000
--- a/scripts/galaxy/test-data/test-abund-read-paired.fa
+++ /dev/null
@@ -1 +0,0 @@
-../../../tests/test-data/test-abund-read-paired.fa
\ No newline at end of file
diff --git a/scripts/galaxy/tool_dependencies.xml b/scripts/galaxy/tool_dependencies.xml
deleted file mode 100644
index 645f3c5..0000000
--- a/scripts/galaxy/tool_dependencies.xml
+++ /dev/null
@@ -1,10 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
- <package name="khmer" version="1.1">
- <install version="1.0">
- <actions>
- <action type="shell_command">easy_install -U setuptools==3.4.1; pip install --user khmer==1.1 || pip install khmer==1.1</action>
- </actions>
- </install>
- </package>
-</tool_dependency>
diff --git a/scripts/interleave-reads.py b/scripts/interleave-reads.py
index ae7d598..286de96 100755
--- a/scripts/interleave-reads.py
+++ b/scripts/interleave-reads.py
@@ -24,8 +24,10 @@ import os
import textwrap
import argparse
import khmer
-from khmer.kfile import check_input_files, check_space
+from khmer.kfile import check_input_files, check_space, is_block
from khmer.khmer_args import info
+from khmer.kfile import (add_output_compression_type, get_file_writer,
+ describe_file_handle)
from khmer.utils import (write_record_pair, check_is_left, check_is_right,
check_is_pair)
@@ -56,12 +58,13 @@ def get_parser():
parser.add_argument('left')
parser.add_argument('right')
parser.add_argument('-o', '--output', metavar="filename",
- type=argparse.FileType('w'),
+ type=argparse.FileType('wb'),
default=sys.stdout)
parser.add_argument('--version', action='version', version='%(prog)s ' +
khmer.__version__)
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
+ add_output_compression_type(parser)
return parser
@@ -77,19 +80,11 @@ def main():
s2_file = args.right
fail = False
- if not os.path.exists(s1_file):
- print("Error! R1 file %s does not exist" % s1_file, file=sys.stderr)
- fail = True
-
- if not os.path.exists(s2_file):
- print("Error! R2 file %s does not exist" % s2_file, file=sys.stderr)
- fail = True
-
- if fail and not args.force:
- sys.exit(1)
print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)
+ outfp = get_file_writer(args.output, args.gzip, args.bzip)
+
counter = 0
screed_iter_1 = screed.open(s1_file)
screed_iter_2 = screed.open(s2_file)
@@ -118,10 +113,10 @@ def main():
"%s %s" % (read1.name, read2.name), file=sys.stderr)
sys.exit(1)
- write_record_pair(read1, read2, args.output)
+ write_record_pair(read1, read2, outfp)
print('final: interleaved %d pairs' % counter, file=sys.stderr)
- print('output written to', args.output.name, file=sys.stderr)
+ print('output written to', describe_file_handle(outfp), file=sys.stderr)
if __name__ == '__main__':
main()
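
At its core the script is a paired walk over the two inputs, emitting each
pair through write_record_pair; a minimal sketch without the name checks and
error handling above (hypothetical filenames; the real script wraps the
handle via get_file_writer):

    import screed
    from khmer.utils import write_record_pair

    outfp = open('interleaved.fq', 'wb')
    for read1, read2 in zip(screed.open('reads_R1.fq'),
                            screed.open('reads_R2.fq')):
        write_record_pair(read1, read2, outfp)
    outfp.close()
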
diff --git a/scripts/load-graph.py b/scripts/load-graph.py
index 0b8c334..f9ae753 100755
--- a/scripts/load-graph.py
+++ b/scripts/load-graph.py
@@ -16,13 +16,13 @@ Use '-h' for parameter help.
import sys
-from khmer.khmer_args import build_hashbits_args
+from khmer.khmer_args import build_nodegraph_args
from oxli import build_graph
def get_parser():
- parser = build_hashbits_args(descr="Load sequences into the compressible "
- "graph format plus optional tagset.")
+ parser = build_nodegraph_args(descr="Load sequences into the compressible "
+ "graph format plus optional tagset.")
parser = build_graph.build_parser(parser)
return parser
@@ -30,6 +30,5 @@ def get_parser():
if __name__ == '__main__':
build_graph.main(get_parser().parse_args())
- sys.exit(0)
# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
diff --git a/scripts/load-into-counting.py b/scripts/load-into-counting.py
index 287d7de..ddb63f0 100755
--- a/scripts/load-into-counting.py
+++ b/scripts/load-into-counting.py
@@ -22,16 +22,16 @@ import textwrap
import khmer
from khmer import khmer_args
from khmer.khmer_args import build_counting_args, report_on_config, info,\
- add_threading_args, calculate_tablesize
+ add_threading_args, calculate_graphsize
from khmer.kfile import check_file_writable
from khmer.kfile import check_input_files
-from khmer.kfile import check_space_for_hashtable
+from khmer.kfile import check_space_for_graph
def get_parser():
epilog = """
Note: with :option:`-b` the output will be the exact size of the
- k-mer counting table and this script will use a constant amount of memory.
+ k-mer countgraph and this script will use a constant amount of memory.
In exchange k-mer counts will stop at 255. The memory usage of this script
with :option:`-b` will be about 1.15x the product of the :option:`-x` and
:option:`-N` numbers.
@@ -48,11 +48,11 @@ def get_parser():
load-into-counting.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa
"""
- parser = build_counting_args("Build a k-mer counting table from the given"
+ parser = build_counting_args("Build a k-mer countgraph from the given"
" sequences.", epilog=textwrap.dedent(epilog))
add_threading_args(parser)
- parser.add_argument('output_countingtable_filename', help="The name of the"
- " file to write the k-mer counting table to.")
+ parser.add_argument('output_countgraph_filename', help="The name of the"
+ " file to write the k-mer countgraph to.")
parser.add_argument('input_sequence_filename', nargs='+',
help="The names of one or more FAST[AQ] input "
"sequence files.")
@@ -76,20 +76,20 @@ def main():
args = get_parser().parse_args()
report_on_config(args)
- base = args.output_countingtable_filename
+ base = args.output_countgraph_filename
filenames = args.input_sequence_filename
for name in args.input_sequence_filename:
check_input_files(name, args.force)
- tablesize = calculate_tablesize(args, 'countgraph')
- check_space_for_hashtable(args.output_countingtable_filename, tablesize,
- args.force)
+ tablesize = calculate_graphsize(args, 'countgraph')
+ check_space_for_graph(args.output_countgraph_filename, tablesize,
+ args.force)
check_file_writable(base)
check_file_writable(base + ".info")
- print('Saving k-mer counting table to %s' % base, file=sys.stderr)
+ print('Saving k-mer countgraph to %s' % base, file=sys.stderr)
print('Loading kmers from sequences in %s' %
repr(filenames), file=sys.stderr)
@@ -98,8 +98,8 @@ def main():
os.remove(base + '.info')
print('making countgraph', file=sys.stderr)
- htable = khmer_args.create_countgraph(args)
- htable.set_use_bigcount(args.bigcount)
+ countgraph = khmer_args.create_countgraph(args)
+ countgraph.set_use_bigcount(args.bigcount)
filename = None
@@ -113,7 +113,7 @@ def main():
for _ in range(args.threads):
cur_thrd = \
threading.Thread(
- target=htable.consume_fasta_with_reads_parser,
+ target=countgraph.consume_fasta_with_reads_parser,
args=(rparser, )
)
threads.append(cur_thrd)
@@ -123,26 +123,27 @@ def main():
thread.join()
if index > 0 and index % 10 == 0:
- tablesize = calculate_tablesize(args, 'countgraph')
- check_space_for_hashtable(base, tablesize, args.force)
+ tablesize = calculate_graphsize(args, 'countgraph')
+ check_space_for_graph(base, tablesize, args.force)
print('mid-save', base, file=sys.stderr)
- htable.save(base)
+ countgraph.save(base)
with open(base + '.info', 'a') as info_fh:
print('through', filename, file=info_fh)
total_num_reads += rparser.num_reads
- n_kmers = htable.n_unique_kmers()
+ n_kmers = countgraph.n_unique_kmers()
print('Total number of unique k-mers:', n_kmers, file=sys.stderr)
with open(base + '.info', 'a') as info_fp:
print('Total number of unique k-mers:', n_kmers, file=info_fp)
print('saving', base, file=sys.stderr)
- htable.save(base)
+ countgraph.save(base)
# Change max_false_pos=0.2 only if you really grok it. HINT: You don't
fp_rate = \
- khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2)
+ khmer.calc_expected_collisions(
+ countgraph, args.force, max_false_pos=.2)
with open(base + '.info', 'a') as info_fp:
print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)
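The false-positive check above follows the usual Bloom-filter collision
estimate; a hedged sketch of the idea (the real behavior of
khmer.calc_expected_collisions may differ in detail)::

    # Expected false-positive rate of a Bloom-filter-style countgraph:
    # roughly the table occupancy raised to the number of tables.
    def expected_fp_rate(n_unique_kmers, tablesize, n_tables):
        occupancy = min(n_unique_kmers / float(tablesize), 1.0)
        return occupancy ** n_tables

    print(expected_fp_rate(1e6, 5e7, 4))   # tiny; far below the 0.2 cutoff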
diff --git a/scripts/make-initial-stoptags.py b/scripts/make-initial-stoptags.py
index e99a690..5d15cd1 100755
--- a/scripts/make-initial-stoptags.py
+++ b/scripts/make-initial-stoptags.py
@@ -42,15 +42,15 @@ EXCURSION_KMER_COUNT_THRESHOLD = 5
def get_parser():
epilog = """
- Loads a k-mer presence table/tagset pair created by load-graph.py, and does
+ Loads a k-mer nodegraph/tagset pair created by load-graph.py, and does
a small set of traversals from graph waypoints; on these traversals, looks
for k-mers that are repeatedly traversed in high-density regions of the
graph, i.e. are highly connected. Outputs those k-mers as an initial set of
stoptags, which can be fed into partition-graph.py, find-knots.py, and
filter-stoptags.py.
- The k-mer counting table size options parameters are for a k-mer counting
- table to keep track of repeatedly-traversed k-mers. The subset size option
+ The k-mer countgraph size parameters are for a k-mer countgraph used
+ to keep track of repeatedly-traversed k-mers. The subset size option
specifies the number of waypoints from which to traverse; for highly
connected data sets, the default (1000) is probably ok.
"""
@@ -77,28 +77,27 @@ def main():
graphbase = args.graphbase
# @RamRS: This might need some more work
- infiles = [graphbase + '.pt', graphbase + '.tagset']
+ infiles = [graphbase, graphbase + '.tagset']
if args.stoptags:
infiles.append(args.stoptags)
for _ in infiles:
check_input_files(_, args.force)
- print('loading htable %s.pt' % graphbase, file=sys.stderr)
- htable = khmer.load_hashbits(graphbase + '.pt')
+ print('loading nodegraph %s' % graphbase, file=sys.stderr)
+ nodegraph = khmer.load_nodegraph(graphbase)
# do we want to load stop tags, and do they exist?
if args.stoptags:
print('loading stoptags from', args.stoptags, file=sys.stderr)
- htable.load_stop_tags(args.stoptags)
+ nodegraph.load_stop_tags(args.stoptags)
print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
- htable.load_tagset(graphbase + '.tagset')
+ nodegraph.load_tagset(graphbase + '.tagset')
- ksize = htable.ksize()
counting = khmer_args.create_countgraph(args)
# divide up into SUBSET_SIZE fragments
- divvy = htable.divide_tags_into_subsets(args.subset_size)
+ divvy = nodegraph.divide_tags_into_subsets(args.subset_size)
# pick off the first one
if len(divvy) == 1:
@@ -108,17 +107,17 @@ def main():
# partition!
print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
- subset = htable.do_subset_partition(start, end)
+ subset = nodegraph.do_subset_partition(start, end)
# now, repartition...
print('repartitioning to find HCKs.', file=sys.stderr)
- htable.repartition_largest_partition(subset, counting,
- EXCURSION_DISTANCE,
- EXCURSION_KMER_THRESHOLD,
- EXCURSION_KMER_COUNT_THRESHOLD)
+ nodegraph.repartition_largest_partition(subset, counting,
+ EXCURSION_DISTANCE,
+ EXCURSION_KMER_THRESHOLD,
+ EXCURSION_KMER_COUNT_THRESHOLD)
print('saving stop tags', file=sys.stderr)
- htable.save_stop_tags(graphbase + '.stoptags')
+ nodegraph.save_stop_tags(graphbase + '.stoptags')
print('wrote to:', graphbase + '.stoptags', file=sys.stderr)
if __name__ == '__main__':
diff --git a/scripts/merge-partitions.py b/scripts/merge-partitions.py
index c77d822..7d4c172 100755
--- a/scripts/merge-partitions.py
+++ b/scripts/merge-partitions.py
@@ -61,7 +61,7 @@ def main():
(len(pmap_files), pmap_files[0]), file=sys.stderr)
ksize = args.ksize
- htable = khmer.Hashbits(ksize, 1, 1)
+ nodegraph = khmer.Nodegraph(ksize, 1, 1)
for _ in pmap_files:
check_input_files(_, args.force)
@@ -70,10 +70,10 @@ def main():
for pmap_file in pmap_files:
print('merging', pmap_file, file=sys.stderr)
- htable.merge_subset_from_disk(pmap_file)
+ nodegraph.merge_subset_from_disk(pmap_file)
print('saving merged to', output_file, file=sys.stderr)
- htable.save_partitionmap(output_file)
+ nodegraph.save_partitionmap(output_file)
if args.remove_subsets:
print('removing pmap files', file=sys.stderr)
diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py
index 68e25b1..a3f5300 100755
--- a/scripts/normalize-by-median.py
+++ b/scripts/normalize-by-median.py
@@ -26,13 +26,14 @@ import khmer
import textwrap
from khmer import khmer_args
from contextlib import contextmanager
-from oxli import functions as oxutils
-from khmer.khmer_args import (build_counting_args, add_loadhash_args,
- report_on_config, info, calculate_tablesize)
+from khmer.khmer_args import (build_counting_args, add_loadgraph_args,
+ report_on_config, info, calculate_graphsize)
import argparse
-from khmer.kfile import (check_space, check_space_for_hashtable,
- check_valid_file_exists)
+from khmer.kfile import (check_space, check_space_for_graph,
+ check_valid_file_exists, add_output_compression_type,
+ get_file_writer, is_block, describe_file_handle)
from khmer.utils import write_record, broken_paired_reader
+from khmer.khmer_logger import (configure_logging, log_info, log_error)
DEFAULT_DESIRED_COVERAGE = 20
@@ -85,10 +86,10 @@ class WithDiagnostics(object):
perc_kept = kept / float(total)
- print('... kept {kept} of {tot} or {perc_kept:.1%} so far'
- .format(kept=kept, tot=total, perc_kept=perc_kept),
- file=sys.stderr)
- print('... in file ' + ifilename, file=sys.stderr)
+ log_info('... kept {kept} of {tot} or {perc_kept:.1%} so '
+ 'far', kept=kept, tot=total,
+ perc_kept=perc_kept)
+ log_info('... in file {name}', name=ifilename)
if report_fp:
print("{total},{kept},{f_kept:.4}"
@@ -102,14 +103,13 @@ class WithDiagnostics(object):
# per file diagnostic output
if total == reads_start:
- print('SKIPPED empty file ' + ifilename, file=sys.stderr)
+ log_info('SKIPPED empty file {name}', name=ifilename)
else:
perc_kept = kept / float(total)
- print('DONE with {inp}; kept {kept} of {total} or {perc_kept:.1%}'
- .format(inp=ifilename, kept=kept, total=total,
- perc_kept=perc_kept),
- file=sys.stderr)
+ log_info('DONE with {inp}; kept {kept} of {total} or '
+ '{perc_kept:.1%}', inp=ifilename, kept=kept, total=total,
+ perc_kept=perc_kept)
# make sure there's at least one report per file, at the end of each
# file.
@@ -126,8 +126,8 @@ class Normalizer(object):
"""Digital normalization algorithm."""
- def __init__(self, desired_coverage, htable):
- self.htable = htable
+ def __init__(self, desired_coverage, countgraph):
+ self.countgraph = countgraph
self.desired_coverage = desired_coverage
def __call__(self, is_paired, read0, read1):
@@ -151,13 +151,13 @@ class Normalizer(object):
for record in batch:
seq = record.sequence.replace('N', 'A')
- if not self.htable.median_at_least(seq, desired_coverage):
+ if not self.countgraph.median_at_least(seq, desired_coverage):
passed_filter = True
if passed_filter:
for record in batch:
seq = record.sequence.replace('N', 'A')
- self.htable.consume(seq)
+ self.countgraph.consume(seq)
yield record
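The batch logic above is the heart of digital normalization: a batch is
kept only if some read's median k-mer count is still below the desired
coverage, and only kept reads are counted. A self-contained toy version
(a Counter stands in for the countgraph; reads are assumed to be at
least k long)::

    from collections import Counter

    def median_count(counts, seq, k=4):
        kmers = [seq[i:i + k] for i in range(len(seq) - k + 1)]
        vals = sorted(counts[km] for km in kmers)
        return vals[len(vals) // 2]

    def diginorm(reads, cutoff=20, k=4):
        counts = Counter()
        for seq in reads:
            if median_count(counts, seq, k) < cutoff:
                # read is from an under-sampled region: count it, keep it
                for i in range(len(seq) - k + 1):
                    counts[seq[i:i + k]] += 1
                yield seq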
@@ -167,16 +167,16 @@ def catch_io_errors(ifile, out, single_out, force, corrupt_files):
try:
yield
except (IOError, OSError, ValueError) as error:
- print('** ERROR: ' + str(error), file=sys.stderr)
- print('** Failed on {name}: '.format(name=ifile), file=sys.stderr)
+ log_error('** ERROR: {error}', error=str(error))
+ log_error('** Failed on {name}: ', name=ifile)
if not single_out:
os.remove(out.name)
if not force:
- print('** Exiting!', file=sys.stderr)
+ log_error('** Exiting!')
sys.exit(1)
else:
- print('*** Skipping error file, moving on...', file=sys.stderr)
+ log_error('*** Skipping error file, moving on...')
corrupt_files.append(ifile)
@@ -198,16 +198,16 @@ def get_parser():
:option:`--force-single` will ignore all pairing information and treat
reads individually.
- With :option:`-s`/:option:`--savetable`, the k-mer counting table
+ With :option:`-s`/:option:`--savegraph`, the k-mer countgraph
will be saved to the specified file after all sequences have been
- processed. :option:`-l`/:option:`--loadtable` will load the
- specified k-mer counting table before processing the specified
- files. Note that these tables are are in the same format as those
+ processed. :option:`-l`/:option:`--loadgraph` will load the
+ specified k-mer countgraph before processing the specified
+ files. Note that these graphs are in the same format as those
produced by :program:`load-into-counting.py` and consumed by
:program:`abundance-dist.py`.
To append reads to an output file (rather than overwriting it), send output
- to STDOUT with `--out -` and use UNIX file redirection syntax (`>>`) to
+ to STDOUT with `--output -` and use UNIX file redirection syntax (`>>`) to
append to the file.
Example::
@@ -235,6 +235,8 @@ def get_parser():
parser = build_counting_args(
descr="Do digital normalization (remove mostly redundant sequences)",
epilog=textwrap.dedent(epilog))
+ parser.add_argument('-q', '--quiet', dest='quiet', default=False,
+ action='store_true')
parser.add_argument('-C', '--cutoff', type=int,
default=DEFAULT_DESIRED_COVERAGE)
parser.add_argument('-p', '--paired', action='store_true',
@@ -246,8 +248,8 @@ def get_parser():
metavar="unpaired_reads_filename",
help='include a file of unpaired reads to which '
'-p/--paired does not apply.')
- parser.add_argument('-s', '--savetable', metavar="filename", default='',
- help='save the k-mer counting table to disk after all'
+ parser.add_argument('-s', '--savegraph', metavar="filename", default='',
+ help='save the k-mer countgraph to disk after all '
'reads are loaded.')
parser.add_argument('-R', '--report',
metavar='report_filename', type=argparse.FileType('w'))
@@ -257,32 +259,30 @@ def get_parser():
parser.add_argument('-f', '--force', dest='force',
help='continue past file reading errors',
action='store_true')
- parser.add_argument('-o', '--out', metavar="filename",
- dest='single_output_file',
- type=argparse.FileType('w'),
- default=None, help='only output a single file with '
+ parser.add_argument('-o', '--output', metavar="filename",
+ type=argparse.FileType('wb'),
+ default=None, dest='single_output_file',
+ help='only output a single file with '
'the specified filename; use a single dash "-" to '
'specify that output should go to STDOUT (the '
'terminal)')
parser.add_argument('input_filenames', metavar='input_sequence_filename',
help='Input FAST[AQ] sequence filename.', nargs='+')
- add_loadhash_args(parser)
+ add_loadgraph_args(parser)
+ add_output_compression_type(parser)
return parser
def main(): # pylint: disable=too-many-branches,too-many-statements
- info('normalize-by-median.py', ['diginorm'])
args = get_parser().parse_args()
-
+ configure_logging(args.quiet)
+ info('normalize-by-median.py', ['diginorm'])
report_on_config(args)
report_fp = args.report
force_single = args.force_single
- # if optimization args are given, do optimization
- args = oxutils.do_sanity_checking(args, 0.1)
-
# check for similar filenames
# if we're using a single output file only check for identical filenames
# otherwise, check for identical BASE names as well.
@@ -295,9 +295,8 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
basename = os.path.basename(pathfilename)
if basename in basenames:
- print('ERROR: Duplicate filename--Cannot handle this!',
- file=sys.stderr)
- print('** Exiting!', file=sys.stderr)
+ log_error('ERROR: Duplicate filename--Cannot handle this!')
+ log_error('** Exiting!')
sys.exit(1)
basenames.append(basename)
@@ -305,25 +304,21 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
# check that files exist and there is sufficient output disk space.
check_valid_file_exists(args.input_filenames)
check_space(args.input_filenames, args.force)
- if args.savetable:
- tablesize = calculate_tablesize(args, 'countgraph')
- check_space_for_hashtable(args.savetable, tablesize, args.force)
+ if args.savegraph:
+ graphsize = calculate_graphsize(args, 'countgraph')
+ check_space_for_graph(args.savegraph, graphsize, args.force)
# load or create counting table.
- if args.loadtable:
- print('loading k-mer counting table from ' + args.loadtable,
- file=sys.stderr)
- htable = khmer.load_counting_hash(args.loadtable)
- if args.unique_kmers != 0:
- print('Warning: You have specified a number of unique kmers'
- ' but are loading a precreated counting table--'
- 'argument optimization will NOT be done.', file=sys.stderr)
+ if args.loadgraph:
+ log_info('loading k-mer countgraph from {graph}',
+ graph=args.loadgraph)
+ countgraph = khmer.load_countgraph(args.loadgraph)
else:
- print('making countgraph', file=sys.stderr)
- htable = khmer_args.create_countgraph(args)
+ log_info('making countgraph')
+ countgraph = khmer_args.create_countgraph(args)
# create an object to handle diginorm of all files
- norm = Normalizer(args.cutoff, htable)
+ norm = Normalizer(args.cutoff, countgraph)
with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)
# make a list of all filenames and if they're paired or not;
@@ -340,11 +335,7 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
output_name = None
if args.single_output_file:
- if args.single_output_file is sys.stdout:
- output_name = '/dev/stdout'
- else:
- output_name = args.single_output_file.name
- outfp = args.single_output_file
+ outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
else:
if '-' in filenames or '/dev/stdin' in filenames:
print("Accepting input from stdin; output filename must "
@@ -358,7 +349,8 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
for filename, require_paired in files:
if not args.single_output_file:
output_name = os.path.basename(filename) + '.keep'
- outfp = open(output_name, 'w')
+ outfp = open(output_name, 'wb')
+ outfp = get_file_writer(outfp, args.gzip, args.bzip)
# failsafe context manager in case an input file breaks
with catch_io_errors(filename, outfp, args.single_output_file,
@@ -374,32 +366,29 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
if record is not None:
write_record(record, outfp)
- print('output in ' + output_name, file=sys.stderr)
- if output_name is not '/dev/stdout':
+ log_info('output in {name}', name=describe_file_handle(outfp))
+ if not is_block(outfp):
outfp.close()
# finished - print out some diagnostics.
- print('Total number of unique k-mers: {0}'
- .format(htable.n_unique_kmers()),
- file=sys.stderr)
+ log_info('Total number of unique k-mers: {umers}',
+ umers=countgraph.n_unique_kmers())
- if args.savetable:
- print('...saving to ' + args.savetable, file=sys.stderr)
- htable.save(args.savetable)
+ if args.savegraph:
+ log_info('...saving to {name}', name=args.savegraph)
+ countgraph.save(args.savegraph)
fp_rate = \
- khmer.calc_expected_collisions(htable, False, max_false_pos=.8)
+ khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
# for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
- print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
- file=sys.stderr)
+ log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)
if args.force and len(corrupt_files) > 0:
- print("** WARNING: Finished with errors!", file=sys.stderr)
- print("** I/O Errors occurred in the following files:",
- file=sys.stderr)
- print("\t", " ".join(corrupt_files), file=sys.stderr)
+ log_error("** WARNING: Finished with errors!")
+ log_error("** I/O Errors occurred in the following files:")
+ log_error("\t" + " ".join(corrupt_files))
if __name__ == '__main__':
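The get_file_writer calls introduced above all follow one pattern: open
a binary handle, then optionally wrap it for compression. A hypothetical
stand-in using only the standard library (the real helper in khmer.kfile
may differ)::

    import bz2
    import gzip

    def file_writer(fp, do_gzip=False, do_bzip=False):
        # wrap a binary file handle for gzip or bzip2 output on request
        if do_gzip and do_bzip:
            raise ValueError('cannot compress with both gzip and bzip2')
        if do_gzip:
            return gzip.GzipFile(fileobj=fp, mode='wb')
        if do_bzip:
            return bz2.BZ2File(fp, mode='wb')
        return fp

    outfp = file_writer(open('reads.keep', 'wb'), do_gzip=True)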
diff --git a/scripts/partition-graph.py b/scripts/partition-graph.py
index bb03c0b..b322b6f 100755
--- a/scripts/partition-graph.py
+++ b/scripts/partition-graph.py
@@ -26,18 +26,6 @@ import sys
from khmer.khmer_args import (add_threading_args, info)
from khmer.kfile import check_input_files
-# Debugging Support
-import re
-import platform
-if "Linux" == platform.system():
- def __debug_vm_usage(msg):
- print("===> DEBUG: " + msg, file=sys.stderr)
- for vmstat in re.findall(r".*Vm.*", file("/proc/self/status").read()):
- print(vmstat, file=sys.stderr)
-else:
- def __debug_vm_usage(msg): # pylint: disable=unused-argument
- pass
-
# stdlib queue module was renamed on Python 3
try:
import queue
@@ -51,7 +39,7 @@ DEFAULT_N_THREADS = 4
def worker(queue, basename, stop_big_traversals):
while True:
try:
- (htable, index, start, stop) = queue.get(False)
+ (nodegraph, index, start, stop) = queue.get(False)
except queue.Empty:
print('exiting', file=sys.stderr)
return
@@ -65,11 +53,11 @@ def worker(queue, basename, stop_big_traversals):
# pay attention to stoptags when partitioning; take command line
# direction on whether or not to exhaustively traverse.
- subset = htable.do_subset_partition(start, stop, True,
- stop_big_traversals)
+ subset = nodegraph.do_subset_partition(start, stop, True,
+ stop_big_traversals)
print('saving:', basename, index, file=sys.stderr)
- htable.save_subset_partitionmap(subset, outfile)
+ nodegraph.save_subset_partitionmap(subset, outfile)
del subset
gc.collect()
@@ -84,8 +72,8 @@ def get_parser():
"connectivity", epilog=epilog,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('basename', help="basename of the input k-mer presence"
- " table + tagset files")
+ parser.add_argument('basename', help="basename of the input k-mer "
+ "nodegraph + tagset files")
parser.add_argument('--stoptags', '-S', metavar='filename', default='',
help="Use stoptags in this file during partitioning")
parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
@@ -107,7 +95,7 @@ def main():
args = get_parser().parse_args()
basename = args.basename
- filenames = [basename + '.pt', basename + '.tagset']
+ filenames = [basename, basename + '.tagset']
for _ in filenames:
check_input_files(_, args.force)
@@ -118,14 +106,14 @@ def main():
print('stoptag file:', args.stoptags, file=sys.stderr)
print('--', file=sys.stderr)
- print('loading ht %s.pt' % basename, file=sys.stderr)
- htable = khmer.load_hashbits(basename + '.pt')
- htable.load_tagset(basename + '.tagset')
+ print('loading nodegraph %s' % basename, file=sys.stderr)
+ nodegraph = khmer.load_nodegraph(basename)
+ nodegraph.load_tagset(basename + '.tagset')
# do we want to load stop tags, and do they exist?
if args.stoptags:
print('loading stoptags from', args.stoptags, file=sys.stderr)
- htable.load_stop_tags(args.stoptags)
+ nodegraph.load_stop_tags(args.stoptags)
# do we want to exhaustively traverse the graph?
stop_big_traversals = args.no_big_traverse
@@ -141,7 +129,7 @@ def main():
#
# divide the tags up into subsets
- divvy = htable.divide_tags_into_subsets(int(args.subset_size))
+ divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
n_subsets = len(divvy)
divvy.append(0)
@@ -152,7 +140,7 @@ def main():
for _ in range(0, n_subsets):
start = divvy[_]
end = divvy[_ + 1]
- worker_q.put((htable, _, start, end))
+ worker_q.put((nodegraph, _, start, end))
print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))
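The worker/queue structure above is a plain producer-consumer setup:
every subset task is enqueued up front and the worker threads drain the
queue until it reports Empty. In miniature::

    import threading
    try:
        import queue            # Python 3
    except ImportError:
        import Queue as queue   # Python 2

    def worker(q):
        while True:
            try:
                start, stop = q.get(False)   # non-blocking get
            except queue.Empty:
                return                       # queue drained; thread exits
            print('partitioning subset', start, stop)

    q = queue.Queue()
    for bounds in [(0, 10), (10, 20), (20, 30)]:
        q.put(bounds)
    threads = [threading.Thread(target=worker, args=(q,)) for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()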
diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py
index 72b5bed..8e2525d 100755
--- a/scripts/sample-reads-randomly.py
+++ b/scripts/sample-reads-randomly.py
@@ -27,7 +27,8 @@ import textwrap
import sys
import khmer
-from khmer.kfile import check_input_files
+from khmer.kfile import (check_input_files, add_output_compression_type,
+ get_file_writer)
from khmer.khmer_args import info
from khmer.utils import write_record, broken_paired_reader
@@ -69,12 +70,13 @@ def get_parser():
parser.add_argument('--force_single', default=False, action='store_true',
help='Ignore read pair information if present')
parser.add_argument('-o', '--output', dest='output_file',
- metavar='output_file',
- type=argparse.FileType('w'), default=None)
+ type=argparse.FileType('wb'),
+ metavar="filename", default=None)
parser.add_argument('--version', action='version', version='%(prog)s ' +
khmer.__version__)
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
+ add_output_compression_type(parser)
return parser
@@ -94,10 +96,9 @@ def main():
#
# Figure out what the output filename is going to be
- #
- output_file = args.output_file
- if output_file:
+ if args.output_file:
+ output_filename = args.output_file.name
if num_samples > 1:
sys.stderr.write(
"Error: cannot specify -o with more than one sample.")
@@ -105,7 +106,6 @@ def main():
print("NOTE: This can be overridden using the --force"
" argument", file=sys.stderr)
sys.exit(1)
- output_filename = output_file.name
else:
filename = args.filenames[0]
if filename in ('/dev/stdin', '-'):
@@ -166,8 +166,12 @@ def main():
if len(reads) == 1:
print('Writing %d sequences to %s' %
(len(reads[0]), output_filename), file=sys.stderr)
+
+ output_file = args.output_file
if not output_file:
- output_file = open(output_filename, 'w')
+ output_file = open(output_filename, 'wb')
+
+ output_file = get_file_writer(output_file, args.gzip, args.bzip)
for records in reads[0]:
write_record(records[0], output_file)
@@ -178,7 +182,8 @@ def main():
n_filename = output_filename + '.%d' % n
print('Writing %d sequences to %s' %
(len(reads[n]), n_filename), file=sys.stderr)
- output_file = open(n_filename, 'w')
+ output_file = get_file_writer(open(n_filename, 'wb'), args.gzip,
+ args.bzip)
for records in reads[n]:
write_record(records[0], output_file)
if records[1] is not None:
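The hunks above only touch output handling; the sampling itself is the
classic single-pass reservoir technique, which for reference looks like
this (a generic sketch, not the script's exact code)::

    import random

    def reservoir_sample(iterable, n):
        # uniform random sample of n items from a stream of unknown
        # length, using O(n) memory
        sample = []
        for i, item in enumerate(iterable):
            if i < n:
                sample.append(item)
            else:
                j = random.randint(0, i)
                if j < n:
                    sample[j] = item
        return sample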
diff --git a/scripts/split-paired-reads.py b/scripts/split-paired-reads.py
index b77144a..994db33 100755
--- a/scripts/split-paired-reads.py
+++ b/scripts/split-paired-reads.py
@@ -23,10 +23,12 @@ import os
import textwrap
import argparse
import khmer
-from khmer.kfile import check_input_files, check_space
from khmer.khmer_args import info
-from khmer.utils import (write_record, check_is_left, check_is_right,
- broken_paired_reader)
+from khmer.utils import (write_record, broken_paired_reader,
+ UnpairedReadsError)
+from khmer.kfile import (check_input_files, check_space,
+ add_output_compression_type,
+ get_file_writer, is_block, describe_file_handle)
def get_parser():
@@ -36,16 +38,16 @@ def get_parser():
left- and right- reads separated. This reformats the former to the latter.
The directory into which the left- and right- reads are output may be
- specified using :option:`-o`/:option:`--output-dir`. This directory will be
+ specified using :option:`-d`/:option:`--output-dir`. This directory will be
created if it does not already exist.
Alternatively, you can specify the filenames directly with
:option:`-1`/:option:`--output-first` and
:option:`-2`/:option:`--output-second`, which will override the
- :option:`-o`/:option:`--output-dir` setting on a file-specific basis.
+ :option:`-d`/:option:`--output-dir` setting on a file-specific basis.
- :option:`-p`/:option:`--force-paired` will require the input file to
- be properly interleaved; by default, this is not required.
+ :option:`-0`/:option:`--output-orphaned` will allow broken-paired format,
+ and orphaned reads will be saved separately, to the specified file.
Example::
@@ -66,24 +68,25 @@ def get_parser():
parser.add_argument('infile', nargs='?', default='/dev/stdin')
- parser.add_argument('-o', '--output-dir', metavar="output_directory",
+ parser.add_argument('-d', '--output-dir', metavar="output_directory",
dest='output_directory', default='', help='Output '
'split reads to specified directory. Creates '
'directory if necessary')
-
+ parser.add_argument('-0', '--output-orphaned', metavar='output_orphaned',
+ help='Allow "orphaned" reads and extract them to ' +
+ 'this file',
+ type=argparse.FileType('wb'))
parser.add_argument('-1', '--output-first', metavar='output_first',
default=None, help='Output "left" reads to this '
- 'file', type=argparse.FileType('w'))
+ 'file', type=argparse.FileType('wb'))
parser.add_argument('-2', '--output-second', metavar='output_second',
default=None, help='Output "right" reads to this '
- 'file', type=argparse.FileType('w'))
- parser.add_argument('-p', '--force-paired', action='store_true',
- help='Require that reads be interleaved')
-
+ 'file', type=argparse.FileType('wb'))
parser.add_argument('--version', action='version', version='%(prog)s ' +
khmer.__version__)
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
+ add_output_compression_type(parser)
return parser
@@ -97,76 +100,77 @@ def main():
check_input_files(infile, args.force)
check_space(filenames, args.force)
+ basename = os.path.basename(infile)
+
# decide where to put output files - specific directory? or just default?
if infile in ('/dev/stdin', '-'):
if not (args.output_first and args.output_second):
- print("Accepting input from stdin; output filenames must "
- "be provided.", file=sys.stderr)
+ print("Accepting input from stdin; "
+ "output filenames must be provided.", file=sys.stderr)
sys.exit(1)
elif args.output_directory:
if not os.path.exists(args.output_directory):
os.makedirs(args.output_directory)
- out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
- out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
+ out1 = os.path.join(args.output_directory, basename + '.1')
+ out2 = os.path.join(args.output_directory, basename + '.2')
else:
- out1 = os.path.basename(infile) + '.1'
- out2 = os.path.basename(infile) + '.2'
+ out1 = basename + '.1'
+ out2 = basename + '.2'
# OVERRIDE output file locations with -1, -2
if args.output_first:
- fp_out1 = args.output_first
+ fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
out1 = fp_out1.name
else:
# Use default filename created above
- fp_out1 = open(out1, 'w')
+ fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
if args.output_second:
- fp_out2 = args.output_second
+ fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
out2 = fp_out2.name
else:
# Use default filename created above
- fp_out2 = open(out2, 'w')
+ fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)
+
+ # put orphaned reads here, if -0!
+ if args.output_orphaned:
+ fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
+ out0 = describe_file_handle(args.output_orphaned)
counter1 = 0
counter2 = 0
+ counter3 = 0
index = None
screed_iter = screed.open(infile)
# walk through all the reads in broken-paired mode.
- paired_iter = broken_paired_reader(screed_iter)
- for index, is_pair, record1, record2 in paired_iter:
- if index % 10000 == 0:
- print('...', index, file=sys.stderr)
-
- # are we requiring pairs?
- if args.force_paired and not is_pair:
- print('ERROR, %s is not part of a pair' %
- record1.name, file=sys.stderr)
- sys.exit(1)
+ paired_iter = broken_paired_reader(screed_iter,
+ require_paired=not args.output_orphaned)
+
+ try:
+ for index, is_pair, record1, record2 in paired_iter:
+ if index % 10000 == 0:
+ print('...', index, file=sys.stderr)
- if is_pair:
- write_record(record1, fp_out1)
- counter1 += 1
- write_record(record2, fp_out2)
- counter2 += 1
- else:
- name = record1.name
- if check_is_left(name):
+ if is_pair:
write_record(record1, fp_out1)
counter1 += 1
- elif check_is_right(name):
- write_record(record1, fp_out2)
+ write_record(record2, fp_out2)
counter2 += 1
- else:
- print("Unrecognized format for read pair information: %s" %
- name, file=sys.stderr)
- print("Exiting.", file=sys.stderr)
- sys.exit(1)
-
- print("DONE; split %d sequences (%d left, %d right)" %
- (counter1 + counter2, counter1, counter2), file=sys.stderr)
- print("left (/1) reads in %s" % out1, file=sys.stderr)
- print("right (/2) reads in %s" % out2, file=sys.stderr)
+ elif args.output_orphaned:
+ write_record(record1, fp_out0)
+ counter3 += 1
+ except UnpairedReadsError as e:
+ print("Unpaired reads found starting at {name}; exiting".format(
+ name=e.r1.name), file=sys.stderr)
+ sys.exit(1)
+
+ print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
+ (counter1 + counter2 + counter3, counter1, counter2, counter3), file=sys.stderr)
+ print("/1 reads in %s" % out1, file=sys.stderr)
+ print("/2 reads in %s" % out2, file=sys.stderr)
+ if args.output_orphaned:
+ print("orphans in %s" % out0, file=sys.stderr)
if __name__ == '__main__':
main()
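From the caller's side, the new require_paired/UnpairedReadsError flow
reads like this (a sketch; 'reads.fq' is a hypothetical input file)::

    import sys
    import screed
    from khmer.utils import (broken_paired_reader, write_record,
                             UnpairedReadsError)

    paired_iter = broken_paired_reader(screed.open('reads.fq'),
                                       require_paired=True)
    try:
        for index, is_pair, read1, read2 in paired_iter:
            write_record(read1, sys.stdout)
            write_record(read2, sys.stdout)
    except UnpairedReadsError as err:
        sys.exit('unpaired read found at ' + err.r1.name)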
diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py
index ddfc43a..d7d2c9a 100755
--- a/scripts/trim-low-abund.py
+++ b/scripts/trim-low-abund.py
@@ -27,11 +27,12 @@ import argparse
from screed import Record
from khmer import khmer_args
-from khmer.khmer_args import (build_counting_args, info, add_loadhash_args,
- report_on_config, calculate_tablesize)
+from khmer.khmer_args import (build_counting_args, info, add_loadgraph_args,
+ report_on_config, calculate_graphsize)
from khmer.utils import write_record, write_record_pair, broken_paired_reader
-from khmer.kfile import (check_space, check_space_for_hashtable,
- check_valid_file_exists)
+from khmer.kfile import (check_space, check_space_for_graph,
+ check_valid_file_exists, add_output_compression_type,
+ get_file_writer)
DEFAULT_NORMALIZE_LIMIT = 20
DEFAULT_CUTOFF = 2
@@ -83,9 +84,9 @@ def get_parser():
help='base cutoff on this median k-mer abundance',
default=DEFAULT_NORMALIZE_LIMIT)
- parser.add_argument('-o', '--out', metavar="filename",
- type=argparse.FileType('w'),
- default=None, help='only output a single file with '
+ parser.add_argument('-o', '--output', metavar="output_filename",
+ type=argparse.FileType('wb'),
+ help='only output a single file with '
'the specified filename; use a single dash "-" to '
'specify that output should go to STDOUT (the '
'terminal)')
@@ -95,15 +96,16 @@ def get_parser():
help='Only trim low-abundance k-mers from sequences '
'that have high coverage.')
- add_loadhash_args(parser)
- parser.add_argument('-s', '--savetable', metavar="filename", default='',
- help='save the k-mer counting table to disk after all'
+ add_loadgraph_args(parser)
+ parser.add_argument('-s', '--savegraph', metavar="filename", default='',
+ help='save the k-mer countgraph to disk after all '
'reads are loaded.')
# expert options
parser.add_argument('--force', default=False, action='store_true')
parser.add_argument('--ignore-pairs', default=False, action='store_true')
parser.add_argument('--tempdir', '-T', type=str, default='./')
+ add_output_compression_type(parser)
return parser
@@ -125,19 +127,19 @@ def main():
report_on_config(args)
check_valid_file_exists(args.input_filenames)
check_space(args.input_filenames, args.force)
- if args.savetable:
- tablesize = calculate_tablesize(args, 'countgraph')
- check_space_for_hashtable(args.savetable, tablesize, args.force)
+ if args.savegraph:
+ graphsize = calculate_graphsize(args, 'countgraph')
+ check_space_for_graph(args.savegraph, graphsize, args.force)
if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
- and not args.out:
+ and not args.output:
print("Accepting input from stdin; output filename must "
"be provided with -o.", file=sys.stderr)
sys.exit(1)
- if args.loadtable:
- print('loading countgraph from', args.loadtable, file=sys.stderr)
- ct = khmer.load_counting_hash(args.loadtable)
+ if args.loadgraph:
+ print('loading countgraph from', args.loadgraph, file=sys.stderr)
+ ct = khmer.load_countgraph(args.loadgraph)
else:
print('making countgraph', file=sys.stderr)
ct = khmer_args.create_countgraph(args)
@@ -164,10 +166,12 @@ def main():
for filename in args.input_filenames:
pass2filename = os.path.basename(filename) + '.pass2'
pass2filename = os.path.join(tempdir, pass2filename)
- if args.out is None:
- trimfp = open(os.path.basename(filename) + '.abundtrim', 'w')
+ if args.output is None:
+ trimfp = get_file_writer(open(os.path.basename(filename) +
+ '.abundtrim', 'wb'),
+ args.gzip, args.bzip)
else:
- trimfp = args.out
+ trimfp = get_file_writer(args.output, args.gzip, args.bzip)
pass2list.append((filename, pass2filename, trimfp))
@@ -337,10 +341,10 @@ def main():
print('output in *.abundtrim', file=sys.stderr)
- if args.savetable:
- print("Saving k-mer counting table to",
- args.savetable, file=sys.stderr)
- ct.save(args.savetable)
+ if args.savegraph:
+ print("Saving k-mer countgraph to",
+ args.savegraph, file=sys.stderr)
+ ct.save(args.savegraph)
if __name__ == '__main__':
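trim-low-abund.py is a two-pass streaming trimmer: pass one counts
k-mers and trims reads that already look high-coverage, deferring the
rest to a temporary pass-two file; pass two trims the deferred reads
once the counts are final. A toy version of the idea (a Counter stands
in for the countgraph)::

    from collections import Counter

    def trim_low(seq, counts, k, cutoff):
        # cut at the first k-mer below the abundance cutoff
        for i in range(len(seq) - k + 1):
            if counts[seq[i:i + k]] < cutoff:
                return seq[:i + k - 1]
        return seq

    def two_pass_trim(reads, k=4, cutoff=2, coverage=20):
        counts, deferred, kept = Counter(), [], []
        for seq in reads:                        # pass 1
            kmers = [seq[i:i + k] for i in range(len(seq) - k + 1)]
            for km in kmers:
                counts[km] += 1
            median = sorted(counts[km] for km in kmers)[len(kmers) // 2]
            if median >= coverage:               # high coverage: trim now
                kept.append(trim_low(seq, counts, k, cutoff))
            else:
                deferred.append(seq)
        for seq in deferred:                     # pass 2: counts are final
            kept.append(trim_low(seq, counts, k, cutoff))
        return kept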
diff --git a/scripts/unique-kmers.py b/scripts/unique-kmers.py
index ddf854c..9ac283c 100755
--- a/scripts/unique-kmers.py
+++ b/scripts/unique-kmers.py
@@ -26,7 +26,7 @@ import khmer
from khmer.khmer_args import (DEFAULT_K, info, ComboFormatter,
_VersionStdErrAction)
from khmer.utils import write_record
-from oxli.functions import optimal_args_output_gen as output_gen
+from khmer.khmer_args import graphsize_args_report
from khmer import __version__
import screed
@@ -117,11 +117,7 @@ def main():
input_filename = None
for index, input_filename in enumerate(args.input_filenames):
hllcpp = khmer.HLLCounter(args.error_rate, args.ksize)
- for record in screed.open(input_filename):
- seq = record.sequence.upper().replace('N', 'A')
- hllcpp.consume_string(seq)
- if args.stream_out:
- write_record(record, sys.stdout)
+ hllcpp.consume_fasta(input_filename, stream_out=args.stream_out)
cardinality = hllcpp.estimate_cardinality()
print('Estimated number of unique {0}-mers in {1}: {2}'.format(
@@ -138,7 +134,7 @@ def main():
args.ksize, cardinality),
file=sys.stderr)
- to_print = output_gen(cardinality, args.error_rate)
+ to_print = graphsize_args_report(cardinality, args.error_rate)
if args.diagnostics:
print(to_print, file=sys.stderr)
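The loop that was replaced above now runs inside the extension; the
HLLCounter interface it relies on is small. A usage sketch ('reads.fa'
is hypothetical, and stream_out is left at its default here)::

    import khmer

    hll = khmer.HLLCounter(0.01, 20)   # desired error rate, k-mer size
    hll.consume_fasta('reads.fa')      # stream the whole file through HLL
    print(hll.estimate_cardinality())  # approximate # of unique 20-mers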
diff --git a/setup.py b/setup.py
index d69c7b8..c4b3778 100755
--- a/setup.py
+++ b/setup.py
@@ -102,23 +102,26 @@ BZIP2DIR = 'third-party/bzip2'
BUILD_DEPENDS = []
BUILD_DEPENDS.extend(path_join("lib", bn + ".hh") for bn in [
"khmer", "kmer_hash", "hashtable", "counting", "hashbits", "labelhash",
- "hllcounter", "khmer_exception", "read_aligner", "subset", "read_parsers"])
+ "hllcounter", "khmer_exception", "read_aligner", "subset", "read_parsers",
+ "traversal"])
SOURCES = ["khmer/_khmer.cc"]
SOURCES.extend(path_join("lib", bn + ".cc") for bn in [
"read_parsers", "kmer_hash", "hashtable",
"hashbits", "labelhash", "counting", "subset", "read_aligner",
- "hllcounter"])
+ "hllcounter", "traversal"])
SOURCES.extend(path_join("third-party", "smhasher", bn + ".cc") for bn in [
"MurmurHash3"])
-EXTRA_COMPILE_ARGS = ['-O3', ]
+# Don't forget to update lib/Makefile with these flags!
+EXTRA_COMPILE_ARGS = ['-O3', '-std=c++11']
EXTRA_LINK_ARGS = []
if sys.platform == 'darwin':
# force 64bit only builds
- EXTRA_COMPILE_ARGS.extend(['-arch', 'x86_64'])
+ EXTRA_COMPILE_ARGS.extend(['-arch', 'x86_64', '-mmacosx-version-min=10.7',
+ '-stdlib=libc++'])
if check_for_openmp():
EXTRA_COMPILE_ARGS.extend(['-fopenmp'])
@@ -178,9 +181,7 @@ SETUP_METADATA = \
"url": 'https://khmer.readthedocs.org/',
"packages": ['khmer', 'khmer.tests', 'oxli'],
"package_dir": {'khmer.tests': 'tests'},
- "install_requires": ['screed >= 0.9'],
- # testing screed download link
-
+ "install_requires": ['screed >= 0.9', 'bz2file'],
"extras_require": {':python_version=="2.6"': ['argparse>=1.2.1'],
'docs': ['sphinx', 'sphinxcontrib-autoprogram'],
'tests': ['nose >= 1.0']},
diff --git a/tests/khmer_tst_utils.py b/tests/khmer_tst_utils.py
index 13edeb3..ddded56 100644
--- a/tests/khmer_tst_utils.py
+++ b/tests/khmer_tst_utils.py
@@ -125,10 +125,13 @@ def runscript(scriptname, args, in_directory=None,
if in_directory:
os.chdir(in_directory)
+ else:
+ in_directory = cwd
try:
- print('running:', scriptname, 'in:', in_directory)
- print('arguments', sysargs)
+ print('running:', scriptname, 'in:', in_directory, file=oldout)
+ print('arguments', sysargs, file=oldout)
+
status = _runscript(scriptname, sandbox=sandbox)
except nose.SkipTest:
raise
diff --git a/tests/test-data/multi-output.fa b/tests/test-data/multi-output.fa
new file mode 100644
index 0000000..5f56c70
--- /dev/null
+++ b/tests/test-data/multi-output.fa
@@ -0,0 +1,4 @@
+>assembly.1 895:1:37:17593:9954/1
+GCGAGTGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGG
+CCTGACGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACGGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAG
+AGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
diff --git a/tests/test-data/normC20k20.ct b/tests/test-data/normC20k20.ct
index 1400ef9..24a33b2 100644
Binary files a/tests/test-data/normC20k20.ct and b/tests/test-data/normC20k20.ct differ
diff --git a/tests/test-data/overlap.out b/tests/test-data/overlap.out
deleted file mode 100644
index db0a6bb..0000000
--- a/tests/test-data/overlap.out
+++ /dev/null
@@ -1,7 +0,0 @@
-/u/qingpeng/Github/newkhmer/tests/test-data/test-overlap1.fa:
-# of unique kmers:440346
-# of occupied bin:440299
-/u/qingpeng/Github/newkhmer/tests/test-data/test-overlap2.fa:
-# of unique kmers:581866
-# of occupied bin:581783
-# of overlap unique kmers:184849
diff --git a/tests/test-data/paired-broken.fq.badleft b/tests/test-data/paired-broken.fq.badleft
new file mode 100644
index 0000000..7059583
--- /dev/null
+++ b/tests/test-data/paired-broken.fq.badleft
@@ -0,0 +1,8 @@
+@895:1:37:17593:9954
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
+@896:1:37:17593:9954
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
diff --git a/tests/test-data/paired-broken.fq.badright b/tests/test-data/paired-broken.fq.badright
new file mode 100644
index 0000000..a755f3f
--- /dev/null
+++ b/tests/test-data/paired-broken.fq.badright
@@ -0,0 +1,9 @@
+@895:1:37:17593:9954
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
+@896:1:37:17593:9954
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
+
diff --git a/tests/test-data/paired-broken.fq.paired_bad b/tests/test-data/paired-broken.fq.paired_bad
new file mode 100644
index 0000000..c03a340
--- /dev/null
+++ b/tests/test-data/paired-broken.fq.paired_bad
@@ -0,0 +1,16 @@
+@895:1:37:17593:9954/1
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
+@895:1:37:17593:9954/2
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
+@896:1:37:17593:9954/1
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
+@896:1:37:17593:9954/2
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
diff --git a/tests/test-data/test-multi.fa b/tests/test-data/test-multi.fa
new file mode 100644
index 0000000..527ee55
--- /dev/null
+++ b/tests/test-data/test-multi.fa
@@ -0,0 +1,2 @@
+>895:1:37:17593:9954/1
+GCGAGTGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACGGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
diff --git a/tests/test_counting_hash.py b/tests/test_countgraph.py
similarity index 75%
rename from tests/test_counting_hash.py
rename to tests/test_countgraph.py
index 2e21119..c0aec9f 100644
--- a/tests/test_counting_hash.py
+++ b/tests/test_countgraph.py
@@ -40,10 +40,10 @@ def teardown():
utils.cleanup()
-class Test_CountingHash(object):
+class Test_Countgraph(object):
def setup(self):
- self.hi = khmer._CountingHash(12, PRIMES_1m)
+ self.hi = khmer._Countgraph(12, PRIMES_1m)
def test_failed_get(self):
GG = 'G' * 12 # forward_hash: 11184810
@@ -124,7 +124,7 @@ class Test_CountingHash(object):
def test_get_raw_tables():
- ht = khmer.CountingHash(20, 1e5, 4)
+ ht = khmer.Countgraph(20, 1e5, 4)
tables = ht.get_raw_tables()
for size, table in zip(ht.hashsizes(), tables):
@@ -133,7 +133,7 @@ def test_get_raw_tables():
def test_get_raw_tables_view():
- ht = khmer.CountingHash(20, 1e5, 4)
+ ht = khmer.Countgraph(20, 1e5, 4)
tables = ht.get_raw_tables()
for tab in tables:
assert sum(tab.tolist()) == 0
@@ -145,7 +145,7 @@ def test_get_raw_tables_view():
@attr('huge')
def test_toobig():
try:
- ct = khmer.CountingHash(30, 1e13, 1)
+ ct = khmer.Countgraph(30, 1e13, 1)
assert 0, "this should fail"
except MemoryError as err:
print(str(err))
@@ -155,7 +155,7 @@ def test_3_tables():
x = list(PRIMES_1m)
x.append(1000005)
- hi = khmer._CountingHash(12, x)
+ hi = khmer._Countgraph(12, x)
GG = 'G' * 12 # forward_hash: 11184810
assert khmer.forward_hash(GG, 12) == 11184810
@@ -186,7 +186,7 @@ def test_3_tables():
def test_simple_median():
- hi = khmer.CountingHash(6, 1e6, 2)
+ hi = khmer.Countgraph(6, 1e6, 2)
hi.consume("AAAAAA")
(median, average, stddev) = hi.get_median_count("AAAAAA")
@@ -225,7 +225,7 @@ def test_simple_median():
def test_median_too_short():
- hi = khmer.CountingHash(6, 1e6, 2)
+ hi = khmer.Countgraph(6, 1e6, 2)
hi.consume("AAAAAA")
try:
@@ -236,7 +236,7 @@ def test_median_too_short():
def test_median_at_least():
- hi = khmer.CountingHash(6, 1e6, 2)
+ hi = khmer.Countgraph(6, 1e6, 2)
hi.consume("AAAAAA")
assert hi.median_at_least("AAAAAA", 1)
@@ -261,7 +261,7 @@ def test_median_at_least():
def test_median_at_least_single_gt():
K = 20
- hi = khmer.CountingHash(K, 1e6, 2)
+ hi = khmer.Countgraph(K, 1e6, 2)
kmers = ['ATCGATCGATCGATCGATCG',
'GTACGTACGTACGTACGTAC',
@@ -274,7 +274,7 @@ def test_median_at_least_single_gt():
def test_median_at_least_single_lt():
K = 20
- hi = khmer.CountingHash(K, 1e6, 2)
+ hi = khmer.Countgraph(K, 1e6, 2)
kmers = ['ATCGATCGATCGATCGATCG',
'GTACGTACGTACGTACGTAC',
@@ -288,7 +288,7 @@ def test_median_at_least_single_lt():
def test_median_at_least_odd_gt():
# test w/odd number of k-mers
K = 20
- hi = khmer.CountingHash(K, 1e6, 2)
+ hi = khmer.Countgraph(K, 1e6, 2)
seqs = ['ATCGATCGATCGATCGATCGCC',
'GTACGTACGTACGTACGTACCC',
@@ -301,7 +301,7 @@ def test_median_at_least_odd_gt():
def test_median_at_least_odd_lt():
K = 20
- hi = khmer.CountingHash(K, 1e6, 2)
+ hi = khmer.Countgraph(K, 1e6, 2)
seqs = ['ATCGATCGATCGATCGATCGCC',
'GTACGTACGTACGTACGTACCC',
@@ -315,7 +315,7 @@ def test_median_at_least_odd_lt():
# Test median with even number of k-mers
def test_median_at_least_even_gt():
K = 20
- hi = khmer.CountingHash(K, 1e6, 2)
+ hi = khmer.Countgraph(K, 1e6, 2)
seqs = ['ATCGATCGATCGATCGATCGCCC',
'GTACGTACGTACGTACGTACCCC',
@@ -328,7 +328,7 @@ def test_median_at_least_even_gt():
def test_median_at_least_even_lt():
K = 20
- hi = khmer.CountingHash(K, 1e6, 2)
+ hi = khmer.Countgraph(K, 1e6, 2)
seqs = ['ATCGATCGATCGATCGATCGCCC',
'GTACGTACGTACGTACGTACCCC',
@@ -342,7 +342,7 @@ def test_median_at_least_even_lt():
def test_median_at_least_comp():
K = 20
C = 4
- hi = khmer.CountingHash(K, 1e6, 2)
+ hi = khmer.Countgraph(K, 1e6, 2)
seqs = ['ATCGATCGATCGATCGATCGCCC',
'GTACGTACGTACGTACGTACCCC',
@@ -358,7 +358,7 @@ def test_median_at_least_comp():
def test_median_at_least_exception():
- ht = khmer.CountingHash(20, 1e6, 2)
+ ht = khmer.Countgraph(20, 1e6, 2)
try:
ht.median_at_least('ATGGCTGATCGAT', 1)
assert 0, "should have thrown ValueError"
@@ -366,102 +366,8 @@ def test_median_at_least_exception():
pass
-def test_simple_kadian():
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") == 1
-
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- hi.consume("ACTGCTATCTCTAGAcCTATG")
- # ---------------^
- x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
- assert x == 2, x
-
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- hi.consume("ACTGCTATCTCTAGAcCTATG")
- # ---------------^---^
- x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
- assert x == 2
-
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- hi.consume("ACTGCTATCTCTAGtGCTAcG")
- # --------------^^---^
- x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
- assert x == 1, x
-
-
-def test_simple_kadian_2():
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") == 1
-
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- # hi.consume("ACaGCTATCTCTAGAGCTATG")
- hi.consume("ACAGCTATCTCTAGAGCTATG")
- # --^
- x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
- assert x == 2, x
-
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- # hi.consume("ACaGCTATCTCTAGAcCTATG")
- hi.consume("ACAGCTATCTCTAGACCTATG")
- # --^ --^
- x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
- assert x == 1, x
-
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- # hi.consume("ACTGCTATCgCTAGAGCTATG")
- hi.consume("ACTGCTATCGCTAGAGCTATG")
- # --^
- x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
- assert x == 2, x
-
-
-def test_2_kadian():
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 1
-
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- # hi.consume("ACTGCTATCTCTAGAcCTATG")
- hi.consume("ACTGCTATCTCTAGACCTATG")
- # ---------------^
- x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
- assert x == 2, x
-
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- # hi.consume("ACTGCTATCTCTAGAcCTAtG")
- hi.consume("ACTGCTATCTCTAGACCTATG")
- # ---------------^---^
- assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 2
-
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- # hi.consume("ACTGCTATCTCTACtcCTAtG")
- hi.consume("ACTGCTATCTCTACTCCTATG")
- # --------------^^---^
- x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
- assert x == 2, x
-
- hi = khmer.CountingHash(6, 1e6, 2)
- hi.consume("ACTGCTATCTCTAGAGCTATG")
- # hi.consume("ACTGCTgTCTCTACtcCTAtG")
- hi.consume("ACTGCTGTCTCTACTCCTATG")
- # ------^-------^^---^
- x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
- assert x == 1, x
-
-
def test_get_kmer_counts_too_short():
- hi = khmer.CountingHash(6, 1e6, 2)
+ hi = khmer.Countgraph(6, 1e6, 2)
hi.consume("AAAAAA")
counts = hi.get_kmer_counts("A")
@@ -469,7 +375,7 @@ def test_get_kmer_counts_too_short():
def test_get_kmer_hashes_too_short():
- hi = khmer.CountingHash(6, 1e6, 2)
+ hi = khmer.Countgraph(6, 1e6, 2)
hi.consume("AAAAAA")
hashes = hi.get_kmer_hashes("A")
@@ -477,7 +383,7 @@ def test_get_kmer_hashes_too_short():
def test_get_kmers_too_short():
- hi = khmer.CountingHash(6, 1e6, 2)
+ hi = khmer.Countgraph(6, 1e6, 2)
hi.consume("AAAAAA")
kmers = hi.get_kmers("A")
@@ -485,7 +391,7 @@ def test_get_kmers_too_short():
def test_get_kmer_counts():
- hi = khmer.CountingHash(6, 1e6, 2)
+ hi = khmer.Countgraph(6, 1e6, 2)
hi.consume("AAAAAA")
counts = hi.get_kmer_counts("AAAAAA")
@@ -522,7 +428,7 @@ def test_get_kmer_counts():
def test_get_kmer_hashes():
- hi = khmer.CountingHash(6, 1e6, 2)
+ hi = khmer.Countgraph(6, 1e6, 2)
hi.consume("AAAAAA")
hashes = hi.get_kmer_hashes("AAAAAA")
@@ -559,7 +465,7 @@ def test_get_kmer_hashes():
def test_get_kmers():
- hi = khmer.CountingHash(6, 1e6, 2)
+ hi = khmer.Countgraph(6, 1e6, 2)
kmers = hi.get_kmers("AAAAAA")
assert kmers == ["AAAAAA"]
@@ -576,11 +482,11 @@ def test_save_load_large():
sizes = khmer.get_n_primes_near_x(1, 2 ** 31 + 1000)
- orig = khmer._CountingHash(12, sizes)
+ orig = khmer._Countgraph(12, sizes)
orig.consume_fasta(inpath)
orig.save(savepath)
- loaded = khmer.load_counting_hash(savepath)
+ loaded = khmer.load_countgraph(savepath)
orig_count = orig.n_occupied()
loaded_count = loaded.n_occupied()
@@ -591,6 +497,27 @@ def test_save_load_large():
do_test(ctfile)
+def test_save_load_occupied():
+ def do_test(ctfile):
+ print('working with', ctfile)
+ inpath = utils.get_test_data('random-20-a.fa')
+ savepath = utils.get_temp_filename(ctfile)
+
+ orig = khmer.Countgraph(12, 1e5, 4)
+ orig.consume_fasta(inpath)
+ orig.save(savepath)
+
+ loaded = khmer.load_countgraph(savepath)
+
+ orig_count = orig.n_occupied()
+ loaded_count = loaded.n_occupied()
+ assert orig_count == 3886, orig_count
+ assert loaded_count == orig_count, loaded_count
+
+ for ctfile in ['temp.ct', 'temp.ct.gz']:
+ do_test(ctfile)
+
+
def test_save_load():
inpath = utils.get_test_data('random-20-a.fa')
savepath = utils.get_temp_filename('tempcountingsave0.ht')
@@ -598,20 +525,20 @@ def test_save_load():
sizes = list(PRIMES_1m)
sizes.append(1000005)
- hi = khmer._CountingHash(12, sizes)
+ hi = khmer._Countgraph(12, sizes)
hi.consume_fasta(inpath)
hi.save(savepath)
- ht = khmer._CountingHash(12, sizes)
+ ht = khmer._Countgraph(12, sizes)
try:
ht.load(savepath)
except OSError as err:
assert 0, 'Should not produce an OSError: ' + str(err)
- tracking = khmer._Hashbits(12, sizes)
+ tracking = khmer._Nodegraph(12, sizes)
x = hi.abundance_distribution(inpath, tracking)
- tracking = khmer._Hashbits(12, sizes)
+ tracking = khmer._Nodegraph(12, sizes)
y = ht.abundance_distribution(inpath, tracking)
assert sum(x) == 3966, sum(x)
@@ -625,7 +552,7 @@ def test_load_truncated():
sizes = khmer.get_n_primes_near_x(3, 200)
- hi = khmer._CountingHash(12, sizes)
+ hi = khmer._Countgraph(12, sizes)
hi.consume_fasta(inpath)
hi.save(savepath)
@@ -636,7 +563,7 @@ def test_load_truncated():
fp.close()
try:
- ht = khmer.load_counting_hash(truncpath)
+ ht = khmer.load_countgraph(truncpath)
assert 0, "this should not be reached!"
except OSError as err:
print(str(err))
@@ -652,7 +579,7 @@ def test_load_gz():
sizes.append(1000005)
# save uncompressed hashtable.
- hi = khmer._CountingHash(12, sizes)
+ hi = khmer._Countgraph(12, sizes)
hi.consume_fasta(inpath)
hi.save(savepath)
@@ -664,16 +591,16 @@ def test_load_gz():
in_file.close()
# load compressed hashtable.
- ht = khmer._CountingHash(12, sizes)
+ ht = khmer._Countgraph(12, sizes)
try:
ht.load(loadpath)
except OSError as err:
assert 0, "Should not produce an OSError: " + str(err)
- tracking = khmer._Hashbits(12, sizes)
+ tracking = khmer._Nodegraph(12, sizes)
x = hi.abundance_distribution(inpath, tracking)
- tracking = khmer._Hashbits(12, sizes)
+ tracking = khmer._Nodegraph(12, sizes)
y = ht.abundance_distribution(inpath, tracking)
assert sum(x) == 3966, sum(x)
@@ -687,20 +614,20 @@ def test_save_load_gz():
sizes = list(PRIMES_1m)
sizes.append(1000005)
- hi = khmer._CountingHash(12, sizes)
+ hi = khmer._Countgraph(12, sizes)
hi.consume_fasta(inpath)
hi.save(savepath)
- ht = khmer._CountingHash(12, sizes)
+ ht = khmer._Countgraph(12, sizes)
try:
ht.load(savepath)
except OSError as err:
assert 0, 'Should not produce an OSError: ' + str(err)
- tracking = khmer._Hashbits(12, sizes)
+ tracking = khmer._Nodegraph(12, sizes)
x = hi.abundance_distribution(inpath, tracking)
- tracking = khmer._Hashbits(12, sizes)
+ tracking = khmer._Nodegraph(12, sizes)
y = ht.abundance_distribution(inpath, tracking)
assert sum(x) == 3966, sum(x)
@@ -710,7 +637,7 @@ def test_save_load_gz():
def test_load_empty_files():
def do_load_ct(fname):
with assert_raises(OSError):
- ct = khmer.load_counting_hash(fname)
+ ct = khmer.load_countgraph(fname)
# Check empty files, compressed or not
for ext in ['', '.gz']:
@@ -719,7 +646,7 @@ def test_load_empty_files():
def test_trim_full():
- hi = khmer.CountingHash(6, 1e6, 2)
+ hi = khmer.Countgraph(6, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA)
@@ -729,7 +656,7 @@ def test_trim_full():
def test_trim_short():
- hi = khmer.CountingHash(6, 1e6, 2)
+ hi = khmer.Countgraph(6, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA[:50])
@@ -741,7 +668,7 @@ def test_trim_short():
def test_find_spectral_error_positions_1():
- hi = khmer.CountingHash(8, 1e6, 2)
+ hi = khmer.Countgraph(8, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA[:30])
@@ -754,7 +681,7 @@ def test_find_spectral_error_positions_1():
def test_find_spectral_error_positions_2():
- hi = khmer.CountingHash(8, 1e6, 2)
+ hi = khmer.Countgraph(8, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA)
@@ -764,7 +691,7 @@ def test_find_spectral_error_positions_2():
def test_find_spectral_error_positions_6():
- hi = khmer.CountingHash(8, 1e6, 2)
+ hi = khmer.Countgraph(8, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA[1:])
@@ -777,7 +704,7 @@ def test_find_spectral_error_positions_6():
def test_find_spectral_error_positions_4():
- hi = khmer.CountingHash(8, 1e6, 2)
+ hi = khmer.Countgraph(8, 1e6, 2)
hi.consume(DNA)
@@ -786,7 +713,7 @@ def test_find_spectral_error_positions_4():
def test_find_spectral_error_positions_5():
- hi = khmer.CountingHash(8, 1e6, 2)
+ hi = khmer.Countgraph(8, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA[:10])
@@ -798,7 +725,7 @@ def test_find_spectral_error_positions_5():
def test_find_spectral_error_locs7():
K = 8
- hi = khmer.CountingHash(K, 1e6, 2)
+ hi = khmer.Countgraph(K, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA[K:])
@@ -811,7 +738,7 @@ def test_find_spectral_error_locs7():
def test_find_spectral_error_positions_err():
- hi = khmer.CountingHash(8, 1e6, 2)
+ hi = khmer.Countgraph(8, 1e6, 2)
try:
posns = hi.find_spectral_error_positions(DNA[:6], 1)
@@ -828,7 +755,7 @@ def test_find_spectral_error_positions_err():
def test_maxcount():
# hashtable should saturate at some point so as not to overflow counter
- kh = khmer.CountingHash(4, 4 ** 4, 4)
+ kh = khmer.Countgraph(4, 4 ** 4, 4)
kh.set_use_bigcount(False)
last_count = None
@@ -846,7 +773,7 @@ def test_maxcount():
def test_maxcount_with_bigcount():
# hashtable should not saturate, if use_bigcount is set.
- kh = khmer.CountingHash(4, 4 ** 4, 4)
+ kh = khmer.Countgraph(4, 4 ** 4, 4)
kh.set_use_bigcount(True)
last_count = None
@@ -864,7 +791,7 @@ def test_maxcount_with_bigcount():
def test_maxcount_with_bigcount_save():
# hashtable should not saturate, if use_bigcount is set.
- kh = khmer.CountingHash(4, 4 ** 4, 4)
+ kh = khmer.Countgraph(4, 4 ** 4, 4)
kh.set_use_bigcount(True)
for i in range(0, 1000):
@@ -874,7 +801,7 @@ def test_maxcount_with_bigcount_save():
savepath = utils.get_temp_filename('tempcountingsave.ht')
kh.save(savepath)
- kh = khmer.CountingHash(1, 1, 1)
+ kh = khmer.Countgraph(1, 1, 1)
try:
kh.load(savepath)
except OSError as err:
@@ -887,13 +814,13 @@ def test_maxcount_with_bigcount_save():
def test_bigcount_save():
# hashtable should not saturate, if use_bigcount is set.
- kh = khmer.CountingHash(4, 4 ** 4, 4)
+ kh = khmer.Countgraph(4, 4 ** 4, 4)
kh.set_use_bigcount(True)
savepath = utils.get_temp_filename('tempcountingsave.ht')
kh.save(savepath)
- kh = khmer.CountingHash(1, 1, 1)
+ kh = khmer.Countgraph(1, 1, 1)
try:
kh.load(savepath)
except OSError as err:
@@ -911,13 +838,13 @@ def test_bigcount_save():
def test_nobigcount_save():
- kh = khmer.CountingHash(4, 4 ** 4, 4)
+ kh = khmer.Countgraph(4, 4 ** 4, 4)
# kh.set_use_bigcount(False) <-- this is the default
savepath = utils.get_temp_filename('tempcountingsave.ht')
kh.save(savepath)
- kh = khmer.CountingHash(1, 1, 1)
+ kh = khmer.Countgraph(1, 1, 1)
try:
kh.load(savepath)
except OSError as err:
@@ -935,8 +862,8 @@ def test_nobigcount_save():
def test_bigcount_abund_dist():
- kh = khmer.CountingHash(18, 1e2, 4)
- tracking = khmer.Hashbits(18, 1e2, 4)
+ kh = khmer.Countgraph(18, 1e2, 4)
+ tracking = khmer.Nodegraph(18, 1e2, 4)
kh.set_use_bigcount(True)
seqpath = utils.get_test_data('test-abund-read-2.fa')
@@ -951,8 +878,8 @@ def test_bigcount_abund_dist():
def test_bigcount_abund_dist_2():
- kh = khmer.CountingHash(18, 1e7, 4)
- tracking = khmer.Hashbits(18, 1e7, 4)
+ kh = khmer.Countgraph(18, 1e7, 4)
+ tracking = khmer.Nodegraph(18, 1e7, 4)
kh.set_use_bigcount(True)
seqpath = utils.get_test_data('test-abund-read.fa')
@@ -969,7 +896,7 @@ def test_bigcount_abund_dist_2():
def test_bigcount_overflow():
- kh = khmer.CountingHash(18, 1e7, 4)
+ kh = khmer.Countgraph(18, 1e7, 4)
kh.set_use_bigcount(True)
for i in range(0, 70000):
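These bigcount tests pin down one behavior worth spelling out: without
bigcount a k-mer's count saturates at 255; with it, counting continues.
A minimal sketch (count/get usage assumed from the surrounding tests)::

    import khmer

    kh = khmer.Countgraph(4, 4 ** 4, 4)
    kh.set_use_bigcount(True)
    for _ in range(300):
        kh.count('AAAA')
    print(kh.get('AAAA'))   # 300 -- past the 255 ceiling of plain counts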
@@ -979,12 +906,12 @@ def test_bigcount_overflow():
def test_get_ksize():
- kh = khmer.CountingHash(22, 1, 1)
+ kh = khmer.Countgraph(22, 1, 1)
assert kh.ksize() == 22
def test_get_hashsizes():
- kh = khmer.CountingHash(22, 100, 4)
+ kh = khmer.Countgraph(22, 100, 4)
# Py2/3 hack, longify converts to long in py2, remove once py2 isn't
# supported any longer.
expected = utils.longify([97, 89, 83, 79])
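The expected list in test_get_hashsizes encodes how table sizes are chosen:
the requested size is an upper bound, and each of the N tables gets a distinct
prime just below it. A quick check mirroring the test:

    import khmer

    kh = khmer.Countgraph(22, 100, 4)
    assert kh.ksize() == 22
    # four distinct primes just below the requested table size of 100
    # (plain ints on Python 3; longs on Python 2, hence utils.longify)
    assert kh.hashsizes() == [97, 89, 83, 79]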
@@ -994,14 +921,14 @@ def test_get_hashsizes():
# def test_collect_high_abundance_kmers():
# seqpath = utils.get_test_data('test-abund-read-2.fa')
#
-# kh = khmer.CountingHash(18, 1e6, 4)
+# kh = khmer.Countgraph(18, 1e6, 4)
# hb = kh.collect_high_abundance_kmers(seqpath, 2, 4)
def test_load_notexist_should_fail():
savepath = utils.get_temp_filename('tempcountingsave0.ht')
- hi = khmer.CountingHash(12, 1000, 2)
+ hi = khmer.Countgraph(12, 1000, 2)
try:
hi.load(savepath)
assert 0, "load should fail"
@@ -1013,7 +940,7 @@ def test_load_truncated_should_fail():
inpath = utils.get_test_data('random-20-a.fa')
savepath = utils.get_temp_filename('tempcountingsave0.ht')
- hi = khmer.CountingHash(12, 1000, 2)
+ hi = khmer.Countgraph(12, 1000, 2)
hi.consume_fasta(inpath)
hi.save(savepath)
@@ -1025,7 +952,7 @@ def test_load_truncated_should_fail():
fp.write(data[:1000])
fp.close()
- hi = khmer._CountingHash(12, [1])
+ hi = khmer._Countgraph(12, [1])
try:
hi.load(savepath)
assert 0, "load should fail"
@@ -1036,7 +963,7 @@ def test_load_truncated_should_fail():
def test_load_gz_notexist_should_fail():
savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')
- hi = khmer.CountingHash(12, 1000, 2)
+ hi = khmer.Countgraph(12, 1000, 2)
try:
hi.load(savepath)
assert 0, "load should fail"
@@ -1048,7 +975,7 @@ def test_load_gz_truncated_should_fail():
inpath = utils.get_test_data('random-20-a.fa')
savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')
- hi = khmer.CountingHash(12, 1000, 2)
+ hi = khmer.Countgraph(12, 1000, 2)
hi.consume_fasta(inpath)
hi.save(savepath)
@@ -1060,7 +987,7 @@ def test_load_gz_truncated_should_fail():
fp.write(data[:1000])
fp.close()
- hi = khmer._CountingHash(12, [1])
+ hi = khmer._Countgraph(12, [1])
try:
hi.load(savepath)
assert 0, "load should fail"
@@ -1069,7 +996,7 @@ def test_load_gz_truncated_should_fail():
def test_counting_file_version_check():
- ht = khmer.CountingHash(12, 1, 1)
+ ht = khmer.Countgraph(12, 1, 1)
inpath = utils.get_test_data('badversion-k12.ct')
@@ -1081,7 +1008,7 @@ def test_counting_file_version_check():
def test_counting_gz_file_version_check():
- ht = khmer.CountingHash(12, 1, 1)
+ ht = khmer.Countgraph(12, 1, 1)
inpath = utils.get_test_data('badversion-k12.ct.gz')
@@ -1095,7 +1022,7 @@ def test_counting_gz_file_version_check():
def test_counting_file_type_check():
inpath = utils.get_test_data('goodversion-k12.ht')
- kh = khmer.CountingHash(12, 1, 1)
+ kh = khmer.Countgraph(12, 1, 1)
try:
kh.load(inpath)
@@ -1105,11 +1032,11 @@ def test_counting_file_type_check():
def test_counting_gz_file_type_check():
- ht = khmer.Hashbits(12, 1, 1)
+ ht = khmer.Nodegraph(12, 1, 1)
inpath = utils.get_test_data('goodversion-k12.ht.gz')
- kh = khmer.CountingHash(12, 1, 1)
+ kh = khmer.Countgraph(12, 1, 1)
try:
kh.load(inpath)
@@ -1120,42 +1047,42 @@ def test_counting_gz_file_type_check():
def test_counting_bad_primes_list():
try:
- ht = khmer._CountingHash(12, ["a", "b", "c"], 1)
+ ht = khmer._Countgraph(12, ["a", "b", "c"], 1)
assert 0, "bad list of primes should fail"
except TypeError as e:
print(str(e))
def test_bad_use_bigcount():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
- countingtable.set_use_bigcount(True)
- assert countingtable.get_use_bigcount()
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
+ countgraph.set_use_bigcount(True)
+ assert countgraph.get_use_bigcount()
try:
- countingtable.get_use_bigcount(True)
+ countgraph.get_use_bigcount(True)
assert 0, "this should fail"
except TypeError as err:
print(str(err))
def test_consume_absentfasta():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.consume_fasta("absent_file.fa")
+ countgraph.consume_fasta("absent_file.fa")
assert 0, "This should fail"
except OSError as err:
print(str(err))
def test_consume_absentfasta_with_reads_parser():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.consume_fasta_with_reads_parser()
+ countgraph.consume_fasta_with_reads_parser()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
try:
readparser = ReadParser(utils.get_test_data('empty-file'))
- countingtable.consume_fasta_with_reads_parser(readparser)
+ countgraph.consume_fasta_with_reads_parser(readparser)
assert 0, "this should fail"
except OSError as err:
print(str(err))
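consume_fasta_with_reads_parser() takes a ReadParser instance rather than a
filename, which is why the bare call fails with TypeError and the empty-file
parser with OSError. For reference, a sketch of the normal calling pattern,
with a placeholder input path:

    import khmer
    from khmer import ReadParser

    # 'reads.fa' stands in for any FASTA/FASTQ file
    parser = ReadParser('reads.fa')
    countgraph = khmer.Countgraph(20, 1e6, 4)
    countgraph.consume_fasta_with_reads_parser(parser)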
@@ -1164,189 +1091,175 @@ def test_consume_absentfasta_with_reads_parser():
def test_badconsume():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.consume()
+ countgraph.consume()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
try:
- countingtable.consume("AAA")
+ countgraph.consume("AAA")
assert 0, "this should fail"
except ValueError as err:
print(str(err))
def test_get_badmin_count():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.get_min_count()
+ countgraph.get_min_count()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
try:
- countingtable.get_min_count("AAA")
+ countgraph.get_min_count("AAA")
assert 0, "this should fail"
except ValueError as err:
print(str(err))
def test_get_badmax_count():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.get_max_count()
+ countgraph.get_max_count()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
try:
- countingtable.get_max_count("AAA")
+ countgraph.get_max_count("AAA")
assert 0, "this should fail"
except ValueError as err:
print(str(err))
def test_get_badmedian_count():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
- try:
- countingtable.get_median_count()
- assert 0, "this should fail"
- except TypeError as err:
- print(str(err))
- try:
- countingtable.get_median_count("AAA")
- assert 0, "this should fail"
- except ValueError as err:
- print(str(err))
-
-
-def test_get_badkadian_count():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.get_kadian_count()
+ countgraph.get_median_count()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
try:
- countingtable.get_kadian_count("AAA")
+ countgraph.get_median_count("AAA")
assert 0, "this should fail"
except ValueError as err:
print(str(err))
def test_badget():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.get()
+ countgraph.get()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
def test_badget_2():
- countingtable = khmer.CountingHash(6, 1e6, 2)
+ countgraph = khmer.Countgraph(6, 1e6, 2)
- countingtable.consume(DNA)
+ countgraph.consume(DNA)
- assert countingtable.get("AGCTTT") == 1
+ assert countgraph.get("AGCTTT") == 1
- assert countingtable.get("GATGAG") == 0
+ assert countgraph.get("GATGAG") == 0
try:
- countingtable.get("AGCTT")
+ countgraph.get("AGCTT")
assert 0, "this should fail"
except ValueError as err:
print(str(err))
def test_badtrim():
- countingtable = khmer.CountingHash(6, 1e6, 2)
+ countgraph = khmer.Countgraph(6, 1e6, 2)
- countingtable.consume(DNA)
+ countgraph.consume(DNA)
try:
- countingtable.trim_on_abundance()
+ countgraph.trim_on_abundance()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
- countingtable.trim_on_abundance("AAAAAA", 1)
+ countgraph.trim_on_abundance("AAAAAA", 1)
def test_badfasta_count_kmers_by_position():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.fasta_count_kmers_by_position()
+ countgraph.fasta_count_kmers_by_position()
except TypeError as err:
print(str(err))
filename = utils.get_test_data("test-short.fa")
try:
- countingtable.fasta_count_kmers_by_position(filename, -1, 0)
+ countgraph.fasta_count_kmers_by_position(filename, -1, 0)
assert 0, "this should fail"
except ValueError as err:
print(str(err))
try:
- countingtable.fasta_count_kmers_by_position(filename, 0, -1)
+ countgraph.fasta_count_kmers_by_position(filename, 0, -1)
assert 0, "this should fail"
except ValueError as err:
print(str(err))
def test_badload():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.load()
+ countgraph.load()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
def test_badsave():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.save()
+ countgraph.save()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
def test_badksize():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.ksize(True)
+ countgraph.ksize(True)
assert 0, "this should fail"
except TypeError as err:
print(str(err))
def test_badhashsizes():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.hashsizes(True)
+ countgraph.hashsizes(True)
assert 0, "this should fail"
except TypeError as err:
print(str(err))
def test_badconsume_and_tag():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.consume_and_tag()
+ countgraph.consume_and_tag()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
def test_consume_fasta_and_tag():
- countingtable = khmer.CountingHash(4, 4 ** 4, 4)
+ countgraph = khmer.Countgraph(4, 4 ** 4, 4)
try:
- countingtable.consume_fasta_and_tag()
+ countgraph.consume_fasta_and_tag()
assert 0, "this should fail"
except TypeError as err:
print(str(err))
- countingtable.consume_fasta_and_tag(utils.get_test_data("test-graph2.fa"))
+ countgraph.consume_fasta_and_tag(utils.get_test_data("test-graph2.fa"))
def test_consume_and_retrieve_tags_1():
- ct = khmer.CountingHash(4, 4 ** 4, 4)
+ ct = khmer.Countgraph(4, 4 ** 4, 4)
# first, for each sequence, build tags.
for record in screed.open(utils.get_test_data('test-graph2.fa')):
@@ -1369,7 +1282,7 @@ def test_consume_and_retrieve_tags_1():
def test_consume_and_retrieve_tags_empty():
- ct = khmer.CountingHash(4, 4 ** 4, 4)
+ ct = khmer.Countgraph(4, 4 ** 4, 4)
# load each sequence but do not build tags - everything should be empty.
for record in screed.open(utils.get_test_data('test-graph2.fa')):
@@ -1393,7 +1306,7 @@ def test_consume_and_retrieve_tags_empty():
def test_find_all_tags_list_error():
- ct = khmer.CountingHash(4, 4 ** 4, 4)
+ ct = khmer.Countgraph(4, 4 ** 4, 4)
# load each sequence but do not build tags - everything should be empty.
for record in screed.open(utils.get_test_data('test-graph2.fa')):
@@ -1415,25 +1328,65 @@ def test_find_all_tags_list_error():
def test_abund_dist_gz_bigcount():
infile = utils.get_temp_filename('test.fa')
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- outfile = utils.get_temp_filename('test_ct.gz')
script = 'load-into-counting.py'
htfile = utils.get_temp_filename('test_ct')
args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
utils.runscript(script, args) # create a bigcount table
assert os.path.exists(htfile)
data = open(htfile, 'rb').read()
+
+ outfile = utils.get_temp_filename('test_ct.gz')
f_out = gzip.open(outfile, 'wb') # compress the created bigcount table
f_out.write(data)
f_out.close()
# load the compressed bigcount table
try:
- counting_hash = khmer.load_counting_hash(outfile)
+ countgraph = khmer.load_countgraph(outfile)
+ except OSError as err:
+ assert 0, 'Should not produce OSError: ' + str(err)
+
+ assert countgraph.n_occupied() != 0
+ hashsizes = countgraph.hashsizes()
+ kmer_size = countgraph.ksize()
+ tracking = khmer._Nodegraph(kmer_size, hashsizes)
+ abundances = countgraph.abundance_distribution(infile, tracking)
+ # calculate abundance distribution for compressed bigcount table
+ flag = False
+ # check if abundance is > 255
+ # if so, the gzipped bigcount table was loaded correctly
+ for _, i in enumerate(abundances):
+ print(_, i)
+ if _ > 255 and i > 0:
+ flag = True
+ break
+ assert flag
+
+
+def test_abund_dist_gz_bigcount_compressed_first():
+ infile = utils.get_temp_filename('test.fa')
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
+ script = 'load-into-counting.py'
+ htfile = utils.get_temp_filename('test_ct.gz')
+ args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
+ utils.runscript(script, args) # create a bigcount table
+ assert os.path.exists(htfile)
+ data = gzip.open(htfile, 'rb').read() # read compressed bigcount table
+
+ outfile = utils.get_temp_filename('test_ct')
+ f_out = open(outfile, 'wb') # output the bigcount table
+ f_out.write(data)
+ f_out.close()
+ # load the compressed bigcount table
+ try:
+ countgraph = khmer.load_countgraph(outfile)
except OSError as err:
assert 0, 'Should not produce OSError: ' + str(err)
- hashsizes = counting_hash.hashsizes()
- kmer_size = counting_hash.ksize()
- tracking = khmer._Hashbits(kmer_size, hashsizes)
- abundances = counting_hash.abundance_distribution(infile, tracking)
+
+ assert countgraph.n_occupied() != 0
+ hashsizes = countgraph.hashsizes()
+ kmer_size = countgraph.ksize()
+ tracking = khmer._Nodegraph(kmer_size, hashsizes)
+ abundances = countgraph.abundance_distribution(infile, tracking)
# calculate abundance distribution for compressed bigcount table
flag = False
# check if abundance is > 255
@@ -1447,7 +1400,7 @@ def test_abund_dist_gz_bigcount():
def test_counting_load_bigcount():
- count_table = khmer.CountingHash(10, 1e5, 4)
+ count_table = khmer.Countgraph(10, 1e5, 4)
count_table.set_use_bigcount(True)
for i in range(500):
print(i, count_table.count('ATATATATAT'))
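The two gz round-trip tests above check both directions of the same property:
load_countgraph() (the renamed load_counting_hash()) reads gzip-compressed
tables as readily as plain ones, whichever form the table was written in
first. A minimal sketch with placeholder paths:

    import gzip
    import shutil

    import khmer

    kh = khmer.Countgraph(12, 1e6, 4)
    kh.consume('ATGGAGAGACACAGATAGACAGGAGTGG')
    kh.save('table.ct')                         # plain save

    # compress the saved table after the fact
    with open('table.ct', 'rb') as f_in:
        with gzip.open('table.ct.gz', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    kh2 = khmer.load_countgraph('table.ct.gz')  # loads despite compression
    assert kh2.ksize() == 12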
diff --git a/tests/test_counting_single.py b/tests/test_counting_single.py
index b7256c5..8a6e5b0 100644
--- a/tests/test_counting_single.py
+++ b/tests/test_counting_single.py
@@ -16,27 +16,17 @@ from nose.plugins.attrib import attr
MAX_COUNT = 255
-def test_no_collision():
- kh = khmer._CountingHash(4, [5])
-
- kh.count('AAAA')
- assert kh.get('AAAA') == 1
-
- kh.count('TTTT') # reverse complement
- assert kh.get('TTTT') == 2
-
-
@attr('huge')
def test_toobig():
try:
- ct = khmer.CountingHash(4, 1000000000000, 1)
+ ct = khmer.Countgraph(4, 1000000000000, 1)
assert 0, "this should fail"
except MemoryError as err:
print(str(err))
def test_collision():
- kh = khmer._CountingHash(4, [5])
+ kh = khmer._Countgraph(4, [5])
kh.count('AAAA')
assert kh.get('AAAA') == 1
@@ -46,32 +36,25 @@ def test_collision():
def test_badcount():
- countingtable = khmer._CountingHash(4, [5])
+ countgraph = khmer._Countgraph(4, [5])
try:
- countingtable.count()
+ countgraph.count()
assert 0, "count should require one argument"
except TypeError as err:
print(str(err))
try:
- countingtable.count('ABCDE')
+ countgraph.count('ABCDE')
assert 0, "count should require k-mer size to be equal"
except ValueError as err:
print(str(err))
-def test_hashtable_n_entries():
- countingtable = khmer._CountingHash(4, [5])
- try:
- countingtable.n_entries("nope")
- assert 0, "n_entries should accept no arguments"
- except TypeError as err:
- print(str(err))
-
-
def test_complete_no_collision():
- kh = khmer._CountingHash(4, [4 ** 4])
+ kh = khmer._Countgraph(4, [4 ** 4])
+
+ n_entries = kh.hashsizes()[0]
- for i in range(0, kh.n_entries()):
+ for i in range(0, n_entries):
s = khmer.reverse_hash(i, 4)
kh.count(s)
@@ -79,7 +62,7 @@ def test_complete_no_collision():
n_rc_filled = 0
n_fwd_filled = 0
- for i in range(0, kh.n_entries()):
+ for i in range(0, n_entries):
s = khmer.reverse_hash(i, 4)
if kh.get(s): # string hashing is rc aware
n_rc_filled += 1
@@ -88,16 +71,17 @@ def test_complete_no_collision():
if kh.get(i): # int hashing is not rc aware
n_fwd_filled += 1
- assert n_rc_filled == kh.n_entries(), n_rc_filled
+ assert n_rc_filled == n_entries, n_rc_filled
assert n_palindromes == 16, n_palindromes
- assert n_fwd_filled == kh.n_entries() // 2 + n_palindromes // 2, \
+ assert n_fwd_filled == n_entries // 2 + n_palindromes // 2, \
n_fwd_filled
def test_complete_2_collision():
- kh = khmer._CountingHash(4, [5])
+ kh = khmer._Countgraph(4, [5])
- for i in range(0, kh.n_entries()):
+ n_entries = kh.hashsizes()[0]
+ for i in range(0, n_entries):
s = khmer.reverse_hash(i, 4)
kh.count(s)
@@ -112,13 +96,14 @@ def test_complete_2_collision():
# n_fwd_filled += 1
assert n_rc_filled == 128, n_rc_filled
- # @CTB assert n_fwd_filled == 100 # kt.n_entries() / 2, n_fwd_filled
def test_complete_4_collision():
- kh = khmer._CountingHash(4, [3])
+ kh = khmer._Countgraph(4, [3])
- for i in range(0, kh.n_entries()):
+ n_entries = kh.hashsizes()[0]
+
+ for i in range(0, n_entries):
s = khmer.reverse_hash(i, 4)
kh.count(s)
@@ -133,12 +118,11 @@ def test_complete_4_collision():
# n_fwd_filled += 1
assert n_rc_filled == 64, n_rc_filled
- # @CTB assert n_fwd_filled == kt.n_entries() / 2, n_fwd_filled
def test_maxcount():
# hashtable should saturate at some point so as not to overflow the counter
- kh = khmer._CountingHash(4, [5])
+ kh = khmer._Countgraph(4, [5])
last_count = None
for _ in range(0, 10000):
@@ -156,7 +140,7 @@ def test_maxcount():
def test_maxcount_with_bigcount():
# hashtable should not saturate if use_bigcount is set.
- kh = khmer._CountingHash(4, [5])
+ kh = khmer._Countgraph(4, [5])
kh.set_use_bigcount(True)
last_count = None
@@ -174,7 +158,7 @@ def test_maxcount_with_bigcount():
def test_consume_uniqify_first():
- kh = khmer._CountingHash(4, [5])
+ kh = khmer._Countgraph(4, [5])
s = "TTTT"
s_rc = "AAAA"
@@ -186,7 +170,7 @@ def test_consume_uniqify_first():
def test_maxcount_consume():
# hashtable should saturate at some point so as not to overflow the counter
- kh = khmer._CountingHash(4, [5])
+ kh = khmer._Countgraph(4, [5])
s = "A" * 10000
kh.consume(s)
@@ -197,7 +181,7 @@ def test_maxcount_consume():
def test_maxcount_consume_with_bigcount():
# use the bigcount hack to avoid saturating the hashtable.
- kh = khmer._CountingHash(4, [5])
+ kh = khmer._Countgraph(4, [5])
kh.set_use_bigcount(True)
s = "A" * 10000
@@ -208,7 +192,7 @@ def test_maxcount_consume_with_bigcount():
def test_get_mincount():
- kh = khmer._CountingHash(4, [5])
+ kh = khmer._Countgraph(4, [5])
s = "AAAAACGT"
kh.consume(s)
@@ -222,7 +206,7 @@ def test_get_mincount():
def test_get_maxcount():
- kh = khmer._CountingHash(4, [7])
+ kh = khmer._Countgraph(4, [7])
s = "AAAAACGT"
kh.consume(s)
@@ -236,7 +220,7 @@ def test_get_maxcount():
def test_get_maxcount_rc():
- kh = khmer._CountingHash(4, [7])
+ kh = khmer._Countgraph(4, [7])
s = "AAAAACGT"
src = "ACGTTTTT"
@@ -251,7 +235,7 @@ def test_get_maxcount_rc():
def test_get_mincount_rc():
- kh = khmer._CountingHash(4, [5])
+ kh = khmer._Countgraph(4, [5])
s = "AAAAACGT"
src = "ACGTTTTT"
@@ -266,7 +250,7 @@ def test_get_mincount_rc():
def test_badget():
- kh = khmer.CountingHash(6, 4 ** 10, 1)
+ kh = khmer.Countgraph(6, 4 ** 10, 1)
DNA = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG"
@@ -284,7 +268,7 @@ def test_badget():
def test_64bitshift():
- kh = khmer.CountingHash(25, 4, 1)
+ kh = khmer.Countgraph(25, 4, 1)
fullstr = "GTATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGCCGCGATGCGTCGGCG"
substr = "ATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGC"
@@ -293,7 +277,7 @@ def test_64bitshift():
def test_64bitshift_2():
- kh = khmer.CountingHash(25, 4, 1)
+ kh = khmer.Countgraph(25, 4, 1)
fullstr = "GTATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGCCGCGATGCGTCGGCG"
kh.consume(fullstr)
@@ -304,12 +288,12 @@ def test_64bitshift_2():
def test_very_short_read():
short_filename = utils.get_test_data('test-short.fa')
- kh = khmer.CountingHash(9, 4, 1)
+ kh = khmer.Countgraph(9, 4, 1)
n_reads, n_kmers = kh.consume_fasta(short_filename)
assert n_reads == 1, n_reads
assert n_kmers == 0, n_kmers
- kh = khmer.CountingHash(8, 4, 1)
+ kh = khmer.Countgraph(8, 4, 1)
n_reads, n_kmers = kh.consume_fasta(short_filename)
assert n_reads == 1, n_reads
assert n_kmers == 1, n_kmers
@@ -318,7 +302,7 @@ def test_very_short_read():
class Test_ConsumeString(object):
def setup(self):
- self.kh = khmer._CountingHash(4, [4 ** 4])
+ self.kh = khmer._Countgraph(4, [4 ** 4])
def test_n_occupied(self):
assert self.kh.n_occupied() == 0
@@ -371,18 +355,6 @@ class Test_ConsumeString(object):
assert dist[2] == 1
assert sum(dist) == 2
- def test_n_occupied_args(self):
- assert self.kh.n_occupied() == 0
- self.kh.consume('AAAA')
- assert self.kh.n_occupied(0, 1) == 1
- assert self.kh.n_occupied(1, 4 ** 4) == 0, self.kh.n_occupied()
-
- hashvalue = khmer.forward_hash('AACT', 4)
- self.kh.consume('AACT')
- assert self.kh.n_occupied(0, hashvalue + 1) == 2
- assert self.kh.n_occupied(hashvalue + 1, 4 ** 4) == 0
- assert self.kh.n_occupied(hashvalue, hashvalue + 1) == 1
-
def test_simple(self):
n = self.kh.consume('AAAA')
assert n == 1
@@ -414,14 +386,14 @@ class Test_ConsumeString(object):
class Test_AbundanceDistribution(object):
def setup(self):
- self.kh = khmer._CountingHash(4, [5])
+ self.kh = khmer._Countgraph(4, [5])
A_filename = utils.get_test_data('all-A.fa')
self.kh.consume_fasta(A_filename)
def test_count_A(self):
A_filename = utils.get_test_data('all-A.fa')
- tracking = khmer._Hashbits(4, [5])
+ tracking = khmer._Nodegraph(4, [5])
dist = self.kh.abundance_distribution(A_filename, tracking)
assert sum(dist) == 1
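Test_AbundanceDistribution shows the calling convention the other tests rely
on: abundance_distribution() needs a Nodegraph of matching shape as a tracking
structure so that each distinct k-mer is tallied once. Sketched with a
placeholder input file:

    import khmer

    kh = khmer.Countgraph(4, 4 ** 4, 4)
    kh.consume_fasta('reads.fa')            # placeholder FASTA file

    # the tracking graph marks k-mers already seen, so each counts once
    tracking = khmer.Nodegraph(4, 4 ** 4, 4)
    dist = kh.abundance_distribution('reads.fa', tracking)
    # dist[c] is the number of distinct k-mers occurring c times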
diff --git a/tests/test_filter.py b/tests/test_filter.py
index 3af64ff..169a62a 100644
--- a/tests/test_filter.py
+++ b/tests/test_filter.py
@@ -27,7 +27,7 @@ def load_fa_seq_names(filename):
class Test_Filter(object):
def test_abund(self):
- ht = khmer.CountingHash(10, 4 ** 10, 1)
+ ht = khmer.Countgraph(10, 4 ** 10, 1)
filename = utils.get_test_data('test-abund-read.fa')
outname = utils.get_temp_filename('test_abund.out')
diff --git a/tests/test_functions.py b/tests/test_functions.py
index bcc6739..6d64fbd 100644
--- a/tests/test_functions.py
+++ b/tests/test_functions.py
@@ -13,7 +13,7 @@ import collections
from . import khmer_tst_utils as utils
from khmer.utils import (check_is_pair, broken_paired_reader, check_is_left,
check_is_right)
-from khmer.kfile import check_input_files
+from khmer.kfile import check_input_files, get_file_writer
try:
from StringIO import StringIO
except ImportError:
@@ -27,6 +27,19 @@ def test_forward_hash():
assert khmer.forward_hash('GGGG', 4) == 170
+def test_get_file_writer_fail():
+ somefile = utils.get_temp_filename("potato")
+ somefile = open(somefile, "w")
+ stopped = True
+ try:
+ get_file_writer(somefile, True, True)
+ stopped = False
+ except Exception as err:
+ assert "Cannot specify both bzip and gzip" in str(err), str(err)
+
+ assert stopped, "Expected exception"
+
+
def test_forward_hash_no_rc():
h = khmer.forward_hash_no_rc('AAAA', 4)
assert h == 0, h
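The hashing helpers exercised here come in two flavours: forward_hash() is
reverse-complement aware and returns the canonical (smaller) encoding of a
k-mer and its reverse complement, while forward_hash_no_rc() hashes the given
strand only. A short illustration consistent with the asserted values:

    import khmer

    h = khmer.forward_hash('GGGG', 4)       # canonical, rc-aware
    assert h == 170
    # reverse_hash() decodes back to the canonical strand, here 'CCCC'
    print(khmer.reverse_hash(h, 4))

    assert khmer.forward_hash_no_rc('AAAA', 4) == 0   # strand-specific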
@@ -90,26 +103,27 @@ def test_get_primes_fal():
assert "unable to find 5 prime numbers < 5" in str(err)
-def test_extract_countinghash_info_badfile():
+def test_extract_countgraph_info_badfile():
try:
- khmer.extract_countinghash_info(
+ khmer.extract_countgraph_info(
utils.get_test_data('test-abund-read-2.fa'))
assert 0, 'this should fail'
except ValueError:
pass
-def test_extract_countinghash_info():
+def test_extract_countgraph_info():
fn = utils.get_temp_filename('test_extract_counting.ct')
for size in [1e6, 2e6, 5e6, 1e7]:
- ht = khmer.CountingHash(25, size, 4)
+ ht = khmer.Countgraph(25, size, 4)
ht.save(fn)
try:
- info = khmer.extract_countinghash_info(fn)
+ info = khmer.extract_countgraph_info(fn)
except ValueError as err:
+ raise
assert 0, 'Should not throw a ValueError: ' + str(err)
- ksize, table_size, n_tables, _, _, _ = info
+ ksize, table_size, n_tables, _, _, _, _ = info
print(ksize, table_size, n_tables)
assert(ksize) == 25
@@ -122,27 +136,27 @@ def test_extract_countinghash_info():
assert 0, '...failed to remove ' + fn + str(err)
-def test_extract_hashbits_info_badfile():
+def test_extract_nodegraph_info_badfile():
try:
- khmer.extract_hashbits_info(
+ khmer.extract_nodegraph_info(
utils.get_test_data('test-abund-read-2.fa'))
assert 0, 'this should fail'
except ValueError:
pass
-def test_extract_hashbits_info():
- fn = utils.get_temp_filename('test_extract_hashbits.pt')
+def test_extract_nodegraph_info():
+ fn = utils.get_temp_filename('test_extract_nodegraph.pt')
for size in [1e6, 2e6, 5e6, 1e7]:
- ht = khmer.Hashbits(25, size, 4)
+ ht = khmer.Nodegraph(25, size, 4)
ht.save(fn)
- info = khmer.extract_hashbits_info(fn)
- ksize, table_size, n_tables, _, _ = info
+ info = khmer.extract_nodegraph_info(fn)
+ ksize, table_size, n_tables, _, _, _ = info
print(ksize, table_size, n_tables)
assert(ksize) == 25
- assert table_size == size
+ assert table_size == size, table_size
assert n_tables == 4
try:
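Note the unpacking change buried in these hunks: the renamed
extract_countgraph_info() yields seven fields and extract_nodegraph_info()
six, each one more than the extract_countinghash_info() and
extract_hashbits_info() predecessors. Usage, sketched with a placeholder path
and ignoring the trailing fields as the tests do:

    import khmer

    ht = khmer.Countgraph(25, 1e6, 4)
    ht.save('info-test.ct')                 # placeholder path

    info = khmer.extract_countgraph_info('info-test.ct')
    ksize, table_size, n_tables = info[:3]
    assert ksize == 25
    assert n_tables == 4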
diff --git a/tests/test_graph.py b/tests/test_graph.py
index 5afcb92..1963d95 100644
--- a/tests/test_graph.py
+++ b/tests/test_graph.py
@@ -21,7 +21,7 @@ def teardown():
class Test_ExactGraphFu(object):
def setup(self):
- self.ht = khmer.Hashbits(12, 1e4, 2)
+ self.ht = khmer.Nodegraph(12, 1e4, 2)
def test_counts(self):
ht = self.ht
@@ -115,7 +115,7 @@ class Test_ExactGraphFu(object):
class Test_InexactGraphFu(object):
def setup(self):
- self.ht = khmer.Hashbits(12, 4 ** 3 + 1, 2)
+ self.ht = khmer.Nodegraph(12, 4 ** 3 + 1, 2)
def test_graph_links_next_a(self):
ht = self.ht
@@ -199,7 +199,7 @@ class Test_Partitioning(object):
filename = utils.get_test_data('random-20-a.fa')
- ht = khmer._Hashbits(21, [5, 7, 11, 13])
+ ht = khmer._Nodegraph(21, [5, 7, 11, 13])
ht.consume_fasta_and_tag(filename)
output_file = utils.get_temp_filename('part0test')
@@ -216,7 +216,7 @@ class Test_Partitioning(object):
filename = utils.get_test_data('random-20-a.fa')
- ht = khmer._Hashbits(21, [5, 7, 11, 13])
+ ht = khmer._Nodegraph(21, [5, 7, 11, 13])
ht.consume_fasta_and_tag(filename)
output_file = utils.get_temp_filename('parttest')
@@ -231,7 +231,7 @@ class Test_Partitioning(object):
def test_output_fq(self):
filename = utils.get_test_data('random-20-a.fq')
- ht = khmer.Hashbits(20, 1e4, 4)
+ ht = khmer.Nodegraph(20, 1e4, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
ht.merge_subset(subset)
@@ -247,7 +247,7 @@ class Test_Partitioning(object):
def test_disconnected_20_a(self):
filename = utils.get_test_data('random-20-a.fa')
- ht = khmer.Hashbits(21, 1e5, 4)
+ ht = khmer.Nodegraph(21, 1e5, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -257,7 +257,7 @@ class Test_Partitioning(object):
def test_connected_20_a(self):
filename = utils.get_test_data('random-20-a.fa')
- ht = khmer.Hashbits(20, 1e4, 4)
+ ht = khmer.Nodegraph(20, 1e4, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -267,7 +267,7 @@ class Test_Partitioning(object):
def test_disconnected_20_b(self):
filename = utils.get_test_data('random-20-b.fa')
- ht = khmer.Hashbits(21, 1e4, 4)
+ ht = khmer.Nodegraph(21, 1e4, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -277,7 +277,7 @@ class Test_Partitioning(object):
def test_connected_20_b(self):
filename = utils.get_test_data('random-20-b.fa')
- ht = khmer.Hashbits(20, 1e4, 4)
+ ht = khmer.Nodegraph(20, 1e4, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -287,7 +287,7 @@ class Test_Partitioning(object):
def test_disconnected_31_c(self):
filename = utils.get_test_data('random-31-c.fa')
- ht = khmer.Hashbits(32, 1e6, 4)
+ ht = khmer.Nodegraph(32, 1e6, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -297,7 +297,7 @@ class Test_Partitioning(object):
def test_connected_31_c(self):
filename = utils.get_test_data('random-31-c.fa')
- ht = khmer.Hashbits(31, 1e5, 4)
+ ht = khmer.Nodegraph(31, 1e5, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -310,7 +310,7 @@ class Test_Partitioning(object):
class Test_PythonAPI(object):
def test_find_all_tags_kmersize(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
a = "ATTGGGACTCTGGGAGCACTTATCATGGAGAT"
b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT"
@@ -330,7 +330,7 @@ class Test_PythonAPI(object):
pass
def test_ordered_connect(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
a = "ATTGGGACTCTGGGAGCACTTATCATGGAGAT"
b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT"
diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
deleted file mode 100644
index 47b5928..0000000
--- a/tests/test_hashbits.py
+++ /dev/null
@@ -1,905 +0,0 @@
-from __future__ import print_function
-from __future__ import absolute_import
-#
-# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2015. It is licensed under
-# the three-clause BSD license; see LICENSE.
-# Contact: khmer-project at idyll.org
-#
-# pylint: disable=missing-docstring,protected-access,no-member,
-import khmer
-from khmer import ReadParser
-
-from screed.fasta import fasta_iter
-import screed
-
-from . import khmer_tst_utils as utils
-from nose.plugins.attrib import attr
-
-
-def teardown():
- utils.cleanup()
-
-
- at attr('huge')
-def test_toobig():
- try:
- pt = khmer.Hashbits(32, 1e13, 1)
- assert 0, "This should fail"
- except MemoryError as err:
- print(str(err))
-
-
-def test__get_set_tag_density():
- htableable = khmer._Hashbits(32, [1])
-
- orig = htableable._get_tag_density()
- assert orig != 2
- htableable._set_tag_density(2)
- assert htableable._get_tag_density() == 2
-
-
-def test_update_from():
- htableable = khmer.Hashbits(5, 1000, 4)
- other_htableable = khmer.Hashbits(5, 1000, 4)
-
- assert htableable.get('AAAAA') == 0
- assert htableable.get('GCGCG') == 0
- assert other_htableable.get('AAAAA') == 0
- assert other_htableable.get('GCGCG') == 0
-
- other_htableable.count('AAAAA')
-
- assert htableable.get('AAAAA') == 0
- assert htableable.get('GCGCG') == 0
- assert other_htableable.get('AAAAA') == 1
- assert other_htableable.get('GCGCG') == 0
-
- htableable.count('GCGCG')
-
- assert htableable.get('AAAAA') == 0
- assert htableable.get('GCGCG') == 1
- assert other_htableable.get('AAAAA') == 1
- assert other_htableable.get('GCGCG') == 0
-
- htableable.update(other_htableable)
-
- assert htableable.get('AAAAA') == 1
- assert htableable.get('GCGCG') == 1
- assert other_htableable.get('AAAAA') == 1
- assert other_htableable.get('GCGCG') == 0
-
-
-def test_update_from_diff_ksize_2():
- htableable = khmer.Hashbits(5, 1000, 4)
- other_htableable = khmer.Hashbits(4, 1000, 4)
-
- try:
- htableable.update(other_htableable)
- assert 0, "should not be reached"
- except ValueError as err:
- print(str(err))
-
- try:
- other_htableable.update(htableable)
- assert 0, "should not be reached"
- except ValueError as err:
- print(str(err))
-
-
-def test_update_from_diff_tablesize():
- htableable = khmer.Hashbits(5, 100, 4)
- other_htableable = khmer.Hashbits(5, 1000, 4)
-
- try:
- htableable.update(other_htableable)
- assert 0, "should not be reached"
- except ValueError as err:
- print(str(err))
-
-
-def test_update_from_diff_num_tables():
- htableable = khmer.Hashbits(5, 1000, 3)
- other_htableable = khmer.Hashbits(5, 1000, 4)
-
- try:
- htableable.update(other_htableable)
- assert 0, "should not be reached"
- except ValueError as err:
- print(str(err))
-
-
-def test_n_occupied_1():
- filename = utils.get_test_data('random-20-a.fa')
-
- ksize = 20 # size of kmer
- htable_size = 100000 # size of hashtable
- num_htableables = 1 # number of hashtables
-
- # test modified c++ n_occupied code
- htableable = khmer.Hashbits(ksize, htable_size, num_htableables)
-
- for _, record in enumerate(fasta_iter(open(filename))):
- htableable.consume(record['sequence'])
-
- # this number calculated independently
- assert htableable.n_occupied() == 3884, htableable.n_occupied()
-
-
-def test_bloom_python_1():
- # test python code to count unique kmers using bloom filter
- filename = utils.get_test_data('random-20-a.fa')
-
- ksize = 20 # size of kmer
- htable_size = 100000 # size of hashtable
- num_htableables = 3 # number of hashtables
-
- htableable = khmer.Hashbits(ksize, htable_size, num_htableables)
-
- n_unique = 0
- for _, record in enumerate(fasta_iter(open(filename))):
- sequence = record['sequence']
- seq_len = len(sequence)
- for n in range(0, seq_len + 1 - ksize):
- kmer = sequence[n:n + ksize]
- if not htableable.get(kmer):
- n_unique += 1
- htableable.count(kmer)
-
- assert n_unique == 3960
- assert htableable.n_occupied() == 3885, htableable.n_occupied()
-
- # this number equals n_unique
- assert htableable.n_unique_kmers() == 3960, htableable.n_unique_kmers()
-
-
-def test_bloom_c_1():
- # test c++ code to count unique kmers using bloom filter
-
- filename = utils.get_test_data('random-20-a.fa')
-
- ksize = 20 # size of kmer
- htable_size = 100000 # size of hashtable
- num_htableables = 3 # number of hashtables
-
- htableable = khmer.Hashbits(ksize, htable_size, num_htableables)
-
- for _, record in enumerate(fasta_iter(open(filename))):
- htableable.consume(record['sequence'])
-
- assert htableable.n_occupied() == 3885
- assert htableable.n_unique_kmers() == 3960
-
-
-def test_n_occupied_2(): # simple one
- ksize = 4
- htable_size = 10 # use 11
- num_htableables = 1
-
- htableable = khmer._Hashbits(ksize, [11])
- htableable.count('AAAA') # 00 00 00 00 = 0
- assert htableable.n_occupied() == 1
-
- htableable.count('ACTG') # 00 10 01 11 = 39
- assert htableable.n_occupied() == 2
-
- htableable.count('AACG') # 00 00 10 11 = 11 # collision 1
-
- assert htableable.n_occupied() == 2
- htableable.count('AGAC') # 00 11 00 10 # collision 2
- assert htableable.n_occupied() == 2, htableable.n_occupied()
-
-
-def test_bloom_c_2(): # simple one
- ksize = 4
-
- # use only 1 hashtable, no bloom filter
- htableable = khmer._Hashbits(ksize, [11])
- htableable.count('AAAA') # 00 00 00 00 = 0
- htableable.count('ACTG') # 00 10 01 11 = 39
- assert htableable.n_unique_kmers() == 2
- htableable.count('AACG') # 00 00 10 11 = 11 # collision with 1st kmer
- assert htableable.n_unique_kmers() == 2
- htableable.count('AGAC') # 00 11 00 10 # collision with 2nd kmer
- assert htableable.n_unique_kmers() == 2
-
- # use two hashtables with sizes 11, 13
- other_htableable = khmer._Hashbits(ksize, [11, 13])
- other_htableable.count('AAAA') # 00 00 00 00 = 0
-
- other_htableable.count('ACTG') # 00 10 01 11 = 2*16 +4 +3 = 39
- assert other_htableable.n_unique_kmers() == 2
- # 00 00 10 11 = 11 # collision with only 1st kmer
- other_htableable.count('AACG')
- assert other_htableable.n_unique_kmers() == 3
- other_htableable.count('AGAC')
- # 00 11 00 10 3*16 +2 = 50
- # collision with both 2nd and 3rd kmers
-
- assert other_htableable.n_unique_kmers() == 3
-
-
-def test_filter_if_present():
- htable = khmer._Hashbits(32, [3, 5])
-
- maskfile = utils.get_test_data('filter-test-A.fa')
- inputfile = utils.get_test_data('filter-test-B.fa')
- outfile = utils.get_temp_filename('filter')
-
- htable.consume_fasta(maskfile)
- htable.filter_if_present(inputfile, outfile)
-
- records = list(fasta_iter(open(outfile)))
- assert len(records) == 1
- assert records[0]['name'] == '3'
-
-
-def test_combine_pe():
- inpfile = utils.get_test_data('combine_parts_1.fa')
- htable = khmer._Hashbits(32, [1])
-
- htable.consume_partitioned_fasta(inpfile)
- assert htable.count_partitions() == (2, 0)
-
- first_seq = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
- pid1 = htable.get_partition_id(first_seq)
-
- second_seq = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
- pid2 = htable.get_partition_id(second_seq)
-
- assert pid1 == 2
- assert pid2 == 80293
-
- htable.join_partitions(pid1, pid2)
-
- pid1 = htable.get_partition_id(first_seq)
- pid2 = htable.get_partition_id(second_seq)
-
- assert pid1 == pid2
- assert htable.count_partitions() == (1, 0)
-
-
-def test_load_partitioned():
- inpfile = utils.get_test_data('combine_parts_1.fa')
- htable = khmer._Hashbits(32, [1])
-
- htable.consume_partitioned_fasta(inpfile)
- assert htable.count_partitions() == (2, 0)
-
- first_seq = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
- assert htable.get(first_seq)
-
- second_seq = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
- assert htable.get(second_seq)
-
- third_s = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]
- assert htable.get(third_s)
-
-
-def test_count_within_radius_simple():
- inpfile = utils.get_test_data('all-A.fa')
- htable = khmer._Hashbits(4, [3, 5])
-
- print(htable.consume_fasta(inpfile))
- n = htable.count_kmers_within_radius('AAAA', 1)
- assert n == 1
-
- n = htable.count_kmers_within_radius('AAAA', 10)
- assert n == 1
-
-
-def test_count_within_radius_big():
- inpfile = utils.get_test_data('random-20-a.fa')
- htable = khmer.Hashbits(20, 1e5, 4)
-
- htable.consume_fasta(inpfile)
- n = htable.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
- assert n == 3961, n
-
- htable = khmer.Hashbits(21, 1e5, 4)
- htable.consume_fasta(inpfile)
- n = htable.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
- assert n == 39
-
-
-def test_count_kmer_degree():
- inpfile = utils.get_test_data('all-A.fa')
- htable = khmer._Hashbits(4, [3, 5])
- htable.consume_fasta(inpfile)
-
- assert htable.kmer_degree('AAAA') == 2
- assert htable.kmer_degree('AAAT') == 1
- assert htable.kmer_degree('AATA') == 0
- assert htable.kmer_degree('TAAA') == 1
-
-
-def test_save_load_tagset():
- htable = khmer._Hashbits(32, [1])
-
- outfile = utils.get_temp_filename('tagset')
-
- htable.add_tag('A' * 32)
- htable.save_tagset(outfile)
-
- htable.add_tag('G' * 32)
-
- htable.load_tagset(outfile) # implicitly => clear_tags=True
- htable.save_tagset(outfile)
-
- # if tags have been cleared, the resaved tagfile holds only the one
- # reloaded tag and is smaller (30 bytes, vs 38 with both tags).
-
- fp = open(outfile, 'rb')
- data = fp.read()
- fp.close()
- assert len(data) == 30, len(data)
-
-
-def test_save_load_tagset_noclear():
- htable = khmer._Hashbits(32, [1])
-
- outfile = utils.get_temp_filename('tagset')
-
- htable.add_tag('A' * 32)
- htable.save_tagset(outfile)
-
- htable.add_tag('G' * 32)
-
- htable.load_tagset(outfile, False) # clear_tags=False: keep existing tags
- htable.save_tagset(outfile)
-
- # since tags were not cleared, both tags survive the reload and the
- # resaved tagfile is larger (38 bytes, vs 30 when cleared).
-
- fp = open(outfile, 'rb')
- data = fp.read()
- fp.close()
- assert len(data) == 38, len(data)
-
-
-def test_stop_traverse():
- filename = utils.get_test_data('random-20-a.fa')
-
- ksize = 20 # size of kmer
- htable_size = 1e4 # size of hashtable
- num_htableables = 3 # number of hashtables
-
- htable = khmer.Hashbits(ksize, htable_size, num_htableables)
-
- # without tagging/joining across consume, this breaks into two partitions;
- # with, it is one partition.
- htable.add_stop_tag('TTGCATACGTTGAGCCAGCG')
-
- # DO NOT join reads across stoptags
- htable.consume_fasta_and_tag(filename)
- subset = htable.do_subset_partition(0, 0, True)
- htable.merge_subset(subset)
-
- n, _ = htable.count_partitions()
- assert n == 2, n
-
-
-def test_tag_across_stoptraverse():
- filename = utils.get_test_data('random-20-a.fa')
-
- ksize = 20 # size of kmer
- htable_size = 1e4 # size of hashtable
- num_htableables = 3 # number of hashtables
-
- htable = khmer.Hashbits(ksize, htable_size, num_htableables)
-
- # without tagging/joining across consume, this breaks into two partitions;
- # with, it is one partition.
- htable.add_stop_tag('CCGAATATATAACAGCGACG')
-
- # DO join reads across
- htable.consume_fasta_and_tag_with_stoptags(filename)
- subset = htable.do_subset_partition(0, 0)
- n, _ = htable.count_partitions()
- assert n == 99 # reads only connected by traversal...
-
- n, _ = htable.subset_count_partitions(subset)
- assert n == 2 # but need main to cross stoptags.
-
- htable.merge_subset(subset)
-
- n, _ = htable.count_partitions() # ta-da!
- assert n == 1, n
-
-
-def test_notag_across_stoptraverse():
- filename = utils.get_test_data('random-20-a.fa')
-
- ksize = 20 # size of kmer
- htable_size = 1e4 # size of hashtable
- num_htableables = 3 # number of hashtables
-
- htable = khmer.Hashbits(ksize, htable_size, num_htableables)
-
- # connecting k-mer at the beginning/end of a read: breaks up into two.
- htable.add_stop_tag('TTGCATACGTTGAGCCAGCG')
-
- htable.consume_fasta_and_tag_with_stoptags(filename)
-
- subset = htable.do_subset_partition(0, 0)
- htable.merge_subset(subset)
-
- n, _ = htable.count_partitions()
- assert n == 2, n
-
-
-def test_find_stoptags():
- htable = khmer._Hashbits(5, [1])
- htable.add_stop_tag("AAAAA")
-
- assert htable.identify_stoptags_by_position("AAAAA") == [0]
- assert htable.identify_stoptags_by_position("AAAAAA") == [0, 1]
- assert htable.identify_stoptags_by_position("TTTTT") == [0]
- assert htable.identify_stoptags_by_position("TTTTTT") == [0, 1]
-
-
-def test_find_stoptagsecond_seq():
- htable = khmer._Hashbits(4, [1])
- htable.add_stop_tag("ATGC")
-
- x = htable.identify_stoptags_by_position("ATGCATGCGCAT")
- assert x == [0, 2, 4, 8], x
-
-
-def test_get_ksize():
- kh = khmer._Hashbits(22, [1])
- assert kh.ksize() == 22
-
-
-def test_get_hashsizes():
- kh = khmer.Hashbits(22, 100, 4)
- # Py2/3 hack, longify converts to long in py2, remove once py2 isn't
- # supported any longer.
- expected = utils.longify([97, 89, 83, 79])
- assert kh.hashsizes() == expected, kh.hashsizes()
-
-
-def test_extract_unique_paths_0():
- kh = khmer._Hashbits(10, [5, 7, 11, 13])
-
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG']
-
- kh.consume('ATGGAGAGACACAGATAGACAGGAGTGGCGATG')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- assert not x
-
-
-def test_extract_unique_paths_1():
- kh = khmer._Hashbits(10, [5, 7, 11, 13])
-
- kh.consume('AGTGGCGATG')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print(x)
- assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGAT'] # all but the last k-mer
-
-
-def test_extract_unique_paths_2():
- kh = khmer._Hashbits(10, [5, 7, 11, 13])
-
- kh.consume('ATGGAGAGAC')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print(x)
- assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG'] # all but the 1st k-mer
-
-
-def test_extract_unique_paths_3():
- kh = khmer._Hashbits(10, [5, 7, 11, 13])
-
- kh.consume('ATGGAGAGAC')
- kh.consume('AGTGGCGATG')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print(x)
- # all but the 1st/last k-mer
- assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT']
-
-
-def test_extract_unique_paths_4():
- kh = khmer.Hashbits(10, 1e6, 4)
-
- kh.consume('ATGGAGAGAC')
- kh.consume('AGTGGCGATG')
-
- kh.consume('ATAGACAGGA')
-
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print(x)
- assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT']
-
-
-def test_find_unpart():
- filename = utils.get_test_data('random-20-a.odd.fa')
- filename2 = utils.get_test_data('random-20-a.even.fa')
-
- ksize = 20 # size of kmer
- htable_size = 1e4 # size of hashtable
- num_htableables = 3 # number of hashtables
-
- htable = khmer.Hashbits(ksize, htable_size, num_htableables)
- htable.consume_fasta_and_tag(filename)
-
- subset = htable.do_subset_partition(0, 0)
- htable.merge_subset(subset)
-
- n, _ = htable.count_partitions()
- assert n == 49
-
- htable.find_unpart(filename2, True, False)
- n, _ = htable.count_partitions()
- assert n == 1, n # all sequences connect
-
-
-def test_find_unpart_notraverse():
- filename = utils.get_test_data('random-20-a.odd.fa')
- filename2 = utils.get_test_data('random-20-a.even.fa')
-
- ksize = 20 # size of kmer
- htable_size = 1e4 # size of hashtable
- num_htableables = 3 # number of hashtables
-
- htable = khmer.Hashbits(ksize, htable_size, num_htableables)
- htable.consume_fasta_and_tag(filename)
-
- subset = htable.do_subset_partition(0, 0)
- htable.merge_subset(subset)
-
- n, _ = htable.count_partitions()
- assert n == 49
-
- htable.find_unpart(filename2, False, False) # <-- don't traverse
- n, _ = htable.count_partitions()
- assert n == 99, n # all sequences disconnected
-
-
-def test_find_unpart_fail():
- filename = utils.get_test_data('random-20-a.odd.fa')
- filename2 = utils.get_test_data('random-20-a.odd.fa') # <- switch to odd
-
- ksize = 20 # size of kmer
- htable_size = 1e4 # size of hashtable
- num_htableables = 3 # number of hashtables
-
- htable = khmer.Hashbits(ksize, htable_size, num_htableables)
- htable.consume_fasta_and_tag(filename)
-
- subset = htable.do_subset_partition(0, 0)
- htable.merge_subset(subset)
-
- n, _ = htable.count_partitions()
- assert n == 49
-
- htable.find_unpart(filename2, True, False)
- n, _ = htable.count_partitions()
- assert n == 49, n # only 49 sequences worth of tags
-
-
-def test_simple_median():
- hi = khmer.Hashbits(6, 1e5, 2)
-
- (median, average, stddev) = hi.get_median_count("AAAAAA")
- print(median, average, stddev)
- assert median == 0
- assert average == 0.0
- assert stddev == 0.0
-
- hi.consume("AAAAAA")
- (median, average, stddev) = hi.get_median_count("AAAAAA")
- print(median, average, stddev)
- assert median == 1
- assert average == 1.0
- assert stddev == 0.0
-
-
-def test_badget():
- hbts = khmer.Hashbits(6, 1e6, 1)
-
- dna = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG"
-
- hbts.consume(dna)
-
- assert hbts.get("AGCTTT") == 1
-
- assert hbts.get("GATGAG") == 0
-
- try:
- hbts.get(b"AGCTT")
- assert 0, "this should fail"
- except ValueError as err:
- print(str(err))
-
- try:
- hbts.get(u"AGCTT")
- assert 0, "this should fail"
- except ValueError as err:
- print(str(err))
-
-
-#
-
-
-def test_load_notexist_should_fail():
- savepath = utils.get_temp_filename('temphashbitssave0.htable')
-
- hi = khmer._CountingHash(12, [1])
- try:
- hi.load(savepath)
- assert 0, "load should fail"
- except OSError:
- pass
-
-
-def test_load_truncated_should_fail():
- inpath = utils.get_test_data('random-20-a.fa')
- savepath = utils.get_temp_filename('temphashbitssave0.ct')
-
- hi = khmer.CountingHash(12, 1000, 2)
-
- hi.consume_fasta(inpath)
- hi.save(savepath)
-
- fp = open(savepath, 'rb')
- data = fp.read()
- fp.close()
-
- fp = open(savepath, 'wb')
- fp.write(data[:1000])
- fp.close()
-
- hi = khmer._CountingHash(12, [1])
- try:
- hi.load(savepath)
- assert 0, "load should fail"
- except OSError as e:
- print(str(e))
-
-
-def test_save_load_tagset_notexist():
- htable = khmer._Hashbits(32, [1])
-
- outfile = utils.get_temp_filename('tagset')
- try:
- htable.load_tagset(outfile)
- assert 0, "this test should fail"
- except OSError as e:
- print(str(e))
-
-
-def test_save_load_tagset_trunc():
- htable = khmer._Hashbits(32, [1])
-
- outfile = utils.get_temp_filename('tagset')
-
- htable.add_tag('A' * 32)
- htable.add_tag('G' * 32)
- htable.save_tagset(outfile)
-
- # truncate tagset file...
- fp = open(outfile, 'rb')
- data = fp.read()
- fp.close()
-
- for i in range(len(data)):
- fp = open(outfile, 'wb')
- fp.write(data[:i])
- fp.close()
-
- # try loading it...
- try:
- htable.load_tagset(outfile)
- assert 0, "this test should fail"
- except OSError as err:
- print(str(err), i)
-
- # try loading it...
- try:
- htable.load_tagset(outfile)
- assert 0, "this test should fail"
- except OSError:
- pass
-
-# to build the test files used below, add 'test' to this function
-# and then look in /tmp. You will need to tweak the version info in
-# khmer.hh in order to create "bad" versions, of course. -CTB
-
-
-def _build_testfiles():
- # hashbits file
-
- inpath = utils.get_test_data('random-20-a.fa')
- hi = khmer.Hashbits(12, 2)
- hi.consume_fasta(inpath)
- hi.save('/tmp/goodversion-k12.htable')
-
- # tagset file
-
- htable = khmer._Hashbits(32, [1])
-
- htable.add_tag('A' * 32)
- htable.add_tag('G' * 32)
- htable.save_tagset('/tmp/goodversion-k32.tagset')
-
- # stoptags file
-
- fakelump_fa = utils.get_test_data('fakelump.fa')
-
- htable = khmer.Hashbits(32, 4, 4)
- htable.consume_fasta_and_tag(fakelump_fa)
-
- subset = htable.do_subset_partition(0, 0)
- htable.merge_subset(subset)
-
- EXCURSION_DISTANCE = 40
- EXCURSION_ksizeMER_THRESHOLD = 82
- EXCURSION_ksizeMER_COUNT_THRESHOLD = 1
- counting = khmer.CountingHash(32, 4, 4)
-
- htable.repartition_largest_partition(None, counting,
- EXCURSION_DISTANCE,
- EXCURSION_ksizeMER_THRESHOLD,
- EXCURSION_ksizeMER_COUNT_THRESHOLD)
-
- htable.save_stop_tags('/tmp/goodversion-k32.stoptags')
-
-
-def test_hashbits_file_version_check():
- htable = khmer._Hashbits(12, [1])
-
- inpath = utils.get_test_data('badversion-k12.htable')
-
- try:
- htable.load(inpath)
- assert 0, "this should fail"
- except OSError as e:
- print(str(e))
-
-
-def test_hashbits_file_type_check():
- kh = khmer._CountingHash(12, [1])
- savepath = utils.get_temp_filename('tempcountingsave0.ct')
- kh.save(savepath)
-
- htable = khmer._Hashbits(12, [1])
-
- try:
- htable.load(savepath)
- assert 0, "this should fail"
- except OSError as e:
- print(str(e))
-
-
-def test_stoptags_file_version_check():
- htable = khmer._Hashbits(32, [1])
-
- inpath = utils.get_test_data('badversion-k32.stoptags')
-
- try:
- htable.load_stop_tags(inpath)
- assert 0, "this should fail"
- except OSError as e:
- print(str(e))
-
-
-def test_stoptags_ksize_check():
- htable = khmer._Hashbits(31, [1])
-
- inpath = utils.get_test_data('goodversion-k32.stoptags')
- try:
- htable.load_stop_tags(inpath)
- assert 0, "this should fail"
- except OSError as e:
- print(str(e))
-
-
-def test_stop_tags_filetype_check():
- htable = khmer._Hashbits(31, [1])
-
- inpath = utils.get_test_data('goodversion-k32.tagset')
- try:
- htable.load_stop_tags(inpath)
- assert 0, "this should fail"
- except OSError as e:
- print(str(e))
-
-
-def test_tagset_file_version_check():
- htable = khmer._Hashbits(32, [1])
-
- inpath = utils.get_test_data('badversion-k32.tagset')
-
- try:
- htable.load_tagset(inpath)
- assert 0, "this should fail"
- except OSError as e:
- print(str(e))
-
-
-def test_stop_tags_truncate_check():
- htable = khmer._Hashbits(32, [1])
-
- inpath = utils.get_test_data('goodversion-k32.tagset')
- data = open(inpath, 'rb').read()
-
- truncpath = utils.get_temp_filename('zzz')
- for i in range(len(data)):
- fp = open(truncpath, 'wb')
- fp.write(data[:i])
- fp.close()
-
- try:
- htable.load_stop_tags(truncpath)
- assert 0, "expect failure of previous command"
- except OSError as e:
- print(i, str(e))
-
-
-def test_tagset_ksize_check():
- htable = khmer._Hashbits(31, [1])
-
- inpath = utils.get_test_data('goodversion-k32.tagset')
- try:
- htable.load_tagset(inpath)
- assert 0, "this should fail"
- except OSError as e:
- print(str(e))
-
-
-def test_tagset_filetype_check():
- htable = khmer._Hashbits(31, [1])
-
- inpath = utils.get_test_data('goodversion-k32.stoptags')
- try:
- htable.load_tagset(inpath)
- assert 0, "this should fail"
- except OSError as e:
- print(str(e))
-
-
-def test_bad_primes_list():
- try:
- countingtable = khmer._Hashbits(31, ["a", "b", "c"], 1)
- assert 0, "Bad primes list should fail"
- except TypeError as e:
- print(str(e))
-
-
-def test_consume_absentfasta_with_reads_parser():
- presencetable = khmer._Hashbits(31, [1])
- try:
- presencetable.consume_fasta_with_reads_parser()
- assert 0, "this should fail"
- except TypeError as err:
- print(str(err))
- try:
- readparser = ReadParser(utils.get_test_data('empty-file'))
- presencetable.consume_fasta_with_reads_parser(readparser)
- assert 0, "this should fail"
- except OSError as err:
- print(str(err))
- except ValueError as err:
- print(str(err))
-
-
-def test_bad_primes():
- try:
- countingtable = khmer._Hashbits.__new__(
- khmer._Hashbits, 6, ["a", "b", "c"])
- assert 0, "this should fail"
- except TypeError as e:
- print(str(e))
-
-
-def test_consume_fasta_and_tag_with_badreads_parser():
- presencetable = khmer.Hashbits(6, 1e6, 2)
- try:
- readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa"))
- presencetable.consume_fasta_and_tag_with_reads_parser(readsparser)
- assert 0, "this should fail"
- except OSError as e:
- print(str(e))
- except ValueError as e:
- print(str(e))
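tests/test_hashbits.py disappears wholesale here; its coverage follows the
Hashbits-to-Nodegraph rename elsewhere in the test suite (presumably a renamed
test file not shown in this excerpt). As one example, the update-from tests
above translate directly, assuming update() survives the rename unchanged:

    import khmer

    a = khmer.Nodegraph(5, 1000, 4)
    b = khmer.Nodegraph(5, 1000, 4)
    b.count('AAAAA')

    a.update(b)                    # fold b's presence bits into a
    assert a.get('AAAAA') == 1

    c = khmer.Nodegraph(4, 1000, 4)
    try:
        a.update(c)                # mismatched ksize raises
        assert 0, "should not be reached"
    except ValueError:
        pass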
diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py
index c3567cd..4593673 100644
--- a/tests/test_labelhash.py
+++ b/tests/test_labelhash.py
@@ -9,7 +9,7 @@ from __future__ import absolute_import
# pylint: disable=missing-docstring,protected-access
import os
import khmer
-from khmer import LabelHash, CountingLabelHash
+from khmer import GraphLabels, CountingGraphLabels
from screed.fasta import fasta_iter
import screed
@@ -28,23 +28,23 @@ def teardown():
@attr('huge')
def test_toobig():
try:
- lh = LabelHash(20, 1e13, 1)
+ lh = GraphLabels(20, 1e13, 1)
assert 0, "This should fail."
except MemoryError as err:
print(str(err))
def test_error_create():
- from khmer import _LabelHash
+ from khmer import _GraphLabels
try:
- lh = _LabelHash(None)
+ lh = _GraphLabels(None)
assert 0, "This should fail."
except ValueError as err:
print(str(err))
def test_n_labels():
- lh = LabelHash(20, 1e7, 4)
+ lh = GraphLabels(20, 1e7, 4)
filename = utils.get_test_data('test-labels.fa')
lh.consume_fasta_and_tag_with_labels(filename)
@@ -53,7 +53,7 @@ def test_n_labels():
def test_get_label_dict():
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
filename = utils.get_test_data('test-labels.fa')
lb.consume_fasta_and_tag_with_labels(filename)
@@ -66,7 +66,7 @@ def test_get_label_dict():
def test_get_label_dict_save_load():
- lb_pre = LabelHash(20, 1e7, 4)
+ lb_pre = GraphLabels(20, 1e7, 4)
filename = utils.get_test_data('test-labels.fa')
lb_pre.consume_fasta_and_tag_with_labels(filename)
@@ -74,11 +74,11 @@ def test_get_label_dict_save_load():
savepath = utils.get_temp_filename('saved.labels')
lb_pre.save_labels_and_tags(savepath)
- # trash the old LabelHash
+ # trash the old GraphLabels
del lb_pre
# create new, load labels & tags
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
lb.load_labels_and_tags(savepath)
labels = lb.get_label_dict()
@@ -90,7 +90,7 @@ def test_get_label_dict_save_load():
def test_get_label_dict_save_load_wrong_ksize():
- lb_pre = LabelHash(19, 1e7, 4)
+ lb_pre = GraphLabels(19, 1e7, 4)
filename = utils.get_test_data('test-labels.fa')
lb_pre.consume_fasta_and_tag_with_labels(filename)
@@ -98,11 +98,11 @@ def test_get_label_dict_save_load_wrong_ksize():
savepath = utils.get_temp_filename('saved.labels')
lb_pre.save_labels_and_tags(savepath)
- # trash the old LabelHash
+ # trash the old GraphLabels
del lb_pre
# create new, load labels & tags
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
try:
lb.load_labels_and_tags(savepath)
assert 0, "this should not succeed - different ksize"
@@ -112,7 +112,7 @@ def test_get_label_dict_save_load_wrong_ksize():
def test_save_load_corrupted():
- lb_pre = LabelHash(20, 1e7, 4)
+ lb_pre = GraphLabels(20, 1e7, 4)
filename = utils.get_test_data('test-labels.fa')
lb_pre.consume_fasta_and_tag_with_labels(filename)
@@ -120,10 +120,10 @@ def test_save_load_corrupted():
savepath = utils.get_temp_filename('saved.labels')
lb_pre.save_labels_and_tags(savepath)
- # trash the old LabelHash
+ # trash the old GraphLabels
del lb_pre
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
# produce all possible truncated versions of this file
data = open(savepath, 'rb').read()
@@ -141,7 +141,7 @@ def test_save_load_corrupted():
def test_save_fail_readonly():
- lb_pre = LabelHash(20, 1e7, 4)
+ lb_pre = GraphLabels(20, 1e7, 4)
filename = utils.get_test_data('test-labels.fa')
lb_pre.consume_fasta_and_tag_with_labels(filename)
@@ -160,7 +160,7 @@ def test_save_fail_readonly():
def test_get_tag_labels():
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
filename = utils.get_test_data('single-read.fq')
lb.consume_fasta_and_tag_with_labels(filename)
tag = 173473779682
@@ -171,7 +171,7 @@ def test_get_tag_labels():
def test_consume_fasta_and_tag_with_labels():
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
filename = utils.get_test_data('test-transcript.fa')
@@ -196,7 +196,7 @@ def test_consume_fasta_and_tag_with_labels():
def test_consume_partitioned_fasta_and_tag_with_labels():
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
filename = utils.get_test_data('real-partition-small.fa')
total_reads, n_consumed = lb.consume_partitioned_fasta_and_tag_with_labels(
@@ -213,7 +213,7 @@ def test_consume_partitioned_fasta_and_tag_with_labels():
def test_consume_sequence_and_tag_with_labels():
- lb = LabelHash(20, 1e6, 4)
+ lb = GraphLabels(20, 1e6, 4)
label = 0
sequence = 'ATGCATCGATCGATCGATCGATCGATCGATCGATCGATCG'
@@ -226,7 +226,7 @@ def test_consume_sequence_and_tag_with_labels():
def test_sweep_tag_neighborhood():
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
filename = utils.get_test_data('single-read.fq')
lb.graph.consume_fasta_and_tag(filename)
@@ -236,7 +236,7 @@ def test_sweep_tag_neighborhood():
def test_sweep_label_neighborhood():
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
filename = utils.get_test_data('single-read.fq')
lb.consume_fasta_and_tag_with_labels(filename)
@@ -254,7 +254,7 @@ def test_sweep_label_neighborhood():
def test_label_tag_correctness():
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
filename = utils.get_test_data('test-labels.fa')
lb.consume_fasta_and_tag_with_labels(filename)
@@ -300,7 +300,7 @@ def test_label_tag_correctness():
def test_counting_label_tag_correctness():
- lb = CountingLabelHash(20, 1e7, 4)
+ lb = CountingGraphLabels(20, 1e7, 4)
filename = utils.get_test_data('test-labels.fa')
lb.consume_fasta_and_tag_with_labels(filename)
@@ -346,7 +346,7 @@ def test_counting_label_tag_correctness():
def test_label_tag_correctness_save_load():
- lb_pre = LabelHash(20, 1e7, 4)
+ lb_pre = GraphLabels(20, 1e7, 4)
filename = utils.get_test_data('test-labels.fa')
lb_pre.consume_fasta_and_tag_with_labels(filename)
@@ -354,11 +354,11 @@ def test_label_tag_correctness_save_load():
savepath = utils.get_temp_filename('saved.labels')
lb_pre.save_labels_and_tags(savepath)
- # trash the old LabelHash
+ # trash the old GraphLabels
del lb_pre
# create new, load labels & tags
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
lb.load_labels_and_tags(savepath)
# read A
@@ -403,7 +403,7 @@ def test_label_tag_correctness_save_load():
def test_load_wrong_filetype():
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
# try to load a tagset
filename = utils.get_test_data('goodversion-k32.tagset')
@@ -425,7 +425,7 @@ def test_load_wrong_filetype():
def test_load_wrong_fileversion():
- lb = LabelHash(20, 1e7, 4)
+ lb = GraphLabels(20, 1e7, 4)
# try to load a tagset from an old version
filename = utils.get_test_data('badversion-k32.tagset')
diff --git a/tests/test_lump.py b/tests/test_lump.py
index 511ee00..406b865 100644
--- a/tests/test_lump.py
+++ b/tests/test_lump.py
@@ -19,7 +19,7 @@ from nose.plugins.attrib import attr
def test_fakelump_together():
fakelump_fa = utils.get_test_data('fakelump.fa')
- ht = khmer.Hashbits(32, 1e5, 4)
+ ht = khmer.Nodegraph(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
subset = ht.do_subset_partition(0, 0)
@@ -35,7 +35,7 @@ def test_fakelump_stop():
fakelump_fa = utils.get_test_data('fakelump.fa')
fakelump_stoptags_txt = utils.get_test_data('fakelump.fa.stoptags.txt')
- ht = khmer.Hashbits(32, 1e5, 4)
+ ht = khmer.Nodegraph(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
for line in open(fakelump_stoptags_txt):
@@ -53,7 +53,7 @@ def test_fakelump_stop():
def test_fakelump_stop2():
fakelump_fa = utils.get_test_data('fakelump.fa')
- ht = khmer.Hashbits(32, 1e5, 4)
+ ht = khmer.Nodegraph(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
ht.add_stop_tag('GGGGAGGGGTGCAGTTGTGACTTGCTCGAGAG')
@@ -71,7 +71,7 @@ def test_fakelump_repartitioning():
fakelump_fa = utils.get_test_data('fakelump.fa')
fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')
- ht = khmer.Hashbits(32, 1e5, 4)
+ ht = khmer.Nodegraph(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
subset = ht.do_subset_partition(0, 0)
@@ -88,7 +88,7 @@ def test_fakelump_repartitioning():
EXCURSION_DISTANCE = 40
EXCURSION_KMER_THRESHOLD = 82
EXCURSION_KMER_COUNT_THRESHOLD = 1
- counting = khmer.CountingHash(32, 1e5, 4)
+ counting = khmer.Countgraph(32, 1e5, 4)
ht.repartition_largest_partition(None, counting,
EXCURSION_DISTANCE,
@@ -99,7 +99,7 @@ def test_fakelump_repartitioning():
# ok, now re-do everything with these stop tags, specifically.
- ht = khmer.Hashbits(32, 1e5, 4)
+ ht = khmer.Nodegraph(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
ht.load_stop_tags(fakelump_fa_foo)
@@ -114,7 +114,7 @@ def test_fakelump_load_stop_tags_trunc():
fakelump_fa = utils.get_test_data('fakelump.fa')
fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')
- ht = khmer.Hashbits(32, 1e5, 4)
+ ht = khmer.Nodegraph(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
subset = ht.do_subset_partition(0, 0)
@@ -131,7 +131,7 @@ def test_fakelump_load_stop_tags_trunc():
EXCURSION_DISTANCE = 40
EXCURSION_KMER_THRESHOLD = 82
EXCURSION_KMER_COUNT_THRESHOLD = 1
- counting = khmer._CountingHash(32, [5, 7, 11, 13])
+ counting = khmer._Countgraph(32, [5, 7, 11, 13])
ht.repartition_largest_partition(None, counting,
EXCURSION_DISTANCE,
@@ -146,7 +146,7 @@ def test_fakelump_load_stop_tags_trunc():
fp.close()
# ok, now try loading these stop tags; should fail.
- ht = khmer._Hashbits(32, [5, 7, 11, 13])
+ ht = khmer._Nodegraph(32, [5, 7, 11, 13])
ht.consume_fasta_and_tag(fakelump_fa)
try:
@@ -160,7 +160,7 @@ def test_fakelump_load_stop_tags_notexist():
fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')
# ok, now try loading these stop tags; should fail.
- ht = khmer._Hashbits(32, [5, 7, 11, 13])
+ ht = khmer._Nodegraph(32, [5, 7, 11, 13])
try:
ht.load_stop_tags(fakelump_fa_foo)
diff --git a/tests/test_nodegraph.py b/tests/test_nodegraph.py
new file mode 100644
index 0000000..4b7db74
--- /dev/null
+++ b/tests/test_nodegraph.py
@@ -0,0 +1,977 @@
+#
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
+# the three-clause BSD license; see LICENSE.
+# Contact: khmer-project at idyll.org
+#
+# pylint: disable=missing-docstring,protected-access,no-member
+
+from __future__ import print_function
+from __future__ import absolute_import
+
+import khmer
+from khmer import ReadParser
+
+import screed
+
+from . import khmer_tst_utils as utils
+from nose.plugins.attrib import attr
+
+
+def teardown():
+ utils.cleanup()
+
+
+@attr('huge')
+def test_toobig():
+ try:
+ pt = khmer.Nodegraph(32, 1e13, 1)
+ assert 0, "This should fail"
+ except MemoryError as err:
+ print(str(err))
+
+
+def test__get_set_tag_density():
+ nodegraph = khmer._Nodegraph(32, [1])
+
+ orig = nodegraph._get_tag_density()
+ assert orig != 2
+ nodegraph._set_tag_density(2)
+ assert nodegraph._get_tag_density() == 2
+
+
+def test_update_from():
+ nodegraph = khmer.Nodegraph(5, 1000, 4)
+ other_nodegraph = khmer.Nodegraph(5, 1000, 4)
+
+ assert nodegraph.get('AAAAA') == 0
+ assert nodegraph.get('GCGCG') == 0
+ assert other_nodegraph.get('AAAAA') == 0
+ assert other_nodegraph.get('GCGCG') == 0
+
+ other_nodegraph.count('AAAAA')
+
+ assert nodegraph.get('AAAAA') == 0
+ assert nodegraph.get('GCGCG') == 0
+ assert other_nodegraph.get('AAAAA') == 1
+ assert other_nodegraph.get('GCGCG') == 0
+
+ nodegraph.count('GCGCG')
+
+ assert nodegraph.get('AAAAA') == 0
+ assert nodegraph.get('GCGCG') == 1
+ assert other_nodegraph.get('AAAAA') == 1
+ assert other_nodegraph.get('GCGCG') == 0
+
+ nodegraph.update(other_nodegraph)
+
+ assert nodegraph.get('AAAAA') == 1
+ assert nodegraph.get('GCGCG') == 1
+ assert other_nodegraph.get('AAAAA') == 1
+ assert other_nodegraph.get('GCGCG') == 0
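+ # update() appears to take a bitwise union of the underlying Bloom
+ # tables: 'AAAAA' becomes visible in nodegraph without
+ # other_nodegraph changing, which is also why the mismatched-ksize
+ # and mismatched-table tests below must raise ValueError.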
+
+
+def test_update_from_diff_ksize_2():
+ nodegraph = khmer.Nodegraph(5, 1000, 4)
+ other_nodegraph = khmer.Nodegraph(4, 1000, 4)
+
+ try:
+ nodegraph.update(other_nodegraph)
+ assert 0, "should not be reached"
+ except ValueError as err:
+ print(str(err))
+
+ try:
+ other_nodegraph.update(nodegraph)
+ assert 0, "should not be reached"
+ except ValueError as err:
+ print(str(err))
+
+
+def test_update_from_diff_tablesize():
+ nodegraph = khmer.Nodegraph(5, 100, 4)
+ other_nodegraph = khmer.Nodegraph(5, 1000, 4)
+
+ try:
+ nodegraph.update(other_nodegraph)
+ assert 0, "should not be reached"
+ except ValueError as err:
+ print(str(err))
+
+
+def test_update_from_diff_num_tables():
+ nodegraph = khmer.Nodegraph(5, 1000, 3)
+ other_nodegraph = khmer.Nodegraph(5, 1000, 4)
+
+ try:
+ nodegraph.update(other_nodegraph)
+ assert 0, "should not be reached"
+ except ValueError as err:
+ print(str(err))
+
+
+def test_n_occupied_1():
+ filename = utils.get_test_data('random-20-a.fa')
+
+ ksize = 20 # size of kmer
+ htable_size = 100000 # size of hashtable
+ num_nodegraphs = 1 # number of hashtables
+
+ # test modified c++ n_occupied code
+ nodegraph = khmer.Nodegraph(ksize, htable_size, num_nodegraphs)
+
+ for _, record in enumerate(screed.open(filename)):
+ nodegraph.consume(record.sequence)
+
+ # this number calculated independently
+ assert nodegraph.n_occupied() == 3884, nodegraph.n_occupied()
+
+
+def test_bloom_python_1():
+ # test python code to count unique kmers using bloom filter
+ filename = utils.get_test_data('random-20-a.fa')
+
+ ksize = 20 # size of kmer
+ htable_size = 100000 # size of hashtable
+ num_nodegraphs = 3 # number of hashtables
+
+ nodegraph = khmer.Nodegraph(ksize, htable_size, num_nodegraphs)
+
+ n_unique = 0
+ for _, record in enumerate(screed.open(filename)):
+ sequence = record.sequence
+ seq_len = len(sequence)
+ for n in range(0, seq_len + 1 - ksize):
+ kmer = sequence[n:n + ksize]
+ if not nodegraph.get(kmer):
+ n_unique += 1
+ nodegraph.count(kmer)
+
+ assert n_unique == 3960
+ assert nodegraph.n_occupied() == 3884, nodegraph.n_occupied()
+
+ # this number equals n_unique
+ assert nodegraph.n_unique_kmers() == 3960, nodegraph.n_unique_kmers()
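+ # n_occupied() counts set bins in an underlying table, so hash
+ # collisions keep it (3884) below n_unique_kmers() (3960), which
+ # tallies distinct k-mers as they are first seen.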
+
+
+def test_bloom_c_1():
+ # test c++ code to count unique kmers using bloom filter
+
+ filename = utils.get_test_data('random-20-a.fa')
+
+ ksize = 20 # size of kmer
+ htable_size = 100000 # size of hashtable
+ num_nodegraphs = 3 # number of hashtables
+
+ nodegraph = khmer.Nodegraph(ksize, htable_size, num_nodegraphs)
+
+ for _, record in enumerate(screed.open(filename)):
+ nodegraph.consume(record.sequence)
+
+ assert nodegraph.n_occupied() == 3884
+ assert nodegraph.n_unique_kmers() == 3960
+
+
+def test_n_occupied_2(): # simple one
+ ksize = 4
+ htable_size = 10 # unused; the prime 11 is passed to _Nodegraph directly
+ num_nodegraphs = 1 # likewise unused with the raw constructor below
+
+ nodegraph = khmer._Nodegraph(ksize, [11])
+ nodegraph.count('AAAA') # 00 00 00 00 = 0
+ assert nodegraph.n_occupied() == 1
+
+ nodegraph.count('ACTG') # 00 10 01 11 = 39
+ assert nodegraph.n_occupied() == 2
+
+ nodegraph.count('AACG') # 00 00 10 11 = 11 # collision 1
+
+ assert nodegraph.n_occupied() == 2
+ nodegraph.count('AGAC') # 00 11 00 10 = 50 # collision 2
+ assert nodegraph.n_occupied() == 2, nodegraph.n_occupied()
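+ # Worked arithmetic for the comments above, assuming the two-bit
+ # encoding A=00, C=10, T=01, G=11 they imply: 'ACTG' = 0b00100111
+ # = 39 and 'AGAC' = 0b00110010 = 50, so with the single prime table
+ # of size 11, 39 % 11 == 50 % 11 == 6 and 11 % 11 == 0 ('AAAA');
+ # every k-mer after the first two lands in an already-set bin and
+ # n_occupied() stays at 2.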
+
+
+def test_bloom_c_2(): # simple one
+ ksize = 4
+
+ # use only 1 hashtable, no bloom filter
+ nodegraph = khmer._Nodegraph(ksize, [11])
+ nodegraph.count('AAAA') # 00 00 00 00 = 0
+ nodegraph.count('ACTG') # 00 10 01 11 = 39
+ assert nodegraph.n_unique_kmers() == 2
+ nodegraph.count('AACG') # 00 00 10 11 = 11 # collision with 1st kmer
+ assert nodegraph.n_unique_kmers() == 2
+ nodegraph.count('AGAC') # 00 11 00 10 = 50 # collision with 2nd kmer
+ assert nodegraph.n_unique_kmers() == 2
+
+ # use two hashtables with 11,13
+ other_nodegraph = khmer._Nodegraph(ksize, [11, 13])
+ other_nodegraph.count('AAAA') # 00 00 00 00 = 0
+
+ other_nodegraph.count('ACTG') # 00 10 01 11 = 2*16 +4 +3 = 39
+ assert other_nodegraph.n_unique_kmers() == 2
+ # 00 00 10 11 = 11 # collision with only 1st kmer
+ other_nodegraph.count('AACG')
+ assert other_nodegraph.n_unique_kmers() == 3
+ other_nodegraph.count('AGAC')
+ # 00 11 00 10 = 3*16 + 2 = 50
+ # collision with both 2nd and 3rd kmers
+
+ assert other_nodegraph.n_unique_kmers() == 3
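+ # with two tables a k-mer reads as "seen" only if it collides in
+ # both: 'AACG' (11) collides with 'AAAA' (0) mod 11 but not mod 13,
+ # so it still counts as new; 'AGAC' (50) collides mod 11 with
+ # 'ACTG' (39) and mod 13 with 'AACG', so it does not.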
+
+
+def test_filter_if_present():
+ nodegraph = khmer._Nodegraph(32, [3, 5])
+
+ maskfile = utils.get_test_data('filter-test-A.fa')
+ inputfile = utils.get_test_data('filter-test-B.fa')
+ outfile = utils.get_temp_filename('filter')
+
+ nodegraph.consume_fasta(maskfile)
+ nodegraph.filter_if_present(inputfile, outfile)
+
+ records = list(screed.open(outfile))
+ assert len(records) == 1
+ assert records[0]['name'] == '3'
+
+
+def test_combine_pe():
+ inpfile = utils.get_test_data('combine_parts_1.fa')
+ nodegraph = khmer._Nodegraph(32, [1])
+
+ nodegraph.consume_partitioned_fasta(inpfile)
+ assert nodegraph.count_partitions() == (2, 0)
+
+ first_seq = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
+ pid1 = nodegraph.get_partition_id(first_seq)
+
+ second_seq = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
+ pid2 = nodegraph.get_partition_id(second_seq)
+
+ assert pid1 == 2
+ assert pid2 == 80293
+
+ nodegraph.join_partitions(pid1, pid2)
+
+ pid1 = nodegraph.get_partition_id(first_seq)
+ pid2 = nodegraph.get_partition_id(second_seq)
+
+ assert pid1 == pid2
+ assert nodegraph.count_partitions() == (1, 0)
+
+
+def test_load_partitioned():
+ inpfile = utils.get_test_data('combine_parts_1.fa')
+ nodegraph = khmer._Nodegraph(32, [1])
+
+ nodegraph.consume_partitioned_fasta(inpfile)
+ assert nodegraph.count_partitions() == (2, 0)
+
+ first_seq = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
+ assert nodegraph.get(first_seq)
+
+ second_seq = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
+ assert nodegraph.get(second_seq)
+
+ third_s = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]
+ assert nodegraph.get(third_s)
+
+
+def test_count_within_radius_simple():
+ inpfile = utils.get_test_data('all-A.fa')
+ nodegraph = khmer._Nodegraph(4, [3, 5])
+
+ print(nodegraph.consume_fasta(inpfile))
+ n = nodegraph.count_kmers_within_radius('AAAA', 1)
+ assert n == 1
+
+ n = nodegraph.count_kmers_within_radius('AAAA', 10)
+ assert n == 1
+
+
+def test_count_within_radius_big():
+ inpfile = utils.get_test_data('random-20-a.fa')
+ nodegraph = khmer.Nodegraph(20, 1e5, 4)
+
+ nodegraph.consume_fasta(inpfile)
+ n = nodegraph.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
+ assert n == 3961, n
+
+ nodegraph = khmer.Nodegraph(21, 1e5, 4)
+ nodegraph.consume_fasta(inpfile)
+ n = nodegraph.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
+ assert n == 39
+
+
+def test_count_kmer_degree():
+ inpfile = utils.get_test_data('all-A.fa')
+ nodegraph = khmer._Nodegraph(4, [3, 5])
+ nodegraph.consume_fasta(inpfile)
+
+ assert nodegraph.kmer_degree('AAAA') == 2
+ assert nodegraph.kmer_degree('AAAT') == 1
+ assert nodegraph.kmer_degree('AATA') == 0
+ assert nodegraph.kmer_degree('TAAA') == 1
+
+
+def test_save_load_tagset():
+ nodegraph = khmer._Nodegraph(32, [1])
+
+ outfile = utils.get_temp_filename('tagset')
+
+ nodegraph.add_tag('A' * 32)
+ nodegraph.save_tagset(outfile)
+
+ nodegraph.add_tag('G' * 32)
+
+ nodegraph.load_tagset(outfile) # implicitly => clear_tags=True
+ nodegraph.save_tagset(outfile)
+
+ # if tags have been cleared, the new tagfile will be smaller
+ # (30 bytes, one tag); else larger (38 bytes, two tags).
+
+ fp = open(outfile, 'rb')
+ data = fp.read()
+ fp.close()
+ assert len(data) == 30, len(data)
+
+
+def test_save_load_tagset_noclear():
+ nodegraph = khmer._Nodegraph(32, [1])
+
+ outfile = utils.get_temp_filename('tagset')
+
+ nodegraph.add_tag('A' * 32)
+ nodegraph.save_tagset(outfile)
+
+ nodegraph.add_tag('G' * 32)
+
+ nodegraph.load_tagset(outfile, False) # clear_tags=False: keep existing tags
+ nodegraph.save_tagset(outfile)
+
+ # since tags were not cleared, the new tagfile will be larger
+ # (38 bytes, both tags); had they been cleared it would be 30 bytes.
+
+ fp = open(outfile, 'rb')
+ data = fp.read()
+ fp.close()
+ assert len(data) == 38, len(data)
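+ # both sizes are consistent with a fixed 22-byte header plus 8
+ # bytes per stored tag: 22 + 1*8 = 30 above, 22 + 2*8 = 38 here
+ # (widths inferred from the asserts, not from a format spec).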
+
+
+def test_stop_traverse():
+ filename = utils.get_test_data('random-20-a.fa')
+
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_nodegraphs = 3 # number of hashtables
+
+ nodegraph = khmer.Nodegraph(ksize, htable_size, num_nodegraphs)
+
+ # without tagging/joining across consume, this breaks into two partitions;
+ # with, it is one partition.
+ nodegraph.add_stop_tag('TTGCATACGTTGAGCCAGCG')
+
+ # DO NOT join reads across stoptags
+ nodegraph.consume_fasta_and_tag(filename)
+ subset = nodegraph.do_subset_partition(0, 0, True)
+ nodegraph.merge_subset(subset)
+
+ n, _ = nodegraph.count_partitions()
+ assert n == 2, n
+
+
+def test_tag_across_stoptraverse():
+ filename = utils.get_test_data('random-20-a.fa')
+
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_nodegraphs = 3 # number of hashtables
+
+ nodegraph = khmer.Nodegraph(ksize, htable_size, num_nodegraphs)
+
+ # without tagging/joining across consume, this breaks into two partitions;
+ # with, it is one partition.
+ nodegraph.add_stop_tag('CCGAATATATAACAGCGACG')
+
+ # DO join reads across
+ nodegraph.consume_fasta_and_tag_with_stoptags(filename)
+ subset = nodegraph.do_subset_partition(0, 0)
+ n, _ = nodegraph.count_partitions()
+ assert n == 99 # reads only connected by traversal...
+
+ n, _ = nodegraph.subset_count_partitions(subset)
+ assert n == 2 # but need main to cross stoptags.
+
+ nodegraph.merge_subset(subset)
+
+ n, _ = nodegraph.count_partitions() # ta-da!
+ assert n == 1, n
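+ # recap: before the merge each read is its own partition (99); the
+ # subset traversal alone yields 2 components because it stops at
+ # the stoptag; merging folds in the joins recorded across the
+ # stoptag during consume, collapsing everything to 1.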
+
+
+def test_notag_across_stoptraverse():
+ filename = utils.get_test_data('random-20-a.fa')
+
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_nodegraphs = 3 # number of hashtables
+
+ nodegraph = khmer.Nodegraph(ksize, htable_size, num_nodegraphs)
+
+ # connecting k-mer at the beginning/end of a read: breaks up into two.
+ nodegraph.add_stop_tag('TTGCATACGTTGAGCCAGCG')
+
+ nodegraph.consume_fasta_and_tag_with_stoptags(filename)
+
+ subset = nodegraph.do_subset_partition(0, 0)
+ nodegraph.merge_subset(subset)
+
+ n, _ = nodegraph.count_partitions()
+ assert n == 2, n
+
+
+def test_find_stoptags():
+ nodegraph = khmer._Nodegraph(5, [1])
+ nodegraph.add_stop_tag("AAAAA")
+
+ assert nodegraph.identify_stoptags_by_position("AAAAA") == [0]
+ assert nodegraph.identify_stoptags_by_position("AAAAAA") == [0, 1]
+ assert nodegraph.identify_stoptags_by_position("TTTTT") == [0]
+ assert nodegraph.identify_stoptags_by_position("TTTTTT") == [0, 1]
+
+
+def test_find_stoptags_second_seq():
+ nodegraph = khmer._Nodegraph(4, [1])
+ nodegraph.add_stop_tag("ATGC")
+
+ x = nodegraph.identify_stoptags_by_position("ATGCATGCGCAT")
+ assert x == [0, 2, 4, 8], x
+
+
+def test_get_ksize():
+ kh = khmer._Nodegraph(22, [1])
+ assert kh.ksize() == 22
+
+
+def test_get_hashsizes():
+ kh = khmer.Nodegraph(22, 100, 4)
+ # Py2/3 hack, longify converts to long in py2, remove once py2 isn't
+ # supported any longer.
+ expected = utils.longify([97, 89, 83, 79])
+ assert kh.hashsizes() == expected, kh.hashsizes()
+
+
+def test_extract_unique_paths_0():
+ kh = khmer._Nodegraph(10, [5, 7, 11, 13])
+
+ x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+ assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG']
+
+ kh.consume('ATGGAGAGACACAGATAGACAGGAGTGGCGATG')
+ x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+ assert not x
+
+
+def test_extract_unique_paths_1():
+ kh = khmer._Nodegraph(10, [5, 7, 11, 13])
+
+ kh.consume('AGTGGCGATG')
+ x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+ print(x)
+ assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGAT'] # all but the last k-mer
+
+
+def test_extract_unique_paths_2():
+ kh = khmer._Nodegraph(10, [5, 7, 11, 13])
+
+ kh.consume('ATGGAGAGAC')
+ x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+ print(x)
+ assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG'] # all but the 1st k-mer
+
+
+def test_extract_unique_paths_3():
+ kh = khmer._Nodegraph(10, [5, 7, 11, 13])
+
+ kh.consume('ATGGAGAGAC')
+ kh.consume('AGTGGCGATG')
+ x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+ print(x)
+ # all but the 1st/last k-mer
+ assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT']
+
+
+def test_extract_unique_paths_4():
+ kh = khmer.Nodegraph(10, 1e6, 4)
+
+ kh.consume('ATGGAGAGAC')
+ kh.consume('AGTGGCGATG')
+
+ kh.consume('ATAGACAGGA')
+
+ x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+ print(x)
+ assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT']
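+ # taken together, tests 0-4 suggest extract_unique_paths returns
+ # the maximal stretches of the query whose k-mers are absent from
+ # the graph: known prefixes/suffixes trim k-mers off the ends, and
+ # a known region in the middle splits the result in two.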
+
+
+def test_find_unpart():
+ filename = utils.get_test_data('random-20-a.odd.fa')
+ filename2 = utils.get_test_data('random-20-a.even.fa')
+
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_nodegraphs = 3 # number of hashtables
+
+ nodegraph = khmer.Nodegraph(ksize, htable_size, num_nodegraphs)
+ nodegraph.consume_fasta_and_tag(filename)
+
+ subset = nodegraph.do_subset_partition(0, 0)
+ nodegraph.merge_subset(subset)
+
+ n, _ = nodegraph.count_partitions()
+ assert n == 49
+
+ nodegraph.find_unpart(filename2, True, False)
+ n, _ = nodegraph.count_partitions()
+ assert n == 1, n # all sequences connect
+
+
+def test_find_unpart_notraverse():
+ filename = utils.get_test_data('random-20-a.odd.fa')
+ filename2 = utils.get_test_data('random-20-a.even.fa')
+
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_nodegraphs = 3 # number of hashtables
+
+ nodegraph = khmer.Nodegraph(ksize, htable_size, num_nodegraphs)
+ nodegraph.consume_fasta_and_tag(filename)
+
+ subset = nodegraph.do_subset_partition(0, 0)
+ nodegraph.merge_subset(subset)
+
+ n, _ = nodegraph.count_partitions()
+ assert n == 49
+
+ nodegraph.find_unpart(filename2, False, False) # <-- don't traverse
+ n, _ = nodegraph.count_partitions()
+ assert n == 99, n # all sequences disconnected
+
+
+def test_find_unpart_fail():
+ filename = utils.get_test_data('random-20-a.odd.fa')
+ filename2 = utils.get_test_data('random-20-a.odd.fa') # <- switch to odd
+
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_nodegraphs = 3 # number of hashtables
+
+ nodegraph = khmer.Nodegraph(ksize, htable_size, num_nodegraphs)
+ nodegraph.consume_fasta_and_tag(filename)
+
+ subset = nodegraph.do_subset_partition(0, 0)
+ nodegraph.merge_subset(subset)
+
+ n, _ = nodegraph.count_partitions()
+ assert n == 49
+
+ nodegraph.find_unpart(filename2, True, False)
+ n, _ = nodegraph.count_partitions()
+ assert n == 49, n # only 49 sequences worth of tags
+
+
+def test_simple_median():
+ hi = khmer.Nodegraph(6, 1e5, 2)
+
+ (median, average, stddev) = hi.get_median_count("AAAAAA")
+ print(median, average, stddev)
+ assert median == 0
+ assert average == 0.0
+ assert stddev == 0.0
+
+ hi.consume("AAAAAA")
+ (median, average, stddev) = hi.get_median_count("AAAAAA")
+ print(median, average, stddev)
+ assert median == 1
+ assert average == 1.0
+ assert stddev == 0.0
+
+
+def test_badget():
+ hbts = khmer.Nodegraph(6, 1e6, 1)
+
+ dna = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG"
+
+ hbts.consume(dna)
+
+ assert hbts.get("AGCTTT") == 1
+
+ assert hbts.get("GATGAG") == 0
+
+ try:
+ hbts.get(b"AGCTT")
+ assert 0, "this should fail"
+ except ValueError as err:
+ print(str(err))
+
+ try:
+ hbts.get(u"AGCTT")
+ assert 0, "this should fail"
+ except ValueError as err:
+ print(str(err))
+
+
+#
+
+
+def test_load_notexist_should_fail():
+ savepath = utils.get_temp_filename('tempnodegraphsave0.htable')
+
+ hi = khmer._Countgraph(12, [1])
+ try:
+ hi.load(savepath)
+ assert 0, "load should fail"
+ except OSError:
+ pass
+
+
+def test_load_truncated_should_fail():
+ inpath = utils.get_test_data('random-20-a.fa')
+ savepath = utils.get_temp_filename('tempnodegraphsave0.ct')
+
+ hi = khmer.Countgraph(12, 1000, 2)
+
+ hi.consume_fasta(inpath)
+ hi.save(savepath)
+
+ fp = open(savepath, 'rb')
+ data = fp.read()
+ fp.close()
+
+ fp = open(savepath, 'wb')
+ fp.write(data[:1000])
+ fp.close()
+
+ hi = khmer._Countgraph(12, [1])
+ try:
+ hi.load(savepath)
+ assert 0, "load should fail"
+ except OSError as e:
+ print(str(e))
+
+
+def test_save_load_tagset_notexist():
+ nodegraph = khmer._Nodegraph(32, [1])
+
+ outfile = utils.get_temp_filename('tagset')
+ try:
+ nodegraph.load_tagset(outfile)
+ assert 0, "this test should fail"
+ except OSError as e:
+ print(str(e))
+
+
+def test_save_load_tagset_trunc():
+ nodegraph = khmer._Nodegraph(32, [1])
+
+ outfile = utils.get_temp_filename('tagset')
+
+ nodegraph.add_tag('A' * 32)
+ nodegraph.add_tag('G' * 32)
+ nodegraph.save_tagset(outfile)
+
+ # truncate tagset file...
+ fp = open(outfile, 'rb')
+ data = fp.read()
+ fp.close()
+
+ for i in range(len(data)):
+ fp = open(outfile, 'wb')
+ fp.write(data[:i])
+ fp.close()
+
+ # try loading it...
+ try:
+ nodegraph.load_tagset(outfile)
+ assert 0, "this test should fail"
+ except OSError as err:
+ print(str(err), i)
+
+ # try loading it...
+ try:
+ nodegraph.load_tagset(outfile)
+ assert 0, "this test should fail"
+ except OSError:
+ pass
+
+# To build the test files used below, prefix this function's name with
+# 'test' so the test runner picks it up, then look in /tmp. You will
+# need to tweak the version info in khmer.hh in order to create "bad"
+# versions, of course. -CTB
+
+
+def _build_testfiles():
+ # nodegraph file
+
+ inpath = utils.get_test_data('random-20-a.fa')
+ hi = khmer.Nodegraph(12, 1e4, 2) # table size 1e4 is an assumed fix;
+ # the original two-argument call would raise a TypeError
+ hi.consume_fasta(inpath)
+ hi.save('/tmp/goodversion-k12.htable')
+
+ # tagset file
+
+ nodegraph = khmer._Nodegraph(32, [1])
+
+ nodegraph.add_tag('A' * 32)
+ nodegraph.add_tag('G' * 32)
+ nodegraph.save_tagset('/tmp/goodversion-k32.tagset')
+
+ # stoptags file
+
+ fakelump_fa = utils.get_test_data('fakelump.fa')
+
+ nodegraph = khmer.Nodegraph(32, 4, 4)
+ nodegraph.consume_fasta_and_tag(fakelump_fa)
+
+ subset = nodegraph.do_subset_partition(0, 0)
+ nodegraph.merge_subset(subset)
+
+ EXCURSION_DISTANCE = 40
+ EXCURSION_KMER_THRESHOLD = 82
+ EXCURSION_KMER_COUNT_THRESHOLD = 1
+ counting = khmer.Countgraph(32, 4, 4)
+
+ nodegraph.repartition_largest_partition(None, counting,
+ EXCURSION_DISTANCE,
+ EXCURSION_KMER_THRESHOLD,
+ EXCURSION_KMER_COUNT_THRESHOLD)
+
+ nodegraph.save_stop_tags('/tmp/goodversion-k32.stoptags')
+
+
+def test_hashbits_file_version_check():
+ nodegraph = khmer._Nodegraph(12, [1])
+
+ inpath = utils.get_test_data('badversion-k12.htable')
+
+ try:
+ nodegraph.load(inpath)
+ assert 0, "this should fail"
+ except OSError as e:
+ print(str(e))
+
+
+def test_nodegraph_file_type_check():
+ kh = khmer._Countgraph(12, [1])
+ savepath = utils.get_temp_filename('tempcountingsave0.ct')
+ kh.save(savepath)
+
+ nodegraph = khmer._Nodegraph(12, [1])
+
+ try:
+ nodegraph.load(savepath)
+ assert 0, "this should fail"
+ except OSError as e:
+ print(str(e))
+
+
+def test_stoptags_file_version_check():
+ nodegraph = khmer._Nodegraph(32, [1])
+
+ inpath = utils.get_test_data('badversion-k32.stoptags')
+
+ try:
+ nodegraph.load_stop_tags(inpath)
+ assert 0, "this should fail"
+ except OSError as e:
+ print(str(e))
+
+
+def test_stoptags_ksize_check():
+ nodegraph = khmer._Nodegraph(31, [1])
+
+ inpath = utils.get_test_data('goodversion-k32.stoptags')
+ try:
+ nodegraph.load_stop_tags(inpath)
+ assert 0, "this should fail"
+ except OSError as e:
+ print(str(e))
+
+
+def test_stop_tags_filetype_check():
+ nodegraph = khmer._Nodegraph(31, [1])
+
+ inpath = utils.get_test_data('goodversion-k32.tagset')
+ try:
+ nodegraph.load_stop_tags(inpath)
+ assert 0, "this should fail"
+ except OSError as e:
+ print(str(e))
+
+
+def test_tagset_file_version_check():
+ nodegraph = khmer._Nodegraph(32, [1])
+
+ inpath = utils.get_test_data('badversion-k32.tagset')
+
+ try:
+ nodegraph.load_tagset(inpath)
+ assert 0, "this should fail"
+ except OSError as e:
+ print(str(e))
+
+
+def test_stop_tags_truncate_check():
+ nodegraph = khmer._Nodegraph(32, [1])
+
+ inpath = utils.get_test_data('goodversion-k32.tagset')
+ data = open(inpath, 'rb').read()
+
+ truncpath = utils.get_temp_filename('zzz')
+ for i in range(len(data)):
+ fp = open(truncpath, 'wb')
+ fp.write(data[:i])
+ fp.close()
+
+ try:
+ nodegraph.load_stop_tags(truncpath)
+ assert 0, "expect failure of previous command"
+ except OSError as e:
+ print(i, str(e))
+
+
+def test_tagset_ksize_check():
+ nodegraph = khmer._Nodegraph(31, [1])
+
+ inpath = utils.get_test_data('goodversion-k32.tagset')
+ try:
+ nodegraph.load_tagset(inpath)
+ assert 0, "this should fail"
+ except OSError as e:
+ print(str(e))
+
+
+def test_tagset_filetype_check():
+ nodegraph = khmer._Nodegraph(31, [1])
+
+ inpath = utils.get_test_data('goodversion-k32.stoptags')
+ try:
+ nodegraph.load_tagset(inpath)
+ assert 0, "this should fail"
+ except OSError as e:
+ print(str(e))
+
+
+def test_bad_primes_list():
+ try:
+ nodegraph = khmer._Nodegraph(31, ["a", "b", "c"], 1)
+ assert 0, "Bad primes list should fail"
+ except TypeError as e:
+ print(str(e))
+
+
+def test_consume_absentfasta_with_reads_parser():
+ nodegraph = khmer._Nodegraph(31, [1])
+ try:
+ nodegraph.consume_fasta_with_reads_parser()
+ assert 0, "this should fail"
+ except TypeError as err:
+ print(str(err))
+ try:
+ readparser = ReadParser(utils.get_test_data('empty-file'))
+ nodegraph.consume_fasta_with_reads_parser(readparser)
+ assert 0, "this should fail"
+ except OSError as err:
+ print(str(err))
+ except ValueError as err:
+ print(str(err))
+
+
+def test_bad_primes():
+ try:
+ nodegraph = khmer._Nodegraph.__new__(
+ khmer._Nodegraph, 6, ["a", "b", "c"])
+ assert 0, "this should fail"
+ except TypeError as e:
+ print(str(e))
+
+
+def test_consume_fasta_and_tag_with_badreads_parser():
+ nodegraph = khmer.Nodegraph(6, 1e6, 2)
+ try:
+ readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa"))
+ nodegraph.consume_fasta_and_tag_with_reads_parser(readsparser)
+ assert 0, "this should fail"
+ except OSError as e:
+ print(str(e))
+ except ValueError as e:
+ print(str(e))
+
+
+def test_n_occupied_save_load():
+ filename = utils.get_test_data('random-20-a.fa')
+
+ nodegraph = khmer.Nodegraph(20, 100000, 3)
+
+ for _, record in enumerate(screed.open(filename)):
+ nodegraph.consume(record.sequence)
+
+ assert nodegraph.n_occupied() == 3884
+ assert nodegraph.n_unique_kmers() == 3960
+
+ savefile = utils.get_temp_filename('out')
+ nodegraph.save(savefile)
+
+ ng2 = khmer.load_nodegraph(savefile)
+ assert ng2.n_occupied() == 3884, ng2.n_occupied()
+ assert ng2.n_unique_kmers() == 0 # this is intended behavior, sigh.
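+ # n_occupied() is recomputed from the table bits on load, but the
+ # n_unique_kmers() counter is apparently not serialized with the
+ # graph, so it restarts at zero; hence the comment above.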
+
+
+def test_n_occupied_vs_countgraph():
+ filename = utils.get_test_data('random-20-a.fa')
+
+ nodegraph = khmer.Nodegraph(20, 100000, 3)
+ countgraph = khmer.Countgraph(20, 100000, 3)
+
+ assert nodegraph.n_occupied() == 0, nodegraph.n_occupied()
+ assert countgraph.n_occupied() == 0, countgraph.n_occupied()
+
+ assert nodegraph.n_unique_kmers() == 0, nodegraph.n_unique_kmers()
+ assert countgraph.n_unique_kmers() == 0, countgraph.n_unique_kmers()
+
+ for n, record in enumerate(screed.open(filename)):
+ nodegraph.consume(record.sequence)
+ countgraph.consume(record.sequence)
+
+ assert nodegraph.hashsizes() == countgraph.hashsizes()
+
+ # these are all the same -- good :).
+ assert nodegraph.n_occupied() == 3884, nodegraph.n_occupied()
+ assert countgraph.n_occupied() == 3884, countgraph.n_occupied()
+
+ assert nodegraph.n_unique_kmers() == 3960, nodegraph.n_unique_kmers()
+ assert countgraph.n_unique_kmers() == 3960, countgraph.n_unique_kmers()
+
+
+def test_n_occupied_vs_countgraph_another_size():
+ filename = utils.get_test_data('random-20-a.fa')
+
+ nodegraph = khmer.Nodegraph(20, 10000, 3)
+ countgraph = khmer.Countgraph(20, 10000, 3)
+
+ assert nodegraph.n_occupied() == 0, nodegraph.n_occupied()
+ assert countgraph.n_occupied() == 0, countgraph.n_occupied()
+
+ assert nodegraph.n_unique_kmers() == 0, nodegraph.n_unique_kmers()
+ assert countgraph.n_unique_kmers() == 0, countgraph.n_unique_kmers()
+
+ for n, record in enumerate(screed.open(filename)):
+ nodegraph.consume(record.sequence)
+ countgraph.consume(record.sequence)
+
+ assert nodegraph.hashsizes() == countgraph.hashsizes()
+
+ # these are all the same -- good :).
+ assert nodegraph.n_occupied() == 3269, nodegraph.n_occupied()
+ assert countgraph.n_occupied() == 3269, countgraph.n_occupied()
+
+ assert nodegraph.n_unique_kmers() == 3916, nodegraph.n_unique_kmers()
+ assert countgraph.n_unique_kmers() == 3916, countgraph.n_unique_kmers()
diff --git a/tests/test_normalize_by_median.py b/tests/test_normalize_by_median.py
index 41d22b4..0e620c8 100644
--- a/tests/test_normalize_by_median.py
+++ b/tests/test_normalize_by_median.py
@@ -26,12 +26,27 @@ def test_normalize_by_median_indent():
hashfile = utils.get_test_data('normC20k20.ct')
outfile = utils.get_temp_filename('paired-mixed.fa.pe.keep')
script = 'normalize-by-median.py'
- args = ['--loadtable', hashfile, '-o', outfile, infile]
+ args = ['--loadgraph', hashfile, '-o', outfile, infile]
(status, out, err) = utils.runscript(script, args)
assert status == 0, (out, err)
assert os.path.exists(outfile)
+def test_normalize_by_median_loadgraph_with_args():
+ infile = utils.get_test_data("test-abund-read-2.fa")
+ tablefile = utils.get_temp_filename("table")
+ in_dir = os.path.dirname(tablefile)
+
+ script = "load-into-counting.py"
+ args = [tablefile, infile]
+ (status, out, err) = utils.runscript(script, args)
+
+ script = "normalize-by-median.py"
+ args = ["--ksize", "7", "--loadgraph", tablefile, infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ assert 'WARNING: You are loading a saved k-mer countgraph from' in err, err
+
+
def test_normalize_by_median_empty_file():
infile = utils.get_temp_filename('empty')
shutil.copyfile(utils.get_test_data('empty-file'), infile)
@@ -69,6 +84,30 @@ def test_normalize_by_median():
assert "I/O Errors" not in err
+def test_normalize_by_median_quiet():
+ CUTOFF = '1'
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-k', '17', '--quiet', '-M', '2e6', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ assert len(out) == 0, out
+ assert len(err) == 0, err
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+ seqs = [r.sequence for r in screed.open(outfile)]
+ assert len(seqs) == 1, seqs
+ assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
+ assert "I/O Errors" not in err
+
+
def test_normalize_by_median_unpaired_final_read():
CUTOFF = '1'
@@ -93,7 +132,7 @@ def test_normalize_by_median_sanity_check_0():
script = 'normalize-by-median.py'
args = ['-U', '1024', '--max-mem', '60', infile]
(status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
- assert status != 0
+ assert status != 0, status
assert "recommended false positive ceiling of 0.1!" in err, err
@@ -120,8 +159,28 @@ def test_normalize_by_median_sanity_check_2():
args = ['-U', '83', infile]
(status, out, err) = utils.runscript(script, args, in_dir)
- assert "*** INFO: set memory ceiling using auto optimization." in err, err
- assert "*** Ceiling is: 399 bytes" in err, err
+ assert "*** INFO: set memory ceiling automatically." in err, err
+ assert "*** Ceiling is: 1e+06 bytes" in err, err
+
+
+def test_normalize_by_median_sanity_check_3():
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+ tablefile = utils.get_temp_filename('table', in_dir)
+
+ shutil.copyfile(utils.get_test_data('test-filter-abund-Ns.fq'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-s', tablefile, '-U', '83', '--fp-rate', '0.7', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ assert "Overriding default fp 0.1 with new fp: 0.7" in err, err
+
+ args = ['--loadgraph', tablefile, '-U', '83', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ assert "WARNING: You have asked that the graph size be auto" in err, err
+ assert "NOT be set automatically" in err, err
+ assert "loading an existing graph" in err, err
def test_normalize_by_median_unforced_badfile():
@@ -168,7 +227,7 @@ def test_normalize_by_median_stdout_3():
(status, out, err) = utils.runscript(script, args, in_dir)
assert 'Total number of unique k-mers: 98' in err, err
- assert 'in /dev/stdout' in err, err
+ assert 'in block device' in err, err
assert "I/O Errors" not in err
@@ -284,7 +343,7 @@ def test_normalize_by_median_unpaired_and_paired():
args = ['-C', CUTOFF, '-k', '17', '-u', unpairedfile, '-p', infile]
(status, out, err) = utils.runscript(script, args, in_dir)
- assert 'Total number of unique k-mers: 4030' in err, err
+ assert 'Total number of unique k-mers: 4061' in err, err
outfile = infile + '.keep'
assert os.path.exists(outfile), outfile
@@ -490,14 +549,14 @@ def test_normalize_by_median_no_bigcount():
counting_ht = _make_counting(infile, K=8)
script = 'normalize-by-median.py'
- args = ['-C', '1000', '-k 8', '--savetable', hashfile, infile]
+ args = ['-C', '1000', '-k 8', '--savegraph', hashfile, infile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert status == 0, (out, err)
print((out, err))
assert os.path.exists(hashfile), hashfile
- kh = khmer.load_counting_hash(hashfile)
+ kh = khmer.load_countgraph(hashfile)
assert kh.get('GGTTGACG') == 255
@@ -518,7 +577,7 @@ def test_normalize_by_median_empty():
assert os.path.exists(outfile), outfile
-def test_normalize_by_median_emptycountingtable():
+def test_normalize_by_median_emptycountgraph():
CUTOFF = '1'
infile = utils.get_temp_filename('test.fa')
@@ -527,7 +586,7 @@ def test_normalize_by_median_emptycountingtable():
shutil.copyfile(utils.get_test_data('test-empty.fa'), infile)
script = 'normalize-by-median.py'
- args = ['-C', CUTOFF, '--loadtable', infile, infile]
+ args = ['-C', CUTOFF, '--loadgraph', infile, infile]
(status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
assert status != 0
assert 'ValueError' in err, (status, out, err)
diff --git a/tests/test_oxli_functions.py b/tests/test_oxli_functions.py
index 22e4371..b0606ea 100644
--- a/tests/test_oxli_functions.py
+++ b/tests/test_oxli_functions.py
@@ -12,30 +12,34 @@ from __future__ import unicode_literals
from . import khmer_tst_utils as utils
import khmer
-from oxli import functions
+
+# Technically no longer imported from 'oxli', but these are the same functions
+from khmer.khmer_args import (estimate_optimal_with_K_and_M,
+ graphsize_args_report,
+ estimate_optimal_with_K_and_f, optimal_size)
def test_estimate_functions_1():
- res = functions.estimate_optimal_with_K_and_M(99, 1024)
+ res = estimate_optimal_with_K_and_M(99, 1024)
assert res[0] == 7, res[0]
assert res[1] == 146, res[1]
assert res[2] == 1022, res[2]
assert abs(.008 - res[3]) < .001, res[3]
- res = functions.estimate_optimal_with_K_and_f(99, 0.00701925498897)
+ res = estimate_optimal_with_K_and_f(99, 0.00701925498897)
assert res[0] == 7, res[0]
assert res[1] == 145, res[1]
assert res[2] == 1015, res[2]
assert abs(.008 - res[3]) < .002, res[3]
- res = functions.estimate_optimal_with_K_and_M(1024, 2)
+ res = estimate_optimal_with_K_and_M(1024, 2)
assert res[0] == 1, res[0]
assert res[1] == 2, res[1]
assert res[2] == 2, res[2]
assert res[3] == 1.0, res[3]
# using a crazy high FP rate just for coverage
- res = functions.estimate_optimal_with_K_and_f(1024, 0.7)
+ res = estimate_optimal_with_K_and_f(1024, 0.7)
assert res[0] == 1, res[0]
assert res[1] == 850, res[1]
assert res[2] == 850, res[2]
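+ # Background for these numbers (standard Bloom-filter sizing,
+ # assuming one byte per table entry): for N k-mers under a cap of
+ # M bytes, the table count is Z = max(1, floor(ln(2) * M / N)),
+ # each table holds H = M // Z entries, and the expected false-
+ # positive rate is (1 - exp(-N / H)) ** Z. With N=99, M=1024 that
+ # gives Z=7, H=146, Z*H=1022 bytes and fp ~ 0.007, matching the
+ # asserts above.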
@@ -43,13 +47,13 @@ def test_estimate_functions_1():
def test_estimate_functions_namedtup():
- res = functions.estimate_optimal_with_K_and_M(99, 1024)
+ res = estimate_optimal_with_K_and_M(99, 1024)
assert res.num_htables == 7, res[0]
assert res.htable_size == 146, res[1]
assert res.mem_use == 1022, res[2]
assert abs(.008 - res.fp_rate) < .001, res[3]
- res = functions.estimate_optimal_with_K_and_f(99, 0.00701925498897)
+ res = estimate_optimal_with_K_and_f(99, 0.00701925498897)
assert res.num_htables == 7, res[0]
assert res.htable_size == 145, res[1]
assert res.mem_use == 1015, res[2]
@@ -57,27 +61,27 @@ def test_estimate_functions_namedtup():
def test_optimal_size_function():
- res = functions.optimal_size(99, mem_cap=1024)
+ res = optimal_size(99, mem_cap=1024)
assert res.num_htables == 7, res[0]
assert res.htable_size == 146, res[1]
assert res.mem_use == 1022, res[2]
assert abs(.008 - res.fp_rate) < .001, res[3]
- res = functions.optimal_size(99, fp_rate=0.00701925498897)
+ res = optimal_size(99, fp_rate=0.00701925498897)
assert res.num_htables == 7, res[0]
assert res.htable_size == 145, res[1]
assert res.mem_use == 1015, res[2]
assert abs(.008 - res.fp_rate) < .002, res[3]
try:
- functions.optimal_size(99, mem_cap=1024, fp_rate=0.00701925498897)
+ optimal_size(99, mem_cap=1024, fp_rate=0.00701925498897)
assert 0, "this should fail"
except TypeError as err:
print(str(err))
assert "num_kmers and either mem_cap or fp_rate" in str(err)
try:
- functions.optimal_size(99)
+ optimal_size(99)
assert 0, "this should fail"
except TypeError as err:
print(str(err))
@@ -85,4 +89,4 @@ def test_optimal_size_function():
def test_output_gen():
- res = functions.optimal_args_output_gen(99, 0.00701925498897)
+ res = graphsize_args_report(99, 0.00701925498897)
diff --git a/tests/test_read_aligner.py b/tests/test_read_aligner.py
index adbf24e..2779cbd 100644
--- a/tests/test_read_aligner.py
+++ b/tests/test_read_aligner.py
@@ -46,13 +46,13 @@ def neq_(v1, v2):
def test_graph_attribute():
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
aligner = khmer.ReadAligner(ch, 0, 0)
assert aligner.graph is ch
def test_align_nothing():
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
read = "ACCAAGGCTCGAGATTTACC"
aligner = khmer.ReadAligner(ch, 0, 0)
@@ -68,7 +68,7 @@ def test_align_nothing():
def test_alignnocov():
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
read = "ACCTAGGTTCGACATGTACC"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(20):
@@ -83,7 +83,7 @@ def test_alignnocov():
def test_align_middle():
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
read = "TCGACAAGTCCTTGACAGAT"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(20):
@@ -100,7 +100,7 @@ def test_align_middle():
def test_align_middle_trunc():
return # @CTB
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
read = "TCGACAAGTCCTTGACAGATGGGGGG"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(20):
@@ -124,7 +124,7 @@ def test_align_middle_trunc():
def test_align_middle_trunc_2():
return # @CTB
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
read = "GGGGGGGGGGGGTCGACAAGTCCTTGACAGAT"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(20):
@@ -146,7 +146,7 @@ def test_align_middle_trunc_2():
def test_align_fwd_nothing():
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
read = "ACCAAGGCTCGAGATTTACC"
aligner = khmer.ReadAligner(ch, 0, 0)
@@ -162,7 +162,7 @@ def test_align_fwd_nothing():
def test_align_fwd_nocov():
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
read = "ACCTAGGTTCGACATGTACC"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(20):
@@ -177,7 +177,7 @@ def test_align_fwd_nocov():
def test_align_fwd_middle():
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
read = "TCGACAAGTCCTTGACAGAT"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(20):
@@ -193,7 +193,7 @@ def test_align_fwd_middle():
def test_align_fwd_middle_trunc():
return # @CTB
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
read = "TCGACAAGTCCTTGACAGATGGGGGG"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(20):
@@ -215,7 +215,7 @@ def test_align_fwd_middle_trunc():
def test_align_fwd_middle_trunc_2():
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
read = "GGGGGGGGGGGGTCGACAAGTCCTTGACAGAT"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(20):
@@ -235,7 +235,7 @@ def test_align_fwd_middle_trunc_2():
def test_align_fwd_covs_1():
K = 10
- ch = khmer.CountingHash(K, 1048576, 1)
+ ch = khmer.Countgraph(K, 1048576, 1)
read = "GTCGACAAGTCCTTGACAGAT"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(19):
@@ -258,7 +258,7 @@ def test_align_fwd_covs_1():
def test_align_fwd_covs_2():
K = 10
- ch = khmer.CountingHash(K, 1048576, 1)
+ ch = khmer.Countgraph(K, 1048576, 1)
read = "GTCGACAAGTCCTTGACAGAT"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(19):
@@ -283,7 +283,7 @@ def test_align_fwd_covs_2():
def test_align_fwd_covs_3():
K = 10
- ch = khmer.CountingHash(K, 1048576, 1)
+ ch = khmer.Countgraph(K, 1048576, 1)
read = "GTCGACAAGTCCTTGACAGAT"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(19):
@@ -309,7 +309,7 @@ def test_align_fwd_covs_3():
def test_align_fwd_covs_4():
K = 10
- ch = khmer.CountingHash(K, 1048576, 1)
+ ch = khmer.Countgraph(K, 1048576, 1)
read = "GTCGACAAGTCCTTGACAGAT"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(19):
@@ -333,7 +333,7 @@ def test_align_fwd_covs_4():
def test_align_fwd_covs_5():
K = 10
- ch = khmer.CountingHash(K, 1048576, 1)
+ ch = khmer.Countgraph(K, 1048576, 1)
read = "GTCGACAAGTCCTTGACAGAT"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(19):
@@ -357,7 +357,7 @@ def test_align_fwd_covs_5():
def test_simple_readalign():
return # @CTB
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
aligner = khmer.ReadAligner(ch, 2, 0)
for i in range(20):
ch.consume("AGAGGGAAAGCTAGGTTCGACATGTCCTTGACAGAT")
@@ -377,7 +377,7 @@ def test_simple_readalign():
def test_readalign():
return # @CTB
- ch = khmer.CountingHash(10, 1048576, 1)
+ ch = khmer.Countgraph(10, 1048576, 1)
aligner = khmer.ReadAligner(ch, 1, 0)
for i in range(20):
ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
@@ -612,7 +612,7 @@ def check_query(aligner, query):
def test_readalign_new():
return # @CTB
- ch = khmer.CountingHash(32, 1048576, 1)
+ ch = khmer.Countgraph(32, 1048576, 1)
aligner = khmer.ReadAligner(ch, 1, 0)
for seq in ht_seqs:
ch.consume(seq)
@@ -624,7 +624,7 @@ def test_readalign_new():
def test_readaligner_load():
- ct = khmer.CountingHash(32, 1048576, 1)
+ ct = khmer.Countgraph(32, 1048576, 1)
parameters_json = utils.get_test_data('readaligner-default.json')
a_aligner = khmer.ReadAligner(ct, 0, 0, filename=parameters_json)
a_scoring_matrix = a_aligner.get_scoring_matrix()
diff --git a/tests/test_sandbox_scripts.py b/tests/test_sandbox_scripts.py
index c968e5a..d33a9d0 100644
--- a/tests/test_sandbox_scripts.py
+++ b/tests/test_sandbox_scripts.py
@@ -287,3 +287,15 @@ def test_count_kmers_2_single():
out = out.splitlines()
assert 'TTGTAACCTGTGTGGGGTCG,1' in out
+
+
+def test_multirename_fasta():
+ infile1 = utils.get_temp_filename('test-multi.fa')
+ multioutfile = utils.get_temp_filename('out.fa')
+ infile2 = utils.get_temp_filename('out.fa')
+ shutil.copyfile(utils.get_test_data('test-multi.fa'), infile1)
+ shutil.copyfile(utils.get_test_data('multi-output.fa'), infile2)
+ args = ['assembly', infile1]
+ _, out, err = utils.runscript('multi-rename.py', args, sandbox=True)
+ r = open(infile2).read()
+ assert r in out
diff --git a/tests/test_script_arguments.py b/tests/test_script_arguments.py
index 25c6b01..9f1045c 100644
--- a/tests/test_script_arguments.py
+++ b/tests/test_script_arguments.py
@@ -53,9 +53,9 @@ def test_check_tablespace():
args = parser.parse_args(['-M', '1e9'])
try:
- tablesize = khmer_args.calculate_tablesize(args, 'countgraph')
- khmer.kfile.check_space_for_hashtable(outfile, tablesize,
- False, _testhook_free_space=0)
+ tablesize = khmer_args.calculate_graphsize(args, 'countgraph')
+ khmer.kfile.check_space_for_graph(outfile, tablesize,
+ False, _testhook_free_space=0)
assert 0, "this should fail"
except SystemExit as e:
print(str(e))
@@ -86,9 +86,9 @@ def test_check_tablespace_force():
args = parser.parse_args(['-M', '1e9'])
try:
- tablesize = khmer_args.calculate_tablesize(args, 'countgraph')
- khmer.kfile.check_space_for_hashtable(outfile, tablesize,
- True, _testhook_free_space=0)
+ tablesize = khmer_args.calculate_graphsize(args, 'countgraph')
+ khmer.kfile.check_space_for_graph(outfile, tablesize,
+ True, _testhook_free_space=0)
assert True, "this should pass"
except SystemExit as e:
print(str(e))
@@ -123,7 +123,8 @@ def test_check_valid_stdin_nowarn():
FakeArgparseObject = collections.namedtuple('FakeArgs',
['ksize', 'n_tables',
'max_tablesize',
- 'max_memory_usage'])
+ 'max_memory_usage',
+ 'unique_kmers'])
def test_create_countgraph_1():
@@ -132,7 +133,7 @@ def test_create_countgraph_1():
max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
max_mem = 1e7
- args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)
countgraph = khmer_args.create_countgraph(args)
expected_hashsz = utils.longify([2499997, 2499989, 2499983, 2499967])
@@ -148,7 +149,7 @@ def test_create_countgraph_2():
max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
max_mem = 1e7
- args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)
countgraph = khmer_args.create_countgraph(args, ksize=15)
assert countgraph.ksize() == 15
@@ -162,7 +163,7 @@ def test_create_countgraph_3():
max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
max_mem = 1e7
- args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)
old_stderr = sys.stderr
sys.stderr = capture = StringIO()
@@ -183,7 +184,7 @@ def test_create_countgraph_4_multiplier():
max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
max_mem = 1e7
- args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)
countgraph = khmer_args.create_countgraph(args, multiplier=2.0)
assert sum(countgraph.hashsizes()) < max_mem / 2.0, \
@@ -196,7 +197,7 @@ def test_create_nodegraph_1():
max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
max_mem = 1e7
- args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)
nodegraph = khmer_args.create_nodegraph(args)
expected_hashsz = utils.longify([19999999, 19999981, 19999963, 19999927])
@@ -214,7 +215,7 @@ def test_create_nodegraph_2():
max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
max_mem = 1e7
- args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)
nodegraph = khmer_args.create_nodegraph(args, ksize=15)
assert nodegraph.ksize() == 15
@@ -228,7 +229,7 @@ def test_create_nodegraph_3():
max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
max_mem = 1e7
- args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)
old_stderr = sys.stderr
sys.stderr = capture = StringIO()
@@ -247,20 +248,20 @@ def test_create_nodegraph_4_multiplier():
max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
max_mem = 1e7
- args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)
nodegraph = khmer_args.create_nodegraph(args, multiplier=2.0)
assert sum(nodegraph.hashsizes()) / 8.0 < max_mem / 2.0, \
sum(nodegraph.hashsizes())
-def test_report_on_config_bad_hashtype():
+def test_report_on_config_bad_graphtype():
ksize = khmer_args.DEFAULT_K
n_tables = khmer_args.DEFAULT_N_TABLES
max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
max_mem = 1e7
- args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)
try:
khmer_args.report_on_config(args, 'foograph')
@@ -277,10 +278,10 @@ def test_fail_calculate_foograph_size():
max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
max_mem = 1e7
- args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)
try:
- nodegraph = khmer_args.calculate_tablesize(args, 'foograph')
+ nodegraph = khmer_args.calculate_graphsize(args, 'foograph')
assert 0, "previous statement should fail"
except ValueError as err:
assert "unknown graph type: foograph" in str(err), str(err)
diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index ace3d89..9790f60 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -21,6 +21,7 @@ import traceback
from nose.plugins.attrib import attr
import threading
import bz2
+import gzip
import io
from . import khmer_tst_utils as utils
@@ -50,11 +51,41 @@ def test_load_into_counting():
args.extend([outfile, infile])
(status, out, err) = utils.runscript(script, args)
- assert 'Total number of unique k-mers: 83' in err, err
+ assert 'Total number of unique k-mers: 94' in err, err
assert os.path.exists(outfile)
-def test_load_into_counting_tablesize_warning():
+def test_load_into_counting_autoargs_0():
+ script = 'load-into-counting.py'
+
+ outfile = utils.get_temp_filename('table')
+ infile = utils.get_test_data('test-abund-read-2.fa')
+
+ args = ['-U', '1e7', '--fp-rate', '0.08', outfile, infile]
+ (status, out, err) = utils.runscript(script, args)
+
+ assert os.path.exists(outfile)
+ assert 'INFO: Overriding default fp 0.1 with new fp: 0.08' in err, err
+ assert ' tablesize is too small!' in err, err
+ assert 'Estimated FP rate with current config is: 0.9999546' in err, err
+ assert 'Recommended tablesize is: 1.77407e+07 bytes' in err, err
+
+
+def test_load_into_counting_autoargs_1():
+ script = 'load-into-counting.py'
+
+ outfile = utils.get_temp_filename('table')
+ infile = utils.get_test_data('test-abund-read-2.fa')
+
+ args = ['-U', '1e7', '--max-tablesize', '3e7', outfile, infile]
+ (status, out, err) = utils.runscript(script, args)
+
+ assert os.path.exists(outfile)
+ assert "Ceiling is: 4.80833e+07 bytes" in err, err
+ assert "set memory ceiling automatically." in err, err
+
+
+def test_load_into_count_graphsize_warning():
script = 'load-into-counting.py'
args = ['-k', '20']
@@ -81,7 +112,7 @@ def test_load_into_counting_max_memory_usage_parameter():
assert os.path.exists(outfile)
assert "WARNING: tablesize is default!" not in err
- kh = khmer.load_counting_hash(outfile)
+ kh = khmer.load_countgraph(outfile)
assert sum(kh.hashsizes()) < 3e8
@@ -95,7 +126,7 @@ def test_load_into_counting_abundance_dist_nobig():
args.extend([outfile, infile])
(status, out, err) = utils.runscript(script, args)
- assert 'Total number of unique k-mers: 83' in err, err
+ assert 'Total number of unique k-mers: 94' in err, err
assert os.path.exists(outfile)
htfile = outfile
@@ -107,6 +138,43 @@ def test_load_into_counting_abundance_dist_nobig():
assert 'bigcount' in err, err
+def test_load_into_counting_abundance_dist_squashing():
+ graphfile = utils.get_temp_filename('out.ct')
+ infile = utils.get_test_data('test-abund-read-2.fa')
+
+ args = [graphfile, infile]
+ script = 'load-into-counting.py'
+ utils.runscript(script, args)
+
+ histogram = utils.get_temp_filename('histogram')
+ infile = utils.get_test_data('test-abund-read-2.fa')
+ args = [graphfile, infile, histogram]
+
+ script = 'abundance-dist.py'
+ # make histogram
+ (status, out, err) = utils.runscript(script, args)
+ assert os.path.exists(histogram)
+ # attempt to overwrite histogram; fail
+ failed = True
+ try:
+ (status, out, err) = utils.runscript(script, args)
+ failed = False
+ except AssertionError as error:
+ assert "exists; not squashing" in str(error), str(error)
+
+ assert failed, "Expected to fail"
+ # attempt to overwrite with squashing; should work
+ args = ['-s', graphfile, infile, histogram]
+ (status, out, err) = utils.runscript(script, args)
+ assert "squashing existing file" in err, err
+
+ histfile = open(histogram, 'r')
+ lines = histfile.readlines()
+ # strip trailing whitespace before comparing
+ assert lines[1].strip() == "0,0,0,0.0", lines[1]
+ assert lines[2].strip() == "1,83,83,1.0", lines[2]
+
+
def test_load_into_counting_nonwritable():
script = 'load-into-counting.py'
args = ['-x', '1e3', '-N', '2', '-k', '20']
@@ -400,12 +468,12 @@ def test_filter_abund_1_singlefile():
def test_filter_abund_2_singlefile():
infile = utils.get_temp_filename('test.fa')
in_dir = os.path.dirname(infile)
- tabfile = utils.get_temp_filename('test-savetable.ct')
+ tabfile = utils.get_temp_filename('test-savegraph.ct')
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
script = 'filter-abund-single.py'
- args = ['-x', '1e7', '-N', '2', '-k', '17', '--savetable',
+ args = ['-x', '1e7', '-N', '2', '-k', '17', '--savegraph',
tabfile, infile]
(status, out, err) = utils.runscript(script, args, in_dir)
@@ -511,7 +579,7 @@ def test_filter_abund_7_retain_Ns():
infile = utils.get_temp_filename('test.fq')
in_dir = os.path.dirname(infile)
- # copy test file over to test.fq & load into counting table
+ # copy test file over to test.fq & load into countgraph
shutil.copyfile(utils.get_test_data('test-filter-abund-Ns.fq'), infile)
counting_ht = _make_counting(infile, K=17)
@@ -545,7 +613,7 @@ def test_filter_abund_single_8_retain_Ns():
infile = utils.get_temp_filename('test.fq')
in_dir = os.path.dirname(infile)
- # copy test file over to test.fq & load into counting table
+ # copy test file over to test.fq & load into countgraph
shutil.copyfile(utils.get_test_data('test-filter-abund-Ns.fq'), infile)
script = 'filter-abund-single.py'
@@ -581,7 +649,7 @@ def test_filter_stoptags():
# now, create a file with some stop tags in it --
K = 18
- kh = khmer._Hashbits(K, [1])
+ kh = khmer._Nodegraph(K, [1])
kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
kh.save_stop_tags(stopfile)
del kh
@@ -612,7 +680,7 @@ def test_filter_stoptags_fq():
# now, create a file with some stop tags in it --
K = 18
- kh = khmer._Hashbits(K, [1])
+ kh = khmer._Nodegraph(K, [1])
kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
kh.save_stop_tags(stopfile)
del kh
@@ -710,14 +778,14 @@ def test_load_graph():
assert 'Total number of unique k-mers: 3960' in err, err
- ht_file = outfile + '.pt'
+ ht_file = outfile
assert os.path.exists(ht_file), ht_file
tagset_file = outfile + '.tagset'
assert os.path.exists(tagset_file), tagset_file
try:
- ht = khmer.load_hashbits(ht_file)
+ ht = khmer.load_nodegraph(ht_file)
except OSError as err:
assert 0, str(err)
ht.load_tagset(tagset_file)
@@ -743,13 +811,45 @@ def test_oxli_build_graph():
assert 'Total number of unique k-mers: 3960' in err, err
- ht_file = outfile + '.pt'
+ ht_file = outfile
assert os.path.exists(ht_file), ht_file
tagset_file = outfile + '.tagset'
assert os.path.exists(tagset_file), tagset_file
- ht = khmer.load_hashbits(ht_file)
+ ht = khmer.load_nodegraph(ht_file)
+ ht.load_tagset(tagset_file)
+
+ # check to make sure we get the expected result for this data set
+ # upon partitioning (all in one partition). This is kind of a
+ # roundabout way of checking that build-graph worked :)
+ subset = ht.do_subset_partition(0, 0)
+ x = ht.subset_count_partitions(subset)
+ assert x == (1, 0), x
+
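Condensed, the load-and-partition smoke check used throughout this file is
roughly the following (paths are placeholders for the script's output):

    import khmer

    ng = khmer.load_nodegraph('out')        # graph written by build-graph
    ng.load_tagset('out.tagset')
    subset = ng.do_subset_partition(0, 0)   # partition the entire graph
    n_partitions, n_unassigned = ng.subset_count_partitions(subset)
    assert n_partitions == 1                # random-20-a.fa is one component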
+
+def test_oxli_build_graph_unique_kmers_arg():
+ script = 'oxli'
+ args = ['build-graph', '-x', '1e7', '-N', '2', '-k', '20', '-U', '3960']
+
+ outfile = utils.get_temp_filename('out')
+ infile = utils.get_test_data('random-20-a.fa')
+
+ args.extend([outfile, infile])
+
+ (status, out, err) = utils.runscript(script, args)
+
+ assert 'Total number of unique k-mers: 3960' in err, err
+ assert 'INFO: set memory ceiling automatically' in err, err
+ assert 'Ceiling is: 1e+06 bytes' in err, err
+
+ ht_file = outfile
+ assert os.path.exists(ht_file), ht_file
+
+ tagset_file = outfile + '.tagset'
+ assert os.path.exists(tagset_file), tagset_file
+
+ ht = khmer.load_nodegraph(ht_file)
ht.load_tagset(tagset_file)
# check to make sure we get the expected result for this data set
@@ -778,13 +878,13 @@ def test_load_graph_no_tags():
utils.runscript(script, args)
- ht_file = outfile + '.pt'
+ ht_file = outfile
assert os.path.exists(ht_file), ht_file
tagset_file = outfile + '.tagset'
assert not os.path.exists(tagset_file), tagset_file
- assert khmer.load_hashbits(ht_file)
+ assert khmer.load_nodegraph(ht_file)
# can't think of a good way to make sure this worked, beyond just
# loading the ht file...
@@ -801,13 +901,13 @@ def test_oxli_build_graph_no_tags():
utils.runscript(script, args)
- ht_file = outfile + '.pt'
+ ht_file = outfile
assert os.path.exists(ht_file), ht_file
tagset_file = outfile + '.tagset'
assert not os.path.exists(tagset_file), tagset_file
- assert khmer.load_hashbits(ht_file)
+ assert khmer.load_nodegraph(ht_file)
# can't think of a good way to make sure this worked, beyond just
# loading the ht file...
@@ -852,7 +952,7 @@ def test_load_graph_write_fp():
(status, out, err) = utils.runscript(script, args)
- ht_file = outfile + '.pt'
+ ht_file = outfile
assert os.path.exists(ht_file), ht_file
info_file = outfile + '.info'
@@ -875,7 +975,7 @@ def test_oxli_build_graph_write_fp():
(status, out, err) = utils.runscript(script, args)
- ht_file = outfile + '.pt'
+ ht_file = outfile
assert os.path.exists(ht_file), ht_file
info_file = outfile + '.info'
@@ -921,11 +1021,11 @@ def test_load_graph_max_memory_usage_parameter():
assert 'Total number of unique k-mers: 3960' in err, err
- ht_file = outfile + '.pt'
+ ht_file = outfile
assert os.path.exists(ht_file), ht_file
try:
- ht = khmer.load_hashbits(ht_file)
+ ht = khmer.load_nodegraph(ht_file)
except OSError as err:
assert 0, str(err)
@@ -946,7 +1046,7 @@ def _make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20,
utils.runscript(script, args)
- ht_file = outfile + '.pt'
+ ht_file = outfile
assert os.path.exists(ht_file), ht_file
tagset_file = outfile + '.tagset'
@@ -1044,7 +1144,7 @@ def test_partition_graph_1():
final_pmap_file = graphbase + '.pmap.merged'
assert os.path.exists(final_pmap_file)
- ht = khmer.load_hashbits(graphbase + '.pt')
+ ht = khmer.load_nodegraph(graphbase)
ht.load_tagset(graphbase + '.tagset')
ht.load_partitionmap(final_pmap_file)
@@ -1068,7 +1168,7 @@ def test_partition_graph_nojoin_k21():
final_pmap_file = graphbase + '.pmap.merged'
assert os.path.exists(final_pmap_file)
- ht = khmer.load_hashbits(graphbase + '.pt')
+ ht = khmer.load_nodegraph(graphbase)
ht.load_tagset(graphbase + '.tagset')
ht.load_partitionmap(final_pmap_file)
@@ -1081,7 +1181,7 @@ def test_partition_graph_nojoin_stoptags():
graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))
# add in some stop tags
- ht = khmer.load_hashbits(graphbase + '.pt')
+ ht = khmer.load_nodegraph(graphbase)
ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
stoptags_file = graphbase + '.stoptags'
ht.save_stop_tags(stoptags_file)
@@ -1100,7 +1200,7 @@ def test_partition_graph_nojoin_stoptags():
final_pmap_file = graphbase + '.pmap.merged'
assert os.path.exists(final_pmap_file)
- ht = khmer.load_hashbits(graphbase + '.pt')
+ ht = khmer.load_nodegraph(graphbase)
ht.load_tagset(graphbase + '.tagset')
ht.load_partitionmap(final_pmap_file)
@@ -1115,7 +1215,7 @@ def test_partition_graph_big_traverse():
final_pmap_file = graphbase + '.pmap.merged'
assert os.path.exists(final_pmap_file)
- ht = khmer.load_hashbits(graphbase + '.pt')
+ ht = khmer.load_nodegraph(graphbase)
ht.load_tagset(graphbase + '.tagset')
ht.load_partitionmap(final_pmap_file)
@@ -1131,7 +1231,7 @@ def test_partition_graph_no_big_traverse():
final_pmap_file = graphbase + '.pmap.merged'
assert os.path.exists(final_pmap_file)
- ht = khmer.load_hashbits(graphbase + '.pt')
+ ht = khmer.load_nodegraph(graphbase)
ht.load_tagset(graphbase + '.tagset')
ht.load_partitionmap(final_pmap_file)
@@ -1155,6 +1255,27 @@ def test_partition_find_knots_execute():
assert os.path.exists(stoptags_file)
+def test_partition_find_knots_existing_stoptags():
+ graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))
+
+ script = 'partition-graph.py'
+ args = [graphbase]
+ utils.runscript(script, args)
+
+ script = 'make-initial-stoptags.py'
+ args = [graphbase]
+ utils.runscript(script, args)
+
+ script = 'find-knots.py'
+ args = [graphbase]
+ (status, out, err) = utils.runscript(script, args)
+
+ stoptags_file = graphbase + '.stoptags'
+ assert os.path.exists(stoptags_file)
+ assert "loading stoptags" in err, err
+ assert "these output stoptags will include the already" in err, err
+
+
def test_annotate_partitions():
seqfile = utils.get_test_data('random-20-a.fa')
graphbase = _make_graph(seqfile, do_partition=True)
@@ -1557,16 +1678,16 @@ def test_abundance_dist_single_nosquash():
assert line == '1001,2,98,1.0', line
-def test_abundance_dist_single_savetable():
+def test_abundance_dist_single_savegraph():
infile = utils.get_temp_filename('test.fa')
outfile = utils.get_temp_filename('test.dist')
- tabfile = utils.get_temp_filename('test-savetable.ct')
+ tabfile = utils.get_temp_filename('test-savegraph.ct')
in_dir = os.path.dirname(infile)
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
script = 'abundance-dist-single.py'
- args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '--savetable',
+ args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '--savegraph',
tabfile, infile, outfile]
utils.runscript(script, args, in_dir)
@@ -1634,6 +1755,33 @@ def test_do_partition_2_fq():
assert '46 1::FIZ' in names
+def test_interleave_read_stdout():
+ # create input files
+ infile1 = utils.get_test_data('paired-slash1.fq.1')
+ infile2 = utils.get_test_data('paired-slash1.fq.2')
+
+ # correct output
+ ex_outfile = utils.get_test_data('paired-slash1.fq')
+
+ # actual output file
+ outfile = utils.get_temp_filename('out.fq')
+
+ script = 'interleave-reads.py'
+ args = [infile1, infile2]
+
+ (status, out, err) = utils.runscript(script, args)
+
+ with open(outfile, 'w') as ofile:
+ ofile.write(out)
+
+ n = 0
+ for r, q in zip(screed.open(ex_outfile), screed.open(outfile)):
+ n += 1
+ assert r.name == q.name
+ assert r.sequence == q.sequence
+ assert n > 0
+
+
def test_interleave_read_seq1_fq():
# create input files
infile1 = utils.get_test_data('paired-slash1.fq.1')
@@ -1658,6 +1806,30 @@ def test_interleave_read_seq1_fq():
assert n > 0
+def test_interleave_read_badleft_badright():
+ # create input files
+ infile1 = utils.get_test_data('paired-broken.fq.badleft')
+ infile2 = utils.get_test_data('paired-broken.fq.badright')
+
+ # correct output
+ ex_outfile = utils.get_test_data('paired-broken.fq.paired_bad')
+
+ # actual output file
+ outfile = utils.get_temp_filename('out.fq')
+
+ script = 'interleave-reads.py'
+ args = [infile1, infile2, '-o', outfile]
+
+ utils.runscript(script, args)
+
+ n = 0
+ for r, q in zip(screed.open(ex_outfile), screed.open(outfile)):
+ n += 1
+ assert r.name == q.name
+ assert r.sequence == q.sequence
+ assert n > 0
+
+
def test_interleave_reads_1_fq():
# test input files
infile1 = utils.get_test_data('paired.fq.1')
@@ -1800,6 +1972,40 @@ def test_make_initial_stoptags():
assert os.path.exists(outfile1), outfile1
+def test_make_initial_stoptags_load_stoptags():
+ # generate the input files with load-graph.py instead of shipping them,
+ # which keeps the test_data directory small; this assumes, of course,
+ # that load-graph itself works properly
+ bzinfile = utils.get_temp_filename('test-reads.fq.bz2')
+ shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile)
+ in_dir = os.path.dirname(bzinfile)
+
+ genscript = 'load-graph.py'
+ genscriptargs = ['test-reads', 'test-reads.fq.bz2']
+ utils.runscript(genscript, genscriptargs, in_dir)
+
+ # test input files generated by load-graph above
+ infile = utils.get_temp_filename('test-reads', in_dir)
+ infile2 = utils.get_temp_filename('test-reads.tagset', in_dir)
+
+ # get file to compare against
+ ex_outfile = utils.get_test_data('test-reads.stoptags')
+
+ # actual output file
+ outfile1 = utils.get_temp_filename('test-reads.stoptags', in_dir)
+
+ script = 'make-initial-stoptags.py'
+ # make-initial-stoptags takes the graph basename, not explicit file
+ # paths; read the script before modifying these arguments
+ args = ['test-reads']
+
+ utils.runscript(script, args, in_dir)
+ args = ['test-reads', '--stoptags', 'test-reads.stoptags']
+ utils.runscript(script, args, in_dir)
+ assert os.path.exists(outfile1), outfile1
+
+
def test_extract_paired_reads_1_fa():
# test input file
infile = utils.get_test_data('paired-mixed.fa')
@@ -1885,7 +2091,7 @@ def test_extract_paired_reads_3_output_dir():
out_dir = utils.get_temp_filename('output')
script = 'extract-paired-reads.py'
- args = [infile, '-o', out_dir]
+ args = [infile, '-d', out_dir]
utils.runscript(script, args)
@@ -2092,11 +2298,11 @@ def test_split_paired_reads_2_mixed_fq_require_pair():
in_dir = os.path.dirname(infile)
script = 'split-paired-reads.py'
- args = ['-p', infile]
+ args = [infile]
status, out, err = utils.runscript(script, args, in_dir, fail_ok=True)
- assert status == 1
- assert "is not part of a pair" in err
+ assert status == 1, status
+ assert "Unpaired reads found" in err
def test_split_paired_reads_2_stdin_no_out():
@@ -2115,11 +2321,67 @@ def test_split_paired_reads_2_mixed_fq():
in_dir = os.path.dirname(infile)
script = 'split-paired-reads.py'
- args = [infile]
+ args = ['-0', '/dev/null', infile]
+
+ status, out, err = utils.runscript(script, args, in_dir)
+ assert status == 0
+ assert "split 6 sequences (3 left, 3 right, 5 orphans)" in err, err
+
+
+def test_split_paired_reads_2_mixed_fq_orphans_to_file():
+ # test input file
+ infile = utils.get_temp_filename('test.fq')
+ shutil.copyfile(utils.get_test_data('paired-mixed-2.fq'), infile)
+ in_dir = os.path.dirname(infile)
+ outfile = utils.get_temp_filename('out.fq')
+
+ script = 'split-paired-reads.py'
+ args = ['-0', outfile, infile]
+
+ status, out, err = utils.runscript(script, args, in_dir)
+ assert status == 0
+ assert "split 6 sequences (3 left, 3 right, 5 orphans)" in err, err
+
+ n_orphans = len([1 for record in screed.open(outfile)])
+ assert n_orphans == 5
+ n_left = len([1 for record in screed.open(infile + '.1')])
+ assert n_left == 3
+ n_right = len([1 for record in screed.open(infile + '.2')])
+ assert n_right == 3
+ for filename in [outfile, infile + '.1', infile + '.2']:
+ fp = gzip.open(filename)
+ try:
+ fp.read()
+ except IOError as e:
+ assert "Not a gzipped file" in str(e), str(e)
+ fp.close()
+
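The try/except idiom above asserts that the outputs are *not* gzipped; an
equivalent check is to inspect the two-byte gzip magic number directly
(is_gzipped is a hypothetical helper, not part of khmer):

    def is_gzipped(path):
        # every gzip stream begins with the magic bytes 1f 8b
        with open(path, 'rb') as fp:
            return fp.read(2) == b'\x1f\x8b'

    # e.g. assert not is_gzipped(outfile)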
+
+def test_split_paired_reads_2_mixed_fq_gzfile():
+ # test input file
+ infile = utils.get_temp_filename('test.fq')
+ shutil.copyfile(utils.get_test_data('paired-mixed-2.fq'), infile)
+ in_dir = os.path.dirname(infile)
+ outfile = utils.get_temp_filename('out.fq')
+
+ script = 'split-paired-reads.py'
+ args = ['-0', outfile, '--gzip', infile]
status, out, err = utils.runscript(script, args, in_dir)
assert status == 0
- assert "split 11 sequences (7 left, 4 right)" in err, err
+ assert "split 6 sequences (3 left, 3 right, 5 orphans)" in err, err
+
+ n_orphans = len([1 for record in screed.open(outfile)])
+ assert n_orphans == 5
+ n_left = len([1 for record in screed.open(infile + '.1')])
+ assert n_left == 3
+ n_right = len([1 for record in screed.open(infile + '.2')])
+ assert n_right == 3
+
+ for filename in [outfile, infile + '.1', infile + '.2']:
+ fp = gzip.open(filename)
+ fp.read() # this will fail if the file is not gzipped
+ fp.close()
def test_split_paired_reads_2_mixed_fq_broken_pairing_format():
@@ -2133,7 +2395,7 @@ def test_split_paired_reads_2_mixed_fq_broken_pairing_format():
status, out, err = utils.runscript(script, args, in_dir, fail_ok=True)
assert status == 1
- assert "Unrecognized format" in err
+ assert "Unpaired reads found starting at 895:1:37:17593:9954" in err, err
def test_split_paired_reads_3_output_dir():
@@ -2223,7 +2485,7 @@ def test_split_paired_reads_3_output_files_left():
outfile2 = utils.get_temp_filename('paired.fq.2', output_dir)
script = 'split-paired-reads.py'
- args = ['-o', output_dir, '-1', outfile1, infile]
+ args = ['-d', output_dir, '-1', outfile1, infile]
utils.runscript(script, args)
@@ -2260,7 +2522,7 @@ def test_split_paired_reads_3_output_files_right():
outfile2 = utils.get_temp_filename('yyy', output_dir)
script = 'split-paired-reads.py'
- args = ['-2', outfile2, '-o', output_dir, infile]
+ args = ['-2', outfile2, '-d', output_dir, infile]
utils.runscript(script, args)
@@ -2372,6 +2634,52 @@ def test_sample_reads_randomly_force_single():
assert seqs == answer
+def test_sample_reads_randomly_force_single_outfile():
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-reads.fa'), infile)
+
+ script = 'sample-reads-randomly.py'
+ # fix random number seed for reproducibility
+ args = ['-N', '10', '-M', '12000', '-R', '1', '--force_single', '-o',
+ in_dir + '/randreads.out']
+
+ args.append(infile)
+ utils.runscript(script, args, in_dir)
+
+ outfile = in_dir + '/randreads.out'
+ assert os.path.exists(outfile), outfile
+
+ seqs = set([r.name for r in screed.open(outfile)])
+ print(list(sorted(seqs)))
+
+ if sys.version_info.major == 2:
+ answer = {'850:2:1:2399:20086/2',
+ '850:2:1:2273:13309/1',
+ '850:2:1:2065:16816/1',
+ '850:2:1:1984:7162/2',
+ '850:2:1:2691:14602/1',
+ '850:2:1:1762:5439/1',
+ '850:2:1:2503:4494/2',
+ '850:2:1:2263:11143/2',
+ '850:2:1:1792:15774/2',
+ '850:2:1:2084:17145/1'}
+ else:
+ answer = {'850:2:1:1199:4197/1',
+ '850:2:1:1251:16575/2',
+ '850:2:1:1267:6790/2',
+ '850:2:1:1601:4443/1',
+ '850:2:1:1625:19325/1',
+ '850:2:1:1832:14607/2',
+ '850:2:1:1946:20852/2',
+ '850:2:1:2401:4896/2',
+ '850:2:1:2562:1308/1',
+ '850:2:1:3123:15968/2'}
+
+ assert seqs == answer
+
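Two answer sets are needed because Python 3 reimplemented the index
selection in random.sample() (a getrandbits-based _randbelow instead of
scaling random()), so the same seed picks different reads on each major
version; a quick way to see the divergence:

    import random

    rng = random.Random(1)
    # random.random() is seed-stable across 2.x and 3.x (same Mersenne
    # Twister), but sample() derives indices differently in Python 3,
    # hence the two expected read sets above.
    print(rng.sample(range(25000), 10))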
+
def test_sample_reads_randomly_fq():
infile = utils.get_temp_filename('test.fq.gz')
in_dir = os.path.dirname(infile)
@@ -2441,7 +2749,7 @@ def test_fastq_to_fasta():
args = [clean_infile, '-n', '-o', clean_outfile]
(status, out, err) = utils.runscript(script, args, in_dir)
- assert len(out.splitlines()) == 2, len(out.splitlines())
+ assert len(out.splitlines()) == 0, len(out.splitlines())
assert "No lines dropped" in err
names = [r.name for r in screed.open(clean_outfile)]
@@ -2449,29 +2757,131 @@ def test_fastq_to_fasta():
args = [n_infile, '-n', '-o', n_outfile]
(status, out, err) = utils.runscript(script, args, in_dir_n)
- assert len(out.splitlines()) == 2
+ assert len(out.splitlines()) == 0
assert "No lines dropped" in err
args = [clean_infile, '-o', clean_outfile]
(status, out, err) = utils.runscript(script, args, in_dir)
- assert len(out.splitlines()) == 2
+ assert len(out.splitlines()) == 0
assert "0 lines dropped" in err
args = [n_infile, '-o', n_outfile]
(status, out, err) = utils.runscript(script, args, in_dir_n)
- assert len(out.splitlines()) == 2, out
+ assert len(out.splitlines()) == 0, out
assert "4 lines dropped" in err, err
args = [clean_infile]
(status, out, err) = utils.runscript(script, args, in_dir)
- assert len(out.splitlines()) > 2
+ assert len(out.splitlines()) > 0
assert "0 lines dropped" in err
args = [n_infile]
(status, out, err) = utils.runscript(script, args, in_dir_n)
- assert len(out.splitlines()) > 2
+ assert len(out.splitlines()) > 0
assert "4 lines dropped" in err
+ args = [clean_infile, '-o', clean_outfile, '--gzip']
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ assert len(out.splitlines()) == 0
+ assert "0 lines dropped" in err
+
+ args = [clean_infile, '-o', clean_outfile, '--bzip']
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ assert len(out.splitlines()) == 0
+ assert "0 lines dropped" in err
+
+
+def test_fastq_to_fasta_streaming_compressed_gzip():
+
+ script = 'fastq-to-fasta.py'
+ infile = utils.get_temp_filename('test-clean.fq')
+ in_dir = os.path.dirname(infile)
+ fifo = utils.get_temp_filename('fifo')
+ copyfilepath = utils.get_temp_filename('copied.fa.gz', in_dir)
+ shutil.copyfile(utils.get_test_data('test-reads.fq.gz'), infile)
+
+ # make a fifo to simulate streaming
+ os.mkfifo(fifo)
+ args = ['--gzip', '-o', fifo, infile]
+ # FIFOs MUST BE OPENED FOR READING BEFORE THEY ARE WRITTEN TO
+ # If this isn't done, they will BLOCK and things will hang.
+ thread = threading.Thread(target=utils.runscript,
+ args=(script, args, in_dir))
+ thread.start()
+ copyfile = io.open(copyfilepath, 'wb')
+ fifofile = io.open(fifo, 'rb')
+
+ # read binary to handle compressed files
+ chunk = fifofile.read(8192)
+ while len(chunk) > 0:
+ copyfile.write(chunk)
+ chunk = fifofile.read(8192)
+
+ fifofile.close()
+ thread.join()
+ copyfile.close()
+
+ # verify that the seqs are there and not broken
+ f = screed.open(copyfilepath)
+ count = 0
+ for _ in f:
+ count += 1
+
+ assert count == 25000, count
+ f.close()
+
+ # verify we're looking at a gzipped file
+ gzfile = io.open(file=copyfilepath, mode='rb', buffering=8192)
+ magic = b"\x1f\x8b\x08" # gzip magic signature
+ file_start = gzfile.peek(len(magic))
+ assert file_start[:3] == magic, file_start[:3]
+
+
+def test_fastq_to_fasta_streaming_compressed_bzip():
+
+ script = 'fastq-to-fasta.py'
+ infile = utils.get_temp_filename('test-clean.fq')
+ in_dir = os.path.dirname(infile)
+ fifo = utils.get_temp_filename('fifo')
+ copyfilepath = utils.get_temp_filename('copied.fa.bz2', in_dir)
+ shutil.copyfile(utils.get_test_data('test-reads.fq.gz'), infile)
+
+ # make a fifo to simulate streaming
+ os.mkfifo(fifo)
+ args = ['--bzip', '-o', fifo, infile]
+ # FIFOs MUST BE OPENED FOR READING BEFORE THEY ARE WRITTEN TO
+ # If this isn't done, they will BLOCK and things will hang.
+ thread = threading.Thread(target=utils.runscript,
+ args=(script, args, in_dir))
+ thread.start()
+ copyfile = io.open(copyfilepath, 'wb')
+ fifofile = io.open(fifo, 'rb')
+
+ # read binary to handle compressed files
+ chunk = fifofile.read(8192)
+ while len(chunk) > 0:
+ copyfile.write(chunk)
+ chunk = fifofile.read(8192)
+
+ fifofile.close()
+ thread.join()
+ copyfile.close()
+
+ # verify that the seqs are there and not broken
+ f = screed.open(copyfilepath)
+ count = 0
+ for _ in f:
+ count += 1
+
+ assert count == 25000, count
+ f.close()
+
+ # verify we're looking at a bzip2-compressed file
+ bzfile = io.open(file=copyfilepath, mode='rb', buffering=8192)
+ magic = b"\x42\x5a\x68" # bzip magic signature
+ file_start = bzfile.peek(len(magic))
+ assert file_start[:3] == magic, file_start[:3]
+
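The FIFO open-ordering constraint these two tests work around can be
demonstrated in isolation; a minimal POSIX-only sketch mirroring the
writer-in-a-thread pattern used above:

    import os
    import tempfile
    import threading

    fifo = os.path.join(tempfile.mkdtemp(), 'fifo')
    os.mkfifo(fifo)

    def writer():
        # open() for writing blocks until a reader opens the FIFO --
        # hence the tests launch the writing script in a thread first
        with open(fifo, 'wb') as w:
            w.write(b'payload')

    t = threading.Thread(target=writer)
    t.start()
    with open(fifo, 'rb') as r:  # opening for reading unblocks the writer
        assert r.read() == b'payload'
    t.join()
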
def test_extract_long_sequences_fa():
@@ -2600,47 +3010,6 @@ def test_sample_reads_randomly_S():
assert seqs == answer
-def test_count_overlap_invalid_datafile():
- seqfile1 = utils.get_temp_filename('test-overlap1.fa')
- in_dir = os.path.dirname(seqfile1)
- shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
- htfile = _make_graph(seqfile1, ksize=20)
- outfile = utils.get_temp_filename('overlap.out', in_dir)
- script = 'count-overlap.py'
- args = ['--ksize', '20', '--n_tables', '2', '--max-tablesize', '10000000',
- htfile + '.pt', htfile + '.pt', outfile]
- (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
- assert status != 0
- assert "OSError" in err
-
-
-def test_count_overlap_csv():
- seqfile1 = utils.get_temp_filename('test-overlap1.fa')
- in_dir = os.path.dirname(seqfile1)
- seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
- outfile = utils.get_temp_filename('overlap.out', in_dir)
- curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
- shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
- shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
- htfile = _make_graph(seqfile1, ksize=20)
- script = 'count-overlap.py'
- args = ['--ksize', '20', '--n_tables', '2', '--max-tablesize',
- '10000000', htfile + '.pt', seqfile2, outfile]
- (status, out, err) = utils.runscript(script, args, in_dir)
- assert status == 0
- assert os.path.exists(outfile), outfile
- data = [x.strip() for x in open(outfile)]
- data = set(data)
- assert '# of unique k-mers in dataset2: 759020' in data
- assert '# of overlap unique k-mers: 245547' in data
- assert os.path.exists(curvefile), curvefile
- data = [x.strip() for x in open(curvefile)]
- data = set(data)
- assert '178630,1134' in data, data
- assert '496280,2904' in data
- assert '752031,238558' in data
-
-
def execute_streaming_diginorm(ifilename):
'''Helper function for the matrix of streaming tests for read_parser
using diginorm, i.e. uncompressed fasta, gzip fasta, bz2 fasta,
@@ -2702,13 +3071,13 @@ def _execute_load_graph_streaming(filename):
assert 'Total number of unique k-mers: 3960' in err, err
- ht_file = os.path.join(in_dir, 'out.pt')
+ ht_file = os.path.join(in_dir, 'out')
assert os.path.exists(ht_file), ht_file
tagset_file = os.path.join(in_dir, 'out.tagset')
assert os.path.exists(tagset_file), tagset_file
- ht = khmer.load_hashbits(ht_file)
+ ht = khmer.load_nodegraph(ht_file)
ht.load_tagset(tagset_file)
# check to make sure we get the expected result for this data set
@@ -3099,7 +3468,7 @@ def test_trim_low_abund_trimtest_after_load():
args = ["-k", "17", "-x", "1e7", "-N", "2", saved_table, infile]
utils.runscript('load-into-counting.py', args, in_dir)
- args = ["-Z", "2", "-C", "2", "-V", '--loadtable', saved_table, infile]
+ args = ["-Z", "2", "-C", "2", "-V", '--loadgraph', saved_table, infile]
utils.runscript('trim-low-abund.py', args, in_dir)
outfile = infile + '.abundtrim'
@@ -3120,7 +3489,7 @@ def test_trim_low_abund_trimtest_after_load():
'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCA'
-def test_trim_low_abund_trimtest_savetable():
+def test_trim_low_abund_trimtest_savegraph():
infile = utils.get_temp_filename('test.fa')
in_dir = os.path.dirname(infile)
@@ -3129,7 +3498,7 @@ def test_trim_low_abund_trimtest_savetable():
shutil.copyfile(utils.get_test_data('test-abund-read-2.paired.fq'), infile)
args = ["-k", "17", "-x", "1e7", "-N", "2",
- "-Z", "2", "-C", "2", "-V", '--savetable', saved_table, infile]
+ "-Z", "2", "-C", "2", "-V", '--savegraph', saved_table, infile]
utils.runscript('trim-low-abund.py', args, in_dir)
outfile = infile + '.abundtrim'
diff --git a/tests/test_streaming_io.py b/tests/test_streaming_io.py
index 6ba7ef9..f942edb 100644
--- a/tests/test_streaming_io.py
+++ b/tests/test_streaming_io.py
@@ -81,6 +81,73 @@ def test_interleave_split_2_fail():
in err, err
+def test_interleave_split_3_out1():
+ in1 = utils.get_test_data('paired.fq.1')
+ in2 = utils.get_test_data('paired.fq.2')
+
+ out1 = utils.get_temp_filename('a.fa')
+ out2 = utils.get_temp_filename('b.fa')
+
+ cmd = """
+ {scripts}/interleave-reads.py {in1} {in2} -o - |
+ {scripts}/split-paired-reads.py -1 - -2 {out2} - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(),
+ in1=in1, in2=in2,
+ out1=out1, out2=out2)
+
+ run_shell_cmd(cmd)
+
+ assert files_are_equal(in1, out1), diff_files(in1, out1)
+ assert files_are_equal(in2, out2), diff_files(in2, out2)
+
+
+def test_interleave_split_3_out2():
+ in1 = utils.get_test_data('paired.fq.1')
+ in2 = utils.get_test_data('paired.fq.2')
+
+ out1 = utils.get_temp_filename('a.fa')
+ out2 = utils.get_temp_filename('b.fa')
+
+ cmd = """
+ {scripts}/interleave-reads.py {in1} {in2} -o - |
+ {scripts}/split-paired-reads.py -1 {out1} -2 - - > {out2}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(),
+ in1=in1, in2=in2,
+ out1=out1, out2=out2)
+
+ run_shell_cmd(cmd)
+
+ assert files_are_equal(in1, out1), diff_files(in1, out1)
+ assert files_are_equal(in2, out2), diff_files(in2, out2)
+
+
+def test_interleave_split_3_out0():
+ in1 = utils.get_test_data('paired-mixed-broken.fq')
+
+ out1 = utils.get_temp_filename('a.fa')
+ out2 = utils.get_temp_filename('b.fa')
+ out3 = utils.get_temp_filename('c.fa')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/split-paired-reads.py -1 {out1} -2 {out2} -0 - - > {out3}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(),
+ in1=in1,
+ out1=out1, out2=out2, out3=out3)
+
+ run_shell_cmd(cmd)
+
+ assert files_are_equal(in1, out3), diff_files(in1, out3)
+ assert len(open(out1, 'rb').read()) == 0
+ assert len(open(out2, 'rb').read()) == 0
+
+
def test_extract_paired_pe():
in1 = utils.get_test_data('paired-mixed.fq')
out_test = utils.get_test_data('paired-mixed.fq.pe')
@@ -251,7 +318,7 @@ def test_load_into_counting_1():
(status, out, err) = run_shell_cmd(cmd)
assert os.path.exists(out1)
- khmer.load_counting_hash(out1)
+ khmer.load_countgraph(out1)
def test_load_graph_1():
@@ -268,8 +335,8 @@ def test_load_graph_1():
print(cmd)
(status, out, err) = run_shell_cmd(cmd)
- assert os.path.exists(out1 + '.pt')
- khmer.load_hashbits(out1 + '.pt')
+ assert os.path.exists(out1)
+ khmer.load_nodegraph(out1)
def test_filter_abund_1():
diff --git a/tests/test_subset_graph.py b/tests/test_subset_graph.py
index 44f7569..fcf82da 100644
--- a/tests/test_subset_graph.py
+++ b/tests/test_subset_graph.py
@@ -21,7 +21,7 @@ def teardown():
class Test_RandomData(object):
def test_3_merge_013(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
@@ -43,7 +43,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions # combined.
def test_3_merge_023(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -64,7 +64,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions # combined.
def test_5_merge_046(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph5.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -83,7 +83,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions # combined.
def test_random_20_a_succ(self):
- ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
outfile = utils.get_temp_filename('out')
@@ -102,7 +102,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions
def test_random_20_a_succ_II(self):
- ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
outfile = utils.get_temp_filename('out')
@@ -121,7 +121,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions
def test_random_20_a_succ_III(self):
- ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
outfile = utils.get_temp_filename('out')
@@ -144,7 +144,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions
def test_random_20_a_succ_IV(self):
- ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
outfile = utils.get_temp_filename('out')
@@ -164,7 +164,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions
def test_random_20_a_succ_IV_save(self):
- ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
savefile_ht = utils.get_temp_filename('ht')
@@ -177,7 +177,7 @@ class Test_RandomData(object):
ht.save_tagset(savefile_tags)
del ht
- ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 7 + 1, 2)
ht.load(savefile_ht)
ht.load_tagset(savefile_tags)
@@ -200,7 +200,7 @@ class Test_RandomData(object):
class Test_SaveLoadPmap(object):
def test_save_load_merge(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -233,7 +233,7 @@ class Test_SaveLoadPmap(object):
assert n_partitions == 1, n_partitions # combined.
def test_save_load_merge_truncate(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -270,7 +270,7 @@ class Test_SaveLoadPmap(object):
print(str(err), i)
def test_save_load_merge_2(self):
- ht = khmer.Hashbits(20, 4 ** 8 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 8 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -302,7 +302,7 @@ class Test_SaveLoadPmap(object):
assert n_partitions == 1, n_partitions # combined.
def test_save_load_merge_nexist(self):
- ht = khmer._Hashbits(20, [1])
+ ht = khmer._Nodegraph(20, [1])
try:
a = ht.load_subset_partitionmap('this does not exist')
assert 0, "this should not succeed"
@@ -310,7 +310,7 @@ class Test_SaveLoadPmap(object):
print(str(e))
def test_save_merge_from_disk(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -339,7 +339,7 @@ class Test_SaveLoadPmap(object):
assert n_partitions == 1, n_partitions # combined.
def test_save_merge_from_disk_2(self):
- ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -368,7 +368,7 @@ class Test_SaveLoadPmap(object):
assert n_partitions == 1, n_partitions # combined.
def test_save_merge_from_disk_file_not_exist(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -389,7 +389,7 @@ class Test_SaveLoadPmap(object):
print(str(e))
def test_merge_from_disk_file_bad_type(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
infile = utils.get_test_data('goodversion-k12.ht')
try:
@@ -399,7 +399,7 @@ class Test_SaveLoadPmap(object):
print(str(e))
def test_merge_from_disk_file_version(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
infile = utils.get_test_data('badversion-k12.ht')
try:
@@ -409,7 +409,7 @@ class Test_SaveLoadPmap(object):
print(str(e))
def test_save_merge_from_disk_ksize(self):
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -424,7 +424,7 @@ class Test_SaveLoadPmap(object):
ht.save_subset_partitionmap(x, outfile1)
del x
- ht = khmer._Hashbits(19, [1])
+ ht = khmer._Nodegraph(19, [1])
try:
ht.merge_subset_from_disk(outfile1)
assert 0, "this should fail"
@@ -433,7 +433,7 @@ class Test_SaveLoadPmap(object):
def test_save_load_merge_on_graph():
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -466,7 +466,7 @@ def test_save_load_merge_on_graph():
def test_save_load_on_graph_truncate():
- ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ ht = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -506,7 +506,7 @@ def test_save_load_on_graph_truncate():
def test_output_partitions():
filename = utils.get_test_data('test-output-partitions.fa')
- ht = khmer._Hashbits(10, [1])
+ ht = khmer._Nodegraph(10, [1])
ht.set_partition_id('TTAGGACTGC', 2)
ht.set_partition_id('TGCGTTTCAA', 3)
ht.set_partition_id('ATACTGTAAA', 4)
@@ -531,7 +531,7 @@ test_output_partitions.runme = True
def test_tiny_real_partitions():
filename = utils.get_test_data('real-partition-tiny.fa')
- ht = khmer.Hashbits(32, 8e2, 4)
+ ht = khmer.Nodegraph(32, 8e2, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -558,7 +558,7 @@ def test_tiny_real_partitions():
def test_small_real_partitions():
filename = utils.get_test_data('real-partition-small.fa')
- ht = khmer.Hashbits(32, 2e3, 4)
+ ht = khmer.Nodegraph(32, 2e3, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -600,7 +600,7 @@ CCTCGGGCCTTTCCGTTCCGTTGCCGCCCAAGCTCTCTAGCATCGAATCGGTCAAGCGGT\
def test_partition_on_abundance_1():
print((a,))
print((b,))
- kh = khmer.CountingHash(20, 1e3, 4)
+ kh = khmer.Countgraph(20, 1e3, 4)
for i in range(10):
print(kh.consume_and_tag(a))
@@ -614,7 +614,7 @@ def test_partition_on_abundance_1():
def test_partition_on_abundance_2():
- kh = khmer.CountingHash(20, 1e3, 4)
+ kh = khmer.Countgraph(20, 1e3, 4)
for i in range(10):
print(kh.consume_and_tag(a))
@@ -628,7 +628,7 @@ def test_partition_on_abundance_2():
def test_partition_on_abundance_3():
- kh = khmer.CountingHash(20, 1e4, 4)
+ kh = khmer.Countgraph(20, 1e4, 4)
for i in range(10):
print(kh.consume_and_tag(a))
@@ -647,7 +647,7 @@ def test_partition_on_abundance_3():
def test_partition_overlap_1():
- kh = khmer.CountingHash(20, 1e3, 4)
+ kh = khmer.Countgraph(20, 1e3, 4)
for i in range(10):
kh.consume_and_tag(a)
@@ -668,7 +668,7 @@ def test_partition_overlap_1():
def test_partition_overlap_2():
- kh = khmer.CountingHash(20, 1e4, 4)
+ kh = khmer.Countgraph(20, 1e4, 4)
for i in range(10):
kh.consume_and_tag(a)
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/khmer.git