[med-svn] [khmer] 01/01: Squash of 'big-rebase' down to one commit
Michael Crusoe
misterc-guest at moszumanska.debian.org
Fri Jul 31 13:29:00 UTC 2015
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to branch master
in repository khmer.
commit 41a340321071670cf5ec3127d72ed0fe7d0d5aca
Author: Michael R. Crusoe <michael.crusoe at gmail.com>
Date: Fri Jul 31 06:26:39 2015 -0700
Squash of 'big-rebase' down to one commit
Was 6f9f62bcebaa62a9a64f90c9115012b2930e7985
---
.gitignore | 3 +
CITATION | 2 +-
ChangeLog | 354 +-
Doxyfile.in | 2 +-
Makefile | 34 +-
README.rst | 15 +-
debian/control | 48 +-
debian/copyright | 44 +
debian/khmer-common.install | 1 +
.../{liboxli1.postinst => khmer-common.postinst} | 1 +
debian/{liboxli1.postrm => khmer-common.postrm} | 2 +-
debian/khmer.install | 0
debian/liboxli-dev.install | 4 +-
debian/liboxli1.install | 3 +-
debian/patches/get_version | 3 +-
debian/patches/pr-1148-changes | 358 ++
debian/patches/series | 2 +
debian/patches/setup-py-py3 | 8 +
debian/rules | 17 +-
debian/upstream/metadata | 10 +-
doc/conf.py | 2 +-
doc/dev/binary-file-formats.rst | 115 +
doc/dev/development.rst | 6 +-
doc/dev/for-khmer-developers.rst | 4 +-
doc/dev/getting-started.rst | 29 +-
doc/dev/hackathon.rst | 12 +-
doc/dev/index.rst | 1 +
doc/dev/release.rst | 20 +-
doc/dev/scripts-and-sandbox.rst | 2 +-
doc/index.rst | 5 +-
doc/release-notes/release-1.0.1.md | 12 +-
doc/release-notes/release-1.0.1.rst | 14 +-
doc/release-notes/release-1.0.md | 12 +-
doc/release-notes/release-1.0.rst | 16 +-
doc/release-notes/release-1.1.md | 14 +-
doc/release-notes/release-1.1.rst | 14 +-
doc/release-notes/release-1.2.md | 14 +-
doc/release-notes/release-1.2.rst | 14 +-
doc/release-notes/release-1.3.md | 2 +-
doc/release-notes/release-1.3.rst | 2 +-
doc/release-notes/release-1.4.md | 4 +-
doc/release-notes/release-1.4.rst | 4 +-
doc/roadmap.rst | 2 +-
doc/run-corn-50m.sh | 4 +-
doc/user/choosing-table-sizes.rst | 74 +-
doc/user/examples.rst | 4 +-
doc/user/getting-help.rst | 2 +-
doc/user/guide.rst | 2 +-
doc/user/known-issues.rst | 14 +-
doc/user/scripts.rst | 12 +-
doc/whats-new-2.0.rst | 29 +
examples/stamps/do.sh | 4 +-
jenkins-build.sh | 2 +-
khmer/__init__.py | 120 +-
khmer/{_khmermodule.cc => _khmer.cc} | 3792 ++++++++++----------
khmer/_version.py | 11 +-
khmer/kfile.py | 72 +-
khmer/khmer_args.py | 175 +-
khmer/thread_utils.py | 66 +-
khmer/utils.py | 46 +-
lib/.check_openmp.cc | 2 +-
lib/Makefile | 2 +-
lib/counting.cc | 70 +-
lib/counting.hh | 2 +-
lib/get_version.py | 3 +-
lib/graphtest.cc | 4 +-
lib/hashbits.cc | 35 +-
lib/hashbits.hh | 11 +-
lib/hashtable.cc | 119 +-
lib/hashtable.hh | 26 +-
lib/hllcounter.cc | 13 +-
lib/hllcounter.hh | 2 +-
lib/ht-diff.cc | 4 +-
lib/khmer.hh | 12 +-
lib/khmer_exception.hh | 4 +-
lib/kmer_hash.cc | 2 +-
lib/kmer_hash.hh | 2 +-
lib/labelhash.cc | 250 +-
lib/labelhash.hh | 33 +-
lib/magic | 11 +
lib/perf_metrics.cc | 4 +-
lib/perf_metrics.hh | 4 +-
lib/primes.hh | 4 +-
lib/read_aligner.cc | 4 +-
lib/read_aligner.hh | 4 +-
lib/read_parsers.cc | 4 +-
lib/read_parsers.hh | 4 +-
lib/subset.cc | 37 +-
lib/subset.hh | 4 +-
lib/test-Colors.cc | 4 +-
lib/test-HashTables.cc | 4 +-
lib/test-Parser.cc | 4 +-
lib/test-compile.cc | 2 +-
lib/trace_logger.cc | 4 +-
lib/trace_logger.hh | 4 +-
oxli/__init__.py | 55 +
oxli/build_graph.py | 103 +
oxli/functions.py | 103 +
sandbox/README.rst | 230 +-
sandbox/abundance-hist-by-position.py | 16 +-
sandbox/assembly-diff-2.py | 19 +-
sandbox/assembly-diff.py | 16 +-
sandbox/assemstats3.py | 28 +-
sandbox/bloom-count-intersection.py | 25 +-
sandbox/bloom-count.py | 15 +-
sandbox/build-sparse-graph.py | 15 +-
sandbox/calc-best-assembly.py | 25 +-
sandbox/calc-error-profile.py | 40 +-
sandbox/calc-median-distribution.py | 12 +-
sandbox/collect-reads.py | 44 +-
sandbox/collect-variants.py | 49 +-
sandbox/correct-errors.py | 41 +-
sandbox/estimate_optimal_hash.py | 109 +
sandbox/extract-single-partition.py | 9 +-
sandbox/fasta-to-abundance-hist.py | 15 +-
sandbox/filter-below-abund.py | 19 +-
sandbox/filter-median-and-pct.py | 21 +-
sandbox/filter-median.py | 17 +-
sandbox/find-high-abund-kmers.py | 51 +-
sandbox/graph-size.py | 31 +-
sandbox/hi-lo-abundance-by-position.py | 13 +-
sandbox/make-coverage.py | 19 +-
sandbox/multi-rename.py | 9 +-
sandbox/normalize-by-median-pct.py | 82 +-
sandbox/optimal_args_hashbits.py | 97 +
sandbox/print-stoptags.py | 8 +-
sandbox/print-tagset.py | 11 +-
sandbox/renumber-partitions.py | 11 +-
sandbox/saturate-by-median.py | 58 +-
sandbox/shuffle-reverse-rotary.py | 13 +-
sandbox/slice-reads-by-coverage.py | 19 +-
sandbox/split-fasta.py | 11 +-
sandbox/split-sequences-by-length.py | 11 +-
sandbox/stoptag-abundance-hist.py | 11 +-
sandbox/stoptags-by-position.py | 11 +-
sandbox/strip-partition.py | 9 +-
sandbox/subset-report.py | 23 +-
sandbox/sweep-files.py | 51 +-
sandbox/sweep-out-reads-with-contigs.py | 13 +-
sandbox/sweep-reads.py | 121 +-
sandbox/sweep-reads2.py | 43 +-
sandbox/sweep-reads3.py | 43 +-
sandbox/unique-kmers.py | 45 +-
sandbox/write-trimmomatic.py | 11 +-
scripts/abundance-dist-single.py | 67 +-
scripts/abundance-dist.py | 25 +-
scripts/annotate-partitions.py | 17 +-
scripts/count-median.py | 18 +-
scripts/count-overlap.py | 22 +-
scripts/do-partition.py | 94 +-
scripts/extract-long-sequences.py | 7 +-
scripts/extract-paired-reads.py | 113 +-
scripts/extract-partitions.py | 72 +-
scripts/fastq-to-fasta.py | 19 +-
scripts/filter-abund-single.py | 49 +-
scripts/filter-abund.py | 33 +-
scripts/filter-stoptags.py | 13 +-
scripts/find-knots.py | 74 +-
scripts/interleave-reads.py | 41 +-
scripts/load-graph.py | 101 +-
scripts/load-into-counting.py | 50 +-
scripts/make-initial-stoptags.py | 23 +-
scripts/merge-partitions.py | 17 +-
scripts/normalize-by-median.py | 396 +-
scripts/partition-graph.py | 64 +-
scripts/readstats.py | 25 +-
scripts/sample-reads-randomly.py | 42 +-
scripts/split-paired-reads.py | 60 +-
scripts/trim-low-abund.py | 94 +-
setup.cfg | 4 +-
setup.py | 22 +-
tests/__init__.py | 4 +-
tests/khmer_tst_utils.py | 32 +-
tests/test-data/badversion-k32.tagset | Bin 16 -> 20 bytes
tests/test-data/dn-test-all-paired-all-keep.fa | 16 +
tests/test-data/dn-test-none-paired.fa | 8 +
tests/test-data/dn-test-some-paired-all-keep.fa | 12 +
tests/test-data/goodversion-k32.tagset | Bin 34 -> 38 bytes
tests/test-data/normC20k20.ct | Bin 40140 -> 40144 bytes
tests/test-data/paired-slash1.fq | 32 +
tests/test-data/paired-slash1.fq.1 | 16 +
tests/test-data/paired-slash1.fq.2 | 16 +
tests/test-data/paired_one.base.dif.fa | 5 +
tests/test-data/test-filter-abund-Ns.fq | 16 +
tests/test_counting_hash.py | 680 +++-
tests/test_counting_single.py | 90 +-
tests/test_filter.py | 16 +-
tests/test_functions.py | 76 +-
tests/test_graph.py | 46 +-
tests/test_hashbits.py | 327 +-
tests/test_hashbits_obj.py | 563 ---
tests/test_hll.py | 76 +-
tests/test_labelhash.py | 764 ++--
tests/test_lump.py | 31 +-
tests/test_normalize_by_median.py | 627 ++++
tests/test_oxli_functions.py | 60 +
tests/test_read_aligner.py | 18 +-
tests/test_read_parsers.py | 36 +-
tests/test_sandbox_scripts.py | 70 +-
tests/test_script_arguments.py | 218 +-
tests/test_scripts.py | 1384 +++----
tests/test_subset_graph.py | 211 +-
tests/test_threaded_sequence_processor.py | 13 +-
tests/test_version.py | 19 +-
versioneer.py | 1 +
205 files changed, 9132 insertions(+), 6122 deletions(-)
diff --git a/.gitignore b/.gitignore
index 543df72..c0abc60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,9 +17,12 @@ doc/_build
/.project
/.pydevproject
/.Rhistory
+.idea
coverage.xml
coverage-gcovr.xml
coverage-debug
+htmlcov/
+diff-cover.html
khmer-cov.tgz
cppcheck-result.xml
MANIFEST
diff --git a/CITATION b/CITATION
index 41679ee..3b4b3c6 100644
--- a/CITATION
+++ b/CITATION
@@ -153,7 +153,7 @@ implement the probabilistic k-mer counting described in:
digital normalization of reads in the context of high khmer false positive
rates. khmer is implemented in C++ wrapped in a Python interface, offers a
tested and robust API, and is freely available under the BSD license at
- github.com/ged-lab/khmer.</p>",
+ github.com/dib-lab/khmer.</p>",
number = "7",
doi = "10.1371/journal.pone.0101271"
}
diff --git a/ChangeLog b/ChangeLog
index ec2c779..2aeb3db 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,7 +1,351 @@
-2015-05-13 Scott Sievert <sieve121 at umn.edu>
+2015-06-30 Jacob Fenton <bocajnotnef at gmail.com>
- * changed "doc/LICENSE.txt" to "LICENSE" in tests/*, scripts/*, lib/*,
- sandbox/*, khmer/*
+ * tests/{test_script_arguments,test_functions}.py: changed tests to use
+ stderr redirection to prevent leaks
+ * tests/test_normalize_by_median.py: changed to not duplicate a test
+ * tests/test_script_arguments.py: changed tests to use stderr redirection
+
+2015-06-30 Titus Brown <titus at idyll.org>
+
+ * tests/test_normalize_by_median.py: disabled running
+ test_normalize_by_median_report_fp during normal test running.
+
+2015-06-30 Titus Brown <titus at idyll.org>
+
+ * khmer/khmer_args.py: removed incorrect warning for default max_tablesize
+ when -M is used.
+ * tests/test_scripts.py: added test for correct max_tablesize behavior.
+
+2015-06-30 Titus Brown <titus at idyll.org>
+
+ * setup.cfg: changed 'stop=TRUE' to 'stop=FALSE', so that tests do not
+ stop running at first failure.
+
+2015-06-30 Kevin Murray <spam at kdmurray.id.au>
+
+ * scripts/{extract-paired-reads,split-paired-reads}.py: Fix creation of
+ default output files even when output files were provided on CLI.
+
+2015-06-29 Sherine Awad <drmahmoud at ucdavis.edu>
+
+ * khmer/utils.py: Fix bug in naming in interleave-reads.py
+ * tests/test_scripts.py: Add a test function for the new behavior
+ * tests/test-data/*.fq: Add 3 test files needed for the testing
+
+2015-06-28 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * tests/test_sandbox_scripts.py: made error more informative and not crashy
+ * sandbox/{estimate_optimal_hash,optimal_args_hashbits}.py: minor cleanups
+
+2015-06-28 Qingpeng Zhang <qingpeng at msu.edu>
+
+ * sandbox/{estimate_optimal_hash,optimal_args_hashbits}.py: added sandbox
+ methods for estimating memory usage based on desired fp rate, etc.
+
+2015-06-27 Kevin Murray <spam at kdmurray.id.au>
+
+ * doc/dev/binary-file-formats.rst: Fix issue in ksize documentation for
+ Countgraph
+
+2015-06-27 Kevin Murray <spam at kdmurray.id.au>
+
+ * README.rst: Fix link to virtualenv installation instructions.
+
+2015-06-19 Titus Brown <titus at idyll.org>
+
+ * khmer/__init__.py: split CountingHash into _CountingHash (CPython) and
+ CountingHash to mimic Hashbits behavior; pass IOError through
+ extract_countinghash_info and extract_hashbits_info so that
+ file-does-not-exist errors are correctly reported; fixed FP rate reporting;
+ changed to using get_n_primes_near_x to build hashtable sizes; removed
+ get_n_primes_above_x, new_hashbits, and new_counting_hash functions.
+ * khmer/_khmer.cc: changed tp_flags for KCountingHash so that it could
+ be a base class.
+ * khmer/khmer_args.py: removed environment variable override for hash size
+ defaults; added -M/--max_memory_usage, and functions create_nodegraph()
+ and create_countgraph(). Also renamed --min-tablesize to --max-tablesize.
+ * khmer/kfile.py: fixed check_space_for_hashtable to depend on args obj.
+ * oxli/build_graph.py, scripts/{annotate-partitions.py,count-overlap.py,
+ do-partition.py,filter-stoptags.py,
+ merge-partitions.py}, sandbox/{assembly-diff.py,assembly-diff-2.py,
+ bloom-count-intersection.py,bloom-count.py,build-sparse-graph.py,
+ collect-reads.py,saturate-by-median.py, graph-size.py,print-stoptags.py,
+ print-tagset.py,stoptags-by-position.py, subset-report.py,
+ sweep-out-reads-with-contigs.py,sweep-reads2.py,sweep-reads3.py}: changed
+ hashtype over to 'nodegraph' and 'countgraph' in call to report_on_config;
+ replaced counting hash/hashbits creation with new khmer_args create*
+ functions, and/or new_counting_hash/new_hashbits with CountingHash/Hashbits.
+ * doc/scripts.rst: updated hashtable size help text.
+ * doc/whats-new-2.0.rst: updated with description of -M/--max-memory-usage.
+ * tests/test*.py: switched from new_counting_hash to CountingHash, and
+ new_hashbits to Hashbits; adjusts tests for new behavior of hashtable
+ size calculation.
+ * tests/test_hashbits_obj.py: merged into test_hashbits.py and removed file.
+ * tests/test_script_arguments.py: updated for new check_space_for_hashtable
+ behavior; added tests for create_countgraph and create_nodegraph.
+ * tests/test_counting_single.py: fixed countgraph size & palindrome testing
+ behavior in test_complete_no_collision.
+
+2015-06-19 Titus Brown <titus at idyll.org>
+
+ * Makefile: temporarily disable 'huge' tests on Linux.
+
+2015-06-17 Titus Brown <titus at idyll.org>
+
+ * scripts/normalize-by-median.py: changed DEFAULT_DESIRED_COVERAGE to 20,
+ and corrected options help.
+ * tests/{test_scripts.py,test_normalize_by_median.py}: moved
+ normalize-by-median.py tests into their own file.
+ * tests/test-data/{dn-test-all-paired-all-keep.fa,dn-test-none-paired.fa,
+ dn-test-some-paired-all-keep.fa}: added test data files for specific
+ pairing/saturation behavior.
+
+2015-06-16 Kevin Murray <spam at kdmurray.id.au>
+
+ * doc/dev/binary-file-formats.rst: Add documentation of khmer's binary file
+ formats.
+ * doc/dev/index.rst: Add above docs to developer documentation index.
+
+2015-06-14 Michael R. Crusoe <crusoe at ucdavis.edu>
+
+ * khmer/__init__.py,lib/{counting,hashbits,hashtable,subset,labelhash}.cc,
+ lib/khmer.hh: add signature to beginning of all binary file types
+ * tests/test-data/{normC20k20.ct,badversion-k32.tagset,
+ goodversion-k32.tagset}: update to new format by prepending "OXLI" to the
+ data stream
+ * tests/test_{counting_hash,functions,scripts,hashbits,hashbits_obj,
+ labelhash}.py: tests should fail, not error (add try, except + assert
+ blocks). Adapted other tests to cope with the new file formats
+ * lib/magic: new, teaches the unix `file` command about khmer file types
+ * doc/index.rst,doc/whats-new-2.0.rst: document these changes
+
+2015-06-14 Titus Brown <titus at idyll.org>
+
+ * scripts/extract-paired-reads.py: added --output_dir, --paired-output,
+ and --single-output arguments to change output file details; script
+ now accepts stdin, and will output to stdout upon request.
+ * scripts/split-paired-reads.py: changed script to output to stdout upon
+ request; added '-' as stdin input.
+ * tests/test_scripts.py: added tests for new extract-paired-reads.py
+ behavior.
+
+2015-06-14 Titus Brown <titus at idyll.org>
+
+ * tests/test_counting_hash.py: fixed duplicated test
+ 'get_kmer_counts_too_short' by changing to 'get_kmer_hashes_too_short'.
+
+2015-06-14 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * scripts/abundance-dist.py: added weird bigcount circumstance detection
+ * tests/test_scripts.py: added test for the above
+
+2015-06-14 Kevin Murray <spam at kdmurray.id.au>
+
+ * lib/counting.cc: Fix infinite loop in gzipped CountingHash I/O
+ * tests/test_counting_hash.py: Add test of large CountingHash I/O
+ * setup.cfg: Skip tests with the 'huge' label by default
+
+2015-06-13 Michael R. Crusoe <crusoe at ucdavis.edu>
+
+ * Makefile, jenkins-build.sh: unify sphinx dependencies
+ * scripts/readstats.py: fix typo
+
+2015-06-13 Titus Brown <titus at idyll.org>
+
+ * doc/dev/getting-started.rst: update instructions for creating a new
+ branch name to preferred practice (fix/brief_issue_description, instead
+ of fix/issuenum).
+
+2015-06-13 Michael R. Crusoe <crusoe at ucdavis.edu>
+
+ * doc/dev/release.rst: remove false positive from version check
+ * tests/test_{counting_hash,scripts}.py: remove scriptpath no-op method
+
+2015-06-12 Luiz Irber <khmer at luizirber.org>
+
+ * setup.py: revert changes to zlib compilation.
+ * setup.cfg: nose should stop on first error by default.
+ * Makefile, tests/test_threaded_sequence_processor.py,
+ scripts/{do-partition,partition-graph}.py, khmer/thread_utils.py: Remove
+ dependency on future package.
+
+2015-06-12 Michael R. Crusoe <crusoe at ucdavis.edu>
+
+ * setup.py: update screed version to 0.9
+
+2015-06-12 Luiz Irber <khmer at luizirber.org>
+
+ * *.py: refactor for Python 3 compatibility. Clear separation of Unicode
+ and Byte strings, use __future__ imports for compatibility (print function,
+ absolute imports, unicode_literals), fix tests to consider changes to random
+ number generator between Python versions.
+ * khmer/_khmer.cc: rename file, methods return Unicode strings instead of
+ Bytestrings.
+
+2015-06-12 Luiz Irber <khmer at luizirber.org>
+
+ * khmer/{khmermodule.cc},tests/test_hashbits.py: Add Unicode support to
+ hashbits.get method.
+ * tests/test_hll.py: Avoid using translate for revcomp calculation.
+
+2015-06-12 Sarah Guermond <sarah.guermond at gmail.com>
+
+ * scripts/trim-low-abund.py: changed _screed_record_dict to Record
+
+2015-06-11 Sherine Awad <drmahmoud at ucdavis.edu>
+
+ * Change split-paired-reads.py to accept input from stdin.
+ * Add test function to test new behavior of split-paired.
+
+2015-06-10 Camille Scott <camille.scott.w at gmail.com>
+
+ * lib/hashtable.cc: Tweaked median_at_least to reduce number of
+ conditional checks.
+
+2015-06-10 Titus Brown <titus at idyll.org>
+
+ * scripts/find-knots.py: fixed invocation of check_space to take correct
+ arguments.
+ * tests/test_scripts.py: added simple test of find-knots.py execution.
+
+2015-06-09 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * scripts/normalize-by-median.py: implemented broken_paired_reader
+ * tests/test_scripts.py: modified tests to properly use new args
+ * khmer/utils.py: added force-paired option to broken_paired_reader (@ctb)
+
+2015-06-09 Luiz Irber <khmer at luizirber.org>
+
+ * khmer/_khmermodule.cc, lib/hashtable.{cc,hh}: astyle fixes.
+
+2015-06-09 Titus Brown <titus at idyll.org>
+
+ * khmer/_khmermodule.cc: fixed nasty Hashtable.get() bug.
+ * lib/hashtable.{cc,hh}: add Hashtable::get_kmers(), get_kmer_hashes(),
+ and get_kmer_counts().
+ * khmer/_khmermodule.cc: add CPython functions for get_kmers(),
+ get_kmer_hashes(), and get_kmer_counts(); reorganize hashtable_methods.
+ * tests/test_counting_hash.py: add tests for get_kmers(), get_kmer_hashes(),
+ and get_kmer_counts(), as well as for nasty Hashtable.get() bug.
+
+2015-06-08 Camille Scott <camille.scott.w at gmail.com>
+
+ * lib/hashtable.{cc,hh}: Add filter_on_median method to check
+ if median k-mer count is above a cutoff
+ * khmer/_khmermodule.cc: Expose filter_on_median to python-land
+ * scripts/normalize-by-median.py: Switch to new filter_on_median
+ * tests/test_counting_hash.py: Tests for new method
+
+2015-06-08 Luiz Irber <khmer at luizirber.org>
+
+ * tests/test_hll.py: test return values from consume_{string,fasta}.
+
+2015-06-06 Titus Brown <titus at idyll.org>
+
+ * khmer/_khmermodule.cc: added hllcounter_merge.
+ * tests/test_hll.py: added merge tests.
+ * lib/hllcounter.cc: changed HLLCounter::consume_string to uppercase input.
+ * sandbox/unique-kmers.py: added --stream-out option; updated to print out
+ k-mers per file as well as k-mer size used.
+
+2015-06-04 Titus Brown <titus at idyll.org>
+
+ * khmer/_khmermodule.cc: added error handling to load_partitionmap.
+ * lib/subset.cc: modified partitionmap format to detect truncated files;
+ changed untestable sanity checks to assertions.
+ * tests/{test_counting_hash,test_hashbits,test_subset_graph}.py: added
+ tests to try loading all possible truncations of binary save files.
+
+2015-06-04 Titus Brown <titus at idyll.org>
+
+ * khmer/_khmermodule.cc,lib/hashbits.{cc,hh}: add Hashbits::update_from()
+ and Hashbits.update().
+ * tests/test_hashbits.py: associated tests.
+
+2015-06-01 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * scripts/normalize-by-median.py: major refactoring to use context
+ managers and classes; fixed -R
+ * tests/test_scripts.py: added test for normalize's -R arg
+
+2015-06-01 Tamer Mansour <drtamermansour at gmail.com>
+
+ * scripts/normalize-by-median.py: changed to count kmers from both PE reads
+ when either one of them is below the coverage cutoff
+ * tests/test_scripts.py: Added test for new behaviour
+
+2015-05-26 Titus Brown <titus at idyll.org>
+
+ * khmer/_khmermodule.cc: refactor CPython layer so that KHashtable
+ is at base of CountingHash and Hashbits.
+ * lib/hashbits.hh: add n_entries() function from Hashtable::n_entries.
+ * lib/hashtable.hh: add several virtual functions to Hashtable that exist in
+ CountingHash and Hashbits.
+
+2015-05-26 Titus Brown <titus at idyll.org>
+
+ * khmer/{__init__.py,_khmermodule.cc},lib/labelhash.{cc,hh},
+ lib/{hashtable,khmer}.hh: changed LabelHash to be a "friend" of Hashtable,
+ rather than a subclass; allowed initialization with either a CountingHash
+ or a Hashbits; added 'graph' attribute to the Python object to store a
+ reference to host object.
+ * lib/labelhash.{cc,hh}: changed TagPtr maps to Tag maps to fix disastrous
+ bug.
+ * lib/labelhash.{cc,hh}: added save/load_tags_and_labels functions for
+ saving and loading labels.
+ * tests/test_labelhash.py: removed unnecessary tests; added tests for save
+ and load.
+ * sandbox/sweep-reads.py: updated with LabelHash changes.
+
+2015-05-26 Kevin Murray <spam at kdmurray.id.au>
+
+ * lib/Makefile: Remove old libkhmer.so versions during make clean
+
+2015-05-25 Kevin Murray <spam at kdmurray.id.au>
+
+ * Makefile: Fix issue with 'lib' target not building by using FORCE
+
+2015-05-20 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * oxli/{__init__,khmer_api,common}.py,scripts/build-graph.py,
+ tests/test_scripts.py: added oxli module, oxlified load_graph script, tests
+ * scripts/load-graph.py: replaced with oxlified version
+ * setup.py: added oxli module and entry point
+
+2015-05-20 Kevin Murray <spam at kdmurray.id.au>
+
+ * .gitignore: Add htmlcov/ and diff-cover.html to gitignore
+ * Makefile: Use rm -f to remove files to quash error messages on
+ non-existent files
+
+2015-05-18 Sherine Awad <sherine.awad at gmail.com>
+
+ * tests/test_scripts.py: Test loading of compressed counting table
+ with bigcounts, and test abundance with bigcounts
+
+2015-05-18 Michael R. Crusoe <mcrusoe at msu.edu>
+
+ * all files: references to github.com/ged-lab changed to
+ github.com/dib-lab. All GitHub URLs normalized to use HTTPS
+ * README.rst: broken landscape.io badge removed
+ * doc/user/known-issues.rst: removed two known issues fixed in v1.4 release
+
+2015-05-18 Titus Brown <titus at idyll.org>
+
+ * sandbox/{assembly-diff-2.py,sandbox/collect-reads.py},
+ scripts/{count-median.py,filter-abund-single.py,filter-abund.py}: changed
+ sequence-reading behavior to replace 'N' with 'A', to be consistent with
+ rest of code base.
+ * scripts/{filter-abund.py,filter-abund-single.py}: changed behavior of
+ scripts to keep sequences with 'N's in them, and count them as 'A's.
+ * tests/test_scripts.py: added tests for new
+ filter-abund/filter-abund-single behavior.
+ * tests/test-data/test-filter-abund-Ns.fq: new test file for new tests.
+
+2015-05-13 Scott Sievert <sieve121 at umn.edu>
+
+ * tests/*,scripts/*,lib/*,sandbox/*,khmer/*: changed "doc/LICENSE.txt" to
+ "LICENSE" in copyright header.
2015-05-13 Michael R. Crusoe <mcrusoe at msu.edu>
@@ -809,8 +1153,8 @@
New FAST[AQ] parser (from the SeqAn project). Fixes known issue and a
newly found read dropping issue
- https://github.com/ged-lab/khmer/issues/249
- https://github.com/ged-lab/khmer/pull/641
+ https://github.com/dib-lab/khmer/issues/249
+ https://github.com/dib-lab/khmer/pull/641
Supports reading from non-seekable plain and gziped FAST[AQ] files (a.k.a
pipe or streaming support)
diff --git a/Doxyfile.in b/Doxyfile.in
index d76ba0b..c9293f4 100644
--- a/Doxyfile.in
+++ b/Doxyfile.in
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
# title of most generated pages and in a few other places.
# The default value is: My Project.
-PROJECT_NAME = Khmer
+PROJECT_NAME = khmer
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
# could be handy for archiving the generated documentation or if some version
diff --git a/Makefile b/Makefile
index 4e1d371..616efdc 100644
--- a/Makefile
+++ b/Makefile
@@ -7,8 +7,8 @@
CPPSOURCES=$(wildcard lib/*.cc lib/*.hh khmer/_khmermodule.cc)
PYSOURCES=$(wildcard khmer/*.py scripts/*.py)
SOURCES=$(PYSOURCES) $(CPPSOURCES) setup.py
-DEVPKGS=sphinxcontrib-autoprogram pep8==1.5.7 diff_cover \
-autopep8 pylint coverage gcovr nose screed pep257
+DEVPKGS=pep8==1.5.7 diff_cover autopep8 pylint coverage gcovr nose pep257 \
+ screed
GCOVRURL=git+https://github.com/nschum/gcovr.git@never-executed-branches
VERSION=$(shell git describe --tags --dirty | sed s/v//)
@@ -20,9 +20,9 @@ CPPCHECK=ls lib/*.cc khmer/_khmermodule.cc | grep -v test | cppcheck -DNDEBUG \
UNAME := $(shell uname)
ifeq ($(UNAME),Linux)
- TESTATTR='!known_failing,!jenkins'
+ TESTATTR='!known_failing,!jenkins,!huge'
else
- TESTATTR='!known_failing,!jenkins,!linux'
+ TESTATTR='!known_failing,!jenkins,!huge'
endif
## all : default task; compile C++ code, build shared object library
@@ -36,7 +36,8 @@ help: Makefile
install-dep: install-dependencies
install-dependencies:
- pip2 install --upgrade $(DEVPKGS) || pip install --upgrade $(DEVPKGS)
+ pip install --upgrade $(DEVPKGS)
+ pip install --upgrade --requirement doc/requirements.txt
## sharedobj : build khmer shared object file
sharedobj: khmer/_khmermodule.so
@@ -63,11 +64,12 @@ dist/khmer-$(VERSION).tar.gz: $(SOURCES)
clean: FORCE
cd lib && ${MAKE} clean || true
cd tests && rm -rf khmertest_* || true
- rm -f khmer/_khmermodule.so || true
- rm khmer/*.pyc lib/*.pyc || true
+ rm -f khmer/_khmermodule.so
+ rm -f khmer/*.pyc lib/*.pyc
./setup.py clean --all || true
- rm coverage-debug || true
- rm -Rf .coverage || true
+ rm -f coverage-debug
+ rm -Rf .coverage
+ rm -f diff-cover.html
debug: FORCE
export CFLAGS="-pg -fprofile-arcs"; python setup.py build_ext --debug \
@@ -101,10 +103,10 @@ cppcheck: $(CPPSOURCES)
## pep8 : check Python code style
pep8: $(PYSOURCES) $(wildcard tests/*.py)
pep8 --exclude=_version.py --show-source --show-pep8 setup.py khmer/ \
- scripts/ tests/ || true
+ scripts/ tests/ oxli/ || true
pep8_report.txt: $(PYSOURCES) $(wildcard tests/*.py)
- pep8 --exclude=_version.py setup.py khmer/ scripts/ tests/ \
+ pep8 --exclude=_version.py setup.py khmer/ scripts/ tests/ oxli/ \
> pep8_report.txt || true
diff_pep8_report: pep8_report.txt
@@ -129,7 +131,7 @@ astyle: $(CPPSOURCES)
## autopep8 : fix most Python code indentation and formatting
autopep8: $(PYSOURCES) $(wildcard tests/*.py)
autopep8 --recursive --in-place --exclude _version.py --ignore E309 \
- setup.py khmer/*.py scripts/*.py tests/*.py
+ setup.py khmer/*.py scripts/*.py tests/*.py oxli/*.py
# A command to automatically run astyle and autopep8 on appropriate files
## format : check/fix all code indentation and formatting (runs astyle and autopep8)
@@ -140,12 +142,12 @@ format: astyle autopep8
pylint: $(PYSOURCES) $(wildcard tests/*.py)
pylint --msg-template="{path}:{line}: [{msg_id}({symbol}), {obj}] {msg}" \
setup.py khmer/[!_]*.py khmer/__init__.py scripts/*.py tests \
- || true
+ oxli/*.py || true
pylint_report.txt: ${PYSOURCES} $(wildcard tests/*.py)
pylint --msg-template="{path}:{line}: [{msg_id}({symbol}), {obj}] {msg}" \
setup.py khmer/[!_]*.py khmer/__init__.py scripts/*.py tests \
- sandbox/*.py > pylint_report.txt || true
+ sandbox/*.py oxli/*.py > pylint_report.txt || true
diff_pylint_report: pylint_report.txt
diff-quality --violations=pylint pylint_report.txt
@@ -154,7 +156,7 @@ diff_pylint_report: pylint_report.txt
# python module we can't tell nosetests to look for them (via an import
# statement). So we run nose inside of coverage.
.coverage: $(PYSOURCES) $(wildcard tests/*.py) khmer/_khmermodule.so
- coverage run --branch --source=scripts,khmer --omit=khmer/_version.py \
+ coverage run --branch --source=scripts,khmer,oxli --omit=khmer/_version.py \
-m nose --with-xunit --attr=\!known_failing --processes=0
coverage.xml: .coverage
@@ -192,7 +194,7 @@ doc/doxygen/html/index.html: ${CPPSOURCES} ${PYSOURCES}
Doxyfile
doxygen
-lib:
+lib: FORCE
cd lib && \
$(MAKE)
diff --git a/README.rst b/README.rst
index 97bd1c4..130cb9b 100644
--- a/README.rst
+++ b/README.rst
@@ -6,27 +6,24 @@ Welcome to khmer: k-mer counting, filtering and graph traversal FTW!
.. image:: https://readthedocs.org/projects/khmer/badge
:target: https://readthedocs.org/projects/khmer/
:alt: Documentation Status
-.. image:: https://badge.fury.io/py/khmer.png
+.. image:: https://badge.fury.io/py/khmer.svg
:target: http://badge.fury.io/py/khmer
:alt: PyPI Package
-.. image:: https://pypip.in/d/khmer/badge.png
+.. image:: https://pypip.in/d/khmer/badge.svg
:target: https://crate.io/packages/khmer
:alt: Downloads Counter
-.. image:: https://pypip.in/license/khmer/badge.png
+.. image:: https://pypip.in/license/khmer/badge.svg
:target: https://pypi.python.org/pypi/khmer/
:alt: License
.. image:: http://ci.ged.msu.edu/job/khmer-master/badge/icon
:target: http://ci.ged.msu.edu/job/khmer-master/
-.. image:: https://landscape.io/github/ged-lab/khmer/master/landscape.png
- :target: https://landscape.io/github/ged-lab/khmer/master
- :alt: Python Code Health
.. image:: https://scan.coverity.com/projects/621/badge.svg
:target: https://scan.coverity.com/projects/621
:alt: Coverity Scan Build Status
The official repository is at
-https://github.com/ged-lab/khmer
+https://github.com/dib-lab/khmer
and you can read the docs online here:
@@ -40,7 +37,7 @@ them, please visit the following URLs:
- Announcements: http://lists.idyll.org/listinfo/khmer-announce
-We chat at https://gitter.im/ged-lab/khmer and the maintainers can be
+We chat at https://gitter.im/dib-lab/khmer and the maintainers can be
contacted at khmer-project at idyll.org.
For getting help with please see this guide: http://khmer.readthedocs.org/user/getting-help.html
@@ -66,7 +63,7 @@ version.
For more details see `doc/install.txt <https://khmer.readthedocs.org/en/latest/user/install.html>`_
The use of a virtualenv is recommended, see
-https://virtualenv.pypa.io/en/latest/virtualenv.html#installation
+https://virtualenv.readthedocs.org/en/latest/installation.html
khmer is under the BSD license; see doc/LICENSE.txt. Distribution,
modification and redistribution, incorporation into other software, and
diff --git a/debian/control b/debian/control
index 70984dc..5e40aca 100644
--- a/debian/control
+++ b/debian/control
@@ -1,37 +1,53 @@
Source: khmer
Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Michael R. Crusoe <crusoe at ucdavis.edu>
+Uploaders: Michael R. Crusoe <crusoe at ucdavis.edu>, Kevin Murray <spam at kdmurray.id.au>
Section: science
Priority: optional
-Build-Depends: python3-screed (>=0.9),
- python3-screed (<<1.0),
- python3-dev,
- debhelper (>= 9),
+Build-Depends: debhelper (>= 9),
dh-python,
dh-exec,
python3 (>= 3.3),
- python3-all-dev,
+ python3-dev (>= 3.3),
+ python3-all-dev (>= 3.3),
python3-setuptools (>= 3.3),
- zlib1g-dev,
- libbz2-dev,
+ python3-screed (>=0.9),
+ python3-screed (<<1.0),
python3-nose,
python3-sphinx,
+ python3-sphinxcontrib.autoprogram,
+ zlib1g-dev,
+ libbz2-dev,
ruby-ronn,
- sphinxcontrib-autoprogram
Standards-Version: 3.9.6
Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/khmer.git
Vcs-Git: git://anonscm.debian.org/debian-med/khmer.git
Homepage: http://khmer.readthedocs.org
-X-Python-Version: 2.7
+X-Python-Version: 3.4
+
+Package: khmer-common
+Architecture: any
+Multi-Arch: same
+Depends: ${misc:Depends},
+ ${shlibs:Depends}
+Description: in-memory DNA sequence kmer counting, filtering & graph traversal
+ khmer is a library and suite of command line tools for working with DNA
+ sequence. It is primarily aimed at short-read sequencing data such as that
+ produced by the Illumina platform. khmer takes a k-mer-centric approach to
+ sequence analysis, hence the name.
+ .
+ This package contains common files for liboxli and khmer
Package: khmer
Architecture: any-amd64 any-arm64 any-mips64 any-mips64el any-ia64 ppc64
-Depends: python3 (>= 3.3),
+Multi-Arch: same
+Section: python
+Depends: khmer-common,
+ python3 (>= 3.3),
python3-screed (>= 0.9),
python3-screed (<< 1.0),
${misc:Depends},
${shlibs:Depends},
- ${python:Depends},
+ ${python3:Depends},
${sphinxdoc:Depends}
Description: in-memory DNA sequence kmer counting, filtering & graph traversal
khmer is a library and suite of command line tools for working with DNA
@@ -43,8 +59,8 @@ Package: liboxli1
Architecture: any
Multi-Arch: same
Section: libs
-Depends: khmer,
- ${shlibs:Depends},
+Depends: khmer-common,
+ ${shlibs:Depends},
${misc:Depends}
Pre-Depends: ${misc:Pre-Depends}
Description: in-memory DNA sequence kmer counting, filtering & graph traversal
@@ -58,8 +74,8 @@ Description: in-memory DNA sequence kmer counting, filtering & graph traversal
Package: liboxli-dev
Architecture: any
Multi-Arch: same
-Section: libs
-Depends: khmer,
+Section: libdevel
+Depends: khmer-common,
${shlibs:Depends},
${misc:Depends}
Pre-Depends: ${misc:Pre-Depends}
diff --git a/debian/copyright b/debian/copyright
index 18abeb9..ae0177b 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -4,6 +4,7 @@ Upstream-Contact: khmer-project at idyll.org
Source: https://pypi.python.org/pypi/khmer#downloads
Files-Excluded: third-party/bzip2
third-party/zlib
+ .ycm_extra_conf.py
Files: *
Copyright: 2010-2015, Michigan State University
@@ -13,6 +14,24 @@ Files: third-party/seqan/*
Copyright: (c) 2006-2013, Knut Reinert, FU Berlin
License: BSD-3-clause
+Files: third-party/seqan/*/boost*.h
+ third-party/seqan/*/concept_checking.h
+ third-party/seqan/*/fundamental_concepts.h
+Copyright: (c) 2002, Paul Mensonides
+ (c) 2001, Housemarque Oy
+ (c) 2000-2003, John Maddock
+ (c) 2001, Darin Adler
+ (c) 2001, Peter Dimov
+ (c) 2002, Bill Kempf
+ (c) 2002, Jens Maurer
+ (c) 2002-2003, 2006, David Abrahams
+ (c) 2003, Gennaro Prota
+ (c) 2003, Eric Friedman
+ (c) 2010, Eric Jourdanneau, Joel Falcou
+ (C) 2000, Jeremy Siek
+ 2002, The Trustees of Indiana University
+License: boost-1.0
+
Files: third-party/smhasher
Copyright: none.
License: public-domain
@@ -49,3 +68,28 @@ License: BSD-3-clause
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+License: boost-1.0
+ Boost Software License - Version 1.0 - August 17th, 2003
+ .
+ Permission is hereby granted, free of charge, to any person or organization
+ obtaining a copy of the software and accompanying documentation covered by
+ this license (the "Software") to use, reproduce, display, distribute,
+ execute, and transmit the Software, and to prepare derivative works of the
+ Software, and to permit third-parties to whom the Software is furnished to
+ do so, all subject to the following:
+ .
+ The copyright notices in the Software and this entire statement, including
+ the above license grant, this restriction and the following disclaimer,
+ must be included in all copies of the Software, in whole or in part, and
+ all derivative works of the Software, unless such copies or derivative
+ works are solely in the form of machine-executable object code generated by
+ a source language processor.
+ .
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+ SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+ FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
diff --git a/debian/khmer-common.install b/debian/khmer-common.install
new file mode 100644
index 0000000..a582dd5
--- /dev/null
+++ b/debian/khmer-common.install
@@ -0,0 +1 @@
+lib/magic usr/share/file/khmer
diff --git a/debian/liboxli1.postinst b/debian/khmer-common.postinst
similarity index 77%
rename from debian/liboxli1.postinst
rename to debian/khmer-common.postinst
index 5f47041..30f33a7 100644
--- a/debian/liboxli1.postinst
+++ b/debian/khmer-common.postinst
@@ -1 +1,2 @@
cd usr/share/file && file --compile khmer
+#DEBHELPER#
diff --git a/debian/liboxli1.postrm b/debian/khmer-common.postrm
similarity index 72%
rename from debian/liboxli1.postrm
rename to debian/khmer-common.postrm
index ed3bb22..1d65e43 100644
--- a/debian/liboxli1.postrm
+++ b/debian/khmer-common.postrm
@@ -1,2 +1,2 @@
-
rm -f usr/share/file/khmer.mgc
+#DEBHELPER#
diff --git a/debian/khmer.install b/debian/khmer.install
old mode 100755
new mode 100644
diff --git a/debian/liboxli-dev.install b/debian/liboxli-dev.install
index ce4f4cd..786b4d4 100755
--- a/debian/liboxli-dev.install
+++ b/debian/liboxli-dev.install
@@ -1,5 +1,7 @@
#!/usr/bin/dh-exec
+lib/liboxli.so usr/lib/${DEB_HOST_MULTIARCH}
+lib/liboxli.so.2 usr/lib/${DEB_HOST_MULTIARCH}
lib/liboxli.a usr/lib/${DEB_HOST_MULTIARCH}
lib/oxli.pc usr/lib/${DEB_HOST_MULTIARCH}/pkgconfig
# NB: we install the headers to the oxli subdir.
-lib/*.hh usr/include/oxli
+lib/*.hh usr/include/oxli
diff --git a/debian/liboxli1.install b/debian/liboxli1.install
index e0d0886..b0934fa 100755
--- a/debian/liboxli1.install
+++ b/debian/liboxli1.install
@@ -1,3 +1,2 @@
#!/usr/bin/dh-exec
-lib/liboxli.so* usr/lib/${DEB_HOST_MULTIARCH}
-lib/magic usr/share/file/khmer
+lib/liboxli.so.2.0rc1+dfsg usr/lib/${DEB_HOST_MULTIARCH}
diff --git a/debian/patches/get_version b/debian/patches/get_version
index 6b4708c..00d1237 100644
--- a/debian/patches/get_version
+++ b/debian/patches/get_version
@@ -8,7 +8,7 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
-@@ -57,16 +58,9 @@
+@@ -57,16 +58,10 @@
# The full version, including alpha/beta/rc tags.
@@ -24,6 +24,7 @@
-sys.path.remove('../')
+release = subprocess.check_output(
+ ['sh', '-c', 'cd ..; dpkg-parsechangelog -S Version'])
++release = release.decode('utf-8')
+
# The short X.Y version.
diff --git a/debian/patches/pr-1148-changes b/debian/patches/pr-1148-changes
new file mode 100644
index 0000000..5882288
--- /dev/null
+++ b/debian/patches/pr-1148-changes
@@ -0,0 +1,358 @@
+--- khmer.orig/lib/Makefile
++++ khmer/lib/Makefile
+@@ -1,3 +1,8 @@
++# Should we use the standard system zlib and libbz2?
++
++USE_SYSTEM_ZLIB ?= false
++USE_SYSTEM_LIBBZ2 ?= false
++
+ # Profile?
+ # Set this variable to true if you wish to profile the codes.
+ WANT_PROFILING=false
+@@ -29,47 +34,61 @@
+ # and are willing to accept the overhead such instrumentation introduces.
+ WITH_INTERNAL_METRICS=false
+
+-
+ PREFIX=/usr/local
+
+ ### NOTE: No user-serviceable parts below this line! ###
+
+ INCLUDES= -I ../third-party/seqan/core/include/ \
+- -I ../third-party/zlib/ \
+- -I ../third-party/bzip2/ \
+ -I ../third-party/smhasher/
+
+-CXXFLAGS=$(INCLUDES)
+-CXX_WARNING_FLAGS=-Wall
+-CXX_OPTIMIZATION_FLAGS=-O3
+-CXX_SHARED_LIB_FLAGS=-fPIC
+-CXXFLAGS+= \
+- $(CXX_WARNING_FLAGS) \
+- $(CXX_OPTIMIZATION_FLAGS) \
+- $(CXX_SHARED_LIB_FLAGS)
+-
+-CFLAGS=$(INCLUDES)
+-C_WARNING_FLAGS=-Wall
+-C_OPTIMIZATION_FLAGS=-O3
+-C_SHARED_LIB_FLAGS=-fPIC
+-CFLAGS+= $(C_WARNING_FLAGS) $(C_OPTIMIZATION_FLAGS) $(C_SHARED_LIB_FLAGS)
++ifeq ($(USE_SYSTEM_ZLIB), false)
++INCLUDES += -I ../third-party/zlib/
++endif
++
++ifeq ($(USE_SYSTEM_LIBBZ2), false)
++INCLUDES += -I ../third-party/bzip2/
++endif
++
++# Warnings in common to C and C++
++WARNINGS=-Wall
+
+-LIBS=
++# Flags in common to C and C++
++COMMON_FLAGS=-O3 -fPIC
++SEQAN_FLAGS=-DSEQAN_HAS_ZLIB=1 -DSEQAN_HAS_BZIP2=1
++
++# Base C/CXXFLAGS
++CPPFLAGS ?=
++CPPFLAGS += $(SEQAN_FLAGS)
++
++CXXFLAGS ?=
++CXXFLAGS += $(COMMON_FLAGS) $(WARNINGS)
++CXXFLAGS += -Wstrict-null-sentinel
++CXXFLAGS += $(INCLUDES) $(CPPFLAGS)
++
++CFLAGS ?=
++CXXFLAGS += $(COMMON_FLAGS) $(WARNINGS)
++CFLAGS += -Wshadow -Wcast-align -Wstrict-prototypes
++CFLAGS += $(INCLUDES) $(CPPFLAGS)
++
++LDFLAGS ?=
++ifneq ($(USE_SYSTEM_ZLIB), false)
++LDFLAGS += -lz
++endif
++
++ifneq ($(USE_SYSTEM_LIBBZ2), false)
++LDFLAGS += -lbz2
++endif
+
+ ifeq ($(WANT_DEBUGGING), true)
+-CXX_DEBUG_FLAGS=-g
+-CXXFLAGS+= $(CXX_DEBUG_FLAGS)
+-CFLAGS+= $(CXX_DEBUG_FLAGS)
+-else
+-CXX_DEBUG_FLAGS=
++DEBUG_FLAGS=-g
++CXXFLAGS += $(DEBUG_FLAGS)
++CFLAGS += $(DEBUG_FLAGS)
+ endif
+
+ ifeq ($(WANT_EXTRA_SANITY_CHECKING), true)
+ DEFINE_KHMER_EXTRA_SANITY_CHECKS=-DKHMER_EXTRA_SANITY_CHECKS
+-CXXFLAGS+= $(DEFINE_KHMER_EXTRA_SANITY_CHECKS)
+-CFLAGS+= $(DEFINE_KHMER_EXTRA_SANITY_CHECKS)
+-else
+-DEFINE_KHMER_EXTRA_SANITY_CHECKS=
++CXXFLAGS += $(DEFINE_KHMER_EXTRA_SANITY_CHECKS)
++CFLAGS += $(DEFINE_KHMER_EXTRA_SANITY_CHECKS)
+ endif
+
+ ifeq ($(WANT_PROFILING), true)
+@@ -77,19 +96,20 @@
+ CXX=tau_cxx.sh
+ endif
+ ifeq ($(PROFILER_OF_CHOICE), gprof)
+-PROFILING_LIBS=-pg
+-CXXFLAGS+= -pg
+-LIBS+= $(PROFILING_LIBS)
++CXXFLAGS += -pg
++CFLAGS += -pg
++LDFLAGS += -pg
+ endif
+ endif
+
+ ifeq ($(WITH_INTERNAL_METRICS), true)
+-CXXFLAGS+= -DWITH_INTERNAL_METRICS
++CXXFLAGS += -DWITH_INTERNAL_METRICS
++CFLAGS += -DWITH_INTERNAL_METRICS
+ endif
+
+ # Place POSIX threads last in linking order, if needed.
+ ifneq ($(shell uname), Linux)
+-LIBS+= -pthread
++LDFLAGS += -pthread
+ endif
+
+
+@@ -99,47 +119,50 @@
+ rm -f chkomp)
+
+ ifeq ($(HAVE_OPENMP), true)
+- CFLAGS += -fopenmp
+- CXXFLAGS += -fopenmp
++CXXFLAGS +=-fopenmp
++CFLAGS +=-fopenmp
+ endif
+
+-VERSION = $(shell python get_version.py)
++ifneq ($(PACKAGE_VERSION),)
++VERSION = $(PACKAGE_VERSION)
++else
++VERSION = $(shell ./get_version.py)
++endif
+
+-LIBVERSION = $(shell python get_version.py | sed -e 's/^\([^-]*\)-.*/\1/')
+-LIBKHMERSO=libkhmer.so.$(LIBVERSION)
++MAJOR_VERSION = $(shell echo $(VERSION) | sed -e 's/^\([^-\.]*\)\.\([^-\.]*\).*/\1/')
++MINOR_VERSION = $(shell echo $(VERSION) | sed -e 's/^\([^-\.]*\)\.\([^-\.]*\).*/\2/')
+
+-CXXFLAGS+= -DVERSION=$(VERSION)
++LIB_VERSION = $(MAJOR_VERSION).$(MINOR_VERSION)
++
++ifeq ($(shell uname), Darwin)
++SHARED_EXT = dylib
++SONAME = liboxli.$(SHARED_EXT).$(MAJOR_VERSION)
++SONAME_FLAGS = -install_name $(PREFIX)/lib/$(SONAME) -compatibility_version $(MAJOR_VERSION) -current_version $(VERSION)
++else
++SHARED_EXT = so
++SONAME = liboxli.$(SHARED_EXT).$(MAJOR_VERSION)
++SONAME_FLAGS = -Wl,-soname=$(SONAME)
++endif
++
++LIBKHMERSO=liboxli.$(SHARED_EXT).$(LIB_VERSION)
++
++CXXFLAGS += -DVERSION=$(VERSION)
+
+ NO_UNIQUE_RC=0
+-CXXFLAGS+= -DNO_UNIQUE_RC=$(NO_UNIQUE_RC)
++CXXFLAGS += -DNO_UNIQUE_RC=$(NO_UNIQUE_RC)
++CFLAGS += -DNO_UNIQUE_RC=$(NO_UNIQUE_RC)
+
+ export CXX
+ export CFLAGS
+ export CXXFLAGS
+-export LIBS
++export LDFLAGS
+ export VERSION
+
+
+ #### Third party dependencies ####
+-# ZLIB
++# ZLIB, use .lo not .o, so we get -fPIC and other library-related flags
+ ZLIB_DIR=../third-party/zlib
+ ZLIB_OBJS_BASE=\
+- adler32.o \
+- crc32.o \
+- deflate.o \
+- infback.o \
+- inffast.o \
+- inflate.o \
+- inftrees.o \
+- trees.o \
+- zutil.o \
+- compress.o \
+- uncompr.o \
+- gzclose.o \
+- gzlib.o \
+- gzread.o \
+- gzwrite.o
+-ZLIB_PIC_OBJS_BASE=\
+ adler32.lo \
+ crc32.lo \
+ deflate.lo \
+@@ -157,7 +180,6 @@
+ gzwrite.lo
+
+ ZLIB_OBJS=$(addprefix $(ZLIB_DIR)/, $(ZLIB_OBJS_BASE))
+-ZLIB_PIC_OBJS=$(addprefix $(ZLIB_DIR)/, $(ZLIB_PIC_OBJS_BASE))
+
+ # BZ2
+ BZIP2_DIR=../third-party/bzip2
+@@ -173,7 +195,7 @@
+ BZIP2_OBJS=$(addprefix $(BZIP2_DIR)/, $(BZIP2_OBJS_BASE))
+
+
+-#### khmer proper below here ####
++#### oxli proper below here ####
+
+ LIBKHMER_OBJS= \
+ counting.o \
+@@ -187,9 +209,22 @@
+ read_parsers.o \
+ subset.o \
+ trace_logger.o \
+- murmur3.o \
+- $(BZIP2_OBJS) \
+- $(ZLIB_PIC_OBJS)
++ murmur3.o
++
++PRECOMILE_OBJS ?=
++PRECLEAN_TARGS ?=
++
++ifeq ($(USE_SYSTEM_ZLIB), false)
++LIBKHMER_OBJS += $(ZLIB_OBJS)
++PRECOMILE_OBJS += $(ZLIB_OBJS)
++PRECLEAN_TARGS += zlibclean
++endif
++
++ifeq ($(USE_SYSTEM_LIBBZ2), false)
++LIBKHMER_OBJS += $(BZIP2_OBJS)
++PRECOMILE_OBJS += $(BZIP2_OBJS)
++PRECLEAN_TARGS += libbz2clean
++endif
+
+ KHMER_HEADERS= \
+ counting.hh \
+@@ -206,38 +241,35 @@
+ subset.hh \
+ trace_logger.hh
+
+-TEST_PROGS = test-Colors test-read-aligner test-compile
+-
+ # START OF RULES #
+
+ # The all rule comes first!
+-all: $(LIBKHMERSO) libkhmer.a khmer.pc
++all: $(LIBKHMERSO) liboxli.a oxli.pc
+
+-clean:
+- rm -f *.o *.a *.so* khmer.pc $(TEST_PROGS)
++zlibclean:
+ (cd $(ZLIB_DIR) && make distclean)
++libbz2clean:
+ (cd $(BZIP2_DIR) && make -f Makefile-libbz2_so clean)
+
+-test: $(TEST_PROGS)
++clean: $(PRECLEAN_TARGS)
++ rm -f *.o *.a *.$(SHARED_EXT)* oxli.pc $(TEST_PROGS)
+
+-install: $(LIBKHMERSO) libkhmer.a khmer.pc $(KHMER_HEADERS)
+- mkdir -p $(PREFIX)/lib $(PREFIX)/lib/pkgconfig $(PREFIX)/include/
+- cp -r $(KHMER_HEADERS) \
+- ../third-party/smhasher/MurmurHash3.h \
+- $(PREFIX)/include/
+- cp khmer.pc $(PREFIX)/lib/pkgconfig/
+- cp $(LIBKHMERSO) libkhmer.a $(PREFIX)/lib
+- ln -sf $(PREFIX)/lib/$(LIBKHMERSO) $(PREFIX)/lib/libkhmer.so
++install: $(LIBKHMERSO) liboxli.a oxli.pc $(KHMER_HEADERS)
++ mkdir -p $(PREFIX)/lib $(PREFIX)/lib/pkgconfig $(PREFIX)/include/oxli
++ cp -r $(KHMER_HEADERS) \
++ ../third-party/smhasher/MurmurHash3.h \
++ $(PREFIX)/include/oxli/
++ cp oxli.pc $(PREFIX)/lib/pkgconfig/
++ cp $(LIBKHMERSO) liboxli.a $(PREFIX)/lib
++ ln -sf $(PREFIX)/lib/$(LIBKHMERSO) $(PREFIX)/lib/$(SONAME)
++ ln -sf $(PREFIX)/lib/$(SONAME) $(PREFIX)/lib/liboxli.$(SHARED_EXT)
+
+-khmer.pc: khmer.pc.in
++oxli.pc: oxli.pc.in
+ sed -e 's, at prefix@,$(PREFIX),' -e 's, at VERSION@,$(VERSION),' $< >$@
+
+ $(ZLIB_OBJS):
+ (cd $(ZLIB_DIR) && ./configure && make $(ZLIB_OBJS_BASE))
+
+-$(ZLIB_PIC_OBJS):
+- (cd $(ZLIB_DIR) && ./configure && make $(ZLIB_PIC_OBJS_BASE))
+-
+ $(BZIP2_OBJS):
+ (cd $(BZIP2_DIR) && make -f Makefile-libbz2_so $(BZIP2_OBJS_BASE))
+
+@@ -245,17 +277,14 @@
+ murmur3.o: ../third-party/smhasher/MurmurHash3.cc
+ $(CXX) $(CXXFLAGS) -c -o $@ $<
+
+-%.o: %.cc $(ZLIB_OBJS) $(ZLIB_PIC_OBJS) $(BZIP2_OBJS) $(KHMER_HEADERS)
+- $(CXX) $(CXXFLAGS) -c -o $@ $<
++%.o: %.cc $(PRECOMILE_OBJS) $(KHMER_HEADERS)
++ $(CXX) $(CXXFLAGS) $(LDFLAGS) -c -o $@ $<
+
+ $(LIBKHMERSO): $(LIBKHMER_OBJS)
+- $(CXX) $(CXXFLAGS) -shared -o $@ $(LIBKHMER_OBJS)
+- ln -sf $(LIBKHMERSO) libkhmer.so
++ $(CXX) $(CXXFLAGS) $(LDFLAGS) $(SONAME_FLAGS) -shared -o $@ $^
++ ln -sf $(LIBKHMERSO) $(SONAME)
++ ln -sf $(SONAME) liboxli.$(SHARED_EXT)
+
+-libkhmer.a: $(LIBKHMER_OBJS)
+- ar rcs $@ $(LIBKHMER_OBJS)
++liboxli.a: $(LIBKHMER_OBJS)
++ ar rcs $@ $^
+ ranlib $@
+-
+-# catch-all rule for test drivers
+-test-%: test-%.cc libkhmer.a
+- $(CXX) $(CXXFLAGS) -I . -o $@ $< libkhmer.a
+--- khmer.orig/lib/khmer.pc.in
++++ /dev/null
+@@ -1,14 +0,0 @@
+-prefix=@prefix@
+-exec_prefix=${prefix}
+-libdir=${exec_prefix}/lib
+-sharedlibdir=${libdir}
+-includedir=${prefix}/include
+-
+-Name: khmer
+-Description: The unsupported core C++ library from the khmer project
+-URL: http://khmer.readthedocs.org/
+-Version: @VERSION@
+-
+-Requires:
+-Libs: -L${libdir} -L${sharedlibdir} -lkhmer
+-Cflags: -I${includedir}
+--- /dev/null
++++ khmer/lib/oxli.pc.in
+@@ -0,0 +1,14 @@
++prefix=@prefix@
++exec_prefix=${prefix}
++libdir=${exec_prefix}/lib
++sharedlibdir=${libdir}
++includedir=${prefix}/include
++
++Name: khmer
++Description: The unsupported core C++ library from the khmer project
++URL: http://khmer.readthedocs.org/
++Version: @VERSION@
++
++Requires:
++Libs: -L${libdir} -L${sharedlibdir} -lkhmer
++Cflags: -I${includedir}
diff --git a/debian/patches/series b/debian/patches/series
index 6048cae..f30874c 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,3 +1,5 @@
+setup-py-py3
+pr-1148-changes
local-libs
older-setuptools
disable_google_analytics
diff --git a/debian/patches/setup-py-py3 b/debian/patches/setup-py-py3
new file mode 100644
index 0000000..3b02dc7
--- /dev/null
+++ b/debian/patches/setup-py-py3
@@ -0,0 +1,8 @@
+--- khmer.orig/setup.py
++++ khmer/setup.py
+@@ -1,4 +1,4 @@
+-#! /usr/bin/env python
++#!/usr/bin/env python3
+ # This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+ # Copyright (C) Michigan State University, 2009-2015. It is licensed under
+ # the three-clause BSD license; see doc/LICENSE.txt.
diff --git a/debian/rules b/debian/rules
index 3c0ace2..d309e42 100755
--- a/debian/rules
+++ b/debian/rules
@@ -3,27 +3,20 @@ export DH_VERBOSE=1
export PYBUILD_NAME=khmer
export PYBUILD_SYSTEM=distutils
export PYBUILD_INSTALL_ARGS='--install-scripts=/usr/lib/khmer/bin'
+export PYBUILD_DESTDIR_python3=debian/khmer
+
+# Force C++ library build to use Debian zlib1g-dev/libbz2-dev
export USE_SYSTEM_ZLIB=true
export USE_SYSTEM_LIBBZ2=true
-# dh_python2 renames compiled extensions to include a multiarch triplet
-# ex: _khmermodule.so -> _khmermodule.x86_64-linux-gnu.so
-# While Python can handle the multiarch triplet it can't do so in combination
-# with the 'module' infix. So we remove the 'module' infix before dh_python2
-# runs
-# See https://lists.debian.org/debian-python/2015/02/msg00047.html
-export PYBUILD_AFTER_INSTALL := \
- mv {destdir}{install_dir}/khmer/_khmermodule.so \
- {destdir}{install_dir}/khmer/_khmer.so
-
export PACKAGE_VERSION := $(shell dpkg-parsechangelog | egrep '^Version:' | cut -f 2 -d ' ')
%:
- dh $@ --with python2,sphinxdoc --buildsystem=pybuild --parallel
+ dh $@ --with python3,sphinxdoc --buildsystem=pybuild --parallel
override_dh_auto_build:
dh_auto_build
- ln -s "`pwd`"/.pybuild/pythonX.Y_*/build/khmer/*.so khmer && \
+ ln -sf "`pwd`"/.pybuild/pythonX.Y_*/build/khmer/*.so khmer && \
./setup.py build_sphinx
ronn -r --manual=khmer --organization='Michigan State University' \
debian/khmer.1.ronn
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
index ad49df1..dc157cd 100644
--- a/debian/upstream/metadata
+++ b/debian/upstream/metadata
@@ -1,7 +1,7 @@
-Bug-Database: https://github.com/ged-lab/khmer/issues
-Bug-Submit: https://github.com/ged-lab/khmer/issues/new
+Bug-Database: https://github.com/dib-lab/khmer/issues
+Bug-Submit: https://github.com/dib-lab/khmer/issues/new
Cite-As:
-Changelog: https://raw.githubusercontent.com/ged-lab/khmer/master/ChangeLog
+Changelog: https://raw.githubusercontent.com/dib-lab/khmer/master/ChangeLog
Contact: khmer-project at idyll.org
Funding: AFRI Competitive Grant no. 2010-65205-20361 from the USDA NIFA, National Human Genome Research Institute of the National Institutes of Health under Award Number R01HG007513
Name: khmer
@@ -61,7 +61,7 @@ Reference:
DOI: 10.1186/1471-2105-9-11
PMID: 18184432
ISSN: 1471-2105
-Repository-Browse: https://github.com/ged-lab/khmer
-Repository: https://github.com/ged-lab/khmer.git
+Repository-Browse: https://github.com/dib-lab/khmer
+Repository: https://github.com/dib-lab/khmer.git
diff --git a/doc/conf.py b/doc/conf.py
index 99017b4..e400ab2 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -193,7 +193,7 @@ htmlhelp_basename = 'khmerdoc'
html_context = {
"google_analytics_id" : 'UA-51731094-1',
"disqus_shortname" : 'khmer-docs',
-# "github_base_account" : 'ged-lab',
+# "github_base_account" : 'dib-lab',
"github_project" : 'khmer',
}
diff --git a/doc/dev/binary-file-formats.rst b/doc/dev/binary-file-formats.rst
new file mode 100644
index 0000000..8359af3
--- /dev/null
+++ b/doc/dev/binary-file-formats.rst
@@ -0,0 +1,115 @@
+khmer/Oxli Binary File Formats
+==============================
+
+- C++ macro definitions are given in parentheses.
+- C++ types are given in square brackets.
+- ``Len`` is the field's size, in bytes, and ``Off`` is the field's zero-based
+ byte offset in the file/section.
+
+khmer v1.4 and previous
+~~~~~~~~~~~~~~~~~~~~~~~
+
+CountingHash
+------------
+
+The header is in the format below, in file offset order. There is no magic
+string.
+
+================== =========== ==============================================
+Field Length Value
+================== =========== ==============================================
+Version 1 ``0x04`` (``SAVED_FORMAT_VERSION``)
+File Type 1 ``0x01`` (``SAVED_COUNTING_HT``)
+Use Bigcount 1 ``1`` if bigcounts is used, else ``0``
+K-size 1 k-mer length, ``1 <= k <= 32``
+Number of Tables 1 Number of Count-min Sketch tables
+================== =========== ==============================================
+
+
+khmer v2.0 formats
+~~~~~~~~~~~~~~~~~~
+
+
+Magic string
+------------
+
+All formats shall have the "magic string" ``OXLI`` as their first bytes, after
+any external compression/encoding (e.g. gzip encapsulation) is removed. Note
+that this makes them incompatible with older versions of khmer.
+
+Countgraph
+----------
+
+(a.k.a ``CountingHash``, a Count-min Sketch)
+
+:Preferred extension: '.ct' (count table)
+
+The header is in the format below, again in the order of file offset.
+
+================== ===== ===== ==============================================
+Field Len Off Value
+================== ===== ===== ==============================================
+Magic string 4 0 ``OXLI`` (``SAVED_SIGNATURE``)
+Version 1 4 ``0x04`` (``SAVED_FORMAT_VERSION``)
+File Type 1 5 ``0x01`` (``SAVED_COUNTING_HT``)
+Use Bigcount 1 6 ``0x01`` if bigcounts is used, else ``0x00``
+K-size 4 7 k-mer length, ``ht._ksize``. [``uint32_t``]
+Number of Tables 1 11 Number of Count-min Sketch tables,
+ ``ht._n_tables``. [``uint8_t``]
+================== ===== ===== ==============================================
+
+Then follow the Countgraph's tables. For each table:
+
+================== ===== ===== ==============================================
+Field Len Off Value
+================== ===== ===== ==============================================
+Table size 8 0 Length of this table, ``ht._tablesizes[i]``.
+ [``uint64_t``]
+Bins N 8 This table's bins, length given by previous
+ field. [``uint8_t``]
+================== ===== ===== ==============================================
+
+Then follows a single value, the [``uint64_t``] number of ``kmer: count``
+pairs. Then follows the Bigcount map, if this number is greater than zero. For
+each kmer:
+
+================== ===== ===== ==============================================
+Field Len Off Value
+================== ===== ===== ==============================================
+Kmer 8 0 Kmer's hash [``HashIntoType/uint64_t``].
+Count 2 8 Kmer's count [``uint16_t``].
+================== ===== ===== ==============================================
+
+
+Nodegraph
+---------
+
+(a.k.a ``HashBits``, a Bloom Filter)
+
+:Preferred extension: '.pt' (presence table)
+
+The header is in the format below, again in the order of file offset. Value
+macro definitions are given in parentheses.
+
+================== ===== ===== ==============================================
+Field Len Off Value
+================== ===== ===== ==============================================
+Magic string 4 0 ``OXLI`` (``SAVED_SIGNATURE``)
+Version 1 4 ``0x04`` (``SAVED_FORMAT_VERSION``)
+File Type 1 5 ``0x02`` (``SAVED_HASHBITS``)
+K-size 4 6 k-mer length, ``ht._ksize``. [``unsigned int``]
+Number of Tables 1 10 Number of Nodegraph tables. ``ht._n_tables``.
+ [``uint8_t``]
+================== ===== ===== ==============================================
+
+Then follow the Nodegraph's tables. For each table:
+
+================== ======= ===== ==============================================
+Field Len Off Value
+================== ======= ===== ==============================================
+Table size 8 0 Length of table, **in bits** [``uint64_t``].
+Bins N/8+1 8 This table's bytes, length given by previous
+ field, divided by 8, plus 1 [``uint8_t``].
+================== ======= ===== ==============================================
+
+.. todo:: Document ``Tags``, ``Stoptags``, ``Subset``, ``Labelset``
diff --git a/doc/dev/development.rst b/doc/dev/development.rst
index d9dac70..414f914 100644
--- a/doc/dev/development.rst
+++ b/doc/dev/development.rst
@@ -23,11 +23,11 @@ git and GitHub strategies
Still in the works, but read `this
<http://scottchacon.com/2011/08/31/github-flow.html>`__.
-Make a branch on ged-lab (preferred so others can contribute) or fork the
+Make a branch on dib-lab (preferred so others can contribute) or fork the
repository and make a branch there.
Each piece or fix you are working on should have its own branch; make a pull-
-request to ged-lab/master to aid in code review, testing, and feedback.
+request to dib-lab/master to aid in code review, testing, and feedback.
If you want your code integrated then it needs to be mergable
@@ -38,7 +38,7 @@ Example pull request update using the command line:
#. Checkout the source branch of the pull request
``git checkout my-pull-request``
#. Pull in the destination of the pull request and resolve any conflicts
- ``git pull git at github.com:ged-lab/khmer.git master``
+ ``git pull git at github.com:dib-lab/khmer.git master``
#. Push your update to the source of the pull request ``git push``
#. Jenkins will automatically attempt to build and test your pull requests.
diff --git a/doc/dev/for-khmer-developers.rst b/doc/dev/for-khmer-developers.rst
index b62d05e..bbf6368 100644
--- a/doc/dev/for-khmer-developers.rst
+++ b/doc/dev/for-khmer-developers.rst
@@ -1,11 +1,11 @@
A guide for khmer committers
============================
-This document is for people with commit rights to github.com/ged-lab/khmer.
+This document is for people with commit rights to github.com/dib-lab/khmer.
----
-If you have commit privileges to the ged-lab/khmer repository, here are a
+If you have commit privileges to the dib-lab/khmer repository, here are a
few useful tips.
First, never merge something unless it's been through a review! This
diff --git a/doc/dev/getting-started.rst b/doc/dev/getting-started.rst
index 87e3ed1..a07c743 100644
--- a/doc/dev/getting-started.rst
+++ b/doc/dev/getting-started.rst
@@ -1,4 +1,4 @@
-.. This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+.. This file is part of khmer, https://github.com/dib-lab/khmer/, and is
Copyright (C) Michigan State University, 2009-2015. It is licensed under
the three-clause BSD license; see doc/LICENSE.txt.
Contact: khmer-project at idyll.org
@@ -61,7 +61,7 @@ One-time Preparation
(We use GitHub to manage khmer contributions.)
-#. Fork `github.com/ged-lab/khmer <http://github.com/ged-lab/khmer>`__.
+#. Fork `github.com/dib-lab/khmer <https://github.com/dib-lab/khmer>`__.
Visit that page, and then click on the 'fork' button (upper right).
@@ -79,10 +79,10 @@ One-time Preparation
(This makes a local copy of khmer on your development machine.)
-#. Add a git reference to the khmer ged-lab repository::
+#. Add a git reference to the khmer dib-lab repository::
cd khmer
- git remote add ged https://github.com/ged-lab/khmer.git
+ git remote add dib https://github.com/dib-lab/khmer.git
cd ../
(This makes it easy for you to pull down the latest changes in the
@@ -171,7 +171,7 @@ Building khmer and running the tests
make
If this fails, we apologize -- please `go create a new issue
- <https://github.com/ged-lab/khmer/issues?direction=desc&sort=created&state=open>`__,
+ <https://github.com/dib-lab/khmer/issues?direction=desc&sort=created&state=open>`__,
paste in the failure message, and we'll try to help you work through it!
(This takes the C++ source code and compiles it into something that Python
@@ -199,8 +199,8 @@ Claiming an issue and starting to develop
#. Find an open issue and claim it.
Go to `the list of open khmer issues
- <https://github.com/ged-lab/khmer/issues?direction=desc&sort=created&state=open>`__
- and find one you like; we suggest starting with `the low-hanging fruit issues <https://github.com/ged-lab/khmer/issues?direction=desc&labels=low-hanging-fruit&page=1&sort=created&state=open>`__).
+ <https://github.com/dib-lab/khmer/issues?direction=desc&sort=created&state=open>`__
+ and find one you like; we suggest starting with `the low-hanging fruit issues <https://github.com/dib-lab/khmer/issues?direction=desc&labels=low-hanging-fruit&page=1&sort=created&state=open>`__).
Once you've found an issue you like, make sure that no one has been
assigned to it (see "assignee", bottom right near "notifications").
@@ -213,17 +213,18 @@ Claiming an issue and starting to develop
from the main khmer master branch::
git checkout master
- git pull ged master
+ git pull dib master
(This pulls in all of the latest changes from whatever we've been
- doing on ged-lab.)
+ doing on dib-lab.)
#. Create a new branch and link it to your fork on GitHub::
- git checkout -b fix/issue_number
- git push -u origin fix/issue_number
+ git checkout -b fix/brief_issue_description
+ git push -u origin fix/brief_issue_description
- where you replace "issue_number" with the number of the issue.
+ where you replace "brief_issue_description" with 2-3 words, separated
+ by underscores, describing the issue.
(This is the set of changes you're going to ask to be merged into khmer.)
@@ -235,10 +236,10 @@ Claiming an issue and starting to develop
#. Periodically update your branch from the main khmer master branch::
- git pull ged master
+ git pull dib master
(This pulls in all of the latest changes from whatever we've been
- doing on ged-lab - important especially during periods of fast change
+ doing on dib-lab - important especially during periods of fast change
or for long-running pull requests.
#. Run the tests and/or build the docs *before* pushing to GitHub::
diff --git a/doc/dev/hackathon.rst b/doc/dev/hackathon.rst
index 4b49aab..a74a0f9 100644
--- a/doc/dev/hackathon.rst
+++ b/doc/dev/hackathon.rst
@@ -7,7 +7,7 @@ blog post <http://ivory.idyll.org/blog/2014-khmer-hackathon.html>`__).
----
Please track `khmer issue #446
-<https://github.com/ged-lab/khmer/issues/446>`__ for up-to-the-minute
+<https://github.com/dib-lab/khmer/issues/446>`__ for up-to-the-minute
information. You can subscribe to this issue (lower right on issue page)
to get automatic e-mail updates.
@@ -21,7 +21,7 @@ khmer is a piece of scientific software that does cool stuff in biology.
can read more about khmer `here <http://figshare.com/articles/The_khmer_software_package_enabling_efficient_sequence_analysis/979190>`__ if you like.)
The important bit about khmer is that we develop it openly,
-at http://github.com/ged-lab/khmer; we use reasonably OK software development
+at https://github.com/dib-lab/khmer; we use reasonably OK software development
practices; and we're interested in spreading the gospel, so to speak.
So! For this Hackathon, we're providing a "mentored software
@@ -37,9 +37,9 @@ To get started, go to :doc:`getting-started`!
You can contact us directly at khmer-project at idyll.org, but if you're
experience trouble of any kind, please feel to `create an issue
-<https://github.com/ged-lab/khmer/issues?direction=desc&sort=created&state=open>`__
+<https://github.com/dib-lab/khmer/issues?direction=desc&sort=created&state=open>`__
where we can help you out. Also keep an eye on `issue #446
-<https://github.com/ged-lab/khmer/issues/446>`__ where we're updating
+<https://github.com/dib-lab/khmer/issues/446>`__ where we're updating
Hackathon information more generally.
Problems, questions, and solutions
@@ -54,7 +54,7 @@ Problems, questions, and solutions
#. There's a bug in this documentation! But I can fix it...
Oh noes! Fixes are welcome -- these docs are in branch
- 'docs/hackathon' on http://github.com/ged-lab/khmer/, so please
+ 'docs/hackathon' on https://github.com/dib-lab/khmer/, so please
send PRs there. Or if you haven't worked through the process yet,
please `add an issue
- <https://github.com/ged-lab/khmer/issues?direction=desc&sort=created&state=open>`__ and we'll be on it.
+ <https://github.com/dib-lab/khmer/issues?direction=desc&sort=created&state=open>`__ and we'll be on it.
diff --git a/doc/dev/index.rst b/doc/dev/index.rst
index d5b2b15..cead1c3 100644
--- a/doc/dev/index.rst
+++ b/doc/dev/index.rst
@@ -26,3 +26,4 @@ Contents:
details
development
crazy-ideas
+ binary-file-formats
diff --git a/doc/dev/release.rst b/doc/dev/release.rst
index 8e6ed67..cb3da63 100644
--- a/doc/dev/release.rst
+++ b/doc/dev/release.rst
@@ -16,7 +16,7 @@ release makers, following this checklist by MRC.
#. The below should be done in a clean checkout::
cd `mktemp -d`
- git clone git at github.com:ged-lab/khmer.git
+ git clone git at github.com:dib-lab/khmer.git
cd khmer
#. (Optional) Check for updates to versioneer::
@@ -65,7 +65,7 @@ release makers, following this checklist by MRC.
the letter 'v'::
git tag v${new_version}-${rc}
- git push --tags git at github.com:ged-lab/khmer.git
+ git push --tags git at github.com:dib-lab/khmer.git
#. Test the release candidate. Bonus: repeat on Mac OS X::
@@ -78,11 +78,11 @@ release makers, following this checklist by MRC.
cd testenv1
source bin/activate
- git clone --depth 1 --branch v${new_version}-${rc} https://github.com/ged-lab/khmer.git
+ git clone --depth 1 --branch v${new_version}-${rc} https://github.com/dib-lab/khmer.git
cd khmer
make install-dependencies
make test
- normalize-by-median.py --version 2>&1 | grep ${new_version}-${rc} && \
+ normalize-by-median.py --version 2>&1 | grep khmer\ ${new_version}-${rc} && \
echo 1st manual version check passed
pip uninstall -y khmer; pip uninstall -y khmer; make install
mkdir ../not-khmer # if there is a subdir named 'khmer' nosetest will execute tests
@@ -92,10 +92,10 @@ release makers, following this checklist by MRC.
# Secondly we test via pip
- cd ../testenv2
+ cd ../../testenv2
source bin/activate
pip install -U setuptools==3.4.1
- pip install -e git+https://github.com/ged-lab/khmer.git@v${new_version}-${rc}#egg=khmer
+ pip install -e git+https://github.com/dib-lab/khmer.git@v${new_version}-${rc}#egg=khmer
cd src/khmer
make install-dependencies
make dist
@@ -103,7 +103,7 @@ release makers, following this checklist by MRC.
cp dist/khmer*tar.gz ../../../testenv3/
pip uninstall -y khmer; pip uninstall -y khmer; make install
cd ../.. # no subdir named khmer here, safe for nosetesting installed khmer module
- normalize-by-median.py --version 2>&1 | grep ${new_version}-${rc} && \
+ normalize-by-median.py --version 2>&1 | grep khmer\ ${new_version}-${rc} && \
echo 2nd manual version check passed
nosetests khmer --attr '!known_failing'
@@ -141,7 +141,7 @@ release makers, following this checklist by MRC.
pip install screed nose
pip install -i https://testpypi.python.org/pypi --pre --no-clean khmer
nosetests khmer --attr '!known_failing'
- normalize-by-median.py --version 2>&1 | grep ${new_version}-${rc} && \
+ normalize-by-median.py --version 2>&1 | grep khmer\ ${new_version}-${rc} && \
echo 3rd manual version check passed
cd build/khmer
make test
@@ -166,8 +166,8 @@ so:
#. Delete the release candidate tag and push the tag updates to GitHub.::
git tag -d v${new_version}-${rc}
- git push git at github.com:ged-lab/khmer.git
- git push --tags git at github.com:ged-lab/khmer.git
+ git push git at github.com:dib-lab/khmer.git
+ git push --tags git at github.com:dib-lab/khmer.git
#. Add the release on GitHub, using the tag you just pushed. Name
it 'version X.Y.Z', and copy and paste in the release notes.
diff --git a/doc/dev/scripts-and-sandbox.rst b/doc/dev/scripts-and-sandbox.rst
index b8a11af..3b149cc 100644
--- a/doc/dev/scripts-and-sandbox.rst
+++ b/doc/dev/scripts-and-sandbox.rst
@@ -66,7 +66,7 @@ Copyright message
Our current Copyright message is::
#
- # This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+ # This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project at idyll.org
diff --git a/doc/index.rst b/doc/index.rst
index 8135f1f..47fcf72 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -12,8 +12,8 @@ khmer -- k-mer counting & filtering FTW
Rajaram Srinivasan, Qingpeng Zhang, and C. Titus Brown
:Contact: khmer-project at idyll.org
-:GitHub: https://github.com/ged-lab/khmer
-:Chat: https://gitter.im/ged-lab/khmer
+:GitHub: https://github.com/dib-lab/khmer
+:Chat: https://gitter.im/dib-lab/khmer
:License: BSD
@@ -52,6 +52,7 @@ Contents:
introduction
contributors
citations
+ whats-new-2.0
release-notes/index
user/index
dev/index
diff --git a/doc/release-notes/release-1.0.1.md b/doc/release-notes/release-1.0.1.md
index 5668731..e2121bc 100644
--- a/doc/release-notes/release-1.0.1.md
+++ b/doc/release-notes/release-1.0.1.md
@@ -29,25 +29,25 @@ All of these are pre-existing.
Some users have reported that normalize-by-median.py will utilize more
memory than it was configured for. This is being investigated in
-https://github.com/ged-lab/khmer/issues/266
+https://github.com/dib-lab/khmer/issues/266
Some FASTQ files confuse our parser when running with more than one thread.
For example, while using load-into-counting.py. If you experience this then
add "--threads=1" to your command line. This issue is being tracked in
-https://github.com/ged-lab/khmer/issues/249
+https://github.com/dib-lab/khmer/issues/249
If your k-mer table (hashfile) gets truncated, perhaps from a full filesystem, then our
-tools currently will get stuck. This is being tracked in https://github.com/ged-lab/khmer/issues/247 and https://github.com/ged-lab/khmer/issues/246
+tools currently will get stuck. This is being tracked in https://github.com/dib-lab/khmer/issues/247 and https://github.com/dib-lab/khmer/issues/246
Paired-end reads from Casava 1.8 currently require renaming for use in
normalize-by-median and abund-filter when used in paired mode. The
-integration of a fix for this is being tracked in https://github.com/ged-lab/khmer/issues/23
+integration of a fix for this is being tracked in https://github.com/dib-lab/khmer/issues/23
annotate-partitions.py only outputs FASTA even if given a FASTQ file. This
-issue is being tracked in https://github.com/ged-lab/khmer/issues/46
+issue is being tracked in https://github.com/dib-lab/khmer/issues/46
A user reported that abundance-dist-single.py fails with small files and many
-threads. This issue is being tracked in https://github.com/ged-lab/khmer/issues/75
+threads. This issue is being tracked in https://github.com/dib-lab/khmer/issues/75
## Contributors
diff --git a/doc/release-notes/release-1.0.1.rst b/doc/release-notes/release-1.0.1.rst
index 8b57b76..d491759 100644
--- a/doc/release-notes/release-1.0.1.rst
+++ b/doc/release-notes/release-1.0.1.rst
@@ -40,30 +40,30 @@ All of these are pre-existing.
Some users have reported that normalize-by-median.py will utilize more
memory than it was configured for. This is being investigated in
-https://github.com/ged-lab/khmer/issues/266
+https://github.com/dib-lab/khmer/issues/266
Some FASTQ files confuse our parser when running with more than one
thread. For example, while using load-into-counting.py. If you
experience this then add "--threads=1" to your command line. This issue
-is being tracked in https://github.com/ged-lab/khmer/issues/249
+is being tracked in https://github.com/dib-lab/khmer/issues/249
If your k-mer table (hashfile) gets truncated, perhaps from a full
filesystem, then our tools currently will get stuck. This is being
-tracked in https://github.com/ged-lab/khmer/issues/247 and
-https://github.com/ged-lab/khmer/issues/246
+tracked in https://github.com/dib-lab/khmer/issues/247 and
+https://github.com/dib-lab/khmer/issues/246
Paired-end reads from Casava 1.8 currently require renaming for use in
normalize-by-median and abund-filter when used in paired mode. The
integration of a fix for this is being tracked in
-https://github.com/ged-lab/khmer/issues/23
+https://github.com/dib-lab/khmer/issues/23
annotate-partitions.py only outputs FASTA even if given a FASTQ file.
This issue is being tracked in
-https://github.com/ged-lab/khmer/issues/46
+https://github.com/dib-lab/khmer/issues/46
A user reported that abundance-dist-single.py fails with small files and
many threads. This issue is being tracked in
-https://github.com/ged-lab/khmer/issues/75
+https://github.com/dib-lab/khmer/issues/75
Contributors
------------
diff --git a/doc/release-notes/release-1.0.md b/doc/release-notes/release-1.0.md
index 99ce8ab..598a2c5 100644
--- a/doc/release-notes/release-1.0.md
+++ b/doc/release-notes/release-1.0.md
@@ -61,25 +61,25 @@ All of these are pre-existing.
Some users have reported that normalize-by-median.py will utilize more
memory than it was configured for. This is being investigated in
-https://github.com/ged-lab/khmer/issues/266
+https://github.com/dib-lab/khmer/issues/266
Some FASTQ files confuse our parser when running with more than one thread.
For example, while using load-into-counting.py. If you experience this then
add "--threads=1" to your command line. This issue is being tracked in
-https://github.com/ged-lab/khmer/issues/249
+https://github.com/dib-lab/khmer/issues/249
If your k-mer table (hashfile) gets truncated, perhaps from a full filesystem, then our
-tools currently will get stuck. This is being tracked in https://github.com/ged-lab/khmer/issues/247 and https://github.com/ged-lab/khmer/issues/96 and https://github.com/ged-lab/khmer/issues/246
+tools currently will get stuck. This is being tracked in https://github.com/dib-lab/khmer/issues/247 and https://github.com/dib-lab/khmer/issues/96 and https://github.com/dib-lab/khmer/issues/246
Paired-end reads from Casava 1.8 currently require renaming for use in
normalize-by-median and abund-filter when used in paired mode. The
-integration of a fix for this is being tracked in https://github.com/ged-lab/khmer/issues/23
+integration of a fix for this is being tracked in https://github.com/dib-lab/khmer/issues/23
annotate-partitions.py only outputs FASTA even if given a FASTQ file. This
-issue is being tracked in https://github.com/ged-lab/khmer/issues/46
+issue is being tracked in https://github.com/dib-lab/khmer/issues/46
A user reported that abundance-dist-single.py fails with small files and many
-threads. This issue is being tracked in https://github.com/ged-lab/khmer/issues/75
+threads. This issue is being tracked in https://github.com/dib-lab/khmer/issues/75
## Contributors
diff --git a/doc/release-notes/release-1.0.rst b/doc/release-notes/release-1.0.rst
index 94dc769..7b7b3ab 100644
--- a/doc/release-notes/release-1.0.rst
+++ b/doc/release-notes/release-1.0.rst
@@ -76,31 +76,31 @@ All of these are pre-existing.
Some users have reported that normalize-by-median.py will utilize more
memory than it was configured for. This is being investigated in
-https://github.com/ged-lab/khmer/issues/266
+https://github.com/dib-lab/khmer/issues/266
Some FASTQ files confuse our parser when running with more than one
thread. For example, while using load-into-counting.py. If you
experience this then add "--threads=1" to your command line. This issue
-is being tracked in https://github.com/ged-lab/khmer/issues/249
+is being tracked in https://github.com/dib-lab/khmer/issues/249
If your k-mer table (hashfile) gets truncated, perhaps from a full
filesystem, then our tools currently will get stuck. This is being
-tracked in https://github.com/ged-lab/khmer/issues/247 and
-https://github.com/ged-lab/khmer/issues/96 and
-https://github.com/ged-lab/khmer/issues/246
+tracked in https://github.com/dib-lab/khmer/issues/247 and
+https://github.com/dib-lab/khmer/issues/96 and
+https://github.com/dib-lab/khmer/issues/246
Paired-end reads from Casava 1.8 currently require renaming for use in
normalize-by-median and abund-filter when used in paired mode. The
integration of a fix for this is being tracked in
-https://github.com/ged-lab/khmer/issues/23
+https://github.com/dib-lab/khmer/issues/23
annotate-partitions.py only outputs FASTA even if given a FASTQ file.
This issue is being tracked in
-https://github.com/ged-lab/khmer/issues/46
+https://github.com/dib-lab/khmer/issues/46
A user reported that abundance-dist-single.py fails with small files and
many threads. This issue is being tracked in
-https://github.com/ged-lab/khmer/issues/75
+https://github.com/dib-lab/khmer/issues/75
Contributors
------------
diff --git a/doc/release-notes/release-1.1.md b/doc/release-notes/release-1.1.md
index 0629f04..cd01bcb 100644
--- a/doc/release-notes/release-1.1.md
+++ b/doc/release-notes/release-1.1.md
@@ -4,7 +4,7 @@ This is v1.1, a minor version release; this version adds several new scripts.
Docs at: https://khmer.readthedocs.org/en/v1.1/
-Release notes w/links: https://github.com/ged-lab/khmer/releases/tag/v1.1
+Release notes w/links: https://github.com/dib-lab/khmer/releases/tag/v1.1
## New items of note:
@@ -32,27 +32,27 @@ All of these are pre-existing.
Some users have reported that normalize-by-median.py will utilize more
memory than it was configured for. This is being investigated in
-https://github.com/ged-lab/khmer/issues/266
+https://github.com/dib-lab/khmer/issues/266
Some FASTQ files confuse our parser when running with more than one thread.
For example, while using load-into-counting.py. If you experience this then
add "--threads=1" to your command line. This issue is being tracked in
-https://github.com/ged-lab/khmer/issues/249
+https://github.com/dib-lab/khmer/issues/249
If your k-mer table is truncated on write, an error may not be reported; this
-is being tracked in https://github.com/ged-lab/khmer/issues/443.
+is being tracked in https://github.com/dib-lab/khmer/issues/443.
However, khmer will now (correctly) fail when trying to read a truncated file
(See #333).
Paired-end reads from Casava 1.8 currently require renaming for use in
normalize-by-median and abund-filter when used in paired mode. The
-integration of a fix for this is being tracked in https://github.com/ged-lab/khmer/issues/23
+integration of a fix for this is being tracked in https://github.com/dib-lab/khmer/issues/23
Some scripts only output FASTA even if given a FASTQ file. This issue
-is being tracked in https://github.com/ged-lab/khmer/issues/46
+is being tracked in https://github.com/dib-lab/khmer/issues/46
A user reported that abundance-dist-single.py fails with small files and many
-threads. This issue is being tracked in https://github.com/ged-lab/khmer/issues/75
+threads. This issue is being tracked in https://github.com/dib-lab/khmer/issues/75
## Contributors
diff --git a/doc/release-notes/release-1.1.rst b/doc/release-notes/release-1.1.rst
index edcd85b..d08da5c 100644
--- a/doc/release-notes/release-1.1.rst
+++ b/doc/release-notes/release-1.1.rst
@@ -7,7 +7,7 @@ scripts.
Docs at: https://khmer.readthedocs.org/en/v1.1/
Release notes w/links:
-https://github.com/ged-lab/khmer/releases/tag/v1.1
+https://github.com/dib-lab/khmer/releases/tag/v1.1
New items of note:
------------------
@@ -46,29 +46,29 @@ All of these are pre-existing.
Some users have reported that normalize-by-median.py will utilize more
memory than it was configured for. This is being investigated in
-https://github.com/ged-lab/khmer/issues/266
+https://github.com/dib-lab/khmer/issues/266
Some FASTQ files confuse our parser when running with more than one
thread. For example, while using load-into-counting.py. If you
experience this then add "--threads=1" to your command line. This issue
-is being tracked in https://github.com/ged-lab/khmer/issues/249
+is being tracked in https://github.com/dib-lab/khmer/issues/249
If your k-mer table is truncated on write, an error may not be reported;
-this is being tracked in https://github.com/ged-lab/khmer/issues/443.
+this is being tracked in https://github.com/dib-lab/khmer/issues/443.
However, khmer will now (correctly) fail when trying to read a truncated
file (See #333).
Paired-end reads from Casava 1.8 currently require renaming for use in
normalize-by-median and abund-filter when used in paired mode. The
integration of a fix for this is being tracked in
-https://github.com/ged-lab/khmer/issues/23
+https://github.com/dib-lab/khmer/issues/23
Some scripts only output FASTA even if given a FASTQ file. This issue is
-being tracked in https://github.com/ged-lab/khmer/issues/46
+being tracked in https://github.com/dib-lab/khmer/issues/46
A user reported that abundance-dist-single.py fails with small files and
many threads. This issue is being tracked in
-https://github.com/ged-lab/khmer/issues/75
+https://github.com/dib-lab/khmer/issues/75
Contributors
------------
diff --git a/doc/release-notes/release-1.2.md b/doc/release-notes/release-1.2.md
index 9f7bd4a..22a6dac 100644
--- a/doc/release-notes/release-1.2.md
+++ b/doc/release-notes/release-1.2.md
@@ -61,33 +61,33 @@ Multithreaded reading will drop reads. This major issue has been present for
several khmer releases and was only found via a much larger test case that we
had been previously using. Credit to @camillescott. Workaround: disable
threading. The next release will fix this and the other FAST[AQ] parsing
-issues. https://github.com/ged-lab/khmer/issues/681
+issues. https://github.com/dib-lab/khmer/issues/681
Some users have reported that normalize-by-median.py will utilize more
memory than it was configured for. This is being investigated in
-https://github.com/ged-lab/khmer/issues/266
+https://github.com/dib-lab/khmer/issues/266
Some FASTQ files confuse our parser when running with more than one thread.
For example, while using load-into-counting.py. If you experience this then
add "--threads=1" to your command line. This issue is being tracked in
-https://github.com/ged-lab/khmer/issues/249
+https://github.com/dib-lab/khmer/issues/249
If your k-mer table is truncated on write, an error may not be reported; this
-is being tracked in https://github.com/ged-lab/khmer/issues/443.
+is being tracked in https://github.com/dib-lab/khmer/issues/443.
However, khmer will now (correctly) fail when trying to read a truncated file
(See #333).
Paired-end reads from Casava 1.8 currently require renaming for use in
normalize-by-median and abund-filter when used in paired mode. The
integration of a fix for this is being tracked in
-https://github.com/ged-lab/khmer/issues/23
+https://github.com/dib-lab/khmer/issues/23
Some scripts only output FASTA even if given a FASTQ file. This issue
-is being tracked in https://github.com/ged-lab/khmer/issues/46
+is being tracked in https://github.com/dib-lab/khmer/issues/46
A user reported that abundance-dist-single.py fails with small files and many
threads. This issue is being tracked in
-https://github.com/ged-lab/khmer/issues/75
+https://github.com/dib-lab/khmer/issues/75
## Contributors
diff --git a/doc/release-notes/release-1.2.rst b/doc/release-notes/release-1.2.rst
index 9e851ec..9a2ee66 100644
--- a/doc/release-notes/release-1.2.rst
+++ b/doc/release-notes/release-1.2.rst
@@ -65,33 +65,33 @@ for several khmer releases and was only found via a much larger test
case that we had been previously using. Credit to @camillescott.
Workaround: disable threading. The next release will fix this and the
other FAST[AQ] parsing issues.
-https://github.com/ged-lab/khmer/issues/681
+https://github.com/dib-lab/khmer/issues/681
Some users have reported that normalize-by-median.py will utilize more
memory than it was configured for. This is being investigated in
-https://github.com/ged-lab/khmer/issues/266
+https://github.com/dib-lab/khmer/issues/266
Some FASTQ files confuse our parser when running with more than one
thread. For example, while using load-into-counting.py. If you
experience this then add "--threads=1" to your command line. This issue
-is being tracked in https://github.com/ged-lab/khmer/issues/249
+is being tracked in https://github.com/dib-lab/khmer/issues/249
If your k-mer table is truncated on write, an error may not be reported;
-this is being tracked in https://github.com/ged-lab/khmer/issues/443.
+this is being tracked in https://github.com/dib-lab/khmer/issues/443.
However, khmer will now (correctly) fail when trying to read a truncated
file (See #333).
Paired-end reads from Casava 1.8 currently require renaming for use in
normalize-by-median and abund-filter when used in paired mode. The
integration of a fix for this is being tracked in
-https://github.com/ged-lab/khmer/issues/23
+https://github.com/dib-lab/khmer/issues/23
Some scripts only output FASTA even if given a FASTQ file. This issue is
-being tracked in https://github.com/ged-lab/khmer/issues/46
+being tracked in https://github.com/dib-lab/khmer/issues/46
A user reported that abundance-dist-single.py fails with small files and
many threads. This issue is being tracked in
-https://github.com/ged-lab/khmer/issues/75
+https://github.com/dib-lab/khmer/issues/75
Contributors
------------
diff --git a/doc/release-notes/release-1.3.md b/doc/release-notes/release-1.3.md
index 95edb2c..9d21c0d 100644
--- a/doc/release-notes/release-1.3.md
+++ b/doc/release-notes/release-1.3.md
@@ -41,7 +41,7 @@ Some users have reported that normalize-by-median.py will utilize more memory
than it was configured for. This is being investigated in #266
If your k-mer table is truncated on write, an error may not be reported; this
-is being tracked in https://github.com/ged-lab/khmer/issues/443. However, khmer
+is being tracked in https://github.com/dib-lab/khmer/issues/443. However, khmer
will now (correctly) fail when trying to read a truncated file (See #333).
Paired-end reads from Casava 1.8 currently require renaming for use in
diff --git a/doc/release-notes/release-1.3.rst b/doc/release-notes/release-1.3.rst
index 68347f3..5b0b317 100644
--- a/doc/release-notes/release-1.3.rst
+++ b/doc/release-notes/release-1.3.rst
@@ -42,7 +42,7 @@ Some users have reported that normalize-by-median.py will utilize more
memory than it was configured for. This is being investigated in #266
If your k-mer table is truncated on write, an error may not be reported;
-this is being tracked in https://github.com/ged-lab/khmer/issues/443.
+this is being tracked in https://github.com/dib-lab/khmer/issues/443.
However, khmer will now (correctly) fail when trying to read a truncated
file (See #333).
diff --git a/doc/release-notes/release-1.4.md b/doc/release-notes/release-1.4.md
index 7329f96..793172e 100644
--- a/doc/release-notes/release-1.4.md
+++ b/doc/release-notes/release-1.4.md
@@ -217,10 +217,10 @@ All of these are pre-existing.
Some users have reported that normalize-by-median.py will utilize more memory
than it was configured for. This is being investigated in
-https://github.com/ged-lab/khmer/issues/266
+https://github.com/dib-lab/khmer/issues/266
Some scripts only output FASTA even if given a FASTQ file. This issue is being
-tracked in https://github.com/ged-lab/khmer/issues/46
+tracked in https://github.com/dib-lab/khmer/issues/46
## Contributors
diff --git a/doc/release-notes/release-1.4.rst b/doc/release-notes/release-1.4.rst
index 667f7d3..c257299 100644
--- a/doc/release-notes/release-1.4.rst
+++ b/doc/release-notes/release-1.4.rst
@@ -233,10 +233,10 @@ All of these are pre-existing.
Some users have reported that normalize-by-median.py will utilize more
memory than it was configured for. This is being investigated in
-https://github.com/ged-lab/khmer/issues/266
+https://github.com/dib-lab/khmer/issues/266
Some scripts only output FASTA even if given a FASTQ file. This issue is
-being tracked in https://github.com/ged-lab/khmer/issues/46
+being tracked in https://github.com/dib-lab/khmer/issues/46
Contributors
------------
diff --git a/doc/roadmap.rst b/doc/roadmap.rst
index d0011ef..cc25cfd 100644
--- a/doc/roadmap.rst
+++ b/doc/roadmap.rst
@@ -12,7 +12,7 @@ fundamental changes need to happen. This document outlines our plan to do so
while minimizing the impact of these changes on our existing users.
The discussion that lead to this document can be read at
-https://github.com/ged-lab/khmer/issues/389
+https://github.com/dib-lab/khmer/issues/389
Remainder of v1.x series
========================
diff --git a/doc/run-corn-50m.sh b/doc/run-corn-50m.sh
index 4755644..8ce5a0b 100644
--- a/doc/run-corn-50m.sh
+++ b/doc/run-corn-50m.sh
@@ -1,7 +1,7 @@
#! /bin/bash
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project at idyll.org
#
diff --git a/doc/user/choosing-table-sizes.rst b/doc/user/choosing-table-sizes.rst
index bbc112d..caba889 100644
--- a/doc/user/choosing-table-sizes.rst
+++ b/doc/user/choosing-table-sizes.rst
@@ -1,53 +1,55 @@
.. vim: set filetype=rst
-==============================
-Choosing table sizes for khmer
-==============================
+==========================
+Setting khmer memory usage
+==========================
If you look at the documentation for the scripts (:doc:`scripts`) you'll
-see two mysterious parameters -- :option:`-N` and :option:`-x`, or, more
-verbosely, :option:`-n_tables` and :option:`--tablesize`. What are these, and
-how do you specify them?
+see a :option:`-M` parameter that sets the maximum memory usage for
+any script that uses k-mer counting tables or k-mer graphs. What is this?
+
+khmer uses a special data structure that lets it store counting tables
+and k-mer graphs in very low memory; the trick is that you must fix
+the amount of memory khmer can use before running it. (See `Pell et
+al., 2012 <http://www.ncbi.nlm.nih.gov/pubmed/22847406>`__ and `Zhang
+et al., 2014 <http://www.ncbi.nlm.nih.gov/pubmed/25062443>`__ for the
+details.) This is what the :option:`-M` parameter does.
+
+If you set it too low, khmer will warn you at the end of the run to set it higher.
+See below for some good choices for various kinds of data.
+
+**Note for khmer 1.x users:** as of khmer 2.0, the :option:`-M`
+parameter sets the :option:`-N`/:option:`--n_tables` and
+:option:`-x`/:option:`--max-tablesize` parameters automatically.
+You can still set these parameters directly if you wish.
The really short version
========================
There is no way (except for experience, rules of thumb, and intuition) to
-know what these parameters should be up front. So, make the product of
-these two parameters be the size of your available memory::
+know what this parameter should be up front. So, use the maximum
+available memory::
- -N 4 -x 4e9
+ -M 16e9
-for a machine with 16 GB of free memory, for example. Also see
-the rules of thumb, below.
+for a machine with 16 GB of free memory, for example.
The short version
=================
-These parameters specify the maximum memory usage of the primary data
+This parameter specifies the maximum memory usage of the primary data
structure in khmer, which is basically N big hash tables of size x.
The **product** of the number of hash tables and the size of the hash
-tables specifies the total amount of memory used.
+tables specifies the total amount of memory used, which is what the
+:option:`-M` parameter sets.
-This table is used to track k-mers. If it is too small, khmer
-will fail in various ways (and should complain), but there is no harm
+These tables are used to track k-mers. If they are too small, khmer
+will fail in various ways (and will complain), but there is no harm
in making it too large. So, **the absolute safest thing to do is to
specify as much memory as is available**. Most scripts will inform
you of the total memory usage, and (at the end) will complain if it's
too small.
-For normalize-by-median, khmer uses one byte per hash entry, so: if
-you had 16 GB of available RAM, you should specify something like ``-N
-4 -x 4e9``, which multiplies out to about 16 GB.
-
-For the graph partitioning stuff, khmer uses only 1 bit per k-mer, so
-you can multiple your available memory by 8: for 16 GB of RAM, you could
-use ::
-
- -N 4 -x 32e9
-
-which multiplies out to 128 Gbits of RAM, or 16 Gbytes.
-
Life is a bit more complicated than this, however, because some scripts --
load-into-counting and load-graph -- keep ancillary information that will
consume memory beyond this table data structure. So if you run out of
@@ -124,26 +126,24 @@ an error-code.
Rules of thumb
--------------
-Just use -N 4, always, and vary the -x parameter.
-
For digital normalization, we recommend:
- - ``-x 2e9`` for any amount of sequencing for a single microbial genome,
+ - ``-M 8e9`` for any amount of sequencing for a single microbial genome,
MDA-amplified or single colony.
- - ``-x 4e9`` for up to a billion mRNAseq reads from any organism. Past that,
+ - ``-M 16e9`` for up to a billion mRNAseq reads from any organism. Past that,
increase it.
- - ``-x 8e9`` for most eukaryotic genome samples.
+ - ``-M 32e9`` for most eukaryotic genome samples.
- - ``-x 8e9`` will also handle most "simple" metagenomic samples (HMP on down)
+ - ``-M 32e9`` will also handle most "simple" metagenomic samples (HMP on down)
- For metagenomic samples that are more complex, such as soil or marine,
- start as high as possible. For example, we are using ``-x 64e9`` for
+ start as high as possible. For example, we are using ``-M 256e9`` for
~300 Gbp of soil reads.
For partitioning of complex metagenome samples, we recommend starting
as high as you can -- something like half your system memory. So if
-you have 256 GB of RAM, use ``-N 4 -x 256e9`` which will use 4 x 256 /
-8 = 128 GB of RAM for the basic graph storage, leaving other memory
-for the ancillary data structures.
+you have 256 GB of RAM, use ``-M 128e9`` which will use 128 GB of RAM
+for the basic graph storage, leaving other memory for the ancillary
+data structures.
diff --git a/doc/user/examples.rst b/doc/user/examples.rst
index e692ede..d7443ea 100644
--- a/doc/user/examples.rst
+++ b/doc/user/examples.rst
@@ -18,9 +18,9 @@ as the effect of digital normalization and partitioning on the k-mer
abundance distribution.
See `the script for running everything
-<https://github.com/ged-lab/khmer/blob/master/examples/stamps/do.sh>`__
+<https://github.com/dib-lab/khmer/blob/master/examples/stamps/do.sh>`__
and `the IPython Notebook
-<http://nbviewer.ipython.org/urls/raw.github.com/ged-lab/khmer/master/examples/stamps%2520k-mer%2520distributions.ipynb>`__.
+<http://nbviewer.ipython.org/urls/raw.github.com/dib-lab/khmer/master/examples/stamps%2520k-mer%2520distributions.ipynb>`__.
For an overall discussion and some slides to explain what's going on,
visit `the Web site for a 2013 HMP metagenome assembly webinar that
diff --git a/doc/user/getting-help.rst b/doc/user/getting-help.rst
index 007b02a..e84ff67 100644
--- a/doc/user/getting-help.rst
+++ b/doc/user/getting-help.rst
@@ -51,4 +51,4 @@ GitHub
------
You are also welcome to report an issue you are having using GitHub::
-https://github.com/ged-lab/khmer/issues/new
+https://github.com/dib-lab/khmer/issues/new
diff --git a/doc/user/guide.rst b/doc/user/guide.rst
index 54458b5..8f3714c 100644
--- a/doc/user/guide.rst
+++ b/doc/user/guide.rst
@@ -29,7 +29,7 @@ khmer is a general `framework for low-memory k-mer counting, filtering,
and advanced trickery <http://khmer.readthedocs.org/en/latest/>`__.
The latest source is always available `here
-<https://github.com/ged-lab/khmer>`__.
+<https://github.com/dib-lab/khmer>`__.
khmer is really focused on short read data, and, more specifically,
Illumina, because that's where we have a too-much-data problem.
diff --git a/doc/user/known-issues.rst b/doc/user/known-issues.rst
index 842adbd..b6ab542 100644
--- a/doc/user/known-issues.rst
+++ b/doc/user/known-issues.rst
@@ -5,17 +5,7 @@ Known Issues
Some users have reported that normalize-by-median.py will utilize more
memory than it was configured for. This is being investigated in
-https://github.com/ged-lab/khmer/issues/266
-
-If your k-mer table is truncated on write, an error may not be reported; this
-is being tracked in https://github.com/ged-lab/khmer/issues/443.
-However, khmer will now (correctly) fail when trying to read a truncated file
-(See #333).
-
-Paired-end reads from Casava 1.8 currently require renaming for use in
-normalize-by-median and abund-filter when used in paired mode. The
-integration of a fix for this is being tracked in
-https://github.com/ged-lab/khmer/issues/23
+https://github.com/dib-lab/khmer/issues/266
Some scripts only output FASTA even if given a FASTQ file. This issue
-is being tracked in https://github.com/ged-lab/khmer/issues/46
+is being tracked in https://github.com/dib-lab/khmer/issues/46
diff --git a/doc/user/scripts.rst b/doc/user/scripts.rst
index 157af7b..3bc0cbb 100644
--- a/doc/user/scripts.rst
+++ b/doc/user/scripts.rst
@@ -10,14 +10,10 @@ distribution. Below is our documentation for these scripts. Note
that all scripts can be given :option:`-h` which will print out
a list of arguments taken by that script.
-Many scripts take :option:`-x` and :option:`-N` parameters, which drive khmer's
-memory usage. These parameters depend on details of your data set; for more information
-on how to choose them, see :doc:`choosing-table-sizes`.
-
-You can also override the default values of :option:`--ksize`/:option:`-k`,
-:option:`--n_tables`/:option:`-N`, and :option:`--min-tablesize`/:option:`-x` with
-the environment variables `KHMER_KSIZE`, `KHMER_N_TABLES`, and
-`KHMER_MIN_TABLESIZE` respectively.
+Scripts that use k-mer counting tables or k-mer graphs take an
+:option:`-M` parameter, which sets the maximum memory usage in bytes.
+This should generally be set as high as possible; see
+:doc:`choosing-table-sizes` for more information.
1. :ref:`scripts-counting`
2. :ref:`scripts-partitioning`
diff --git a/doc/whats-new-2.0.rst b/doc/whats-new-2.0.rst
new file mode 100644
index 0000000..2a7f0c7
--- /dev/null
+++ b/doc/whats-new-2.0.rst
@@ -0,0 +1,29 @@
+.. vim: set filetype=rst
+
+What's New In khmer 2.0?
+########################
+
+Incompatible changes
+====================
+
+New parameter to set table size and number of tables.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There is now a :option:`-M`/:option:`--max-memory-usage` parameter
+that sets the number of tables (:option:`-N`/:option:`--num_tables`)
+and tablesize (:option:`-x`/:option:`--max-tablesize`) parameters
+automatically to match the desired memory usage.
+
+(:option:`--min-tablesize` was also renamed to
+:option:`--max-tablesize` to reflect this more desirable behavior.)
+
+Binary file formats have changed!
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+All binary khmer formats (presence tables, counting tables, tag sets,
+stop tags, and partition subsets) have changed. Files are now
+prefixed with the string ``OXLI`` to indicate that they are from
+this project.
+
+Files of the above types made in previous versions of khmer are not compatible
+with v2.0; the reverse is also true.
diff --git a/examples/stamps/do.sh b/examples/stamps/do.sh
index 4777142..8a170c7 100755
--- a/examples/stamps/do.sh
+++ b/examples/stamps/do.sh
@@ -1,7 +1,7 @@
#!/bin/bash
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
diff --git a/jenkins-build.sh b/jenkins-build.sh
index 96007e9..94f7d78 100755
--- a/jenkins-build.sh
+++ b/jenkins-build.sh
@@ -56,7 +56,7 @@ then
#hg clone http://bitbucket.org/mcrusoe/sphinx-contrib
#hg clone http://athyra.ged.msu.edu/~mcrusoe/sphinx-contrib
#pip install --upgrade sphinx-contrib/autoprogram/
- pip install -r doc/requirements.txt
+ #pip install -r doc/requirements.txt # now covered by make install-dep
make doc
fi
make pylint 2>&1 > pylint.out
diff --git a/khmer/__init__.py b/khmer/__init__.py
index b48d763..032ca7b 100644
--- a/khmer/__init__.py
+++ b/khmer/__init__.py
@@ -1,12 +1,14 @@
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2010-2015. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project at idyll.org
#
"""This is khmer; please see http://khmer.readthedocs.org/."""
-from khmer._khmer import CountingHash
+from __future__ import print_function
+
+from khmer._khmer import CountingHash as _CountingHash
from khmer._khmer import LabelHash as _LabelHash
from khmer._khmer import Hashbits as _Hashbits
from khmer._khmer import HLLCounter as _HLLCounter
@@ -43,36 +45,6 @@ __version__ = get_versions()['version']
del get_versions
-def new_hashbits(k, starting_size, n_tables=2):
- """Return a new hashbits object. Deprecated.
-
- This factory method is deprecated in favor of creating a Hashbits object
- directly via 'new Hashbits(...)'.
-
- Keyword argument:
- k -- kmer size to use
- starting_size -- lower bound on hashsize to use
- n_tables -- number of hash tables to use (default = 2)
- """
- primes = get_n_primes_above_x(n_tables, starting_size)
-
- return _Hashbits(k, primes)
-
-
-def new_counting_hash(k, starting_size, n_tables=2):
- """Return a new countinghash object.
-
- Keyword arguments:
- k -- kmer size to use
- starting_size -- lower bound on hashsize to use
- n_tables -- number of hash tables to use (default = 2)
- n_threads -- number of simultaneous threads to execute (default = 1)
- """
- primes = get_n_primes_above_x(n_tables, starting_size)
-
- return CountingHash(k, primes)
-
-
def load_hashbits(filename):
"""Load a hashbits object from the given filename and return it.
@@ -91,7 +63,7 @@ def load_counting_hash(filename):
Keyword argument:
filename -- the name of the counting_hash file
"""
- hashtable = CountingHash(1, [1])
+ hashtable = _CountingHash(1, [1])
hashtable.load(filename)
return hashtable
@@ -109,6 +81,7 @@ def extract_hashbits_info(filename):
ksize = None
n_tables = None
table_size = None
+ signature = None
version = None
ht_type = None
@@ -118,11 +91,15 @@ def extract_hashbits_info(filename):
try:
with open(filename, 'rb') as hashbits:
+ signature, = unpack('4s', hashbits.read(4))
version, = unpack('B', hashbits.read(1))
ht_type, = unpack('B', hashbits.read(1))
ksize, = unpack('I', hashbits.read(uint_size))
n_tables, = unpack('B', hashbits.read(uchar_size))
table_size, = unpack('Q', hashbits.read(ulonglong_size))
+ if signature != b"OXLI":
+ raise ValueError("Node graph '{}' is missing file type "
+ "signature".format(filename) + str(signature))
except:
raise ValueError("Presence table '{}' is corrupt ".format(filename))
@@ -141,6 +118,7 @@ def extract_countinghash_info(filename):
ksize = None
n_tables = None
table_size = None
+ signature = None
version = None
ht_type = None
use_bigcount = None
@@ -150,12 +128,16 @@ def extract_countinghash_info(filename):
try:
with open(filename, 'rb') as countinghash:
+ signature, = unpack('4s', countinghash.read(4))
version, = unpack('B', countinghash.read(1))
ht_type, = unpack('B', countinghash.read(1))
use_bigcount, = unpack('B', countinghash.read(1))
ksize, = unpack('I', countinghash.read(uint_size))
n_tables, = unpack('B', countinghash.read(1))
table_size, = unpack('Q', countinghash.read(ulonglong_size))
+ if signature != b'OXLI':
+ raise ValueError("Counting table '{}' is missing file type "
+ "signature. ".format(filename) + str(signature))
except:
raise ValueError("Counting table '{}' is corrupt ".format(filename))
@@ -179,12 +161,20 @@ def calc_expected_collisions(hashtable, force=False, max_false_pos=.2):
fp_all = fp_one ** n_ht
if fp_all > max_false_pos:
- print >>sys.stderr, "**"
- print >>sys.stderr, "** ERROR: the graph structure is too small for "
- print >>sys.stderr, "this data set. Increase k-mer presence table "
- print >>sys.stderr, "size/num of tables."
- print >>sys.stderr, "** Do not use these results!!"
- print >>sys.stderr, "**"
+ print("**", file=sys.stderr)
+ print("** ERROR: the graph structure is too small for ",
+ file=sys.stderr)
+ print("** this data set. Increase data structure size",
+ file=sys.stderr)
+ print("** with --max_memory_usage/-M.", file=sys.stderr)
+ print("**", file=sys.stderr)
+ print("** Do not use these results!!", file=sys.stderr)
+ print("**", file=sys.stderr)
+ print("** (estimated false positive rate of %.3f;" % fp_all,
+ file=sys.stderr)
+ print("max allowable %.3f" % max_false_pos, file=sys.stderr)
+ print("**", file=sys.stderr)
+
if not force:
sys.exit(1)
@@ -215,6 +205,9 @@ def get_n_primes_near_x(number, target):
number -- the number of primes to find
target -- the number to step backwards from
"""
+ if target == 1 and number == 1:
+ return [1]
+
primes = []
i = target - 1
if i % 2 == 0:
@@ -223,27 +216,11 @@ def get_n_primes_near_x(number, target):
if is_prime(i):
primes.append(i)
i -= 2
- return primes
-
-def get_n_primes_above_x(number, target):
- """Forward-find primes smaller than target.
-
- Step forwards until a number of primes (other than 2) have been
- found that are smaller than the target and return them.
+ if len(primes) != number:
+ raise Exception("unable to find %d prime numbers < %d" % (number,
+ target))
- Keyword arguments:
- number -- the number of primes to find
- target -- the number to step forwards from
- """
- primes = []
- i = target + 1
- if i % 2 == 0:
- i += 1
- while len(primes) != number and i > 0:
- if is_prime(i):
- primes.append(i)
- i += 2
return primes
@@ -253,19 +230,38 @@ def get_n_primes_above_x(number, target):
# Additional functionality can be added to these classes as appropriate.
-class LabelHash(_LabelHash):
+class CountingHash(_CountingHash):
def __new__(cls, k, starting_size, n_tables):
- primes = get_n_primes_above_x(n_tables, starting_size)
- c = _LabelHash.__new__(cls, k, primes)
+ primes = get_n_primes_near_x(n_tables, starting_size)
+ c = _CountingHash.__new__(cls, k, primes)
c.primes = primes
return c
+class LabelHash(_LabelHash):
+
+ def __new__(cls, k, starting_size, n_tables):
+ hb = Hashbits(k, starting_size, n_tables)
+ c = _LabelHash.__new__(cls, hb)
+ c.graph = hb
+ return c
+
+
+class CountingLabelHash(_LabelHash):
+
+ def __new__(cls, k, starting_size, n_tables):
+ primes = get_n_primes_near_x(n_tables, starting_size)
+ hb = _CountingHash(k, primes)
+ c = _LabelHash.__new__(cls, hb)
+ c.graph = hb
+ return c
+
+
class Hashbits(_Hashbits):
def __new__(cls, k, starting_size, n_tables):
- primes = get_n_primes_above_x(n_tables, starting_size)
+ primes = get_n_primes_near_x(n_tables, starting_size)
c = _Hashbits.__new__(cls, k, primes)
c.primes = primes
return c
diff --git a/khmer/_khmermodule.cc b/khmer/_khmer.cc
similarity index 72%
rename from khmer/_khmermodule.cc
rename to khmer/_khmer.cc
index 8a90c77..d58e832 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmer.cc
@@ -1,5 +1,5 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see doc/LICENSE.txt.
// Contact: khmer-project at idyll.org
@@ -34,6 +34,7 @@ using namespace read_parsers;
#if (PY_MAJOR_VERSION >= 3)
#define PyInt_Check(arg) PyLong_Check(arg)
#define PyInt_AsLong(arg) PyLong_AsLong(arg)
+#define PyInt_FromLong(arg) PyLong_FromLong(arg)
#endif
//
@@ -43,6 +44,27 @@ using namespace read_parsers;
#include "bytesobject.h"
+//
+// Python 2/3 compatibility: Module initialization
+// http://python3porting.com/cextensions.html#module-initialization
+//
+
+#if PY_MAJOR_VERSION >= 3
+ #define MOD_ERROR_VAL NULL
+ #define MOD_SUCCESS_VAL(val) val
+ #define MOD_INIT(name) PyMODINIT_FUNC PyInit_##name(void)
+ #define MOD_DEF(ob, name, doc, methods) \
+ static struct PyModuleDef moduledef = { \
+ PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \
+ ob = PyModule_Create(&moduledef);
+#else
+ #define MOD_ERROR_VAL
+ #define MOD_SUCCESS_VAL(val)
+ #define MOD_INIT(name) void init##name(void)
+ #define MOD_DEF(ob, name, doc, methods) \
+ ob = Py_InitModule3(name, methods, doc);
+#endif
+
using namespace khmer;
//
@@ -50,7 +72,7 @@ using namespace khmer;
//
extern "C" {
- void init_khmer();
+ MOD_INIT(_khmer);
}
// Configure module logging.
@@ -104,14 +126,6 @@ public:
};
};
-class _khmer_signal : public _khmer_exception
-{
-public:
- _khmer_signal(std::string message) : _khmer_exception(message) { };
-};
-
-typedef pre_partition_info _pre_partition_info;
-
/***********************************************************************/
//
@@ -145,7 +159,7 @@ static
PyObject *
Read_get_name(khmer_Read_Object * obj, void * closure )
{
- return PyBytes_FromString(obj->read->name.c_str()) ;
+ return PyUnicode_FromString(obj->read->name.c_str()) ;
}
@@ -153,7 +167,7 @@ static
PyObject *
Read_get_sequence(khmer_Read_Object * obj, void * closure)
{
- return PyBytes_FromString(obj->read->sequence.c_str()) ;
+ return PyUnicode_FromString(obj->read->sequence.c_str()) ;
}
@@ -161,7 +175,7 @@ static
PyObject *
Read_get_quality(khmer_Read_Object * obj, void * closure)
{
- return PyBytes_FromString(obj->read->quality.c_str()) ;
+ return PyUnicode_FromString(obj->read->quality.c_str()) ;
}
@@ -169,7 +183,7 @@ static
PyObject *
Read_get_annotations(khmer_Read_Object * obj, void * closure)
{
- return PyBytes_FromString(obj->read->annotations.c_str()) ;
+ return PyUnicode_FromString(obj->read->annotations.c_str()) ;
}
@@ -335,7 +349,7 @@ _ReadParser_iternext( PyObject * self )
exc = e.what();
} catch (InvalidRead &e) {
exc = e.what();
- }
+ }
}
Py_END_ALLOW_THREADS
@@ -616,40 +630,57 @@ _PyObject_to_khmer_ReadParser( PyObject * py_object )
return ((python:: khmer_ReadParser_Object *)py_object)->parser;
}
+typedef struct {
+ PyObject_HEAD
+ pre_partition_info * PrePartitionInfo;
+} khmer_PrePartitionInfo_Object;
-/***********************************************************************/
-
-//
-// KCountingHash object
-//
-
-void free_pre_partition_info(void * p)
+static
+void
+khmer_PrePartitionInfo_dealloc(khmer_PrePartitionInfo_Object * obj)
{
- _pre_partition_info * ppi = (_pre_partition_info *) p;
- delete ppi;
+ delete obj->PrePartitionInfo;
+ obj->PrePartitionInfo = NULL;
+ Py_TYPE(obj)->tp_free((PyObject*)obj);
}
-void free_subset_partition_info(void * p)
-{
- SubsetPartition * subset_p = (SubsetPartition *) p;
- delete subset_p;
-}
+static PyTypeObject khmer_PrePartitionInfo_Type = {
+ PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */
+ "_khmer.PrePartitionInfo", /* tp_name */
+ sizeof(khmer_PrePartitionInfo_Object),/* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)khmer_PrePartitionInfo_dealloc, /* tp_dealloc */
+ 0, /* tp_print */
+ 0, /* tp_getattr */
+ 0, /* tp_setattr */
+ 0, /* tp_compare */
+ 0, /* tp_repr */
+ 0, /* tp_as_number */
+ 0, /* tp_as_sequence */
+ 0, /* tp_as_mapping */
+ 0, /* tp_hash */
+ 0, /* tp_call */
+ 0, /* tp_str */
+ 0, /* tp_getattro */
+ 0, /* tp_setattro */
+ 0, /* tp_as_buffer */
+ Py_TPFLAGS_DEFAULT, /* tp_flags */
+ "Stores a k-kmer and a set of tagged seen k-mers.", /* tp_doc */
+};
+
+
+/***********************************************************************/
typedef struct {
PyObject_HEAD
- CountingHash * counting;
-} khmer_KCountingHash_Object;
+ Hashtable * hashtable;
+} khmer_KHashtable_Object;
typedef struct {
PyObject_HEAD
SubsetPartition * subset;
} khmer_KSubsetPartition_Object;
-typedef struct {
- PyObject_HEAD
- Hashbits * hashbits;
-} khmer_KHashbits_Object;
-
static void khmer_subset_dealloc(khmer_KSubsetPartition_Object * obj);
static PyTypeObject khmer_KSubsetPartition_Type = {
@@ -677,81 +708,78 @@ static PyTypeObject khmer_KSubsetPartition_Type = {
};
typedef struct {
- PyObject_HEAD
- ReadAligner * aligner;
-} khmer_ReadAligner_Object;
-
-static void khmer_counting_dealloc(khmer_KCountingHash_Object * obj);
-
-static
-PyObject *
-hash_abundance_distribution(khmer_KCountingHash_Object * me, PyObject * args);
-
-static
-PyObject *
-hash_abundance_distribution_with_reads_parser(khmer_KCountingHash_Object * me,
- PyObject * args);
-
-static
-PyObject *
-hash_get_raw_tables(khmer_KCountingHash_Object * self, PyObject * args)
-{
- CountingHash * counting = self->counting;
-
- khmer::Byte ** table_ptrs = counting->get_raw_tables();
- std::vector<HashIntoType> sizes = counting->get_tablesizes();
-
- PyObject * raw_tables = PyList_New(sizes.size());
- for (unsigned int i=0; i<sizes.size(); ++i) {
- PyObject * buf = PyBuffer_FromMemory(table_ptrs[i], sizes[i]);
- if(!PyBuffer_Check(buf)) {
- return NULL;
- }
- PyList_SET_ITEM(raw_tables, i, buf);
- }
-
- return raw_tables;
-}
+ khmer_KHashtable_Object khashtable;
+ Hashbits * hashbits;
+} khmer_KHashbits_Object;
-static
-PyObject *
-hash_set_use_bigcount(khmer_KCountingHash_Object * me, PyObject * args)
-{
- CountingHash * counting = me->counting;
+static void khmer_hashbits_dealloc(khmer_KHashbits_Object * obj);
+static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args,
+ PyObject * kwds);
- PyObject * x;
- if (!PyArg_ParseTuple(args, "O", &x)) {
- return NULL;
- }
- int setme = PyObject_IsTrue(x);
- if (setme < 0) {
- return NULL;
- }
- counting->set_use_bigcount((bool)setme);
+static PyTypeObject khmer_KHashbits_Type
+CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashbits_Object")
+= {
+ PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */
+ "_khmer.Hashbits", /* tp_name */
+ sizeof(khmer_KHashbits_Object), /* tp_basicsize */
+ 0, /* tp_itemsize */
+ (destructor)khmer_hashbits_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ 0, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ 0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ 0, /*tp_as_sequence*/
+ 0, /*tp_as_mapping*/
+ 0, /*tp_hash */
+ 0, /*tp_call*/
+ 0, /*tp_str*/
+ 0, /*tp_getattro*/
+ 0, /*tp_setattro*/
+ 0, /*tp_as_buffer*/
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
+ "hashbits object", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ 0, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ khmer_hashbits_new, /* tp_new */
+};
- Py_RETURN_NONE;
-}
static
PyObject *
-hash_get_use_bigcount(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_get_ksize(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
if (!PyArg_ParseTuple(args, "")) {
return NULL;
}
- bool val = counting->get_use_bigcount();
+ unsigned int k = hashtable->ksize();
- return PyBool_FromLong((int)val);
+ return PyLong_FromLong(k);
}
static
PyObject *
-hash_n_occupied(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_n_occupied(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
HashIntoType start = 0, stop = 0;
@@ -759,40 +787,40 @@ hash_n_occupied(khmer_KCountingHash_Object * me, PyObject * args)
return NULL;
}
- HashIntoType n = counting->n_occupied(start, stop);
+ HashIntoType n = hashtable->n_occupied(start, stop);
return PyLong_FromUnsignedLongLong(n);
}
static
PyObject *
-hash_n_unique_kmers(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_n_unique_kmers(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- HashIntoType n = counting->n_unique_kmers();
+ HashIntoType n = hashtable->n_unique_kmers();
return PyLong_FromUnsignedLongLong(n);
}
static
PyObject *
-hash_n_entries(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_n_entries(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
if (!PyArg_ParseTuple(args, "")) {
return NULL;
}
- return PyLong_FromUnsignedLongLong(counting->n_entries());
+ return PyLong_FromUnsignedLongLong(hashtable->n_entries());
}
static
PyObject *
-hash_count(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_count(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
const char * kmer;
@@ -800,41 +828,22 @@ hash_count(khmer_KCountingHash_Object * me, PyObject * args)
return NULL;
}
- if (strlen(kmer) != counting->ksize()) {
+ if (strlen(kmer) != hashtable->ksize()) {
PyErr_SetString(PyExc_ValueError,
"k-mer length must be the same as the hashtable k-size");
return NULL;
}
- counting->count(kmer);
+ hashtable->count(kmer);
return PyLong_FromLong(1);
}
static
PyObject *
-hash_output_fasta_kmer_pos_freq(khmer_KCountingHash_Object * me,
- PyObject * args)
-{
- CountingHash * counting = me->counting;
-
- const char * infile;
- const char * outfile;
-
- if (!PyArg_ParseTuple(args, "ss", &infile, &outfile)) {
- return NULL;
- }
-
- counting->output_fasta_kmer_pos_freq(infile, outfile);
-
- return PyLong_FromLong(0);
-}
-
-static
-PyObject *
-hash_consume_fasta(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_consume_fasta(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
const char * filename;
@@ -846,10 +855,7 @@ hash_consume_fasta(khmer_KCountingHash_Object * me, PyObject * args)
unsigned long long n_consumed = 0;
unsigned int total_reads = 0;
try {
- counting->consume_fasta(filename, total_reads, n_consumed);
- } catch (_khmer_signal &e) {
- PyErr_SetString(PyExc_IOError, e.get_message().c_str());
- return NULL;
+ hashtable->consume_fasta(filename, total_reads, n_consumed);
} catch (khmer_file_exception &e) {
PyErr_SetString(PyExc_IOError, e.what());
return NULL;
@@ -860,10 +866,10 @@ hash_consume_fasta(khmer_KCountingHash_Object * me, PyObject * args)
static
PyObject *
-hash_consume_fasta_with_reads_parser(khmer_KCountingHash_Object * me,
- PyObject * args)
+hashtable_consume_fasta_with_reads_parser(khmer_KHashtable_Object * me,
+ PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
PyObject * rparser_obj = NULL;
@@ -874,32 +880,21 @@ hash_consume_fasta_with_reads_parser(khmer_KCountingHash_Object * me,
read_parsers:: IParser * rparser =
_PyObject_to_khmer_ReadParser( rparser_obj );
- char const * exc = "";
// call the C++ function, and trap signals => Python
unsigned long long n_consumed = 0;
unsigned int total_reads = 0;
- bool exc_raised = false;
Py_BEGIN_ALLOW_THREADS
- try {
- counting->consume_fasta(rparser, total_reads, n_consumed);
- } catch (_khmer_signal &e) {
- exc = e.get_message().c_str();
- exc_raised = true;
- }
+ hashtable->consume_fasta(rparser, total_reads, n_consumed);
Py_END_ALLOW_THREADS
- if (exc_raised) {
- PyErr_SetString(PyExc_IOError, exc);
- return NULL;
- }
return Py_BuildValue("IK", total_reads, n_consumed);
}
static
PyObject *
-hash_consume(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_consume(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
const char * long_str;
@@ -907,337 +902,375 @@ hash_consume(khmer_KCountingHash_Object * me, PyObject * args)
return NULL;
}
- if (strlen(long_str) < counting->ksize()) {
+ if (strlen(long_str) < hashtable->ksize()) {
PyErr_SetString(PyExc_ValueError,
"string length must >= the hashtable k-mer size");
return NULL;
}
unsigned int n_consumed;
- n_consumed = counting->consume_string(long_str);
+ n_consumed = hashtable->consume_string(long_str);
return PyLong_FromLong(n_consumed);
}
static
PyObject *
-hash_get_min_count(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_get(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * long_str;
+ PyObject * arg;
- if (!PyArg_ParseTuple(args, "s", &long_str)) {
+ if (!PyArg_ParseTuple(args, "O", &arg)) {
return NULL;
}
- if (strlen(long_str) < counting->ksize()) {
+ unsigned long count = 0;
+
+ if (PyInt_Check(arg) || PyLong_Check(arg)) {
+ long pos = PyInt_AsLong(arg);
+ count = hashtable->get_count((unsigned int) pos);
+ } else if (PyUnicode_Check(arg)) {
+ std::string s = PyBytes_AsString(PyUnicode_AsEncodedString(
+ arg, "utf-8", "strict"));
+ if (strlen(s.c_str()) != hashtable->ksize()) {
+ PyErr_SetString(PyExc_ValueError,
+ "k-mer size must equal the presence table k-mer size");
+ return NULL;
+ }
+ count = hashtable->get_count(s.c_str());
+ } else if (PyBytes_Check(arg)) {
+ std::string s = PyBytes_AsString(arg);
+
+ if (strlen(s.c_str()) != hashtable->ksize()) {
+ PyErr_SetString(PyExc_ValueError,
+ "k-mer size must equal the counting table k-mer size");
+ return NULL;
+ }
+
+ count = hashtable->get_count(s.c_str());
+ } else {
PyErr_SetString(PyExc_ValueError,
- "string length must >= the hashtable k-mer size");
+ "please pass either a hash value or a string");
return NULL;
}
- BoundedCounterType c = counting->get_min_count(long_str);
- unsigned int N = c;
-
- return PyLong_FromLong(N);
+ return PyLong_FromLong(count);
}
static
PyObject *
-hash_get_max_count(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_load(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * long_str;
+ const char * filename = NULL;
- if (!PyArg_ParseTuple(args, "s", &long_str)) {
+ if (!PyArg_ParseTuple(args, "s", &filename)) {
return NULL;
}
- if (strlen(long_str) < counting->ksize()) {
- PyErr_SetString(PyExc_ValueError,
- "string length must >= the hashtable k-mer size");
+ try {
+ hashtable->load(filename);
+ } catch (khmer_file_exception &e) {
+ PyErr_SetString(PyExc_IOError, e.what());
return NULL;
}
- BoundedCounterType c = counting->get_max_count(long_str);
- unsigned int N = c;
-
- return PyLong_FromLong(N);
+ Py_RETURN_NONE;
}
static
PyObject *
-hash_get_median_count(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_save(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * long_str;
+ const char * filename = NULL;
- if (!PyArg_ParseTuple(args, "s", &long_str)) {
+ if (!PyArg_ParseTuple(args, "s", &filename)) {
return NULL;
}
- if (strlen(long_str) < counting->ksize()) {
- PyErr_SetString(PyExc_ValueError,
- "string length must >= the hashtable k-mer size");
+ try {
+ hashtable->save(filename);
+ } catch (khmer_file_exception &e) {
+ PyErr_SetString(PyExc_IOError, e.what());
return NULL;
}
- BoundedCounterType med = 0;
- float average = 0, stddev = 0;
-
- counting->get_median_count(long_str, med, average, stddev);
-
- return Py_BuildValue("iff", med, average, stddev);
+ Py_RETURN_NONE;
}
static
PyObject *
-hash_get_kadian_count(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_get_hashsizes(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * long_str;
- unsigned int nk = 1;
- if (!PyArg_ParseTuple(args, "s|I", &long_str, &nk)) {
+ if (!PyArg_ParseTuple(args, "")) {
return NULL;
}
- if (strlen(long_str) < counting->ksize()) {
- PyErr_SetString(PyExc_ValueError,
- "string length must >= the hashtable k-mer size");
- return NULL;
- }
+ std::vector<HashIntoType> ts = hashtable->get_tablesizes();
- BoundedCounterType kad = 0;
+ PyObject * x = PyList_New(ts.size());
+ for (size_t i = 0; i < ts.size(); i++) {
+ PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(ts[i]));
+ }
- counting->get_kadian_count(long_str, kad, nk);
-
- return Py_BuildValue("i", kad);
-}
+ return x;
+}
static
PyObject *
-hash_get(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_consume_and_tag(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- PyObject * arg;
+ const char * seq;
- if (!PyArg_ParseTuple(args, "O", &arg)) {
+ if (!PyArg_ParseTuple(args, "s", &seq)) {
return NULL;
}
- unsigned long count = 0;
-
- if (PyInt_Check(arg)) {
- long pos = PyInt_AsLong(arg);
- count = counting->get_count((unsigned int) pos);
- } else if (PyBytes_Check(arg)) {
- std::string s = PyBytes_AsString(arg);
+ // call the C++ function, and trap signals => Python
- if (strlen(s.c_str()) != counting->ksize()) {
- PyErr_SetString(PyExc_ValueError,
- "k-mer size must equal the counting table k-mer size");
- return NULL;
- }
+ unsigned long long n_consumed = 0;
- count = counting->get_count(s.c_str());
- }
+ // @CTB needs to normalize
+ hashtable->consume_sequence_and_tag(seq, n_consumed);
- return PyLong_FromLong(count);
+ return Py_BuildValue("K", n_consumed);
}
static
PyObject *
-count_trim_on_abundance(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_get_tags_and_positions(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * seq = NULL;
- unsigned int min_count_i = 0;
+ const char * seq;
- if (!PyArg_ParseTuple(args, "sI", &seq, &min_count_i)) {
+ if (!PyArg_ParseTuple(args, "s", &seq)) {
return NULL;
}
- unsigned long trim_at;
- Py_BEGIN_ALLOW_THREADS
+ // call the C++ function, and trap signals => Python
- BoundedCounterType min_count = min_count_i;
+ std::vector<unsigned int> posns;
+ std::vector<HashIntoType> tags;
- trim_at = counting->trim_on_abundance(seq, min_count);
+ unsigned int pos = 1;
+ KMerIterator kmers(seq, hashtable->ksize());
- Py_END_ALLOW_THREADS;
+ while (!kmers.done()) {
+ HashIntoType kmer = kmers.next();
+ if (set_contains(hashtable->all_tags, kmer)) {
+ posns.push_back(pos);
+ tags.push_back(kmer);
+ }
+ pos++;
+ }
- PyObject * trim_seq = PyBytes_FromStringAndSize(seq, trim_at);
- if (trim_seq == NULL) {
- return NULL;
+ PyObject * posns_list = PyList_New(posns.size());
+ for (size_t i = 0; i < posns.size(); i++) {
+ PyObject * tup = Py_BuildValue("IK", posns[i], tags[i]);
+ PyList_SET_ITEM(posns_list, i, tup);
}
- PyObject * ret = Py_BuildValue("Ok", trim_seq, trim_at);
- Py_DECREF(trim_seq);
- return ret;
+ return posns_list;
}
static
PyObject *
-count_trim_below_abundance(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_find_all_tags_list(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * seq = NULL;
- BoundedCounterType max_count_i = 0;
+ const char * kmer_s = NULL;
- if (!PyArg_ParseTuple(args, "sH", &seq, &max_count_i)) {
+ if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
return NULL;
}
- unsigned long trim_at;
+ if (strlen(kmer_s) != hashtable->ksize()) {
+ PyErr_SetString(PyExc_ValueError,
+ "k-mer length must equal the counting table k-mer size");
+ return NULL;
+ }
+
+ SeenSet tags;
+
Py_BEGIN_ALLOW_THREADS
- BoundedCounterType max_count = max_count_i;
+ HashIntoType kmer_f, kmer_r;
+ _hash(kmer_s, hashtable->ksize(), kmer_f, kmer_r);
- trim_at = counting->trim_below_abundance(seq, max_count);
+ hashtable->partition->find_all_tags(kmer_f, kmer_r, tags,
+ hashtable->all_tags);
- Py_END_ALLOW_THREADS;
+ Py_END_ALLOW_THREADS
- PyObject * trim_seq = PyBytes_FromStringAndSize(seq, trim_at);
- if (trim_seq == NULL) {
+ PyObject * x = PyList_New(tags.size());
+ if (x == NULL) {
return NULL;
}
- PyObject * ret = Py_BuildValue("Ok", trim_seq, trim_at);
- Py_DECREF(trim_seq);
+ SeenSet::iterator si;
+ unsigned long long i = 0;
+ for (si = tags.begin(); si != tags.end(); ++si) {
+ // type K for python unsigned long long
+ PyList_SET_ITEM(x, i, Py_BuildValue("K", *si));
+ i++;
+ }
- return ret;
+ return x;
}
static
PyObject *
-count_find_spectral_error_positions(khmer_KCountingHash_Object * me,
- PyObject * args)
+hashtable_consume_fasta_and_tag(khmer_KHashtable_Object * me, PyObject * args)
{
- khmer::CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- char * seq = NULL;
- khmer::BoundedCounterType max_count = 0; // unsigned short int
+ const char * filename;
- if (!PyArg_ParseTuple(args, "sH", &seq, &max_count)) {
+ if (!PyArg_ParseTuple(args, "s", &filename)) {
return NULL;
}
- std::vector<unsigned int> posns;
+ // call the C++ function, and trap signals => Python
- try {
- posns = counting->find_spectral_error_positions(seq, max_count);
- } catch (khmer_exception &e) {
- PyErr_SetString(PyExc_ValueError, e.what());
- return NULL;
- }
+ unsigned long long n_consumed;
+ unsigned int total_reads;
- Py_ssize_t posns_size = posns.size();
+ hashtable->consume_fasta_and_tag(filename, total_reads, n_consumed);
- PyObject * x = PyList_New(posns_size);
- if (x == NULL) {
+ return Py_BuildValue("IK", total_reads, n_consumed);
+}
+
+static
+PyObject *
+hashtable_get_median_count(khmer_KHashtable_Object * me, PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+
+ const char * long_str;
+
+ if (!PyArg_ParseTuple(args, "s", &long_str)) {
return NULL;
}
- for (Py_ssize_t i = 0; i < posns_size; i++) {
- PyList_SET_ITEM(x, i, PyLong_FromLong(posns[i]));
+
+ if (strlen(long_str) < hashtable->ksize()) {
+ PyErr_SetString(PyExc_ValueError,
+ "string length must >= the hashtable k-mer size");
+ return NULL;
}
- return x;
+ BoundedCounterType med = 0;
+ float average = 0, stddev = 0;
+
+ hashtable->get_median_count(long_str, med, average, stddev);
+
+ return Py_BuildValue("iff", med, average, stddev);
}
static
PyObject *
-hash_fasta_count_kmers_by_position(khmer_KCountingHash_Object * me,
- PyObject * args)
+hashtable_median_at_least(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * inputfile;
- unsigned int max_read_len = 0;
- long max_read_len_long;
- int limit_by_count_int;
+ const char * long_str;
+ unsigned int cutoff;
- if (!PyArg_ParseTuple(args, "sli", &inputfile, &max_read_len_long,
- &limit_by_count_int)) {
+ if (!PyArg_ParseTuple(args, "sI", &long_str, &cutoff)) {
return NULL;
}
- if (max_read_len_long < 0 || max_read_len_long >= pow(2, 32)) {
- PyErr_SetString(
- PyExc_ValueError,
- "The 2nd argument must be positive and less than 2^32");
+
+ if (strlen(long_str) < hashtable->ksize()) {
+ PyErr_SetString(PyExc_ValueError,
+ "string length must >= the hashtable k-mer size");
return NULL;
}
- if (limit_by_count_int < 0 || limit_by_count_int >= pow(2, 16)) {
- PyErr_SetString(
- PyExc_ValueError,
- "The 3rd argument must be positive and less than 2^16");
- return NULL;
+
+ if (hashtable->median_at_least(long_str, cutoff)) {
+ Py_RETURN_TRUE;
}
- max_read_len = (unsigned int) max_read_len_long;
+ Py_RETURN_FALSE;
- unsigned long long * counts;
- counts = counting->fasta_count_kmers_by_position(inputfile, max_read_len,
- (unsigned short) limit_by_count_int);
+}
- PyObject * x = PyList_New(max_read_len);
- if (x == NULL) {
- delete[] counts;
+static
+PyObject *
+hashtable_n_tags(khmer_KHashtable_Object * me, PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+
+ if (!PyArg_ParseTuple(args, "")) {
return NULL;
}
- for (unsigned int i = 0; i < max_read_len; i++) {
- int ret = PyList_SetItem(x, i, PyLong_FromUnsignedLongLong(counts[i]));
- if (ret < 0) {
- delete[] counts;
- return NULL;
- }
+ return PyLong_FromSize_t(hashtable->n_tags());
+}
+
+static
+PyObject *
+hashtable_print_stop_tags(khmer_KHashtable_Object * me, PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+
+ const char * filename = NULL;
+
+ if (!PyArg_ParseTuple(args, "s", &filename)) {
+ return NULL;
}
- delete[] counts;
+ hashtable->print_stop_tags(filename);
- return x;
+ Py_RETURN_NONE;
}
static
PyObject *
-hash_fasta_dump_kmers_by_abundance(khmer_KCountingHash_Object * me,
- PyObject * args)
+hashtable_print_tagset(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * inputfile;
- int limit_by = 0;
+ const char * filename = NULL;
- if (!PyArg_ParseTuple(args, "si", &inputfile, &limit_by)) {
+ if (!PyArg_ParseTuple(args, "s", &filename)) {
return NULL;
}
- counting->fasta_dump_kmers_by_abundance(inputfile,
- limit_by);
+ hashtable->print_tagset(filename);
Py_RETURN_NONE;
}
static
PyObject *
-hash_load(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_load_stop_tags(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
const char * filename = NULL;
+ PyObject * clear_tags_o = NULL;
- if (!PyArg_ParseTuple(args, "s", &filename)) {
+ if (!PyArg_ParseTuple(args, "s|O", &filename, &clear_tags_o)) {
return NULL;
}
+ bool clear_tags = true;
+ if (clear_tags_o && !PyObject_IsTrue(clear_tags_o)) {
+ clear_tags = false;
+ }
+
+
try {
- counting->load(filename);
+ hashtable->load_stop_tags(filename, clear_tags);
} catch (khmer_file_exception &e) {
PyErr_SetString(PyExc_IOError, e.what());
return NULL;
@@ -1246,11 +1279,12 @@ hash_load(khmer_KCountingHash_Object * me, PyObject * args)
Py_RETURN_NONE;
}
+
static
PyObject *
-hash_save(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_save_stop_tags(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
const char * filename = NULL;
@@ -1259,7 +1293,7 @@ hash_save(khmer_KCountingHash_Object * me, PyObject * args)
}
try {
- counting->save(filename);
+ hashtable->save_stop_tags(filename);
} catch (khmer_file_exception &e) {
PyErr_SetString(PyExc_IOError, e.what());
return NULL;
@@ -1268,161 +1302,291 @@ hash_save(khmer_KCountingHash_Object * me, PyObject * args)
Py_RETURN_NONE;
}
+static PyObject * hashtable_traverse_from_tags(khmer_KHashtable_Object * me,
+ PyObject * args);
+
+static PyObject * hashtable_repartition_largest_partition(
+ khmer_KHashtable_Object * me,
+ PyObject * args);
+
static
PyObject *
-hash_get_ksize(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_calc_connected_graph_size(khmer_KHashtable_Object * me,
+ PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- if (!PyArg_ParseTuple(args, "")) {
+ const char * _kmer;
+ unsigned int max_size = 0;
+ PyObject * break_on_circum_o = NULL;
+ if (!PyArg_ParseTuple(args, "s|IO", &_kmer, &max_size, &break_on_circum_o)) {
return NULL;
}
- unsigned int k = counting->ksize();
+ bool break_on_circum = false;
+ if (break_on_circum_o && PyObject_IsTrue(break_on_circum_o)) {
+ break_on_circum = true;
+ }
+
+ unsigned long long size = 0;
- return PyLong_FromLong(k);
+ Py_BEGIN_ALLOW_THREADS
+ SeenSet keeper;
+ hashtable->calc_connected_graph_size(_kmer, size, keeper, max_size,
+ break_on_circum);
+ Py_END_ALLOW_THREADS
+
+ return PyLong_FromUnsignedLongLong(size);
}
static
PyObject *
-hash_get_hashsizes(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_kmer_degree(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
+ const char * kmer_s = NULL;
- if (!PyArg_ParseTuple(args, "")) {
+ if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
return NULL;
}
- std::vector<HashIntoType> ts = counting->get_tablesizes();
-
- PyObject * x = PyList_New(ts.size());
- for (size_t i = 0; i < ts.size(); i++) {
- PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(ts[i]));
- }
-
- return x;
+ return PyLong_FromLong(hashtable->kmer_degree(kmer_s));
}
static
PyObject *
-hash_collect_high_abundance_kmers(khmer_KCountingHash_Object * me,
- PyObject * args);
-
-static
-PyObject *
-hash_consume_and_tag(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_trim_on_stoptags(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * seq;
+ const char * seq = NULL;
if (!PyArg_ParseTuple(args, "s", &seq)) {
return NULL;
}
- // call the C++ function, and trap signals => Python
+ size_t trim_at;
+ Py_BEGIN_ALLOW_THREADS
- unsigned long long n_consumed = 0;
- try {
- // @CTB needs to normalize
- counting->consume_sequence_and_tag(seq, n_consumed);
- } catch (_khmer_signal &e) {
- PyErr_SetString(PyExc_ValueError, e.get_message().c_str());
- return NULL;
- }
+ trim_at = hashtable->trim_on_stoptags(seq);
- return Py_BuildValue("K", n_consumed);
-}
+ Py_END_ALLOW_THREADS;
+
+ PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at);
+ if (trim_seq == NULL) {
+ return NULL;
+ }
+ PyObject * ret = Py_BuildValue("Ok", trim_seq, (unsigned long) trim_at);
+ Py_DECREF(trim_seq);
+
+ return ret;
+}
static
PyObject *
-hash_get_tags_and_positions(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_identify_stoptags_by_position(khmer_KHashtable_Object * me,
+ PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * seq;
+ const char * seq = NULL;
if (!PyArg_ParseTuple(args, "s", &seq)) {
return NULL;
}
- // call the C++ function, and trap signals => Python
-
std::vector<unsigned int> posns;
- std::vector<HashIntoType> tags;
+ Py_BEGIN_ALLOW_THREADS
- unsigned int pos = 1;
- KMerIterator kmers(seq, counting->ksize());
+ hashtable->identify_stop_tags_by_position(seq, posns);
- while (!kmers.done()) {
- HashIntoType kmer = kmers.next();
- if (set_contains(counting->all_tags, kmer)) {
- posns.push_back(pos);
- tags.push_back(kmer);
- }
- pos++;
+ Py_END_ALLOW_THREADS;
+
+ PyObject * x = PyList_New(posns.size());
+
+ for (unsigned int i = 0; i < posns.size(); i++) {
+ PyList_SET_ITEM(x, i, Py_BuildValue("I", posns[i]));
}
- PyObject * posns_list = PyList_New(posns.size());
- for (size_t i = 0; i < posns.size(); i++) {
- PyObject * tup = Py_BuildValue("IK", posns[i], tags[i]);
- PyList_SET_ITEM(posns_list, i, tup);
+ return x;
+}
+
+static
+PyObject *
+hashtable_do_subset_partition(khmer_KHashtable_Object * me, PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+
+ HashIntoType start_kmer = 0, end_kmer = 0;
+ PyObject * break_on_stop_tags_o = NULL;
+ PyObject * stop_big_traversals_o = NULL;
+
+ if (!PyArg_ParseTuple(args, "|KKOO", &start_kmer, &end_kmer,
+ &break_on_stop_tags_o,
+ &stop_big_traversals_o)) {
+ return NULL;
}
- return posns_list;
+ bool break_on_stop_tags = false;
+ if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
+ break_on_stop_tags = true;
+ }
+ bool stop_big_traversals = false;
+ if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) {
+ stop_big_traversals = true;
+ }
+
+ SubsetPartition * subset_p = NULL;
+ try {
+ Py_BEGIN_ALLOW_THREADS
+ subset_p = new SubsetPartition(hashtable);
+ subset_p->do_partition(start_kmer, end_kmer, break_on_stop_tags,
+ stop_big_traversals);
+ Py_END_ALLOW_THREADS
+ } catch (std::bad_alloc &e) {
+ return PyErr_NoMemory();
+ }
+
+ khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\
+ PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type);
+ subset_obj->subset = subset_p;
+
+ return (PyObject *)subset_obj;
}
static
PyObject *
-hash_find_all_tags_list(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_join_partitions_by_path(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * kmer_s = NULL;
+ const char * sequence = NULL;
+ if (!PyArg_ParseTuple(args, "s", &sequence)) {
+ return NULL;
+ }
- if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
+ hashtable->partition->join_partitions_by_path(sequence);
+
+ Py_RETURN_NONE;
+}
+
+static
+PyObject *
+hashtable_merge_subset(khmer_KHashtable_Object * me, PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+
+ khmer_KSubsetPartition_Object * subset_obj;
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) {
return NULL;
}
+ SubsetPartition * subset_p;
+ subset_p = subset_obj->subset;
- if (strlen(kmer_s) != counting->ksize()) {
- PyErr_SetString(PyExc_ValueError,
- "k-mer length must equal the counting table k-mer size");
+ hashtable->partition->merge(subset_p);
+
+ Py_RETURN_NONE;
+}
+
+static
+PyObject *
+hashtable_merge_from_disk(khmer_KHashtable_Object * me, PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+
+ const char * filename = NULL;
+ if (!PyArg_ParseTuple(args, "s", &filename)) {
return NULL;
}
- SeenSet tags;
+ try {
+ hashtable->partition->merge_from_disk(filename);
+ } catch (khmer_file_exception &e) {
+ PyErr_SetString(PyExc_IOError, e.what());
+ return NULL;
+ }
- Py_BEGIN_ALLOW_THREADS
+ Py_RETURN_NONE;
+}
- HashIntoType kmer_f, kmer_r;
- _hash(kmer_s, counting->ksize(), kmer_f, kmer_r);
+static
+PyObject *
+hashtable_consume_fasta_and_tag_with_reads_parser(khmer_KHashtable_Object * me,
+ PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+
+ python::khmer_ReadParser_Object * rparser_obj = NULL;
+
+ if (!PyArg_ParseTuple( args, "O!", &python::khmer_ReadParser_Type,
+ &rparser_obj)) {
+ return NULL;
+ }
- counting->partition->find_all_tags(kmer_f, kmer_r, tags,
- counting->all_tags);
+ read_parsers:: IParser * rparser = rparser_obj-> parser;
+ // call the C++ function, and trap signals => Python
+ unsigned long long n_consumed = 0;
+ unsigned int total_reads = 0;
+ char const * exc = NULL;
+ Py_BEGIN_ALLOW_THREADS
+ try {
+ hashtable->consume_fasta_and_tag(
+ rparser, total_reads, n_consumed
+ );
+ } catch (khmer::read_parsers::NoMoreReadsAvailable &e) {
+ exc = e.what();
+ }
Py_END_ALLOW_THREADS
+ if (exc != NULL) {
+ PyErr_SetString(PyExc_IOError, exc);
+ return NULL;
+ }
- PyObject * x = PyList_New(tags.size());
- if (x == NULL) {
+ return Py_BuildValue("IK", total_reads, n_consumed);
+}
+
+static PyObject * hashtable_consume_fasta_and_traverse(
+ khmer_KHashtable_Object * me,
+ PyObject * args);
+
+
+static
+PyObject *
+hashtable_consume_fasta_and_tag_with_stoptags(khmer_KHashtable_Object * me,
+ PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+
+ const char * filename;
+
+ if (!PyArg_ParseTuple(args, "s", &filename)) {
return NULL;
}
- SeenSet::iterator si;
- unsigned long long i = 0;
- for (si = tags.begin(); si != tags.end(); ++si) {
- // type K for python unsigned long long
- PyList_SET_ITEM(x, i, Py_BuildValue("K", *si));
- i++;
+
+ // call the C++ function, and trap signals => Python
+
+ unsigned long long n_consumed;
+ unsigned int total_reads;
+
+ try {
+ hashtable->consume_fasta_and_tag_with_stoptags(filename,
+ total_reads, n_consumed);
+ } catch (khmer_file_exception &e) {
+ PyErr_SetString(PyExc_IOError, e.what());
+ return NULL;
}
- return x;
+ return Py_BuildValue("IK", total_reads, n_consumed);
}
static
PyObject *
-hash_consume_fasta_and_tag(khmer_KCountingHash_Object * me, PyObject * args)
+hashtable_consume_partitioned_fasta(khmer_KHashtable_Object * me,
+ PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
const char * filename;
@@ -1436,9 +1600,9 @@ hash_consume_fasta_and_tag(khmer_KCountingHash_Object * me, PyObject * args)
unsigned int total_reads;
try {
- counting->consume_fasta_and_tag(filename, total_reads, n_consumed);
- } catch (_khmer_signal &e) {
- PyErr_SetString(PyExc_IOError, e.get_message().c_str());
+ hashtable->consume_partitioned_fasta(filename, total_reads, n_consumed);
+ } catch (khmer_file_exception &e) {
+ PyErr_SetString(PyExc_IOError, e.what());
return NULL;
}
@@ -1447,887 +1611,245 @@ hash_consume_fasta_and_tag(khmer_KCountingHash_Object * me, PyObject * args)
static
PyObject *
-hash_do_subset_partition_with_abundance(khmer_KCountingHash_Object * me,
- PyObject * args)
+hashtable_find_all_tags(khmer_KHashtable_Object * me, PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- HashIntoType start_kmer = 0, end_kmer = 0;
- PyObject * break_on_stop_tags_o = NULL;
- PyObject * stop_big_traversals_o = NULL;
- BoundedCounterType min_count, max_count;
+ const char * kmer_s = NULL;
- if (!PyArg_ParseTuple(args, "HH|KKOO",
- &min_count, &max_count,
- &start_kmer, &end_kmer,
- &break_on_stop_tags_o,
- &stop_big_traversals_o)) {
+ if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
return NULL;
}
- bool break_on_stop_tags = false;
- if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
- break_on_stop_tags = true;
- }
- bool stop_big_traversals = false;
- if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) {
- stop_big_traversals = true;
+ if (strlen(kmer_s) != hashtable->ksize()) {
+ PyErr_SetString( PyExc_ValueError,
+ "k-mer size must equal the k-mer size of the presence table");
+ return NULL;
}
- SubsetPartition * subset_p = NULL;
+ pre_partition_info * ppi = NULL;
+
+ Py_BEGIN_ALLOW_THREADS
+
+ HashIntoType kmer, kmer_f, kmer_r;
+ kmer = _hash(kmer_s, hashtable->ksize(), kmer_f, kmer_r);
+
try {
- Py_BEGIN_ALLOW_THREADS
- subset_p = new SubsetPartition(counting);
- subset_p->do_partition_with_abundance(start_kmer, end_kmer,
- min_count, max_count,
- break_on_stop_tags,
- stop_big_traversals);
- Py_END_ALLOW_THREADS
- } catch (_khmer_signal &e) {
- return NULL;
+ ppi = new pre_partition_info(kmer);
} catch (std::bad_alloc &e) {
return PyErr_NoMemory();
}
+ hashtable->partition->find_all_tags(kmer_f, kmer_r, ppi->tagged_kmers,
+ hashtable->all_tags);
+ hashtable->add_kmer_to_tags(kmer);
- khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\
- PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type);
+ Py_END_ALLOW_THREADS
- if (subset_obj == NULL) {
- delete subset_p;
+ khmer_PrePartitionInfo_Object * ppi_obj = (khmer_PrePartitionInfo_Object *) \
+ PyObject_New(khmer_PrePartitionInfo_Object, &khmer_PrePartitionInfo_Type);
+
+ ppi_obj->PrePartitionInfo = ppi;
+
+ return (PyObject*)ppi_obj;
+}
+
+static
+PyObject *
+hashtable_assign_partition_id(khmer_KHashtable_Object * me, PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+
+ khmer_PrePartitionInfo_Object * ppi_obj;
+ if (!PyArg_ParseTuple(args, "O!", &khmer_PrePartitionInfo_Type, &ppi_obj)) {
return NULL;
}
- subset_obj->subset = subset_p;
+ pre_partition_info * ppi;
+ ppi = ppi_obj->PrePartitionInfo;
- return (PyObject *) subset_obj;
+ PartitionID p;
+ p = hashtable->partition->assign_partition_id(ppi->kmer,
+ ppi->tagged_kmers);
+
+ return PyLong_FromLong(p);
}
-static PyMethodDef khmer_counting_methods[] = {
- {
- "ksize",
- (PyCFunction)hash_get_ksize,
- METH_VARARGS,
- ""
- },
- { "hashsizes", (PyCFunction)hash_get_hashsizes, METH_VARARGS, "" },
- { "set_use_bigcount", (PyCFunction)hash_set_use_bigcount, METH_VARARGS, "" },
- { "get_use_bigcount", (PyCFunction)hash_get_use_bigcount, METH_VARARGS, "" },
- { "n_unique_kmers", (PyCFunction)hash_n_unique_kmers, METH_VARARGS, "Count the number of unique kmers" },
- { "n_occupied", (PyCFunction)hash_n_occupied, METH_VARARGS, "Count the number of occupied bins" },
- { "n_entries", (PyCFunction)hash_n_entries, METH_VARARGS, "" },
- { "count", (PyCFunction)hash_count, METH_VARARGS, "Count the given kmer" },
- { "consume", (PyCFunction)hash_consume, METH_VARARGS, "Count all k-mers in the given string" },
- { "consume_fasta", (PyCFunction)hash_consume_fasta, METH_VARARGS, "Count all k-mers in a given file" },
- {
- "consume_fasta_with_reads_parser", (PyCFunction)hash_consume_fasta_with_reads_parser,
- METH_VARARGS, "Count all k-mers using a given reads parser"
- },
- { "output_fasta_kmer_pos_freq", (PyCFunction)hash_output_fasta_kmer_pos_freq, METH_VARARGS, "" },
- { "get", (PyCFunction)hash_get, METH_VARARGS, "Get the count for the given k-mer" },
- {
- "get_raw_tables", (PyCFunction)hash_get_raw_tables,
- METH_VARARGS, "Get a list of the raw tables as memoryview objects"
- },
- { "get_min_count", (PyCFunction)hash_get_min_count, METH_VARARGS, "Get the smallest count of all the k-mers in the string" },
- { "get_max_count", (PyCFunction)hash_get_max_count, METH_VARARGS, "Get the largest count of all the k-mers in the string" },
- { "get_median_count", (PyCFunction)hash_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
- { "get_kadian_count", (PyCFunction)hash_get_kadian_count, METH_VARARGS, "Get the kadian (abundance of k-th rank-ordered k-mer) of the k-mer counts in the string" },
- { "trim_on_abundance", (PyCFunction)count_trim_on_abundance, METH_VARARGS, "Trim on >= abundance" },
- { "trim_below_abundance", (PyCFunction)count_trim_below_abundance, METH_VARARGS, "Trim on >= abundance" },
- { "find_spectral_error_positions", (PyCFunction)count_find_spectral_error_positions, METH_VARARGS, "Identify positions of low-abundance k-mers" },
- { "abundance_distribution", (PyCFunction)hash_abundance_distribution, METH_VARARGS, "" },
- { "abundance_distribution_with_reads_parser", (PyCFunction)hash_abundance_distribution_with_reads_parser, METH_VARARGS, "" },
- { "fasta_count_kmers_by_position", (PyCFunction)hash_fasta_count_kmers_by_position, METH_VARARGS, "" },
- { "fasta_dump_kmers_by_abundance", (PyCFunction)hash_fasta_dump_kmers_by_abundance, METH_VARARGS, "" },
- { "load", (PyCFunction)hash_load, METH_VARARGS, "" },
- { "save", (PyCFunction)hash_save, METH_VARARGS, "" },
- {
- "collect_high_abundance_kmers", (PyCFunction)hash_collect_high_abundance_kmers,
- METH_VARARGS, ""
- },
- { "consume_and_tag", (PyCFunction)hash_consume_and_tag, METH_VARARGS, "Consume a sequence and tag it" },
- { "get_tags_and_positions", (PyCFunction)hash_get_tags_and_positions, METH_VARARGS, "Retrieve tags and their positions in a sequence." },
- { "find_all_tags_list", (PyCFunction)hash_find_all_tags_list, METH_VARARGS, "Find all tags within range of the given k-mer, return as list" },
- { "consume_fasta_and_tag", (PyCFunction)hash_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" },
- { "do_subset_partition_with_abundance", (PyCFunction)hash_do_subset_partition_with_abundance, METH_VARARGS, "" },
- {NULL, NULL, 0, NULL} /* sentinel */
-};
+static
+PyObject *
+hashtable_add_tag(khmer_KHashtable_Object * me, PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
-static PyObject* _new_counting_hash(PyTypeObject * type, PyObject * args,
- PyObject * kwds);
-
-static PyTypeObject khmer_KCountingHash_Type
-CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountingHash_Object")
-= {
- PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */
- "_khmer.KCountingHash", /*tp_name*/
- sizeof(khmer_KCountingHash_Object), /*tp_basicsize*/
- 0, /*tp_itemsize*/
- (destructor)khmer_counting_dealloc, /*tp_dealloc*/
- 0, /*tp_print*/
- 0, /*tp_getattr*/
- 0, /*tp_setattr*/
- 0, /*tp_compare*/
- 0, /*tp_repr*/
- 0, /*tp_as_number*/
- 0, /*tp_as_sequence*/
- 0, /*tp_as_mapping*/
- 0, /*tp_hash */
- 0, /*tp_call*/
- 0, /*tp_str*/
- 0, /*tp_getattro*/
- 0, /*tp_setattro*/
- 0, /*tp_as_buffer*/
- Py_TPFLAGS_DEFAULT, /*tp_flags*/
- "counting hash object", /* tp_doc */
- 0, /* tp_traverse */
- 0, /* tp_clear */
- 0, /* tp_richcompare */
- 0, /* tp_weaklistoffset */
- 0, /* tp_iter */
- 0, /* tp_iternext */
- khmer_counting_methods, /* tp_methods */
- 0, /* tp_members */
- 0, /* tp_getset */
- 0, /* tp_base */
- 0, /* tp_dict */
- 0, /* tp_descr_get */
- 0, /* tp_descr_set */
- 0, /* tp_dictoffset */
- 0, /* tp_init */
- 0, /* tp_alloc */
- _new_counting_hash, /* tp_new */
-};
-
-#define is_counting_obj(v) (Py_TYPE(v) == &khmer_KCountingHash_Type)
-
-//
-// new_hashtable
-//
-
-static PyObject* new_hashtable(PyObject * self, PyObject * args)
-{
- unsigned int k = 0;
- unsigned long long size = 0;
-
- if (!PyArg_ParseTuple(args, "IK", &k, &size)) {
- return NULL;
- }
-
- khmer_KCountingHash_Object * kcounting_obj = (khmer_KCountingHash_Object *) \
- PyObject_New(khmer_KCountingHash_Object, &khmer_KCountingHash_Type);
-
- if (kcounting_obj == NULL) {
- return NULL;
- }
-
- try {
- kcounting_obj->counting = new CountingHash(k, size);
- } catch (std::bad_alloc &e) {
- return PyErr_NoMemory();
- }
-
- return (PyObject *) kcounting_obj;
-}
-
-//
-// new_counting_hash
-//
-
-static PyObject* _new_counting_hash(PyTypeObject * type, PyObject * args,
- PyObject * kwds)
-{
- khmer_KCountingHash_Object * self;
-
- self = (khmer_KCountingHash_Object *)type->tp_alloc(type, 0);
-
- if (self != NULL) {
- WordLength k = 0;
- PyListObject * sizes_list_o = NULL;
-
- if (!PyArg_ParseTuple(args, "bO!", &k, &PyList_Type, &sizes_list_o)) {
- Py_DECREF(self);
- return NULL;
- }
-
- std::vector<HashIntoType> sizes;
- Py_ssize_t sizes_list_o_length = PyList_GET_SIZE(sizes_list_o);
- if (sizes_list_o_length == -1) {
- Py_DECREF(self);
- PyErr_SetString(PyExc_ValueError, "error with hashtable primes!");
- return NULL;
- }
- for (Py_ssize_t i = 0; i < sizes_list_o_length; i++) {
- PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i);
- if (PyLong_Check(size_o)) {
- sizes.push_back((HashIntoType) PyLong_AsUnsignedLongLong(size_o));
- } else if (PyInt_Check(size_o)) {
- sizes.push_back((HashIntoType) PyInt_AsLong(size_o));
- } else if (PyFloat_Check(size_o)) {
- sizes.push_back((HashIntoType) PyFloat_AS_DOUBLE(size_o));
- } else {
- Py_DECREF(self);
- PyErr_SetString(PyExc_TypeError,
- "2nd argument must be a list of ints, longs, or floats");
- return NULL;
- }
- }
-
- try {
- self->counting = new CountingHash(k, sizes);
- } catch (std::bad_alloc &e) {
- return PyErr_NoMemory();
- }
- }
-
- return (PyObject *) self;
-}
-
-//
-// hashbits stuff
-//
-
-static void khmer_hashbits_dealloc(khmer_KHashbits_Object * obj);
-static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args,
- PyObject * kwds);
-static int khmer_hashbits_init(khmer_KHashbits_Object * self, PyObject * args,
- PyObject * kwds);
-
-static PyTypeObject khmer_KHashbits_Type
-CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashbits_Object")
-= {
- PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */
- "_khmer.Hashbits", /* tp_name */
- sizeof(khmer_KHashbits_Object), /* tp_basicsize */
- 0, /* tp_itemsize */
- (destructor)khmer_hashbits_dealloc, /*tp_dealloc*/
- 0, /*tp_print*/
- 0, /*tp_getattr*/
- 0, /*tp_setattr*/
- 0, /*tp_compare*/
- 0, /*tp_repr*/
- 0, /*tp_as_number*/
- 0, /*tp_as_sequence*/
- 0, /*tp_as_mapping*/
- 0, /*tp_hash */
- 0, /*tp_call*/
- 0, /*tp_str*/
- 0, /*tp_getattro*/
- 0, /*tp_setattro*/
- 0, /*tp_as_buffer*/
- Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
- "hashbits object", /* tp_doc */
- 0, /* tp_traverse */
- 0, /* tp_clear */
- 0, /* tp_richcompare */
- 0, /* tp_weaklistoffset */
- 0, /* tp_iter */
- 0, /* tp_iternext */
- 0, /* tp_methods */
- 0, /* tp_members */
- 0, /* tp_getset */
- 0, /* tp_base */
- 0, /* tp_dict */
- 0, /* tp_descr_get */
- 0, /* tp_descr_set */
- 0, /* tp_dictoffset */
- (initproc)khmer_hashbits_init, /* tp_init */
- 0, /* tp_alloc */
- khmer_hashbits_new, /* tp_new */
-};
-
-static
-PyObject *
-hash_abundance_distribution_with_reads_parser(khmer_KCountingHash_Object * me,
- PyObject * args)
-{
- CountingHash * counting = me->counting;
-
- khmer :: python :: khmer_ReadParser_Object * rparser_obj = NULL;
- khmer_KHashbits_Object *tracking_obj = NULL;
-
- if (!PyArg_ParseTuple(args, "O!O!", &python::khmer_ReadParser_Type,
- &rparser_obj, &khmer_KHashbits_Type, &tracking_obj)) {
- return NULL;
- }
-
- read_parsers:: IParser * rparser = rparser_obj->parser;
- Hashbits * hashbits = tracking_obj->hashbits;
-
- HashIntoType * dist = NULL;
-
- Py_BEGIN_ALLOW_THREADS
- dist = counting->abundance_distribution(rparser, hashbits);
- Py_END_ALLOW_THREADS
-
- PyObject * x = PyList_New(MAX_BIGCOUNT + 1);
- if (x == NULL) {
- delete[] dist;
- return NULL;
- }
- for (int i = 0; i < MAX_BIGCOUNT + 1; i++) {
- PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(dist[i]));
- }
-
- delete[] dist;
- return x;
-}
-
-static
-PyObject *
-hash_abundance_distribution(khmer_KCountingHash_Object * me, PyObject * args)
-{
- CountingHash * counting = me->counting;
-
- const char * filename = NULL;
- khmer_KHashbits_Object * tracking_obj = NULL;
- if (!PyArg_ParseTuple(args, "sO!", &filename, &khmer_KHashbits_Type,
- &tracking_obj)) {
- return NULL;
- }
-
- Hashbits * hashbits = tracking_obj->hashbits;
- HashIntoType * dist;
-
- char const * result = "";
- bool exception = false;
- Py_BEGIN_ALLOW_THREADS
- try {
- dist = counting->abundance_distribution(filename, hashbits);
- } catch (khmer_file_exception &e) {
- exception = true;
- result = e.what();
- }
- Py_END_ALLOW_THREADS
-
- if (exception) {
- PyErr_SetString(PyExc_IOError, result);
- return NULL;
- }
-
- PyObject * x = PyList_New(MAX_BIGCOUNT + 1);
- if (x == NULL) {
- delete[] dist;
- return NULL;
- }
- for (int i = 0; i < MAX_BIGCOUNT + 1; i++) {
- PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(dist[i]));
- }
-
- delete[] dist;
-
- return x;
-}
-
-static
-PyObject *
-hashbits_n_unique_kmers(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- HashIntoType n = hashbits->n_unique_kmers();
-
- return PyLong_FromUnsignedLongLong(n);
-}
-
-
-static
-PyObject *
-hashbits_count_overlap(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
- khmer_KHashbits_Object * ht2_argu;
- const char * filename;
- Hashbits * ht2;
-
- if (!PyArg_ParseTuple(args, "sO!", &filename, &khmer_KHashbits_Type,
- &ht2_argu)) {
- return NULL;
- }
-
- ht2 = ht2_argu->hashbits;
-
-// call the C++ function, and trap signals => Python
-
- unsigned long long n_consumed;
- unsigned int total_reads;
- HashIntoType curve[2][100];
-
- try {
- hashbits->consume_fasta_overlap(filename, curve, *ht2, total_reads, n_consumed);
- } catch (_khmer_signal &e) {
- PyErr_SetString(PyExc_IOError, e.get_message().c_str());
- return NULL;
- } catch (InvalidStreamHandle &e) {
- PyErr_SetString(PyExc_IOError, e.what());
- return NULL;
- }
-
- HashIntoType n = hashbits->n_unique_kmers();
- HashIntoType n_overlap = hashbits->n_overlap_kmers();
-
- PyObject * x = PyList_New(200);
-
- for (unsigned int i = 0; i < 100; i++) {
- PyList_SetItem(x, i, Py_BuildValue("K", curve[0][i]));
- }
- for (unsigned int i = 0; i < 100; i++) {
- PyList_SetItem(x, i + 100, Py_BuildValue("K", curve[1][i]));
- }
- return Py_BuildValue("KKO", n, n_overlap, x);
-}
-
-static
-PyObject *
-hashbits_n_occupied(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- HashIntoType start = 0, stop = 0;
-
- if (!PyArg_ParseTuple(args, "|KK", &start, &stop)) {
- return NULL;
- }
-
- HashIntoType n = hashbits->n_occupied(start, stop);
-
- return PyLong_FromUnsignedLongLong(n);
-}
-
-static
-PyObject *
-hashbits_n_tags(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- if (!PyArg_ParseTuple(args, "")) {
- return NULL;
- }
-
- return PyLong_FromSize_t(hashbits->n_tags());
-}
-
-static
-PyObject *
-hashbits_count(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- const char * kmer;
-
- if (!PyArg_ParseTuple(args, "s", &kmer)) {
- return NULL;
- }
-
- if (strlen(kmer) != hashbits->ksize()) {
- PyErr_SetString(PyExc_ValueError,
- "k-mer length must equal the presence table k-mer size");
- return NULL;
- }
-
- hashbits->count(kmer);
-
- return PyLong_FromLong(1);
-}
-
-static
-PyObject *
-hashbits_consume(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- const char * long_str;
-
- if (!PyArg_ParseTuple(args, "s", &long_str)) {
- return NULL;
- }
-
- if (strlen(long_str) < hashbits->ksize()) {
- PyErr_SetString(PyExc_ValueError,
- "string length must >= the hashbits k-mer size");
- return NULL;
- }
-
- unsigned int n_consumed;
- n_consumed = hashbits->consume_string(long_str);
-
- return PyLong_FromLong(n_consumed);
-}
-
-static
-PyObject *
-hashbits_print_stop_tags(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- const char * filename = NULL;
-
- if (!PyArg_ParseTuple(args, "s", &filename)) {
- return NULL;
- }
-
- hashbits->print_stop_tags(filename);
-
- Py_RETURN_NONE;
-}
-
-static
-PyObject *
-hashbits_print_tagset(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- const char * filename = NULL;
-
- if (!PyArg_ParseTuple(args, "s", &filename)) {
- return NULL;
- }
-
- hashbits->print_tagset(filename);
-
- Py_RETURN_NONE;
-}
-
-static
-PyObject *
-hashbits_load_stop_tags(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- const char * filename = NULL;
- PyObject * clear_tags_o = NULL;
-
- if (!PyArg_ParseTuple(args, "s|O", &filename, &clear_tags_o)) {
- return NULL;
- }
-
- bool clear_tags = true;
- if (clear_tags_o && !PyObject_IsTrue(clear_tags_o)) {
- clear_tags = false;
- }
-
-
- try {
- hashbits->load_stop_tags(filename, clear_tags);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
- return NULL;
- }
-
- Py_RETURN_NONE;
-}
-
-
-static
-PyObject *
-hashbits_save_stop_tags(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- const char * filename = NULL;
-
- if (!PyArg_ParseTuple(args, "s", &filename)) {
+ const char * kmer_s = NULL;
+ if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
return NULL;
}
- try {
- hashbits->save_stop_tags(filename);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
- return NULL;
- }
+ HashIntoType kmer = _hash(kmer_s, hashtable->ksize());
+ hashtable->add_tag(kmer);
Py_RETURN_NONE;
}
static
PyObject *
-hashbits_traverse_from_tags(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_add_stop_tag(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- khmer_KCountingHash_Object * counting_o = NULL;
- unsigned int distance, threshold, frequency;
-
- if (!PyArg_ParseTuple(args, "O!III", &khmer_KCountingHash_Type, &counting_o,
- &distance, &threshold, &frequency)) {
+ const char * kmer_s = NULL;
+ if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
return NULL;
}
- hashbits->traverse_from_tags(distance, threshold, frequency,
- * counting_o->counting);
+ HashIntoType kmer = _hash(kmer_s, hashtable->ksize());
+ hashtable->add_stop_tag(kmer);
Py_RETURN_NONE;
}
static
PyObject *
-hashbits_repartition_largest_partition(khmer_KHashbits_Object * me,
- PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- khmer_KCountingHash_Object * counting_o = NULL;
- PyObject * subset_o = NULL;
- unsigned int distance, threshold, frequency;
-
- if (!PyArg_ParseTuple(args, "OO!III", &subset_o, &khmer_KCountingHash_Type,
- &counting_o, &distance, &threshold, &frequency)) {
- return NULL;
- }
-
- SubsetPartition * subset_p;
- if (subset_o != Py_None) {
- subset_p = (SubsetPartition *) PyCObject_AsVoidPtr(subset_o);
- } else {
- subset_p = hashbits->partition;
- }
-
- CountingHash * counting = counting_o->counting;
-
- unsigned long next_largest;
- try {
- next_largest = subset_p->repartition_largest_partition(distance,
- threshold, frequency, *counting);
- } catch (khmer_exception &e) {
- PyErr_SetString(PyExc_RuntimeError, e.what());
- return NULL;
- }
-
- return PyLong_FromLong(next_largest);
-}
-
-static
-PyObject *
-hashbits_get(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_get_stop_tags(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- PyObject * arg;
-
- if (!PyArg_ParseTuple(args, "O", &arg)) {
+ if (!PyArg_ParseTuple(args, "")) {
return NULL;
}
- unsigned long count = 0;
-
- if (PyInt_Check(arg)) {
- long pos = PyInt_AsLong(arg);
- count = hashbits->get_count((unsigned int) pos);
- } else if (PyBytes_Check(arg)) {
- std::string s = PyBytes_AsString(arg);
-
- if (strlen(s.c_str()) < hashbits->ksize()) {
- PyErr_SetString(PyExc_ValueError,
- "string length must equal the presence table k-mer size");
- return NULL;
- }
+ WordLength k = hashtable->ksize();
+ SeenSet::const_iterator si;
- count = hashbits->get_count(s.c_str());
- } else {
- PyErr_SetString(PyExc_ValueError, "must pass in an int or string");
- return NULL;
+ PyObject * x = PyList_New(hashtable->stop_tags.size());
+ unsigned long long i = 0;
+ for (si = hashtable->stop_tags.begin(); si != hashtable->stop_tags.end();
+ si++) {
+ std::string s = _revhash(*si, k);
+ PyList_SET_ITEM(x, i, Py_BuildValue("s", s.c_str()));
+ i++;
}
- return PyLong_FromLong(count);
+ return x;
}
static
PyObject *
-hashbits_calc_connected_graph_size(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_get_tagset(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- const char * _kmer;
- unsigned int max_size = 0;
- PyObject * break_on_circum_o = NULL;
- if (!PyArg_ParseTuple(args, "s|IO", &_kmer, &max_size, &break_on_circum_o)) {
+ if (!PyArg_ParseTuple(args, "")) {
return NULL;
}
- bool break_on_circum = false;
- if (break_on_circum_o && PyObject_IsTrue(break_on_circum_o)) {
- break_on_circum = true;
- }
-
- unsigned long long size = 0;
-
- Py_BEGIN_ALLOW_THREADS
- SeenSet keeper;
- hashbits->calc_connected_graph_size(_kmer, size, keeper, max_size,
- break_on_circum);
- Py_END_ALLOW_THREADS
-
- return PyLong_FromUnsignedLongLong(size);
-}
-
-static
-PyObject *
-hashbits_kmer_degree(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- const char * kmer_s = NULL;
+ WordLength k = hashtable->ksize();
+ SeenSet::const_iterator si;
- if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
- return NULL;
+ PyObject * x = PyList_New(hashtable->all_tags.size());
+ unsigned long long i = 0;
+ for (si = hashtable->all_tags.begin(); si != hashtable->all_tags.end(); si++) {
+ std::string s = _revhash(*si, k);
+ PyList_SET_ITEM(x, i, Py_BuildValue("s", s.c_str()));
+ i++;
}
- return PyLong_FromLong(hashbits->kmer_degree(kmer_s));
+ return x;
}
static
PyObject *
-hashbits_trim_on_stoptags(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_output_partitions(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- const char * seq = NULL;
+ const char * filename = NULL;
+ const char * output = NULL;
+ PyObject * output_unassigned_o = NULL;
- if (!PyArg_ParseTuple(args, "s", &seq)) {
+ if (!PyArg_ParseTuple(args, "ss|O", &filename, &output,
+ &output_unassigned_o)) {
return NULL;
}
- size_t trim_at;
- Py_BEGIN_ALLOW_THREADS
-
- trim_at = hashbits->trim_on_stoptags(seq);
-
- Py_END_ALLOW_THREADS;
-
- PyObject * trim_seq = PyBytes_FromStringAndSize(seq, trim_at);
- if (trim_seq == NULL) {
- return NULL;
+ bool output_unassigned = false;
+ if (output_unassigned_o != NULL && PyObject_IsTrue(output_unassigned_o)) {
+ output_unassigned = true;
}
- PyObject * ret = Py_BuildValue("Ok", trim_seq, (unsigned long) trim_at);
- Py_DECREF(trim_seq);
-
- return ret;
-}
-
-static
-PyObject *
-hashbits_identify_stoptags_by_position(khmer_KHashbits_Object * me,
- PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
- const char * seq = NULL;
+ size_t n_partitions = 0;
- if (!PyArg_ParseTuple(args, "s", &seq)) {
+ try {
+ SubsetPartition * subset_p = hashtable->partition;
+ n_partitions = subset_p->output_partitioned_file(filename,
+ output,
+ output_unassigned);
+ } catch (khmer_file_exception &e) {
+ PyErr_SetString(PyExc_IOError, e.what());
return NULL;
}
- std::vector<unsigned int> posns;
- Py_BEGIN_ALLOW_THREADS
-
- hashbits->identify_stop_tags_by_position(seq, posns);
-
- Py_END_ALLOW_THREADS;
-
- PyObject * x = PyList_New(posns.size());
-
- for (unsigned int i = 0; i < posns.size(); i++) {
- PyList_SET_ITEM(x, i, Py_BuildValue("I", posns[i]));
- }
-
- return x;
+ return PyLong_FromLong(n_partitions);
}
static
PyObject *
-hashbits_do_subset_partition(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_find_unpart(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- HashIntoType start_kmer = 0, end_kmer = 0;
- PyObject * break_on_stop_tags_o = NULL;
+ const char * filename = NULL;
+ PyObject * traverse_o = NULL;
PyObject * stop_big_traversals_o = NULL;
- if (!PyArg_ParseTuple(args, "|KKOO", &start_kmer, &end_kmer,
- &break_on_stop_tags_o,
+ if (!PyArg_ParseTuple(args, "sOO", &filename, &traverse_o,
&stop_big_traversals_o)) {
return NULL;
}
- bool break_on_stop_tags = false;
- if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
- break_on_stop_tags = true;
- }
- bool stop_big_traversals = false;
- if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) {
- stop_big_traversals = true;
- }
-
- SubsetPartition * subset_p = NULL;
- try {
- Py_BEGIN_ALLOW_THREADS
- subset_p = new SubsetPartition(hashbits);
- subset_p->do_partition(start_kmer, end_kmer, break_on_stop_tags,
- stop_big_traversals);
- Py_END_ALLOW_THREADS
- } catch (_khmer_signal &e) {
- return NULL;
- } catch (std::bad_alloc &e) {
- return PyErr_NoMemory();
- }
-
- return PyCObject_FromVoidPtr(subset_p, free_subset_partition_info);
-}
-
-static
-PyObject *
-hashbits_join_partitions_by_path(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- const char * sequence = NULL;
- if (!PyArg_ParseTuple(args, "s", &sequence)) {
- return NULL;
- }
+ bool traverse = PyObject_IsTrue(traverse_o);
+ bool stop_big_traversals = PyObject_IsTrue(stop_big_traversals_o);
+ unsigned int n_singletons = 0;
- hashbits->partition->join_partitions_by_path(sequence);
+ SubsetPartition * subset_p = hashtable->partition;
+ n_singletons = subset_p->find_unpart(filename, traverse,
+ stop_big_traversals);
- Py_RETURN_NONE;
+ return PyLong_FromLong(n_singletons);
}
static
PyObject *
-hashbits_merge_subset(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_filter_if_present(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- PyObject * subset_obj;
- if (!PyArg_ParseTuple(args, "O", &subset_obj)) {
- return NULL;
- }
+ const char * filename = NULL;
+ const char * output = NULL;
- if (!PyCObject_Check(subset_obj)) {
- PyErr_SetString( PyExc_ValueError, "invalid subset");
+ if (!PyArg_ParseTuple(args, "ss", &filename, &output)) {
return NULL;
}
- SubsetPartition * subset_p;
- subset_p = (SubsetPartition *) PyCObject_AsVoidPtr(subset_obj);
-
- hashbits->partition->merge(subset_p);
+ hashtable->filter_if_present(filename, output);
Py_RETURN_NONE;
}
static
PyObject *
-hashbits_merge_from_disk(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_save_partitionmap(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
const char * filename = NULL;
+
if (!PyArg_ParseTuple(args, "s", &filename)) {
return NULL;
}
try {
- hashbits->partition->merge_from_disk(filename);
+ hashtable->partition->save_partitionmap(filename);
} catch (khmer_file_exception &e) {
PyErr_SetString(PyExc_IOError, e.what());
return NULL;
@@ -2338,1109 +1860,1449 @@ hashbits_merge_from_disk(khmer_KHashbits_Object * me, PyObject * args)
static
PyObject *
-hashbits_consume_fasta(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_load_partitionmap(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- const char * filename;
+ const char * filename = NULL;
if (!PyArg_ParseTuple(args, "s", &filename)) {
return NULL;
}
-// call the C++ function, and trap signals => Python
-
- unsigned long long n_consumed = 0;
- unsigned int total_reads = 0;
-
- try {
- hashbits->consume_fasta(filename, total_reads, n_consumed);
- } catch (_khmer_signal &e) {
- PyErr_SetString(PyExc_IOError, e.get_message().c_str());
- return NULL;
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
- return NULL;
- }
-
- return Py_BuildValue("IK", total_reads, n_consumed);
-}
-
-static
-PyObject *
-hashbits_consume_fasta_with_reads_parser(khmer_KHashbits_Object * me,
- PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- PyObject * rparser_obj = NULL;
-
- if (!PyArg_ParseTuple(
- args, "O", &rparser_obj)) {
- return NULL;
- }
-
- read_parsers:: IParser * rparser =
- _PyObject_to_khmer_ReadParser( rparser_obj );
-
-// call the C++ function, and trap signals => Python
- unsigned long long n_consumed = 0;
- unsigned int total_reads = 0;
- char const * exc = NULL;
- Py_BEGIN_ALLOW_THREADS
- try {
- hashbits->consume_fasta(rparser, total_reads, n_consumed);
- } catch (_khmer_signal &e) {
- exc = e.get_message().c_str();
- }
-
- Py_END_ALLOW_THREADS
- if (exc != NULL) {
- PyErr_SetString(PyExc_IOError, exc);
+ try {
+ hashtable->partition->load_partitionmap(filename);
+ } catch (khmer_file_exception &e) {
+ PyErr_SetString(PyExc_IOError, e.what());
return NULL;
}
- return Py_BuildValue("IK", total_reads, n_consumed);
+ Py_RETURN_NONE;
}
static
PyObject *
-hashbits_consume_fasta_and_traverse(khmer_KHashbits_Object * me,
- PyObject * args)
+hashtable__validate_partitionmap(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
-
- const char * filename;
- unsigned int radius, big_threshold, transfer_threshold;
- khmer_KCountingHash_Object * counting_o = NULL;
+ Hashtable * hashtable = me->hashtable;
- if (!PyArg_ParseTuple(args, "sIIIO!", &filename,
- &radius, &big_threshold, &transfer_threshold,
- &khmer_KCountingHash_Type, &counting_o)) {
+ if (!PyArg_ParseTuple(args, "")) {
return NULL;
}
- CountingHash * counting = counting_o->counting;
-
- hashbits->consume_fasta_and_traverse(filename, radius, big_threshold,
- transfer_threshold, *counting);
-
+ hashtable->partition->_validate_pmap();
Py_RETURN_NONE;
}
-void sig(unsigned int total_reads, unsigned int n_consumed)
+static
+PyObject *
+hashtable_count_partitions(khmer_KHashtable_Object * me, PyObject * args)
{
- std::cout << total_reads << " " << n_consumed << std::endl;
+ Hashtable * hashtable = me->hashtable;
+
+ if (!PyArg_ParseTuple(args, "")) {
+ return NULL;
+ }
+
+ size_t n_partitions = 0, n_unassigned = 0;
+ hashtable->partition->count_partitions(n_partitions, n_unassigned);
+
+ return Py_BuildValue("nn", (Py_ssize_t) n_partitions,
+ (Py_ssize_t) n_unassigned);
}
static
PyObject *
-hashbits_consume_fasta_and_tag(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_subset_count_partitions(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
-
- const char * filename;
+ khmer_KSubsetPartition_Object * subset_obj = NULL;
- if (!PyArg_ParseTuple(args, "s", &filename)) {
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) {
return NULL;
}
- // call the C++ function, and trap signals => Python
+ size_t n_partitions = 0, n_unassigned = 0;
+ subset_obj->subset->count_partitions(n_partitions, n_unassigned);
- unsigned long long n_consumed;
- unsigned int total_reads;
+ return Py_BuildValue("nn", (Py_ssize_t) n_partitions,
+ (Py_ssize_t) n_unassigned);
+}
- try {
- hashbits->consume_fasta_and_tag(filename, total_reads, n_consumed);
- } catch (_khmer_signal &e) {
- PyErr_SetString(PyExc_IOError, e.get_message().c_str());
+static
+PyObject *
+hashtable_subset_partition_size_distribution(khmer_KHashtable_Object * me,
+ PyObject * args)
+{
+ khmer_KSubsetPartition_Object * subset_obj = NULL;
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) {
return NULL;
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ }
+
+ SubsetPartition * subset_p;
+ subset_p = subset_obj->subset;
+
+ PartitionCountDistribution d;
+
+ unsigned int n_unassigned = 0;
+ subset_p->partition_size_distribution(d, n_unassigned);
+
+ PyObject * x = PyList_New(d.size());
+ if (x == NULL) {
return NULL;
}
+ PartitionCountDistribution::iterator di;
- return Py_BuildValue("IK", total_reads, n_consumed);
+ unsigned int i;
+ for (i = 0, di = d.begin(); di != d.end(); di++, i++) {
+ PyObject * value = Py_BuildValue("KK", di->first, di->second);
+ if (value == NULL) {
+ Py_DECREF(x);
+ return NULL;
+ }
+ PyList_SET_ITEM(x, i, value);
+ }
+ if (!(i == d.size())) {
+ throw khmer_exception();
+ }
+
+ PyObject * returnValue = Py_BuildValue("NI", x, n_unassigned);
+ if (returnValue == NULL) {
+ Py_DECREF(x);
+ return NULL;
+ }
+ return returnValue;
}
static
PyObject *
-hashbits_consume_fasta_and_tag_with_reads_parser(khmer_KHashbits_Object * me,
- PyObject * args)
+hashtable_load_tagset(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- python::khmer_ReadParser_Object * rparser_obj = NULL;
+ const char * filename = NULL;
+ PyObject * clear_tags_o = NULL;
- if (!PyArg_ParseTuple( args, "O!", &python::khmer_ReadParser_Type,
- &rparser_obj)) {
+ if (!PyArg_ParseTuple(args, "s|O", &filename, &clear_tags_o)) {
return NULL;
}
- read_parsers:: IParser * rparser = rparser_obj-> parser;
+ bool clear_tags = true;
+ if (clear_tags_o && !PyObject_IsTrue(clear_tags_o)) {
+ clear_tags = false;
+ }
- // call the C++ function, and trap signals => Python
- unsigned long long n_consumed = 0;
- unsigned int total_reads = 0;
- char const * exc = NULL;
- Py_BEGIN_ALLOW_THREADS
try {
- hashbits->consume_fasta_and_tag(
- rparser, total_reads, n_consumed
- );
- } catch (_khmer_signal &e) {
- exc = e.get_message().c_str();
- } catch (khmer::read_parsers::NoMoreReadsAvailable &e) {
- exc = e.what();
- }
- Py_END_ALLOW_THREADS
- if (exc != NULL) {
- PyErr_SetString(PyExc_IOError, exc);
+ hashtable->load_tagset(filename, clear_tags);
+ } catch (khmer_file_exception &e) {
+ PyErr_SetString(PyExc_IOError, e.what());
return NULL;
}
- return Py_BuildValue("IK", total_reads, n_consumed);
+ Py_RETURN_NONE;
}
static
PyObject *
-hashbits_consume_fasta_and_tag_with_stoptags(khmer_KHashbits_Object * me,
- PyObject * args)
+hashtable_save_tagset(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- const char * filename;
+ const char * filename = NULL;
if (!PyArg_ParseTuple(args, "s", &filename)) {
return NULL;
}
- // call the C++ function, and trap signals => Python
-
- unsigned long long n_consumed;
- unsigned int total_reads;
-
try {
- hashbits->consume_fasta_and_tag_with_stoptags(filename,
- total_reads, n_consumed);
- } catch (_khmer_signal &e) {
- PyErr_SetString(PyExc_IOError, e.get_message().c_str());
- return NULL;
+ hashtable->save_tagset(filename);
} catch (khmer_file_exception &e) {
PyErr_SetString(PyExc_IOError, e.what());
return NULL;
}
- return Py_BuildValue("IK", total_reads, n_consumed);
+ Py_RETURN_NONE;
}
static
PyObject *
-hashbits_consume_partitioned_fasta(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_save_subset_partitionmap(khmer_KHashtable_Object * me,
+ PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
-
- const char * filename;
+ const char * filename = NULL;
+ khmer_KSubsetPartition_Object * subset_obj = NULL;
- if (!PyArg_ParseTuple(args, "s", &filename)) {
+ if (!PyArg_ParseTuple(args, "O!s", &khmer_KSubsetPartition_Type, &subset_obj, &filename)) {
return NULL;
}
- // call the C++ function, and trap signals => Python
+ SubsetPartition * subset_p;
+ subset_p = subset_obj->subset;
- unsigned long long n_consumed;
- unsigned int total_reads;
+ Py_BEGIN_ALLOW_THREADS
try {
- hashbits->consume_partitioned_fasta(filename, total_reads, n_consumed);
- } catch (_khmer_signal &e) {
- PyErr_SetString(PyExc_IOError, e.get_message().c_str());
- return NULL;
+ subset_p->save_partitionmap(filename);
} catch (khmer_file_exception &e) {
PyErr_SetString(PyExc_IOError, e.what());
return NULL;
}
- return Py_BuildValue("IK", total_reads, n_consumed);
+ Py_END_ALLOW_THREADS
+
+ Py_RETURN_NONE;
}
static
PyObject *
-hashbits_find_all_tags(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_load_subset_partitionmap(khmer_KHashtable_Object * me,
+ PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- const char * kmer_s = NULL;
+ const char * filename = NULL;
- if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
+ if (!PyArg_ParseTuple(args, "s", &filename)) {
return NULL;
}
- if (strlen(kmer_s) != hashbits->ksize()) {
- PyErr_SetString( PyExc_ValueError,
- "k-mer size must equal the k-mer size of the presence table");
- return NULL;
+ SubsetPartition * subset_p;
+ try {
+ subset_p = new SubsetPartition(hashtable);
+ } catch (std::bad_alloc &e) {
+ return PyErr_NoMemory();
}
- _pre_partition_info * ppi = NULL;
+ bool fail = false;
+ std::string err;
Py_BEGIN_ALLOW_THREADS
- HashIntoType kmer, kmer_f, kmer_r;
- kmer = _hash(kmer_s, hashbits->ksize(), kmer_f, kmer_r);
-
try {
- ppi = new _pre_partition_info(kmer);
- } catch (std::bad_alloc &e) {
- return PyErr_NoMemory();
+ subset_p->load_partitionmap(filename);
+ } catch (khmer_file_exception &e) {
+ fail = true;
+ err = e.what();
}
- hashbits->partition->find_all_tags(kmer_f, kmer_r, ppi->tagged_kmers,
- hashbits->all_tags);
- hashbits->add_kmer_to_tags(kmer);
Py_END_ALLOW_THREADS
- return PyCObject_FromVoidPtr(ppi, free_pre_partition_info);
+ if (fail) {
+ PyErr_SetString(PyExc_IOError, err.c_str());
+ delete subset_p;
+ return NULL;
+ } else {
+ khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\
+ PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type);
+ subset_obj->subset = subset_p;
+ return (PyObject*) subset_obj;
+ }
}
static
PyObject *
-hashbits_assign_partition_id(khmer_KHashbits_Object * me, PyObject * args)
+hashtable__set_tag_density(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- PyObject * ppi_obj;
- if (!PyArg_ParseTuple(args, "O", &ppi_obj)) {
+ unsigned int d;
+ if (!PyArg_ParseTuple(args, "I", &d)) {
return NULL;
}
- if (!PyCObject_Check(ppi_obj)) {
- PyErr_SetString( PyExc_ValueError, "invalid pre_partition_info");
+ hashtable->_set_tag_density(d);
+
+ Py_RETURN_NONE;
+}
+
+static
+PyObject *
+hashtable__get_tag_density(khmer_KHashtable_Object * me, PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+
+ if (!PyArg_ParseTuple(args, "")) {
return NULL;
}
- _pre_partition_info * ppi;
- ppi = (_pre_partition_info *) PyCObject_AsVoidPtr(ppi_obj);
+ unsigned int d = hashtable->_get_tag_density();
- PartitionID p;
- p = hashbits->partition->assign_partition_id(ppi->kmer,
- ppi->tagged_kmers);
-
- return PyLong_FromLong(p);
+ return PyLong_FromLong(d);
}
static
PyObject *
-hashbits_add_tag(khmer_KHashbits_Object * me, PyObject * args)
+hashtable__validate_subset_partitionmap(khmer_KHashtable_Object * me,
+ PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ khmer_KSubsetPartition_Object * subset_obj = NULL;
- const char * kmer_s = NULL;
- if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) {
return NULL;
}
- HashIntoType kmer = _hash(kmer_s, hashbits->ksize());
- hashbits->add_tag(kmer);
+ subset_obj->subset->_validate_pmap();
Py_RETURN_NONE;
}
static
PyObject *
-hashbits_add_stop_tag(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_set_partition_id(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- const char * kmer_s = NULL;
- if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
+ const char * kmer = NULL;
+ PartitionID p = 0;
+
+ if (!PyArg_ParseTuple(args, "sI", &kmer, &p)) {
return NULL;
}
- HashIntoType kmer = _hash(kmer_s, hashbits->ksize());
- hashbits->add_stop_tag(kmer);
+ hashtable->partition->set_partition_id(kmer, p);
Py_RETURN_NONE;
}
static
PyObject *
-hashbits_get_stop_tags(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_join_partitions(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- if (!PyArg_ParseTuple(args, "")) {
+ PartitionID p1 = 0, p2 = 0;
+
+ if (!PyArg_ParseTuple(args, "II", &p1, &p2)) {
return NULL;
}
- WordLength k = hashbits->ksize();
- SeenSet::const_iterator si;
-
- PyObject * x = PyList_New(hashbits->stop_tags.size());
- unsigned long long i = 0;
- for (si = hashbits->stop_tags.begin(); si != hashbits->stop_tags.end(); si++) {
- std::string s = _revhash(*si, k);
- PyList_SET_ITEM(x, i, Py_BuildValue("s", s.c_str()));
- i++;
- }
+ p1 = hashtable->partition->join_partitions(p1, p2);
- return x;
+ return PyLong_FromLong(p1);
}
static
PyObject *
-hashbits_get_tagset(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_get_partition_id(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- if (!PyArg_ParseTuple(args, "")) {
+ const char * kmer = NULL;
+
+ if (!PyArg_ParseTuple(args, "s", &kmer)) {
return NULL;
}
- WordLength k = hashbits->ksize();
- SeenSet::const_iterator si;
-
- PyObject * x = PyList_New(hashbits->all_tags.size());
- unsigned long long i = 0;
- for (si = hashbits->all_tags.begin(); si != hashbits->all_tags.end(); si++) {
- std::string s = _revhash(*si, k);
- PyList_SET_ITEM(x, i, Py_BuildValue("s", s.c_str()));
- i++;
- }
+ PartitionID partition_id;
+ partition_id = hashtable->partition->get_partition_id(kmer);
- return x;
+ return PyLong_FromLong(partition_id);
}
static
PyObject *
-hashbits_output_partitions(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_is_single_partition(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- const char * filename = NULL;
- const char * output = NULL;
- PyObject * output_unassigned_o = NULL;
+ const char * seq = NULL;
- if (!PyArg_ParseTuple(args, "ss|O", &filename, &output,
- &output_unassigned_o)) {
+ if (!PyArg_ParseTuple(args, "s", &seq)) {
return NULL;
}
- bool output_unassigned = false;
- if (output_unassigned_o != NULL && PyObject_IsTrue(output_unassigned_o)) {
- output_unassigned = true;
- }
-
- size_t n_partitions = 0;
+ bool v = hashtable->partition->is_single_partition(seq);
- try {
- SubsetPartition * subset_p = hashbits->partition;
- n_partitions = subset_p->output_partitioned_file(filename,
- output,
- output_unassigned);
- } catch (_khmer_signal &e) {
- PyErr_SetString(PyExc_IOError, e.get_message().c_str());
- return NULL;
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
- return NULL;
+ PyObject * val;
+ if (v) {
+ val = Py_True;
+ } else {
+ val = Py_False;
}
+ Py_INCREF(val);
- return PyLong_FromLong(n_partitions);
+ return val;
}
static
PyObject *
-hashbits_find_unpart(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_divide_tags_into_subsets(khmer_KHashtable_Object * me,
+ PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- const char * filename = NULL;
- PyObject * traverse_o = NULL;
- PyObject * stop_big_traversals_o = NULL;
+ unsigned int subset_size = 0;
- if (!PyArg_ParseTuple(args, "sOO", &filename, &traverse_o,
- &stop_big_traversals_o)) {
+ if (!PyArg_ParseTuple(args, "I", &subset_size)) {
return NULL;
}
- bool traverse = PyObject_IsTrue(traverse_o);
- bool stop_big_traversals = PyObject_IsTrue(stop_big_traversals_o);
- unsigned int n_singletons = 0;
+ SeenSet divvy;
+ hashtable->divide_tags_into_subsets(subset_size, divvy);
- try {
- SubsetPartition * subset_p = hashbits->partition;
- n_singletons = subset_p->find_unpart(filename, traverse,
- stop_big_traversals);
- } catch (_khmer_signal &e) {
- return NULL;
+ PyObject * x = PyList_New(divvy.size());
+ unsigned int i = 0;
+ for (SeenSet::const_iterator si = divvy.begin(); si != divvy.end();
+ si++, i++) {
+ PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(*si));
}
- return PyLong_FromLong(n_singletons);
-
- // Py_INCREF(Py_None);
- // return Py_None;
+ return x;
}
static
PyObject *
-hashbits_filter_if_present(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_count_kmers_within_radius(khmer_KHashtable_Object * me,
+ PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- const char * filename = NULL;
- const char * output = NULL;
+ const char * kmer = NULL;
+ unsigned int radius = 0;
+ unsigned int max_count = 0;
- if (!PyArg_ParseTuple(args, "ss", &filename, &output)) {
+ if (!PyArg_ParseTuple(args, "sI|I", &kmer, &radius, &max_count)) {
return NULL;
}
- try {
- hashbits->filter_if_present(filename, output);
- } catch (_khmer_signal &e) {
- return NULL;
- }
+ unsigned int n;
- Py_RETURN_NONE;
+ Py_BEGIN_ALLOW_THREADS
+
+ HashIntoType kmer_f, kmer_r;
+ _hash(kmer, hashtable->ksize(), kmer_f, kmer_r);
+ n = hashtable->count_kmers_within_radius(kmer_f, kmer_r, radius,
+ max_count);
+
+ Py_END_ALLOW_THREADS
+
+ return PyLong_FromUnsignedLong(n);
}
static
PyObject *
-hashbits_save_partitionmap(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_extract_unique_paths(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
- const char * filename = NULL;
-
- if (!PyArg_ParseTuple(args, "s", &filename)) {
+ const char * sequence = NULL;
+ unsigned int min_length = 0;
+ float min_unique_f = 0;
+ if (!PyArg_ParseTuple(args, "sIf", &sequence, &min_length, &min_unique_f)) {
return NULL;
}
- try {
- hashbits->partition->save_partitionmap(filename);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ std::vector<std::string> results;
+ hashtable->extract_unique_paths(sequence, min_length, min_unique_f, results);
+
+ PyObject * x = PyList_New(results.size());
+ if (x == NULL) {
return NULL;
}
- Py_RETURN_NONE;
+ for (unsigned int i = 0; i < results.size(); i++) {
+ PyList_SET_ITEM(x, i, PyUnicode_FromString(results[i].c_str()));
+ }
+
+ return x;
}
+
static
PyObject *
-hashbits_load_partitionmap(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_get_kmers(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
-
- const char * filename = NULL;
+ Hashtable * hashtable = me->hashtable;
+ const char * sequence;
- if (!PyArg_ParseTuple(args, "s", &filename)) {
+ if (!PyArg_ParseTuple(args, "s", &sequence)) {
return NULL;
}
- hashbits->partition->load_partitionmap(filename);
+ std::vector<std::string> kmers;
- Py_RETURN_NONE;
+ hashtable->get_kmers(sequence, kmers);
+
+ PyObject * x = PyList_New(kmers.size());
+ for (unsigned int i = 0; i < kmers.size(); i++) {
+ PyObject * obj = PyUnicode_FromString(kmers[i].c_str());
+ PyList_SET_ITEM(x, i, obj);
+ }
+
+ return x;
}
static
PyObject *
-hashbits__validate_partitionmap(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_get_kmer_counts(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
+ const char * sequence;
- if (!PyArg_ParseTuple(args, "")) {
+ if (!PyArg_ParseTuple(args, "s", &sequence)) {
return NULL;
}
- hashbits->partition->_validate_pmap();
+ std::vector<BoundedCounterType> counts;
+ hashtable->get_kmer_counts(sequence, counts);
- Py_RETURN_NONE;
+ PyObject * x = PyList_New(counts.size());
+ for (unsigned int i = 0; i <counts.size(); i++) {
+ PyObject * obj = PyInt_FromLong(counts[i]);
+ PyList_SET_ITEM(x, i, obj);
+ }
+
+ return x;
}
+
static
PyObject *
-hashbits_count_partitions(khmer_KHashbits_Object * me, PyObject * args)
+hashtable_get_kmer_hashes(khmer_KHashtable_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ Hashtable * hashtable = me->hashtable;
+ const char * sequence;
- if (!PyArg_ParseTuple(args, "")) {
+ if (!PyArg_ParseTuple(args, "s", &sequence)) {
return NULL;
}
- size_t n_partitions = 0, n_unassigned = 0;
- hashbits->partition->count_partitions(n_partitions, n_unassigned);
+ std::vector<HashIntoType> hashes;
+ hashtable->get_kmer_hashes(sequence, hashes);
- return Py_BuildValue("nn", (Py_ssize_t) n_partitions,
- (Py_ssize_t) n_unassigned);
+ PyObject * x = PyList_New(hashes.size());
+ for (unsigned int i = 0; i < hashes.size(); i++) {
+ PyObject * obj = PyLong_FromUnsignedLongLong(hashes[i]);
+ PyList_SET_ITEM(x, i, obj);
+ }
+
+ return x;
}
+
+static PyMethodDef khmer_hashtable_methods[] = {
+ //
+ // Basic methods
+ //
+
+ {
+ "ksize",
+ (PyCFunction)hashtable_get_ksize, METH_VARARGS,
+ "Returns the k-mer size of this graph."
+ },
+ { "hashsizes", (PyCFunction)hashtable_get_hashsizes, METH_VARARGS, "" },
+ {
+ "n_unique_kmers",
+ (PyCFunction)hashtable_n_unique_kmers, METH_VARARGS,
+ "Count the number of unique kmers in this graph."
+ },
+ {
+ "n_occupied", (PyCFunction)hashtable_n_occupied, METH_VARARGS,
+ "Count the number of occupied bins."
+ },
+ { "n_entries", (PyCFunction)hashtable_n_entries, METH_VARARGS, "" },
+ {
+ "count",
+ (PyCFunction)hashtable_count, METH_VARARGS,
+ "Increment the count of this k-mer."
+ },
+ {
+ "consume",
+ (PyCFunction)hashtable_consume, METH_VARARGS,
+ "Increment the counts of all of the k-mers in the string."
+ },
+ {
+ "consume_fasta",
+ (PyCFunction)hashtable_consume_fasta, METH_VARARGS,
+ "Incrment the counts of all the k-mers in the sequences in the "
+ "given file"
+ },
+ {
+ "consume_fasta_with_reads_parser",
+ (PyCFunction)hashtable_consume_fasta_with_reads_parser, METH_VARARGS,
+ "Count all k-mers retrieved with this reads parser object."
+ },
+ {
+ "get",
+ (PyCFunction)hashtable_get, METH_VARARGS,
+ "Retrieve the count for the given k-mer."
+ },
+ {
+ "load",
+ (PyCFunction)hashtable_load, METH_VARARGS,
+ "Load the graph from the specified file."
+ },
+ {
+ "save",
+ (PyCFunction)hashtable_save, METH_VARARGS,
+ "Save the graph to the specified file."
+ },
+ {
+ "get_median_count",
+ (PyCFunction)hashtable_get_median_count, METH_VARARGS,
+ "Get the median, average, and stddev of the k-mer counts "
+ " in the string"
+ },
+ {
+ "get_kmers",
+ (PyCFunction)hashtable_get_kmers, METH_VARARGS,
+ "Generate an ordered list of all substrings of length k in the string."
+ },
+ {
+ "get_kmer_hashes",
+ (PyCFunction)hashtable_get_kmer_hashes, METH_VARARGS,
+ "Retrieve an ordered list of all hashes of all k-mers in the string."
+ },
+ {
+ "get_kmer_counts",
+ (PyCFunction)hashtable_get_kmer_counts, METH_VARARGS,
+ "Retrieve an ordered list of the counts of all k-mers in the string."
+ },
+
+ //
+ // graph/traversal functionality
+ //
+
+ {
+ "calc_connected_graph_size",
+ (PyCFunction)hashtable_calc_connected_graph_size, METH_VARARGS, ""
+ },
+ {
+ "kmer_degree",
+ (PyCFunction)hashtable_kmer_degree, METH_VARARGS,
+ "Calculate the number of immediate neighbors this k-mer has in "
+ "the graph."
+ },
+ {
+ "count_kmers_within_radius",
+ (PyCFunction)hashtable_count_kmers_within_radius, METH_VARARGS,
+ "Calculate the number of neighbors with given radius in the graph."
+ },
+
+ //
+ // tagging / sparse graph functionality
+ //
+
+ { "consume_and_tag", (PyCFunction)hashtable_consume_and_tag, METH_VARARGS, "Consume a sequence and tag it" },
+ { "get_tags_and_positions", (PyCFunction)hashtable_get_tags_and_positions, METH_VARARGS, "Retrieve tags and their positions in a sequence." },
+ { "find_all_tags_list", (PyCFunction)hashtable_find_all_tags_list, METH_VARARGS, "Find all tags within range of the given k-mer, return as list" },
+ { "consume_fasta_and_tag", (PyCFunction)hashtable_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" },
+ { "get_median_count", (PyCFunction)hashtable_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
+ { "median_at_least", (PyCFunction)hashtable_median_at_least, METH_VARARGS, "Return true if the median is at least the given cutoff" },
+ { "extract_unique_paths", (PyCFunction)hashtable_extract_unique_paths, METH_VARARGS, "" },
+ { "print_tagset", (PyCFunction)hashtable_print_tagset, METH_VARARGS, "" },
+ { "add_tag", (PyCFunction)hashtable_add_tag, METH_VARARGS, "" },
+ { "get_tagset", (PyCFunction)hashtable_get_tagset, METH_VARARGS, "" },
+ { "load_tagset", (PyCFunction)hashtable_load_tagset, METH_VARARGS, "" },
+ { "save_tagset", (PyCFunction)hashtable_save_tagset, METH_VARARGS, "" },
+ { "n_tags", (PyCFunction)hashtable_n_tags, METH_VARARGS, "" },
+ { "divide_tags_into_subsets", (PyCFunction)hashtable_divide_tags_into_subsets, METH_VARARGS, "" },
+ { "_get_tag_density", (PyCFunction)hashtable__get_tag_density, METH_VARARGS, "" },
+ { "_set_tag_density", (PyCFunction)hashtable__set_tag_density, METH_VARARGS, "" },
+
+ // partitioning
+ { "do_subset_partition", (PyCFunction)hashtable_do_subset_partition, METH_VARARGS, "" },
+ { "find_all_tags", (PyCFunction)hashtable_find_all_tags, METH_VARARGS, "" },
+ { "assign_partition_id", (PyCFunction)hashtable_assign_partition_id, METH_VARARGS, "" },
+ { "output_partitions", (PyCFunction)hashtable_output_partitions, METH_VARARGS, "" },
+ { "find_unpart", (PyCFunction)hashtable_find_unpart, METH_VARARGS, "" },
+ { "load_partitionmap", (PyCFunction)hashtable_load_partitionmap, METH_VARARGS, "" },
+ { "save_partitionmap", (PyCFunction)hashtable_save_partitionmap, METH_VARARGS, "" },
+ { "_validate_partitionmap", (PyCFunction)hashtable__validate_partitionmap, METH_VARARGS, "" },
+ { "consume_fasta_and_traverse", (PyCFunction)hashtable_consume_fasta_and_traverse, METH_VARARGS, "" },
+ {
+ "consume_fasta_and_tag_with_reads_parser", (PyCFunction)hashtable_consume_fasta_and_tag_with_reads_parser,
+ METH_VARARGS, "Count all k-mers using a given reads parser"
+ },
+ { "consume_partitioned_fasta", (PyCFunction)hashtable_consume_partitioned_fasta, METH_VARARGS, "Count all k-mers in a given file" },
+ { "join_partitions_by_path", (PyCFunction)hashtable_join_partitions_by_path, METH_VARARGS, "" },
+ { "merge_subset", (PyCFunction)hashtable_merge_subset, METH_VARARGS, "" },
+ { "merge_subset_from_disk", (PyCFunction)hashtable_merge_from_disk, METH_VARARGS, "" },
+ { "count_partitions", (PyCFunction)hashtable_count_partitions, METH_VARARGS, "" },
+ { "subset_count_partitions", (PyCFunction)hashtable_subset_count_partitions, METH_VARARGS, "" },
+ { "subset_partition_size_distribution", (PyCFunction)hashtable_subset_partition_size_distribution, METH_VARARGS, "" },
+ { "save_subset_partitionmap", (PyCFunction)hashtable_save_subset_partitionmap, METH_VARARGS },
+ { "load_subset_partitionmap", (PyCFunction)hashtable_load_subset_partitionmap, METH_VARARGS },
+ { "_validate_subset_partitionmap", (PyCFunction)hashtable__validate_subset_partitionmap, METH_VARARGS, "" },
+ { "set_partition_id", (PyCFunction)hashtable_set_partition_id, METH_VARARGS, "" },
+ { "join_partitions", (PyCFunction)hashtable_join_partitions, METH_VARARGS, "" },
+ { "get_partition_id", (PyCFunction)hashtable_get_partition_id, METH_VARARGS, "" },
+ { "is_single_partition", (PyCFunction)hashtable_is_single_partition, METH_VARARGS, "" },
+ { "traverse_from_tags", (PyCFunction)hashtable_traverse_from_tags, METH_VARARGS, "" },
+ { "repartition_largest_partition", (PyCFunction)hashtable_repartition_largest_partition, METH_VARARGS, "" },
+
+ // stop tags
+ { "load_stop_tags", (PyCFunction)hashtable_load_stop_tags, METH_VARARGS, "" },
+ { "save_stop_tags", (PyCFunction)hashtable_save_stop_tags, METH_VARARGS, "" },
+ { "print_stop_tags", (PyCFunction)hashtable_print_stop_tags, METH_VARARGS, "" },
+ { "trim_on_stoptags", (PyCFunction)hashtable_trim_on_stoptags, METH_VARARGS, "" },
+ { "identify_stoptags_by_position", (PyCFunction)hashtable_identify_stoptags_by_position, METH_VARARGS, "" },
+ { "filter_if_present", (PyCFunction)hashtable_filter_if_present, METH_VARARGS, "" },
+ { "add_stop_tag", (PyCFunction)hashtable_add_stop_tag, METH_VARARGS, "" },
+ { "get_stop_tags", (PyCFunction)hashtable_get_stop_tags, METH_VARARGS, "" },
+ { "consume_fasta_and_tag_with_stoptags", (PyCFunction)hashtable_consume_fasta_and_tag_with_stoptags, METH_VARARGS, "Count all k-mers in a given file" },
+
+ {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+//
+// Python type object for the base hashtable wrapper.
+//
+// Exposes khmer_hashtable_methods as tp_methods.  This initializer sets no
+// tp_new, tp_init or tp_dealloc slot.  khmer_KCountingHash_Object embeds a
+// khmer_KHashtable_Object as its first member.
+//
+// tp_name must not carry trailing whitespace: it is what Python reports as
+// the type's name (repr, __name__).
+//
+static PyTypeObject khmer_KHashtable_Type
+CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashtable_Object")
+= {
+ PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */
+ "_khmer.KHashtable", /*tp_name*/
+ sizeof(khmer_KHashtable_Object) , /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ 0, /*tp_dealloc*/
+ 0, /*tp_print*/
+ 0, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ 0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ 0, /*tp_as_sequence*/
+ 0, /*tp_as_mapping*/
+ 0, /*tp_hash */
+ 0, /*tp_call*/
+ 0, /*tp_str*/
+ 0, /*tp_getattro*/
+ 0, /*tp_setattro*/
+ 0, /*tp_as_buffer*/
+ Py_TPFLAGS_DEFAULT, /*tp_flags*/
+ "base hashtable object", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ khmer_hashtable_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ 0, /* tp_new */
+};
+
+// True only when v's type is exactly khmer_KHashtable_Type (pointer
+// comparison on Py_TYPE; other types, including derived ones, do not match).
+#define is_hashtable_obj(v) (Py_TYPE(v) == &khmer_KHashtable_Type)
+
+//
+// _new_hashtable
+//
+
+//
+// KCountingHash object
+//
+
+// Python-level wrapper around a CountingHash.  It embeds a
+// khmer_KHashtable_Object as its first member and additionally keeps a
+// typed pointer to the same C++ table in `counting`.
+typedef struct {
+ khmer_KHashtable_Object khashtable;
+ CountingHash * counting;
+} khmer_KCountingHash_Object;
+
+// Python-level wrapper holding a pointer to a C++ ReadAligner.
+typedef struct {
+ PyObject_HEAD
+ ReadAligner * aligner;
+} khmer_ReadAligner_Object;
+
+static void khmer_counting_dealloc(khmer_KCountingHash_Object * obj);
+
+//
+// trim_on_abundance(seq, min_count) -> (trimmed_seq, trim_at)
+//
+// Parses a sequence string and an unsigned threshold, asks
+// CountingHash::trim_on_abundance() for a cut position, and returns a
+// 2-tuple holding the prefix seq[:trim_at] (as str) and trim_at itself.
+// Returns NULL with a Python exception set if argument parsing or string
+// construction fails.
+//
static
PyObject *
-hashbits_subset_count_partitions(khmer_KHashbits_Object * me, PyObject * args)
+count_trim_on_abundance(khmer_KCountingHash_Object * me, PyObject * args)
{
- PyObject * subset_obj = NULL;
+ CountingHash * counting = me->counting;
+
+ const char * seq = NULL;
+ unsigned int min_count_i = 0;
- if (!PyArg_ParseTuple(args, "O", &subset_obj)) {
+ if (!PyArg_ParseTuple(args, "sI", &seq, &min_count_i)) {
return NULL;
}
- SubsetPartition * subset_p;
- subset_p = (SubsetPartition *) PyCObject_AsVoidPtr(subset_obj);
+ unsigned long trim_at;
+ // The GIL is released while the C++ lookup runs.
+ Py_BEGIN_ALLOW_THREADS
- size_t n_partitions = 0, n_unassigned = 0;
- subset_p->count_partitions(n_partitions, n_unassigned);
+ // NOTE(review): unsigned int is narrowed to BoundedCounterType here;
+ // out-of-range thresholds presumably wrap silently -- confirm.
+ BoundedCounterType min_count = min_count_i;
- return Py_BuildValue("nn", (Py_ssize_t) n_partitions,
- (Py_ssize_t) n_unassigned);
+ trim_at = counting->trim_on_abundance(seq, min_count);
+
+ Py_END_ALLOW_THREADS;
+
+ PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at);
+ if (trim_seq == NULL) {
+ return NULL;
+ }
+ // "O" takes its own reference, so ours is dropped right after.
+ PyObject * ret = Py_BuildValue("Ok", trim_seq, trim_at);
+ Py_DECREF(trim_seq);
+
+ return ret;
}
+//
+// trim_below_abundance(seq, max_count) -> (trimmed_seq, trim_at)
+//
+// Same shape as count_trim_on_abundance, but the cut position comes from
+// CountingHash::trim_below_abundance() and the threshold is parsed
+// directly as an unsigned short ("H").  Returns NULL with a Python
+// exception set if argument parsing or string construction fails.
+//
static
PyObject *
-hashbits_subset_partition_size_distribution(khmer_KHashbits_Object * me,
- PyObject * args)
+count_trim_below_abundance(khmer_KCountingHash_Object * me, PyObject * args)
{
- PyObject * subset_obj = NULL;
- if (!PyArg_ParseTuple(args, "O", &subset_obj)) {
+ CountingHash * counting = me->counting;
+
+ const char * seq = NULL;
+ BoundedCounterType max_count_i = 0;
+
+ if (!PyArg_ParseTuple(args, "sH", &seq, &max_count_i)) {
return NULL;
}
- SubsetPartition * subset_p;
- subset_p = (SubsetPartition *) PyCObject_AsVoidPtr(subset_obj);
-
- PartitionCountDistribution d;
+ unsigned long trim_at;
+ // The GIL is released while the C++ lookup runs.
+ Py_BEGIN_ALLOW_THREADS
- unsigned int n_unassigned = 0;
- subset_p->partition_size_distribution(d, n_unassigned);
+ BoundedCounterType max_count = max_count_i;
- PyObject * x = PyList_New(d.size());
- if (x == NULL) {
- return NULL;
- }
- PartitionCountDistribution::iterator di;
+ trim_at = counting->trim_below_abundance(seq, max_count);
- unsigned int i;
- for (i = 0, di = d.begin(); di != d.end(); di++, i++) {
- PyObject * value = Py_BuildValue("KK", di->first, di->second);
- if (value == NULL) {
- Py_DECREF(x);
- return NULL;
- }
- PyList_SET_ITEM(x, i, value);
- }
- if (!(i == d.size())) {
- throw khmer_exception();
- }
+ Py_END_ALLOW_THREADS;
- PyObject * returnValue = Py_BuildValue("NI", x, n_unassigned);
- if (returnValue == NULL) {
- Py_DECREF(x);
+ PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at);
+ if (trim_seq == NULL) {
return NULL;
}
- return returnValue;
+ // "O" takes its own reference, so ours is dropped right after.
+ PyObject * ret = Py_BuildValue("Ok", trim_seq, trim_at);
+ Py_DECREF(trim_seq);
+
+ return ret;
}
+//
+// find_spectral_error_positions(seq, max_count) -> list of int
+//
+// Forwards to CountingHash::find_spectral_error_positions() and copies the
+// returned positions into a new Python list.  A khmer_exception from the
+// C++ call is reported as ValueError.
+//
static
PyObject *
-hashbits_load(khmer_KHashbits_Object * me, PyObject * args)
+count_find_spectral_error_positions(khmer_KCountingHash_Object * me,
+ PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ khmer::CountingHash * counting = me->counting;
- const char * filename = NULL;
+ char * seq = NULL;
+ khmer::BoundedCounterType max_count = 0; // unsigned short int
- if (!PyArg_ParseTuple(args, "s", &filename)) {
+ if (!PyArg_ParseTuple(args, "sH", &seq, &max_count)) {
return NULL;
}
+ std::vector<unsigned int> posns;
+
try {
- hashbits->load(filename);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ posns = counting->find_spectral_error_positions(seq, max_count);
+ } catch (khmer_exception &e) {
+ PyErr_SetString(PyExc_ValueError, e.what());
return NULL;
}
- Py_RETURN_NONE;
+ Py_ssize_t posns_size = posns.size();
+
+ PyObject * x = PyList_New(posns_size);
+ if (x == NULL) {
+ return NULL;
+ }
+ // NOTE(review): the PyLong_FromLong result is stored unchecked, so an
+ // allocation failure would leave a NULL slot in the list.
+ for (Py_ssize_t i = 0; i < posns_size; i++) {
+ PyList_SET_ITEM(x, i, PyLong_FromLong(posns[i]));
+ }
+
+ return x;
}
+//
+// fasta_dump_kmers_by_abundance(inputfile, limit_by) -> None
+//
+// Parses a filename and an int and forwards both to
+// CountingHash::fasta_dump_kmers_by_abundance().  No C++ exception is
+// caught here and the GIL is held for the duration of the call.
+//
static
PyObject *
-hashbits_save(khmer_KHashbits_Object * me, PyObject * args)
+count_fasta_dump_kmers_by_abundance(khmer_KCountingHash_Object * me,
+ PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ CountingHash * counting = me->counting;
- const char * filename = NULL;
+ const char * inputfile;
+ int limit_by = 0;
- if (!PyArg_ParseTuple(args, "s", &filename)) {
+ if (!PyArg_ParseTuple(args, "si", &inputfile, &limit_by)) {
return NULL;
}
- try {
- hashbits->save(filename);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
- return NULL;
- }
+ counting->fasta_dump_kmers_by_abundance(inputfile,
+ limit_by);
Py_RETURN_NONE;
}
+//
+// get_kadian_count(seq[, nk]) -> int
+//
+// Rejects strings shorter than the table's k-mer size with ValueError,
+// then lets CountingHash::get_kadian_count() fill in the result, which is
+// returned as a Python int.  nk defaults to 1 when omitted.
+//
static
PyObject *
-hashbits_load_tagset(khmer_KHashbits_Object * me, PyObject * args)
+count_get_kadian_count(khmer_KCountingHash_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ CountingHash * counting = me->counting;
- const char * filename = NULL;
- PyObject * clear_tags_o = NULL;
+ const char * long_str;
+ unsigned int nk = 1;
- if (!PyArg_ParseTuple(args, "s|O", &filename, &clear_tags_o)) {
+ if (!PyArg_ParseTuple(args, "s|I", &long_str, &nk)) {
return NULL;
}
- bool clear_tags = true;
- if (clear_tags_o && !PyObject_IsTrue(clear_tags_o)) {
- clear_tags = false;
- }
-
- try {
- hashbits->load_tagset(filename, clear_tags);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ if (strlen(long_str) < counting->ksize()) {
+ PyErr_SetString(PyExc_ValueError,
+ "string length must >= the hashtable k-mer size");
return NULL;
}
- Py_RETURN_NONE;
+ BoundedCounterType kad = 0;
+
+ // kad is an output argument of the C++ call.
+ counting->get_kadian_count(long_str, kad, nk);
+
+ return Py_BuildValue("i", kad);
}
+//
+// get_raw_tables() -> list of memoryview
+//
+// Wraps each raw table of the CountingHash in a read-only buffer
+// (PyBUF_FULL_RO) and returns the resulting memoryviews in a list, one per
+// table size reported by get_tablesizes().  `args` is not inspected.
+//
+// Returns NULL with a Python exception set on failure; the partially
+// built list is released in that case.
+//
static
PyObject *
-hashbits_save_tagset(khmer_KHashbits_Object * me, PyObject * args)
+count_get_raw_tables(khmer_KCountingHash_Object * self, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
-
- const char * filename = NULL;
+ CountingHash * counting = self->counting;
- if (!PyArg_ParseTuple(args, "s", &filename)) {
- return NULL;
- }
+ khmer::Byte ** table_ptrs = counting->get_raw_tables();
+ std::vector<HashIntoType> sizes = counting->get_tablesizes();
- try {
- hashbits->save_tagset(filename);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
- return NULL;
+ PyObject * raw_tables = PyList_New(sizes.size());
+ if (raw_tables == NULL) {
+ return NULL;
+ }
+ for (unsigned int i=0; i<sizes.size(); ++i) {
+ Py_buffer buffer;
+ int res = PyBuffer_FillInfo(&buffer, NULL, table_ptrs[i], sizes[i], 0, PyBUF_FULL_RO);
+ if (res == -1) {
+ // Unfilled slots are NULL; releasing the list handles them.
+ Py_DECREF(raw_tables);
+ return NULL;
+ }
+ // PyMemoryView_FromBuffer returns NULL on failure, so test the
+ // pointer itself rather than running a type check on it.
+ PyObject * buf = PyMemoryView_FromBuffer(&buffer);
+ if (buf == NULL) {
+ Py_DECREF(raw_tables);
+ return NULL;
+ }
+ PyList_SET_ITEM(raw_tables, i, buf);
}
- Py_RETURN_NONE;
+ return raw_tables;
}
+//
+// set_use_bigcount(flag) -> None
+//
+// Evaluates the truth value of `flag` and passes it to
+// CountingHash::set_use_bigcount().  A failing truth test propagates the
+// Python exception.
+//
static
PyObject *
-hashbits_save_subset_partitionmap(khmer_KHashbits_Object * me, PyObject * args)
+count_set_use_bigcount(khmer_KCountingHash_Object * me, PyObject * args)
{
- const char * filename = NULL;
- PyObject * subset_obj = NULL;
+ CountingHash * counting = me->counting;
- if (!PyArg_ParseTuple(args, "Os", &subset_obj, &filename)) {
+ PyObject * x;
+ if (!PyArg_ParseTuple(args, "O", &x)) {
return NULL;
}
-
- SubsetPartition * subset_p;
- subset_p = (SubsetPartition *) PyCObject_AsVoidPtr(subset_obj);
-
- Py_BEGIN_ALLOW_THREADS
-
- try {
- subset_p->save_partitionmap(filename);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ // PyObject_IsTrue returns -1 on error.
+ int setme = PyObject_IsTrue(x);
+ if (setme < 0) {
return NULL;
}
-
- Py_END_ALLOW_THREADS
+ counting->set_use_bigcount((bool)setme);
Py_RETURN_NONE;
}
+//
+// get_use_bigcount() -> bool
+//
+// Takes no arguments; returns CountingHash::get_use_bigcount() as a
+// Python bool.
+//
static
PyObject *
-hashbits_load_subset_partitionmap(khmer_KHashbits_Object * me, PyObject * args)
+count_get_use_bigcount(khmer_KCountingHash_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
-
- const char * filename = NULL;
+ CountingHash * counting = me->counting;
- if (!PyArg_ParseTuple(args, "s", &filename)) {
+ if (!PyArg_ParseTuple(args, "")) {
return NULL;
}
- SubsetPartition * subset_p;
- try {
- subset_p = new SubsetPartition(hashbits);
- } catch (std::bad_alloc &e) {
- return PyErr_NoMemory();
- }
+ bool val = counting->get_use_bigcount();
- bool fail = false;
- std::string err;
+ return PyBool_FromLong((int)val);
+}
- Py_BEGIN_ALLOW_THREADS
+//
+// get_min_count(seq) -> int
+//
+// Rejects strings shorter than the table's k-mer size with ValueError,
+// otherwise returns CountingHash::get_min_count(seq) as a Python int.
+//
+static
+PyObject *
+count_get_min_count(khmer_KCountingHash_Object * me, PyObject * args)
+{
+ CountingHash * counting = me->counting;
- try {
- subset_p->load_partitionmap(filename);
- } catch (khmer_file_exception &e) {
- fail = true;
- err = e.what();
- }
+ const char * long_str;
- Py_END_ALLOW_THREADS
+ if (!PyArg_ParseTuple(args, "s", &long_str)) {
+ return NULL;
+ }
- if (fail) {
- PyErr_SetString(PyExc_IOError, err.c_str());
- delete subset_p;
+ if (strlen(long_str) < counting->ksize()) {
+ PyErr_SetString(PyExc_ValueError,
+ "string length must >= the hashtable k-mer size");
return NULL;
- } else {
- return PyCObject_FromVoidPtr(subset_p, free_subset_partition_info);
}
+
+ // Widen the bounded counter before handing it to PyLong_FromLong.
+ BoundedCounterType c = counting->get_min_count(long_str);
+ unsigned int N = c;
+
+ return PyLong_FromLong(N);
}
+//
+// get_max_count(seq) -> int
+//
+// Rejects strings shorter than the table's k-mer size with ValueError,
+// otherwise returns CountingHash::get_max_count(seq) as a Python int.
+//
static
PyObject *
-hashbits__set_tag_density(khmer_KHashbits_Object * me, PyObject * args)
+count_get_max_count(khmer_KCountingHash_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ CountingHash * counting = me->counting;
- unsigned int d;
- if (!PyArg_ParseTuple(args, "I", &d)) {
+ const char * long_str;
+
+ if (!PyArg_ParseTuple(args, "s", &long_str)) {
+ return NULL;
+ }
+
+ if (strlen(long_str) < counting->ksize()) {
+ PyErr_SetString(PyExc_ValueError,
+ "string length must >= the hashtable k-mer size");
return NULL;
}
- hashbits->_set_tag_density(d);
+ // Widen the bounded counter before handing it to PyLong_FromLong.
+ BoundedCounterType c = counting->get_max_count(long_str);
+ unsigned int N = c;
- Py_RETURN_NONE;
+ return PyLong_FromLong(N);
}
+//
+// output_fasta_kmer_pos_freq(infile, outfile) -> 0
+//
+// Parses two filenames and forwards them to
+// CountingHash::output_fasta_kmer_pos_freq().  Always returns the Python
+// integer 0 once the call completes; no C++ exception is caught here.
+//
static
PyObject *
-hashbits__get_tag_density(khmer_KHashbits_Object * me, PyObject * args)
+count_output_fasta_kmer_pos_freq(khmer_KCountingHash_Object * me,
+ PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ CountingHash * counting = me->counting;
- if (!PyArg_ParseTuple(args, "")) {
+ const char * infile;
+ const char * outfile;
+
+ if (!PyArg_ParseTuple(args, "ss", &infile, &outfile)) {
return NULL;
}
- unsigned int d = hashbits->_get_tag_density();
+ counting->output_fasta_kmer_pos_freq(infile, outfile);
- return PyLong_FromLong(d);
+ return PyLong_FromLong(0);
}
+//
+// fasta_count_kmers_by_position(inputfile, max_read_len, limit_by_count)
+//     -> list of int (length max_read_len)
+//
+// Validates that max_read_len fits in 32 bits and limit_by_count in 16
+// bits (ValueError otherwise), calls
+// CountingHash::fasta_count_kmers_by_position(), and copies the returned
+// array into a new Python list.  The C++ array is released with delete[]
+// on every path after the call.
+//
+// Returns NULL with a Python exception set on failure; the partially
+// built list is released in that case.
+//
static
PyObject *
-hashbits__validate_subset_partitionmap(khmer_KHashbits_Object * me,
- PyObject * args)
+count_fasta_count_kmers_by_position(khmer_KCountingHash_Object * me,
+ PyObject * args)
{
- PyObject * subset_obj = NULL;
+ CountingHash * counting = me->counting;
+
+ const char * inputfile;
+ unsigned int max_read_len = 0;
+ long max_read_len_long;
+ int limit_by_count_int;
+
+ if (!PyArg_ParseTuple(args, "sli", &inputfile, &max_read_len_long,
+ &limit_by_count_int)) {
+ return NULL;
+ }
+ if (max_read_len_long < 0 || max_read_len_long >= pow(2, 32)) {
+ PyErr_SetString(
+ PyExc_ValueError,
+ "The 2nd argument must be positive and less than 2^32");
+ return NULL;
+ }
+ if (limit_by_count_int < 0 || limit_by_count_int >= pow(2, 16)) {
+ PyErr_SetString(
+ PyExc_ValueError,
+ "The 3rd argument must be positive and less than 2^16");
+ return NULL;
+ }
+ max_read_len = (unsigned int) max_read_len_long;
+
+ unsigned long long * counts;
+ counts = counting->fasta_count_kmers_by_position(inputfile, max_read_len,
+ (unsigned short) limit_by_count_int);
- if (!PyArg_ParseTuple(args, "O", &subset_obj)) {
+ PyObject * x = PyList_New(max_read_len);
+ if (x == NULL) {
+ delete[] counts;
return NULL;
}
- SubsetPartition * subset_p;
- subset_p = (SubsetPartition *) PyCObject_AsVoidPtr(subset_obj);
- subset_p->_validate_pmap();
+ for (unsigned int i = 0; i < max_read_len; i++) {
+ // Check the new int before storing it: a NULL item would otherwise
+ // end up in the list without any error being reported.
+ PyObject * item = PyLong_FromUnsignedLongLong(counts[i]);
+ if (item == NULL) {
+ delete[] counts;
+ Py_DECREF(x);
+ return NULL;
+ }
+ // The list is freshly created and i is in range, so the unchecked
+ // store is safe; it takes over the reference to item.
+ PyList_SET_ITEM(x, i, item);
+ }
- Py_RETURN_NONE;
+ delete[] counts;
+
+ return x;
}
+//
+// abundance_distribution_with_reads_parser(read_parser, tracking)
+//     -> list of int (length MAX_BIGCOUNT + 1)
+//
+// Takes a ReadParser object and a Hashbits object (both type-checked by
+// "O!"), runs CountingHash::abundance_distribution() with the GIL
+// released, and copies the returned array into a new Python list.  The
+// array is released with delete[].
+//
static
PyObject *
-hashbits_set_partition_id(khmer_KHashbits_Object * me, PyObject * args)
+count_abundance_distribution_with_reads_parser(khmer_KCountingHash_Object * me,
+ PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ CountingHash * counting = me->counting;
- const char * kmer = NULL;
- PartitionID p = 0;
+ khmer :: python :: khmer_ReadParser_Object * rparser_obj = NULL;
+ khmer_KHashbits_Object *tracking_obj = NULL;
- if (!PyArg_ParseTuple(args, "sI", &kmer, &p)) {
+ if (!PyArg_ParseTuple(args, "O!O!", &python::khmer_ReadParser_Type,
+ &rparser_obj, &khmer_KHashbits_Type, &tracking_obj)) {
+ return NULL;
+ }
+
+ read_parsers:: IParser * rparser = rparser_obj->parser;
+ Hashbits * hashbits = tracking_obj->hashbits;
+
+ HashIntoType * dist = NULL;
+
+ // NOTE(review): no try/catch here, unlike count_abundance_distribution;
+ // a C++ exception would escape with the GIL released -- confirm whether
+ // this overload can throw.
+ Py_BEGIN_ALLOW_THREADS
+ dist = counting->abundance_distribution(rparser, hashbits);
+ Py_END_ALLOW_THREADS
+
+ PyObject * x = PyList_New(MAX_BIGCOUNT + 1);
+ if (x == NULL) {
+ delete[] dist;
return NULL;
}
+ for (int i = 0; i < MAX_BIGCOUNT + 1; i++) {
+ PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(dist[i]));
+ }
- hashbits->partition->set_partition_id(kmer, p);
-
- Py_RETURN_NONE;
+ delete[] dist;
+ return x;
}
+//
+// abundance_distribution(filename, tracking)
+//     -> list of int (length MAX_BIGCOUNT + 1)
+//
+// Takes a filename and a Hashbits object (type-checked by "O!"), runs
+// CountingHash::abundance_distribution() with the GIL released, and copies
+// the returned array into a new Python list.  A khmer_file_exception is
+// reported as IOError.  The array is released with delete[].
+//
static
PyObject *
-hashbits_join_partitions(khmer_KHashbits_Object * me, PyObject * args)
+count_abundance_distribution(khmer_KCountingHash_Object * me, PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
-
- PartitionID p1 = 0, p2 = 0;
+ CountingHash * counting = me->counting;
- if (!PyArg_ParseTuple(args, "II", &p1, &p2)) {
+ const char * filename = NULL;
+ khmer_KHashbits_Object * tracking_obj = NULL;
+ if (!PyArg_ParseTuple(args, "sO!", &filename, &khmer_KHashbits_Type,
+ &tracking_obj)) {
return NULL;
}
- p1 = hashbits->partition->join_partitions(p1, p2);
-
- return PyLong_FromLong(p1);
-}
+ Hashbits * hashbits = tracking_obj->hashbits;
+ HashIntoType * dist = NULL;
-static
-PyObject *
-hashbits_get_partition_id(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
+ // The message is copied into a std::string because the pointer returned
+ // by e.what() is only valid while the exception object is alive, i.e.
+ // inside the catch block.  Python error state cannot be touched until
+ // the GIL is held again, hence the flag.
+ std::string result;
+ bool exception = false;
+ Py_BEGIN_ALLOW_THREADS
+ try {
+ dist = counting->abundance_distribution(filename, hashbits);
+ } catch (khmer_file_exception &e) {
+ exception = true;
+ result = e.what();
+ }
+ Py_END_ALLOW_THREADS
- const char * kmer = NULL;
+ if (exception) {
+ PyErr_SetString(PyExc_IOError, result.c_str());
+ return NULL;
+ }
- if (!PyArg_ParseTuple(args, "s", &kmer)) {
+ PyObject * x = PyList_New(MAX_BIGCOUNT + 1);
+ if (x == NULL) {
+ delete[] dist;
return NULL;
}
+ for (int i = 0; i < MAX_BIGCOUNT + 1; i++) {
+ PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(dist[i]));
+ }
- PartitionID partition_id;
- partition_id = hashbits->partition->get_partition_id(kmer);
+ delete[] dist;
- return PyLong_FromLong(partition_id);
+ return x;
}
+//
+// do_subset_partition_with_abundance(min_count, max_count
+//     [, start_kmer, end_kmer, break_on_stop_tags, stop_big_traversals])
+//     -> SubsetPartition object
+//
+// Creates a SubsetPartition over this counting table, runs
+// do_partition_with_abundance() on it with the GIL released, and returns
+// it wrapped in a new khmer_KSubsetPartition_Object, which then holds the
+// C++ pointer.
+//
+// Returns NULL with a Python exception set on failure: bad arguments, a
+// failing truth test on either flag, or MemoryError if the C++ side
+// throws std::bad_alloc.
+//
static
PyObject *
-hashbits_is_single_partition(khmer_KHashbits_Object * me, PyObject * args)
+count_do_subset_partition_with_abundance(khmer_KCountingHash_Object * me,
+ PyObject * args)
{
- Hashbits * hashbits = me->hashbits;
+ CountingHash * counting = me->counting;
- const char * seq = NULL;
+ HashIntoType start_kmer = 0, end_kmer = 0;
+ PyObject * break_on_stop_tags_o = NULL;
+ PyObject * stop_big_traversals_o = NULL;
+ BoundedCounterType min_count, max_count;
- if (!PyArg_ParseTuple(args, "s", &seq)) {
+ if (!PyArg_ParseTuple(args, "HH|KKOO",
+ &min_count, &max_count,
+ &start_kmer, &end_kmer,
+ &break_on_stop_tags_o,
+ &stop_big_traversals_o)) {
return NULL;
}
- bool v = hashbits->partition->is_single_partition(seq);
-
- PyObject * val;
- if (v) {
- val = Py_True;
- } else {
- val = Py_False;
+ // PyObject_IsTrue returns -1 on error; propagate that instead of
+ // treating it as "true".
+ bool break_on_stop_tags = false;
+ if (break_on_stop_tags_o) {
+ int truth = PyObject_IsTrue(break_on_stop_tags_o);
+ if (truth < 0) {
+ return NULL;
+ }
+ break_on_stop_tags = (truth != 0);
+ }
+ bool stop_big_traversals = false;
+ if (stop_big_traversals_o) {
+ int truth = PyObject_IsTrue(stop_big_traversals_o);
+ if (truth < 0) {
+ return NULL;
+ }
+ stop_big_traversals = (truth != 0);
}
- Py_INCREF(val);
-
- return val;
-}
-static
-PyObject *
-hashbits_divide_tags_into_subsets(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
+ // The try/catch sits inside the GIL-released region so the exception
+ // cannot skip Py_END_ALLOW_THREADS; the Python error is raised only
+ // after the GIL has been re-acquired.
+ SubsetPartition * subset_p = NULL;
+ bool out_of_memory = false;
+ Py_BEGIN_ALLOW_THREADS
+ try {
+ subset_p = new SubsetPartition(counting);
+ subset_p->do_partition_with_abundance(start_kmer, end_kmer,
+ min_count, max_count,
+ break_on_stop_tags,
+ stop_big_traversals);
+ } catch (std::bad_alloc &e) {
+ // Still NULL if the allocation itself failed.
+ delete subset_p;
+ subset_p = NULL;
+ out_of_memory = true;
+ }
+ Py_END_ALLOW_THREADS
+ if (out_of_memory) {
+ return PyErr_NoMemory();
+ }
- unsigned int subset_size = 0;
+ khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\
+ PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type);
- if (!PyArg_ParseTuple(args, "I", &subset_size)) {
+ if (subset_obj == NULL) {
+ delete subset_p;
return NULL;
}
- SeenSet divvy;
- hashbits->divide_tags_into_subsets(subset_size, divvy);
-
- PyObject * x = PyList_New(divvy.size());
- unsigned int i = 0;
- for (SeenSet::const_iterator si = divvy.begin(); si != divvy.end();
- si++, i++) {
- PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(*si));
- }
+ subset_obj->subset = subset_p;
- return x;
+ return (PyObject *) subset_obj;
}
-static
-PyObject *
-hashbits_count_kmers_within_radius(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
-
- const char * kmer = NULL;
- unsigned int radius = 0;
- unsigned int max_count = 0;
+// Method table for the CountingHash Python type: maps each Python-visible
+// method name to its count_* wrapper.  Every entry uses METH_VARARGS.
+static PyMethodDef khmer_counting_methods[] = {
+ { "set_use_bigcount", (PyCFunction)count_set_use_bigcount, METH_VARARGS, "" },
+ { "get_use_bigcount", (PyCFunction)count_get_use_bigcount, METH_VARARGS, "" },
+ { "output_fasta_kmer_pos_freq", (PyCFunction)count_output_fasta_kmer_pos_freq, METH_VARARGS, "" },
+ { "get_min_count", (PyCFunction)count_get_min_count, METH_VARARGS, "Get the smallest count of all the k-mers in the string" },
+ { "get_max_count", (PyCFunction)count_get_max_count, METH_VARARGS, "Get the largest count of all the k-mers in the string" },
+ { "get_kadian_count", (PyCFunction)count_get_kadian_count, METH_VARARGS, "Get the kadian (abundance of k-th rank-ordered k-mer) of the k-mer counts in the string" },
+ { "trim_on_abundance", (PyCFunction)count_trim_on_abundance, METH_VARARGS, "Trim on >= abundance" },
+ // NOTE(review): this docstring is identical to trim_on_abundance's and
+ // looks copy-pasted; confirm the intended wording for the "below" variant.
+ { "trim_below_abundance", (PyCFunction)count_trim_below_abundance, METH_VARARGS, "Trim on >= abundance" },
+ { "find_spectral_error_positions", (PyCFunction)count_find_spectral_error_positions, METH_VARARGS, "Identify positions of low-abundance k-mers" },
+ { "abundance_distribution", (PyCFunction)count_abundance_distribution, METH_VARARGS, "" },
+ { "abundance_distribution_with_reads_parser", (PyCFunction)count_abundance_distribution_with_reads_parser, METH_VARARGS, "" },
+ { "fasta_count_kmers_by_position", (PyCFunction)count_fasta_count_kmers_by_position, METH_VARARGS, "" },
+ { "fasta_dump_kmers_by_abundance", (PyCFunction)count_fasta_dump_kmers_by_abundance, METH_VARARGS, "" },
+ {
+ "get_raw_tables", (PyCFunction)count_get_raw_tables,
+ METH_VARARGS, "Get a list of the raw tables as memoryview objects"
+ },
+ { "do_subset_partition_with_abundance", (PyCFunction)count_do_subset_partition_with_abundance, METH_VARARGS, "" },
+ {NULL, NULL, 0, NULL} /* sentinel */
+};
- if (!PyArg_ParseTuple(args, "sI|I", &kmer, &radius, &max_count)) {
- return NULL;
- }
+static PyObject* _new_counting_hash(PyTypeObject * type, PyObject * args,
+ PyObject * kwds);
- unsigned int n;
+//
+// Python type object for "_khmer.CountingHash".
+//
+// Instances are khmer_KCountingHash_Object structs created by
+// _new_counting_hash (tp_new) and torn down by khmer_counting_dealloc
+// (tp_dealloc).  Py_TPFLAGS_BASETYPE allows subclassing from Python.
+// tp_base is left 0 in this initializer.
+//
+static PyTypeObject khmer_KCountingHash_Type
+CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountingHash_Object")
+= {
+ PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */
+ "_khmer.CountingHash", /*tp_name*/
+ sizeof(khmer_KCountingHash_Object), /*tp_basicsize*/
+ 0, /*tp_itemsize*/
+ (destructor)khmer_counting_dealloc, /*tp_dealloc*/
+ 0, /*tp_print*/
+ 0, /*tp_getattr*/
+ 0, /*tp_setattr*/
+ 0, /*tp_compare*/
+ 0, /*tp_repr*/
+ 0, /*tp_as_number*/
+ 0, /*tp_as_sequence*/
+ 0, /*tp_as_mapping*/
+ 0, /*tp_hash */
+ 0, /*tp_call*/
+ 0, /*tp_str*/
+ 0, /*tp_getattro*/
+ 0, /*tp_setattro*/
+ 0, /*tp_as_buffer*/
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
+ "counting hash object", /* tp_doc */
+ 0, /* tp_traverse */
+ 0, /* tp_clear */
+ 0, /* tp_richcompare */
+ 0, /* tp_weaklistoffset */
+ 0, /* tp_iter */
+ 0, /* tp_iternext */
+ khmer_counting_methods, /* tp_methods */
+ 0, /* tp_members */
+ 0, /* tp_getset */
+ 0, /* tp_base */
+ 0, /* tp_dict */
+ 0, /* tp_descr_get */
+ 0, /* tp_descr_set */
+ 0, /* tp_dictoffset */
+ 0, /* tp_init */
+ 0, /* tp_alloc */
+ _new_counting_hash, /* tp_new */
+};
- Py_BEGIN_ALLOW_THREADS
+// True only when v's type is exactly khmer_KCountingHash_Type (pointer
+// comparison on Py_TYPE; Python-level subclasses do not match).
+#define is_counting_obj(v) (Py_TYPE(v) == &khmer_KCountingHash_Type)
- HashIntoType kmer_f, kmer_r;
- _hash(kmer, hashbits->ksize(), kmer_f, kmer_r);
- n = hashbits->count_kmers_within_radius(kmer_f, kmer_r, radius,
- max_count);
+//
+// new_hashtable
+//
- Py_END_ALLOW_THREADS
+//
+// new_hashtable(k, size) -> CountingHash object
+//
+// Module-level factory: builds a CountingHash(k, size) and returns it
+// wrapped in a new khmer_KCountingHash_Object.  Both the `counting` and
+// the embedded `khashtable.hashtable` pointers refer to the same C++
+// object.
+//
+// The C++ table is created before the Python wrapper so that a
+// std::bad_alloc leaves no half-initialised Python object behind.
+// Returns NULL with a Python exception set on failure.
+//
+static PyObject* new_hashtable(PyObject * self, PyObject * args)
+{
+ unsigned int k = 0;
+ unsigned long long size = 0;
- return PyLong_FromUnsignedLong(n);
-}
+ if (!PyArg_ParseTuple(args, "IK", &k, &size)) {
+ return NULL;
+ }
-static
-PyObject *
-hashbits_get_ksize(khmer_KHashbits_Object * me, PyObject * args)
-{
- Hashbits * hashbits = me->hashbits;
+ CountingHash * counting = NULL;
+ try {
+ counting = new CountingHash(k, size);
+ } catch (std::bad_alloc &e) {
+ return PyErr_NoMemory();
+ }
+
+ khmer_KCountingHash_Object * kcounting_obj = (khmer_KCountingHash_Object *) \
+ PyObject_New(khmer_KCountingHash_Object, &khmer_KCountingHash_Type);
- if (!PyArg_ParseTuple(args, "")) {
+ if (kcounting_obj == NULL) {
+ // The wrapper never took ownership, so free the table here.
+ delete counting;
return NULL;
}
- unsigned int k = hashbits->ksize();
+ kcounting_obj->counting = counting;
+ kcounting_obj->khashtable.hashtable = counting;
- return PyLong_FromLong(k);
+ return (PyObject *) kcounting_obj;
}
+//
+// _new_counting_hash
+//
-static
-PyObject *
-hashbits_get_hashsizes(khmer_KHashbits_Object * me, PyObject * args)
+//
+// tp_new for _khmer.CountingHash: CountingHash(k, sizes)
+//
+// Allocates the Python object through type->tp_alloc, parses k ("b") and a
+// list of table sizes, converts each list element (int, long or float) to
+// HashIntoType, and constructs the C++ CountingHash.  Floats are truncated
+// by the cast.  `kwds` is not used.
+//
+// Errors: TypeError for a non-numeric list element, MemoryError if the
+// table cannot be allocated; in both cases the new object is released.
+// If tp_alloc itself fails, its NULL result is returned as is.
+//
+static PyObject* _new_counting_hash(PyTypeObject * type, PyObject * args,
+ PyObject * kwds)
{
- Hashbits * hashbits = me->hashbits;
+ khmer_KCountingHash_Object * self;
- if (!PyArg_ParseTuple(args, "")) {
- return NULL;
- }
+ self = (khmer_KCountingHash_Object *)type->tp_alloc(type, 0);
- std::vector<HashIntoType> ts = hashbits->get_tablesizes();
+ if (self != NULL) {
+ WordLength k = 0;
+ PyListObject * sizes_list_o = NULL;
- PyObject * x = PyList_New(ts.size());
- for (size_t i = 0; i < ts.size(); i++) {
- PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(ts[i]));
+ if (!PyArg_ParseTuple(args, "bO!", &k, &PyList_Type, &sizes_list_o)) {
+ Py_DECREF(self);
+ return NULL;
+ }
+
+ std::vector<HashIntoType> sizes;
+ // NOTE(review): PyList_GET_SIZE is the unchecked macro form, so the
+ // -1 branch below looks unreachable -- confirm.
+ Py_ssize_t sizes_list_o_length = PyList_GET_SIZE(sizes_list_o);
+ if (sizes_list_o_length == -1) {
+ Py_DECREF(self);
+ PyErr_SetString(PyExc_ValueError, "error with hashtable primes!");
+ return NULL;
+ }
+ for (Py_ssize_t i = 0; i < sizes_list_o_length; i++) {
+ PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i);
+ // NOTE(review): conversion errors (e.g. a negative or oversized
+ // int) are not checked after PyLong_AsUnsignedLongLong.
+ if (PyLong_Check(size_o)) {
+ sizes.push_back((HashIntoType) PyLong_AsUnsignedLongLong(size_o));
+ } else if (PyInt_Check(size_o)) {
+ sizes.push_back((HashIntoType) PyInt_AsLong(size_o));
+ } else if (PyFloat_Check(size_o)) {
+ sizes.push_back((HashIntoType) PyFloat_AS_DOUBLE(size_o));
+ } else {
+ Py_DECREF(self);
+ PyErr_SetString(PyExc_TypeError,
+ "2nd argument must be a list of ints, longs, or floats");
+ return NULL;
+ }
+ }
+
+ try {
+ self->counting = new CountingHash(k, sizes);
+ } catch (std::bad_alloc &e) {
+ Py_DECREF(self);
+ return PyErr_NoMemory();
+ }
+ // The embedded base object points at the same C++ table.
+ self->khashtable.hashtable = (Hashtable *) self->counting;
}
- return x;
+ return (PyObject *) self;
}
+//
+// count_overlap(filename, other_hashbits) -> (n_unique, n_overlap, curve)
+//
+// Runs Hashbits::consume_fasta_overlap() on `filename` against a second
+// Hashbits object (type-checked by "O!") and returns a 3-tuple: the
+// table's n_unique_kmers(), its n_overlap_kmers(), and a 200-element list
+// holding curve[0][0..99] followed by curve[1][0..99].
+//
+// An InvalidStreamHandle is reported as IOError.  Returns NULL with a
+// Python exception set on failure.
+//
static
PyObject *
-hashbits_extract_unique_paths(khmer_KHashbits_Object * me, PyObject * args)
+hashbits_count_overlap(khmer_KHashbits_Object * me, PyObject * args)
{
Hashbits * hashbits = me->hashbits;
+ khmer_KHashbits_Object * ht2_argu;
+ const char * filename;
+ Hashbits * ht2;
- const char * sequence = NULL;
- unsigned int min_length = 0;
- float min_unique_f = 0;
- if (!PyArg_ParseTuple(args, "sIf", &sequence, &min_length, &min_unique_f)) {
+ if (!PyArg_ParseTuple(args, "sO!", &filename, &khmer_KHashbits_Type,
+ &ht2_argu)) {
return NULL;
}
- std::vector<std::string> results;
- hashbits->extract_unique_paths(sequence, min_length, min_unique_f, results);
+ ht2 = ht2_argu->hashbits;
- PyObject * x = PyList_New(results.size());
- if (x == NULL) {
+// call the C++ function, and trap signals => Python
+
+ unsigned long long n_consumed;
+ unsigned int total_reads;
+ HashIntoType curve[2][100];
+
+ try {
+ hashbits->consume_fasta_overlap(filename, curve, *ht2, total_reads, n_consumed);
+ } catch (InvalidStreamHandle &e) {
+ PyErr_SetString(PyExc_IOError, e.what());
return NULL;
}
- for (unsigned int i = 0; i < results.size(); i++) {
- PyList_SET_ITEM(x, i, PyBytes_FromString(results[i].c_str()));
- }
+ HashIntoType n = hashbits->n_unique_kmers();
+ HashIntoType n_overlap = hashbits->n_overlap_kmers();
- return x;
+ PyObject * x = PyList_New(200);
+ if (x == NULL) {
+ return NULL;
+ }
+
+ // Slots 0..99 take curve[0][*], slots 100..199 take curve[1][*].
+ for (unsigned int i = 0; i < 200; i++) {
+ PyObject * point = PyLong_FromUnsignedLongLong(curve[i / 100][i % 100]);
+ if (point == NULL) {
+ Py_DECREF(x);
+ return NULL;
+ }
+ PyList_SET_ITEM(x, i, point);
+ }
+ // "O" takes its own reference to x, so ours is dropped afterwards
+ // whether or not the tuple was built.
+ PyObject * ret = Py_BuildValue("KKO", n, n_overlap, x);
+ Py_DECREF(x);
+ return ret;
}
+//
+// update(other) -> None
+//
+// Takes another Hashbits object (type-checked by "O!") and calls
+// Hashbits::update_from() with it.  A khmer_exception from that call is
+// reported as ValueError.
+//
static
PyObject *
-hashbits_get_median_count(khmer_KHashbits_Object * me, PyObject * args)
+hashbits_update(khmer_KHashbits_Object * me, PyObject * args)
{
Hashbits * hashbits = me->hashbits;
+ Hashbits * other;
+ khmer_KHashbits_Object * other_o;
- const char * long_str;
-
- if (!PyArg_ParseTuple(args, "s", &long_str)) {
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KHashbits_Type, &other_o)) {
return NULL;
}
- if (strlen(long_str) < hashbits->ksize()) {
- PyErr_SetString(PyExc_ValueError,
- "string length must >= the hashtable k-mer size");
+ other = other_o->hashbits;
+
+ try {
+ hashbits->update_from(*other);
+ } catch (khmer_exception &e) {
+ PyErr_SetString(PyExc_ValueError, e.what());
return NULL;
}
- BoundedCounterType med = 0;
- float average = 0, stddev = 0;
-
- hashbits->get_median_count(long_str, med, average, stddev);
-
- return Py_BuildValue("iff", med, average, stddev);
+ Py_RETURN_NONE;
}
static PyMethodDef khmer_hashbits_methods[] = {
- { "extract_unique_paths", (PyCFunction)hashbits_extract_unique_paths, METH_VARARGS, "" },
- { "ksize", (PyCFunction)hashbits_get_ksize, METH_VARARGS, "" },
- { "hashsizes", (PyCFunction)hashbits_get_hashsizes, METH_VARARGS, "" },
- { "n_occupied", (PyCFunction)hashbits_n_occupied, METH_VARARGS, "Count the number of occupied bins" },
- { "n_unique_kmers", (PyCFunction)hashbits_n_unique_kmers, METH_VARARGS, "Count the number of unique kmers" },
- { "count", (PyCFunction)hashbits_count, METH_VARARGS, "Count the given kmer" },
{ "count_overlap", (PyCFunction)hashbits_count_overlap, METH_VARARGS, "Count overlap kmers in two datasets" },
- { "consume", (PyCFunction)hashbits_consume, METH_VARARGS, "Count all k-mers in the given string" },
- { "load_stop_tags", (PyCFunction)hashbits_load_stop_tags, METH_VARARGS, "" },
- { "save_stop_tags", (PyCFunction)hashbits_save_stop_tags, METH_VARARGS, "" },
- { "print_stop_tags", (PyCFunction)hashbits_print_stop_tags, METH_VARARGS, "" },
- { "print_tagset", (PyCFunction)hashbits_print_tagset, METH_VARARGS, "" },
- { "get", (PyCFunction)hashbits_get, METH_VARARGS, "Get the count for the given k-mer" },
- { "calc_connected_graph_size", (PyCFunction)hashbits_calc_connected_graph_size, METH_VARARGS, "" },
- { "kmer_degree", (PyCFunction)hashbits_kmer_degree, METH_VARARGS, "" },
- { "trim_on_stoptags", (PyCFunction)hashbits_trim_on_stoptags, METH_VARARGS, "" },
- { "identify_stoptags_by_position", (PyCFunction)hashbits_identify_stoptags_by_position, METH_VARARGS, "" },
- { "do_subset_partition", (PyCFunction)hashbits_do_subset_partition, METH_VARARGS, "" },
- { "find_all_tags", (PyCFunction)hashbits_find_all_tags, METH_VARARGS, "" },
- { "assign_partition_id", (PyCFunction)hashbits_assign_partition_id, METH_VARARGS, "" },
- { "output_partitions", (PyCFunction)hashbits_output_partitions, METH_VARARGS, "" },
- { "find_unpart", (PyCFunction)hashbits_find_unpart, METH_VARARGS, "" },
- { "filter_if_present", (PyCFunction)hashbits_filter_if_present, METH_VARARGS, "" },
- { "add_tag", (PyCFunction)hashbits_add_tag, METH_VARARGS, "" },
- { "add_stop_tag", (PyCFunction)hashbits_add_stop_tag, METH_VARARGS, "" },
- { "get_stop_tags", (PyCFunction)hashbits_get_stop_tags, METH_VARARGS, "" },
- { "get_tagset", (PyCFunction)hashbits_get_tagset, METH_VARARGS, "" },
- { "load", (PyCFunction)hashbits_load, METH_VARARGS, "" },
- { "save", (PyCFunction)hashbits_save, METH_VARARGS, "" },
- { "load_tagset", (PyCFunction)hashbits_load_tagset, METH_VARARGS, "" },
- { "save_tagset", (PyCFunction)hashbits_save_tagset, METH_VARARGS, "" },
- { "n_tags", (PyCFunction)hashbits_n_tags, METH_VARARGS, "" },
- { "divide_tags_into_subsets", (PyCFunction)hashbits_divide_tags_into_subsets, METH_VARARGS, "" },
- { "load_partitionmap", (PyCFunction)hashbits_load_partitionmap, METH_VARARGS, "" },
- { "save_partitionmap", (PyCFunction)hashbits_save_partitionmap, METH_VARARGS, "" },
- { "_validate_partitionmap", (PyCFunction)hashbits__validate_partitionmap, METH_VARARGS, "" },
- { "_get_tag_density", (PyCFunction)hashbits__get_tag_density, METH_VARARGS, "" },
- { "_set_tag_density", (PyCFunction)hashbits__set_tag_density, METH_VARARGS, "" },
- { "consume_fasta", (PyCFunction)hashbits_consume_fasta, METH_VARARGS, "Count all k-mers in a given file" },
- { "consume_fasta_with_reads_parser", (PyCFunction)hashbits_consume_fasta_with_reads_parser, METH_VARARGS, "Count all k-mers in a given file" },
- { "consume_fasta_and_tag", (PyCFunction)hashbits_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" },
{
- "consume_fasta_and_tag_with_reads_parser", (PyCFunction)hashbits_consume_fasta_and_tag_with_reads_parser,
- METH_VARARGS, "Count all k-mers using a given reads parser"
+ "update",
+ (PyCFunction) hashbits_update, METH_VARARGS,
+ "a set update: update this nodegraph with all the entries from the other"
},
- { "consume_fasta_and_traverse", (PyCFunction)hashbits_consume_fasta_and_traverse, METH_VARARGS, "" },
- { "consume_fasta_and_tag_with_stoptags", (PyCFunction)hashbits_consume_fasta_and_tag_with_stoptags, METH_VARARGS, "Count all k-mers in a given file" },
- { "consume_partitioned_fasta", (PyCFunction)hashbits_consume_partitioned_fasta, METH_VARARGS, "Count all k-mers in a given file" },
- { "join_partitions_by_path", (PyCFunction)hashbits_join_partitions_by_path, METH_VARARGS, "" },
- { "merge_subset", (PyCFunction)hashbits_merge_subset, METH_VARARGS, "" },
- { "merge_subset_from_disk", (PyCFunction)hashbits_merge_from_disk, METH_VARARGS, "" },
- { "count_partitions", (PyCFunction)hashbits_count_partitions, METH_VARARGS, "" },
- { "subset_count_partitions", (PyCFunction)hashbits_subset_count_partitions, METH_VARARGS, "" },
- { "subset_partition_size_distribution", (PyCFunction)hashbits_subset_partition_size_distribution, METH_VARARGS, "" },
- { "save_subset_partitionmap", (PyCFunction)hashbits_save_subset_partitionmap, METH_VARARGS },
- { "load_subset_partitionmap", (PyCFunction)hashbits_load_subset_partitionmap, METH_VARARGS },
- { "_validate_subset_partitionmap", (PyCFunction)hashbits__validate_subset_partitionmap, METH_VARARGS, "" },
- { "set_partition_id", (PyCFunction)hashbits_set_partition_id, METH_VARARGS, "" },
- { "join_partitions", (PyCFunction)hashbits_join_partitions, METH_VARARGS, "" },
- { "get_partition_id", (PyCFunction)hashbits_get_partition_id, METH_VARARGS, "" },
- { "is_single_partition", (PyCFunction)hashbits_is_single_partition, METH_VARARGS, "" },
- { "count_kmers_within_radius", (PyCFunction)hashbits_count_kmers_within_radius, METH_VARARGS, "" },
- { "traverse_from_tags", (PyCFunction)hashbits_traverse_from_tags, METH_VARARGS, "" },
- { "repartition_largest_partition", (PyCFunction)hashbits_repartition_largest_partition, METH_VARARGS, "" },
- { "get_median_count", (PyCFunction)hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
{NULL, NULL, 0, NULL} /* sentinel */
};
@@ -3484,19 +3346,14 @@ static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args,
try {
self->hashbits = new Hashbits(k, sizes);
} catch (std::bad_alloc &e) {
+ Py_DECREF(self);
return PyErr_NoMemory();
}
+ self->khashtable.hashtable = self->hashbits;
}
return (PyObject *) self;
}
-// there are no attributes that we need at this time, so we'll just return 0
-static int khmer_hashbits_init(khmer_KHashbits_Object * self, PyObject * args,
- PyObject * kwds)
-{
- return 0;
-}
-
#define is_hashbits_obj(v) (Py_TYPE(v) == &khmer_KHashbits_Type)
////////////////////////////////////////////////////////////////////////////
@@ -3724,13 +3581,10 @@ static PyMethodDef khmer_subset_methods[] = {
// LabelHash addition
typedef struct {
- //PyObject_HEAD
- khmer_KHashbits_Object khashbits;
+ PyObject_HEAD
LabelHash * labelhash;
} khmer_KLabelHash_Object;
-static int khmer_labelhash_init(khmer_KLabelHash_Object * self, PyObject *args,
- PyObject *kwds);
static PyObject * khmer_labelhash_new(PyTypeObject * type, PyObject *args,
PyObject *kwds);
@@ -3748,9 +3602,6 @@ static void khmer_labelhash_dealloc(khmer_KLabelHash_Object * obj)
Py_TYPE(obj)->tp_free((PyObject*)obj);
}
-// a little weird; we don't actually want to call Hashbits' new method. Rather, we
-// define our own new method, and redirect the base's hashbits object to point to our
-// labelhash object
static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args,
PyObject *kwds)
{
@@ -3758,59 +3609,38 @@ static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args,
self = (khmer_KLabelHash_Object*)type->tp_alloc(type, 0);
if (self != NULL) {
- WordLength k = 0;
- PyListObject * sizes_list_o = NULL;
+ PyObject * hashtable_o;
+ khmer::Hashtable * hashtable = NULL;
- if (!PyArg_ParseTuple(args, "bO!", &k, &PyList_Type, &sizes_list_o)) {
+ if (!PyArg_ParseTuple(args, "O", &hashtable_o)) {
Py_DECREF(self);
return NULL;
}
- std::vector<HashIntoType> sizes;
- Py_ssize_t sizes_list_o_length = PyList_GET_SIZE(sizes_list_o);
- for (Py_ssize_t i = 0; i < sizes_list_o_length; i++) {
- PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i);
- if (PyLong_Check(size_o)) {
- sizes.push_back((HashIntoType) PyLong_AsUnsignedLongLong(size_o));
- } else if (PyInt_Check(size_o)) {
- sizes.push_back((HashIntoType) PyInt_AsLong(size_o));
- } else if (PyFloat_Check(size_o)) {
- sizes.push_back((HashIntoType) PyFloat_AS_DOUBLE(size_o));
- } else {
- Py_DECREF(self);
- PyErr_SetString(PyExc_TypeError,
- "2nd argument must be a list of ints, longs, or floats");
- return NULL;
- }
+ if (PyObject_TypeCheck(hashtable_o, &khmer_KHashbits_Type)) {
+ khmer_KHashbits_Object * kho = (khmer_KHashbits_Object *) hashtable_o;
+ hashtable = kho->hashbits;
+ } else if (PyObject_TypeCheck(hashtable_o, &khmer_KCountingHash_Type)) {
+ khmer_KCountingHash_Object * cho = (khmer_KCountingHash_Object *) hashtable_o;
+ hashtable = cho->counting;
+ } else {
+ PyErr_SetString(PyExc_ValueError,
+ "graph object must be a NodeGraph or CountGraph");
+ Py_DECREF(self);
+ return NULL;
}
-
- // We want the hashbits pointer in the base class to point to our labelhash,
- // so that the KHashbits methods are called on the correct object (a LabelHash)
try {
- self->labelhash = new LabelHash(k, sizes);
+ self->labelhash = new LabelHash(hashtable);
} catch (std::bad_alloc &e) {
Py_DECREF(self);
return PyErr_NoMemory();
}
- self->khashbits.hashbits = (Hashbits *)self->labelhash;
}
return (PyObject *) self;
}
-static int khmer_labelhash_init(khmer_KLabelHash_Object * self, PyObject *args,
- PyObject *kwds)
-{
- if (khmer_KHashbits_Type.tp_init((PyObject *)self, args, kwds) < 0) {
- return -1;
- }
- //std::cout << "testing my pointer ref to hashbits: " << self->khashbits.hashbits->n_tags() << std::endl;
- //std::cout << "hashbits: " << self->khashbits.hashbits << std::endl;
- //std::cout << "labelhash: " << self->labelhash << std::endl;
- return 0;
-}
-
static
PyObject *
labelhash_get_label_dict(khmer_KLabelHash_Object * me, PyObject * args)
@@ -3858,8 +3688,6 @@ labelhash_consume_fasta_and_tag_with_labels(khmer_KLabelHash_Object * me,
try {
hb->consume_fasta_and_tag_with_labels(filename, total_reads,
n_consumed);
- } catch (_khmer_signal &e) {
- exc = e.get_message().c_str();
} catch (khmer_file_exception &e) {
exc = e.what();
}
@@ -3894,10 +3722,6 @@ labelhash_consume_partitioned_fasta_and_tag_with_labels(
try {
labelhash->consume_partitioned_fasta_and_tag_with_labels(filename,
total_reads, n_consumed);
- } catch (_khmer_signal &e) {
- PyErr_SetString(PyExc_IOError,
- "error parsing in consume_partitioned_fasta_and_tag_with_labels");
- return NULL;
} catch (khmer_file_exception &e) {
PyErr_SetString(PyExc_IOError, e.what());
return NULL;
@@ -3918,12 +3742,7 @@ labelhash_consume_sequence_and_tag_with_labels(khmer_KLabelHash_Object * me,
}
unsigned long long n_consumed = 0;
Label * the_label = hb->check_and_allocate_label(c);
-
- try {
- hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label);
- } catch (_khmer_signal &e) {
- return NULL;
- }
+ hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label);
return Py_BuildValue("K", n_consumed);
}
@@ -3945,7 +3764,7 @@ labelhash_sweep_label_neighborhood(khmer_KLabelHash_Object * me,
return NULL;
}
- unsigned int range = (2 * hb->_get_tag_density()) + 1;
+ unsigned int range = (2 * hb->graph->_get_tag_density()) + 1;
if (r >= 0) {
range = r;
}
@@ -3959,7 +3778,7 @@ labelhash_sweep_label_neighborhood(khmer_KLabelHash_Object * me,
stop_big_traversals = true;
}
- if (strlen(seq) < hb->ksize()) {
+ if (strlen(seq) < hb->graph->ksize()) {
PyErr_SetString(PyExc_ValueError,
"string length must >= the hashtable k-mer size");
return NULL;
@@ -3968,23 +3787,14 @@ labelhash_sweep_label_neighborhood(khmer_KLabelHash_Object * me,
//std::pair<TagLabelPtrPair::iterator, TagLabelPtrPair::iterator> ret;
LabelPtrSet found_labels;
- bool exc_raised = false;
//unsigned int num_traversed = 0;
//Py_BEGIN_ALLOW_THREADS
- try {
- hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags,
+ hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags,
stop_big_traversals);
- } catch (_khmer_signal &e) {
- exc_raised = true;
- }
//Py_END_ALLOW_THREADS
//printf("...%u kmers traversed\n", num_traversed);
- if (exc_raised) {
- return NULL;
- }
-
PyObject * x = PyList_New(found_labels.size());
LabelPtrSet::const_iterator si;
unsigned long long i = 0;
@@ -4017,7 +3827,7 @@ labelhash_sweep_tag_neighborhood(khmer_KLabelHash_Object * me, PyObject * args)
return NULL;
}
- unsigned int range = (2 * labelhash->_get_tag_density()) + 1;
+ unsigned int range = (2 * labelhash->graph->_get_tag_density()) + 1;
if (r >= 0) {
range = r;
}
@@ -4031,7 +3841,7 @@ labelhash_sweep_tag_neighborhood(khmer_KLabelHash_Object * me, PyObject * args)
stop_big_traversals = true;
}
- if (strlen(seq) < labelhash->ksize()) {
+ if (strlen(seq) < labelhash->graph->ksize()) {
PyErr_SetString(PyExc_ValueError,
"string length must >= the hashtable k-mer size");
return NULL;
@@ -4041,8 +3851,10 @@ labelhash_sweep_tag_neighborhood(khmer_KLabelHash_Object * me, PyObject * args)
//Py_BEGIN_ALLOW_THREADS
- labelhash->partition->sweep_for_tags(seq, tagged_kmers,
- labelhash->all_tags, range, break_on_stop_tags, stop_big_traversals);
+ labelhash->graph->partition->sweep_for_tags(seq, tagged_kmers,
+ labelhash->graph->all_tags,
+ range, break_on_stop_tags,
+ stop_big_traversals);
//Py_END_ALLOW_THREADS
@@ -4103,6 +3915,48 @@ labelhash_n_labels(khmer_KLabelHash_Object * me, PyObject * args)
return PyLong_FromSize_t(labelhash->n_labels());
}
+static
+PyObject *
+labelhash_save_labels_and_tags(khmer_KLabelHash_Object * me, PyObject * args)
+{
+ const char * filename = NULL;
+ LabelHash * labelhash = me->labelhash;
+
+ if (!PyArg_ParseTuple(args, "s", &filename)) {
+ return NULL;
+ }
+
+ try {
+ labelhash->save_labels_and_tags(filename);
+ } catch (khmer_file_exception &e) {
+ PyErr_SetString(PyExc_IOError, e.what());
+ return NULL;
+ }
+
+ Py_RETURN_NONE;
+}
+
+static
+PyObject *
+labelhash_load_labels_and_tags(khmer_KLabelHash_Object * me, PyObject * args)
+{
+ const char * filename = NULL;
+ LabelHash * labelhash = me->labelhash;
+
+ if (!PyArg_ParseTuple(args, "s", &filename)) {
+ return NULL;
+ }
+
+ try {
+ labelhash->load_labels_and_tags(filename);
+ } catch (khmer_file_exception &e) {
+ PyErr_SetString(PyExc_IOError, e.what());
+ return NULL;
+ }
+
+ Py_RETURN_NONE;
+}
+
static PyMethodDef khmer_labelhash_methods[] = {
{ "consume_fasta_and_tag_with_labels", (PyCFunction)labelhash_consume_fasta_and_tag_with_labels, METH_VARARGS, "" },
{ "sweep_label_neighborhood", (PyCFunction)labelhash_sweep_label_neighborhood, METH_VARARGS, "" },
@@ -4112,7 +3966,8 @@ static PyMethodDef khmer_labelhash_methods[] = {
{"consume_sequence_and_tag_with_labels", (PyCFunction)labelhash_consume_sequence_and_tag_with_labels, METH_VARARGS, "" },
{"n_labels", (PyCFunction)labelhash_n_labels, METH_VARARGS, ""},
{"get_label_dict", (PyCFunction)labelhash_get_label_dict, METH_VARARGS, "" },
- {NULL, NULL, 0, NULL} /* sentinel */
+ { "save_labels_and_tags", (PyCFunction)labelhash_save_labels_and_tags, METH_VARARGS, "" },
+ { "load_labels_and_tags", (PyCFunction)labelhash_load_labels_and_tags, METH_VARARGS, "" }, {NULL, NULL, 0, NULL} /* sentinel */
};
static PyTypeObject khmer_KLabelHash_Type = {
@@ -4151,11 +4006,67 @@ static PyTypeObject khmer_KLabelHash_Type = {
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
- (initproc)khmer_labelhash_init, /* tp_init */
+ 0, /* tp_init */
0, /* tp_alloc */
khmer_labelhash_new, /* tp_new */
};
+static
+PyObject *
+hashtable_traverse_from_tags(khmer_KHashtable_Object * me, PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+
+ khmer_KCountingHash_Object * counting_o = NULL;
+ unsigned int distance, threshold, frequency;
+
+ if (!PyArg_ParseTuple(args, "O!III", &khmer_KCountingHash_Type, &counting_o,
+ &distance, &threshold, &frequency)) {
+ return NULL;
+ }
+
+ hashtable->traverse_from_tags(distance, threshold, frequency,
+ * counting_o->counting);
+
+ Py_RETURN_NONE;
+}
+
+static
+PyObject *
+hashtable_repartition_largest_partition(khmer_KHashtable_Object * me,
+ PyObject * args)
+{
+ Hashtable * hashtable = me->hashtable;
+ khmer_KCountingHash_Object * counting_o = NULL;
+ PyObject * subset_o = NULL;
+ SubsetPartition * subset_p;
+ unsigned int distance, threshold, frequency;
+
+ if (!PyArg_ParseTuple(args, "OO!III", &subset_o, &khmer_KCountingHash_Type,
+ &counting_o, &distance, &threshold, &frequency)) {
+ return NULL;
+ }
+
+ if (subset_o != Py_None) {
+ subset_p = ((khmer_KSubsetPartition_Object *) subset_o)->subset;
+ } else {
+ subset_p = hashtable->partition;
+ }
+
+ CountingHash * counting = counting_o->counting;
+
+ unsigned long next_largest;
+ try {
+ next_largest = subset_p->repartition_largest_partition(distance,
+ threshold, frequency, *counting);
+ } catch (khmer_exception &e) {
+ PyErr_SetString(PyExc_RuntimeError, e.what());
+ return NULL;
+ }
+
+ return PyLong_FromLong(next_largest);
+}
+
static PyObject * readaligner_align(khmer_ReadAligner_Object * me,
PyObject * args)
{
@@ -4267,41 +4178,30 @@ static PyTypeObject khmer_ReadAlignerType = {
khmer_ReadAligner_new, /* tp_new */
};
-static PyObject * hash_collect_high_abundance_kmers(khmer_KCountingHash_Object *
- me , PyObject * args)
+static
+PyObject *
+hashtable_consume_fasta_and_traverse(khmer_KHashtable_Object * me,
+ PyObject * args)
{
- CountingHash * counting = me->counting;
+ Hashtable * hashtable = me->hashtable;
- const char * filename = NULL;
- unsigned int lower_count, upper_count;
+ const char * filename;
+ unsigned int radius, big_threshold, transfer_threshold;
+ khmer_KCountingHash_Object * counting_o = NULL;
- if (!PyArg_ParseTuple(args, "sII", &filename, &lower_count, &upper_count)) {
+ if (!PyArg_ParseTuple(args, "sIIIO!", &filename,
+ &radius, &big_threshold, &transfer_threshold,
+ &khmer_KCountingHash_Type, &counting_o)) {
return NULL;
}
- SeenSet found_kmers;
- counting->collect_high_abundance_kmers(filename, lower_count, upper_count,
- found_kmers);
-
- // create a new hashbits object...
- std::vector<HashIntoType> sizes;
- sizes.push_back(1);
+ CountingHash * counting = counting_o->counting;
- khmer_KHashbits_Object * khashbits_obj = (khmer_KHashbits_Object *) \
- PyObject_New(khmer_KHashbits_Object, &khmer_KHashbits_Type);
- if (khashbits_obj == NULL) {
- return NULL;
- }
+ hashtable->consume_fasta_and_traverse(filename, radius, big_threshold,
+ transfer_threshold, *counting);
- // ...and set the collected kmers as the stoptags.
- try {
- khashbits_obj->hashbits = new Hashbits(counting->ksize(), sizes);
- } catch (std::bad_alloc &e) {
- return PyErr_NoMemory();
- }
- khashbits_obj->hashbits->stop_tags.swap(found_kmers);
- return (PyObject *) khashbits_obj;
+ Py_RETURN_NONE;
}
//
@@ -4455,9 +4355,6 @@ static PyObject * hllcounter_consume_fasta(khmer_KHLLCounter_Object * me,
unsigned int total_reads = 0;
try {
me->hllcounter->consume_fasta(filename, total_reads, n_consumed);
- } catch (_khmer_signal &e) {
- PyErr_SetString(PyExc_IOError, e.get_message().c_str());
- return NULL;
} catch (khmer_file_exception &e) {
PyErr_SetString(PyExc_IOError, e.what());
return NULL;
@@ -4466,6 +4363,9 @@ static PyObject * hllcounter_consume_fasta(khmer_KHLLCounter_Object * me,
return Py_BuildValue("IK", total_reads, n_consumed);
}
+static PyObject * hllcounter_merge(khmer_KHLLCounter_Object * me,
+ PyObject * args);
+
static
PyObject *
hllcounter_get_erate(khmer_KHLLCounter_Object * me)
@@ -4590,6 +4490,11 @@ static PyMethodDef khmer_hllcounter_methods[] = {
"Read sequences from file, break into k-mers, "
"and add each k-mer to the counter."
},
+ {
+ "merge", (PyCFunction)hllcounter_merge,
+ METH_VARARGS,
+ "Merge other counter into this one."
+ },
{NULL} /* Sentinel */
};
@@ -4668,6 +4573,24 @@ static PyTypeObject khmer_KHLLCounter_Type = {
#define is_hllcounter_obj(v) (Py_TYPE(v) == &khmer_KHLLCounter_Type)
+static PyObject * hllcounter_merge(khmer_KHLLCounter_Object * me,
+ PyObject * args)
+{
+ khmer_KHLLCounter_Object * other;
+
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KHLLCounter_Type, &other)) {
+ return NULL;
+ }
+
+ try {
+ me->hllcounter->merge(*(other->hllcounter));
+ } catch (khmer_exception &e) {
+ PyErr_SetString(PyExc_ValueError, e.what());
+ return NULL;
+ }
+
+ Py_RETURN_NONE;
+}
//////////////////////////////
// standalone functions
@@ -4734,7 +4657,7 @@ static PyObject * reverse_hash(PyObject * self, PyObject * args)
return NULL;
}
- return PyBytes_FromString(_revhash(val, ksize).c_str());
+ return PyUnicode_FromString(_revhash(val, ksize).c_str());
}
static PyObject * murmur3_forward_hash(PyObject * self, PyObject * args)
@@ -4771,7 +4694,7 @@ get_version_cpp( PyObject * self, PyObject * args )
#define xstr(s) str(s)
#define str(s) #s
std::string dVersion = xstr(VERSION);
- return PyBytes_FromString(dVersion.c_str());
+ return PyUnicode_FromString(dVersion.c_str());
}
@@ -4829,89 +4752,106 @@ static PyMethodDef KhmerMethods[] = {
{ NULL, NULL, 0, NULL } // sentinel
};
-PyMODINIT_FUNC
-init_khmer(void)
+MOD_INIT(_khmer)
{
using namespace python;
+ if (PyType_Ready(&khmer_KHashtable_Type) < 0) {
+ return MOD_ERROR_VAL;
+ }
+
+ khmer_KCountingHash_Type.tp_base = &khmer_KHashtable_Type;
if (PyType_Ready(&khmer_KCountingHash_Type) < 0) {
- return;
+ return MOD_ERROR_VAL;
+ }
+
+ if (PyType_Ready(&khmer_PrePartitionInfo_Type) < 0) {
+ return MOD_ERROR_VAL;
}
khmer_KSubsetPartition_Type.tp_methods = khmer_subset_methods;
if (PyType_Ready(&khmer_KSubsetPartition_Type) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
+ khmer_KHashbits_Type.tp_base = &khmer_KHashtable_Type;
khmer_KHashbits_Type.tp_methods = khmer_hashbits_methods;
if (PyType_Ready(&khmer_KHashbits_Type) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
- // add LabelHash
khmer_KLabelHash_Type.tp_base = &khmer_KHashbits_Type;
+ khmer_KLabelHash_Type.tp_methods = khmer_labelhash_methods;
+ khmer_KLabelHash_Type.tp_new = khmer_labelhash_new;
if (PyType_Ready(&khmer_KLabelHash_Type) < 0) {
- return;
- }
-
- if (PyType_Ready(&khmer_ReadAlignerType) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
if (PyType_Ready(&khmer_KHLLCounter_Type) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
if (PyType_Ready(&khmer_ReadAlignerType) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
_init_ReadParser_Type_constants();
if (PyType_Ready( &khmer_ReadParser_Type ) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
if (PyType_Ready(&khmer_Read_Type ) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
if (PyType_Ready(&khmer_ReadPairIterator_Type ) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
PyObject * m;
- m = Py_InitModule3( "_khmer", KhmerMethods,
- "interface for the khmer module low-level extensions" );
+
+ MOD_DEF(m, "_khmer", "interface for the khmer module low-level extensions",
+ KhmerMethods);
+
if (m == NULL) {
- return;
+ return MOD_ERROR_VAL;
}
Py_INCREF(&khmer_ReadParser_Type);
if (PyModule_AddObject( m, "ReadParser",
(PyObject *)&khmer_ReadParser_Type ) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
Py_INCREF(&khmer_KCountingHash_Type);
if (PyModule_AddObject( m, "CountingHash",
(PyObject *)&khmer_KCountingHash_Type ) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
Py_INCREF(&khmer_KHashbits_Type);
if (PyModule_AddObject(m, "Hashbits", (PyObject *)&khmer_KHashbits_Type) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
Py_INCREF(&khmer_KLabelHash_Type);
if (PyModule_AddObject(m, "LabelHash",
(PyObject *)&khmer_KLabelHash_Type) < 0) {
- return;
+ return MOD_ERROR_VAL;
}
Py_INCREF(&khmer_KHLLCounter_Type);
- PyModule_AddObject(m, "HLLCounter", (PyObject *)&khmer_KHLLCounter_Type);
+ if (PyModule_AddObject(m, "HLLCounter",
+ (PyObject *)&khmer_KHLLCounter_Type) < 0) {
+ return MOD_ERROR_VAL;
+ }
+
Py_INCREF(&khmer_ReadAlignerType);
- PyModule_AddObject(m, "ReadAligner", (PyObject *)&khmer_ReadAlignerType);
+ if (PyModule_AddObject(m, "ReadAligner",
+ (PyObject *)&khmer_ReadAlignerType) < 0) {
+ return MOD_ERROR_VAL;
+ }
+
+ return MOD_SUCCESS_VAL(m);
}
// vim: set ft=cpp sts=4 sw=4 tw=79:
diff --git a/khmer/_version.py b/khmer/_version.py
index acac511..5f61635 100644
--- a/khmer/_version.py
+++ b/khmer/_version.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by githubs download-from-tag
@@ -15,8 +16,8 @@ import subprocess
import sys
# these strings will be replaced by git during git-archive
-git_refnames = " (tag: v1.4)"
-git_full = "ffb8865d28ccf584e5de3362f9150a6b020eaa11"
+git_refnames = " (tag: v2.0-rc1)"
+git_full = "bbd38a6d3d0960f71c65dd46ecda3b61584a8b4c"
# these strings are filled in when 'setup.py versioneer' creates _version.py
tag_prefix = "v"
@@ -114,7 +115,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose=False):
# "stabilization", as well as "HEAD" and "master".
tags = set([r for r in refs if re.search(r'\d', r)])
if verbose:
- print("discarding '%s', no digits" % ",".join(refs-tags))
+ print("discarding '%s', no digits" % ",".join(refs - tags))
if verbose:
print("likely tags: %s" % ",".join(sorted(tags)))
for ref in sorted(tags):
@@ -144,13 +145,13 @@ def git_parse_vcs_describe(git_describe, tag_prefix, verbose=False):
# now we have TAG-NUM-gHEX or HEX
if "-" not in git_describe: # just HEX
- return "0+untagged.g"+git_describe+dirty_suffix, dirty
+ return "0+untagged.g" + git_describe + dirty_suffix, dirty
# just TAG-NUM-gHEX
mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
if not mo:
# unparseable. Maybe git-describe is misbehaving?
- return "0+unparseable"+dirty_suffix, dirty
+ return "0+unparseable" + dirty_suffix, dirty
# tag
full_tag = mo.group(1)
diff --git a/khmer/kfile.py b/khmer/kfile.py
index a5ff6b4..9a01f59 100644
--- a/khmer/kfile.py
+++ b/khmer/kfile.py
@@ -1,5 +1,5 @@
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2014-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -7,10 +7,13 @@
"""File handling/checking utilities for command-line scripts."""
+from __future__ import print_function, unicode_literals
+
import os
import sys
import errno
from stat import S_ISBLK, S_ISFIFO
+from khmer import khmer_args
def check_input_files(file_path, force):
@@ -22,16 +25,16 @@ def check_input_files(file_path, force):
"""
mode = None
- if file_path is '-':
+ if file_path == '-':
return
try:
mode = os.stat(file_path).st_mode
except OSError:
- print >>sys.stderr, "ERROR: Input file %s does not exist" % \
- file_path
+ print("ERROR: Input file %s does not exist" %
+ file_path, file=sys.stderr)
if not force:
- print >>sys.stderr, "Exiting"
+ print("Exiting", file=sys.stderr)
sys.exit(1)
else:
return
@@ -41,14 +44,14 @@ def check_input_files(file_path, force):
return
if not os.path.exists(file_path):
- print >>sys.stderr, "ERROR: Input file %s does not exist; exiting" % \
- file_path
+ print("ERROR: Input file %s does not exist; exiting" %
+ file_path, file=sys.stderr)
if not force:
sys.exit(1)
else:
if os.stat(file_path).st_size == 0:
- print >>sys.stderr, "ERROR: Input file %s is empty; exiting." % \
- file_path
+ print("ERROR: Input file %s is empty; exiting." %
+ file_path, file=sys.stderr)
if not force:
sys.exit(1)
@@ -59,11 +62,11 @@ def check_file_writable(file_path):
file_obj = open(file_path, "a")
except IOError as error:
if error.errno == errno.EACCES:
- print >>sys.stderr, "ERROR: File %s does not have write " \
- % file_path + "permission; exiting"
+ print("ERROR: File %s does not have write "
+ % file_path + "permission; exiting", file=sys.stderr)
sys.exit(1)
else:
- print >>sys.stderr, "ERROR: " + error.strerror
+ print("ERROR: " + error.strerror, file=sys.stderr)
else:
file_obj.close()
return
@@ -97,20 +100,23 @@ def check_space(in_files, force, _testhook_free_space=None):
size_diff = total_size - free_space
if size_diff > 0:
- print >>sys.stderr, "ERROR: Not enough free space on disk " \
- "for output files;\n" \
- " Need at least %.1f GB more." \
- % (float(size_diff) / 1e9)
- print >>sys.stderr, " Estimated output size: %.1f GB" \
- % (float(total_size) / 1e9,)
- print >>sys.stderr, " Free space: %.1f GB" \
- % (float(free_space) / 1e9,)
+ print("ERROR: Not enough free space on disk "
+ "for output files;\n"
+ " Need at least %.1f GB more."
+ % (float(size_diff) / 1e9), file=sys.stderr)
+ print(" Estimated output size: %.1f GB"
+ % (float(total_size) / 1e9,), file=sys.stderr)
+ print(" Free space: %.1f GB"
+ % (float(free_space) / 1e9,), file=sys.stderr)
if not force:
sys.exit(1)
-def check_space_for_hashtable(hash_size, force, _testhook_free_space=None):
+def check_space_for_hashtable(args, hashtype, force,
+ _testhook_free_space=None):
"""Check we have enough size to write a hash table."""
+ hash_size = khmer_args._calculate_tablesize(args, hashtype)
+
cwd = os.getcwd()
dir_path = os.path.dirname(os.path.realpath(cwd))
target = os.statvfs(dir_path)
@@ -121,14 +127,14 @@ def check_space_for_hashtable(hash_size, force, _testhook_free_space=None):
size_diff = hash_size - free_space
if size_diff > 0:
- print >>sys.stderr, "ERROR: Not enough free space on disk " \
- "for saved table files;" \
- " Need at least %s GB more." \
- % (float(size_diff) / 1e9,)
- print >>sys.stderr, " Table size: %.1f GB" \
- % (float(hash_size) / 1e9,)
- print >>sys.stderr, " Free space: %.1f GB" \
- % (float(free_space) / 1e9,)
+ print("ERROR: Not enough free space on disk "
+ "for saved table files;"
+ " Need at least %s GB more."
+ % (float(size_diff) / 1e9,), file=sys.stderr)
+ print(" Table size: %.1f GB"
+ % (float(hash_size) / 1e9,), file=sys.stderr)
+ print(" Free space: %.1f GB"
+ % (float(free_space) / 1e9,), file=sys.stderr)
if not force:
sys.exit(1)
@@ -146,8 +152,8 @@ def check_valid_file_exists(in_files):
if os.stat(in_file).st_size > 0:
return
else:
- print >>sys.stderr, 'WARNING: Input file %s is empty' % \
- in_file
+ print('WARNING: Input file %s is empty' %
+ in_file, file=sys.stderr)
else:
- print >>sys.stderr, 'WARNING: Input file %s not found' % \
- in_file
+ print('WARNING: Input file %s not found' %
+ in_file, file=sys.stderr)
diff --git a/khmer/khmer_args.py b/khmer/khmer_args.py
index c33d501..a9e9358 100644
--- a/khmer/khmer_args.py
+++ b/khmer/khmer_args.py
@@ -1,53 +1,74 @@
#
# vim: set encoding=utf-8
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2014. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2014-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import unicode_literals
+
import sys
import os
import argparse
+from argparse import _VersionAction
+
+import screed
+import khmer
from khmer import extract_countinghash_info, extract_hashbits_info
from khmer import __version__
-import screed
+from khmer.utils import print_error
+
DEFAULT_K = 32
DEFAULT_N_TABLES = 4
-DEFAULT_MIN_TABLESIZE = 1e6
+DEFAULT_MAX_TABLESIZE = 1e6
DEFAULT_N_THREADS = 1
+class _VersionStdErrAction(_VersionAction):
+
+ def __call__(self, parser, namespace, values, option_string=None):
+ version = self.version
+ if version is None:
+ version = parser.version
+ formatter = parser._get_formatter()
+ formatter.add_text(version)
+ parser._print_message(formatter.format_help(), sys.stderr)
+ parser.exit()
+
+
class ComboFormatter(argparse.ArgumentDefaultsHelpFormatter,
argparse.RawDescriptionHelpFormatter):
pass
-def build_hash_args(descr=None, epilog=None):
+def build_hash_args(descr=None, epilog=None, parser=None):
"""Build an ArgumentParser with args for bloom filter based scripts."""
- parser = argparse.ArgumentParser(
- description=descr, epilog=epilog,
- formatter_class=ComboFormatter)
+ if parser is None:
+ parser = argparse.ArgumentParser(description=descr, epilog=epilog,
+ formatter_class=ComboFormatter)
- env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
- env_n_tables = os.environ.get('KHMER_N_TABLES', DEFAULT_N_TABLES)
- env_tablesize = os.environ.get('KHMER_MIN_TABLESIZE',
- DEFAULT_MIN_TABLESIZE)
-
- parser.add_argument('--version', action='version',
+ parser.add_argument('--version', action=_VersionStdErrAction,
version='khmer {v}'.format(v=__version__))
parser.add_argument('-q', '--quiet', dest='quiet', default=False,
action='store_true')
- parser.add_argument('--ksize', '-k', type=int, default=env_ksize,
+ parser.add_argument('--ksize', '-k', type=int, default=DEFAULT_K,
help='k-mer size to use')
+
parser.add_argument('--n_tables', '-N', type=int,
- default=env_n_tables,
+ default=DEFAULT_N_TABLES,
help='number of k-mer counting tables to use')
- parser.add_argument('--min-tablesize', '-x', type=float,
- default=env_tablesize,
- help='lower bound on tablesize to use')
+
+ group = parser.add_mutually_exclusive_group()
+ group.add_argument('--max-tablesize', '-x', type=float,
+ default=DEFAULT_MAX_TABLESIZE,
+ help='upper bound on tablesize to use; overrides ' +
+ '--max-memory-usage/-M.')
+ group.add_argument('-M', '--max-memory-usage', type=float,
+ help='maximum amount of memory to use for data ' +
+ 'structure.')
return parser
@@ -55,15 +76,16 @@ def build_hash_args(descr=None, epilog=None):
def build_counting_args(descr=None, epilog=None):
"""Build an ArgumentParser with args for counting_hash based scripts."""
parser = build_hash_args(descr=descr, epilog=epilog)
- parser.hashtype = 'counting'
+ parser.hashtype = 'countgraph'
return parser
-def build_hashbits_args(descr=None, epilog=None):
+def build_hashbits_args(descr=None, epilog=None, parser=None):
"""Build an ArgumentParser with args for hashbits based scripts."""
- parser = build_hash_args(descr=descr, epilog=epilog)
- parser.hashtype = 'hashbits'
+
+ parser = build_hash_args(descr=descr, epilog=epilog, parser=parser)
+ parser.hashtype = 'nodegraph'
return parser
@@ -75,31 +97,24 @@ def add_loadhash_args(parser):
class LoadAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
- env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
- env_n_tables = os.environ.get('KHMER_N_TABLES', DEFAULT_N_TABLES)
- env_tablesize = os.environ.get('KHMER_MIN_TABLESIZE',
- DEFAULT_MIN_TABLESIZE)
-
- from khmer.utils import print_error
-
setattr(namespace, self.dest, values)
- if getattr(namespace, 'ksize') != env_ksize or \
- getattr(namespace, 'n_tables') != env_n_tables or \
- getattr(namespace, 'min_tablesize') != env_tablesize:
+ if getattr(namespace, 'ksize') != DEFAULT_K or \
+ getattr(namespace, 'n_tables') != DEFAULT_N_TABLES or \
+ getattr(namespace, 'max_tablesize') != DEFAULT_MAX_TABLESIZE:
if values:
print_error('''
** WARNING: You are loading a saved k-mer table from
-{hashfile}, but have set k-mer table parameters.
-Your values for ksize, n_tables, and tablesize
-will be ignored.'''.format(hashfile=values))
+** {hashfile}, but have set k-mer table parameters.
+** Your values for ksize, n_tables, and tablesize
+** will be ignored.'''.format(hashfile=values))
if hasattr(parser, 'hashtype'):
info = None
- if parser.hashtype == 'hashbits':
+ if parser.hashtype == 'nodegraph':
info = extract_hashbits_info(
getattr(namespace, self.dest))
- elif parser.hashtype == 'counting':
+ elif parser.hashtype == 'countgraph':
info = extract_countinghash_info(
getattr(namespace, self.dest))
if info:
@@ -108,52 +123,96 @@ will be ignored.'''.format(hashfile=values))
n = info[2]
setattr(namespace, 'ksize', K)
setattr(namespace, 'n_tables', n)
- setattr(namespace, 'min_tablesize', x)
+ setattr(namespace, 'max_tablesize', x)
parser.add_argument('-l', '--loadtable', metavar="filename", default=None,
help='load a precomputed k-mer table from disk',
action=LoadAction)
-def report_on_config(args, hashtype='counting'):
+def _calculate_tablesize(args, hashtype, multiplier=1.0):
+    """Return the per-table size implied by the parsed arguments.
+
+    When ``args.max_memory_usage`` is set (truthy), the size is derived from
+    it: the memory figure is divided by ``args.n_tables`` and then by
+    ``multiplier``; for a 'nodegraph' the memory figure is first multiplied
+    by 8. Otherwise ``args.max_tablesize`` is returned unchanged.
+
+    Raises ``Exception`` for any hashtype other than 'countgraph' or
+    'nodegraph'.
+    """
+    if hashtype not in ('countgraph', 'nodegraph'):
+        raise Exception("unknown graph type: %s" % (hashtype,))
+
+    budget = args.max_memory_usage
+    if not budget:
+        return args.max_tablesize
+
+    if hashtype == 'nodegraph':
+        budget = 8. * budget
+    return budget / args.n_tables / float(multiplier)
+
+
+def create_nodegraph(args, ksize=None, multiplier=1.0):
+    """Construct a ``khmer.Hashbits`` from parsed command-line arguments.
+
+    ``ksize`` falls back to ``args.ksize`` when not supplied. A k-mer size
+    above 32 is reported through ``print_error`` and the process exits with
+    status 1. The table size is obtained from ``_calculate_tablesize``.
+    """
+    k = args.ksize if ksize is None else ksize
+    if k > 32:
+        print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n")
+        sys.exit(1)
+
+    size = _calculate_tablesize(args, 'nodegraph', multiplier=multiplier)
+    return khmer.Hashbits(k, size, args.n_tables)
+
+
+def create_countgraph(args, ksize=None, multiplier=1.0):
+    """Construct a ``khmer.CountingHash`` from parsed command-line arguments.
+
+    ``ksize`` falls back to ``args.ksize`` when not supplied. A k-mer size
+    above 32 is reported through ``print_error`` and the process exits with
+    status 1. The table size is obtained from ``_calculate_tablesize``.
+    """
+    k = args.ksize if ksize is None else ksize
+    if k > 32:
+        print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n")
+        sys.exit(1)
+
+    size = _calculate_tablesize(args, 'countgraph', multiplier=multiplier)
+    return khmer.CountingHash(k, size, args.n_tables)
+
+
+def report_on_config(args, hashtype='countgraph'):
"""Print out configuration.
Summarize the configuration produced by the command-line arguments
made available by this module.
"""
from khmer.utils import print_error
+ if hashtype not in ('countgraph', 'nodegraph'):
+ raise Exception("unknown graph type: %s" % (hashtype,))
if args.quiet:
return
+ tablesize = _calculate_tablesize(args, hashtype)
+
print_error("\nPARAMETERS:")
print_error(" - kmer size = {0} \t\t(-k)".format(args.ksize))
print_error(" - n tables = {0} \t\t(-N)".format(args.n_tables))
print_error(
- " - min tablesize = {0:5.2g} \t(-x)".format(args.min_tablesize)
+ " - max tablesize = {0:5.2g} \t(-x)".format(tablesize)
)
print_error("")
- if hashtype == 'counting':
+ if hashtype == 'countgraph':
print_error(
"Estimated memory usage is {0:.2g} bytes "
- "(n_tables x min_tablesize)".format(
- args.n_tables * args.min_tablesize))
- elif hashtype == 'hashbits':
+ "(n_tables x max_tablesize)".format(
+ args.n_tables * tablesize))
+ elif hashtype == 'nodegraph':
print_error(
"Estimated memory usage is {0:.2g} bytes "
- "(n_tables x min_tablesize / 8)".format(args.n_tables *
- args.min_tablesize / 8)
+ "(n_tables x max_tablesize / 8)".format(args.n_tables *
+ tablesize / 8)
)
print_error("-" * 8)
- if DEFAULT_MIN_TABLESIZE == args.min_tablesize and \
- not hasattr(args, 'loadtable'):
- print_error(
- "** WARNING: tablesize is default! "
- "You absodefly want to increase this!\n** "
- "Please read the docs!\n"
- )
+ if DEFAULT_MAX_TABLESIZE == tablesize and \
+ not getattr(args, 'loadtable', None):
+ print_error('''\
+
+** WARNING: tablesize is default!
+** You probably want to increase this with -M/--max-memory-usage!
+** Please read the docs!
+''')
def add_threading_args(parser):
@@ -196,7 +255,13 @@ def info(scriptname, algorithm_list=None):
for alg in algorithm_list:
sys.stderr.write("|| * ")
- sys.stderr.write(_algorithms[alg])
+ algstr = _algorithms[alg].encode(
+ 'utf-8', 'surrogateescape').decode('utf-8', 'replace')
+ try:
+ sys.stderr.write(algstr)
+ except UnicodeEncodeError:
+ sys.stderr.write(
+ algstr.encode(sys.getfilesystemencoding(), 'replace'))
sys.stderr.write("\n")
sys.stderr.write("||\n|| Please see http://khmer.readthedocs.org/en/"
diff --git a/khmer/thread_utils.py b/khmer/thread_utils.py
index a604fe2..41c3914 100644
--- a/khmer/thread_utils.py
+++ b/khmer/thread_utils.py
@@ -1,16 +1,25 @@
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+
"""Utilities for dealing with multithreaded processing of short reads."""
+from __future__ import print_function, unicode_literals
+
import threading
-import Queue
import sys
import screed
from khmer import utils
+
+# stdlib queue module was renamed on Python 3
+try:
+ import queue
+except ImportError:
+ import Queue as queue
+
DEFAULT_WORKER_THREADS = 8
DEFAULT_GROUPSIZE = 100
@@ -20,7 +29,7 @@ def verbose_loader(filename):
screed_iter = screed.open(filename, parse_description=False)
for n, record in enumerate(screed_iter):
if n % 100000 == 0:
- print >>sys.stderr, '... filtering', n
+ print('... filtering', n, file=sys.stderr)
yield record
verbose_fasta_iter = verbose_loader
@@ -49,8 +58,8 @@ class ThreadedSequenceProcessor(object):
self.n_workers = n_workers
self.group_size = group_size
- self.inqueue = Queue.Queue(self.QUEUESIZE)
- self.outqueue = Queue.Queue(self.QUEUESIZE)
+ self.inqueue = queue.Queue(self.QUEUESIZE)
+ self.outqueue = queue.Queue(self.QUEUESIZE)
self.worker_count = 0
self.worker_count_lock = threading.Lock()
@@ -65,7 +74,7 @@ class ThreadedSequenceProcessor(object):
def start(self, inputiter, outfp):
if self.verbose:
- print >>sys.stderr, 'starting threads'
+ print('starting threads', file=sys.stderr)
try:
for _ in range(self.n_workers):
@@ -74,18 +83,18 @@ class ThreadedSequenceProcessor(object):
t.start()
if self.verbose:
- print >>sys.stderr, 'starting writer'
+ print('starting writer', file=sys.stderr)
w = threading.Thread(target=self.do_write, args=(outfp,))
w.start()
if self.verbose:
- print >>sys.stderr, 'loading...'
+ print('loading...', file=sys.stderr)
self.push_sequences(inputiter)
if self.verbose:
- print >>sys.stderr, 'done loading in sequences'
+ print('done loading in sequences', file=sys.stderr)
self.done = True
w.join()
@@ -129,7 +138,7 @@ class ThreadedSequenceProcessor(object):
while not self.done or not inq.empty():
try:
g = inq.get(True, 1)
- except Queue.Empty:
+ except queue.Empty:
continue
bp_processed = 0
@@ -157,17 +166,16 @@ class ThreadedSequenceProcessor(object):
self.bp_written += bp_written
if self.verbose and self.n_processed % 500000 == 0:
- print >>sys.stderr, \
- "processed %d / wrote %d / removed %d" % \
- (self.n_processed, self.n_written,
- self.n_processed - self.n_written)
- print >>sys.stderr, \
- "processed %d bp / wrote %d bp / removed %d bp" % \
- (self.bp_processed, self.bp_written,
- self.bp_processed - self.bp_written)
+ print("processed %d / wrote %d / removed %d" %
+ (self.n_processed, self.n_written,
+ self.n_processed - self.n_written), file=sys.stderr)
+ print("processed %d bp / wrote %d bp / removed %d bp" %
+ (self.bp_processed, self.bp_written,
+ self.bp_processed - self.bp_written),
+ file=sys.stderr)
discarded = self.bp_processed - self.bp_written
f = float(discarded) / float(self.bp_processed) * 100
- print >>sys.stderr, "discarded %.1f%%" % f
+ print("discarded %.1f%%" % f, file=sys.stderr)
# end of thread; exit, decrement worker count.
with self.worker_count_lock:
@@ -178,7 +186,7 @@ class ThreadedSequenceProcessor(object):
while self.worker_count > 0 or not outq.empty():
try:
g = outq.get(True, 1)
- except Queue.Empty:
+ except queue.Empty:
continue
for name, seq, quality in g.seqlist:
@@ -188,16 +196,14 @@ class ThreadedSequenceProcessor(object):
outfp.write('>%s\n%s\n' % (name, seq,))
if self.verbose:
- print >>sys.stderr, \
- "DONE writing.\nprocessed %d / wrote %d / removed %d" % \
- (self.n_processed, self.n_written,
- self.n_processed - self.n_written)
- print >>sys.stderr, \
- "processed %d bp / wrote %d bp / removed %d bp" % \
- (self.bp_processed, self.bp_written,
- self.bp_processed - self.bp_written)
+ print("DONE writing.\nprocessed %d / wrote %d / removed %d" %
+ (self.n_processed, self.n_written,
+ self.n_processed - self.n_written), file=sys.stderr)
+ print("processed %d bp / wrote %d bp / removed %d bp" %
+ (self.bp_processed, self.bp_written,
+ self.bp_processed - self.bp_written), file=sys.stderr)
discarded = self.bp_processed - self.bp_written
f = float(discarded) / float(self.bp_processed) * 100
- print >>sys.stderr, "discarded %.1f%%" % f
+ print("discarded %.1f%%" % f, file=sys.stderr)
# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
diff --git a/khmer/utils.py b/khmer/utils.py
index 1495fa2..0e1d5e1 100644
--- a/khmer/utils.py
+++ b/khmer/utils.py
@@ -1,5 +1,6 @@
+from __future__ import print_function, unicode_literals
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -12,7 +13,7 @@ def print_error(msg):
"""Print the given message to 'stderr'."""
import sys
- print >>sys.stderr, msg
+ print(msg, file=sys.stderr)
def _split_left_right(name):
@@ -54,6 +55,10 @@ def check_is_pair(record1, record2):
elif lhs1 == lhs2 and rhs1.startswith('1:') and rhs2.startswith('2:'):
return True
+ # handle @name seq/1
+ elif lhs1 == lhs2 and rhs1.endswith('/1') and rhs2.endswith('/2'):
+ return True
+
return False
@@ -70,6 +75,9 @@ def check_is_left(name):
elif rhs.startswith('1:'): # handle '@name 1:rst'
return True
+ elif rhs.endswith('/1'): # handles '@name seq/1'
+ return True
+
return False
@@ -86,10 +94,14 @@ def check_is_right(name):
elif rhs.startswith('2:'): # handle '@name 2:rst'
return True
+ elif rhs.endswith('/2'): # handles '@name seq/2'
+ return True
+
return False
-def broken_paired_reader(screed_iter, min_length=None, force_single=False):
+def broken_paired_reader(screed_iter, min_length=None,
+ force_single=False, require_paired=False):
"""Read pairs from a stream.
A generator that yields singletons and pairs from a stream of FASTA/FASTQ
@@ -117,6 +129,9 @@ def broken_paired_reader(screed_iter, min_length=None, force_single=False):
prev_record = None
n = 0
+ if force_single and require_paired:
+ raise ValueError("force_single and require_paired cannot both be set!")
+
# handle the majority of the stream.
for record in screed_iter:
# ignore short reads
@@ -130,6 +145,9 @@ def broken_paired_reader(screed_iter, min_length=None, force_single=False):
n += 2
record = None
else: # orphan.
+ if require_paired:
+ raise ValueError("Unpaired reads when require_paired"
+ " is set!")
yield n, False, prev_record, None
n += 1
@@ -138,21 +156,27 @@ def broken_paired_reader(screed_iter, min_length=None, force_single=False):
# handle the last record, if it exists (i.e. last two records not a pair)
if prev_record:
+ if require_paired:
+ raise ValueError("Unpaired reads when require_paired is set!")
yield n, False, prev_record, None
def write_record(record, fileobj):
"""Write sequence record to 'fileobj' in FASTA/FASTQ format."""
if hasattr(record, 'quality'):
- fileobj.write(
- '@{name}\n{seq}\n'
- '+\n{qual}\n'.format(name=record.name,
- seq=record.sequence,
- qual=record.quality))
+ recstr = '@{name}\n{sequence}\n+\n{quality}\n'.format(
+ name=record.name,
+ sequence=record.sequence,
+ quality=record.quality)
else:
- fileobj.write(
- '>{name}\n{seq}\n'.format(name=record.name,
- seq=record.sequence))
+ recstr = '>{name}\n{sequence}\n'.format(
+ name=record.name,
+ sequence=record.sequence)
+
+ try:
+ fileobj.write(bytes(recstr, 'utf-8'))
+ except TypeError:
+ fileobj.write(recstr)
def write_record_pair(read1, read2, fileobj):
diff --git a/lib/.check_openmp.cc b/lib/.check_openmp.cc
index d4375ca..52dfdd5 100644
--- a/lib/.check_openmp.cc
+++ b/lib/.check_openmp.cc
@@ -1,5 +1,5 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
// Copyright (C) Michigan State University, 2015. It is licensed under
// the three-clause BSD license; see doc/LICENSE.txt.
// Contact: khmer-project at idyll.org
diff --git a/lib/Makefile b/lib/Makefile
index c100a0f..f3518ea 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -214,7 +214,7 @@ TEST_PROGS = test-Colors test-read-aligner test-compile
all: $(LIBKHMERSO) libkhmer.a khmer.pc
clean:
- rm -f *.o *.a *.so khmer.pc $(LIBKHMERSO) $(TEST_PROGS)
+ rm -f *.o *.a *.so* khmer.pc $(TEST_PROGS)
(cd $(ZLIB_DIR) && make distclean)
(cd $(BZIP2_DIR) && make -f Makefile-libbz2_so clean)
diff --git a/lib/counting.cc b/lib/counting.cc
index b201bdc..27b23ad 100644
--- a/lib/counting.cc
+++ b/lib/counting.cc
@@ -1,5 +1,5 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
@@ -511,11 +511,19 @@ CountingHashFileReader::CountingHashFileReader(
unsigned int save_ksize = 0;
unsigned char save_n_tables = 0;
unsigned long long save_tablesize = 0;
+ char signature [4];
unsigned char version = 0, ht_type = 0, use_bigcount = 0;
+ infile.read(signature, 4);
infile.read((char *) &version, 1);
infile.read((char *) &ht_type, 1);
- if (!(version == SAVED_FORMAT_VERSION)) {
+ if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
+ std::ostringstream err;
+ err << "Does not start with signature for a khmer " <<
+ "file: " << signature << " Should be: " <<
+ SAVED_SIGNATURE;
+ throw khmer_file_exception(err.str());
+ } else if (!(version == SAVED_FORMAT_VERSION)) {
std::ostringstream err;
err << "Incorrect file format version " << (int) version
<< " while reading k-mer count file from " << infilename
@@ -612,16 +620,23 @@ CountingHashGzFileReader::CountingHashGzFileReader(
unsigned int save_ksize = 0;
unsigned char save_n_tables = 0;
unsigned long long save_tablesize = 0;
+ char signature [4];
unsigned char version, ht_type, use_bigcount;
+ int read_s = gzread(infile, signature, 4);
int read_v = gzread(infile, (char *) &version, 1);
int read_t = gzread(infile, (char *) &ht_type, 1);
-
- if (read_v <= 0 || read_t <= 0) {
+ if (read_s <= 0 || read_v <= 0 || read_t <= 0) {
std::string err = "K-mer count file read error: " + infilename + " "
+ strerror(errno);
gzclose(infile);
throw khmer_file_exception(err);
+ } else if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
+ std::ostringstream err;
+ err << "Does not start with signature for a khmer " <<
+ "file: " << signature << " Should be: " <<
+ SAVED_SIGNATURE;
+ throw khmer_file_exception(err.str());
} else if (!(version == SAVED_FORMAT_VERSION)
|| !(ht_type == SAVED_COUNTING_HT)) {
if (!(version == SAVED_FORMAT_VERSION)) {
@@ -685,8 +700,15 @@ CountingHashGzFileReader::CountingHashGzFileReader(
HashIntoType loaded = 0;
while (loaded != tablesize) {
- read_b = gzread(infile, (char *) ht._counts[i],
- (unsigned) (tablesize - loaded));
+ unsigned long long to_read_ll = tablesize - loaded;
+ unsigned int to_read_int;
+ // Zlib can only read chunks of at most INT_MAX bytes.
+ if (to_read_ll > INT_MAX) {
+ to_read_int = INT_MAX;
+ } else {
+ to_read_int = to_read_ll;
+ }
+ read_b = gzread(infile, (char *) ht._counts[i], to_read_int);
if (read_b <= 0) {
std::string gzerr = gzerror(infile, &read_b);
@@ -761,6 +783,7 @@ CountingHashFileWriter::CountingHashFileWriter(
ofstream outfile(outfilename.c_str(), ios::binary);
+ outfile.write(SAVED_SIGNATURE, 4);
unsigned char version = SAVED_FORMAT_VERSION;
outfile.write((const char *) &version, 1);
@@ -823,6 +846,7 @@ CountingHashGzFileWriter::CountingHashGzFileWriter(
}
}
+ gzwrite(outfile, SAVED_SIGNATURE, 4);
unsigned char version = SAVED_FORMAT_VERSION;
gzwrite(outfile, (const char *) &version, 1);
@@ -845,8 +869,38 @@ CountingHashGzFileWriter::CountingHashGzFileWriter(
sizeof(save_tablesize));
unsigned long long written = 0;
while (written != save_tablesize) {
- written += gzwrite(outfile, (const char *) ht._counts[i],
- (int) (save_tablesize - written));
+ unsigned long long to_write_ll = save_tablesize - written;
+ unsigned int to_write_int;
+ int gz_result;
+ // Zlib can only write chunks of at most INT_MAX bytes.
+ if (to_write_ll > INT_MAX) {
+ to_write_int = INT_MAX;
+ } else {
+ to_write_int = to_write_ll;
+ }
+ gz_result = gzwrite(outfile, (const char *) ht._counts[i],
+ to_write_int);
+ // Zlib returns 0 on error
+ if (gz_result == 0) {
+ int errcode = 0;
+ const char *err_msg;
+ std::ostringstream msg;
+
+ msg << "gzwrite failed while writing counting hash: ";
+ // Get zlib error
+ err_msg = gzerror(outfile, &errcode);
+ if (errcode != Z_ERRNO) {
+ // Zlib error, not stdlib
+ msg << err_msg;
+ gzclearerr(outfile);
+ } else {
+ // stdlib error
+ msg << strerror(errno);
+ }
+ gzclose(outfile);
+ throw khmer_file_exception(msg.str().c_str());
+ }
+ written += gz_result;
}
}
diff --git a/lib/counting.hh b/lib/counting.hh
index 31d0f0f..4849870 100644
--- a/lib/counting.hh
+++ b/lib/counting.hh
@@ -1,5 +1,5 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
diff --git a/lib/get_version.py b/lib/get_version.py
index 6ad89b8..5d7fa66 100644
--- a/lib/get_version.py
+++ b/lib/get_version.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
import sys
sys.path.insert(0, '../')
import versioneer
@@ -7,4 +8,4 @@ versioneer.versionfile_build = '../khmer/_version.py'
versioneer.tag_prefix = 'v' # tags are like v1.2.0
versioneer.parentdir_prefix = '..'
-print versioneer.get_version()
+print(versioneer.get_version())
diff --git a/lib/graphtest.cc b/lib/graphtest.cc
index 438971c..c4ad3f7 100644
--- a/lib/graphtest.cc
+++ b/lib/graphtest.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/hashbits.cc b/lib/hashbits.cc
index 270f6d1..8119305 100644
--- a/lib/hashbits.cc
+++ b/lib/hashbits.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
@@ -29,6 +29,7 @@ void Hashbits::save(std::string outfilename)
ofstream outfile(outfilename.c_str(), ios::binary);
+ outfile.write(SAVED_SIGNATURE, 4);
unsigned char version = SAVED_FORMAT_VERSION;
outfile.write((const char *) &version, 1);
@@ -90,11 +91,19 @@ void Hashbits::load(std::string infilename)
unsigned int save_ksize = 0;
unsigned char save_n_tables = 0;
unsigned long long save_tablesize = 0;
+ char signature[4];
unsigned char version, ht_type;
+ infile.read(signature, 4);
infile.read((char *) &version, 1);
infile.read((char *) &ht_type, 1);
- if (!(version == SAVED_FORMAT_VERSION)) {
+ if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
+ std::ostringstream err;
+ err << "Does not start with signature for a khmer " <<
+ "file: " << signature << " Should be: " <<
+ SAVED_SIGNATURE;
+ throw khmer_file_exception(err.str());
+ } else if (!(version == SAVED_FORMAT_VERSION)) {
std::ostringstream err;
err << "Incorrect file format version " << (int) version
<< " while reading k-mer graph from " << infilename
@@ -253,4 +262,24 @@ unsigned int Hashbits::consume_string_overlap(const std::string &s,
return n_consumed;
}
+// Merge `other` into this object by OR-ing the table bytes together.
+// Throws khmer_exception when the k sizes or the table sizes differ.
+void Hashbits::update_from(const Hashbits &other)
+{
+    if (_ksize != other._ksize) {
+        throw khmer_exception("both nodegraphs must have same k size");
+    }
+    if (_tablesizes != other._tablesizes) {
+        throw khmer_exception("both nodegraphs must have same table sizes");
+    }
+
+    for (unsigned int t = 0; t < _n_tables; t++) {
+        Byte * dest = _counts[t];
+        Byte * src = other._counts[t];
+        // Number of bytes combined for this table: tablesize / 8 + 1.
+        const HashIntoType n_bytes = _tablesizes[t] / 8 + 1;
+
+        for (HashIntoType b = 0; b < n_bytes; b++) {
+            dest[b] |= src[b];
+        }
+    }
+}
+
// vim: set sts=2 sw=2:
diff --git a/lib/hashbits.hh b/lib/hashbits.hh
index 894c526..3139650 100644
--- a/lib/hashbits.hh
+++ b/lib/hashbits.hh
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
@@ -246,6 +246,13 @@ public:
}
return 1;
}
+ // accessors to get table info
+ // n_entries() returns the size recorded for the first table,
+ // _tablesizes[0].
+ const HashIntoType n_entries() const
+ {
+ return _tablesizes[0];
+ }
+
+ void update_from(const Hashbits &other);
};
};
diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 726484e..cf4c2cb 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
@@ -193,24 +193,10 @@ void Hashtable::get_median_count(const std::string &s,
float &stddev)
{
std::vector<BoundedCounterType> counts;
- KMerIterator kmers(s.c_str(), _ksize);
-
- while(!kmers.done()) {
- HashIntoType kmer = kmers.next();
- BoundedCounterType count = this->get_count(kmer);
- counts.push_back(count);
- }
+ this->get_kmer_counts(s, counts);
if (!counts.size()) {
- throw khmer_exception();
- }
-
- if (!counts.size()) {
- median = 0;
- average = 0;
- stddev = 0;
-
- return;
+ throw khmer_exception("no k-mer counts for this string; too short?");
}
average = 0;
@@ -232,6 +218,42 @@ void Hashtable::get_median_count(const std::string &s,
median = counts[counts.size() / 2]; // rounds down
}
+//
+// Optimized filter function for normalize-by-median
+//
+// Returns true as soon as at least min_req of the k-mers in `s` have a
+// count >= cutoff, and false if the k-mers run out first. min_req is
+// 0.5 + (number of k-mers) / 2, truncated to an unsigned int.
+bool Hashtable::median_at_least(const std::string &s,
+ unsigned int cutoff)
+{
+ KMerIterator kmers(s.c_str(), _ksize);
+ // NOTE(review): s.size() is unsigned, so when s is shorter than _ksize
+ // the subtraction below wraps around instead of going negative, and the
+ // first loop then calls kmers.next() without a done() check. Confirm that
+ // callers never pass a sequence shorter than k, or add a guard here.
+ unsigned int min_req = 0.5 + float(s.size() - _ksize + 1) / 2;
+ unsigned int num_cutoff_kmers = 0;
+
+ // first loop:
+ // accumulate at least min_req worth of counts before checking to see
+ // if we have enough high-abundance k-mers to indicate success.
+ for (unsigned int i = 0; i < min_req; ++i) {
+ HashIntoType kmer = kmers.next();
+ if (this->get_count(kmer) >= cutoff) {
+ ++num_cutoff_kmers;
+ }
+ }
+
+ // second loop: now check to see if we pass the threshold for each k-mer.
+ // Early exit: every one of the first min_req k-mers met the cutoff.
+ if (num_cutoff_kmers >= min_req) {
+ return true;
+ }
+ // Keep consuming the remaining k-mers, stopping at the first point where
+ // the running tally reaches min_req.
+ while(!kmers.done()) {
+ HashIntoType kmer = kmers.next();
+ if (this->get_count(kmer) >= cutoff) {
+ ++num_cutoff_kmers;
+ if (num_cutoff_kmers >= min_req) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
void Hashtable::save_tagset(std::string outfilename)
{
ofstream outfile(outfilename.c_str(), ios::binary);
@@ -240,6 +262,7 @@ void Hashtable::save_tagset(std::string outfilename)
HashIntoType * buf = new HashIntoType[tagset_size];
+ outfile.write(SAVED_SIGNATURE, 4);
unsigned char version = SAVED_FORMAT_VERSION;
outfile.write((const char *) &version, 1);
@@ -291,15 +314,23 @@ void Hashtable::load_tagset(std::string infilename, bool clear_tags)
}
unsigned char version, ht_type;
+ char signature[4];
unsigned int save_ksize = 0;
size_t tagset_size = 0;
HashIntoType * buf = NULL;
try {
+ infile.read(signature, 4);
infile.read((char *) &version, 1);
infile.read((char *) &ht_type, 1);
- if (!(version == SAVED_FORMAT_VERSION)) {
+ if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
+ std::ostringstream err;
+ err << "Incorrect file signature " << signature
+ << " while reading tagset from " << infilename
+ << "; should be " << SAVED_SIGNATURE;
+ throw khmer_file_exception(err.str());
+ } else if (!(version == SAVED_FORMAT_VERSION)) {
std::ostringstream err;
err << "Incorrect file format version " << (int) version
<< " while reading tagset from " << infilename
@@ -1265,14 +1296,22 @@ void Hashtable::load_stop_tags(std::string infilename, bool clear_tags)
}
unsigned char version, ht_type;
+ char signature[4];
unsigned int save_ksize = 0;
size_t tagset_size = 0;
try {
+ infile.read(signature, 4);
infile.read((char *) &version, 1);
infile.read((char *) &ht_type, 1);
- if (!(version == SAVED_FORMAT_VERSION)) {
+ if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
+ std::ostringstream err;
+ err << "Incorrect file signature " << signature
+ << " while reading stoptags from " << infilename
+ << "; should be " << SAVED_SIGNATURE;
+ throw khmer_file_exception(err.str());
+ } else if (!(version == SAVED_FORMAT_VERSION)) {
std::ostringstream err;
err << "Incorrect file format version " << (int) version
<< " while reading stoptags from " << infilename
@@ -1315,6 +1354,7 @@ void Hashtable::save_stop_tags(std::string outfilename)
HashIntoType * buf = new HashIntoType[tagset_size];
+ outfile.write(SAVED_SIGNATURE, 4);
unsigned char version = SAVED_FORMAT_VERSION;
outfile.write((const char *) &version, 1);
@@ -1502,4 +1542,43 @@ void Hashtable::extract_unique_paths(std::string seq,
}
}
}
+
+
+// Append every k-length substring of `s` (forward strand, left to right)
+// to `kmers_vec`. Appends nothing when `s` is shorter than k.
+void Hashtable::get_kmers(const std::string &s,
+                          std::vector<std::string> &kmers_vec) const
+{
+    // A string shorter than k holds no k-mers; returning here also keeps
+    // the unsigned subtraction in the loop bound from wrapping around.
+    if (s.length() < _ksize) {
+        return;
+    }
+    for (unsigned int i = 0; i < s.length() - _ksize + 1; i++) {
+        // substr takes (position, length): the second argument must be k
+        // itself, not the end index i + k, or the substrings grow with i.
+        std::string sub = s.substr(i, _ksize);
+        kmers_vec.push_back(sub);
+    }
+}
+
+
+// Append to `kmers_vec` each value the KMerIterator yields for `s`,
+// in iteration order.
+void Hashtable::get_kmer_hashes(const std::string &s,
+                                std::vector<HashIntoType> &kmers_vec) const
+{
+    for (KMerIterator it(s.c_str(), _ksize); !it.done(); ) {
+        const HashIntoType hashed = it.next();
+        kmers_vec.push_back(hashed);
+    }
+}
+
+
+// Append to `counts` the get_count() result for each value the
+// KMerIterator yields for `s`, in iteration order.
+void Hashtable::get_kmer_counts(const std::string &s,
+                                std::vector<BoundedCounterType> &counts) const
+{
+    for (KMerIterator it(s.c_str(), _ksize); !it.done(); ) {
+        const HashIntoType hashed = it.next();
+        counts.push_back(this->get_count(hashed));
+    }
+}
+
// vim: set sts=2 sw=2:
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index dd0f521..d18be70 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
@@ -174,6 +174,7 @@ public:
class Hashtable // Base class implementation of a Bloom ht.
{
friend class SubsetPartition;
+ friend class LabelHash;
protected:
unsigned int _tag_density;
@@ -291,6 +292,9 @@ public:
unsigned long long &n_consumed
);
+ bool median_at_least(const std::string &s,
+ unsigned int cutoff);
+
void get_median_count(const std::string &s,
BoundedCounterType &median,
float &average,
@@ -388,6 +392,12 @@ public:
virtual BoundedCounterType test_and_set_bits(const char * kmer) = 0;
virtual BoundedCounterType test_and_set_bits(HashIntoType khash) = 0;
+ virtual std::vector<HashIntoType> get_tablesizes() const = 0;
+ virtual const size_t n_tables() const = 0;
+ virtual const HashIntoType n_occupied(HashIntoType start=0,
+ HashIntoType stop=0) const = 0;
+ virtual const HashIntoType n_entries() const = 0;
+
void filter_if_present(const std::string &infilename,
const std::string &outputfilename);
@@ -455,6 +465,18 @@ public:
return kmer_degree(kmer_f, kmer_r);
}
+
+ // return all k-mer substrings, on the forward strand.
+ void get_kmers(const std::string &s, std::vector<std::string> &kmers)
+ const;
+
+ // return hash values for all k-mer substrings
+ void get_kmer_hashes(const std::string &s,
+ std::vector<HashIntoType> &kmers) const;
+
+ // return counts of all k-mers in this string.
+ void get_kmer_counts(const std::string &s,
+ std::vector<BoundedCounterType> &counts) const;
};
};
diff --git a/lib/hllcounter.cc b/lib/hllcounter.cc
index 299e70e..91ac9f7 100644
--- a/lib/hllcounter.cc
+++ b/lib/hllcounter.cc
@@ -1,5 +1,5 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
// Copyright (C) Michigan State University, 2014-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
@@ -16,6 +16,7 @@
#include "khmer.hh"
#include "kmer_hash.hh"
#include "read_parsers.hh"
+#include "khmer_exception.hh"
#ifdef _OPENMP
#include <omp.h>
@@ -321,10 +322,15 @@ void HLLCounter::add(const std::string &value)
this->M[j] = std::max(this->M[j], get_rho(x >> this->p, 64 - this->p));
}
-unsigned int HLLCounter::consume_string(const std::string &s)
+unsigned int HLLCounter::consume_string(const std::string &inp)
{
unsigned int n_consumed = 0;
std::string kmer = "";
+ std::string s = inp;
+
+ for (unsigned int i = 0; i < s.length(); i++) {
+ s[i] &= 0xdf; // toupper - knock out the "lowercase bit"
+ }
for(std::string::const_iterator it = s.begin(); it != s.end(); ++it) {
kmer.push_back(*it);
@@ -455,6 +461,9 @@ bool HLLCounter::check_and_normalize_read(std::string &read) const
void HLLCounter::merge(HLLCounter &other)
{
+ if (this->p != other.p || this->_ksize != other._ksize) {
+ throw khmer_exception("HLLCounters to be merged must be created with same parameters");
+ }
for(unsigned int i=0; i < this->M.size(); ++i) {
this->M[i] = std::max(other.M[i], this->M[i]);
}
diff --git a/lib/hllcounter.hh b/lib/hllcounter.hh
index f314107..f03cff0 100644
--- a/lib/hllcounter.hh
+++ b/lib/hllcounter.hh
@@ -1,5 +1,5 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
// Copyright (C) Michigan State University, 2014-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
diff --git a/lib/ht-diff.cc b/lib/ht-diff.cc
index 85f342d..e6b9bed 100644
--- a/lib/ht-diff.cc
+++ b/lib/ht-diff.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/khmer.hh b/lib/khmer.hh
index 661b144..60928c4 100644
--- a/lib/khmer.hh
+++ b/lib/khmer.hh
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
@@ -53,12 +53,14 @@ private:\
# define CIRCUM_RADIUS 2 // @CTB remove
# define CIRCUM_MAX_VOL 200 // @CTB remove
+# define SAVED_SIGNATURE "OXLI"
# define SAVED_FORMAT_VERSION 4
# define SAVED_COUNTING_HT 1
# define SAVED_HASHBITS 2
# define SAVED_TAGS 3
# define SAVED_STOPTAGS 4
# define SAVED_SUBSET 5
+# define SAVED_LABELSET 6
# define VERBOSE_REPARTITION 0
@@ -104,11 +106,11 @@ PartitionCountDistribution;
// types used in @camillescott's sparse labeling extension
typedef unsigned long long int Label;
typedef std::multimap<HashIntoType, Label*> TagLabelPtrMap;
-typedef std::multimap<Label, HashIntoType*> LabelTagPtrMap;
+typedef std::multimap<Label, HashIntoType> LabelTagMap;
typedef std::pair<HashIntoType, Label*> TagLabelPtrPair;
-typedef std::pair<Label, HashIntoType*> LabelTagPtrPair;
+typedef std::pair<Label, HashIntoType> LabelTagPair;
typedef std::set<Label*> LabelPtrSet;
-typedef std::set<HashIntoType*> TagPtrSet;
+typedef std::set<HashIntoType> TagSet;
typedef std::map<Label, Label*> LabelPtrMap;
template <typename T>
diff --git a/lib/khmer_exception.hh b/lib/khmer_exception.hh
index e278335..95553df 100644
--- a/lib/khmer_exception.hh
+++ b/lib/khmer_exception.hh
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/kmer_hash.cc b/lib/kmer_hash.cc
index deacbf1..61c3741 100644
--- a/lib/kmer_hash.cc
+++ b/lib/kmer_hash.cc
@@ -1,5 +1,5 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
diff --git a/lib/kmer_hash.hh b/lib/kmer_hash.hh
index f9033a3..33ea909 100644
--- a/lib/kmer_hash.hh
+++ b/lib/kmer_hash.hh
@@ -1,5 +1,5 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
diff --git a/lib/labelhash.cc b/lib/labelhash.cc
index 1ba2414..b6366f5 100644
--- a/lib/labelhash.cc
+++ b/lib/labelhash.cc
@@ -1,15 +1,21 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
#include "labelhash.hh"
+#include <sstream>
+#include <errno.h>
+
+#define IO_BUF_SIZE 250*1000*1000
+
#define LABEL_DBG 0
#define printdbg(m) if(LABEL_DBG) std::cout << #m << std::endl;
+using namespace std;
using namespace khmer;
using namespace khmer:: read_parsers;
@@ -61,7 +67,7 @@ LabelHash::consume_fasta_and_tag_with_labels(
while (!parser->is_complete( )) {
read = parser->get_next_read( );
- if (check_and_normalize_read( read.sequence )) {
+ if (graph->check_and_normalize_read( read.sequence )) {
// TODO: make threadsafe!
unsigned long long this_n_consumed = 0;
the_label = check_and_allocate_label(_tag_label);
@@ -83,9 +89,10 @@ LabelHash::consume_fasta_and_tag_with_labels(
#if (0)
// run callback, if specified
if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) {
- std::cout << "n tags: " << all_tags.size() << "\n";
+ std::cout << "n tags: " << graph->all_tags.size() << "\n";
try {
- callback("consume_fasta_and_tag_with_labels", callback_data, total_reads_TL,
+ callback("consume_fasta_and_tag_with_labels", callback_data,
+ total_reads_TL,
n_consumed);
} catch (...) {
delete parser;
@@ -126,8 +133,9 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(
read = parser->get_next_read();
seq = read.sequence;
- if (check_and_normalize_read(seq)) {
- // First, figure out what the partition is (if non-zero), and save that.
+ if (graph->check_and_normalize_read(seq)) {
+ // First, figure out what the partition is (if non-zero), and
+ // save that.
printdbg(parsing partition id)
p = _parse_partition_id(read.name);
printdbg(checking label and allocating if necessary) {
@@ -168,7 +176,7 @@ void LabelHash::link_tag_and_label(HashIntoType& kmer, Label& kmer_label)
{
printdbg(linking tag and label)
tag_labels.insert(TagLabelPtrPair(kmer, &kmer_label));
- label_tag_ptrs.insert(LabelTagPtrPair(kmer_label, &kmer));
+ label_tag_ptrs.insert(LabelTagPair(kmer_label, kmer));
printdbg(done linking tag and label)
}
@@ -182,17 +190,17 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
bool kmer_tagged;
- KMerIterator kmers(seq.c_str(), _ksize);
+ KMerIterator kmers(seq.c_str(), graph->_ksize);
HashIntoType kmer;
- unsigned int since = _tag_density / 2 + 1;
+ unsigned int since = graph->_tag_density / 2 + 1;
printdbg(entering while loop)
while(!kmers.done()) {
kmer = kmers.next();
bool is_new_kmer;
- if ((is_new_kmer = test_and_set_bits( kmer ))) {
+ if ((is_new_kmer = graph->test_and_set_bits( kmer ))) {
++n_consumed;
printdbg(test_and_set_bits)
}
@@ -203,7 +211,7 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
} else {
printdbg(entering tag spin lock)
//ACQUIRE_ALL_TAGS_SPIN_LOCK
- kmer_tagged = set_contains(all_tags, kmer);
+ kmer_tagged = set_contains(graph->all_tags, kmer);
//RELEASE_ALL_TAGS_SPIN_LOCK
printdbg(released tag spin lock)
if (kmer_tagged) {
@@ -228,7 +236,7 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
}
}
#else
- if (!is_new_kmer && set_contains(all_tags, kmer)) {
+ if (!is_new_kmer && set_contains(graph->all_tags, kmer)) {
since = 1;
if (found_tags) {
found_tags->insert(kmer);
@@ -238,11 +246,12 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
}
#endif
//
- if (since >= _tag_density) {
- printdbg(exceeded tag density: drop a tag and label -- getting tag lock)
+ if (since >= graph->_tag_density) {
+ printdbg(exceeded tag density: drop a tag and label --
+ getting tag lock)
//ACQUIRE_ALL_TAGS_SPIN_LOCK
printdbg(in tag spin lock)
- all_tags.insert(kmer);
+ graph->all_tags.insert(kmer);
//RELEASE_ALL_TAGS_SPIN_LOCK
printdbg(released tag spin lock)
@@ -260,9 +269,9 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
printdbg(moving to next iter)
} // iteration over kmers
printdbg(finished iteration: dropping last tag)
- if (since >= _tag_density/2 - 1) {
+ if (since >= graph->_tag_density/2 - 1) {
//ACQUIRE_ALL_TAGS_SPIN_LOCK
- all_tags.insert(kmer); // insert the last k-mer, too.
+ graph->all_tags.insert(kmer); // insert the last k-mer, too.
//RELEASE_ALL_TAGS_SPIN_LOCK
// Label code: TODO: MAKE THREADSAFE!
@@ -284,12 +293,13 @@ unsigned int LabelHash::sweep_label_neighborhood(const std::string& seq,
SeenSet tagged_kmers;
unsigned int num_traversed;
- num_traversed = partition->sweep_for_tags(seq, tagged_kmers, all_tags,
+ num_traversed = graph->partition->sweep_for_tags(seq, tagged_kmers,
+ graph->all_tags,
range, break_on_stoptags, stop_big_traversals);
traverse_labels_and_resolve(tagged_kmers, found_labels);
//printf("range=%u ", range);
if (range == 0) {
- if (!(num_traversed == seq.length()-ksize()+1)) {
+ if (!(num_traversed == seq.length()-graph->ksize()+1)) {
throw khmer_exception();
}
}
@@ -305,14 +315,6 @@ LabelPtrSet LabelHash::get_tag_labels(const HashIntoType& tag)
return labels;
}
-TagPtrSet LabelHash::get_label_tags(const Label& label)
-{
- TagPtrSet tags;
- //unsigned int num_tags;
- _get_tags_from_label(label, label_tag_ptrs, tags);
- return tags;
-}
-
void LabelHash::traverse_labels_and_resolve(const SeenSet& tagged_kmers,
LabelPtrSet& found_labels)
{
@@ -336,3 +338,195 @@ LabelHash::~LabelHash()
delete itr->second;
}
}
+
+
+// Save the tag -> label associations held in tag_labels to disk.
+//
+// File layout (every value is written as its raw in-memory bytes):
+//   4 bytes        SAVED_SIGNATURE
+//   1 byte         SAVED_FORMAT_VERSION
+//   1 byte         SAVED_LABELSET (file type)
+//   unsigned int   k-mer size reported by the attached graph
+//   unsigned long  number of entries in tag_labels
+//   then, per entry, a HashIntoType tag followed by its Label value.
+//
+// Throws khmer_file_exception if the output stream is in a failed state
+// once all writes have been issued.
+
+void LabelHash::save_labels_and_tags(std::string filename)
+{
+ ofstream outfile(filename.c_str(), ios::binary);
+
+ // Header: signature, format version, file type.
+ outfile.write(SAVED_SIGNATURE, 4);
+ unsigned char version = SAVED_FORMAT_VERSION;
+ outfile.write((const char *) &version, 1);
+
+ unsigned char ht_type = SAVED_LABELSET;
+ outfile.write((const char *) &ht_type, 1);
+
+ unsigned int save_ksize = graph->ksize();
+ outfile.write((const char *) &save_ksize, sizeof(save_ksize));
+
+ // Record count, written with sizeof(unsigned long) bytes.
+ unsigned long n_labeltags = tag_labels.size();
+ outfile.write((const char *) &n_labeltags, sizeof(n_labeltags));
+
+ ///
+
+ // Staging buffer: records are packed here and written out in bulk.
+ char * buf = NULL;
+ buf = new char[IO_BUF_SIZE];
+ unsigned int n_bytes = 0;
+
+ // For each (tag, label) entry in tag_labels, pack the tag followed by
+ // the label value it points to.
+
+ TagLabelPtrMap::const_iterator pi = tag_labels.begin();
+ for (; pi != tag_labels.end(); ++pi) {
+ HashIntoType *k_p = (HashIntoType *) (buf + n_bytes);
+ *k_p = pi->first;
+ n_bytes += sizeof(HashIntoType);
+
+ Label * l_p = (Label *) (buf + n_bytes);
+ *l_p = *(pi->second);
+ n_bytes += sizeof(Label);
+
+ // flush to disk once there is no room left for another full record
+ if (n_bytes >= IO_BUF_SIZE - sizeof(HashIntoType) - sizeof(Label)) {
+ outfile.write(buf, n_bytes);
+ n_bytes = 0;
+ }
+ }
+ // save remainder.
+ if (n_bytes) {
+ outfile.write(buf, n_bytes);
+ }
+
+ // A single failure check covers every write issued above.
+ if (outfile.fail()) {
+ delete[] buf;
+ throw khmer_file_exception(strerror(errno));
+ }
+ outfile.close();
+
+ delete[] buf;
+}
+
+// Load tag/label associations from a file in the layout produced by
+// save_labels_and_tags().
+//
+// The header (signature, format version, file type, k-mer size) is checked
+// first; the k-mer size must equal that of the attached graph.  Records are
+// then read in IO_BUF_SIZE chunks; each tag is inserted into
+// graph->all_tags and linked to its label through
+// check_and_allocate_label() and link_tag_and_label().
+//
+// Throws khmer_file_exception if the file cannot be opened, the header does
+// not match, a read yields no data, the data ends in a partial record, or
+// the number of records read differs from the count stored in the header.
+//
+// NOTE(review): a file holding zero records appears to hit the "read
+// nothing" error path on the first chunk read -- confirm whether an empty
+// label set should load successfully.
+void LabelHash::load_labels_and_tags(std::string filename)
+{
+    ifstream infile;
+
+    // configure ifstream to raise exceptions for everything.
+    infile.exceptions(std::ifstream::failbit | std::ifstream::badbit);
+
+    try {
+        infile.open(filename.c_str(), ios::binary);
+    } catch (std::ifstream::failure &e) {
+        std::string err;
+        if (!infile.is_open()) {
+            err = "Cannot open labels/tags file: " + filename;
+        } else {
+            err = "Unknown error in opening file: " + filename;
+        }
+        throw khmer_file_exception(err);
+    }
+
+    unsigned long n_labeltags = 1;
+    try {
+        unsigned int save_ksize = 0;
+        char signature[4];
+        unsigned char version = 0, ht_type = 0;
+
+        infile.read(signature, 4);
+        infile.read((char *) &version, 1);
+        infile.read((char *) &ht_type, 1);
+
+        // signature has no NUL terminator, so it must only ever be used
+        // through a length-bounded std::string (streaming the raw array
+        // would read past its end).
+        const std::string found_signature(signature, 4);
+        if (!(found_signature == SAVED_SIGNATURE)) {
+            std::ostringstream err;
+            err << "Incorrect file signature " << found_signature
+                << " while reading labels/tags from " << filename
+                << " Should be: " << SAVED_SIGNATURE;
+            throw khmer_file_exception(err.str());
+        } else if (!(version == SAVED_FORMAT_VERSION)) {
+            std::ostringstream err;
+            err << "Incorrect file format version " << (int) version
+                << " while reading labels/tags from " << filename;
+            throw khmer_file_exception(err.str());
+        } else if (!(ht_type == SAVED_LABELSET)) {
+            std::ostringstream err;
+            err << "Incorrect file format type " << (int) ht_type
+                << " while reading labels/tags from " << filename;
+            throw khmer_file_exception(err.str());
+        }
+
+        infile.read((char *) &save_ksize, sizeof(save_ksize));
+        if (!(save_ksize == graph->ksize())) {
+            std::ostringstream err;
+            err << "Incorrect k-mer size " << save_ksize
+                << " while reading labels/tags from " << filename;
+            throw khmer_file_exception(err.str());
+        }
+
+        infile.read((char *) &n_labeltags, sizeof(n_labeltags));
+    } catch (std::ifstream::failure &e) {
+        std::string err;
+        err = "Unknown error reading header info from: " + filename;
+        throw khmer_file_exception(err);
+    }
+
+    char * buf = new char[IO_BUF_SIZE];
+
+    unsigned long loaded = 0;
+    // Bytes of a trailing partial record carried over to the next chunk.
+    long remainder = 0;
+
+    HashIntoType * kmer_p = NULL;
+    Label * labelp = NULL;
+
+    while (!infile.eof()) {
+        long i;
+
+        try {
+            infile.read(buf + remainder, IO_BUF_SIZE - remainder);
+        } catch (std::ifstream::failure &e) {
+
+            // We may get an exception here if we fail to read all the
+            // expected bytes due to EOF -- only pass it up if we read
+            // _nothing_.  Note that the while loop exits on EOF.
+
+            if (infile.gcount() == 0) {
+                delete[] buf;
+
+                std::string err;
+                err = "Unknown error reading data from: " + filename;
+                throw khmer_file_exception(err);
+            }
+        }
+
+        // Only whole (tag, label) records are consumed from this chunk.
+        long n_bytes = infile.gcount() + remainder;
+        remainder = n_bytes % (sizeof(Label) + sizeof(HashIntoType));
+        n_bytes -= remainder;
+
+        for (i = 0; i < n_bytes;) {
+            kmer_p = (HashIntoType *) (buf + i);
+            i += sizeof(HashIntoType);
+
+            labelp = (Label *) (buf + i);
+            i += sizeof(Label);
+
+            Label * labelp2;
+
+            graph->all_tags.insert(*kmer_p);
+            labelp2 = check_and_allocate_label(*labelp);
+            link_tag_and_label(*kmer_p, *labelp2);
+
+            loaded++;
+        }
+        if (!(i == n_bytes)) {
+            delete[] buf;
+            throw khmer_file_exception("unknown error reading labels and tags");
+        }
+        // Move the partial record to the front of the buffer.  memmove,
+        // not memcpy: source and destination coincide when n_bytes is 0.
+        memmove(buf, buf + n_bytes, remainder);
+    }
+
+    if (remainder != 0) {
+        delete[] buf;
+        throw khmer_file_exception("unknown error reading labels and tags");
+    }
+
+    if (loaded != n_labeltags) {
+        delete[] buf;
+        throw khmer_file_exception("error loading labels: too few loaded");
+    }
+
+    delete[] buf;
+}
diff --git a/lib/labelhash.hh b/lib/labelhash.hh
index 3f06631..d2f754f 100644
--- a/lib/labelhash.hh
+++ b/lib/labelhash.hh
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
@@ -18,7 +18,7 @@
namespace khmer
{
-class LabelHash : public khmer::Hashbits
+class LabelHash
{
protected:
// Does the given tag already have the given label?
@@ -37,14 +37,14 @@ protected:
}
// Does the given label already have a tag associated with it?
- bool _cmap_contains_tag(const LabelTagPtrMap& cmap,
+ bool _cmap_contains_tag(const LabelTagMap& cmap,
Label& the_label,
HashIntoType& kmer)
{
- std::pair<LabelTagPtrMap::const_iterator, LabelTagPtrMap::const_iterator> ret;
+ std::pair<LabelTagMap::const_iterator, LabelTagMap::const_iterator> ret;
ret = cmap.equal_range(the_label);
- for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
- if(*(it->second) == kmer) {
+ for (LabelTagMap::const_iterator it=ret.first; it!=ret.second; ++it) {
+ if(it->second == kmer) {
return true;
}
}
@@ -66,13 +66,13 @@ protected:
}
unsigned int _get_tags_from_label(const Label& label,
- const LabelTagPtrMap& cmap,
- TagPtrSet& labeled_tags)
+ const LabelTagMap& cmap,
+ TagSet& labeled_tags)
{
unsigned int num_tags = 0;
- std::pair<LabelTagPtrMap::const_iterator, LabelTagPtrMap::const_iterator> ret;
+ std::pair<LabelTagMap::const_iterator, LabelTagMap::const_iterator> ret;
ret = cmap.equal_range(label);
- for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
+ for (LabelTagMap::const_iterator it=ret.first; it!=ret.second; ++it) {
labeled_tags.insert(it->second);
++num_tags;
}
@@ -82,19 +82,18 @@ protected:
uint32_t _tag_labels_spin_lock;
public:
+ khmer::Hashtable * graph;
- LabelHash( WordLength ksize, std::vector<HashIntoType>& tablesizes)
- : khmer::Hashbits(ksize, tablesizes)
+ explicit LabelHash(Hashtable * ht) : graph(ht)
{
_tag_labels_spin_lock = 0;
- _all_tags_spin_lock = 0;
}
~LabelHash();
TagLabelPtrMap tag_labels;
- LabelTagPtrMap label_tag_ptrs;
+ LabelTagMap label_tag_ptrs;
LabelPtrMap label_ptrs;
size_t n_labels() const
@@ -140,7 +139,6 @@ public:
SeenSet * new_tags = 0);
LabelPtrSet get_tag_labels(const HashIntoType& tag);
- TagPtrSet get_label_tags(const Label& label);
void link_tag_and_label(HashIntoType& kmer, Label& label);
@@ -153,6 +151,9 @@ public:
void traverse_labels_and_resolve(const SeenSet& tagged_kmers,
LabelPtrSet& found_labels);
+ void save_labels_and_tags(std::string);
+ void load_labels_and_tags(std::string);
+
};
};
diff --git a/lib/magic b/lib/magic
new file mode 100644
index 0000000..1ab0707
--- /dev/null
+++ b/lib/magic
@@ -0,0 +1,11 @@
+# Magic descriptions of khmer file types for file(1) command.
+# Format is described in magic(5).
+
+0 string OXLI Binary from the khmer project,
+>4 byte x file format version %u,
+>5 byte 1 k-mer count table
+>5 byte 2 k-mer presence table
+>5 byte 3 k-mer tagset
+>5 byte 4 k-mer partition stoptags
+>5 byte 5 k-mer partition subset
+>5 byte 6 labels/tags for implicit k-mer De Bruijn graph
diff --git a/lib/perf_metrics.cc b/lib/perf_metrics.cc
index 374be0f..a0c348a 100644
--- a/lib/perf_metrics.cc
+++ b/lib/perf_metrics.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/perf_metrics.hh b/lib/perf_metrics.hh
index b45c052..63a0e49 100644
--- a/lib/perf_metrics.hh
+++ b/lib/perf_metrics.hh
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/primes.hh b/lib/primes.hh
index 938f4fa..ba806e4 100644
--- a/lib/primes.hh
+++ b/lib/primes.hh
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/read_aligner.cc b/lib/read_aligner.cc
index 9bf4343..569cc52 100644
--- a/lib/read_aligner.cc
+++ b/lib/read_aligner.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE. Contact: ctb at msu.edu
//
#include "read_aligner.hh"
diff --git a/lib/read_aligner.hh b/lib/read_aligner.hh
index 3d7e7a5..20b1654 100644
--- a/lib/read_aligner.hh
+++ b/lib/read_aligner.hh
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE. Contact: ctb at msu.edu
//
diff --git a/lib/read_parsers.cc b/lib/read_parsers.cc
index 6bb4b35..1e8506c 100644
--- a/lib/read_parsers.cc
+++ b/lib/read_parsers.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/read_parsers.hh b/lib/read_parsers.hh
index d030a0b..b8ceadf 100644
--- a/lib/read_parsers.hh
+++ b/lib/read_parsers.hh
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/subset.cc b/lib/subset.cc
index 339a2c9..c62ed39 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2014. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
@@ -11,6 +11,7 @@
#include <sstream>
#include <errno.h>
+#include <assert.h>
#define IO_BUF_SIZE 250*1000*1000
#define BIG_TRAVERSALS_ARE 200
@@ -1255,6 +1256,7 @@ void SubsetPartition::_merge_other(
void SubsetPartition::merge_from_disk(string other_filename)
{
ifstream infile;
+ unsigned long long expected_pmap_size;
// configure ifstream to raise exceptions for everything.
infile.exceptions(std::ifstream::failbit | std::ifstream::badbit);
@@ -1273,11 +1275,19 @@ void SubsetPartition::merge_from_disk(string other_filename)
try {
unsigned int save_ksize = 0;
+ char signature[4];
unsigned char version, ht_type;
+ infile.read(signature, 4);
infile.read((char *) &version, 1);
infile.read((char *) &ht_type, 1);
- if (!(version == SAVED_FORMAT_VERSION)) {
+ if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
+ std::ostringstream err;
+ err << "Incorrect file signature " << signature
+ << " while reading subset pmap from " << other_filename
+ << " Should be: " << SAVED_SIGNATURE;
+ throw khmer_file_exception(err.str());
+ } else if (!(version == SAVED_FORMAT_VERSION)) {
std::ostringstream err;
err << "Incorrect file format version " << (int) version
<< " while reading subset pmap from " << other_filename;
@@ -1296,6 +1306,8 @@ void SubsetPartition::merge_from_disk(string other_filename)
<< " while reading subset pmap from " << other_filename;
throw khmer_file_exception(err.str());
}
+
+ infile.read((char *) &expected_pmap_size, sizeof(expected_pmap_size));
} catch (std::ifstream::failure &e) {
std::string err;
err = "Unknown error reading header info from: " + other_filename;
@@ -1332,6 +1344,7 @@ void SubsetPartition::merge_from_disk(string other_filename)
// _nothing_. Note that the while loop exits on EOF.
if (infile.gcount() == 0) {
+ delete[] buf;
std::string err;
err = "Unknown error reading data from: " + other_filename;
throw khmer_file_exception(err);
@@ -1350,21 +1363,21 @@ void SubsetPartition::merge_from_disk(string other_filename)
diskp = (PartitionID *) (buf + i);
i += sizeof(PartitionID);
- if (!(*diskp != 0)) { // sanity check.
- throw khmer_exception();
- }
+ assert((*diskp != 0)); // sanity check!
_merge_other(*kmer_p, *diskp, diskp_to_pp);
loaded++;
}
- if (!(i == n_bytes)) {
- throw khmer_exception();
- }
+ assert(i == n_bytes);
memcpy(buf, buf + n_bytes, remainder);
}
-
delete[] buf;
+
+ if (loaded != expected_pmap_size) {
+ throw khmer_file_exception("error loading partitionmap - "
+ "invalid # of items");
+ }
}
// Save a partition map to disk.
@@ -1374,6 +1387,7 @@ void SubsetPartition::save_partitionmap(string pmap_filename)
ofstream outfile(pmap_filename.c_str(), ios::binary);
unsigned char version = SAVED_FORMAT_VERSION;
+ outfile.write(SAVED_SIGNATURE, 4);
outfile.write((const char *) &version, 1);
unsigned char ht_type = SAVED_SUBSET;
@@ -1382,6 +1396,9 @@ void SubsetPartition::save_partitionmap(string pmap_filename)
unsigned int save_ksize = _ht->ksize();
outfile.write((const char *) &save_ksize, sizeof(save_ksize));
+ unsigned long long pmap_size = partition_map.size();
+ outfile.write((const char *) &pmap_size, sizeof(pmap_size));
+
///
char * buf = NULL;
diff --git a/lib/subset.hh b/lib/subset.hh
index b1c06df..a7d053e 100644
--- a/lib/subset.hh
+++ b/lib/subset.hh
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2014. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/test-Colors.cc b/lib/test-Colors.cc
index 7e70309..92e14d4 100644
--- a/lib/test-Colors.cc
+++ b/lib/test-Colors.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2013-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/test-HashTables.cc b/lib/test-HashTables.cc
index dccf1ef..baa0fd2 100644
--- a/lib/test-HashTables.cc
+++ b/lib/test-HashTables.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/test-Parser.cc b/lib/test-Parser.cc
index 5d068cd..1424cef 100644
--- a/lib/test-Parser.cc
+++ b/lib/test-Parser.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/test-compile.cc b/lib/test-compile.cc
index cedfe4c..f0c4bb0 100644
--- a/lib/test-compile.cc
+++ b/lib/test-compile.cc
@@ -1,5 +1,5 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
// Copyright (C) Michigan State University, 2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
diff --git a/lib/trace_logger.cc b/lib/trace_logger.cc
index 06dd34a..9600d99 100644
--- a/lib/trace_logger.cc
+++ b/lib/trace_logger.cc
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/lib/trace_logger.hh b/lib/trace_logger.hh
index 54d57e1..9c24161 100644
--- a/lib/trace_logger.hh
+++ b/lib/trace_logger.hh
@@ -1,6 +1,6 @@
//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE.
// Contact: khmer-project at idyll.org
//
diff --git a/oxli/__init__.py b/oxli/__init__.py
new file mode 100755
index 0000000..742c4fc
--- /dev/null
+++ b/oxli/__init__.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+#
+# This file is a part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2015. It is licensed under the
+# three-clause BSD license; see doc/LICENSE.txt.
+# Contact: khmer-project at idyll.org
+#
+
+"""
+Single entry point script for khmer
+"""
+
+import argparse
+import textwrap
+from khmer import khmer_args
+from oxli import build_graph
+
+
def get_parser():
    """
    Build the top-level argument parser for the oxli subcommand handler.

    One sub-parser is registered per subcommand; at present the only one is
    ``build-graph`` (formerly load-graph).  Its options are contributed by
    ``khmer_args.build_hashbits_args`` and ``build_graph.build_parser``, and
    its ``func`` default is set to ``build_graph.main`` so that ``main()``
    can dispatch on ``args.func``.

    Returns the configured ``argparse.ArgumentParser``.
    """
    # Adjacent string literals are concatenated verbatim, so every fragment
    # that continues a sentence must carry its own separating space.
    summary = ("Load sequences into the compressible graph format "
               "plus optional tagset")

    parser = argparse.ArgumentParser(
        description='Single entry point script for khmer',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    subparsers = parser.add_subparsers()

    # build-graph (formerly load-graph) parsers here
    parser_build_graph = subparsers.add_parser('build-graph',
                                               help=summary,
                                               description=summary)

    khmer_args.build_hashbits_args(summary + ".", None,
                                   parser=parser_build_graph)
    build_graph.build_parser(parser_build_graph)
    parser_build_graph.set_defaults(func=build_graph.main)

    return parser
+
+
def main():
    """
    Parse the command line and run the selected subcommand.

    The handler is taken from ``args.func``, which each sub-parser sets via
    ``set_defaults``.  If no subcommand was given, no ``func`` attribute
    exists (sub-commands are optional on Python 3), so a usage error is
    reported instead of failing with an AttributeError.
    """
    parser = get_parser()
    args = parser.parse_args()
    if not hasattr(args, 'func'):
        # parser.error() prints the usage message and exits with status 2.
        parser.error('no subcommand given (try "build-graph")')
    args.func(args)


if __name__ == '__main__':
    main()
diff --git a/oxli/build_graph.py b/oxli/build_graph.py
new file mode 100644
index 0000000..43b5a33
--- /dev/null
+++ b/oxli/build_graph.py
@@ -0,0 +1,103 @@
+#! /usr/bin/env python
+#
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt.
+# Contact: khmer-project at idyll.org
+#
+# pylint: disable=invalid-name,missing-docstring
+"""
+Build a graph from the given sequences, save in <ptname>.
+
+% python scripts/load-graph.py <ptname> <data1> [ <data2> <...> ]
+
+Use '-h' for parameter help.
+"""
+
+from __future__ import print_function, absolute_import, unicode_literals
+
+import sys
+
+import khmer
+from khmer import khmer_args
+from khmer.khmer_args import (report_on_config, info, add_threading_args)
+from khmer.kfile import check_input_files, check_space
+from khmer.kfile import check_space_for_hashtable
+from oxli import functions
+
+
def build_parser(parser):
    """
    Register the build-graph command-line options on *parser*.

    The threading options are added first, followed by the tagset switch,
    the two positional filename arguments and the force flag.  The same
    parser object is returned.
    """
    add_threading_args(parser)

    # Both boolean switches are off unless given on the command line.
    switch = {'default': False, 'action': 'store_true'}

    parser.add_argument(
        '--no-build-tagset', '-n',
        dest='no_build_tagset',
        help='Do NOT construct tagset while loading sequences',
        **switch)
    parser.add_argument(
        'output_filename',
        metavar='output_presence_table_filename',
        help='output k-mer presence table filename.')
    parser.add_argument(
        'input_filenames',
        metavar='input_sequence_filename',
        nargs='+',
        help='input FAST[AQ] sequence filename')
    parser.add_argument(
        '-f', '--force',
        help='Overwrite output file if it exists',
        **switch)

    return parser
+
+
def main(args):
    """
    Build a nodegraph from the input sequence files and save it.

    *args* is the parsed argparse namespace; this function reads
    ``output_filename``, ``input_filenames``, ``force``, ``threads`` and
    ``no_build_tagset`` from it.

    Writes ``<output_filename>.pt``, ``<output_filename>.info`` and, unless
    ``no_build_tagset`` is set, ``<output_filename>.tagset``.  Progress is
    reported on stderr.  Always ends by calling ``sys.exit(0)``.
    """
    info('build-graph.py', ['graph', 'SeqAn'])

    report_on_config(args, hashtype='nodegraph')
    base = args.output_filename
    filenames = args.input_filenames

    # Pre-flight checks on the inputs and on available space; args.force is
    # handed to each helper (presumably to downgrade failures -- confirm).
    for fname in filenames:
        check_input_files(fname, args.force)

    check_space(filenames, args.force)
    check_space_for_hashtable(args, 'nodegraph', args.force)

    print('Saving k-mer presence table to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' %
          repr(filenames), file=sys.stderr)
    if args.no_build_tagset:
        print('We WILL NOT build the tagset.', file=sys.stderr)
    else:
        print('We WILL build the tagset (for partitioning/traversal).',
              file=sys.stderr)

    print('making nodegraph', file=sys.stderr)
    htable = khmer_args.create_nodegraph(args)

    functions.build_graph(filenames, htable, args.threads,
                          not args.no_build_tagset)

    print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()),
          file=sys.stderr)

    print('saving k-mer presence table in', base + '.pt', file=sys.stderr)
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print('saving tagset in', base + '.tagset', file=sys.stderr)
        htable.save_tagset(base + '.tagset')

    # The context manager guarantees the .info file is flushed and closed,
    # including when calc_expected_collisions() bails out.
    with open(base + '.info', 'w') as info_fp:
        info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

        fp_rate = khmer.calc_expected_collisions(htable, args.force,
                                                 max_false_pos=.15)
        # 0.18 is ACTUAL MAX. Do not change.

        print('false positive rate estimated to be %1.3f' % fp_rate,
              file=sys.stderr)
        print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
              file=info_fp)

    print('wrote to', base + '.info and', base + '.pt', file=sys.stderr)
    if not args.no_build_tagset:
        print('and ' + base + '.tagset', file=sys.stderr)

    sys.exit(0)
+
if __name__ == '__main__':
    # main() dereferences its argument (args.output_filename, args.force,
    # ...), so it cannot be called with None.  Build a stand-alone parser the
    # same way the oxli entry point does for its build-graph subcommand.
    import argparse

    _parser = argparse.ArgumentParser(
        description='Load sequences into the compressible graph format '
                    'plus optional tagset')
    khmer_args.build_hashbits_args(
        'Load sequences into the compressible graph format plus optional '
        'tagset.', None, parser=_parser)
    build_parser(_parser)
    main(_parser.parse_args())
+
+# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
diff --git a/oxli/functions.py b/oxli/functions.py
new file mode 100644
index 0000000..e429efd
--- /dev/null
+++ b/oxli/functions.py
@@ -0,0 +1,103 @@
+#
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt.
+# Contact: khmer-project at idyll.org
+#
+
+
+from collections import namedtuple
+import threading
+import math
+import khmer.utils
+
+
def estimate_optimal_with_N_and_M(N, M):
    """
    Estimate table parameters for N unique k-mers within a memory budget M.

    The number of tables is ln(2) * M / N truncated to an integer (but never
    less than 1); the table size is M divided by that count, truncated.

    Returns a ``result`` namedtuple with fields ``num_htables``,
    ``htable_size``, ``mem_use`` (tables * size, which may be slightly below
    the requested M) and ``fp_rate`` (the expected false positive rate).
    """
    result_type = namedtuple("result", ["num_htables", "htable_size",
                                        "mem_use", "fp_rate"])

    # int() truncates towards zero; fall back to a single table when the
    # ideal count rounds down to nothing.
    num_tables = int(math.log(2)*(M/float(N))) or 1
    table_size = int(M/num_tables)
    mem_used = table_size*num_tables
    false_pos = (1-math.exp(-N/float(table_size)))**num_tables

    return result_type(num_tables, table_size, mem_used, false_pos)
+
+
def estimate_optimal_with_N_and_f(N, f):
    """
    Estimate table parameters for N unique k-mers at a target fp rate f.

    The number of tables is log base 0.5 of f truncated to an integer (but
    never less than 1); the table size is then derived from N and f.

    Returns a ``result`` namedtuple with fields ``num_htables``,
    ``htable_size``, ``mem_use`` (tables * size) and ``fp_rate`` (the rate
    actually obtained with the truncated table size).
    """
    result_type = namedtuple("result", ["num_htables", "htable_size",
                                        "mem_use", "fp_rate"])

    # int() truncates towards zero; use one table when the ideal count is
    # below one.
    num_tables = int(math.log(f, 0.5)) or 1

    table_size = int(-N/(math.log(1-f**(1/float(num_tables)))))
    mem_needed = table_size * num_tables
    achieved_fp = (1-math.exp(-N/float(table_size)))**num_tables

    return result_type(num_tables, table_size, mem_needed, achieved_fp)
+
+
def optimal_args_output_gen(unique_kmers, fp_rate):
    """
    Build the report text used by the optimal-argument sandbox scripts.

    The report starts with the supplied k-mer count and false positive
    rate, then lists one table of settings for target false positive rates
    0.1 through 0.9 and one for a fixed list of memory budgets (each budget
    value is multiplied by 1e9 before estimating).

    Returns the report as a single newline-joined string.
    """
    fp_header = ('expected_fp\tnumber_hashtable(Z)\tsize_hashtable(H)\t'
                 'expected_memory_usage')
    mem_header = ('expected_memory_usage\tnumber_hashtable(Z)\t'
                  'size_hashtable(H)\texpected_fp')
    budgets = [1, 5, 10, 20, 50, 100, 200, 300, 400, 500, 1000, 2000, 5000]

    lines = [
        '',  # blank line
        'number of unique k-mers: \t{0}'.format(unique_kmers),
        'false positive rate: \t{:>.3f}'.format(fp_rate),
        '',  # blank line
        'If you have expected false positive rate to achieve:',
        fp_header,
    ]

    # Target false positive rates 0.1, 0.2, ..., 0.9.
    for tenths in range(1, 10):
        estimate = estimate_optimal_with_N_and_f(unique_kmers, tenths/10.0)
        tables, size, memory, rate = estimate
        lines.append('{:11.3f}\t{:19}\t{:17e}\t{:21e}'.format(
            rate, tables, size, memory))

    lines.extend([
        '',  # blank line
        'If you have expected memory to use:',
        mem_header,
    ])

    for budget in budgets:
        estimate = estimate_optimal_with_N_and_M(unique_kmers,
                                                 budget*1000000000)
        tables, size, memory, rate = estimate
        lines.append('{:21e}\t{:19}\t{:17e}\t{:11.3f}'.format(
            memory, tables, size, rate))

    return "\n".join(lines)
+
+
def build_graph(ifilenames, graph, num_threads=1, tags=False):
    """
    Load the sequences from each input file into *graph*.

    ifilenames  -- iterable of sequence file names, processed one at a time
    graph       -- object providing ``consume_fasta_with_reads_parser`` and
                   ``consume_fasta_and_tag_with_reads_parser``
    num_threads -- number of consumer threads started per file; if this is
                   less than 1 no thread is started and nothing is loaded
    tags        -- when true, use the tagging variant of the consume method

    Each file gets one ``khmer.ReadParser`` that is shared by all of that
    file's threads; the threads are joined before the next file is opened.
    """

    if tags:
        eat = graph.consume_fasta_and_tag_with_reads_parser
    else:
        eat = graph.consume_fasta_with_reads_parser

    for ifile in ifilenames:
        # NOTE(review): all threads pull from the same parser; presumably
        # ReadParser is safe for concurrent consumers -- confirm.
        rparser = khmer.ReadParser(ifile)
        threads = []

        for _ in range(num_threads):
            cur_thread = threading.Thread(target=eat, args=(rparser,))
            threads.append(cur_thread)
            cur_thread.start()

        for thread in threads:
            thread.join()
diff --git a/sandbox/README.rst b/sandbox/README.rst
index f50b1a7..48af420 100644
--- a/sandbox/README.rst
+++ b/sandbox/README.rst
@@ -18,10 +18,10 @@ Awaiting promotion to sandbox:
Scripts with recipes:
-* calc-median-distribution.py - plot coverage distribution; see `khmer-recipes #1 <https://github.com/ged-lab/khmer-recipes/tree/master/001-extract-reads-by-coverage>`__
-* collect-reads.py - subsample reads until a particular average coverage; see `khmer-recipes #2 <https://github.com/ged-lab/khmer-recipes/tree/master/002-collect-subset-of-high-coverage>`__
-* saturate-by-median.py - calculate collector's curve on shotgun sequencing; see `khmer-recipes #4 <https://github.com/ged-lab/khmer-recipes/tree/master/004-estimate-sequencing-saturation>`__
-* slice-reads-by-coverage.py - extract reads based on coverage; see `khmer-recipes #1 <https://github.com/ged-lab/khmer-recipes/tree/master/001-extract-reads-by-coverage>`__
+* calc-median-distribution.py - plot coverage distribution; see `khmer-recipes #1 <https://github.com/dib-lab/khmer-recipes/tree/master/001-extract-reads-by-coverage>`__
+* collect-reads.py - subsample reads until a particular average coverage; see `khmer-recipes #2 <https://github.com/dib-lab/khmer-recipes/tree/master/002-collect-subset-of-high-coverage>`__
+* saturate-by-median.py - calculate collector's curve on shotgun sequencing; see `khmer-recipes #4 <https://github.com/dib-lab/khmer-recipes/tree/master/004-estimate-sequencing-saturation>`__
+* slice-reads-by-coverage.py - extract reads based on coverage; see `khmer-recipes #1 <https://github.com/dib-lab/khmer-recipes/tree/master/001-extract-reads-by-coverage>`__
To keep, document, and build recipes for:
@@ -70,117 +70,117 @@ Good ideas to rewrite using newer tools/approaches:
Present in commit d295bc847 but removed thereafter:
-* `combine-pe.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/combine-pe.py>`__ - combine partitions based on shared PE reads.
-* `compare-partitions.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/compare-partitions.py>`__ - compare read membership in partitions.
-* `count-within-radius.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/count-within-radius.py>`__ - calculating graph density by position with seq
-* `degree-by-position.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/degree-by-position.py>`__ - calculating graph degree by position in seq
-* `dn-identify-errors.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/dn-identify-errors.py>`__ - prototype script to identify errors in reads based on diginorm principles
-* `ec.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/ec.py>`__ - new error correction foo
-* `error-correct-pass2.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/error-correct-pass2.py>`__ - new error correction foo
-* `find-unpart.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/find-unpart.py>`__ - something to do with finding unpartitioned sequences
-* `normalize-by-align.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/normalize-by-align.py>`__ - new error correction foo
-* `read_aligner.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/read_aligner.py>`__ - new error correction foo
-* `shuffle-fasta.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/shuffle-fasta.py>`__ - FASTA file shuffler for small FASTA files
-* `to-casava-1.8-fastq.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/to-casava-1.8-fastq.py>`__ - convert reads to different Casava format
-* `uniqify-sequences.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/uniqify-sequences.py>`__ - print out paths that are unique in the graph
-* `write-interleave.py <https://github.com/ged-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/write-interleave.py>`__ - is this used by any protocol etc?
+* `combine-pe.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/combine-pe.py>`__ - combine partitions based on shared PE reads.
+* `compare-partitions.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/compare-partitions.py>`__ - compare read membership in partitions.
+* `count-within-radius.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/count-within-radius.py>`__ - calculating graph density by position with seq
+* `degree-by-position.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/degree-by-position.py>`__ - calculating graph degree by position in seq
+* `dn-identify-errors.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/dn-identify-errors.py>`__ - prototype script to identify errors in reads based on diginorm principles
+* `ec.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/ec.py>`__ - new error correction foo
+* `error-correct-pass2.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/error-correct-pass2.py>`__ - new error correction foo
+* `find-unpart.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/find-unpart.py>`__ - something to do with finding unpartitioned sequences
+* `normalize-by-align.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/normalize-by-align.py>`__ - new error correction foo
+* `read_aligner.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/read_aligner.py>`__ - new error correction foo
+* `shuffle-fasta.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/shuffle-fasta.py>`__ - FASTA file shuffler for small FASTA files
+* `to-casava-1.8-fastq.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/to-casava-1.8-fastq.py>`__ - convert reads to different Casava format
+* `uniqify-sequences.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/uniqify-sequences.py>`__ - print out paths that are unique in the graph
+* `write-interleave.py <https://github.com/dib-lab/khmer/blob/d295bc8477022e8c34649f131a2abe333a891d3d/sandbox/write-interleave.py>`__ - is this used by any protocol etc?
Present in commit 691b0b3ae but removed thereafter:
-* `annotate-with-median-count.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/annotate-with-median-count.py>`__ - replaced by count-median.py
-* `assemble-individual-partitions.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/assemble-individual-partitions.py>`__ - better done with parallel
-* `assemstats.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/assemstats.py>`__ - statistics gathering; see assemstats3.
-* `assemstats2.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/assemstats2.py>`__ - statistics gathering; see assemstats3.
-* `abund-ablate-reads.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/abund-ablate-reads.py>`__ - trim reads of high abundance k-mers.
-* `bench-graphsize-orig.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/bench-graphsize-orig.py>`__ - benchmarking script for graphsize elimination
-* `bench-graphsize-th.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/bench-graphsize-th.py>`__ - benchmarking script for graphsize elimination
-* `bin-reads-by-abundance.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/bin-reads-by-abundance.py>`__ - see slice-reads-by-coverage.py
-* `bowtie-parser.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/bowtie-parser.py>`__ - parse bowtie map file
-* `calc-degree.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-degree.py>`__ - various k-mer statistics
-* `calc-kmer-partition-counts.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-kmer-partition-counts.py>`__ - various k-mer statistics
-* `calc-kmer-read-abunds.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-kmer-read-abunds.py>`__ - various k-mer statistics
-* `calc-kmer-read-stats.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-kmer-read-stats.py>`__ - various k-mer statistics
-* `calc-kmer-to-partition-ratio.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-kmer-to-partition-ratio.py>`__ - various k-mer statistics
-* `calc-sequence-entropy.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-sequence-entropy.py>`__ - calculate per-sequence entropy
-* `choose-largest-assembly.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/choose-largest-assembly.py>`__ - see calc-best-assembly.py
-* `consume-and-traverse.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/consume-and-traverse.py>`__ - replaced by load-graph.py
-* `contig-coverage.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/contig-coverage.py>`__ - calculate coverage of contigs by k-mers
-* `count-circum-by-position.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/count-circum-by-position.py>`__ - k-mer graph statistics by position within read
-* `count-density-by-position.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/count-density-by-position.py>`__ - k-mer graph stats by position within read
-* `count-distance-to-volume.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/count-distance-to-volume.py>`__ - k-mer stats from graph
-* `count-median-abund-by-partition.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/count-median-abund-by-partition.py>`__ - count median k-mer abundance by partition;
-* `count-shared-kmers-btw-assemblies.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/count-shared-kmers-btw-assemblies.py>`__ - count shared k-mers between assemblies;
-* `ctb-iterative-bench-2-old.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/ctb-iterative-bench-2-old.py>`__ - old benchmarking code
-* `ctb-iterative-bench.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/ctb-iterative-bench.py>`__ - old benchmarking code
-* `discard-high-abund.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/discard-high-abund.py>`__ - discard reads by coverage; see slice-reads-by-coverage.py
-* `discard-pre-high-abund.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/discard-pre-high-abund.py>`__ - discard reads by coverage; see slice-reads-by-coverage.py
-* `do-intertable-part.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-intertable-part.py>`__ - unused partitioning method
-* `do-partition-2.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-partition-2.py>`__ - replaced by scripts/do-partition.py
-* `do-partition-stop.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-partition-stop.py>`__ - replaced by scripts/do-partition.py
-* `do-partition.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-partition.py>`__ - moved to scripts/
-* `do-subset-merge.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-subset-merge.py>`__ - replaced by scripts/merge-partitions.py
-* `do-th-subset-calc.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-th-subset-calc.py>`__ - unused benchmarking scripts
-* `do-th-subset-load.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-th-subset-load.py>`__ - unused benchmarking scripts
-* `do-th-subset-save.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-th-subset-save.py>`__ - unused benchmarking scripts
-* `extract-surrender.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/extract-surrender.py>`__ - no longer used partitioning feature
-* `extract-with-median-count.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/extract-with-median-count.py>`__ - see slice-reads-by-coverage.py
-* `fasta-to-fastq.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/fasta-to-fastq.py>`__ - just a bad idea
-* `filter-above-median.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-above-median.py>`__ - replaced by filter-below-abund.py
-* `filter-abund-output-by-length.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-abund-output-by-length.py>`__ - replaced by filter-abund/filter-below-abund
-* `filter-area.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-area.py>`__ - trim highly connected k-mers
-* `filter-degree.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-degree.py>`__ - trim highly connected k-mers
-* `filter-density-explosion.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-density-explosion.py>`__ - trim highly connected k-mers
-* `filter-if-present.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-if-present.py>`__ - replaced by filter-abund and others
-* `filter-max255.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-max255.py>`__ - remove reads w/high-abundance k-mers.
-* `filter-min2-multi.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-min2-multi.py>`__ - remove reads w/low-abundance k-mers
-* `filter-sodd.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-sodd.py>`__ - no longer used partitioning feature
-* `filter-subsets-by-partsize.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-subsets-by-partsize.py>`__ - deprecated way to filter out partitions by size
-* `get-occupancy.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/get-occupancy.py>`__ - utility script no longer needed
-* `get-occupancy2.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/get-occupancy2.py>`__ - utility script no longer needed
-* `graph-partition-separate.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/graph-partition-separate.py>`__ - deprecated graph partitioning stuff
-* `graph-size-circum-trim.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/graph-size-circum-trim.py>`__ - experimental mods to graph-size.py
-* `graph-size-degree-trim.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/graph-size-degree-trim.py>`__ - experimental mods to graph-size.py
-* `graph-size-py.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/graph-size-py.py>`__ - experimental mods to graph-size.py
-* `join_pe.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/join_pe.py>`__ - silly attempts to deal with PE interleaving?
-* `keep-stoptags.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/keep-stoptags.py>`__ - trim at stoptags
-* `label-pairs.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/label-pairs.py>`__ - deprecated PE fixing script
-* `length-dist.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/length-dist.py>`__ - deprecated length distribution calc script
-* `load-ht-and-tags.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/load-ht-and-tags.py>`__ - load and examine hashtable & tags
-* `multi-abyss.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/multi-abyss.py>`__ - better done with parallel
-* `make-coverage-by-position-for-node.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/make-coverage-by-position-for-node.py>`__ - deprecated coverage calculation
-* `make-coverage-histogram.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/make-coverage-histogram.py>`__ - build coverage histograms
-* `make-random.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/make-random.py>`__ - make random DNA; see dbg-graph-null project.
-* `make-read-stats.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/make-read-stats.py>`__ - see readstats.py
-* `multi-stats.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/multi-stats.py>`__ - see readstats.py
-* `multi-velvet.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/multi-velvet.py>`__ - better done with parallel
-* `normalize-by-min.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/normalize-by-min.py>`__ - normalize by min k-mer abundance in seq; just a bad idea
-* `occupy.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/occupy.py>`__ - no longer needed utility script
-* `parse-bowtie-pe.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/parse-bowtie-pe.py>`__ - no longer needed utility script
-* `parse-stats.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/parse-stats.py>`__ - partition stats
-* `partition-by-contig.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/partition-by-contig.py>`__ - various approaches to partitioning
-* `partition-by-contig2.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/partition-by-contig2.py>`__ - various approaches to partitioning
-* `partition-size-dist-running.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/partition-size-dist-running.py>`__ - various approaches to partitioning
-* `partition-size-dist.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/partition-size-dist.py>`__ - various approaches to partitioning
-* `path-compare-to-vectors.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/path-compare-to-vectors.py>`__ - ??
-* `print-exact-abund-kmer.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/print-exact-abund-kmer.py>`__ - ??
-* `print-high-density-kmers.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/print-high-density-kmers.py>`__ - display high abundance k-mers
-* `quality-trim-pe.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/quality-trim-pe.py>`__ - no longer needed utility script
-* `quality-trim.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/quality-trim.py>`__ - no longer needed utility script
-* `reformat.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/reformat.py>`__ - FASTA sequence description line reformatter for partitioned files
-* `remove-N.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/remove-N.py>`__ - eliminate sequences that have Ns in them
-* `softmask-high-abund.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/softmask-high-abund.py>`__ - softmask high abundance sequences (convert ACGT to acgt)
-* `split-fasta-on-circum.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-circum.py>`__ - various ways of breaking sequences on graph properties
-* `split-fasta-on-circum2.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-circum2.py>`__ - various ways of breaking sequences on graph properties
-* `split-fasta-on-circum3.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-circum3.py>`__ - various ways of breaking sequences on graph properties
-* `split-fasta-on-circum4.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-circum4.py>`__ - various ways of breaking sequences on graph properties
-* `split-fasta-on-degree-th.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-degree-th.py>`__ - various ways of breaking sequences on graph properties
-* `split-fasta-on-degree.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-degree.py>`__ - various ways of breaking sequences on graph properties
-* `split-fasta-on-density.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-density.py>`__ - various ways of breaking sequences on graph properties
-* `split-N.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-N.py>`__ - truncate sequences on N
-* `split-reads-on-median-diff.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-reads-on-median-diff.py>`__ - various ways of breaking sequences on graph properties
-* `summarize.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/summarize.py>`__ - sequence stats calculator
-* `sweep_perf.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/sweep_perf.py>`__ - benchmarking tool
-* `test_scripts.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/test_scripts.py>`__ - old test file
-* `traverse-contigs.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/traverse-contigs.py>`__ - deprecated graph traversal stuff
-* `traverse-from-reads.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/traverse-from-reads.py>`__ - deprecated graph traversal stuff
-* `validate-partitioning.py <https://github.com/ged-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/validate-partitioning.py>`__ - unneeded test
+* `annotate-with-median-count.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/annotate-with-median-count.py>`__ - replaced by count-median.py
+* `assemble-individual-partitions.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/assemble-individual-partitions.py>`__ - better done with parallel
+* `assemstats.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/assemstats.py>`__ - statistics gathering; see assemstats3.
+* `assemstats2.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/assemstats2.py>`__ - statistics gathering; see assemstats3.
+* `abund-ablate-reads.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/abund-ablate-reads.py>`__ - trim reads of high abundance k-mers.
+* `bench-graphsize-orig.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/bench-graphsize-orig.py>`__ - benchmarking script for graphsize elimination
+* `bench-graphsize-th.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/bench-graphsize-th.py>`__ - benchmarking script for graphsize elimination
+* `bin-reads-by-abundance.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/bin-reads-by-abundance.py>`__ - see slice-reads-by-coverage.py
+* `bowtie-parser.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/bowtie-parser.py>`__ - parse bowtie map file
+* `calc-degree.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-degree.py>`__ - various k-mer statistics
+* `calc-kmer-partition-counts.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-kmer-partition-counts.py>`__ - various k-mer statistics
+* `calc-kmer-read-abunds.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-kmer-read-abunds.py>`__ - various k-mer statistics
+* `calc-kmer-read-stats.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-kmer-read-stats.py>`__ - various k-mer statistics
+* `calc-kmer-to-partition-ratio.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-kmer-to-partition-ratio.py>`__ - various k-mer statistics
+* `calc-sequence-entropy.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/calc-sequence-entropy.py>`__ - calculate per-sequence entropy
+* `choose-largest-assembly.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/choose-largest-assembly.py>`__ - see calc-best-assembly.py
+* `consume-and-traverse.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/consume-and-traverse.py>`__ - replaced by load-graph.py
+* `contig-coverage.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/contig-coverage.py>`__ - calculate coverage of contigs by k-mers
+* `count-circum-by-position.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/count-circum-by-position.py>`__ - k-mer graph statistics by position within read
+* `count-density-by-position.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/count-density-by-position.py>`__ - k-mer graph stats by position within read
+* `count-distance-to-volume.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/count-distance-to-volume.py>`__ - k-mer stats from graph
+* `count-median-abund-by-partition.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/count-median-abund-by-partition.py>`__ - count median k-mer abundance by partition
+* `count-shared-kmers-btw-assemblies.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/count-shared-kmers-btw-assemblies.py>`__ - count shared k-mers between assemblies
+* `ctb-iterative-bench-2-old.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/ctb-iterative-bench-2-old.py>`__ - old benchmarking code
+* `ctb-iterative-bench.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/ctb-iterative-bench.py>`__ - old benchmarking code
+* `discard-high-abund.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/discard-high-abund.py>`__ - discard reads by coverage; see slice-reads-by-coverage.py
+* `discard-pre-high-abund.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/discard-pre-high-abund.py>`__ - discard reads by coverage; see slice-reads-by-coverage.py
+* `do-intertable-part.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-intertable-part.py>`__ - unused partitioning method
+* `do-partition-2.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-partition-2.py>`__ - replaced by scripts/do-partition.py
+* `do-partition-stop.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-partition-stop.py>`__ - replaced by scripts/do-partition.py
+* `do-partition.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-partition.py>`__ - moved to scripts/
+* `do-subset-merge.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-subset-merge.py>`__ - replaced by scripts/merge-partitions.py
+* `do-th-subset-calc.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-th-subset-calc.py>`__ - unused benchmarking scripts
+* `do-th-subset-load.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-th-subset-load.py>`__ - unused benchmarking scripts
+* `do-th-subset-save.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/do-th-subset-save.py>`__ - unused benchmarking scripts
+* `extract-surrender.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/extract-surrender.py>`__ - no longer used partitioning feature
+* `extract-with-median-count.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/extract-with-median-count.py>`__ - see slice-reads-by-coverage.py
+* `fasta-to-fastq.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/fasta-to-fastq.py>`__ - just a bad idea
+* `filter-above-median.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-above-median.py>`__ - replaced by filter-below-abund.py
+* `filter-abund-output-by-length.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-abund-output-by-length.py>`__ - replaced by filter-abund/filter-below-abund
+* `filter-area.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-area.py>`__ - trim highly connected k-mers
+* `filter-degree.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-degree.py>`__ - trim highly connected k-mers
+* `filter-density-explosion.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-density-explosion.py>`__ - trim highly connected k-mers
+* `filter-if-present.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-if-present.py>`__ - replaced by filter-abund and others
+* `filter-max255.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-max255.py>`__ - remove reads w/high-abundance k-mers.
+* `filter-min2-multi.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-min2-multi.py>`__ - remove reads w/low-abundance k-mers
+* `filter-sodd.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-sodd.py>`__ - no longer used partitioning feature
+* `filter-subsets-by-partsize.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/filter-subsets-by-partsize.py>`__ - deprecated way to filter out partitions by size
+* `get-occupancy.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/get-occupancy.py>`__ - utility script no longer needed
+* `get-occupancy2.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/get-occupancy2.py>`__ - utility script no longer needed
+* `graph-partition-separate.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/graph-partition-separate.py>`__ - deprecated graph partitioning stuff
+* `graph-size-circum-trim.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/graph-size-circum-trim.py>`__ - experimental mods to graph-size.py
+* `graph-size-degree-trim.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/graph-size-degree-trim.py>`__ - experimental mods to graph-size.py
+* `graph-size-py.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/graph-size-py.py>`__ - experimental mods to graph-size.py
+* `join_pe.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/join_pe.py>`__ - silly attempts to deal with PE interleaving?
+* `keep-stoptags.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/keep-stoptags.py>`__ - trim at stoptags
+* `label-pairs.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/label-pairs.py>`__ - deprecated PE fixing script
+* `length-dist.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/length-dist.py>`__ - deprecated length distribution calc script
+* `load-ht-and-tags.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/load-ht-and-tags.py>`__ - load and examine hashtable & tags
+* `multi-abyss.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/multi-abyss.py>`__ - better done with parallel
+* `make-coverage-by-position-for-node.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/make-coverage-by-position-for-node.py>`__ - deprecated coverage calculation
+* `make-coverage-histogram.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/make-coverage-histogram.py>`__ - build coverage histograms
+* `make-random.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/make-random.py>`__ - make random DNA; see dbg-graph-null project.
+* `make-read-stats.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/make-read-stats.py>`__ - see readstats.py
+* `multi-stats.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/multi-stats.py>`__ - see readstats.py
+* `multi-velvet.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/multi-velvet.py>`__ - better done with parallel
+* `normalize-by-min.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/normalize-by-min.py>`__ - normalize by min k-mer abundance in seq; just a bad idea
+* `occupy.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/occupy.py>`__ - no longer needed utility script
+* `parse-bowtie-pe.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/parse-bowtie-pe.py>`__ - no longer needed utility script
+* `parse-stats.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/parse-stats.py>`__ - partition stats
+* `partition-by-contig.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/partition-by-contig.py>`__ - various approaches to partitioning
+* `partition-by-contig2.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/partition-by-contig2.py>`__ - various approaches to partitioning
+* `partition-size-dist-running.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/partition-size-dist-running.py>`__ - various approaches to partitioning
+* `partition-size-dist.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/partition-size-dist.py>`__ - various approaches to partitioning
+* `path-compare-to-vectors.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/path-compare-to-vectors.py>`__ - ??
+* `print-exact-abund-kmer.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/print-exact-abund-kmer.py>`__ - ??
+* `print-high-density-kmers.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/print-high-density-kmers.py>`__ - display high abundance k-mers
+* `quality-trim-pe.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/quality-trim-pe.py>`__ - no longer needed utility script
+* `quality-trim.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/quality-trim.py>`__ - no longer needed utility script
+* `reformat.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/reformat.py>`__ - FASTA sequence description line reformatter for partitioned files
+* `remove-N.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/remove-N.py>`__ - eliminate sequences that have Ns in them
+* `softmask-high-abund.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/softmask-high-abund.py>`__ - softmask high abundance sequences (convert ACGT to acgt)
+* `split-fasta-on-circum.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-circum.py>`__ - various ways of breaking sequences on graph properties
+* `split-fasta-on-circum2.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-circum2.py>`__ - various ways of breaking sequences on graph properties
+* `split-fasta-on-circum3.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-circum3.py>`__ - various ways of breaking sequences on graph properties
+* `split-fasta-on-circum4.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-circum4.py>`__ - various ways of breaking sequences on graph properties
+* `split-fasta-on-degree-th.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-degree-th.py>`__ - various ways of breaking sequences on graph properties
+* `split-fasta-on-degree.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-degree.py>`__ - various ways of breaking sequences on graph properties
+* `split-fasta-on-density.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-fasta-on-density.py>`__ - various ways of breaking sequences on graph properties
+* `split-N.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-N.py>`__ - truncate sequences on N
+* `split-reads-on-median-diff.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/split-reads-on-median-diff.py>`__ - various ways of breaking sequences on graph properties
+* `summarize.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/summarize.py>`__ - sequence stats calculator
+* `sweep_perf.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/sweep_perf.py>`__ - benchmarking tool
+* `test_scripts.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/test_scripts.py>`__ - old test file
+* `traverse-contigs.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/traverse-contigs.py>`__ - deprecated graph traversal stuff
+* `traverse-from-reads.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/traverse-from-reads.py>`__ - deprecated graph traversal stuff
+* `validate-partitioning.py <https://github.com/dib-lab/khmer/tree/691b0b3aefe83e9e8f5f2b80f5f9516664a4654a/sandbox/validate-partitioning.py>`__ - unneeded test
diff --git a/sandbox/abundance-hist-by-position.py b/sandbox/abundance-hist-by-position.py
index 4ba6c3b..5a2125f 100755
--- a/sandbox/abundance-hist-by-position.py
+++ b/sandbox/abundance-hist-by-position.py
@@ -1,10 +1,12 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import division
+from __future__ import print_function
import sys
def main():
@@ -14,11 +16,11 @@ def main():
freqfile = sys.argv[1]
- print >>sys.stderr, 'opening .freq file:', freqfile
+ print('opening .freq file:', freqfile, file=sys.stderr)
fd = open(freqfile)
for n, line in enumerate(fd):
if n % 100000 == 0:
- print >>sys.stderr, '...', n
+ print('...', n, file=sys.stderr)
tok = line.split()
@@ -26,7 +28,7 @@ def main():
countSum[i] += int(tok[i])
countN[i] += 1
- print >>sys.stderr, 'summarizing.'
+ print('summarizing.', file=sys.stderr)
y = [0.0] * len(countSum)
for i in range(len(countSum)):
@@ -34,7 +36,7 @@ def main():
y[i] = float(countSum[i]) / float(countN[i])
for n, i in enumerate(y):
- print n, i
+ print(n, i)
if __name__ == '__main__':
main()
diff --git a/sandbox/assembly-diff-2.py b/sandbox/assembly-diff-2.py
index c81bea7..1d39969 100755
--- a/sandbox/assembly-diff-2.py
+++ b/sandbox/assembly-diff-2.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import sys
import khmer
import screed
@@ -23,24 +24,24 @@ def main():
filename2 = sys.argv[2]
uniq2 = open(os.path.basename(sys.argv[2]) + '.uniq', 'w')
- kh = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
+ kh = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
for n, record in enumerate(screed.open(filename1)):
if n % 10000 == 0:
- print '...', filename1, n
- seq = record.sequence.upper().replace('N', 'G')
+ print('...', filename1, n)
+ seq = record.sequence.upper().replace('N', 'A')
kh.consume(seq)
path_n = 0
for n, record in enumerate(screed.open(filename2)):
if n % 10000 == 0:
- print '...', filename2, n
- seq = record.sequence.upper().replace('N', 'G')
+ print('...', filename2, n)
+ seq = record.sequence.upper().replace('N', 'A')
paths = kh.extract_unique_paths(seq, UNIQUE_LEN, UNIQUE_F)
kh.consume(seq)
for path in paths:
path_n += 1
- print >>uniq2, '>%s from:%s\n%s' % (path_n, record.name, path)
+ print('>%s from:%s\n%s' % (path_n, record.name, path), file=uniq2)
if __name__ == '__main__':
diff --git a/sandbox/assembly-diff.py b/sandbox/assembly-diff.py
index 189187c..e3fdee0 100755
--- a/sandbox/assembly-diff.py
+++ b/sandbox/assembly-diff.py
@@ -1,10 +1,12 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import division
+from __future__ import print_function
import sys
import khmer
import screed
@@ -24,9 +26,9 @@ def main():
uniq2 = open(os.path.basename(sys.argv[2]) + '.uniq', 'w')
paths = sys.argv[3]
- kh1 = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
+ kh1 = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
kh1.consume_fasta(filename1)
- kh2 = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
+ kh2 = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
kh2.consume_fasta(filename2)
for record in screed.open(paths):
@@ -59,9 +61,9 @@ def main():
present2 = False
if present1 and not present2:
- print >>uniq1, '>%s\n%s' % (record.name, record.sequence)
+ print('>%s\n%s' % (record.name, record.sequence), file=uniq1)
elif present2 and not present1:
- print >>uniq2, '>%s\n%s' % (record.name, record.sequence)
+ print('>%s\n%s' % (record.name, record.sequence), file=uniq2)
if __name__ == '__main__':
diff --git a/sandbox/assemstats3.py b/sandbox/assemstats3.py
index a085128..3b7ae42 100755
--- a/sandbox/assemstats3.py
+++ b/sandbox/assemstats3.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -21,6 +21,8 @@ Once completed, you should be able to run this script as is.
Author: Jason Pell (pelljaso at cse.msu.edu)
'''
+from __future__ import division
+from __future__ import print_function
import screed
import sys
@@ -89,23 +91,23 @@ def main():
totalSum = 0
if len(sys.argv) < 3:
- print "Usage: python assemstats.py <min contig length> [ FASTA files ]"
+ print("Usage: python assemstats.py <min contig length> [ FASTA files ]")
return
try:
minLen = int(sys.argv[1])
except ValueError:
- print "Minimum contig length must be an integer."
+ print("Minimum contig length must be an integer.")
return
- print '** cutoff:', minLen
- print "N\tsum\tmax\tfilename"
+ print('** cutoff:', minLen)
+ print("N\tsum\tmax\tfilename")
for filename in sys.argv[2:]:
if not os.path.exists(filename):
- print >>sys.stderr, "WARNING: file %s does not exist." % filename
+ print("WARNING: file %s does not exist." % filename, file=sys.stderr)
continue
-
+
lens = getLens(filename)
trimmedLens = trimLens(lens, minLen)
@@ -121,12 +123,12 @@ def main():
totalN += statTrimmedN
totalSum += statSum
- print "%d\t%d\t%d\t%s" % (statTrimmedN, statSum, statMax, filename)
+ print("%d\t%d\t%d\t%s" % (statTrimmedN, statSum, statMax, filename))
if len(sys.argv) > 3 and totalN:
- print '--'
- print 'TOTAL: %g in %d contigs (mean size %d)' % (
- totalSum, totalN, totalSum / float(totalN) + .5)
+ print('--')
+ print('TOTAL: %g in %d contigs (mean size %d)' % (
+ totalSum, totalN, totalSum / totalN + .5))
main()
diff --git a/sandbox/bloom-count-intersection.py b/sandbox/bloom-count-intersection.py
index d6dbc36..71405d4 100755
--- a/sandbox/bloom-count-intersection.py
+++ b/sandbox/bloom-count-intersection.py
@@ -1,7 +1,8 @@
-#! /usr/bin/env python2
+from __future__ import print_function
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -19,7 +20,7 @@ def main():
HT_SIZE = int(sys.argv[3]) # size of hashtable
N_HT = int(sys.argv[4]) # number of hashtables
- ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht = khmer.Hashbits(K, HT_SIZE, N_HT)
n_unique = 0
for n, record in enumerate(fasta_iter(open(filename))):
@@ -30,12 +31,12 @@ def main():
if (not ht.get(kmer)):
n_unique += 1
ht.count(kmer)
- print filename, 'has been consumed.'
- print '# of unique kmers:', n_unique
- print '# of occupied bin:', ht.n_occupied()
+ print(filename, 'has been consumed.')
+ print('# of unique kmers:', n_unique)
+ print('# of occupied bin:', ht.n_occupied())
filename2 = sys.argv[5]
- ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht2 = khmer.Hashbits(K, HT_SIZE, N_HT)
n_unique = 0
n_overlap = 0
for n, record in enumerate(fasta_iter(open(filename2))):
@@ -49,11 +50,11 @@ def main():
n_overlap += 1
ht2.count(kmer)
- print filename2, 'has been consumed.'
- print '# of unique kmers:', n_unique
- print '# of occupied bin:', ht2.n_occupied()
+ print(filename2, 'has been consumed.')
+ print('# of unique kmers:', n_unique)
+ print('# of occupied bin:', ht2.n_occupied())
- print n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2
+ print(n_overlap, 'unique kmers appears in both ', filename, ' and ', filename2)
if __name__ == '__main__':
diff --git a/sandbox/bloom-count.py b/sandbox/bloom-count.py
index 19a3c80..fc833cc 100755
--- a/sandbox/bloom-count.py
+++ b/sandbox/bloom-count.py
@@ -1,7 +1,8 @@
-#! /usr/bin/env python2
+from __future__ import print_function
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -19,7 +20,7 @@ def main():
HT_SIZE = int(sys.argv[3]) # size of hashtable
N_HT = int(sys.argv[4]) # number of hashtables
- ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht = khmer.Hashbits(K, HT_SIZE, N_HT)
n_unique = 0
for n, record in enumerate(fasta_iter(open(filename))):
@@ -31,9 +32,9 @@ def main():
n_unique += 1
ht.count(kmer)
- print n_unique
- print ht.n_occupied()
- print ht.n_unique_kmers()
+ print(n_unique)
+ print(ht.n_occupied())
+ print(ht.n_unique_kmers())
if __name__ == '__main__':
diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py
index 4941603..6686d73 100755
--- a/sandbox/build-sparse-graph.py
+++ b/sandbox/build-sparse-graph.py
@@ -1,7 +1,8 @@
-#! /usr/bin/env python2
+from __future__ import print_function
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2013-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -19,7 +20,7 @@ def main():
K = int(sys.argv[1])
x = float(sys.argv[2])
- ht = khmer.new_hashbits(K, x, 4)
+ ht = khmer.Hashbits(K, x, 4)
sparse_graph = gt.Graph()
hashes = sparse_graph.new_vertex_property("long long")
@@ -27,13 +28,13 @@ def main():
for n, record in enumerate(screed.open(input_fasta)):
if n % 1000 == 0:
- print >>sys.stderr, '...loaded and tagged {} sequences'.format(n)
+ print('...loaded and tagged {} sequences'.format(n), file=sys.stderr)
name = record.name
sequence = record.sequence
ht.consume_sequence_and_tag_with_labels(sequence, n)
tags = ht.sweep_tag_neighborhood(sequence, 0)
- for i in xrange(len(tags) - 1):
+ for i in range(len(tags) - 1):
src = tags[i]
dst = tags[i + 1]
@@ -58,7 +59,7 @@ def main():
if new:
e = sparse_graph.add_edge(srcv, dstv)
- print 'Sparse graph has {} nodes, {} edges'.format(sparse_graph.num_vertices(), sparse_graph.num_edges())
+ print('Sparse graph has {} nodes, {} edges'.format(sparse_graph.num_vertices(), sparse_graph.num_edges()))
comp = gt.label_largest_component(sparse_graph, directed=False)
#pos = gt.radial_tree_layout(sparse_graph, sparse_graph.vertex(0))
gt.graph_draw(sparse_graph, output_size=(
diff --git a/sandbox/calc-best-assembly.py b/sandbox/calc-best-assembly.py
index 9582e44..931a24d 100755
--- a/sandbox/calc-best-assembly.py
+++ b/sandbox/calc-best-assembly.py
@@ -1,9 +1,10 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE. Contact: ctb at msu.edu
#
+from __future__ import print_function
import screed
import argparse
import sys
@@ -34,31 +35,31 @@ def main():
try:
total = calculate_bp_above_cutoff(filename, args.cutoff)
except IOError:
- print >>sys.stderr, "** WARNING: %s does not exist, skipping" %\
- filename
+ print("** WARNING: %s does not exist, skipping" %\
+ filename, file=sys.stderr)
continue
stats.append((total, filename))
if not args.quiet:
- print >>sys.stderr, "assembly %s has %d bp > %d" % (filename,
+ print("assembly %s has %d bp > %d" % (filename,
total,
- args.cutoff)
+ args.cutoff), file=sys.stderr)
stats.sort(reverse=True)
best_total, winner_file = stats[0]
- print >>sys.stderr, '----'
- print >>sys.stderr, "assembly %s wins: %d total bp > %d" % (winner_file,
+ print('----', file=sys.stderr)
+ print("assembly %s wins: %d total bp > %d" % (winner_file,
best_total,
- args.cutoff)
+ args.cutoff), file=sys.stderr)
if args.output_file:
for record in screed.open(winner_file, parse_description=False):
- print >>args.output_file, '>%s\n%s' % (record.name,
- record.sequence)
+ print('>%s\n%s' % (record.name,
+ record.sequence), file=args.output_file)
- print winner_file
+ print(winner_file)
if __name__ == '__main__':
main()
diff --git a/sandbox/calc-error-profile.py b/sandbox/calc-error-profile.py
index f2ffddf..2cceb30 100755
--- a/sandbox/calc-error-profile.py
+++ b/sandbox/calc-error-profile.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This script is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This script is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE. Contact: ctb at msu.edu
#
@@ -12,6 +12,8 @@ reads. The output is placed in <infile>.errhist in the cwd by default.
Reads FASTQ and FASTA input.
"""
+from __future__ import division
+from __future__ import print_function
import sys
import argparse
@@ -65,7 +67,7 @@ def main():
# build a small counting hash w/default parameters. In general there
# should be no need to change these parameters.
- ht = khmer.new_counting_hash(K, HASHSIZE, N_HT)
+ ht = khmer.CountingHash(K, HASHSIZE, N_HT)
# initialize list to contain counts of errors by position
positions = [0] * MAX_SEQ_LEN
@@ -80,13 +82,13 @@ def main():
# look for errors.
total = 0
for filename in args.filenames:
- print >>sys.stderr, 'opening', filename
+ print('opening', filename, file=sys.stderr)
for n, record in enumerate(screed.open(filename)):
total += 1
if total % CHECK_EXIT == 0:
- print >>sys.stderr, '...', total, n_consumed, n_checked
+ print('...', total, n_consumed, n_checked, file=sys.stderr)
# two exit conditions: first, have we hit our max reads limit?
if total >= MAX_READS:
@@ -111,8 +113,8 @@ def main():
lengths.append(len(seq))
if args.errors_per_read:
- print >>args.errors_per_read, record.name, \
- ",".join(map(str, posns))
+ print(record.name, \
+ ",".join(map(str, posns)), file=args.errors_per_read)
# track the positions => errors
for p in posns:
@@ -131,24 +133,24 @@ def main():
# write!
output_file.write('position error_count error_fraction\n')
for n, i in enumerate(positions[:max_length]):
- print >>output_file, n, i, float(i) / float(length_count[n])
+ print(n, i, float(i) / float(length_count[n]), file=output_file)
output_file.close()
- print >>sys.stderr, ''
- print >>sys.stderr, 'total sequences:', total
- print >>sys.stderr, 'n consumed:', n_consumed
- print >>sys.stderr, 'n checked:', n_checked
- print >>sys.stderr, 'bp consumed:', bp_consumed, bp_consumed / float(C)
- print >>sys.stderr, 'error rate: %.2f%%' % \
- (100.0 * sum(positions) / float(sum(lengths)))
+ print('', file=sys.stderr)
+ print('total sequences:', total, file=sys.stderr)
+ print('n consumed:', n_consumed, file=sys.stderr)
+ print('n checked:', n_checked, file=sys.stderr)
+ print('bp consumed:', bp_consumed, bp_consumed / float(C), file=sys.stderr)
+ print('error rate: %.2f%%' % \
+ (100.0 * sum(positions) / float(sum(lengths))), file=sys.stderr)
- print >>sys.stderr, 'Error histogram is in %s' % output_filename
+ print('Error histogram is in %s' % output_filename, file=sys.stderr)
if not exit_condition(n_consumed, n_checked):
- print >>sys.stderr, ""
- print >>sys.stderr, "** WARNING: not enough reads to get a good result"
- print >>sys.stderr, "** Is this high diversity sample / small subset?"
+ print("", file=sys.stderr)
+ print("** WARNING: not enough reads to get a good result", file=sys.stderr)
+ print("** Is this high diversity sample / small subset?", file=sys.stderr)
sys.exit(-1)
diff --git a/sandbox/calc-median-distribution.py b/sandbox/calc-median-distribution.py
index c6ef1d0..a67a99c 100755
--- a/sandbox/calc-median-distribution.py
+++ b/sandbox/calc-median-distribution.py
@@ -1,10 +1,12 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import division
+from __future__ import print_function
import sys
import khmer
import argparse
@@ -27,7 +29,7 @@ def main():
outfp = open(histout, 'w')
- print 'hashtable from', hashfile
+ print('hashtable from', hashfile)
ht = khmer.load_counting_hash(hashfile)
hist = {}
@@ -37,7 +39,7 @@ def main():
for n, record in enumerate(screed.open(seqfile)):
if n > 0 and n % 100000 == 0:
- print '...', n
+ print('...', n)
seq = record.sequence.replace('N', 'A')
diff --git a/sandbox/collect-reads.py b/sandbox/collect-reads.py
index ca16608..f02c0ea 100755
--- a/sandbox/collect-reads.py
+++ b/sandbox/collect-reads.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2014-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -14,10 +14,13 @@ Place reads into -o output_file.
Use '-h' for parameter help.
"""
+from __future__ import division
+from __future__ import print_function
import sys
import textwrap
import khmer
+from khmer import khmer_args
from khmer.khmer_args import build_counting_args, report_on_config, info
from khmer.kfile import check_input_files, check_space
from khmer.kfile import check_space_for_hashtable
@@ -74,15 +77,15 @@ def main():
check_input_files(name, False)
check_space(args.input_sequence_filename, False)
- check_space_for_hashtable(args.n_tables * args.min_tablesize, False)
+ check_space_for_hashtable(args, 'countgraph', False)
- print 'Saving k-mer counting table to %s' % base
- print 'Loading sequences from %s' % repr(filenames)
+ print('Saving k-mer counting table to %s' % base)
+ print('Loading sequences from %s' % repr(filenames))
if args.output:
- print 'Outputting sequences to', args.output
+ print('Outputting sequences to', args.output)
- print 'making k-mer counting table'
- htable = khmer.new_counting_hash(args.ksize, args.min_tablesize)
+ print('making countgraph', file=sys.stderr)
+ htable = khmer_args.create_countgraph(args)
htable.set_use_bigcount(args.bigcount)
total_coverage = 0.
@@ -92,7 +95,7 @@ def main():
for record in screed.open(filename):
seq = record.sequence.upper()
if 'N' in seq:
- seq = seq.replace('N', 'G')
+ seq = seq.replace('N', 'A')
try:
med, _, _ = htable.get_median_count(seq)
@@ -103,8 +106,8 @@ def main():
n += 1
if total_coverage / float(n) > args.coverage:
- print 'reached target average coverage:', \
- total_coverage / float(n)
+ print('reached target average coverage:', \
+ total_coverage / float(n))
break
htable.consume(seq)
@@ -112,29 +115,30 @@ def main():
args.output.write(output_single(record))
if n % 100000 == 0:
- print '...', index, filename, n, total_coverage / float(n)
+ print('...', index, filename, n, total_coverage / float(n))
if total_coverage / float(n) > args.coverage:
break
- print 'Collected %d reads' % (n,)
+ print('Collected %d reads' % (n,))
if args.report_total_kmers:
- print >> sys.stderr, 'Total number of k-mers: {0}'.format(
- htable.n_occupied())
+ print('Total number of k-mers: {0}'.format(
+ htable.n_occupied()), file=sys.stderr)
- print 'saving', base
+ print('saving', base)
htable.save(base)
info_fp = open(base + '.info', 'w')
info_fp.write('through end: %s\n' % filenames[-1])
# Change 0.2 only if you really grok it. HINT: You don't.
- fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2)
- print 'fp rate estimated to be %1.3f' % fp_rate
- print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate
+ fp_rate = khmer.calc_expected_collisions(htable, False,
+ max_false_pos=.2)
+ print('fp rate estimated to be %1.3f' % fp_rate)
+ print('fp rate estimated to be %1.3f' % fp_rate, file=info_fp)
- print 'DONE.'
+ print('DONE.')
if __name__ == '__main__':
main()
diff --git a/sandbox/collect-variants.py b/sandbox/collect-variants.py
index 3a3669b..db368f0 100755
--- a/sandbox/collect-variants.py
+++ b/sandbox/collect-variants.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2013-2014. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2013-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -12,6 +12,7 @@ Use '-h' for parameter help.
TODO: add to sandbox README
"""
+from __future__ import print_function
import sys
import screed
@@ -41,16 +42,16 @@ def main():
args = parser.parse_args()
if not args.quiet:
- print >>sys.stderr, '\nPARAMETERS:'
- print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
- print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_tables
- print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
- args.min_tablesize
- print >>sys.stderr, ''
- print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
+ print('\nPARAMETERS:', file=sys.stderr)
+ print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
+ print(' - n hashes = %d \t\t(-N)' % args.n_tables, file=sys.stderr)
+ print(' - min hashsize = %-5.2g \t(-x)' % \
+ args.min_tablesize, file=sys.stderr)
+ print('', file=sys.stderr)
+ print('Estimated memory usage is %.2g bytes ' \
'(n_hashes x min_hashsize)' % \
- (args.n_tables * args.min_tablesize)
- print >>sys.stderr, '-' * 8
+ (args.n_tables * args.min_tablesize), file=sys.stderr)
+ print('-' * 8, file=sys.stderr)
K = args.ksize
HT_SIZE = args.min_tablesize
@@ -59,11 +60,11 @@ def main():
filenames = args.input_filenames
if args.loadhash:
- print 'loading hashtable from', args.loadhash
+ print('loading hashtable from', args.loadhash)
ht = khmer.load_counting_hash(args.loadhash)
else:
- print 'making hashtable'
- ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
+ print('making hashtable')
+ ht = khmer.CountingHash(K, HT_SIZE, N_HT)
aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta)
@@ -80,9 +81,9 @@ def main():
for n, record in enumerate(screed.open(input_filename)):
if n > 0 and n % 10000 == 0:
- print '... kept', total - discarded, 'of', total, ', or', \
- int(100. - discarded / float(total) * 100.), '%'
- print '... in file', input_filename
+ print('... kept', total - discarded, 'of', total, ', or', \
+ int(100. - discarded / float(total) * 100.), '%')
+ print('... in file', input_filename)
total += 1
@@ -133,19 +134,19 @@ def main():
discarded += 1
if total:
- print 'DONE with', input_filename, \
+ print('DONE with', input_filename, \
'; kept', total - discarded, 'of', total, 'or', \
- int(100. - discarded / float(total) * 100.), '%'
- print 'output in', output_name
+ int(100. - discarded / float(total) * 100.), '%')
+ print('output in', output_name)
if args.savehash:
- print 'Saving hashfile through', input_filename
- print '...saving to', args.savehash
+ print('Saving hashfile through', input_filename)
+ print('...saving to', args.savehash)
ht.save(args.savehash)
# Change 0.2 only if you really grok it. HINT: You don't.
fp_rate = khmer.calc_expected_collisions(ht, args.force, max_false_pos=.2)
- print 'fp rate estimated to be %1.3f' % fp_rate
+ print('fp rate estimated to be %1.3f' % fp_rate)
if __name__ == '__main__':
diff --git a/sandbox/correct-errors.py b/sandbox/correct-errors.py
index 83b6ec6..71c6890 100755
--- a/sandbox/correct-errors.py
+++ b/sandbox/correct-errors.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2014. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -18,6 +18,7 @@ TODO: move output_single elsewhere
TODO: add to sandbox/README
TODO: change name to correct-reads?
"""
+from __future__ import print_function
import sys
import screed
import os
@@ -90,13 +91,13 @@ def main():
NORMALIZE_LIMIT = args.normalize_to
- print 'making hashtable'
- ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
+ print('making hashtable')
+ ht = khmer.CountingHash(K, HT_SIZE, N_HT)
aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)
tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
- print 'created temporary directory %s; use -T to change location' % tempdir
+ print('created temporary directory %s; use -T to change location' % tempdir)
###
@@ -120,8 +121,8 @@ def main():
total_reads += 1
if n % 10000 == 0:
- print '...', n, filename, n_aligned, n_corrected, save_pass2, \
- total_reads
+ print('...', n, filename, n_aligned, n_corrected, save_pass2, \
+ total_reads)
seq = read.sequence.replace('N', 'A')
# build the alignment...
@@ -165,18 +166,18 @@ def main():
pass2fp.close()
corrfp.close()
- print '%s: kept aside %d of %d from first pass, in %s' % \
- (filename, save_pass2, n, filename)
- print 'aligned %d of %d reads so far' % (n_aligned, total_reads)
- print 'changed %d of %d reads so far' % (n_corrected, total_reads)
+ print('%s: kept aside %d of %d from first pass, in %s' % \
+ (filename, save_pass2, n, filename))
+ print('aligned %d of %d reads so far' % (n_aligned, total_reads))
+ print('changed %d of %d reads so far' % (n_corrected, total_reads))
for orig_filename, pass2filename, corrfilename in pass2list:
- print 'second pass: looking at sequences kept aside in %s' % \
- pass2filename
+ print('second pass: looking at sequences kept aside in %s' % \
+ pass2filename)
for n, read in enumerate(screed.open(pass2filename)):
if n % 10000 == 0:
- print '... x 2', n, pass2filename, n_aligned, n_corrected, \
- total_reads
+ print('... x 2', n, pass2filename, n_aligned, n_corrected, \
+ total_reads)
corrfp = open(corrfilename, 'a')
@@ -205,14 +206,14 @@ def main():
corrfp.write(output_single(read, corrected))
- print 'removing %s' % pass2filename
+ print('removing %s' % pass2filename)
os.unlink(pass2filename)
- print 'removing temp directory & contents (%s)' % tempdir
+ print('removing temp directory & contents (%s)' % tempdir)
shutil.rmtree(tempdir)
- print 'Aligned %d of %d total' % (n_aligned, total_reads)
- print 'Changed %d of %d total' % (n_corrected, total_reads)
+ print('Aligned %d of %d total' % (n_aligned, total_reads))
+ print('Changed %d of %d total' % (n_corrected, total_reads))
if __name__ == '__main__':
main()
diff --git a/sandbox/estimate_optimal_hash.py b/sandbox/estimate_optimal_hash.py
new file mode 100755
index 0000000..63e4932
--- /dev/null
+++ b/sandbox/estimate_optimal_hash.py
@@ -0,0 +1,109 @@
+#! /usr/bin/env python2
+#
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt.
+# Contact: khmer-project at idyll.org
+#
+# pylint: disable=missing-docstring,invalid-name
+"""
+Estimate optimal choice of hash table parameters
+
+First scenario: we know the number of k-mers N and the size of memory
+available to use M. We want to know the optimal number of hash table Z
+to get the lowest false positive rate.
+
+For this scenario, use only "-N" and "-M".
+% sandbox/estimate_optimal_hash.py -N <number_of_kmers> -M <size_of_memory>
+
+Second scenario: we know the number of k-mers N and the desired maximum
+false positive rate f. We want to know the minimum memory usage required
+to achieve f.
+
+For this scenario, use only "-N" and "-f".
+% sandbox/estimate_optimal_hash.py -N <number_of_kmers> -f <desired_fpr>
+
+Use '-h' for parameter help.
+
+"""
+from __future__ import print_function
+import argparse
+import khmer, oxli
+from khmer.khmer_args import info
+from oxli.functions import estimate_optimal_with_N_and_M
+from oxli.functions import estimate_optimal_with_N_and_f
+import textwrap
+import sys
+
+def get_parser():
+    """Build the argparse parser: -N, one of -M / -f, and --version."""
+    epilog = """
+
+    First scenario: we know the number of k-mers N and the size of memory
+    available to use M. We want to know the optimal number of hash table Z
+    to get the lowest false positive rate.
+
+    For this scenario, use only "-N" and "-M".
+    % sandbox/estimate_optimal_hash.py -N <number_of_kmers> -M <size_of_memory>
+
+    Second scenario: we know the number of k-mers N and the desired maximum
+    false positive rate f. We want to know the minimum memory usage required
+    to achieve f.
+
+    For this scenario, use only "-N" and "-f".
+    % sandbox/estimate_optimal_hash.py -N <number_of_kmers> -f <desired_fpr>
+
+    """
+    parser = argparse.ArgumentParser(
+        description='Estimate optimal choice of hash table parameters',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=textwrap.dedent(epilog))
+    parser.add_argument('-N', help='number of estimated distinct k-mers',
+                        type=int)
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('-M', help='size of memory available to use',
+                       type=int)
+    group.add_argument('-f', help='desired maximum false positive rate',
+                       type=float)
+    parser.add_argument('--version', action='version',
+                        version='%(prog)s ' + khmer.__version__)
+    return parser
+
+
+def main():
+    """Print optimal hash table parameters for a k-mer count to stderr.
+
+    Needs -N plus one of -M (memory budget) or -f (target false positive
+    rate); any other combination ends in a usage error.
+    """
+    info('estimate_optimal_hash.py', ['counting'])
+    parser = get_parser()  # built once and reused for error reporting
+    args = parser.parse_args()
+    if args.N is None:
+        # -N is declared optional; stop here instead of handing None on.
+        parser.error('No k-mer count given, add -N (number of estimated '
+                     'distinct k-mers)')
+
+    # Truthiness is kept from the original checks: 0 counts as "not given".
+    if args.M:
+        result = estimate_optimal_with_N_and_M(args.N, args.M)
+        label, value = "size of memory available to use: ", args.M
+    elif args.f:
+        result = estimate_optimal_with_N_and_f(args.N, args.f)
+        label, value = "desired maximum false positive rate: ", args.f
+    else:
+        parser.error('No action requested, add -M (size of memory available '
+                     'to use) or -f (desired maximum false positive rate)')
+
+    print("number of estimated distinct k-mers: ", args.N, file=sys.stderr)
+    print(label, value, file=sys.stderr)
+    print("optimal number of hash tables: ", result.num_htables,
+          file=sys.stderr)
+    print("optimal size of hash tables: ", result.htable_size,
+          file=sys.stderr)
+    print("estimated false positive rate: ", result.fp_rate,
+          file=sys.stderr)
+    print("estimated usage of memory: ", result.mem_use, file=sys.stderr)
+
+if __name__ == '__main__':
+ main()
diff --git a/sandbox/extract-single-partition.py b/sandbox/extract-single-partition.py
index 0e890da..ccc0f28 100755
--- a/sandbox/extract-single-partition.py
+++ b/sandbox/extract-single-partition.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import sys
from screed.fasta import fasta_iter
@@ -21,7 +22,7 @@ def main():
count = 0
for n, name, pid, seq in read_partition_file(open(sys.argv[1])):
if pid == select_pid:
- print '>%s\t%d\n%s' % (name, pid, seq)
+ print('>%s\t%d\n%s' % (name, pid, seq))
count += 1
if n % 10000 == 0:
diff --git a/sandbox/fasta-to-abundance-hist.py b/sandbox/fasta-to-abundance-hist.py
index d101f16..b4776cf 100755
--- a/sandbox/fasta-to-abundance-hist.py
+++ b/sandbox/fasta-to-abundance-hist.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import sys
import khmer
@@ -16,13 +17,11 @@ def main():
n_consumed = len(files) * [0]
n_seq_kept = len(files) * [0]
- print 'loading ht'
- ht = khmer.new_counting_hash(1, 1, 1)
-
- ht.load(sys.argv[1])
+ print('loading ht')
+ ht = khmer.load_counting_hash(sys.argv[1])
for i, infile in enumerate(files):
- print 'outputting', infile + '.freq'
+ print('outputting', infile + '.freq')
ht.output_fasta_kmer_pos_freq(infile, infile + ".freq")
diff --git a/sandbox/filter-below-abund.py b/sandbox/filter-below-abund.py
index 9b4ef76..e46ddee 100755
--- a/sandbox/filter-below-abund.py
+++ b/sandbox/filter-below-abund.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import sys
import os
import khmer
@@ -22,17 +23,17 @@ def main():
counting_ht = sys.argv[1]
infiles = sys.argv[2:]
- print 'file with ht: %s' % counting_ht
- print '-- settings:'
- print 'N THREADS', WORKER_THREADS
- print '--'
+ print('file with ht: %s' % counting_ht)
+ print('-- settings:')
+ print('N THREADS', WORKER_THREADS)
+ print('--')
- print 'making hashtable'
+ print('making hashtable')
ht = khmer.load_counting_hash(counting_ht)
K = ht.ksize()
for infile in infiles:
- print 'filtering', infile
+ print('filtering', infile)
outfile = os.path.basename(infile) + '.below'
outfp = open(outfile, 'w')
diff --git a/sandbox/filter-median-and-pct.py b/sandbox/filter-median-and-pct.py
index ed9e251..698aff5 100755
--- a/sandbox/filter-median-and-pct.py
+++ b/sandbox/filter-median-and-pct.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -13,6 +13,7 @@ hash table. Output sequences will be placed in 'infile.medpctfilt'.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import sys
import screed.fasta
import os
@@ -37,15 +38,15 @@ def main():
counting_ht = args.input_table
infiles = args.input_filenames
- print 'file with ht: %s' % counting_ht
+ print('file with ht: %s' % counting_ht)
- print 'loading hashtable'
+ print('loading hashtable')
ht = khmer.load_counting_hash(counting_ht)
K = ht.ksize()
xxxfp = None
- print "K:", K
+ print("K:", K)
# the filtering function.
def process_fn(record):
@@ -64,20 +65,20 @@ def main():
# the filtering loop
for infile in infiles:
- print 'filtering', infile
+ print('filtering', infile)
xxxfp = open(os.path.basename(infile) + '.medpctfilt.stats', 'w')
outfile = os.path.basename(infile) + '.medpctfilt'
outfp = open(outfile, 'w')
for n, record in enumerate(screed.open(infile)):
if n % 100000 == 0:
- print '...', n
+ print('...', n)
name, seq = process_fn(record)
if name and seq:
- print >>outfp, '>%s\n%s' % (name, seq)
+ print('>%s\n%s' % (name, seq), file=outfp)
- print 'output in', outfile
+ print('output in', outfile)
if __name__ == '__main__':
main()
diff --git a/sandbox/filter-median.py b/sandbox/filter-median.py
index c1a126f..a417c3d 100755
--- a/sandbox/filter-median.py
+++ b/sandbox/filter-median.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -13,6 +13,7 @@ hash table. Output sequences will be placed in 'infile.medfilt'.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import sys
import screed.fasta
import os
@@ -37,13 +38,13 @@ def main():
counting_ht = args.input_table
infiles = args.input_filenames
- print 'file with ht: %s' % counting_ht
+ print('file with ht: %s' % counting_ht)
- print 'loading hashtable'
+ print('loading hashtable')
ht = khmer.load_counting_hash(counting_ht)
K = ht.ksize()
- print "K:", K
+ print("K:", K)
# the filtering function.
def process_fn(record):
@@ -59,14 +60,14 @@ def main():
# the filtering loop
for infile in infiles:
- print 'filtering', infile
+ print('filtering', infile)
outfile = os.path.basename(infile) + '.medfilt'
outfp = open(outfile, 'w')
tsp = ThreadedSequenceProcessor(process_fn)
tsp.start(verbose_loader(infile), outfp)
- print 'output in', outfile
+ print('output in', outfile)
if __name__ == '__main__':
main()
diff --git a/sandbox/find-high-abund-kmers.py b/sandbox/find-high-abund-kmers.py
index 687ab41..db43686 100755
--- a/sandbox/find-high-abund-kmers.py
+++ b/sandbox/find-high-abund-kmers.py
@@ -1,18 +1,19 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
"""
@@
"""
+from __future__ import print_function
import sys
import screed
import khmer
-from khmer.khmer_args import build_counting_args, DEFAULT_MIN_TABLESIZE
+from khmer.khmer_args import build_counting_args, DEFAULT_MAX_TABLESIZE
DEFAULT_LOWER_CUTOFF = 2000
DEFAULT_UPPER_CUTOFF = 65535
@@ -33,21 +34,21 @@ def main():
args = parser.parse_args()
if not args.quiet:
- if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
- print >>sys.stderr, "** WARNING: hashsize is default! " \
+ if args.min_hashsize == DEFAULT_MAX_TABLESIZE:
+ print("** WARNING: hashsize is default! " \
"You absodefly want to increase this!\n** " \
- "Please read the docs!"
-
- print >>sys.stderr, '\nPARAMETERS:'
- print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
- print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
- print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
- args.min_hashsize
- print >>sys.stderr, ''
- print >>sys.stderr, 'Estimated memory usage is %.2g bytes " \
+ "Please read the docs!", file=sys.stderr)
+
+ print('\nPARAMETERS:', file=sys.stderr)
+ print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
+ print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
+ print(' - min hashsize = %-5.2g \t(-x)' % \
+ args.min_hashsize, file=sys.stderr)
+ print('', file=sys.stderr)
+ print('Estimated memory usage is %.2g bytes " \
"(n_hashes x min_hashsize)' % (
- args.n_hashes * args.min_hashsize)
- print >>sys.stderr, '-' * 8
+ args.n_hashes * args.min_hashsize), file=sys.stderr)
+ print('-' * 8, file=sys.stderr)
K = args.ksize
HT_SIZE = args.min_hashsize
@@ -56,23 +57,23 @@ def main():
output = args.output_filename
input = args.input_filename
- print 'lower cutoff:', args.lower_cutoff
- print 'upper cutoff:', args.upper_cutoff
- print 'Saving stoptags to %s' % output
- print 'Loading sequences in %s' % input
+ print('lower cutoff:', args.lower_cutoff)
+ print('upper cutoff:', args.upper_cutoff)
+ print('Saving stoptags to %s' % output)
+ print('Loading sequences in %s' % input)
###
- print 'making hashtable'
- ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
+ print('making hashtable')
+ ht = khmer.CountingHash(K, HT_SIZE, N_HT)
ht.set_use_bigcount(True)
- print 'consuming input', input
+ print('consuming input', input)
hb = ht.collect_high_abundance_kmers(input,
args.lower_cutoff,
args.upper_cutoff)
- print 'saving stoptags', output
+ print('saving stoptags', output)
hb.save_stop_tags(output)
if __name__ == '__main__':
diff --git a/sandbox/graph-size.py b/sandbox/graph-size.py
index 00891b5..41cdf07 100755
--- a/sandbox/graph-size.py
+++ b/sandbox/graph-size.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import khmer
import sys
import screed
@@ -30,19 +31,19 @@ def main():
if len(sys.argv) == 3:
outfile = sys.argv[2]
- print 'input file to graphsize filter: %s' % infile
- print 'filtering to output:', outfile
- print '-- settings:'
- print 'K', K
- print 'HASHTABLE SIZE %g' % HASHTABLE_SIZE
- print 'N HASHTABLES %d' % N_HT
- print 'THRESHOLD', THRESHOLD
- print 'N THREADS', WORKER_THREADS
- print '--'
+ print('input file to graphsize filter: %s' % infile)
+ print('filtering to output:', outfile)
+ print('-- settings:')
+ print('K', K)
+ print('HASHTABLE SIZE %g' % HASHTABLE_SIZE)
+ print('N HASHTABLES %d' % N_HT)
+ print('THRESHOLD', THRESHOLD)
+ print('N THREADS', WORKER_THREADS)
+ print('--')
- print 'creating ht'
- ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
- print 'eating fa', infile
+ print('creating ht')
+ ht = khmer.Hashbits(K, HASHTABLE_SIZE, N_HT)
+ print('eating fa', infile)
total_reads, n_consumed = ht.consume_fasta(infile)
outfp = open(outfile, 'w')
diff --git a/sandbox/hi-lo-abundance-by-position.py b/sandbox/hi-lo-abundance-by-position.py
index eca923f..5c9e353 100755
--- a/sandbox/hi-lo-abundance-by-position.py
+++ b/sandbox/hi-lo-abundance-by-position.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import sys
import os
import khmer
@@ -20,16 +21,16 @@ def main():
filename = sys.argv[2]
outfile = os.path.basename(filename)
- print 'loading kh file', hashfile
+ print('loading kh file', hashfile)
ht = khmer.load_counting_hash(hashfile)
x = ht.fasta_count_kmers_by_position(filename, 100, 1)
write_dist(x, open(outfile + '.pos.abund=1', 'w'))
- print 'wrote', outfile + '.pos.abund=1'
+ print('wrote', outfile + '.pos.abund=1')
y = ht.fasta_count_kmers_by_position(filename, 100, 255)
write_dist(y, open(outfile + '.pos.abund=255', 'w'))
- print 'wrote', outfile + '.pos.abund=255'
+ print('wrote', outfile + '.pos.abund=255')
if __name__ == '__main__':
diff --git a/sandbox/make-coverage.py b/sandbox/make-coverage.py
index 4828539..fb4ce39 100755
--- a/sandbox/make-coverage.py
+++ b/sandbox/make-coverage.py
@@ -1,10 +1,13 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.txt.
# Contact: khmer-project at idyll.org
#
+
+from __future__ import print_function
+
import screed
import sys
@@ -16,13 +19,13 @@ def main():
lengths = {}
for n, record in enumerate(screed.open(dbfile)):
if n % 100000 == 0:
- print '...', n
+ print('...', n)
lengths[record.name] = len(record.sequence)
sums = {}
for n, line in enumerate(open(mapfile)):
if n % 100000 == 0:
- print '... 2x', n
+ print('... 2x', n)
x = line.split('\t')
name = x[2]
readlen = len(x[4])
@@ -38,10 +41,12 @@ def main():
outfp = open(dbfile + '.cov', 'w')
for n, record in enumerate(screed.open(dbfile)):
if n % 100000 == 0:
- print '...', n
+ print('...', n)
- print >>outfp, ">%s[cov=%d]\n%s" % (
- record.name, rpkms.get(record.name, 0), record.sequence)
+ print(">%s[cov=%d]\n%s" % (record.name,
+ rpkms.get(record.name, 0),
+ record.sequence),
+ file=outfp)
if __name__ == '__main__':
main()
diff --git a/sandbox/multi-rename.py b/sandbox/multi-rename.py
index 4c03f08..f9c3a3d 100755
--- a/sandbox/multi-rename.py
+++ b/sandbox/multi-rename.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import screed
import sys
@@ -18,7 +19,7 @@ def main():
for record in screed.open(filename):
if len(record.sequence) >= CUTOFF:
n += 1
- print '>%s.%s %s\n%s' % (prefix, n, record.name, record.sequence)
+ print('>%s.%s %s\n%s' % (prefix, n, record.name, record.sequence))
if __name__ == '__main__':
diff --git a/sandbox/normalize-by-median-pct.py b/sandbox/normalize-by-median-pct.py
index b23d6d8..f7dad25 100755
--- a/sandbox/normalize-by-median-pct.py
+++ b/sandbox/normalize-by-median-pct.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -13,13 +13,15 @@ DESIRED_COVERAGE. Output sequences will be placed in 'infile.keep'.
Use '-h' for parameter help.
"""
+from __future__ import division
+from __future__ import print_function
import sys
import screed
import os
import khmer
-from itertools import izip
-from khmer.khmer_args import build_counting_args, DEFAULT_MIN_TABLESIZE
+
+from khmer.khmer_args import build_counting_args, DEFAULT_MAX_TABLESIZE
import argparse
DEFAULT_DESIRED_COVERAGE = 5
@@ -30,7 +32,7 @@ DEFAULT_DESIRED_COVERAGE = 5
def batchwise(t, size):
it = iter(t)
- return izip(*[it] * size)
+ return zip(*[it] * size)
# Returns true if the pair of records are properly pairs
@@ -56,18 +58,18 @@ def main():
args = parser.parse_args()
if not args.quiet:
- if args.min_hashsize == DEFAULT_MIN_HASHSIZE and not args.loadhash:
- print>>sys.stderr, "** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!"
-
- print>>sys.stderr, '\nPARAMETERS:'
- print>>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
- print>>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
- print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
- print>>sys.stderr, ' - paired = %s \t\t(-p)' % args.paired
- print>>sys.stderr, ''
- print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (
- args.n_hashes * args.min_hashsize)
- print>>sys.stderr, '-' * 8
+ if args.min_hashsize == DEFAULT_MAX_HASHSIZE and not args.loadhash:
+ print("** WARNING: hashsize is default! You absodefly want to increase this!\n** Please read the docs!", file=sys.stderr)
+
+ print('\nPARAMETERS:', file=sys.stderr)
+ print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
+ print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
+ print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize, file=sys.stderr)
+ print(' - paired = %s \t\t(-p)' % args.paired, file=sys.stderr)
+ print('', file=sys.stderr)
+ print('Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (
+ args.n_hashes * args.min_hashsize), file=sys.stderr)
+ print('-' * 8, file=sys.stderr)
K = args.ksize
HT_SIZE = args.min_hashsize
@@ -82,11 +84,11 @@ def main():
batch_size = 2
if args.loadhash:
- print 'loading hashtable from', args.loadhash
+ print('loading hashtable from', args.loadhash)
ht = khmer.load_counting_hash(args.loadhash)
else:
- print 'making hashtable'
- ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
+ print('making hashtable')
+ ht = khmer.CountingHash(K, HT_SIZE, N_HT)
total = 0
discarded = 0
@@ -98,13 +100,13 @@ def main():
n = -1
for n, batch in enumerate(batchwise(screed.open(input_filename), batch_size)):
if n > 0 and n % 100000 == 0:
- print '... kept', total - discarded, 'of', total, ', or', \
- int(100. - discarded / float(total) * 100.), '%'
- print '... in file', input_filename
+ print('... kept', total - discarded, 'of', total, ', or', \
+ int(100. - discarded / float(total) * 100.), '%')
+ print('... in file', input_filename)
if report_fp:
- print>>report_fp, total, total - discarded, \
- 1. - (discarded / float(total))
+ print(total, total - discarded, \
+ 1. - (discarded / float(total)), file=report_fp)
report_fp.flush()
total += batch_size
@@ -112,8 +114,8 @@ def main():
# If in paired mode, check that the reads are properly interleaved
if args.paired:
if not validpair(batch[0], batch[1]):
- print >>sys.stderr, 'Error: Improperly interleaved pairs %s %s' % (
- batch[0].name, batch[1].name)
+ print('Error: Improperly interleaved pairs %s %s' % (
+ batch[0].name, batch[1].name), file=sys.stderr)
sys.exit(-1)
# Emit the batch of reads if any read passes the filter
@@ -150,27 +152,27 @@ def main():
discarded += batch_size
if -1 < n:
- print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
- total, 'or', int(100. - discarded / float(total) * 100.), '%'
- print 'output in', output_name
+ print('DONE with', input_filename, '; kept', total - discarded, 'of',\
+ total, 'or', int(100. - discarded / float(total) * 100.), '%')
+ print('output in', output_name)
else:
- print 'SKIPPED empty file', input_filename
+ print('SKIPPED empty file', input_filename)
if args.savehash:
- print 'Saving hashfile through', input_filename
- print '...saving to', args.savehash
+ print('Saving hashfile through', input_filename)
+ print('...saving to', args.savehash)
ht.save(args.savehash)
# Change 0.2 only if you really grok it. HINT: You don't.
fp_rate = khmer.calc_expected_collisions(ht)
- print 'fp rate estimated to be %1.3f' % fp_rate
+ print('fp rate estimated to be %1.3f' % fp_rate)
if fp_rate > 0.20:
- print >>sys.stderr, "**"
- print >>sys.stderr, "** ERROR: the counting hash is too small for"
- print >>sys.stderr, "** this data set. Increase hashsize/num ht."
- print >>sys.stderr, "**"
- print >>sys.stderr, "** Do not use these results!!"
+ print("**", file=sys.stderr)
+ print("** ERROR: the counting hash is too small for", file=sys.stderr)
+ print("** this data set. Increase hashsize/num ht.", file=sys.stderr)
+ print("**", file=sys.stderr)
+ print("** Do not use these results!!", file=sys.stderr)
sys.exit(-1)
if __name__ == '__main__':
diff --git a/sandbox/optimal_args_hashbits.py b/sandbox/optimal_args_hashbits.py
new file mode 100644
index 0000000..1fba596
--- /dev/null
+++ b/sandbox/optimal_args_hashbits.py
@@ -0,0 +1,110 @@
+#! /usr/bin/env python
+#
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt.
+# Contact: khmer-project at idyll.org
+#
+# pylint: disable=invalid-name,missing-docstring
+"""
+Estimate optimal arguments using hashbits counting.
+
+% python sandbox/optimal_args_hashbits.py <data1> [ <data2> <...> ]
+
+Use '-h' for parameter help.
+"""
+from __future__ import print_function
+
+import sys
+import math
+import threading
+
+import khmer
+from khmer.khmer_args import build_hashbits_args
+from khmer.khmer_args import (report_on_config, info, add_threading_args)
+from khmer.kfile import check_input_files, check_space
+from oxli.functions import optimal_args_output_gen as output_gen
+
+
+def get_parser():
+    """Build the command-line parser for this script.
+
+    Starts from the parser returned by build_hashbits_args, adds the
+    threading options, and requires one or more input sequence filenames.
+    """
+    parser = build_hashbits_args(descr="Load sequences into the compressible "
+                                 "graph format plus optional tagset.")
+    add_threading_args(parser)
+    parser.add_argument('input_filenames', metavar='input_sequence_filename',
+                        nargs='+', help='input FAST[AQ] sequence filename')
+    return parser
+
+
+def main():
+    """Count unique k-mers in the inputs and write suggested arguments.
+
+    Every input file is consumed into a single hashbits table using
+    args.threads worker threads per file.  The unique k-mer count and the
+    estimated false positive rate are handed to output_gen, and its result
+    is written to '<first input filename>.optimal_args'.  Exits with
+    status 1 if the estimated false positive rate is above 0.15.
+    """
+    info('optimal_args_hashbits.py', ['graph', 'SeqAn'])
+    args = get_parser().parse_args()
+    report_on_config(args, hashtype='hashbits')
+
+    filenames = args.input_filenames
+    output_name = filenames[0] + '.optimal_args'
+    for filename in filenames:
+        check_input_files(filename, False)
+
+    check_space(filenames, False)
+
+    print('Counting kmers from sequences in %s' % repr(filenames),
+          file=sys.stderr)
+
+    htable = khmer.Hashbits(args.ksize, args.min_tablesize, args.n_tables)
+    target_method = htable.consume_fasta_with_reads_parser
+
+    for filename in filenames:
+        rparser = khmer.ReadParser(filename)
+        print('consuming input', filename, file=sys.stderr)
+
+        # All workers pull from the same parser; wait for every one of them
+        # before moving on to the next file.
+        threads = []
+        for _ in range(args.threads):
+            cur_thread = threading.Thread(
+                target=target_method, args=(rparser,))
+            threads.append(cur_thread)
+            cur_thread.start()
+
+        for thread in threads:
+            thread.join()
+
+    unique_kmers = htable.n_unique_kmers()
+    print('Total number of unique k-mers: {0}'.format(unique_kmers),
+          file=sys.stderr)
+
+    fp_rate = khmer.calc_expected_collisions(htable)
+    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
+
+    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
+        print("**", file=sys.stderr)
+        print("** ERROR: the graph structure is too small for this data set. "
+              "Increase table size/# tables.", file=sys.stderr)
+        print("**", file=sys.stderr)
+        sys.exit(1)
+
+    # Only create the output file once there is a result to put in it, and
+    # close it deterministically.
+    with open(output_name, 'w') as info_optimal:
+        print(output_gen(unique_kmers, fp_rate), file=info_optimal)
+
+    print('optimal arguments were written to', output_name, file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
+
+# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
diff --git a/sandbox/print-stoptags.py b/sandbox/print-stoptags.py
index 6fc327c..1e59b44 100755
--- a/sandbox/print-stoptags.py
+++ b/sandbox/print-stoptags.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -13,7 +13,7 @@ K = 32
def main():
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer.Hashbits(32, 1, 1)
ht.load_stop_tags(sys.argv[1])
ht.print_stop_tags(os.path.basename(sys.argv[1]) + '.txt')
diff --git a/sandbox/print-tagset.py b/sandbox/print-tagset.py
index dbe9c77..c861278 100755
--- a/sandbox/print-tagset.py
+++ b/sandbox/print-tagset.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import khmer
import sys
import os
@@ -13,9 +14,9 @@ K = 32
def main():
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer.Hashbits(32, 1, 1)
ht.load_tagset(sys.argv[1])
- print 'loaded!'
+ print('loaded!')
ht.print_tagset(os.path.basename(sys.argv[1]) + '.txt')
diff --git a/sandbox/renumber-partitions.py b/sandbox/renumber-partitions.py
index 7a3df7b..92d3134 100755
--- a/sandbox/renumber-partitions.py
+++ b/sandbox/renumber-partitions.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import sys
import screed
import gzip
@@ -21,7 +22,7 @@ def main():
old_to_new = {}
for n, record in enumerate(screed.open(filename)):
if n > 0 and n % 10000 == 0:
- print '...', os.path.basename(filename), n
+ print('...', os.path.basename(filename), n)
partition = record.name.split()[-1]
name = record.name.split()[0]
@@ -33,7 +34,7 @@ def main():
outfp.write('>%s\t%d\n%s\n' % (name, new_part, record.sequence))
outfp.close()
- print 'renumbered %d partitions in %s' % (len(old_to_new), filename)
+ print('renumbered %d partitions in %s' % (len(old_to_new), filename))
if __name__ == '__main__':
diff --git a/sandbox/saturate-by-median.py b/sandbox/saturate-by-median.py
index 8d5eba4..40aee14 100755
--- a/sandbox/saturate-by-median.py
+++ b/sandbox/saturate-by-median.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -11,15 +11,17 @@ Count saturation curve for reads with a coverage of 1, but collect
reads whether or not they have high coverage. This is better for
assessing saturation of (esp) low-coverage data sets.
"""
+from __future__ import division
+from __future__ import print_function
import sys
import screed
import os
import khmer
import textwrap
-from itertools import izip
+
from khmer.khmer_args import (build_counting_args, add_loadhash_args,
- report_on_config, info)
+ report_on_config, info, create_countgraph)
import argparse
from khmer.kfile import (check_space, check_space_for_hashtable,
check_valid_file_exists)
@@ -32,7 +34,7 @@ DEFAULT_DESIRED_COVERAGE = 1
def batchwise(coll, size):
iter_coll = iter(coll)
- return izip(*[iter_coll] * size)
+ return zip(*[iter_coll] * size)
# Returns true if the pair of records are properly pairs
@@ -61,14 +63,14 @@ def normalize_by_median(input_filename, htable, args, report_fp=None,
for index, batch in enumerate(batchwise(screed.open(
input_filename), batch_size)):
if index > 0 and index % report_frequency == 0:
- print '... kept {kept} of {total} or {perc:2}%'.format(
+ print('... kept {kept} of {total} or {perc:2}%'.format(
kept=total - discarded, total=total,
- perc=int(100. - discarded / float(total) * 100.))
- print '... in file', input_filename
+ perc=int(100. - discarded / float(total) * 100.)))
+ print('... in file', input_filename)
if report_fp:
- print >> report_fp, total, total - discarded, \
- 1. - (discarded / float(total))
+ print(total, total - discarded, \
+ 1. - (discarded / float(total)), file=report_fp)
report_fp.flush()
total += batch_size
@@ -103,8 +105,8 @@ def normalize_by_median(input_filename, htable, args, report_fp=None,
def handle_error(error, input_name):
- print >> sys.stderr, '** ERROR:', error
- print >> sys.stderr, '** Failed on {name}: '.format(name=input_name)
+ print('** ERROR:', error, file=sys.stderr)
+ print('** Failed on {name}: '.format(name=input_name), file=sys.stderr)
def get_parser():
epilog = ("""
@@ -185,19 +187,18 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
check_valid_file_exists(args.input_filenames)
check_space(args.input_filenames, False)
if args.savetable:
- check_space_for_hashtable(args.n_tables * args.min_tablesize, False)
+ check_space_for_hashtable(args, 'countgraph', False)
# list to save error files along with throwing exceptions
if args.force:
corrupt_files = []
if args.loadtable:
- print 'loading k-mer counting table from', args.loadtable
+ print('loading k-mer counting table from', args.loadtable)
htable = khmer.load_counting_hash(args.loadtable)
else:
- print 'making k-mer counting table'
- htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
- args.n_tables)
+ print('making countgraph')
+ htable = create_countgraph(args)
total = 0
discarded = 0
@@ -214,36 +215,37 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
except IOError as err:
handle_error(err, input_filename)
if not args.force:
- print >> sys.stderr, '** Exiting!'
+ print('** Exiting!', file=sys.stderr)
sys.exit(1)
else:
- print >> sys.stderr, '*** Skipping error file, moving on...'
+ print('*** Skipping error file, moving on...', file=sys.stderr)
corrupt_files.append(input_filename)
else:
if total_acc == 0 and discarded_acc == 0:
- print 'SKIPPED empty file', input_filename
+ print('SKIPPED empty file', input_filename)
else:
total += total_acc
discarded += discarded_acc
- print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
+ print('DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
.format(inp=input_filename,
kept=total - discarded, total=total,
- perc=int(100. - discarded / float(total) * 100.))
+ perc=int(100. - discarded / float(total) * 100.)))
if args.savetable:
- print 'Saving k-mer counting table through', input_filename
- print '...saving to', args.savetable
+ print('Saving k-mer counting table through', input_filename)
+ print('...saving to', args.savetable)
htable.save(args.savetable)
# re: threshold, see Zhang et al.,
# http://arxiv.org/abs/1309.2975
fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
- print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)
+ print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate))
if args.force and len(corrupt_files) > 0:
- print >> sys.stderr, "** WARNING: Finished with errors!"
- print >> sys.stderr, "** IOErrors occurred in the following files:"
- print >> sys.stderr, "\t", " ".join(corrupt_files)
+ print("** WARNING: Finished with errors!", file=sys.stderr)
+ print("** IOErrors occurred in the following files:", file=sys.stderr)
+ print("\t", " ".join(corrupt_files), file=sys.stderr)
+
if __name__ == '__main__':
main()
diff --git a/sandbox/shuffle-reverse-rotary.py b/sandbox/shuffle-reverse-rotary.py
index 9a289bf..70d4d92 100755
--- a/sandbox/shuffle-reverse-rotary.py
+++ b/sandbox/shuffle-reverse-rotary.py
@@ -1,7 +1,8 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
+from __future__ import print_function
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -30,12 +31,12 @@ def main():
for record in screed.open(filename):
total += 1
if total % 10000 == 0:
- print '...', total
+ print('...', total)
loc = total % ROTARY_SIZE
fp_d[loc].write('>%s\n%s\n' % (record.name, record.sequence))
- print 'reverse-rotary shuffled %d sequences into %d files (%s.NNN)' % \
- (total, ROTARY_SIZE, prefix)
+ print('reverse-rotary shuffled %d sequences into %d files (%s.NNN)' % \
+ (total, ROTARY_SIZE, prefix))
if __name__ == '__main__':
diff --git a/sandbox/slice-reads-by-coverage.py b/sandbox/slice-reads-by-coverage.py
index 6b4896b..2093aa6 100755
--- a/sandbox/slice-reads-by-coverage.py
+++ b/sandbox/slice-reads-by-coverage.py
@@ -1,9 +1,10 @@
-#! /usr/bin/env python2
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2014. It is licensed under
+#! /usr/bin/env python
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2014-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
+from __future__ import print_function
import argparse
import screed
import sys
@@ -26,16 +27,16 @@ def main():
parser.add_argument('output_readfile')
args = parser.parse_args()
- print >>sys.stderr, 'min_coverage: %s' % args.min_coverage
- print >>sys.stderr, 'max_coverage: %s' % args.max_coverage
+ print('min_coverage: %s' % args.min_coverage, file=sys.stderr)
+ print('max_coverage: %s' % args.max_coverage, file=sys.stderr)
if not (args.min_coverage or args.max_coverage):
- print >>sys.stderr, "neither min nor max coverage specified!? exiting!"
+ print("neither min nor max coverage specified!? exiting!", file=sys.stderr)
sys.exit(1)
if args.min_coverage and args.max_coverage and \
args.max_coverage < args.min_coverage:
- print >>sys.stderr, "min_coverage > max_coverage!? exiting!"
+ print("min_coverage > max_coverage!? exiting!", file=sys.stderr)
sys.exit(1)
htable = khmer.load_counting_hash(args.input_counting_table)
@@ -46,7 +47,7 @@ def main():
n = 0
for n, record in enumerate(screed.open(args.input_readfile)):
if n % 100000 == 0:
- print >>sys.stderr, '...', n, n_kept
+ print('...', n, n_kept, file=sys.stderr)
seq = record.sequence.upper()
seq = seq.replace('N', 'A')
@@ -68,7 +69,7 @@ def main():
output_fp.write(output_single(record))
- print >>sys.stderr, 'consumed %d reads; kept %d' % (n, n_kept)
+ print('consumed %d reads; kept %d' % (n, n_kept), file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/sandbox/split-fasta.py b/sandbox/split-fasta.py
index 0339693..bf395da 100755
--- a/sandbox/split-fasta.py
+++ b/sandbox/split-fasta.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import sys
import screed
@@ -17,12 +18,12 @@ def main():
division = -1
for n, record in enumerate(screed.open(filename)):
if n % 100000 == 0:
- print '...', n
+ print('...', n)
if n % size == 0:
division += 1
new_name = '%s.%04d.fa' % (prefix, division)
- print 'opening', new_name
+ print('opening', new_name)
fp = open(new_name, 'w')
fp.write('>%s\n%s\n' % (record['name'], record['sequence']))
diff --git a/sandbox/split-sequences-by-length.py b/sandbox/split-sequences-by-length.py
index e83c0fe..8e5aeb3 100755
--- a/sandbox/split-sequences-by-length.py
+++ b/sandbox/split-sequences-by-length.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -13,6 +13,7 @@ hash table. Output sequences will be placed in 'infile.abundfilt'.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import sys
import screed.fasta
import os
@@ -48,12 +49,12 @@ def main():
n = 0
for filename in filenames:
- print 'opening'
+ print('opening')
for record in screed.open(filename):
out.save(record.name, record.sequence)
n += 1
if n % 10000 == 0:
- print '...', n
+ print('...', n)
if __name__ == '__main__':
main()
diff --git a/sandbox/stoptag-abundance-hist.py b/sandbox/stoptag-abundance-hist.py
index b1c3d8a..616a9f4 100755
--- a/sandbox/stoptag-abundance-hist.py
+++ b/sandbox/stoptag-abundance-hist.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import sys
import khmer
import os
@@ -33,7 +34,7 @@ def main():
d[count] = d.get(count, 0) + 1
if count > 1000:
- print >>outabund, sequence, count
+ print(sequence, count, file=outabund)
outfp = open(figure + '.countshist', 'w')
sofar = 0
@@ -41,7 +42,7 @@ def main():
for k in sorted(d.keys()):
sofar += d[k]
sofar_cumu += k * d[k]
- print >>outfp, k, d[k], sofar, sofar_cumu
+ print(k, d[k], sofar, sofar_cumu, file=outfp)
hist(counts, normed=True, cumulative=True, bins=100, range=(1, 1000))
savefig(figure)
diff --git a/sandbox/stoptags-by-position.py b/sandbox/stoptags-by-position.py
index 653b441..1b92fa6 100755
--- a/sandbox/stoptags-by-position.py
+++ b/sandbox/stoptags-by-position.py
@@ -1,7 +1,8 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
+from __future__ import print_function
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -13,7 +14,7 @@ K = 32
def main():
- ht = khmer.new_hashbits(K, 1, 1)
+ ht = khmer.Hashbits(K, 1, 1)
x = [0] * 255
y = [0] * 255
@@ -38,7 +39,7 @@ def main():
for i, (n, m) in enumerate(zip(x, y)):
if m:
- print '%d,%d,%d' % (i, n, m)
+ print('%d,%d,%d' % (i, n, m))
if __name__ == '__main__':
diff --git a/sandbox/strip-partition.py b/sandbox/strip-partition.py
index 5dbd82d..ab972ab 100755
--- a/sandbox/strip-partition.py
+++ b/sandbox/strip-partition.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import screed
import sys
@@ -16,7 +17,7 @@ def main():
name = name.split()[0]
- print '>%s\n%s' % (name, sequence,)
+ print('>%s\n%s' % (name, sequence,))
if __name__ == '__main__':
diff --git a/sandbox/subset-report.py b/sandbox/subset-report.py
index dd580d8..fe230e6 100755
--- a/sandbox/subset-report.py
+++ b/sandbox/subset-report.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import khmer
import sys
import gc
@@ -16,21 +17,21 @@ K = 32
def main():
subset_filenames = sys.argv[1:]
- ht = khmer.new_hashbits(K, 1, 1)
+ ht = khmer.Hashbits(K, 1, 1)
for filename in subset_filenames:
- print '--'
- print 'partition map:', filename
+ print('--')
+ print('partition map:', filename)
subset = ht.load_subset_partitionmap(filename)
n_part, n_orphan = ht.subset_count_partitions(subset)
- print 'num partitions:', n_part
- print 'num orphans:', n_orphan
+ print('num partitions:', n_part)
+ print('num orphans:', n_orphan)
(dist, n_unassigned) = ht.subset_partition_size_distribution(subset)
for (size, count) in dist:
- print size, count
- print '%d unassigned tags' % n_unassigned
+ print(size, count)
+ print('%d unassigned tags' % n_unassigned)
- print '--'
+ print('--')
if __name__ == '__main__':
diff --git a/sandbox/sweep-files.py b/sandbox/sweep-files.py
index 74f0675..6b44ea6 100755
--- a/sandbox/sweep-files.py
+++ b/sandbox/sweep-files.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE. Contact: ctb at msu.edu
#
@@ -12,6 +12,7 @@ Find all reads connected to the given contigs on a per-partition basis.
% sweep-files.py -r <range> --db <fasta/q files> \
--query <fasta/q files separate>
"""
+from __future__ import print_function
EPILOG = """
Output will be a collection of fasta/q files, each corresponding to a database
@@ -107,7 +108,7 @@ def main():
if args.ksize < MIN_KSIZE:
args.ksize = MIN_KSIZE
- report_on_config(args, hashtype='hashbits')
+ report_on_config(args, hashtype='nodegraph')
K = args.ksize
HT_SIZE = args.min_tablesize
@@ -121,7 +122,7 @@ def main():
# de Bruin graph; open a file and output queue for each file as well.
ht = khmer.LabelHash(K, HT_SIZE, N_HT)
try:
- print >>sys.stderr, 'consuming and labeling input sequences...'
+ print('consuming and labeling input sequences...', file=sys.stderr)
for i, dbfile in enumerate(args.db):
@@ -132,18 +133,17 @@ def main():
for n, record in enumerate(screed.open(dbfile)):
if n % 50000 == 0:
- print >>sys.stderr, \
- '...consumed {n} sequences...'.format(n=n)
+ print('...consumed {n} sequences...'.format(n=n), file=sys.stderr)
ht.consume_sequence_and_tag_with_labels(record.sequence, i)
except IOError as e:
- print >>sys.stderr, '!! ERROR: !!', e
- print >>sys.stderr, '...error setting up outputs. exiting...'
+ print('!! ERROR: !!', e, file=sys.stderr)
+ print('...error setting up outputs. exiting...', file=sys.stderr)
- print >>sys.stderr, 'done consuming input sequence. \
+ print('done consuming input sequence. \
added {t} tags and {l} labels...' \
- .format(t=ht.n_tags(), l=ht.n_labels())
+ .format(t=ht.n_tags(), l=ht.n_labels()), file=sys.stderr)
n_orphaned = 0
n_labeled = 0
@@ -152,21 +152,20 @@ def main():
# Iterate through all the reads and check for the labels with which they
# intersect. Queue to the corresponding label when found.
for read_file in args.query:
- print >>sys.stderr, '** sweeping {read_file} for labels...'.format(
- read_file=read_file)
+ print('** sweeping {read_file} for labels...'.format(
+ read_file=read_file), file=sys.stderr)
try:
read_fp = screed.open(read_file)
except IOError as error:
- print >>sys.stderr, '!! ERROR: !!', error
- print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(
- fn=read_file)
+ print('!! ERROR: !!', error, file=sys.stderr)
+ print('*** Could not open {fn}, skipping...'.format(
+ fn=read_file), file=sys.stderr)
else:
for n, record in enumerate(read_fp):
if n % 50000 == 0 and n > 0:
- print >>sys.stderr, \
- '\tswept {n} reads [{nc} labeled, {no} orphaned]' \
+ print('\tswept {n} reads [{nc} labeled, {no} orphaned]' \
.format(n=n, nc=n_labeled,
- no=n_orphaned)
+ no=n_orphaned), file=sys.stderr)
seq = record.sequence
try:
labels = ht.sweep_label_neighborhood(seq, traversal_range)
@@ -184,19 +183,19 @@ def main():
else:
n_orphaned += 1
- print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file)
+ print('** End of file {fn}...'.format(fn=read_file), file=sys.stderr)
read_fp.close()
# gotta output anything left in the buffers at the end!
- print >>sys.stderr, '** End of run...'
- for q in outputs.values():
+ print('** End of run...', file=sys.stderr)
+ for q in list(outputs.values()):
q.clear()
- print >>sys.stderr, 'swept {n_reads}...'.format(
- n_reads=n_labeled + n_orphaned)
- print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
- nc=n_labeled, no=n_orphaned)
- print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)
+ print('swept {n_reads}...'.format(
+ n_reads=n_labeled + n_orphaned), file=sys.stderr)
+ print('...with {nc} labeled and {no} orphaned'.format(
+ nc=n_labeled, no=n_orphaned), file=sys.stderr)
+ print('...and {nmc} multilabeled'.format(nmc=n_mlabeled), file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/sandbox/sweep-out-reads-with-contigs.py b/sandbox/sweep-out-reads-with-contigs.py
index 29b3e3b..f053f62 100755
--- a/sandbox/sweep-out-reads-with-contigs.py
+++ b/sandbox/sweep-out-reads-with-contigs.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import sys
import khmer
import os.path
@@ -21,18 +22,18 @@ def main():
outfile = sys.argv[3]
# create a hashbits data structure
- ht = khmer.new_hashbits(K, 1, 1)
+ ht = khmer.Hashbits(K, 1, 1)
# tag every k-mer in the contigs
ht._set_tag_density(0)
# load contigs, connect into N partitions
- print 'loading contigs from', contigfile
+ print('loading contigs from', contigfile)
ht.consume_fasta_and_tag(contigfile)
subset = ht.do_subset_partition(0, 0)
ht.merge_subset(subset)
- print 'outputting contig-partitioned reads to', outfile
+ print('outputting contig-partitioned reads to', outfile)
ht.output_partitions(readsfile, outfile, True)
diff --git a/sandbox/sweep-reads.py b/sandbox/sweep-reads.py
index a344938..fbf2ccb 100755
--- a/sandbox/sweep-reads.py
+++ b/sandbox/sweep-reads.py
@@ -1,11 +1,14 @@
-#! /usr/bin/env python2
+from __future__ import print_function, unicode_literals
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE. Contact: ctb at msu.edu
#
# pylint: disable=invalid-name,missing-docstring,no-member
+from io import open
+
from khmer import utils
"""
@@ -47,7 +50,7 @@ DEFAULT_BUFFER_SIZE = 10
DEFAULT_OUT_PREF = 'reads'
DEFAULT_RANGE = -1
-MIN_HSIZE = 4e7
+MAX_HSIZE = 4e7
MIN_KSIZE = 21
@@ -103,12 +106,12 @@ class ReadBufferManager(object):
self.num_write_errors = 0
self.num_file_errors = 0
- print >>sys.stderr, '''Init new ReadBuffer [
+ print('''Init new ReadBuffer [
Max Buffers: {num_bufs}
Max Reads: {max_reads}
Buffer flush: {buf_flush}
]'''.format(num_bufs=self.max_buffers, max_reads=self.max_reads,
- buf_flush=self.buffer_flush)
+ buf_flush=self.buffer_flush), file=sys.stderr)
def flush_buffer(self, buf_id):
fn = '{prefix}_{buffer_id}.{ext}'.format(prefix=self.output_pref,
@@ -119,9 +122,9 @@ class ReadBufferManager(object):
try:
outfp = open(fpath, 'a')
except IOError as _:
- print >>sys.stderr, '!! ERROR: {_} !!'.format(_=_)
- print >>sys.stderr, '*** Failed to open {fn} for \
- buffer flush'.format(fn=fpath)
+ print('!! ERROR: {_} !!'.format(_=_), file=sys.stderr)
+ print('*** Failed to open {fn} for \
+ buffer flush'.format(fn=fpath), file=sys.stderr)
self.num_file_errors += 1
else:
outfp.write(buf.flush())
@@ -142,16 +145,16 @@ class ReadBufferManager(object):
self.cur_reads += 1
if self.cur_reads > self.max_reads:
- print >>sys.stderr, '** Reached max num reads...'
+ print('** Reached max num reads...', file=sys.stderr)
self.flush_all()
if len(self.buffers) > self.max_buffers:
# self.clean_buffers(2)
- print >>sys.stderr, '** Reached max num buffers...'
+ print('** Reached max num buffers...', file=sys.stderr)
self.flush_all()
def flush_all(self):
- print >>sys.stderr, '*** Flushing all to files...'
- buf_ids = self.buffers.keys()
+ print('*** Flushing all to files...', file=sys.stderr)
+ buf_ids = list(self.buffers.keys())
for buf_id in buf_ids:
self.flush_buffer(buf_id)
assert self.cur_reads == 0
@@ -205,15 +208,15 @@ def main():
parser = get_parser()
args = parser.parse_args()
- if args.min_tablesize < MIN_HSIZE:
- args.min_tablesize = MIN_HSIZE
+ if args.max_tablesize < MAX_HSIZE:
+ args.max_tablesize = MAX_HSIZE
if args.ksize < MIN_KSIZE:
args.ksize = MIN_KSIZE
- report_on_config(args, hashtype='hashbits')
+ report_on_config(args, hashtype='nodegraph')
K = args.ksize
- HT_SIZE = args.min_tablesize
+ HT_SIZE = args.max_tablesize
N_HT = args.n_tables
traversal_range = args.traversal_range
@@ -239,7 +242,7 @@ def main():
# figure out input file type (FA/FQ) -- based on first file
ix = iter(screed.open(args.input_files[0]))
- record = ix.next()
+ record = next(ix)
del ix
extension = 'fa'
@@ -252,21 +255,19 @@ def main():
# consume the partitioned fasta with which to label the graph
ht = khmer.LabelHash(K, HT_SIZE, N_HT)
try:
- print >>sys.stderr, 'consuming input sequences...'
+ print('consuming input sequences...', file=sys.stderr)
if args.label_by_pid:
- print >>sys.stderr, '...labeling by partition id (pid)'
+ print('...labeling by partition id (pid)', file=sys.stderr)
ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
elif args.label_by_seq:
- print >>sys.stderr, '...labeling by sequence'
+ print('...labeling by sequence', file=sys.stderr)
for n, record in enumerate(screed.open(input_fastp)):
if n % 50000 == 0:
- print >>sys.stderr, \
- '...consumed {n} sequences...'.format(n=n)
+ print('...consumed {n} sequences...'.format(n=n), file=sys.stderr)
ht.consume_sequence_and_tag_with_labels(record.sequence, n)
else:
- print >>sys.stderr, \
- '...labeling to create groups of size {s}'.format(
- s=args.group_size)
+ print('...labeling to create groups of size {s}'.format(
+ s=args.group_size), file=sys.stderr)
label = -1
g = 0
try:
@@ -283,25 +284,25 @@ def main():
pref=output_pref, g=g,
ext=extension), 'wb')
if n % 50000 == 0:
- print >>sys.stderr, \
- '...consumed {n} sequences...'.format(n=n)
+ print('...consumed {n} sequences...'.format(n=n), file=sys.stderr)
ht.consume_sequence_and_tag_with_labels(record.sequence,
label)
write_record(record, outfp)
-
+
except IOError as e:
- print >>sys.stderr, '!! ERROR !!', e
- print >>sys.stderr, '...error splitting input. exiting...'
+ print('!! ERROR !!', e, file=sys.stderr)
+ print('...error splitting input. exiting...', file=sys.stderr)
except IOError as e:
- print >>sys.stderr, '!! ERROR: !!', e
- print >>sys.stderr, '...error consuming \
- {i}. exiting...'.format(i=input_fastp)
+ print('!! ERROR: !!', e, file=sys.stderr)
+ print('...error consuming \
+ {i}. exiting...'.format(i=input_fastp), file=sys.stderr)
- print >>sys.stderr, 'done consuming input sequence. \
+ print('done consuming input sequence. \
added {t} tags and {l} \
- labels...'.format(t=ht.n_tags(), l=ht.n_labels())
+ labels...'.format(t=ht.graph.n_tags(),
+ l=ht.n_labels()))
label_dict = defaultdict(int)
label_number_dist = []
@@ -313,27 +314,27 @@ def main():
total_t = time.clock()
start_t = time.clock()
for read_file in args.input_files:
- print >>sys.stderr, '** sweeping {read_file} for labels...'.format(
- read_file=read_file)
+ print('** sweeping {read_file} for labels...'.format(
+ read_file=read_file), file=sys.stderr)
file_t = 0.0
try:
read_fp = screed.open(read_file)
except IOError as error:
- print >>sys.stderr, '!! ERROR: !!', error
- print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(
- fn=read_file)
+ print('!! ERROR: !!', error, file=sys.stderr)
+ print('*** Could not open {fn}, skipping...'.format(
+ fn=read_file), file=sys.stderr)
else:
for _, record in enumerate(read_fp):
if _ % 50000 == 0:
end_t = time.clock()
batch_t = end_t - start_t
file_t += batch_t
- print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \
+ print('\tswept {n} reads [{nc} labeled, \
{no} orphaned] \
** {sec}s ({sect}s total)' \
.format(n=_, nc=n_labeled,
no=n_orphaned,
- sec=batch_t, sect=file_t)
+ sec=batch_t, sect=file_t), file=sys.stderr)
start_t = time.clock()
seq = record.sequence
name = record.name
@@ -360,37 +361,37 @@ def main():
n_orphaned += 1
output_buffer.queue(seq_str, 'orphaned')
label_dict['orphaned'] += 1
- print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file)
+ print('** End of file {fn}...'.format(fn=read_file), file=sys.stderr)
output_buffer.flush_all()
read_fp.close()
# gotta output anything left in the buffers at the end!
- print >>sys.stderr, '** End of run...'
+ print('** End of run...', file=sys.stderr)
output_buffer.flush_all()
total_t = time.clock() - total_t
if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
- print >>sys.stderr, '! WARNING: Sweep finished with errors !'
- print >>sys.stderr, '** {writee} reads not written'.format(
- writee=output_buffer.num_write_errors)
- print >>sys.stderr, '** {filee} errors opening files'.format(
- filee=output_buffer.num_file_errors)
-
- print >>sys.stderr, 'swept {n_reads} for labels...'.format(
- n_reads=n_labeled + n_orphaned)
- print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
- nc=n_labeled, no=n_orphaned)
- print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)
-
- print >>sys.stderr, '** outputting label number distribution...'
+ print('! WARNING: Sweep finished with errors !', file=sys.stderr)
+ print('** {writee} reads not written'.format(
+ writee=output_buffer.num_write_errors), file=sys.stderr)
+ print('** {filee} errors opening files'.format(
+ filee=output_buffer.num_file_errors), file=sys.stderr)
+
+ print('swept {n_reads} for labels...'.format(
+ n_reads=n_labeled + n_orphaned), file=sys.stderr)
+ print('...with {nc} labeled and {no} orphaned'.format(
+ nc=n_labeled, no=n_orphaned), file=sys.stderr)
+ print('...and {nmc} multilabeled'.format(nmc=n_mlabeled), file=sys.stderr)
+
+ print('** outputting label number distribution...', file=sys.stderr)
fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
- with open(fn, 'wb') as outfp:
+ with open(fn, 'w', encoding='utf-8') as outfp:
for nc in label_number_dist:
outfp.write('{nc}\n'.format(nc=nc))
fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
- print >>sys.stderr, '** outputting label read counts...'
- with open(fn, 'wb') as outfp:
+ print('** outputting label read counts...', file=sys.stderr)
+ with open(fn, 'w', encoding='utf-8') as outfp:
for k in label_dict:
outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
diff --git a/sandbox/sweep-reads2.py b/sandbox/sweep-reads2.py
index 4ccea30..38d6e6a 100755
--- a/sandbox/sweep-reads2.py
+++ b/sandbox/sweep-reads2.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -14,12 +14,13 @@ Results end up in <search reads>.sweep2.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import sys
import khmer
import os.path
import screed
-from khmer.khmer_args import (build_hashbits_args, DEFAULT_MIN_TABLESIZE)
+from khmer.khmer_args import (build_hashbits_args, DEFAULT_MAX_TABLESIZE)
def main():
@@ -30,21 +31,21 @@ def main():
args = parser.parse_args()
if not args.quiet:
- if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
- print >>sys.stderr, "** WARNING: hashsize is default! " \
+ if args.min_hashsize == DEFAULT_MAX_TABLESIZE:
+ print("** WARNING: hashsize is default! " \
"You absodefly want to increase this!\n** " \
- "Please read the docs!"
-
- print >>sys.stderr, '\nPARAMETERS:'
- print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
- print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
- print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
- args.min_hashsize
- print >>sys.stderr, ''
- print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
+ "Please read the docs!", file=sys.stderr)
+
+ print('\nPARAMETERS:', file=sys.stderr)
+ print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
+ print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
+ print(' - min hashsize = %-5.2g \t(-x)' % \
+ args.min_hashsize, file=sys.stderr)
+ print('', file=sys.stderr)
+ print('Estimated memory usage is %.2g bytes ' \
'(n_hashes x min_hashsize / 8)' % (
- args.n_hashes * args.min_hashsize / 8.)
- print >>sys.stderr, '-' * 8
+ args.n_hashes * args.min_hashsize / 8.), file=sys.stderr)
+ print('-' * 8, file=sys.stderr)
K = args.ksize
HT_SIZE = args.min_hashsize
@@ -57,13 +58,13 @@ def main():
outfp = open(outfile, 'w')
# create a hashbits data structure
- ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht = khmer.Hashbits(K, HT_SIZE, N_HT)
# load contigs, connect into N partitions
- print 'loading input reads from', inp
+ print('loading input reads from', inp)
ht.consume_fasta(inp)
- print 'starting sweep.'
+ print('starting sweep.')
n = 0
m = 0
@@ -72,7 +73,7 @@ def main():
continue
if n % 10000 == 0:
- print '...', n, m
+ print('...', n, m)
count = ht.get_median_count(record.sequence)[0]
if count:
diff --git a/sandbox/sweep-reads3.py b/sandbox/sweep-reads3.py
index 9e66e1f..c0c5329 100755
--- a/sandbox/sweep-reads3.py
+++ b/sandbox/sweep-reads3.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -14,12 +14,13 @@ Results end up in <search reads>.sweep3.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import sys
import os.path
import screed
import khmer
-from khmer.khmer_args import (build_hashbits_args, DEFAULT_MIN_TABLESIZE)
+from khmer.khmer_args import (build_hashbits_args, DEFAULT_MAX_TABLESIZE)
def output_single(r):
@@ -37,21 +38,21 @@ def main():
args = parser.parse_args()
if not args.quiet:
- if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
- print >>sys.stderr, "** WARNING: hashsize is default! " \
+ if args.min_hashsize == DEFAULT_MAX_TABLESIZE:
+ print("** WARNING: hashsize is default! " \
"You absodefly want to increase this!\n** " \
- "Please read the docs!"
-
- print >>sys.stderr, '\nPARAMETERS:'
- print >>sys.stderr, ' - kmer size = %d \t\t(-k)' % args.ksize
- print >>sys.stderr, ' - n hashes = %d \t\t(-N)' % args.n_hashes
- print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
- args.min_hashsize
- print >>sys.stderr, ''
- print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
+ "Please read the docs!", file=sys.stderr)
+
+ print('\nPARAMETERS:', file=sys.stderr)
+ print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
+ print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
+ print(' - min hashsize = %-5.2g \t(-x)' % \
+ args.min_hashsize, file=sys.stderr)
+ print('', file=sys.stderr)
+ print('Estimated memory usage is %.2g bytes ' \
'(n_hashes x min_hashsize / 8)' % (
- args.n_hashes * args.min_hashsize * len(args.input_filenames) / 8.)
- print >>sys.stderr, '-' * 8
+ args.n_hashes * args.min_hashsize * len(args.input_filenames) / 8.), file=sys.stderr)
+ print('-' * 8, file=sys.stderr)
K = args.ksize
HT_SIZE = args.min_hashsize
@@ -63,7 +64,7 @@ def main():
query_list = []
for n, inp_name in enumerate(inputlist):
# create a hashbits data structure
- ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht = khmer.Hashbits(K, HT_SIZE, N_HT)
outfile = os.path.basename(inp_name) + '.sweep3'
outfp = open(outfile, 'w')
@@ -73,10 +74,10 @@ def main():
ht = query_list[n][0]
# load contigs, connect into N partitions
- print 'loading input reads from', inp_name
+ print('loading input reads from', inp_name)
ht.consume_fasta(inp_name)
- print 'starting sweep.'
+ print('starting sweep.')
n = 0
m = 0
@@ -85,7 +86,7 @@ def main():
continue
if n % 10000 == 0:
- print '...', n, m
+ print('...', n, m)
for ht, outfp in query_list:
count = ht.get_median_count(record.sequence)[0]
diff --git a/sandbox/unique-kmers.py b/sandbox/unique-kmers.py
index 6aa5b6a..aa78d88 100755
--- a/sandbox/unique-kmers.py
+++ b/sandbox/unique-kmers.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -13,6 +13,7 @@ Estimate number of unique k-mers, with precision <= ERROR_RATE.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import argparse
@@ -22,8 +23,9 @@ import textwrap
import khmer
from khmer.khmer_args import DEFAULT_K, info, ComboFormatter
+from oxli.functions import optimal_args_output_gen as output_gen
from khmer import __version__
-
+import screed
def get_parser():
descr = "Estimate number of unique k-mers, with precision <= ERROR_RATE."
@@ -66,6 +68,9 @@ def get_parser():
parser.add_argument('-R', '--report',
metavar='filename', type=argparse.FileType('w'))
+ parser.add_argument('--stream-out', '-S', default=False,
+ action='store_true')
+
parser.add_argument('input_filenames', metavar='input_sequence_filename',
help='Input FAST[AQ] sequence filename.', nargs='+')
@@ -77,19 +82,39 @@ def main():
info('unique-kmers.py', ['SeqAn', 'hll'])
args = get_parser().parse_args()
- hllcpp = khmer.HLLCounter(args.error_rate, args.ksize)
+ total_hll = khmer.HLLCounter(args.error_rate, args.ksize)
report_fp = args.report
input_filename = None
for index, input_filename in enumerate(args.input_filenames):
- hllcpp.consume_fasta(input_filename)
-
- cardinality = hllcpp.estimate_cardinality()
- print >> sys.stdout, 'Estimated number of unique k-mers: {0}'.format(
- cardinality)
+ hllcpp = khmer.HLLCounter(args.error_rate, args.ksize)
+ for record in screed.open(input_filename):
+ seq = record.sequence.upper().replace('N', 'A')
+ hllcpp.consume_string(seq)
+ if args.stream_out:
+ write_record(record, sys.stdout)
+
+ cardinality = hllcpp.estimate_cardinality()
+ print('Estimated number of unique {0}-mers in {1}: {2}'.format(
+ args.ksize, input_filename, cardinality),
+ file=sys.stderr)
+
+ if report_fp:
+ print(cardinality, args.ksize, '(total)', file=report_fp)
+ report_fp.flush()
+ total_hll.merge(hllcpp)
+
+ cardinality = total_hll.estimate_cardinality()
+ print('Total estimated number of unique {0}-mers: {1}'.format(
+ args.ksize, cardinality),
+ file=sys.stderr)
+
+ to_print = output_gen(cardinality, args.error_rate)
+ print(to_print)
if report_fp:
- print >> report_fp, cardinality
+ print(cardinality, args.ksize, 'total', file=report_fp)
+ print(to_print, file=report_fp)
report_fp.flush()
if __name__ == "__main__":
diff --git a/sandbox/write-trimmomatic.py b/sandbox/write-trimmomatic.py
index 49c9638..5ab61e0 100755
--- a/sandbox/write-trimmomatic.py
+++ b/sandbox/write-trimmomatic.py
@@ -1,10 +1,11 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
import glob
filelist = glob.glob('*R1*.fastq.gz')
@@ -13,7 +14,7 @@ for r1 in filelist:
r2 = r1.replace('R1', 'R2')
final_pe = r1[:-9] + '.pe.fq.gz'
final_se = r1[:-9] + '.se.fq.gz'
- print """\
+ print("""\
mkdir trim
cd trim
java -jar /usr/local/bin/trimmomatic-0.30.jar PE ../%s ../%s s1_pe s1_se s2_pe s2_se ILLUMINACLIP:/usr/local/share/adapters/TruSeq3-PE.fa:2:30:10
@@ -24,4 +25,4 @@ cd ..
rm -r ./trim/
chmod u-w %s %s
-""" % (r1, r2, final_pe, final_se, final_pe, final_se)
+""" % (r1, r2, final_pe, final_se, final_pe, final_se))
diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py
index f719a5a..10d2109 100755
--- a/scripts/abundance-dist-single.py
+++ b/scripts/abundance-dist-single.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2010-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -15,12 +15,14 @@ The script does not load a prebuilt k-mer counting table.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import os
import sys
import csv
import khmer
import threading
import textwrap
+from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, add_threading_args,
report_on_config, info)
from khmer.kfile import (check_input_files, check_space,
@@ -78,13 +80,12 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
check_input_files(args.input_sequence_filename, args.force)
check_space([args.input_sequence_filename], args.force)
if args.savetable:
- check_space_for_hashtable(args.n_tables * args.min_tablesize,
- args.force)
+ check_space_for_hashtable(args, 'countgraph', args.force)
if (not args.squash_output and
os.path.exists(args.output_histogram_filename)):
- print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
- args.output_histogram_filename
+ print('ERROR: %s exists; not squashing.' %
+ args.output_histogram_filename, file=sys.stderr)
sys.exit(1)
else:
hist_fp = open(args.output_histogram_filename, 'w')
@@ -94,26 +95,24 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
hist_fp_csv.writerow(['abundance', 'count', 'cumulative',
'cumulative_fraction'])
- print >>sys.stderr, 'making k-mer counting table'
- counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
- args.n_tables)
+ print('making countgraph', file=sys.stderr)
+ counting_hash = khmer_args.create_countgraph(args, multiplier=1.1)
counting_hash.set_use_bigcount(args.bigcount)
- print >> sys.stderr, 'building k-mer tracking table'
- tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
- args.n_tables)
+ print('building k-mer tracking table', file=sys.stderr)
+ tracking = khmer_args.create_nodegraph(args, multiplier=1.1)
- print >>sys.stderr, 'kmer_size:', counting_hash.ksize()
- print >>sys.stderr, 'k-mer counting table sizes:', \
- counting_hash.hashsizes()
- print >>sys.stderr, 'outputting to', args.output_histogram_filename
+ print('kmer_size:', counting_hash.ksize(), file=sys.stderr)
+ print('k-mer counting table sizes:',
+ counting_hash.hashsizes(), file=sys.stderr)
+ print('outputting to', args.output_histogram_filename, file=sys.stderr)
# start loading
rparser = khmer.ReadParser(args.input_sequence_filename)
threads = []
- print >>sys.stderr, 'consuming input, round 1 --', \
- args.input_sequence_filename
- for _ in xrange(args.threads):
+ print('consuming input, round 1 --',
+ args.input_sequence_filename, file=sys.stderr)
+ for _ in range(args.threads):
thread = \
threading.Thread(
target=counting_hash.consume_fasta_with_reads_parser,
@@ -126,8 +125,8 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
thread.join()
if args.report_total_kmers:
- print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
- counting_hash.n_unique_kmers())
+ print('Total number of unique k-mers: {0}'.format(
+ counting_hash.n_unique_kmers()), file=sys.stderr)
abundance_lists = []
@@ -136,13 +135,13 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
read_parser, tracking)
abundance_lists.append(abundances)
- print >>sys.stderr, 'preparing hist from %s...' % \
- args.input_sequence_filename
+ print('preparing hist from %s...' %
+ args.input_sequence_filename, file=sys.stderr)
rparser = khmer.ReadParser(args.input_sequence_filename)
threads = []
- print >>sys.stderr, 'consuming input, round 2 --', \
- args.input_sequence_filename
- for _ in xrange(args.threads):
+ print('consuming input, round 2 --',
+ args.input_sequence_filename, file=sys.stderr)
+ for _ in range(args.threads):
thread = \
threading.Thread(
target=__do_abundance_dist__,
@@ -163,10 +162,10 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
total = sum(abundance.values())
if 0 == total:
- print >> sys.stderr, \
- "ERROR: abundance distribution is uniformly zero; " \
- "nothing to report."
- print >> sys.stderr, "\tPlease verify that the input files are valid."
+ print("ERROR: abundance distribution is uniformly zero; "
+ "nothing to report.", file=sys.stderr)
+ print(
+ "\tPlease verify that the input files are valid.", file=sys.stderr)
sys.exit(1)
sofar = 0
@@ -180,17 +179,17 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
if args.csv:
hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])
else:
- print >> hist_fp, _, i, sofar, round(frac, 3)
+ print(_, i, sofar, round(frac, 3), file=hist_fp)
if sofar == total:
break
if args.savetable:
- print >>sys.stderr, 'Saving k-mer counting table ', args.savetable
- print >>sys.stderr, '...saving to', args.savetable
+ print('Saving k-mer counting table ', args.savetable, file=sys.stderr)
+ print('...saving to', args.savetable, file=sys.stderr)
counting_hash.save(args.savetable)
- print >> sys.stderr, 'wrote to: ' + args.output_histogram_filename
+ print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/abundance-dist.py b/scripts/abundance-dist.py
index 966bfe8..7661ec6 100755
--- a/scripts/abundance-dist.py
+++ b/scripts/abundance-dist.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2010-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -44,6 +44,9 @@ def get_parser():
parser.add_argument('-s', '--squash', dest='squash_output', default=False,
action='store_true',
help='Overwrite existing output_histogram_filename')
+ parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
+ action='store_false',
+ help='Do not count k-mers past 255')
parser.add_argument('--csv', default=False, action='store_true',
help='Use the CSV format for the histogram. '
'Includes column headers.')
@@ -63,19 +66,27 @@ def main():
for infile in infiles:
check_input_files(infile, args.force)
- print ('hashtable from', args.input_counting_table_filename,
- file=sys.stderr)
+ print('hashtable from', args.input_counting_table_filename,
+ file=sys.stderr)
counting_hash = khmer.load_counting_hash(
args.input_counting_table_filename)
+ if not counting_hash.get_use_bigcount() and args.bigcount:
+ print("WARNING: The loaded graph has bigcount DISABLED while bigcount"
+ " reporting is ENABLED--counts higher than 255 will not be "
+ "reported.",
+ file=sys.stderr)
+
+ counting_hash.set_use_bigcount(args.bigcount)
+
kmer_size = counting_hash.ksize()
hashsizes = counting_hash.hashsizes()
tracking = khmer._Hashbits( # pylint: disable=protected-access
kmer_size, hashsizes)
- print ('K:', kmer_size, file=sys.stderr)
- print ('HT sizes:', hashsizes, file=sys.stderr)
- print ('outputting to', args.output_histogram_filename, file=sys.stderr)
+ print('K:', kmer_size, file=sys.stderr)
+ print('HT sizes:', hashsizes, file=sys.stderr)
+ print('outputting to', args.output_histogram_filename, file=sys.stderr)
if os.path.exists(args.output_histogram_filename):
if not args.squash_output:
diff --git a/scripts/annotate-partitions.py b/scripts/annotate-partitions.py
index aed932e..e09f958 100755
--- a/scripts/annotate-partitions.py
+++ b/scripts/annotate-partitions.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -15,6 +15,7 @@ Partition-annotated sequences will be in <fileN>.part.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import os
import argparse
@@ -66,7 +67,7 @@ def main():
ksize = args.ksize
filenames = args.input_filenames
- htable = khmer.new_hashbits(ksize, 1, 1)
+ htable = khmer.Hashbits(ksize, 1, 1)
partitionmap_file = args.graphbase + '.pmap.merged'
@@ -76,16 +77,16 @@ def main():
check_space(filenames, args.force)
- print >>sys.stderr, 'loading partition map from:', partitionmap_file
+ print('loading partition map from:', partitionmap_file, file=sys.stderr)
htable.load_partitionmap(partitionmap_file)
for infile in filenames:
- print >>sys.stderr, 'outputting partitions for', infile
+ print('outputting partitions for', infile, file=sys.stderr)
outfile = os.path.basename(infile) + '.part'
part_count = htable.output_partitions(infile, outfile)
- print >>sys.stderr, 'output %d partitions for %s' % (
- part_count, infile)
- print >>sys.stderr, 'partitions are in', outfile
+ print('output %d partitions for %s' % (
+ part_count, infile), file=sys.stderr)
+ print('partitions are in', outfile, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/count-median.py b/scripts/count-median.py
index 58b36a6..19e6473 100755
--- a/scripts/count-median.py
+++ b/scripts/count-median.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -19,8 +19,9 @@ Use '-h' for parameter help.
The output file contains sequence id, median, average, stddev, and seq length.
-NOTE: All 'N's in the input sequences are converted to 'G's.
+NOTE: All 'N's in the input sequences are converted to 'A's.
"""
+from __future__ import print_function
import screed
import argparse
import sys
@@ -49,7 +50,7 @@ def get_parser():
count-median.py counts.ct tests/test-data/test-reads.fq.gz medians.txt
- NOTE: All 'N's in the input sequences are converted to 'G's.
+ NOTE: All 'N's in the input sequences are converted to 'A's.
"""
parser = argparse.ArgumentParser(
description='Count k-mers summary stats for sequences',
@@ -85,11 +86,11 @@ def main():
check_space(infiles, args.force)
- print >>sys.stderr, 'loading k-mer counting table from', htfile
+ print('loading k-mer counting table from', htfile, file=sys.stderr)
htable = khmer.load_counting_hash(htfile)
ksize = htable.ksize()
- print >>sys.stderr, 'writing to', output_filename
+ print('writing to', output_filename, file=sys.stderr)
output = open(output_filename, 'w')
if args.csv:
@@ -105,14 +106,15 @@ def main():
parse_description=parse_description):
seq = record.sequence.upper()
if 'N' in seq:
- seq = seq.replace('N', 'G')
+ seq = seq.replace('N', 'A')
if ksize <= len(seq):
medn, ave, stdev = htable.get_median_count(seq)
+ ave, stdev = [round(x, 9) for x in (ave, stdev)]
if args.csv:
output.writerow([record.name, medn, ave, stdev, len(seq)])
else:
- print >> output, record.name, medn, ave, stdev, len(seq)
+ print(record.name, medn, ave, stdev, len(seq), file=output)
if __name__ == '__main__':
main()
diff --git a/scripts/count-overlap.py b/scripts/count-overlap.py
index a5d7f95..a8c715d 100755
--- a/scripts/count-overlap.py
+++ b/scripts/count-overlap.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2012-2015. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2012-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -18,17 +18,15 @@ usage: count-overlap_cpp.py [-h] [-q] [--ksize KSIZE] [--n_tables N_HASHES]
Use '-h' for parameter help.
"""
+from __future__ import print_function
import sys
import csv
import khmer
import textwrap
+from khmer import khmer_args
from khmer.kfile import check_input_files, check_space
from khmer.khmer_args import (build_hashbits_args, report_on_config, info)
-DEFAULT_K = 32
-DEFAULT_N_HT = 4
-DEFAULT_HASHSIZE = 1e6
-
def get_parser():
epilog = """
@@ -57,14 +55,14 @@ def get_parser():
def main():
info('count-overlap.py', ['counting'])
args = get_parser().parse_args()
- report_on_config(args, hashtype='hashbits')
+ report_on_config(args, hashtype='nodegraph')
for infile in [args.ptfile, args.fafile]:
check_input_files(infile, args.force)
check_space([args.ptfile, args.fafile], args.force)
- print >>sys.stderr, 'loading k-mer presence table from', args.ptfile
+ print('loading k-mer presence table from', args.ptfile, file=sys.stderr)
ht1 = khmer.load_hashbits(args.ptfile)
kmer_size = ht1.ksize()
@@ -75,7 +73,7 @@ def main():
# write headers:
f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])
- ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)
+ ht2 = khmer_args.create_nodegraph(args, ksize=kmer_size)
(n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)
@@ -93,9 +91,9 @@ dataset2: %s
if args.csv:
f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
else:
- print >> f_curve_obj, list_curve[100 + i], list_curve[i]
+ print(list_curve[100 + i], list_curve[i], file=f_curve_obj)
- print >> sys.stderr, 'wrote to: ' + args.report_filename
+ print('wrote to: ' + args.report_filename, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/do-partition.py b/scripts/do-partition.py
index 0fcf64f..f502145 100755
--- a/scripts/do-partition.py
+++ b/scripts/do-partition.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -13,15 +13,16 @@ Do all the partition steps in one script.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import khmer
import sys
import threading
-import Queue
import gc
import os.path
import os
import textwrap
+from khmer import khmer_args
from khmer.khmer_args import (build_hashbits_args, report_on_config, info,
add_threading_args)
import glob
@@ -29,6 +30,12 @@ from khmer.kfile import check_input_files, check_space
import re
import platform
+# stdlib queue module was renamed on Python 3
+try:
+ import queue
+except ImportError:
+ import Queue as queue
+
DEFAULT_SUBSET_SIZE = int(1e5)
DEFAULT_N_THREADS = 4
DEFAULT_K = 32
@@ -36,9 +43,9 @@ DEFAULT_K = 32
# Debugging Support
if "Linux" == platform.system():
def __debug_vm_usage(msg):
- print >>sys.stderr, "===> DEBUG: " + msg
+ print("===> DEBUG: " + msg, file=sys.stderr)
for vmstat in re.findall(r".*Vm.*", file("/proc/self/status").read()):
- print vmstat
+ print(vmstat)
else:
def __debug_vm_usage(msg): # pylint: disable=unused-argument
pass
@@ -48,23 +55,23 @@ def worker(queue, basename, stop_big_traversals):
while True:
try:
(htable, index, start, stop) = queue.get(False)
- except Queue.Empty:
- print >>sys.stderr, 'exiting'
+ except queue.Empty:
+ print('exiting', file=sys.stderr)
return
outfile = basename + '.subset.%d.pmap' % (index,)
if os.path.exists(outfile):
- print >>sys.stderr, 'SKIPPING', outfile, ' -- already exists'
+ print('SKIPPING', outfile, ' -- already exists', file=sys.stderr)
continue
- print >>sys.stderr, 'starting:', basename, index
+ print('starting:', basename, index, file=sys.stderr)
# pay attention to stoptags when partitioning; take command line
# direction on whether or not to exhaustively traverse.
subset = htable.do_subset_partition(start, stop, True,
stop_big_traversals)
- print >>sys.stderr, 'saving:', basename, index
+ print('saving:', basename, index, file=sys.stderr)
htable.save_subset_partitionmap(subset, outfile)
del subset
gc.collect()
@@ -107,45 +114,46 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
info('do-partition.py', ['graph'])
args = get_parser().parse_args()
- report_on_config(args, hashtype='hashbits')
+ report_on_config(args, hashtype='nodegraph')
for infile in args.input_filenames:
check_input_files(infile, args.force)
check_space(args.input_filenames, args.force)
- print >>sys.stderr, 'Saving k-mer presence table to %s' % args.graphbase
- print >>sys.stderr, 'Loading kmers from sequences in %s' % \
- repr(args.input_filenames)
- print >>sys.stderr, '--'
- print >>sys.stderr, 'SUBSET SIZE', args.subset_size
- print >>sys.stderr, 'N THREADS', args.threads
- print >>sys.stderr, '--'
+ print('Saving k-mer presence table to %s' %
+ args.graphbase, file=sys.stderr)
+ print('Loading kmers from sequences in %s' %
+ repr(args.input_filenames), file=sys.stderr)
+ print('--', file=sys.stderr)
+ print('SUBSET SIZE', args.subset_size, file=sys.stderr)
+ print('N THREADS', args.threads, file=sys.stderr)
+ print('--', file=sys.stderr)
# load-graph
- print >>sys.stderr, 'making k-mer presence table'
- htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)
+ print('making nodegraph', file=sys.stderr)
+ htable = khmer_args.create_nodegraph(args)
for _, filename in enumerate(args.input_filenames):
- print >>sys.stderr, 'consuming input', filename
+ print('consuming input', filename, file=sys.stderr)
htable.consume_fasta_and_tag(filename)
# 0.18 is ACTUAL MAX. Do not change.
fp_rate = \
khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
- print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
+ print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
# partition-graph
# do we want to exhaustively traverse the graph?
stop_big_traversals = args.no_big_traverse
if stop_big_traversals:
- print >>sys.stderr, '** This script brakes for lumps: ', \
- 'stop_big_traversals is true.'
+ print('** This script brakes for lumps: ',
+ 'stop_big_traversals is true.', file=sys.stderr)
else:
- print >>sys.stderr, '** Traverse all the things:', \
- ' stop_big_traversals is false.'
+ print('** Traverse all the things:',
+ ' stop_big_traversals is false.', file=sys.stderr)
#
# now, partition!
@@ -157,7 +165,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
divvy.append(0)
# build a queue of tasks:
- worker_q = Queue.Queue()
+ worker_q = queue.Queue()
# break up the subsets into a list of worker tasks
for _ in range(0, n_subsets):
@@ -165,7 +173,7 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
end = divvy[_ + 1]
worker_q.put((htable, _, start, end))
- print >>sys.stderr, 'enqueued %d subset tasks' % n_subsets
+ print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
open('%s.info' % args.graphbase, 'w').write('%d subsets total\n'
% (n_subsets))
@@ -173,8 +181,8 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
args.threads = n_subsets
# start threads!
- print >>sys.stderr, 'starting %d threads' % args.threads
- print >>sys.stderr, '---'
+ print('starting %d threads' % args.threads, file=sys.stderr)
+ print('---', file=sys.stderr)
threads = []
for _ in range(args.threads):
@@ -186,43 +194,43 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
assert threading.active_count() == args.threads + 1
- print >>sys.stderr, 'done starting threads'
+ print('done starting threads', file=sys.stderr)
# wait for threads
for _ in threads:
_.join()
- print >>sys.stderr, '---'
- print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \
- (args.graphbase,)
+ print('---', file=sys.stderr)
+ print('done making subsets! see %s.subset.*.pmap' %
+ (args.graphbase,), file=sys.stderr)
# merge-partitions
pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
- print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
- (len(pmap_files), pmap_files[0])
+ print('loading %d pmap files (first one: %s)' %
+ (len(pmap_files), pmap_files[0]), file=sys.stderr)
- htable = khmer.new_hashbits(args.ksize, 1, 1)
+ htable = khmer.Hashbits(args.ksize, 1, 1)
for pmap_file in pmap_files:
- print >>sys.stderr, 'merging', pmap_file
+ print('merging', pmap_file, file=sys.stderr)
htable.merge_subset_from_disk(pmap_file)
if args.remove_subsets:
- print >>sys.stderr, 'removing pmap files'
+ print('removing pmap files', file=sys.stderr)
for pmap_file in pmap_files:
os.unlink(pmap_file)
# annotate-partitions
for infile in args.input_filenames:
- print >>sys.stderr, 'outputting partitions for', infile
+ print('outputting partitions for', infile, file=sys.stderr)
outfile = os.path.basename(infile) + '.part'
part_count = htable.output_partitions(infile, outfile)
- print >>sys.stderr, 'output %d partitions for %s' % (
- part_count, infile)
- print >>sys.stderr, 'partitions are in', outfile
+ print('output %d partitions for %s' % (
+ part_count, infile), file=sys.stderr)
+ print('partitions are in', outfile, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/extract-long-sequences.py b/scripts/extract-long-sequences.py
index dcaf481..7155070 100755
--- a/scripts/extract-long-sequences.py
+++ b/scripts/extract-long-sequences.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -18,6 +18,7 @@ length.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import argparse
import screed
import sys
@@ -47,7 +48,7 @@ def main():
for record in screed.open(filename, parse_description=False):
if len(record['sequence']) >= args.length:
write_record(record, outfp)
- print >> sys.stderr, 'wrote to: ' + args.output
+ print('wrote to: ' + args.output, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/extract-paired-reads.py b/scripts/extract-paired-reads.py
index ef224cb..ea8bebf 100755
--- a/scripts/extract-paired-reads.py
+++ b/scripts/extract-paired-reads.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This script is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This script is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -16,6 +16,7 @@ extract them into separate files (.pe and .se).
Reads FASTQ and FASTA input, retains format for output.
"""
+from __future__ import print_function
import screed
import sys
import os.path
@@ -30,15 +31,27 @@ from khmer.utils import broken_paired_reader, write_record, write_record_pair
def get_parser():
epilog = """
- The output is two files, <input file>.pe and <input file>.se, placed in the
- current directory. The .pe file contains interleaved and properly paired
- sequences, while the .se file contains orphan sequences.
-
- Many assemblers (e.g. Velvet) require that you give them either perfectly
- interleaved files, or files containing only single reads. This script takes
- files that were originally interleaved but where reads may have been
- orphaned via error filtering, application of abundance filtering, digital
- normalization in non-paired mode, or partitioning.
+ Many read-handling programs (assemblers, mappers, etc.) require
+ that you give them either perfectly interleaved files, or files
+ containing only single reads. This script takes files that were
+ originally interleaved but where reads may have been orphaned (via
+ error filtering, application of abundance filtering, digital
+ normalization in non-paired mode, or partitioning) and separates
+ the interleaved reads from the orphaned reads.
+
+ The default output is two files, <input file>.pe and <input
+ file>.se, placed in the current directory. The .pe file contains
+ interleaved and properly paired sequences, while the .se file
+ contains orphan sequences.
+
+ The directory into which the interleaved and orphaned reads are
+ output may be specified using :option:`-o`/:option:`--output-dir`.
+ This directory will be created if it does not already exist.
+
+ Alternatively, you can specify the filenames directly with
+ :option:`-p`/:option:`--output-paired` and
+ :option:`-s`/:option:`--output-single`, which will override the
+ :option:`-o`/:option:`--output-dir` option.
Example::
@@ -47,9 +60,21 @@ def get_parser():
parser = argparse.ArgumentParser(
description='Take a mixture of reads and split into pairs and '
'orphans.', epilog=textwrap.dedent(epilog))
- parser.add_argument('infile')
+ parser.add_argument('infile', nargs='?', default='/dev/stdin')
parser.add_argument('--version', action='version', version='%(prog)s ' +
khmer.__version__)
+
+ parser.add_argument('-o', '--output-dir', default='', help='Output '
+ 'split reads to specified directory. Creates '
+ 'directory if necessary')
+
+ parser.add_argument('-p', '--output-paired', metavar='output_paired',
+ default=None, help='Output paired reads to this '
+ 'file', type=argparse.FileType('w'))
+ parser.add_argument('-s', '--output-single', metavar='output_single',
+ default=None, help='Output orphaned reads to this '
+ 'file', type=argparse.FileType('w'))
+
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
return parser
@@ -59,28 +84,50 @@ def main():
info('extract-paired-reads.py')
args = get_parser().parse_args()
- check_input_files(args.infile, args.force)
- infiles = [args.infile]
- check_space(infiles, args.force)
-
- outfile = os.path.basename(args.infile)
- if len(sys.argv) > 2:
- outfile = sys.argv[2]
-
- single_fp = open(outfile + '.se', 'w')
- paired_fp = open(outfile + '.pe', 'w')
-
- print >>sys.stderr, 'reading file "%s"' % args.infile
- print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile
- print >>sys.stderr, 'outputting orphans to "%s.se"' % outfile
+ infile = args.infile
+ check_input_files(infile, args.force)
+ check_space([infile], args.force)
+
+ # decide where to put output files - specific directory? or just default?
+ if infile == '/dev/stdin' or infile == '-':
+ if not (args.output_paired and args.output_single):
+ print("Accepting input from stdin; output filenames must be "
+ "provided.", file=sys.stderr)
+ sys.exit(1)
+ elif args.output_dir:
+ if not os.path.exists(args.output_dir):
+ os.makedirs(args.output_dir)
+ out1 = args.output_dir + '/' + os.path.basename(infile) + '.se'
+ out2 = args.output_dir + '/' + os.path.basename(infile) + '.pe'
+ else:
+ out1 = os.path.basename(infile) + '.se'
+ out2 = os.path.basename(infile) + '.pe'
+
+ # OVERRIDE default output file locations with -p, -s
+ if args.output_paired:
+ paired_fp = args.output_paired
+ out2 = paired_fp.name
+ else:
+ # Don't override, just open the default filename from above
+ paired_fp = open(out2, 'w')
+ if args.output_single:
+ single_fp = args.output_single
+ out1 = single_fp.name
+ else:
+ # Don't override, just open the default filename from above
+ single_fp = open(out1, 'w')
+
+ print('reading file "%s"' % infile, file=sys.stderr)
+ print('outputting interleaved pairs to "%s"' % out2, file=sys.stderr)
+ print('outputting orphans to "%s"' % out1, file=sys.stderr)
n_pe = 0
n_se = 0
- screed_iter = screed.open(args.infile, parse_description=False)
+ screed_iter = screed.open(infile, parse_description=False)
for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
if index % 100000 == 0 and index > 0:
- print >>sys.stderr, '...', index
+ print('...', index, file=sys.stderr)
if is_pair:
write_record_pair(read1, read2, paired_fp)
@@ -95,12 +142,12 @@ def main():
if n_pe == 0:
raise Exception("no paired reads!? check file formats...")
- print >>sys.stderr, 'DONE; read %d sequences,' \
- ' %d pairs and %d singletons' % \
- (n_pe * 2 + n_se, n_pe, n_se)
+ print('DONE; read %d sequences,'
+ ' %d pairs and %d singletons' %
+ (n_pe * 2 + n_se, n_pe, n_se), file=sys.stderr)
- print >> sys.stderr, 'wrote to: ' + outfile \
- + '.se' + ' and ' + outfile + '.pe'
+ print('wrote to: %s and %s' % (out2, out1),
+ file=sys.stderr)
if __name__ == '__main__':
diff --git a/scripts/extract-partitions.py b/scripts/extract-partitions.py
index 6cbd37d..777db38 100755
--- a/scripts/extract-partitions.py
+++ b/scripts/extract-partitions.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -18,6 +18,7 @@ Use '-h' for parameter help.
@CTB note that if threshold is != 1, those sequences will not be output
by output_unassigned...
"""
+from __future__ import print_function
import sys
import screed
@@ -94,24 +95,24 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
check_space(args.part_filenames, args.force)
- print >>sys.stderr, '---'
- print >>sys.stderr, 'reading partitioned files:', repr(args.part_filenames)
+ print('---', file=sys.stderr)
+ print('reading partitioned files:', repr(
+ args.part_filenames), file=sys.stderr)
if args.output_groups:
- print >>sys.stderr, 'outputting to files named "%s.groupN.fa"' % \
- args.prefix
- print >>sys.stderr, 'min reads to keep a partition:', \
- args.min_part_size
- print >>sys.stderr, 'max size of a group file:', args.max_size
+ print('outputting to files named "%s.groupN.fa"' %
+ args.prefix, file=sys.stderr)
+ print('min reads to keep a partition:',
+ args.min_part_size, file=sys.stderr)
+ print('max size of a group file:', args.max_size, file=sys.stderr)
else:
- print >>sys.stderr, 'NOT outputting groups! Beware!'
+ print('NOT outputting groups! Beware!', file=sys.stderr)
if args.output_unassigned:
- print >>sys.stderr, \
- 'outputting unassigned reads to "%s.unassigned.fa"' % \
- args.prefix
- print >>sys.stderr, 'partition size distribution will go to %s' \
- % distfilename
- print >>sys.stderr, '---'
+ print('outputting unassigned reads to "%s.unassigned.fa"' %
+ args.prefix, file=sys.stderr)
+ print('partition size distribution will go to %s'
+ % distfilename, file=sys.stderr)
+ print('---', file=sys.stderr)
#
@@ -142,7 +143,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
for filename in args.part_filenames:
for index, read, pid in read_partition_file(filename):
if index % 100000 == 0:
- print >>sys.stderr, '...', index
+ print('...', index, file=sys.stderr)
count[pid] = count.get(pid, 0) + 1
@@ -159,7 +160,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
# develop histogram of partition sizes
dist = {}
- for pid, size in count.items():
+ for pid, size in list(count.items()):
dist[size] = dist.get(size, 0) + 1
# output histogram
@@ -177,7 +178,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
sys.exit(0)
# sort groups by size
- divvy = sorted(count.items(), key=lambda y: y[1])
+ divvy = sorted(list(count.items()), key=lambda y: y[1])
divvy = [y for y in divvy if y[1] > args.min_part_size]
# divvy up into different groups, based on having max_size sequences
@@ -205,9 +206,9 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
# print 'group_d', partition_id, group_n
group_n += 1
- print >>sys.stderr, '%d groups' % group_n
+ print('%d groups' % group_n, file=sys.stderr)
if group_n == 0:
- print >>sys.stderr, 'nothing to output; exiting!'
+ print('nothing to output; exiting!', file=sys.stderr)
return
# open a bunch of output files for the different groups
@@ -225,7 +226,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
for index, read, partition_id in read_partition_file(filename):
total_seqs += 1
if index % 100000 == 0:
- print >>sys.stderr, '...x2', index
+ print('...x2', index, file=sys.stderr)
if partition_id == 0:
continue
@@ -242,20 +243,19 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
write_record(read, outfp)
part_seqs += 1
- print >>sys.stderr, '---'
- print >>sys.stderr, 'Of %d total seqs,' % total_seqs
- print >>sys.stderr, 'extracted %d partitioned seqs into group files,' % \
- part_seqs
- print >>sys.stderr, \
- 'discarded %d sequences from small partitions (see -m),' % \
- toosmall_parts
- print >>sys.stderr, 'and found %d unpartitioned sequences (see -U).' % \
- n_unassigned
- print >>sys.stderr, ''
- print >>sys.stderr, 'Created %d group files named %s.groupXXXX.%s' % \
- (len(group_fps),
- args.prefix,
- suffix)
+ print('---', file=sys.stderr)
+ print('Of %d total seqs,' % total_seqs, file=sys.stderr)
+ print('extracted %d partitioned seqs into group files,' %
+ part_seqs, file=sys.stderr)
+ print('discarded %d sequences from small partitions (see -m),' %
+ toosmall_parts, file=sys.stderr)
+ print('and found %d unpartitioned sequences (see -U).' %
+ n_unassigned, file=sys.stderr)
+ print('', file=sys.stderr)
+ print('Created %d group files named %s.groupXXXX.%s' %
+ (len(group_fps),
+ args.prefix,
+ suffix), file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/fastq-to-fasta.py b/scripts/fastq-to-fasta.py
index 76393ea..ef21cda 100755
--- a/scripts/fastq-to-fasta.py
+++ b/scripts/fastq-to-fasta.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2014. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -14,6 +14,7 @@ Convert FASTQ files to FASTA format.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import sys
import argparse
import screed
@@ -39,13 +40,13 @@ def get_parser():
def main():
args = get_parser().parse_args()
- print >> sys.stderr, ('fastq from ', args.input_sequence)
+ print(('fastq from ', args.input_sequence), file=sys.stderr)
n_count = 0
for n, record in enumerate(screed.open(args.input_sequence,
parse_description=False)):
if n % 10000 == 0:
- print>>sys.stderr, '...', n
+ print('...', n, file=sys.stderr)
sequence = record['sequence']
name = record['name']
@@ -58,15 +59,15 @@ def main():
args.output.write('>' + name + '\n')
args.output.write(sequence + '\n')
- print >> sys.stderr, '\n' + 'lines from ' + args.input_sequence
+ print('\n' + 'lines from ' + args.input_sequence, file=sys.stderr)
if not args.n_keep:
- print >> sys.stderr, str(n_count) + ' lines dropped.'
+ print(str(n_count) + ' lines dropped.', file=sys.stderr)
else:
- print >> sys.stderr, 'No lines dropped from file.'
+ print('No lines dropped from file.', file=sys.stderr)
- print >> sys.stderr, 'Wrote output to', args.output
+ print('Wrote output to', args.output, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py
index 5300b5b..b22a494 100755
--- a/scripts/filter-abund-single.py
+++ b/scripts/filter-abund-single.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -17,12 +17,14 @@ placed in 'infile.abundfilt'.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import os
import sys
import khmer
import threading
import textwrap
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
+from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, report_on_config,
add_threading_args, info)
from khmer.kfile import (check_input_files, check_space,
@@ -69,19 +71,17 @@ def main():
check_input_files(args.datafile, args.force)
check_space([args.datafile], args.force)
if args.savetable:
- check_space_for_hashtable(
- args.n_tables * args.min_tablesize, args.force)
+ check_space_for_hashtable(args, 'countgraph', args.force)
report_on_config(args)
- print >>sys.stderr, 'making k-mer counting table'
- htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
- args.n_tables)
+ print('making countgraph', file=sys.stderr)
+ htable = khmer_args.create_countgraph(args)
# first, load reads into hash table
rparser = khmer.ReadParser(args.datafile)
threads = []
- print >>sys.stderr, 'consuming input, round 1 --', args.datafile
- for _ in xrange(args.threads):
+ print('consuming input, round 1 --', args.datafile, file=sys.stderr)
+ for _ in range(args.threads):
cur_thread = \
threading.Thread(
target=htable.consume_fasta_with_reads_parser,
@@ -94,44 +94,45 @@ def main():
_.join()
if args.report_total_kmers:
- print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
- htable.n_unique_kmers())
+ print('Total number of unique k-mers: {0}'.format(
+ htable.n_unique_kmers()), file=sys.stderr)
fp_rate = khmer.calc_expected_collisions(htable, args.force)
- print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
+ print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
# now, trim.
# the filtering function.
def process_fn(record):
- name = record['name']
- seq = record['sequence']
- if 'N' in seq:
- return None, None
+ name = record.name
+ seq = record.sequence
+ seqN = seq.replace('N', 'A')
- trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)
+ _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)
if trim_at >= args.ksize:
- return name, trim_seq
+ # be sure not to change the 'N's in the trimmed sequence -
+ # so, return 'seq' and not 'seqN'.
+ return name, seq[:trim_at]
return None, None
# the filtering loop
- print >>sys.stderr, 'filtering', args.datafile
+ print('filtering', args.datafile, file=sys.stderr)
outfile = os.path.basename(args.datafile) + '.abundfilt'
outfp = open(outfile, 'w')
tsp = ThreadedSequenceProcessor(process_fn)
tsp.start(verbose_loader(args.datafile), outfp)
- print >>sys.stderr, 'output in', outfile
+ print('output in', outfile, file=sys.stderr)
if args.savetable:
- print >>sys.stderr, 'Saving k-mer counting table filename', \
- args.savetable
- print >>sys.stderr, '...saving to', args.savetable
+ print('Saving k-mer counting table filename',
+ args.savetable, file=sys.stderr)
+ print('...saving to', args.savetable, file=sys.stderr)
htable.save(args.savetable)
- print >>sys.stderr, 'wrote to: ', outfile
+ print('wrote to: ', outfile, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py
index 4701c42..e994c60 100755
--- a/scripts/filter-abund.py
+++ b/scripts/filter-abund.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -16,6 +16,7 @@ hash table. Output sequences will be placed in 'infile.abundfilt'.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import os
import khmer
import textwrap
@@ -80,39 +81,41 @@ def main():
check_input_files(args.input_table, args.force)
infiles = args.input_filename
- for _ in infiles:
- check_input_files(_, args.force)
+ for filename in infiles:
+ check_input_files(filename, args.force)
check_space(infiles, args.force)
- print >>sys.stderr, 'loading hashtable'
+ print('loading counting table:', args.input_table,
+ file=sys.stderr)
htable = khmer.load_counting_hash(args.input_table)
ksize = htable.ksize()
- print >>sys.stderr, "K:", ksize
+ print("K:", ksize, file=sys.stderr)
# the filtering function.
def process_fn(record):
- name = record['name']
- seq = record['sequence']
- if 'N' in seq:
- return None, None
+ name = record.name
+ seq = record.sequence
+ seqN = seq.replace('N', 'A')
if args.variable_coverage: # only trim when sequence has high enough C
- med, _, _ = htable.get_median_count(seq)
+ med, _, _ = htable.get_median_count(seqN)
if med < args.normalize_to:
return name, seq
- trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)
+ _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)
if trim_at >= ksize:
- return name, trim_seq
+ # be sure not to change the 'N's in the trimmed sequence -
+ # so, return 'seq' and not 'seqN'.
+ return name, seq[:trim_at]
return None, None
# the filtering loop
for infile in infiles:
- print >>sys.stderr, 'filtering', infile
+ print('filtering', infile, file=sys.stderr)
if args.single_output_filename != '':
outfile = args.single_output_filename
outfp = open(outfile, 'a')
@@ -123,7 +126,7 @@ def main():
tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
tsp.start(verbose_loader(infile), outfp)
- print >>sys.stderr, 'output in', outfile
+ print('output in', outfile, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/filter-stoptags.py b/scripts/filter-stoptags.py
index 24175ff..d7e87f2 100755
--- a/scripts/filter-stoptags.py
+++ b/scripts/filter-stoptags.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -16,6 +16,7 @@ will be placed in 'infile.stopfilt'.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import os
import khmer
@@ -63,8 +64,8 @@ def main():
check_space(infiles, args.force)
- print >>sys.stderr, 'loading stop tags, with K', args.ksize
- htable = khmer.new_hashbits(args.ksize, 1, 1)
+ print('loading stop tags, with K', args.ksize, file=sys.stderr)
+ htable = khmer.Hashbits(args.ksize, 1, 1)
htable.load_stop_tags(stoptags)
def process_fn(record):
@@ -82,7 +83,7 @@ def main():
# the filtering loop
for infile in infiles:
- print >>sys.stderr, 'filtering', infile
+ print('filtering', infile, file=sys.stderr)
outfile = os.path.basename(infile) + '.stopfilt'
outfp = open(outfile, 'w')
@@ -90,7 +91,7 @@ def main():
tsp = ThreadedSequenceProcessor(process_fn)
tsp.start(verbose_loader(infile), outfp)
- print >>sys.stderr, 'output in', outfile
+ print('output in', outfile, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/find-knots.py b/scripts/find-knots.py
index bd41bfb..d224e06 100755
--- a/scripts/find-knots.py
+++ b/scripts/find-knots.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -13,6 +13,7 @@ k-mers are output into a .stoptags file, for later use in partitioning.
% python scripts/find-knots.py <base>
"""
+from __future__ import print_function
import argparse
import glob
@@ -21,7 +22,9 @@ import textwrap
import khmer
import sys
from khmer.kfile import check_input_files, check_space
-from khmer.khmer_args import info
+from khmer import khmer_args
+from khmer.khmer_args import (build_counting_args, info, add_loadhash_args,
+ report_on_config)
# counting hash parameters.
DEFAULT_COUNTING_HT_SIZE = 3e6 # number of bytes
@@ -61,21 +64,14 @@ def get_parser():
process, and if you eliminate the already-processed pmap files, you can
continue where you left off.
"""
- parser = argparse.ArgumentParser(
- description="Find all highly connected k-mers.",
- epilog=textwrap.dedent(epilog),
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
- parser.add_argument('--n_tables', '-N', type=int,
- default=DEFAULT_COUNTING_HT_N,
- help='number of k-mer counting tables to use')
- parser.add_argument('--min-tablesize', '-x', type=float,
- default=DEFAULT_COUNTING_HT_SIZE, help='lower bound on'
- ' the size of the k-mer counting table(s)')
+ parser = build_counting_args(
+ descr="Find all highly connected k-mers.",
+ epilog=textwrap.dedent(epilog))
+
parser.add_argument('graphbase', help='Basename for the input and output '
'files.')
- parser.add_argument('--version', action='version', version='%(prog)s ' +
- khmer.__version__)
+ parser.add_argument('-f', '--force', default=False, action='store_true',
+ help='Continue past warnings')
return parser
@@ -90,65 +86,67 @@ def main():
if os.path.exists(graphbase + '.stoptags'):
infiles.append(graphbase + '.stoptags')
for _ in infiles:
- check_input_files(_, False)
+ check_input_files(_, args.force)
- check_space(infiles)
+ check_space(infiles, args.force)
- print >>sys.stderr, 'loading k-mer presence table %s.pt' % graphbase
+ print('loading k-mer presence table %s.pt' % graphbase, file=sys.stderr)
htable = khmer.load_hashbits(graphbase + '.pt')
- print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
+ print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
htable.load_tagset(graphbase + '.tagset')
initial_stoptags = False # @CTB regularize with make-initial
if os.path.exists(graphbase + '.stoptags'):
- print >>sys.stderr, 'loading stoptags %s.stoptags' % graphbase
+ print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
htable.load_stop_tags(graphbase + '.stoptags')
initial_stoptags = True
pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
- print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
- (len(pmap_files), pmap_files[0])
- print >>sys.stderr, '---'
- print >>sys.stderr, 'output stoptags will be in', graphbase + '.stoptags'
+ print('loading %d pmap files (first one: %s)' %
+ (len(pmap_files), pmap_files[0]), file=sys.stderr)
+ print('---', file=sys.stderr)
+ print('output stoptags will be in',
+ graphbase + '.stoptags', file=sys.stderr)
if initial_stoptags:
- print >>sys.stderr, \
- '(these output stoptags will include the already-loaded set)'
- print >>sys.stderr, '---'
+ print(
+ '(these output stoptags will include the already-loaded set)',
+ file=sys.stderr)
+ print('---', file=sys.stderr)
# create counting hash
ksize = htable.ksize()
- counting = khmer.new_counting_hash(ksize, args.min_tablesize,
- args.n_tables)
+ counting = khmer_args.create_countgraph(args, ksize=ksize)
# load & merge
for index, subset_file in enumerate(pmap_files):
- print >>sys.stderr, '<-', subset_file
+ print('<-', subset_file, file=sys.stderr)
subset = htable.load_subset_partitionmap(subset_file)
- print >>sys.stderr, '** repartitioning subset... %s' % subset_file
+ print('** repartitioning subset... %s' % subset_file, file=sys.stderr)
htable.repartition_largest_partition(subset, counting,
EXCURSION_DISTANCE,
EXCURSION_KMER_THRESHOLD,
EXCURSION_KMER_COUNT_THRESHOLD)
- print >>sys.stderr, '** merging subset... %s' % subset_file
+ print('** merging subset... %s' % subset_file, file=sys.stderr)
htable.merge_subset(subset)
- print >>sys.stderr, '** repartitioning, round 2... %s' % subset_file
+ print('** repartitioning, round 2... %s' %
+ subset_file, file=sys.stderr)
size = htable.repartition_largest_partition(
None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
EXCURSION_KMER_COUNT_THRESHOLD)
- print >>sys.stderr, '** repartitioned size:', size
+ print('** repartitioned size:', size, file=sys.stderr)
- print >>sys.stderr, 'saving stoptags binary'
+ print('saving stoptags binary', file=sys.stderr)
htable.save_stop_tags(graphbase + '.stoptags')
os.rename(subset_file, subset_file + '.processed')
- print >>sys.stderr, '(%d of %d)\n' % (index, len(pmap_files))
+ print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr)
- print >>sys.stderr, 'done!'
+ print('done!', file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/interleave-reads.py b/scripts/interleave-reads.py
index 4f2696d..94d5776 100755
--- a/scripts/interleave-reads.py
+++ b/scripts/interleave-reads.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -16,13 +16,13 @@ and interleave them.
By default, output is sent to stdout; or use -o. Use '-h' for parameter help.
"""
+from __future__ import print_function
# TODO: take fa as well?
# support gzip option?
import screed
import sys
-import itertools
import os
import textwrap
import argparse
@@ -32,6 +32,11 @@ from khmer.khmer_args import info
from khmer.utils import (write_record_pair, check_is_left, check_is_right,
check_is_pair)
+try:
+ from itertools import zip_longest
+except ImportError:
+ from itertools import izip_longest as zip_longest
+
def get_parser():
epilog = """
@@ -77,38 +82,38 @@ def main():
else:
s2_file = s1_file.replace('_R1_', '_R2_')
if s1_file == s2_file:
- print >>sys.stderr, ("ERROR: given only one filename, that "
- "doesn't contain _R1_. Exiting.")
+ print(("ERROR: given only one filename, that "
+ "doesn't contain _R1_. Exiting."), file=sys.stderr)
sys.exit(1)
- print >> sys.stderr, ("given only one file; "
- "guessing that R2 file is %s" % s2_file)
+ print(("given only one file; "
+ "guessing that R2 file is %s" % s2_file), file=sys.stderr)
fail = False
if not os.path.exists(s1_file):
- print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
+ print("Error! R1 file %s does not exist" % s1_file, file=sys.stderr)
fail = True
if not os.path.exists(s2_file):
- print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
+ print("Error! R2 file %s does not exist" % s2_file, file=sys.stderr)
fail = True
if fail and not args.force:
sys.exit(1)
- print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)
+ print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)
counter = 0
screed_iter_1 = screed.open(s1_file, parse_description=False)
screed_iter_2 = screed.open(s2_file, parse_description=False)
- for read1, read2 in itertools.izip_longest(screed_iter_1, screed_iter_2):
+ for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
if read1 is None or read2 is None:
- print >>sys.stderr, ("ERROR: Input files contain different number"
- " of records.")
+ print(("ERROR: Input files contain different number"
+ " of records."), file=sys.stderr)
sys.exit(1)
if counter % 100000 == 0:
- print >> sys.stderr, '...', counter, 'pairs'
+ print('...', counter, 'pairs', file=sys.stderr)
counter += 1
name1 = read1.name
@@ -122,14 +127,14 @@ def main():
read2.name = name2
if not check_is_pair(read1, read2):
- print >>sys.stderr, "ERROR: This doesn't look like paired data! " \
- "%s %s" % (read1.name, read2.name)
+ print("ERROR: This doesn't look like paired data! "
+ "%s %s" % (read1.name, read2.name), file=sys.stderr)
sys.exit(1)
write_record_pair(read1, read2, args.output)
- print >> sys.stderr, 'final: interleaved %d pairs' % counter
- print >> sys.stderr, 'output written to', args.output.name
+ print('final: interleaved %d pairs' % counter, file=sys.stderr)
+ print('output written to', args.output.name, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/load-graph.py b/scripts/load-graph.py
index 9fbc7a3..999403e 100755
--- a/scripts/load-graph.py
+++ b/scripts/load-graph.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -13,6 +13,7 @@ Build a graph from the given sequences, save in <ptname>.
Use '-h' for parameter help.
"""
+from __future__ import print_function, unicode_literals
import sys
import threading
@@ -22,101 +23,21 @@ from khmer.khmer_args import build_hashbits_args
from khmer.khmer_args import (report_on_config, info, add_threading_args)
from khmer.kfile import check_input_files, check_space
from khmer.kfile import check_space_for_hashtable
+from oxli import build_graph
def get_parser():
parser = build_hashbits_args(descr="Load sequences into the compressible "
- "graph format plus optional tagset.")
- add_threading_args(parser)
- parser.add_argument('--no-build-tagset', '-n', default=False,
- action='store_true', dest='no_build_tagset',
- help='Do NOT construct tagset while loading sequences')
- parser.add_argument('output_filename',
- metavar='output_presence_table_filename', help='output'
- ' k-mer presence table filename.')
- parser.add_argument('input_filenames', metavar='input_sequence_filename',
- nargs='+', help='input FAST[AQ] sequence filename')
- parser.add_argument('--report-total-kmers', '-t', action='store_true',
- help="Prints the total number of k-mers to stderr")
- parser.add_argument('--write-fp-rate', '-w', action='store_true',
- help="Write false positive rate into .info file")
- parser.add_argument('-f', '--force', default=False, action='store_true',
- help='Overwrite output file if it exists')
- return parser
-
-
-def main():
- info('load-graph.py', ['graph', 'SeqAn'])
- args = get_parser().parse_args()
- report_on_config(args, hashtype='hashbits')
-
- base = args.output_filename
- filenames = args.input_filenames
-
- for _ in args.input_filenames:
- check_input_files(_, args.force)
-
- check_space(args.input_filenames, args.force)
- check_space_for_hashtable(
- (float(args.n_tables * args.min_tablesize) / 8.), args.force)
-
- print >>sys.stderr, 'Saving k-mer presence table to %s' % base
- print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)
- if args.no_build_tagset:
- print >>sys.stderr, 'We WILL NOT build the tagset.'
- else:
- print >>sys.stderr, 'We WILL build the tagset', \
- ' (for partitioning/traversal).'
-
- print >>sys.stderr, 'making k-mer presence table'
- htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)
-
- if args.no_build_tagset:
- target_method = htable.consume_fasta_with_reads_parser
- else:
- target_method = htable.consume_fasta_and_tag_with_reads_parser
+ "graph format plus optional tagset.")
- for _, filename in enumerate(filenames):
- rparser = khmer.ReadParser(filename)
- threads = []
- print >>sys.stderr, 'consuming input', filename
- for num in xrange(args.threads):
- cur_thread = threading.Thread(
- target=target_method, args=(rparser,))
- threads.append(cur_thread)
- cur_thread.start()
-
- for thread in threads:
- thread.join()
-
- if args.report_total_kmers:
- print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
- htable.n_unique_kmers())
-
- print >>sys.stderr, 'saving k-mer presence table in', base + '.pt'
- htable.save(base + '.pt')
-
- if not args.no_build_tagset:
- print >>sys.stderr, 'saving tagset in', base + '.tagset'
- htable.save_tagset(base + '.tagset')
-
- info_fp = open(base + '.info', 'w')
- info_fp.write('%d unique k-mers' % htable.n_unique_kmers())
-
- fp_rate = \
- khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
- # 0.18 is ACTUAL MAX. Do not change.
-
- print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
- if args.write_fp_rate:
- print >> info_fp, \
- '\nfalse positive rate estimated to be %1.3f' % fp_rate
+ parser = build_graph.build_parser(parser)
+ return parser
- print >> sys.stderr, 'wrote to', base + '.info and', base + '.pt'
- if not args.no_build_tagset:
- print >> sys.stderr, 'and ' + base + '.tagset'
if __name__ == '__main__':
- main()
+ parser = get_parser()
+ args = parser.parse_args()
+ build_graph.main(args)
+ sys.exit(0)
# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
diff --git a/scripts/load-into-counting.py b/scripts/load-into-counting.py
index b81e767..f907c36 100755
--- a/scripts/load-into-counting.py
+++ b/scripts/load-into-counting.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -12,6 +12,7 @@ Build a counting Bloom filter from the given sequences, save in <htname>.
Use '-h' for parameter help.
"""
+from __future__ import print_function, unicode_literals
import json
import os
@@ -19,6 +20,7 @@ import sys
import threading
import textwrap
import khmer
+from khmer import khmer_args
from khmer.khmer_args import build_counting_args, report_on_config, info,\
add_threading_args
from khmer.kfile import check_file_writable
@@ -58,8 +60,8 @@ def get_parser():
action='store_false', help="The default behaviour is "
"to count past 255 using bigcount. This flag turns "
"bigcount off, limiting counts to 255.")
- parser.add_argument('--summary-info', '-s', default=None, metavar="FORMAT",
- choices=['json', 'tsv'],
+ parser.add_argument('--summary-info', '-s', type=str, default=None,
+ metavar="FORMAT", choices=[str('json'), str('tsv')],
help="What format should the machine readable run "
"summary be in? (json or tsv, disabled by default)")
parser.add_argument('--report-total-kmers', '-t', action='store_true',
@@ -83,21 +85,21 @@ def main():
check_input_files(name, args.force)
check_space(args.input_sequence_filename, args.force)
- check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force)
+ check_space_for_hashtable(args, 'countgraph', args.force)
check_file_writable(base)
check_file_writable(base + ".info")
- print >>sys.stderr, 'Saving k-mer counting table to %s' % base
- print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)
+ print('Saving k-mer counting table to %s' % base, file=sys.stderr)
+ print('Loading kmers from sequences in %s' %
+ repr(filenames), file=sys.stderr)
# clobber the '.info' file now, as we always open in append mode below
if os.path.exists(base + '.info'):
os.remove(base + '.info')
- print >>sys.stderr, 'making k-mer counting table'
- htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
- args.n_tables)
+ print('making countgraph', file=sys.stderr)
+ htable = khmer_args.create_countgraph(args)
htable.set_use_bigcount(args.bigcount)
filename = None
@@ -108,8 +110,8 @@ def main():
rparser = khmer.ReadParser(filename)
threads = []
- print >>sys.stderr, 'consuming input', filename
- for _ in xrange(args.threads):
+ print('consuming input', filename, file=sys.stderr)
+ for _ in range(args.threads):
cur_thrd = \
threading.Thread(
target=htable.consume_fasta_with_reads_parser,
@@ -122,21 +124,21 @@ def main():
thread.join()
if index > 0 and index % 10 == 0:
- check_space_for_hashtable(args.n_tables * args.min_tablesize,
- args.force)
- print >>sys.stderr, 'mid-save', base
+ check_space_for_hashtable(args, 'countgraph', args.force)
+ print('mid-save', base, file=sys.stderr)
+
htable.save(base)
with open(base + '.info', 'a') as info_fh:
- print >> info_fh, 'through', filename
+ print('through', filename, file=info_fh)
total_num_reads += rparser.num_reads
n_kmers = htable.n_unique_kmers()
if args.report_total_kmers:
- print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
+ print('Total number of unique k-mers:', n_kmers, file=sys.stderr)
with open(base + '.info', 'a') as info_fp:
- print >>info_fp, 'Total number of unique k-mers:', n_kmers
+ print('Total number of unique k-mers:', n_kmers, file=info_fp)
- print >>sys.stderr, 'saving', base
+ print('saving', base, file=sys.stderr)
htable.save(base)
# Change max_false_pos=0.2 only if you really grok it. HINT: You don't
@@ -144,12 +146,12 @@ def main():
khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2)
with open(base + '.info', 'a') as info_fp:
- print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate
+ print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)
if args.summary_info:
mr_fmt = args.summary_info.lower()
mr_file = base + '.info.' + mr_fmt
- print >> sys.stderr, "Writing summmary info to", mr_file
+ print("Writing summmary info to", mr_file, file=sys.stderr)
with open(mr_file, 'w') as mr_fh:
if mr_fmt == 'json':
mr_data = {
@@ -173,10 +175,10 @@ def main():
]
mr_fh.write("\t".join(vals) + "\n")
- print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
+ print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
- print >>sys.stderr, 'DONE.'
- print >>sys.stderr, 'wrote to:', base + '.info'
+ print('DONE.', file=sys.stderr)
+ print('wrote to:', base + '.info', file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/make-initial-stoptags.py b/scripts/make-initial-stoptags.py
index 48a2741..29a08ef 100755
--- a/scripts/make-initial-stoptags.py
+++ b/scripts/make-initial-stoptags.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -11,10 +11,12 @@ Find an initial set of highly connected k-mers, to save on repartitioning time.
% python scripts/make-initial-stoptags.py <base>
"""
+from __future__ import print_function
import sys
import textwrap
import khmer
+from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, info)
from khmer.kfile import check_input_files, check_space
@@ -83,20 +85,19 @@ def main():
check_space(infiles, args.force)
- print >>sys.stderr, 'loading htable %s.pt' % graphbase
+ print('loading htable %s.pt' % graphbase, file=sys.stderr)
htable = khmer.load_hashbits(graphbase + '.pt')
# do we want to load stop tags, and do they exist?
if args.stoptags:
- print >>sys.stderr, 'loading stoptags from', args.stoptags
+ print('loading stoptags from', args.stoptags, file=sys.stderr)
htable.load_stop_tags(args.stoptags)
- print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
+ print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
htable.load_tagset(graphbase + '.tagset')
ksize = htable.ksize()
- counting = khmer.new_counting_hash(ksize, args.min_tablesize,
- args.n_tables)
+ counting = khmer_args.create_countgraph(args)
# divide up into SUBSET_SIZE fragments
divvy = htable.divide_tags_into_subsets(args.subset_size)
@@ -108,19 +109,19 @@ def main():
start, end = divvy[:2]
# partition!
- print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end
+ print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
subset = htable.do_subset_partition(start, end)
# now, repartition...
- print >>sys.stderr, 'repartitioning to find HCKs.'
+ print('repartitioning to find HCKs.', file=sys.stderr)
htable.repartition_largest_partition(subset, counting,
EXCURSION_DISTANCE,
EXCURSION_KMER_THRESHOLD,
EXCURSION_KMER_COUNT_THRESHOLD)
- print >>sys.stderr, 'saving stop tags'
+ print('saving stop tags', file=sys.stderr)
htable.save_stop_tags(graphbase + '.stoptags')
- print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
+ print('wrote to:', graphbase + '.stoptags', file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/merge-partitions.py b/scripts/merge-partitions.py
index 99f6a84..c77d822 100755
--- a/scripts/merge-partitions.py
+++ b/scripts/merge-partitions.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -14,6 +14,7 @@ Merge multiple pmap files into a single one.
Load <base>.subset.*.pmap and merge into a single pmap file. Final
merged pmap file will be in <base>.pmap.merged.
"""
+from __future__ import print_function
import argparse
import glob
@@ -56,11 +57,11 @@ def main():
output_file = args.graphbase + '.pmap.merged'
pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
- print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
- (len(pmap_files), pmap_files[0])
+ print('loading %d pmap files (first one: %s)' %
+ (len(pmap_files), pmap_files[0]), file=sys.stderr)
ksize = args.ksize
- htable = khmer.new_hashbits(ksize, 1, 1)
+ htable = khmer.Hashbits(ksize, 1, 1)
for _ in pmap_files:
check_input_files(_, args.force)
@@ -68,14 +69,14 @@ def main():
check_space(pmap_files, args.force)
for pmap_file in pmap_files:
- print >>sys.stderr, 'merging', pmap_file
+ print('merging', pmap_file, file=sys.stderr)
htable.merge_subset_from_disk(pmap_file)
- print >>sys.stderr, 'saving merged to', output_file
+ print('saving merged to', output_file, file=sys.stderr)
htable.save_partitionmap(output_file)
if args.remove_subsets:
- print >>sys.stderr, 'removing pmap files'
+ print('removing pmap files', file=sys.stderr)
for pmap_file in pmap_files:
os.unlink(pmap_file)
diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py
index 22c84dc..3bb2ba7 100755
--- a/scripts/normalize-by-median.py
+++ b/scripts/normalize-by-median.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -17,162 +17,139 @@ option to output to STDOUT.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import sys
import screed
import os
import khmer
import textwrap
-from itertools import izip
+from khmer import khmer_args
+from contextlib import contextmanager
+
from khmer.khmer_args import (build_counting_args, add_loadhash_args,
report_on_config, info)
import argparse
from khmer.kfile import (check_space, check_space_for_hashtable,
check_valid_file_exists)
-from khmer.utils import write_record, check_is_pair
-DEFAULT_DESIRED_COVERAGE = 10
-
-# Iterate a collection in arbitrary batches
-# from: http://stackoverflow.com/questions/4628290/pairs-from-single-list
-
+from khmer.utils import write_record, check_is_pair, broken_paired_reader
-def batchwise(coll, size):
- iter_coll = iter(coll)
- return izip(*[iter_coll] * size)
-# Returns true if the pair of records are properly pairs
+DEFAULT_DESIRED_COVERAGE = 20
-# pylint: disable=too-many-locals,too-many-branches
-def normalize_by_median(input_filename, outfp, htable, paired, cutoff,
- report_fp=None):
+def WithDiagnostics(ifilename, norm, reader, fp):
+ """
+ Generator/context manager to do boilerplate output of statistics using a
+ Normalizer object.
+ """
- desired_coverage = cutoff
- ksize = htable.ksize()
+ index = 0
- # In paired mode we read two records at a time
- batch_size = 1
- if paired:
- batch_size = 2
+ # per read diagnostic output
+ for index, record in enumerate(norm(reader)):
- index = -1
- total = 0
- discarded = 0
- for index, batch in enumerate(batchwise(screed.open(
- input_filename, parse_description=False), batch_size)):
- if index > 0 and index % 100000 == 0:
- print >>sys.stderr, '... kept {kept} of {total} or'\
- ' {perc:2}%'.format(kept=total - discarded, total=total,
- perc=int(100. - discarded /
- float(total) * 100.))
- print >>sys.stderr, '... in file', input_filename
+ if norm.total % 100000 == 0:
+ print('... kept {kept} of {total} or {perc:2}% so far'
+ .format(kept=norm.total - norm.discarded,
+ total=norm.total,
+ perc=int(100. - norm.discarded /
+ float(norm.total) * 100.)),
+ file=sys.stderr)
- if report_fp:
- print >> report_fp, total, total - discarded, \
- 1. - (discarded / float(total))
- report_fp.flush()
+ print('... in file ' + ifilename, file=sys.stderr)
- total += batch_size
+ yield record
- # If in paired mode, check that the reads are properly interleaved
+ # per file diagnostic output
+ if norm.total == 0:
+ print('SKIPPED empty file ' + ifilename, file=sys.stderr)
+ else:
+ print('DONE with {inp}; kept {kept} of {total} or {perc:2}%'
+ .format(inp=ifilename, kept=norm.total - norm.discarded,
+ total=norm.total, perc=int(100. - norm.discarded /
+ float(norm.total) * 100.)),
+ file=sys.stderr)
- if paired:
- if not check_is_pair(batch[0], batch[1]):
- raise IOError('Error: Improperly interleaved pairs \
- {b0} {b1}'.format(b0=batch[0].name, b1=batch[1].name))
+ if fp:
+ print("{total} {kept} {discarded}"
+ .format(total=norm.total, kept=norm.total - norm.discarded,
+ discarded=1. - (norm.discarded / float(norm.total))),
+ file=fp)
+ fp.flush()
- # Emit the batch of reads if any read passes the filter
- # and all reads are longer than K
- passed_filter = False
- passed_length = True
- for record in batch:
- if len(record.sequence) < ksize:
- passed_length = False
- continue
- seq = record.sequence.replace('N', 'A')
- med, _, _ = htable.get_median_count(seq)
+class Normalizer(object):
+ """
+ Digital normalization algorithm.
+ """
- if med < desired_coverage:
- htable.consume(seq)
- passed_filter = True
+ def __init__(self, desired_coverage, htable):
+ self.htable = htable
+ self.desired_coverage = desired_coverage
- # Emit records if any passed
- if passed_length and passed_filter:
- for record in batch:
- write_record(record, outfp)
- else:
- discarded += batch_size
+ self.total = 0
+ self.discarded = 0
- if report_fp:
- print >> report_fp, total, total - discarded, \
- 1. - (discarded / float(total))
- report_fp.flush()
+ def __call__(self, reader):
+ """
+ Actually does digital normalization - the core algorithm.
- return total, discarded
+ * get one (unpaired) or two (paired) reads;
+ * sanitize the sequences (convert Ns to As);
+ * get the median k-mer count of one/both reads;
+ * if any read's median k-mer count is below desired coverage, keep all;
+ * consume and yield kept reads.
+ """
+ desired_coverage = self.desired_coverage
-def handle_error(error, output_name, input_name, fail_save, htable):
- print >> sys.stderr, '** ERROR:', error
- print >> sys.stderr, '** Failed on {name}: '.format(name=input_name)
- if fail_save:
- tablename = os.path.basename(input_name) + '.ct.failed'
- print >> sys.stderr, \
- '** ...dumping k-mer counting table to {tn}'.format(tn=tablename)
- htable.save(tablename)
- try:
- os.remove(output_name)
- except: # pylint: disable=bare-except
- print >> sys.stderr, '** ERROR: problem removing corrupt filtered file'
+ for index, is_paired, read0, read1 in reader:
+ passed_filter = False
+ self.total += 1
-def normalize_by_median_and_check(input_filename, htable, single_output_file,
- fail_save, paired, cutoff, force,
- corrupt_files, report_fp=None):
- total = 0
- discarded = 0
+ if is_paired:
+ self.total += 1
- total_acc = None
- discarded_acc = None
+ batch = []
+ batch.append(read0)
+ if read1 is not None:
+ batch.append(read1)
- if single_output_file:
- if single_output_file is sys.stdout:
- output_name = '/dev/stdout'
- else:
- output_name = single_output_file.name
- outfp = single_output_file
+ for record in batch:
+ seq = record.sequence.replace('N', 'A')
+ if not self.htable.median_at_least(seq, desired_coverage):
+ passed_filter = True
+
+ if passed_filter:
+ for record in batch:
+ seq = record.sequence.replace('N', 'A')
+ self.htable.consume(seq)
+ yield record
+ else:
+ self.discarded += len(batch)
- else:
- output_name = os.path.basename(input_filename) + '.keep'
- outfp = open(output_name, 'w')
+@contextmanager
+def CatchIOErrors(ifile, out, single_out, force, corrupt_files):
+ """
+ Context manager to do boilerplate handling of IOErrors.
+ """
try:
- total_acc, discarded_acc = normalize_by_median(
- input_filename, outfp, htable, paired, cutoff, report_fp=None)
- except IOError as err:
- handle_error(err, output_name, input_filename, fail_save,
- htable)
+ yield
+ except (IOError, ValueError) as error:
+ print('** ERROR: ' + str(error), file=sys.stderr)
+ print('** Failed on {name}: '.format(name=ifile), file=sys.stderr)
+ if not single_out:
+ os.remove(out.name)
if not force:
- print >> sys.stderr, '** Exiting!'
+ print('** Exiting!', file=sys.stderr)
sys.exit(1)
else:
- print >> sys.stderr, '*** Skipping error file, moving on...'
- corrupt_files.append(input_filename)
- else:
- if total_acc == 0 and discarded_acc == 0:
- print >> sys.stderr, 'SKIPPED empty file', input_filename
- else:
- total += total_acc
- discarded += discarded_acc
- print >> sys.stderr, \
- 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
- .format(inp=input_filename, kept=total - discarded,
- total=total, perc=int(100. - discarded /
- float(total) * 100.))
- print >> sys.stderr, 'output in', output_name
-
- return total_acc, discarded_acc, corrupt_files
+ print('*** Skipping error file, moving on...', file=sys.stderr)
+ corrupt_files.append(ifile)
def get_parser():
@@ -180,12 +157,18 @@ def get_parser():
Discard sequences based on whether or not their median k-mer abundance lies
above a specified cutoff. Kept sequences will be placed in <fileN>.keep.
- Paired end reads will be considered together if :option:`-p` is set. If
- either read will be kept, then both will be kept. This should result in
- keeping (or discarding) each sequencing fragment. This helps with retention
- of repeats, especially. With :option: `-u`/:option:`--unpaired-reads`,
- unpaired reads from the specified file will be read after the paired data
- is read.
+ By default, paired end reads will be considered together; if
+ either read should be kept, both will be kept. (This keeps both
+ reads from a fragment, and helps with retention of repeats.)
+ Unpaired reads are treated individually.
+
+ If :option:`-p`/`--paired` is set, then proper pairing is required
+ and the script will exit on unpaired reads, although
+ :option:`--unpaired-reads` can be used to supply a file of orphan
+ reads to be read after the paired reads.
+
+ :option:`--force-single` will ignore all pairing information and treat
+ reads individually.
With :option:`-s`/:option:`--savetable`, the k-mer counting table
will be saved to the specified file after all sequences have been
@@ -198,11 +181,6 @@ def get_parser():
produced by :program:`load-into-counting.py` and consumed by
:program:`abundance-dist.py`.
- :option:`-f`/:option:`--fault-tolerant` will force the program to continue
- upon encountering a formatting error in a sequence file; the k-mer counting
- table up to that point will be dumped, and processing will continue on the
- next file.
-
To append reads to an output file (rather than overwriting it), send output
to STDOUT with `--out -` and use UNIX file redirection syntax (`>>`) to
append to the file.
@@ -234,24 +212,23 @@ def get_parser():
epilog=textwrap.dedent(epilog))
parser.add_argument('-C', '--cutoff', type=int,
default=DEFAULT_DESIRED_COVERAGE)
- parser.add_argument('-p', '--paired', action='store_true')
+ parser.add_argument('-p', '--paired', action='store_true',
+ help='require that all sequences be properly paired')
+ parser.add_argument('--force-single', dest='force_single',
+ action='store_true',
+ help='treat all sequences as single-ended/unpaired')
parser.add_argument('-u', '--unpaired-reads',
- metavar="unpaired_reads_filename", help='with paired data only,\
- include an unpaired file')
+ metavar="unpaired_reads_filename",
+ help='include a file of unpaired reads to which '
+ '-p/--paired does not apply.')
parser.add_argument('-s', '--savetable', metavar="filename", default='',
help='save the k-mer counting table to disk after all'
'reads are loaded.')
parser.add_argument('-R', '--report',
metavar='filename', type=argparse.FileType('w'))
- parser.add_argument('-f', '--fault-tolerant', dest='force',
+ parser.add_argument('-f', '--force', dest='force',
help='continue on next file if read errors are \
encountered', action='store_true')
- parser.add_argument('--save-on-failure', dest='fail_save',
- action='store_false', default=True,
- help='Save k-mer counting table when an error occurs')
- parser.add_argument('-d', '--dump-frequency', dest='dump_frequency',
- type=int, help='dump k-mer counting table every d '
- 'files', default=-1)
parser.add_argument('-o', '--out', metavar="filename",
dest='single_output_file',
type=argparse.FileType('w'),
@@ -261,11 +238,6 @@ def get_parser():
'terminal)')
parser.add_argument('input_filenames', metavar='input_sequence_filename',
help='Input FAST[AQ] sequence filename.', nargs='+')
- parser.add_argument('--report-total-kmers', '-t', action='store_true',
- help="Prints the total number of k-mers"
- " post-normalization to stderr")
- parser.add_argument('--force', default=False, action='store_true',
- help='Overwrite output file if it exists')
add_loadhash_args(parser)
return parser
@@ -277,88 +249,116 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
report_on_config(args)
report_fp = args.report
+ force_single = args.force_single
# check for similar filenames
+ # if we're using a single output file only check for identical filenames
+ # otherwise, check for identical BASE names as well.
filenames = []
+ basenames = []
for pathfilename in args.input_filenames:
- filename = pathfilename.split('/')[-1]
- if (filename in filenames):
- print >>sys.stderr, "WARNING: At least two input files are named \
-%s . (The script normalize-by-median.py can not handle this, only one .keep \
-file for one of the input files will be generated.)" % filename
- else:
- filenames.append(filename)
+ filenames.append(pathfilename)
+ if args.single_output_file:
+ continue # nothing more to worry about
+
+ basename = os.path.basename(pathfilename)
+ if basename in basenames:
+ print('ERROR: Duplicate filename--Cannot handle this!',
+ file=sys.stderr)
+ print('** Exiting!', file=sys.stderr)
+ sys.exit(1)
+
+ basenames.append(basename)
- # check for others
+ # check that files exist and there is sufficient output disk space.
check_valid_file_exists(args.input_filenames)
check_space(args.input_filenames, args.force)
if args.savetable:
- check_space_for_hashtable(
- args.n_tables * args.min_tablesize, args.force)
-
- # list to save error files along with throwing exceptions
- corrupt_files = []
+ check_space_for_hashtable(args, 'countgraph', args.force)
+ # load or create counting table.
if args.loadtable:
- print 'loading k-mer counting table from', args.loadtable
+ print('loading k-mer counting table from ' + args.loadtable,
+ file=sys.stderr)
htable = khmer.load_counting_hash(args.loadtable)
else:
- print >> sys.stderr, 'making k-mer counting table'
- htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
- args.n_tables)
+ print('making countgraph', file=sys.stderr)
+ htable = khmer_args.create_countgraph(args)
input_filename = None
- for index, input_filename in enumerate(args.input_filenames):
- total_acc, discarded_acc, corrupt_files = \
- normalize_by_median_and_check(
- input_filename, htable, args.single_output_file,
- args.fail_save, args.paired, args.cutoff, args.force,
- corrupt_files, report_fp)
-
- if (args.dump_frequency > 0 and
- index > 0 and index % args.dump_frequency == 0):
- print 'Backup: Saving k-mer counting file through', input_filename
- if args.savetable:
- hashname = args.savetable
- print '...saving to', hashname
- else:
- hashname = 'backup.ct'
- print 'Nothing given for savetable, saving to', hashname
- htable.save(hashname)
+ # create an object to handle diginorm of all files
+ norm = Normalizer(args.cutoff, htable)
+
+ # make a list of all filenames and if they're paired or not;
+ # if we don't know if they're paired, default to allowing but not
+ # forcing pairing.
+ files = []
+ for e in filenames:
+ files.append([e, args.paired])
+ if args.unpaired_reads:
+ files.append([args.unpaired_reads, False])
+
+ corrupt_files = []
+ outfp = None
+ output_name = None
- if args.paired and args.unpaired_reads:
- args.paired = False
- output_name = args.unpaired_reads
+ if args.single_output_file:
+ if args.single_output_file is sys.stdout:
+ output_name = '/dev/stdout'
+ else:
+ output_name = args.single_output_file.name
+ outfp = args.single_output_file
+
+ #
+ # main loop: iterate over all files given, do diginorm.
+ #
+
+ for filename, require_paired in files:
if not args.single_output_file:
- output_name = os.path.basename(args.unpaired_reads) + '.keep'
- outfp = open(output_name, 'w')
- total_acc, discarded_acc, corrupt_files = \
- normalize_by_median_and_check(
- args.unpaired_reads, htable, args.single_output_file,
- args.fail_save, args.paired, args.cutoff, args.force,
- corrupt_files, report_fp)
-
- if args.report_total_kmers:
- print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
- htable.n_unique_kmers())
+ output_name = os.path.basename(filename) + '.keep'
+ outfp = open(output_name, 'w')
+
+ # failsafe context manager in case an input file breaks
+ with CatchIOErrors(filename, outfp, args.single_output_file,
+ args.force, corrupt_files):
+
+ screed_iter = screed.open(filename, parse_description=False)
+ reader = broken_paired_reader(screed_iter, min_length=args.ksize,
+ force_single=force_single,
+ require_paired=require_paired)
+
+ # actually do diginorm
+ for record in WithDiagnostics(filename, norm, reader, report_fp):
+ if record is not None:
+ write_record(record, outfp)
+
+ print('output in ' + output_name, file=sys.stderr)
+ if output_name != '/dev/stdout':
+ outfp.close()
+
+ # finished - print out some diagnostics.
+
+ print('Total number of unique k-mers: {0}'
+ .format(htable.n_unique_kmers()),
+ file=sys.stderr)
if args.savetable:
- print 'Saving k-mer counting table through', input_filename
- print '...saving to', args.savetable
+ print('...saving to ' + args.savetable, file=sys.stderr)
htable.save(args.savetable)
fp_rate = \
khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
# for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
- print >> sys.stderr, \
- 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)
+ print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
+ file=sys.stderr)
if args.force and len(corrupt_files) > 0:
- print >> sys.stderr, "** WARNING: Finished with errors!"
- print >> sys.stderr, "** IOErrors occurred in the following files:"
- print >> sys.stderr, "\t", " ".join(corrupt_files)
+ print("** WARNING: Finished with errors!", file=sys.stderr)
+ print("** IOErrors occurred in the following files:", file=sys.stderr)
+ print("\t", " ".join(corrupt_files), file=sys.stderr)
+
if __name__ == '__main__':
main()
diff --git a/scripts/partition-graph.py b/scripts/partition-graph.py
index 63f1e1a..73666e2 100755
--- a/scripts/partition-graph.py
+++ b/scripts/partition-graph.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -15,9 +15,9 @@ This will output many <base>.subset.N.pmap files.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import threading
-import Queue
import gc
import os.path
import argparse
@@ -31,13 +31,19 @@ import re
import platform
if "Linux" == platform.system():
def __debug_vm_usage(msg):
- print >>sys.stderr, "===> DEBUG: " + msg
+ print("===> DEBUG: " + msg, file=sys.stderr)
for vmstat in re.findall(r".*Vm.*", file("/proc/self/status").read()):
- print >>sys.stderr, vmstat
+ print(vmstat, file=sys.stderr)
else:
def __debug_vm_usage(msg): # pylint: disable=unused-argument
pass
+# stdlib queue module was renamed on Python 3
+try:
+ import queue
+except ImportError:
+ import Queue as queue
+
DEFAULT_SUBSET_SIZE = int(1e5)
DEFAULT_N_THREADS = 4
@@ -46,23 +52,23 @@ def worker(queue, basename, stop_big_traversals):
while True:
try:
(htable, index, start, stop) = queue.get(False)
- except Queue.Empty:
- print >>sys.stderr, 'exiting'
+ except queue.Empty:
+ print('exiting', file=sys.stderr)
return
outfile = basename + '.subset.%d.pmap' % (index,)
if os.path.exists(outfile):
- print >>sys.stderr, 'SKIPPING', outfile, ' -- already exists'
+ print('SKIPPING', outfile, ' -- already exists', file=sys.stderr)
continue
- print >>sys.stderr, 'starting:', basename, index
+ print('starting:', basename, index, file=sys.stderr)
# pay attention to stoptags when partitioning; take command line
# direction on whether or not to exhaustively traverse.
subset = htable.do_subset_partition(start, stop, True,
stop_big_traversals)
- print >>sys.stderr, 'saving:', basename, index
+ print('saving:', basename, index, file=sys.stderr)
htable.save_subset_partitionmap(subset, outfile)
del subset
gc.collect()
@@ -107,30 +113,30 @@ def main():
check_space(filenames, args.force)
- print >>sys.stderr, '--'
- print >>sys.stderr, 'SUBSET SIZE', args.subset_size
- print >>sys.stderr, 'N THREADS', args.threads
+ print('--', file=sys.stderr)
+ print('SUBSET SIZE', args.subset_size, file=sys.stderr)
+ print('N THREADS', args.threads, file=sys.stderr)
if args.stoptags:
- print >>sys.stderr, 'stoptag file:', args.stoptags
- print >>sys.stderr, '--'
+ print('stoptag file:', args.stoptags, file=sys.stderr)
+ print('--', file=sys.stderr)
- print >>sys.stderr, 'loading ht %s.pt' % basename
+ print('loading ht %s.pt' % basename, file=sys.stderr)
htable = khmer.load_hashbits(basename + '.pt')
htable.load_tagset(basename + '.tagset')
# do we want to load stop tags, and do they exist?
if args.stoptags:
- print >>sys.stderr, 'loading stoptags from', args.stoptags
+ print('loading stoptags from', args.stoptags, file=sys.stderr)
htable.load_stop_tags(args.stoptags)
# do we want to exhaustively traverse the graph?
stop_big_traversals = args.no_big_traverse
if stop_big_traversals:
- print >>sys.stderr, '** This script brakes for lumps:', \
- ' stop_big_traversals is true.'
+ print('** This script brakes for lumps:',
+ ' stop_big_traversals is true.', file=sys.stderr)
else:
- print >>sys.stderr, '** Traverse all the things:', \
- ' stop_big_traversals is false.'
+ print('** Traverse all the things:',
+ ' stop_big_traversals is false.', file=sys.stderr)
#
# now, partition!
@@ -142,7 +148,7 @@ def main():
divvy.append(0)
# build a queue of tasks:
- worker_q = Queue.Queue()
+ worker_q = queue.Queue()
# break up the subsets into a list of worker tasks
for _ in range(0, n_subsets):
@@ -150,7 +156,7 @@ def main():
end = divvy[_ + 1]
worker_q.put((htable, _, start, end))
- print >>sys.stderr, 'enqueued %d subset tasks' % n_subsets
+ print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets))
n_threads = args.threads
@@ -158,8 +164,8 @@ def main():
n_threads = n_subsets
# start threads!
- print >>sys.stderr, 'starting %d threads' % n_threads
- print >>sys.stderr, '---'
+ print('starting %d threads' % n_threads, file=sys.stderr)
+ print('---', file=sys.stderr)
threads = []
for _ in range(n_threads):
@@ -168,15 +174,15 @@ def main():
threads.append(cur_thrd)
cur_thrd.start()
- print >>sys.stderr, 'done starting threads'
+ print('done starting threads', file=sys.stderr)
# wait for threads
for _ in threads:
_.join()
- print >>sys.stderr, '---'
- print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \
- (basename,)
+ print('---', file=sys.stderr)
+ print('done making subsets! see %s.subset.*.pmap' %
+ (basename,), file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/readstats.py b/scripts/readstats.py
index d3c5cd5..d8e995f 100755
--- a/scripts/readstats.py
+++ b/scripts/readstats.py
@@ -1,7 +1,7 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -12,6 +12,7 @@ Display summary statistics for one or more FASTA/FASTQ files.
Use '-h' for parameter help.
"""
+from __future__ import print_function
import sys
import csv
@@ -26,7 +27,7 @@ def get_parser():
Report number of bases, number of sequences, and average sequence length
for one or more FASTA/FASTQ files; and report aggregate statistics at end.
- With :option:`-o`/:options:`--output`, the output will be saved to the
+ With :option:`-o`/:option:`--output`, the output will be saved to the
specified file.
Example::
@@ -130,7 +131,7 @@ def analyze_file(filename):
input_iter = screed.open(filename, parse_description=False)
for record in input_iter:
if seqs % 100000 == 0:
- print >>sys.stderr, '...', filename, seqs
+ print('...', filename, seqs, file=sys.stderr)
bps += len(record.sequence)
seqs += 1
return bps, seqs
@@ -150,8 +151,8 @@ def main():
try:
bps, seqs = analyze_file(filename)
except (IOError, OSError, EOFError) as exc:
- print >>sys.stderr, 'ERROR in opening %s:' % filename
- print >>sys.stderr, ' ', str(exc)
+ print('ERROR in opening %s:' % filename, file=sys.stderr)
+ print(' ', str(exc), file=sys.stderr)
continue
if seqs:
@@ -161,11 +162,9 @@ def main():
seqs,
avg,
filename)
-
- print >>sys.stderr, '... found', msg
-
+ print('... found', msg, file=sys.stderr)
else:
- print >>sys.stderr, 'No sequences found in %s' % filename
+ print('No sequences found in %s' % filename, file=sys.stderr)
if statistics:
if args.csv:
@@ -176,8 +175,8 @@ def main():
for stat in statistics:
out.append(*stat)
else:
- print >>args.outfp, \
- 'No sequences found in %d files' % len(args.filenames)
+ print('No sequences found in %d files' %
+ len(args.filenames), file=args.outfp)
if __name__ == '__main__':
diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py
index 64323e4..79b4777 100755
--- a/scripts/sample-reads-randomly.py
+++ b/scripts/sample-reads-randomly.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This script is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This script is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -17,6 +17,7 @@ By default take one subsample, but take -S samples if specified.
Reads FASTQ and FASTA input, retains format for output.
"""
+from __future__ import print_function
import argparse
import screed
@@ -110,17 +111,18 @@ def main():
output_filename = os.path.basename(filename) + '.subset'
if num_samples == 1:
- print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' %\
- args.num_reads
- print >>sys.stderr, 'Subsampled reads will be placed in %s' % \
- output_filename
- print >>sys.stderr, ''
+ print('Subsampling %d reads using reservoir sampling.' %
+ args.num_reads, file=sys.stderr)
+ print('Subsampled reads will be placed in %s' %
+ output_filename, file=sys.stderr)
+ print('', file=sys.stderr)
else: # > 1
- print >>sys.stderr, 'Subsampling %d reads, %d times,' \
- % (args.num_reads, num_samples), ' using reservoir sampling.'
- print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \
- % output_filename
- print >>sys.stderr, ''
+ print('Subsampling %d reads, %d times,'
+ % (args.num_reads, num_samples), ' using reservoir sampling.',
+ file=sys.stderr)
+ print('Subsampled reads will be placed in %s.N'
+ % output_filename, file=sys.stderr)
+ print('', file=sys.stderr)
reads = []
for n in range(num_samples):
@@ -128,17 +130,17 @@ def main():
# read through all the sequences and load/resample the reservoir
for filename in args.filenames:
- print >>sys.stderr, 'opening', filename, 'for reading'
+ print('opening', filename, 'for reading', file=sys.stderr)
screed_iter = screed.open(filename, parse_description=False)
for count, (_, ispair, rcrd1, rcrd2) in enumerate(broken_paired_reader(
screed_iter,
force_single=args.force_single)):
if count % 10000 == 0:
- print >>sys.stderr, '...', count, 'reads scanned'
+ print('...', count, 'reads scanned', file=sys.stderr)
if count >= args.max_reads:
- print >>sys.stderr, 'reached upper limit of %d reads' % \
- args.max_reads, '(see -M); exiting'
+ print('reached upper limit of %d reads' %
+ args.max_reads, '(see -M); exiting', file=sys.stderr)
break
# collect first N reads
@@ -158,8 +160,8 @@ def main():
# output all the subsampled reads:
if len(reads) == 1:
- print >>sys.stderr, 'Writing %d sequences to %s' % \
- (len(reads[0]), output_filename)
+ print('Writing %d sequences to %s' %
+ (len(reads[0]), output_filename), file=sys.stderr)
if not output_file:
output_file = open(output_filename, 'w')
@@ -170,8 +172,8 @@ def main():
else:
for n in range(num_samples):
n_filename = output_filename + '.%d' % n
- print >>sys.stderr, 'Writing %d sequences to %s' % \
- (len(reads[n]), n_filename)
+ print('Writing %d sequences to %s' %
+ (len(reads[n]), n_filename), file=sys.stderr)
output_file = open(n_filename, 'w')
for records in reads[n]:
write_record(records[0], output_file)
diff --git a/scripts/split-paired-reads.py b/scripts/split-paired-reads.py
index f80cf65..e9eac94 100755
--- a/scripts/split-paired-reads.py
+++ b/scripts/split-paired-reads.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This script is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This script is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -16,6 +16,7 @@ files (.1 and .2).
Reads FASTQ and FASTA input, retains format for output.
"""
+from __future__ import print_function
import screed
import sys
import os
@@ -63,7 +64,7 @@ def get_parser():
epilog=textwrap.dedent(epilog),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('infile')
+ parser.add_argument('infile', nargs='?', default='/dev/stdin')
parser.add_argument('-o', '--output-dir', metavar="output_directory",
dest='output_directory', default='', help='Output '
@@ -72,10 +73,10 @@ def get_parser():
parser.add_argument('-1', '--output-first', metavar='output_first',
default=None, help='Output "left" reads to this '
- 'file')
+ 'file', type=argparse.FileType('w'))
parser.add_argument('-2', '--output-second', metavar='output_second',
default=None, help='Output "right" reads to this '
- 'file')
+ 'file', type=argparse.FileType('w'))
parser.add_argument('-p', '--force-paired', action='store_true',
help='Require that reads be interleaved')
@@ -92,12 +93,17 @@ def main():
infile = args.infile
- check_input_files(infile, args.force)
filenames = [infile]
+ check_input_files(infile, args.force)
check_space(filenames, args.force)
# decide where to put output files - specific directory? or just default?
- if args.output_directory:
+ if infile == '/dev/stdin' or infile == '-':
+ if not (args.output_first and args.output_second):
+ print("Accepting input from stdin; "
+ "output filenames must be provided.", file=sys.stderr)
+ sys.exit(1)
+ elif args.output_directory:
if not os.path.exists(args.output_directory):
os.makedirs(args.output_directory)
out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
@@ -108,12 +114,17 @@ def main():
# OVERRIDE output file locations with -1, -2
if args.output_first:
- out1 = args.output_first
+ fp_out1 = args.output_first
+ out1 = fp_out1.name
+ else:
+ # Use default filename created above
+ fp_out1 = open(out1, 'w')
if args.output_second:
- out2 = args.output_second
-
- fp_out1 = open(out1, 'w')
- fp_out2 = open(out2, 'w')
+ fp_out2 = args.output_second
+ out2 = fp_out2.name
+ else:
+ # Use default filename created above
+ fp_out2 = open(out2, 'w')
counter1 = 0
counter2 = 0
@@ -122,14 +133,15 @@ def main():
screed_iter = screed.open(infile, parse_description=False)
# walk through all the reads in broken-paired mode.
- for index, is_pair, record1, record2 in broken_paired_reader(screed_iter):
- if index % 100000 == 0 and index:
- print >> sys.stderr, '...', index
+ paired_iter = broken_paired_reader(screed_iter)
+ for index, is_pair, record1, record2 in paired_iter:
+ if index % 10000 == 0:
+ print('...', index, file=sys.stderr)
# are we requiring pairs?
if args.force_paired and not is_pair:
- print >>sys.stderr, 'ERROR, %s is not part of a pair' % \
- record1.name
+ print('ERROR, %s is not part of a pair' %
+ record1.name, file=sys.stderr)
sys.exit(1)
if is_pair:
@@ -146,15 +158,15 @@ def main():
write_record(record1, fp_out2)
counter2 += 1
else:
- print >>sys.stderr, \
- "Unrecognized format for read pair information: %s" % name
- print >>sys.stderr, "Exiting."
+ print("Unrecognized format for read pair information: %s" %
+ name, file=sys.stderr)
+ print("Exiting.", file=sys.stderr)
sys.exit(1)
- print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
- (counter1 + counter2, counter1, counter2)
- print >> sys.stderr, "/1 reads in %s" % out1
- print >> sys.stderr, "/2 reads in %s" % out2
+ print("DONE; split %d sequences (%d left, %d right)" %
+ (counter1 + counter2, counter1, counter2), file=sys.stderr)
+ print("/1 reads in %s" % out1, file=sys.stderr)
+ print("/2 reads in %s" % out2, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py
index 564e24e..741b181 100755
--- a/scripts/trim-low-abund.py
+++ b/scripts/trim-low-abund.py
@@ -1,6 +1,6 @@
-#! /usr/bin/env python2
+#! /usr/bin/env python
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -14,6 +14,7 @@ Output sequences will be placed in 'infile.abundtrim'.
Use -h for parameter help.
"""
+from __future__ import print_function
import sys
import screed
import os
@@ -23,7 +24,9 @@ import shutil
import textwrap
import argparse
-from screed.screedRecord import _screed_record_dict
+from screed import Record
+from khmer import khmer_args
+
from khmer.khmer_args import (build_counting_args, info, add_loadhash_args,
report_on_config)
from khmer.utils import write_record, write_record_pair, broken_paired_reader
@@ -35,7 +38,7 @@ DEFAULT_CUTOFF = 2
def trim_record(read, trim_at):
- new_read = _screed_record_dict()
+ new_read = Record()
new_read.name = read.name
new_read.sequence = read.sequence[:trim_at]
if hasattr(read, 'quality'):
@@ -113,8 +116,8 @@ def main():
###
if len(set(args.input_filenames)) != len(args.input_filenames):
- print >>sys.stderr, \
- "Error: Cannot input the same filename multiple times."
+ print("Error: Cannot input the same filename multiple times.",
+ file=sys.stderr)
sys.exit(1)
###
@@ -123,24 +126,22 @@ def main():
check_valid_file_exists(args.input_filenames)
check_space(args.input_filenames, args.force)
if args.savetable:
- check_space_for_hashtable(
- args.n_tables * args.min_tablesize, args.force)
-
- K = args.ksize
-
- CUTOFF = args.cutoff
- NORMALIZE_LIMIT = args.normalize_to
+ check_space_for_hashtable(args, 'countgraph', args.force)
if args.loadtable:
- print >>sys.stderr, 'loading k-mer counting table from', args.loadtable
+ print('loading countgraph from', args.loadtable, file=sys.stderr)
ct = khmer.load_counting_hash(args.loadtable)
else:
- print >>sys.stderr, 'making k-mer counting table'
- ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)
+ print('making countgraph', file=sys.stderr)
+ ct = khmer_args.create_countgraph(args)
+
+ K = ct.ksize()
+ CUTOFF = args.cutoff
+ NORMALIZE_LIMIT = args.normalize_to
tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
- print >>sys.stderr, 'created temporary directory %s; ' \
- 'use -T to change location' % tempdir
+ print('created temporary directory %s; '
+ 'use -T to change location' % tempdir, file=sys.stderr)
# ### FIRST PASS ###
@@ -173,8 +174,8 @@ def main():
force_single=args.ignore_pairs)
for n, is_pair, read1, read2 in paired_iter:
if n % 10000 == 0:
- print >>sys.stderr, '...', n, filename, save_pass2, \
- n_reads, n_bp, written_reads, written_bp
+ print('...', n, filename, save_pass2, n_reads, n_bp,
+ written_reads, written_bp, file=sys.stderr)
# we want to track paired reads here, to make sure that pairs
# are not split between first pass and second pass.
@@ -240,8 +241,9 @@ def main():
pass2fp.close()
- print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
- % (filename, save_pass2, n, filename)
+ print('%s: kept aside %d of %d from first pass, in %s' %
+ (filename, save_pass2, n, filename),
+ file=sys.stderr)
save_pass2_total += save_pass2
# ### SECOND PASS. ###
@@ -249,8 +251,9 @@ def main():
skipped_n = 0
skipped_bp = 0
for _, pass2filename, trimfp in pass2list:
- print >>sys.stderr, ('second pass: looking at sequences kept aside '
- 'in %s') % pass2filename
+ print('second pass: looking at sequences kept aside in %s' %
+ pass2filename,
+ file=sys.stderr)
# note that for this second pass, we don't care about paired
# reads - they will be output in the same order they're read in,
@@ -260,8 +263,8 @@ def main():
for n, read in enumerate(screed.open(pass2filename,
parse_description=False)):
if n % 10000 == 0:
- print >>sys.stderr, '... x 2', n, pass2filename, \
- written_reads, written_bp
+ print('... x 2', n, pass2filename,
+ written_reads, written_bp, file=sys.stderr)
seq = read.sequence.replace('N', 'A')
med, _, _ = ct.get_median_count(seq)
@@ -288,42 +291,45 @@ def main():
if trim_at != len(read.sequence):
trimmed_reads += 1
- print >>sys.stderr, 'removing %s' % pass2filename
+ print('removing %s' % pass2filename, file=sys.stderr)
os.unlink(pass2filename)
- print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
+ print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr)
shutil.rmtree(tempdir)
n_passes = 1.0 + (float(save_pass2_total) / n_reads)
percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
n_reads * 100.0
- print >>sys.stderr, 'read %d reads, %d bp' % (n_reads, n_bp,)
- print >>sys.stderr, 'wrote %d reads, %d bp' % (written_reads, written_bp,)
- print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
- (save_pass2_total, n_passes)
- print >>sys.stderr, 'removed %d reads and trimmed %d reads (%.2f%%)' % \
- (n_reads - written_reads, trimmed_reads, percent_reads_trimmed)
- print >>sys.stderr, 'trimmed or removed %.2f%% of bases (%d total)' % \
- ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)
+ print('read %d reads, %d bp' % (n_reads, n_bp,))
+ print('wrote %d reads, %d bp' % (written_reads, written_bp,))
+ print('looked at %d reads twice (%.2f passes)' % (save_pass2_total,
+ n_passes))
+ print('removed %d reads and trimmed %d reads (%.2f%%)' %
+ (n_reads - written_reads, trimmed_reads, percent_reads_trimmed))
+ print('trimmed or removed %.2f%% of bases (%d total)' %
+ ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp))
if args.variable_coverage:
percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
- print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
- (n_reads - skipped_n, percent_reads_hicov)
- print >>sys.stderr, ('skipped %d reads/%d bases because of low'
- 'coverage') % (skipped_n, skipped_bp)
+ print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
+ percent_reads_hicov),
+ file=sys.stderr)
+ print('skipped %d reads/%d bases because of low coverage' %
+ (skipped_n, skipped_bp),
+ file=sys.stderr)
fp_rate = \
khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
# for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
- print >>sys.stderr, \
- 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)
+ print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
+ file=sys.stderr)
- print >>sys.stderr, 'output in *.abundtrim'
+ print('output in *.abundtrim', file=sys.stderr)
if args.savetable:
- print >>sys.stderr, "Saving k-mer counting table to", args.savetable
+ print("Saving k-mer counting table to",
+ args.savetable, file=sys.stderr)
ct.save(args.savetable)
diff --git a/setup.cfg b/setup.cfg
index d5e5068..a04f6d1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,7 +1,7 @@
[nosetests]
verbosity = 2
-stop = TRUE
-attr = !known_failing,!jenkins,!linux
+stop = FALSE
+attr = !known_failing,!jenkins,!huge
#processes = -1 # breaks xunit output
[build_ext]
diff --git a/setup.py b/setup.py
index 5a21bdb..c9d785e 100755
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
-#! /usr/bin/env python2
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+#! /usr/bin/env python
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project at idyll.org
@@ -104,7 +104,7 @@ BUILD_DEPENDS.extend(path_join("lib", bn + ".hh") for bn in [
"khmer", "kmer_hash", "hashtable", "counting", "hashbits", "labelhash",
"hllcounter", "khmer_exception", "read_aligner", "subset", "read_parsers"])
-SOURCES = ["khmer/_khmermodule.cc"]
+SOURCES = ["khmer/_khmer.cc"]
SOURCES.extend(path_join("lib", bn + ".cc") for bn in [
"trace_logger", "perf_metrics", "read_parsers", "kmer_hash", "hashtable",
"hashbits", "labelhash", "counting", "subset", "read_aligner",
@@ -134,7 +134,7 @@ EXTENSION_MOD_DICT = \
"define_macros": [("VERSION", versioneer.get_version()), ],
}
-EXTENSION_MOD = Extension("khmer._khmermodule", # pylint: disable=W0142
+EXTENSION_MOD = Extension("khmer._khmer", # pylint: disable=W0142
** EXTENSION_MOD_DICT)
SCRIPTS = []
SCRIPTS.extend([path_join("scripts", script)
@@ -150,8 +150,9 @@ CLASSIFIERS = [
"Operating System :: POSIX :: Linux",
"Operating System :: MacOS :: MacOS X",
"Programming Language :: C++",
- "Programming Language :: Python :: 2 :: Only",
"Programming Language :: Python :: 2.7",
+ "Programming Language :: Python :: 3.3",
+ "Programming Language :: Python :: 3.4",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
if "-rc" in versioneer.get_version():
@@ -174,16 +175,21 @@ SETUP_METADATA = \
# "maintainer_email": 'mcrusoe at msu.edu', # so don't include it
# http://docs.python.org/2/distutils/setupscript.html
# additiona-meta-data note #3
- "url": 'http://ged.msu.edu/',
- "packages": ['khmer', 'khmer.tests'],
+ "url": 'https://khmer.readthedocs.org/',
+ "packages": ['khmer', 'khmer.tests', 'oxli'],
"package_dir": {'khmer.tests': 'tests'},
- "install_requires": ['screed >= 0.8'],
+ "install_requires": ['screed >= 0.9'],
# testing screed download link
"extras_require": {':python_version=="2.6"': ['argparse>=1.2.1'],
'docs': ['sphinx', 'sphinxcontrib-autoprogram'],
'tests': ['nose >= 1.0']},
"scripts": SCRIPTS,
+ "entry_points": {
+ 'console_scripts': [
+ "oxli = oxli:main"
+ ]
+ },
"ext_modules": [EXTENSION_MOD, ],
# "platforms": '', # empty as is conveyed by the classifiers below
# "license": '', # empty as is conveyed by the classifier below
diff --git a/tests/__init__.py b/tests/__init__.py
index 520b18d..cee3cea 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,5 +1,5 @@
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project at idyll.org
diff --git a/tests/khmer_tst_utils.py b/tests/khmer_tst_utils.py
index d52303f..0cbe36c 100644
--- a/tests/khmer_tst_utils.py
+++ b/tests/khmer_tst_utils.py
@@ -1,5 +1,6 @@
+from __future__ import print_function
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -8,11 +9,16 @@ import tempfile
import os
import shutil
from pkg_resources import Requirement, resource_filename, ResolutionError
-from cStringIO import StringIO
import nose
import sys
import traceback
import subprocess
+from io import open
+
+try:
+ from StringIO import StringIO
+except ImportError:
+ from io import StringIO
def get_test_data(filename):
@@ -100,8 +106,8 @@ def runscript(scriptname, args, in_directory=None,
os.chdir(in_directory)
try:
- print 'running:', scriptname, 'in:', in_directory
- print 'arguments', sysargs
+ print('running:', scriptname, 'in:', in_directory)
+ print('arguments', sysargs)
status = _runscript(scriptname, sandbox=sandbox)
except nose.SkipTest:
raise
@@ -118,8 +124,8 @@ def runscript(scriptname, args, in_directory=None,
os.chdir(cwd)
if status != 0 and not fail_ok:
- print out
- print err
+ print(out)
+ print(err)
assert False, (status, out, err)
return status, out, err
@@ -148,10 +154,12 @@ def runscriptredirect(scriptname, args, stdinfilename, in_directory=None,
os.chdir(in_directory)
sysargs = 'cat ' + stdinfilename + ' | python ' + scriptfile + \
" " + args
- out = open(os.path.join(in_directory, "out"), 'w+b')
- err = open(os.path.join(in_directory, "err"), 'w+b')
- print 'running:', scriptname, 'in:', in_directory
- print 'arguments', sysargs
+ out = open(
+ os.path.join(in_directory, "out"), 'w+', encoding='utf-8')
+ err = open(
+ os.path.join(in_directory, "err"), 'w+', encoding='utf-8')
+ print('running:', scriptname, 'in:', in_directory)
+ print('arguments', sysargs)
status = subprocess.call(args=sysargs, stdout=out, stderr=err,
shell=True)
os.chdir(cwd)
@@ -160,8 +168,8 @@ def runscriptredirect(scriptname, args, stdinfilename, in_directory=None,
out = out.read()
err.seek(0)
err = err.read()
- print out
- print err
+ print(out)
+ print(err)
assert False, (status, out, err)
return status, out, err
diff --git a/tests/test-data/badversion-k32.tagset b/tests/test-data/badversion-k32.tagset
index 8a4e4b0..a568d68 100644
Binary files a/tests/test-data/badversion-k32.tagset and b/tests/test-data/badversion-k32.tagset differ
diff --git a/tests/test-data/dn-test-all-paired-all-keep.fa b/tests/test-data/dn-test-all-paired-all-keep.fa
new file mode 100644
index 0000000..4584f4a
--- /dev/null
+++ b/tests/test-data/dn-test-all-paired-all-keep.fa
@@ -0,0 +1,16 @@
+>a/1
+ACTTCATCGTAGCTTACCACCGAGCGATCATTCCTTCTATCGGAAGGCGTCAGAACCGAATTTGGAGTATGGTTCTGAGACGGGAACATGCCTTGACCAG
+>a/2
+GGAGCTGGCAGCCACAGCACCGCAAGTTGATTTCGGGGCCGAATCACCAATATTAGAGGCAACGATGCATAACTCGTCTGTCGTAATGTTTAGCCGTTTTC
+>b/1
+ACTTCATCGTAGCTTACCACCGAGCGATCATTCCTTCTATCGGAAGGCGTCAGAACCGAATTTGGAGTATGGTTCTGAGACGGGAACATGCCTTGACCAG
+>b/2
+GTTAAAGCCGTGCGCGAGAGCGAACTGCAACATACCTTCCCTTATCCAGGAAAGGAGGTAGTTATACGGTCACTTTGTGGACTCTACCCGTAGATTCCGGT
+>c/1
+GAATGCACCTTCCGTAGTGGAGTGTTCGTCCCCACCTACGAGACTTCAATTCAGATACCGGCGTGAGTCGCAGTTATGATAAGAGACAGTGCTCTAGTACG
+>c/2
+ACTTCATCGTAGCTTACCACCGAGCGATCATTCCTTCTATCGGAAGGCGTCAGAACCGAATTTGGAGTATGGTTCTGAGACGGGAACATGCCTTGACCAG
+>d/1
+ACTTCATCGTAGCTTACCACCGAGCGATCATTCCTTCTATCGGAAGGCGTCAGAACCGAATTTGGAGTATGGTTCTGAGACGGGAACATGCCTTGACCAG
+>d/2
+GGCGTACATGCCTCACAATTTAGGCCCCGGGACCGCAGCACACGCCTAACAATATCTCAGATTAGGGAGTAATCCTGATATGGTTTGGTAGTGGCTATCTG
\ No newline at end of file
diff --git a/tests/test-data/dn-test-none-paired.fa b/tests/test-data/dn-test-none-paired.fa
new file mode 100644
index 0000000..9553d5d
--- /dev/null
+++ b/tests/test-data/dn-test-none-paired.fa
@@ -0,0 +1,8 @@
+>a/1
+ACTTCATCGTAGCTTACCACCGAGCGATCATTCCTTCTATCGGAAGGCGTCAGAACCGAATTTGGAGTATGGTTCTGAGACGGGAACATGCCTTGACCAG
+>b/2
+GTTAAAGCCGTGCGCGAGAGCGAACTGCAACATACCTTCCCTTATCCAGGAAAGGAGGTAGTTATACGGTCACTTTGTGGACTCTACCCGTAGATTCCGGT
+>c/2
+ACTTCATCGTAGCTTACCACCGAGCGATCATTCCTTCTATCGGAAGGCGTCAGAACCGAATTTGGAGTATGGTTCTGAGACGGGAACATGCCTTGACCAG
+>d/1
+GGCGTACATGCCTCACAATTTAGGCCCCGGGACCGCAGCACACGCCTAACAATATCTCAGATTAGGGAGTAATCCTGATATGGTTTGGTAGTGGCTATCTG
diff --git a/tests/test-data/dn-test-some-paired-all-keep.fa b/tests/test-data/dn-test-some-paired-all-keep.fa
new file mode 100644
index 0000000..dc65521
--- /dev/null
+++ b/tests/test-data/dn-test-some-paired-all-keep.fa
@@ -0,0 +1,12 @@
+>a/1
+ACTTCATCGTAGCTTACCACCGAGCGATCATTCCTTCTATCGGAAGGCGTCAGAACCGAATTTGGAGTATGGTTCTGAGACGGGAACATGCCTTGACCAG
+>a/2
+GGAGCTGGCAGCCACAGCACCGCAAGTTGATTTCGGGGCCGAATCACCAATATTAGAGGCAACGATGCATAACTCGTCTGTCGTAATGTTTAGCCGTTTTC
+>b/2
+GTTAAAGCCGTGCGCGAGAGCGAACTGCAACATACCTTCCCTTATCCAGGAAAGGAGGTAGTTATACGGTCACTTTGTGGACTCTACCCGTAGATTCCGGT
+>c/1
+GAATGCACCTTCCGTAGTGGAGTGTTCGTCCCCACCTACGAGACTTCAATTCAGATACCGGCGTGAGTCGCAGTTATGATAAGAGACAGTGCTCTAGTACG
+>c/2
+ACTTCATCGTAGCTTACCACCGAGCGATCATTCCTTCTATCGGAAGGCGTCAGAACCGAATTTGGAGTATGGTTCTGAGACGGGAACATGCCTTGACCAG
+>d/2
+GGCGTACATGCCTCACAATTTAGGCCCCGGGACCGCAGCACACGCCTAACAATATCTCAGATTAGGGAGTAATCCTGATATGGTTTGGTAGTGGCTATCTG
diff --git a/tests/test-data/goodversion-k32.tagset b/tests/test-data/goodversion-k32.tagset
index 9d78e7a..2fe701e 100644
Binary files a/tests/test-data/goodversion-k32.tagset and b/tests/test-data/goodversion-k32.tagset differ
diff --git a/tests/test-data/normC20k20.ct b/tests/test-data/normC20k20.ct
index 9694a07..1400ef9 100644
Binary files a/tests/test-data/normC20k20.ct and b/tests/test-data/normC20k20.ct differ
diff --git a/tests/test-data/paired-slash1.fq b/tests/test-data/paired-slash1.fq
new file mode 100644
index 0000000..995c7c3
--- /dev/null
+++ b/tests/test-data/paired-slash1.fq
@@ -0,0 +1,32 @@
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/1
+NCTACCAAAAAAATGCCCGATAATTCTGACCATTCCTTCCTCATTCTCGTCTGGCGTTTGGTCACGACGCACGATACCTTCTGCACTTGTCAAGACAGCGG
++
+#00@#################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/2
+CTTGACAAGAGCAGAAGTTATCTTGCCTCGGGACCAAACGCCAGACGAGCACGAGGGAGCGATCGTCCGCATTAGCCGGCATTCTTTTGCTAGCAGATCGG
++
+=?###################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/1
+NCTACCAAAAAAATGCCCGATAATTCTGACCATTCCTTCCTCATTCTCGTCTGGCGTTTGGTCACGACGCACGATACCTTCTGCACTTGTCAAGACAGCGG
++
+#00@#################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/2
+CTTGACAAGAGCAGAAGTTATCTTGCCTCGGGACCAAACGCCAGACGAGCACGAGGGAGCGATCGTCCGCATTAGCCGGCATTCTTTTGCTAGCAGATCGG
++
+=?###################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/1
+NCTACCAAAAAAATGCCCGATAATTCTGACCATTCCTTCCTCATTCTCGTCTGGCGTTTGGTCACGACGCACGATACCTTCTGCACTTGTCAAGACAGCGG
++
+#00@#################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/2
+CTTGACAAGAGCAGAAGTTATCTTGCCTCGGGACCAAACGCCAGACGAGCACGAGGGAGCGATCGTCCGCATTAGCCGGCATTCTTTTGCTAGCAGATCGG
++
+=?###################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/1
+NCTACCAAAAAAATGCCCGATAATTCTGACCATTCCTTCCTCATTCTCGTCTGGCGTTTGGTCACGACGCACGATACCTTCTGCACTTGTCAAGACAGCGG
++
+#00@#################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/2
+CTTGACAAGAGCAGAAGTTATCTTGCCTCGGGACCAAACGCCAGACGAGCACGAGGGAGCGATCGTCCGCATTAGCCGGCATTCTTTTGCTAGCAGATCGG
++
+=?###################################################################################################
diff --git a/tests/test-data/paired-slash1.fq.1 b/tests/test-data/paired-slash1.fq.1
new file mode 100644
index 0000000..4c95773
--- /dev/null
+++ b/tests/test-data/paired-slash1.fq.1
@@ -0,0 +1,16 @@
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/1
+NCTACCAAAAAAATGCCCGATAATTCTGACCATTCCTTCCTCATTCTCGTCTGGCGTTTGGTCACGACGCACGATACCTTCTGCACTTGTCAAGACAGCGG
++
+#00@#################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/1
+NCTACCAAAAAAATGCCCGATAATTCTGACCATTCCTTCCTCATTCTCGTCTGGCGTTTGGTCACGACGCACGATACCTTCTGCACTTGTCAAGACAGCGG
++
+#00@#################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/1
+NCTACCAAAAAAATGCCCGATAATTCTGACCATTCCTTCCTCATTCTCGTCTGGCGTTTGGTCACGACGCACGATACCTTCTGCACTTGTCAAGACAGCGG
++
+#00@#################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/1
+NCTACCAAAAAAATGCCCGATAATTCTGACCATTCCTTCCTCATTCTCGTCTGGCGTTTGGTCACGACGCACGATACCTTCTGCACTTGTCAAGACAGCGG
++
+#00@#################################################################################################
diff --git a/tests/test-data/paired-slash1.fq.2 b/tests/test-data/paired-slash1.fq.2
new file mode 100644
index 0000000..46a8ba0
--- /dev/null
+++ b/tests/test-data/paired-slash1.fq.2
@@ -0,0 +1,16 @@
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/2
+CTTGACAAGAGCAGAAGTTATCTTGCCTCGGGACCAAACGCCAGACGAGCACGAGGGAGCGATCGTCCGCATTAGCCGGCATTCTTTTGCTAGCAGATCGG
++
+=?###################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/2
+CTTGACAAGAGCAGAAGTTATCTTGCCTCGGGACCAAACGCCAGACGAGCACGAGGGAGCGATCGTCCGCATTAGCCGGCATTCTTTTGCTAGCAGATCGG
++
+=?###################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/2
+CTTGACAAGAGCAGAAGTTATCTTGCCTCGGGACCAAACGCCAGACGAGCACGAGGGAGCGATCGTCCGCATTAGCCGGCATTCTTTTGCTAGCAGATCGG
++
+=?###################################################################################################
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/2
+CTTGACAAGAGCAGAAGTTATCTTGCCTCGGGACCAAACGCCAGACGAGCACGAGGGAGCGATCGTCCGCATTAGCCGGCATTCTTTTGCTAGCAGATCGG
++
+=?###################################################################################################
diff --git a/tests/test-data/paired_one.base.dif.fa b/tests/test-data/paired_one.base.dif.fa
new file mode 100644
index 0000000..1ea9040
--- /dev/null
+++ b/tests/test-data/paired_one.base.dif.fa
@@ -0,0 +1,5 @@
+>895:1:37:17593:9954/1
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACA
+>895:1:37:17593:9954/2
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
+
diff --git a/tests/test-data/test-filter-abund-Ns.fq b/tests/test-data/test-filter-abund-Ns.fq
new file mode 100644
index 0000000..f80289f
--- /dev/null
+++ b/tests/test-data/test-filter-abund-Ns.fq
@@ -0,0 +1,16 @@
+@895:1:37:17593:9954 1::FOO_withN
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGNGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
+@895:1:37:17593:9954 1::FOO
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
+@895:1:37:17593:9954 1::FOO
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
+@895:1:37:17593:9954 1::FOO
+GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
++
+##################################################################################################################
diff --git a/tests/test_counting_hash.py b/tests/test_counting_hash.py
index 849eb21..5e08aba 100644
--- a/tests/test_counting_hash.py
+++ b/tests/test_counting_hash.py
@@ -1,14 +1,19 @@
+from __future__ import print_function
+from __future__ import absolute_import, unicode_literals
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
# pylint: disable=missing-docstring,protected-access
import gzip
+import os
+import shutil
+
import khmer
-import khmer_tst_utils as utils
+from . import khmer_tst_utils as utils
from khmer import ReadParser
import screed
@@ -38,11 +43,26 @@ def teardown():
class Test_CountingHash(object):
def setup(self):
- self.hi = khmer.CountingHash(12, PRIMES_1m)
+ self.hi = khmer._CountingHash(12, PRIMES_1m)
+
+ def test_failed_get(self):
+ GG = 'G' * 12 # forward_hash: 11184810
+ GGhash = khmer.forward_hash(GG, 12)
+ assert khmer.forward_hash(GG, 12) == 11184810
+
+ hi = self.hi
+ hi.consume(GG)
+
+ try:
+ hi.get(float(GGhash))
+ assert "the previous statement should fail"
+ except ValueError as err:
+ print(str(err))
def test_collision_1(self):
GG = 'G' * 12 # forward_hash: 11184810
+ GGhash = khmer.forward_hash(GG, 12)
assert khmer.forward_hash(GG, 12) == 11184810
collision_1 = 'AAACGTATGACT'
@@ -59,6 +79,7 @@ class Test_CountingHash(object):
hi.consume(collision_1)
assert hi.get(GG) == 1
+ assert hi.get(GGhash) == 1
def test_collision_2(self):
@@ -103,47 +124,38 @@ class Test_CountingHash(object):
def test_get_raw_tables():
- ht = khmer.new_counting_hash(20, 1e5, 4)
+ ht = khmer.CountingHash(20, 1e5, 4)
tables = ht.get_raw_tables()
for size, table in zip(ht.hashsizes(), tables):
- assert isinstance(table, buffer)
+ assert isinstance(table, memoryview)
assert size == len(table)
def test_get_raw_tables_view():
- try:
- memoryview
- except NameError:
- raise nose.SkipTest("This test requires memoryview")
- ht = khmer.new_counting_hash(20, 1e5, 4)
+ ht = khmer.CountingHash(20, 1e5, 4)
tables = ht.get_raw_tables()
for tab in tables:
- try:
- memv = memoryview(tab)
- except TypeError:
- raise nose.SkipTest("This test needs a higher version of Python.")
- assert sum(memv.tolist()) == 0
+ assert sum(tab.tolist()) == 0
ht.consume('AAAATTTTCCCCGGGGAAAA')
for tab in tables:
- memv = memoryview(tab)
- assert sum(memv.tolist()) == 1
+ assert sum(tab.tolist()) == 1
-@attr('linux')
+@attr('huge')
def test_toobig():
try:
- ct = khmer.new_counting_hash(30, 1e13, 1)
+ ct = khmer.CountingHash(30, 1e13, 1)
assert 0, "this should fail"
except MemoryError as err:
- print str(err)
+ print(str(err))
def test_3_tables():
x = list(PRIMES_1m)
x.append(1000005)
- hi = khmer.CountingHash(12, x)
+ hi = khmer._CountingHash(12, x)
GG = 'G' * 12 # forward_hash: 11184810
assert khmer.forward_hash(GG, 12) == 11184810
@@ -174,64 +186,206 @@ def test_3_tables():
def test_simple_median():
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("AAAAAA")
(median, average, stddev) = hi.get_median_count("AAAAAA")
- print median, average, stddev
+ print(median, average, stddev)
assert median == 1
assert average == 1.0
assert stddev == 0.0
hi.consume("AAAAAA")
(median, average, stddev) = hi.get_median_count("AAAAAA")
- print median, average, stddev
+ print(median, average, stddev)
assert median == 2
assert average == 2.0
assert stddev == 0.0
hi.consume("AAAAAT")
(median, average, stddev) = hi.get_median_count("AAAAAAT")
- print median, average, stddev
+ print(median, average, stddev)
assert median == 2
assert average == 1.5
assert int(stddev * 100) == 50 # .5
hi.consume("AAAAAT")
(median, average, stddev) = hi.get_median_count("AAAAAAT")
- print median, average, stddev
+ print(median, average, stddev)
assert median == 2
assert average == 2.0
assert stddev == 0.0
hi.consume("AAAAAT")
(median, average, stddev) = hi.get_median_count("AAAAAAT")
- print median, average, stddev
+ print(median, average, stddev)
assert median == 3
assert average == 2.5
assert int(stddev * 100) == 50 # .5
+def test_median_too_short():
+ hi = khmer.CountingHash(6, 1e6, 2)
+
+ hi.consume("AAAAAA")
+ try:
+ hi.get_median_count("A")
+ assert 0, "this should fail"
+ except ValueError:
+ pass
+
+
+def test_median_at_least():
+ hi = khmer.CountingHash(6, 1e6, 2)
+
+ hi.consume("AAAAAA")
+ assert hi.median_at_least("AAAAAA", 1)
+ assert hi.median_at_least("AAAAAA", 2) is False
+
+ hi.consume("AAAAAA")
+ assert hi.median_at_least("AAAAAA", 2)
+ assert hi.median_at_least("AAAAAA", 3) is False
+
+ hi.consume("AAAAAA")
+ assert hi.median_at_least("AAAAAA", 3)
+ assert hi.median_at_least("AAAAAA", 4) is False
+
+ hi.consume("AAAAAA")
+ assert hi.median_at_least("AAAAAA", 4)
+ assert hi.median_at_least("AAAAAA", 5) is False
+
+ hi.consume("AAAAAA")
+ assert hi.median_at_least("AAAAAA", 5)
+ assert hi.median_at_least("AAAAAA", 6) is False
+
+
+def test_median_at_least_single_gt():
+ K = 20
+ hi = khmer.CountingHash(K, 1e6, 2)
+
+ kmers = ['ATCGATCGATCGATCGATCG',
+ 'GTACGTACGTACGTACGTAC',
+ 'TTAGTTAGTTAGTTAGTTAG']
+
+ for kmer in kmers:
+ hi.consume(kmer)
+ assert hi.median_at_least(kmer, 1) is True
+
+
+def test_median_at_least_single_lt():
+ K = 20
+ hi = khmer.CountingHash(K, 1e6, 2)
+
+ kmers = ['ATCGATCGATCGATCGATCG',
+ 'GTACGTACGTACGTACGTAC',
+ 'TTAGTTAGTTAGTTAGTTAG']
+
+ for kmer in kmers:
+ hi.consume(kmer)
+ assert hi.median_at_least(kmer, 2) is False
+
+
+def test_median_at_least_odd_gt():
+ # test w/odd number of k-mers
+ K = 20
+ hi = khmer.CountingHash(K, 1e6, 2)
+
+ seqs = ['ATCGATCGATCGATCGATCGCC',
+ 'GTACGTACGTACGTACGTACCC',
+ 'TTAGTTAGTTAGTTAGTTAGCC']
+
+ for seq in seqs:
+ hi.consume(seq)
+ assert hi.median_at_least(seq, 1) is True
+
+
+def test_median_at_least_odd_lt():
+ K = 20
+ hi = khmer.CountingHash(K, 1e6, 2)
+
+ seqs = ['ATCGATCGATCGATCGATCGCC',
+ 'GTACGTACGTACGTACGTACCC',
+ 'TTAGTTAGTTAGTTAGTTAGCC']
+
+ for seq in seqs:
+ hi.consume(seq)
+ assert hi.median_at_least(seq, 2) is False
+
+
+# Test median with even number of k-mers
+def test_median_at_least_even_gt():
+ K = 20
+ hi = khmer.CountingHash(K, 1e6, 2)
+
+ seqs = ['ATCGATCGATCGATCGATCGCCC',
+ 'GTACGTACGTACGTACGTACCCC',
+ 'TTAGTTAGTTAGTTAGTTAGCCC']
+
+ for seq in seqs:
+ hi.consume(seq)
+ assert hi.median_at_least(seq, 1) is True
+
+
+def test_median_at_least_even_lt():
+ K = 20
+ hi = khmer.CountingHash(K, 1e6, 2)
+
+ seqs = ['ATCGATCGATCGATCGATCGCCC',
+ 'GTACGTACGTACGTACGTACCCC',
+ 'TTAGTTAGTTAGTTAGTTAGCCC']
+
+ for seq in seqs:
+ hi.consume(seq)
+ assert hi.median_at_least(seq, 2) is False
+
+
+def test_median_at_least_comp():
+ K = 20
+ C = 4
+ hi = khmer.CountingHash(K, 1e6, 2)
+
+ seqs = ['ATCGATCGATCGATCGATCGCCC',
+ 'GTACGTACGTACGTACGTACCCC',
+ 'TTAGTTAGTTAGTTAGTTAGCCC']
+
+ for seq in seqs:
+ hi.consume(seq)
+ hi.consume(seq)
+ hi.consume(seq)
+
+ med, _, _ = hi.get_median_count(seq)
+ assert hi.median_at_least(seq, C) is (med >= C)
+
+
+def test_median_at_least_exception():
+ ht = khmer.CountingHash(20, 1e6, 2)
+ try:
+ ht.median_at_least('ATGGCTGATCGAT', 1)
+ assert 0, "should have thrown ValueError"
+ except ValueError as e:
+ pass
+
+
def test_simple_kadian():
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") == 1
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
hi.consume("ACTGCTATCTCTAGAcCTATG")
# ---------------^
x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
assert x == 2, x
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
hi.consume("ACTGCTATCTCTAGAcCTATG")
# ---------------^---^
x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
assert x == 2
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
hi.consume("ACTGCTATCTCTAGtGCTAcG")
# --------------^^---^
@@ -240,11 +394,11 @@ def test_simple_kadian():
def test_simple_kadian_2():
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG") == 1
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
# hi.consume("ACaGCTATCTCTAGAGCTATG")
hi.consume("ACAGCTATCTCTAGAGCTATG")
@@ -252,7 +406,7 @@ def test_simple_kadian_2():
x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
assert x == 2, x
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
# hi.consume("ACaGCTATCTCTAGAcCTATG")
hi.consume("ACAGCTATCTCTAGACCTATG")
@@ -260,7 +414,7 @@ def test_simple_kadian_2():
x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG")
assert x == 1, x
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
# hi.consume("ACTGCTATCgCTAGAGCTATG")
hi.consume("ACTGCTATCGCTAGAGCTATG")
@@ -270,11 +424,11 @@ def test_simple_kadian_2():
def test_2_kadian():
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 1
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
# hi.consume("ACTGCTATCTCTAGAcCTATG")
hi.consume("ACTGCTATCTCTAGACCTATG")
@@ -282,14 +436,14 @@ def test_2_kadian():
x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
assert x == 2, x
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
# hi.consume("ACTGCTATCTCTAGAcCTAtG")
hi.consume("ACTGCTATCTCTAGACCTATG")
# ---------------^---^
assert hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2) == 2
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
# hi.consume("ACTGCTATCTCTACtcCTAtG")
hi.consume("ACTGCTATCTCTACTCCTATG")
@@ -297,7 +451,7 @@ def test_2_kadian():
x = hi.get_kadian_count("ACTGCTATCTCTAGAGCTATG", 2)
assert x == 2, x
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
# hi.consume("ACTGCTgTCTCTACtcCTAtG")
hi.consume("ACTGCTGTCTCTACTCCTATG")
@@ -306,6 +460,137 @@ def test_2_kadian():
assert x == 1, x
+def test_get_kmer_counts_too_short():
+ hi = khmer.CountingHash(6, 1e6, 2)
+
+ hi.consume("AAAAAA")
+ counts = hi.get_kmer_counts("A")
+ assert len(counts) == 0
+
+
+def test_get_kmer_hashes_too_short():
+ hi = khmer.CountingHash(6, 1e6, 2)
+
+ hi.consume("AAAAAA")
+ hashes = hi.get_kmer_hashes("A")
+ assert len(hashes) == 0
+
+
+def test_get_kmers_too_short():
+ hi = khmer.CountingHash(6, 1e6, 2)
+
+ hi.consume("AAAAAA")
+ kmers = hi.get_kmers("A")
+ assert len(kmers) == 0
+
+
+def test_get_kmer_counts():
+ hi = khmer.CountingHash(6, 1e6, 2)
+
+ hi.consume("AAAAAA")
+ counts = hi.get_kmer_counts("AAAAAA")
+ print(counts)
+ assert len(counts) == 1
+ assert counts[0] == 1
+
+ hi.consume("AAAAAA")
+ counts = hi.get_kmer_counts("AAAAAA")
+ print(counts)
+ assert len(counts) == 1
+ assert counts[0] == 2
+
+ hi.consume("AAAAAT")
+ counts = hi.get_kmer_counts("AAAAAAT")
+ print(counts)
+ assert len(counts) == 2
+ assert counts[0] == 2
+ assert counts[1] == 1
+
+ hi.consume("AAAAAT")
+ counts = hi.get_kmer_counts("AAAAAAT")
+ print(counts)
+ assert len(counts) == 2
+ assert counts[0] == 2
+ assert counts[1] == 2
+
+ hi.consume("AAAAAT")
+ counts = hi.get_kmer_counts("AAAAAAT")
+ print(counts)
+ assert len(counts) == 2
+ assert counts[0] == 2
+ assert counts[1] == 3
+
+
+def test_get_kmer_hashes():
+ hi = khmer.CountingHash(6, 1e6, 2)
+
+ hi.consume("AAAAAA")
+ hashes = hi.get_kmer_hashes("AAAAAA")
+ print(hashes)
+ assert len(hashes) == 1
+ assert hi.get(hashes[0]) == 1
+
+ hi.consume("AAAAAA")
+ hashes = hi.get_kmer_hashes("AAAAAA")
+ print(hashes)
+ assert len(hashes) == 1
+ assert hi.get(hashes[0]) == 2
+
+ hi.consume("AAAAAT")
+ hashes = hi.get_kmer_hashes("AAAAAAT")
+ print(hashes)
+ assert len(hashes) == 2
+ assert hi.get(hashes[0]) == 2
+ assert hi.get(hashes[1]) == 1
+
+ hi.consume("AAAAAT")
+ hashes = hi.get_kmer_hashes("AAAAAAT")
+ print(hashes)
+ assert len(hashes) == 2
+ assert hi.get(hashes[0]) == 2
+ assert hi.get(hashes[1]) == 2
+
+ hi.consume("AAAAAT")
+ hashes = hi.get_kmer_hashes("AAAAAAT")
+ print(hashes)
+ assert len(hashes) == 2
+ assert hi.get(hashes[0]) == 2
+ assert hi.get(hashes[1]) == 3
+
+
+def test_get_kmers():
+ hi = khmer.CountingHash(6, 1e6, 2)
+
+ kmers = hi.get_kmers("AAAAAA")
+ assert kmers == ["AAAAAA"]
+
+ kmers = hi.get_kmers("AAAAAAT")
+ assert kmers == ["AAAAAA", "AAAAAT"]
+
+
+@attr("huge")
+def test_save_load_large():
+ def do_test(ctfile):
+ inpath = utils.get_test_data('random-20-a.fa')
+ savepath = utils.get_temp_filename(ctfile)
+
+ sizes = khmer.get_n_primes_near_x(1, 2**31 + 1000)
+
+ orig = khmer._CountingHash(12, sizes)
+ orig.consume_fasta(inpath)
+ orig.save(savepath)
+
+ loaded = khmer.load_counting_hash(savepath)
+
+ orig_count = orig.n_occupied()
+ loaded_count = loaded.n_occupied()
+ assert orig_count == 3966, orig_count
+ assert loaded_count == orig_count, loaded_count
+
+ for ctfile in ['temp.ct.gz', 'temp.ct']:
+ do_test(ctfile)
+
+
def test_save_load():
inpath = utils.get_test_data('random-20-a.fa')
savepath = utils.get_temp_filename('tempcountingsave0.ht')
@@ -313,12 +598,15 @@ def test_save_load():
sizes = list(PRIMES_1m)
sizes.append(1000005)
- hi = khmer.CountingHash(12, sizes)
+ hi = khmer._CountingHash(12, sizes)
hi.consume_fasta(inpath)
hi.save(savepath)
- ht = khmer.CountingHash(12, sizes)
- ht.load(savepath)
+ ht = khmer._CountingHash(12, sizes)
+ try:
+ ht.load(savepath)
+ except IOError as err:
+ assert 0, 'Should not produce an IOError: ' + str(err)
tracking = khmer._Hashbits(12, sizes)
x = hi.abundance_distribution(inpath, tracking)
@@ -330,6 +618,30 @@ def test_save_load():
assert x == y, (x, y)
+def test_load_truncated():
+ inpath = utils.get_test_data('random-20-a.fa')
+ savepath = utils.get_temp_filename('save.ht')
+ truncpath = utils.get_temp_filename('trunc.ht')
+
+ sizes = khmer.get_n_primes_near_x(3, 200)
+
+ hi = khmer._CountingHash(12, sizes)
+ hi.consume_fasta(inpath)
+ hi.save(savepath)
+
+ data = open(savepath, 'rb').read()
+ for i in range(len(data)):
+ fp = open(truncpath, 'wb')
+ fp.write(data[:i])
+ fp.close()
+
+ try:
+ ht = khmer.load_counting_hash(truncpath)
+ assert 0, "this should not be reached!"
+ except IOError as err:
+ print(str(err))
+
+
def test_load_gz():
inpath = utils.get_test_data('random-20-a.fa')
@@ -340,7 +652,7 @@ def test_load_gz():
sizes.append(1000005)
# save uncompressed hashtable.
- hi = khmer.CountingHash(12, sizes)
+ hi = khmer._CountingHash(12, sizes)
hi.consume_fasta(inpath)
hi.save(savepath)
@@ -352,8 +664,11 @@ def test_load_gz():
in_file.close()
# load compressed hashtable.
- ht = khmer.CountingHash(12, sizes)
- ht.load(loadpath)
+ ht = khmer._CountingHash(12, sizes)
+ try:
+ ht.load(loadpath)
+ except IOError as err:
+ assert 0, "Should not produce an IOError: " + str(err)
tracking = khmer._Hashbits(12, sizes)
x = hi.abundance_distribution(inpath, tracking)
@@ -372,12 +687,15 @@ def test_save_load_gz():
sizes = list(PRIMES_1m)
sizes.append(1000005)
- hi = khmer.CountingHash(12, sizes)
+ hi = khmer._CountingHash(12, sizes)
hi.consume_fasta(inpath)
hi.save(savepath)
- ht = khmer.CountingHash(12, sizes)
- ht.load(savepath)
+ ht = khmer._CountingHash(12, sizes)
+ try:
+ ht.load(savepath)
+ except IOError as err:
+ assert 0, 'Should not produce an IOError: ' + str(err)
tracking = khmer._Hashbits(12, sizes)
x = hi.abundance_distribution(inpath, tracking)
@@ -390,7 +708,7 @@ def test_save_load_gz():
def test_trim_full():
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA)
@@ -400,7 +718,7 @@ def test_trim_full():
def test_trim_short():
- hi = khmer.new_counting_hash(6, 1e6, 2)
+ hi = khmer.CountingHash(6, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA[:50])
@@ -412,20 +730,20 @@ def test_trim_short():
def test_find_spectral_error_positions_1():
- hi = khmer.new_counting_hash(8, 1e6, 2)
+ hi = khmer.CountingHash(8, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA[:30])
for n in range(len(DNA) - 8 + 1):
- print n, hi.get(DNA[n:n + 8])
+ print(n, hi.get(DNA[n:n + 8]))
posns = hi.find_spectral_error_positions(DNA, 1)
assert posns == [30], posns
def test_find_spectral_error_positions_2():
- hi = khmer.new_counting_hash(8, 1e6, 2)
+ hi = khmer.CountingHash(8, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA)
@@ -435,20 +753,20 @@ def test_find_spectral_error_positions_2():
def test_find_spectral_error_positions_6():
- hi = khmer.new_counting_hash(8, 1e6, 2)
+ hi = khmer.CountingHash(8, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA[1:])
for n in range(len(DNA) - 8 + 1):
- print n, hi.get(DNA[n:n + 8])
+ print(n, hi.get(DNA[n:n + 8]))
posns = hi.find_spectral_error_positions(DNA, 1)
assert posns == [0], posns
def test_find_spectral_error_positions_4():
- hi = khmer.new_counting_hash(8, 1e6, 2)
+ hi = khmer.CountingHash(8, 1e6, 2)
hi.consume(DNA)
@@ -457,7 +775,7 @@ def test_find_spectral_error_positions_4():
def test_find_spectral_error_positions_5():
- hi = khmer.new_counting_hash(8, 1e6, 2)
+ hi = khmer.CountingHash(8, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA[:10])
@@ -467,22 +785,22 @@ def test_find_spectral_error_positions_5():
assert posns == [10], posns
-def test_find_spectral_error_positions_6():
+def test_find_spectral_error_locs7():
K = 8
- hi = khmer.new_counting_hash(K, 1e6, 2)
+ hi = khmer.CountingHash(K, 1e6, 2)
hi.consume(DNA)
hi.consume(DNA[K:])
for n in range(len(DNA) - 8 + 1):
- print n, hi.get(DNA[n:n + 8])
+ print(n, hi.get(DNA[n:n + 8]))
posns = hi.find_spectral_error_positions(DNA, 1)
assert posns == [7], posns
def test_find_spectral_error_positions_err():
- hi = khmer.new_counting_hash(8, 1e6, 2)
+ hi = khmer.CountingHash(8, 1e6, 2)
try:
posns = hi.find_spectral_error_positions(DNA[:6], 1)
@@ -499,7 +817,7 @@ def test_find_spectral_error_positions_err():
def test_maxcount():
# hashtable should saturate at some point so as not to overflow counter
- kh = khmer.new_counting_hash(4, 4 ** 4, 4)
+ kh = khmer.CountingHash(4, 4 ** 4, 4)
kh.set_use_bigcount(False)
last_count = None
@@ -517,7 +835,7 @@ def test_maxcount():
def test_maxcount_with_bigcount():
# hashtable should not saturate, if use_bigcount is set.
- kh = khmer.new_counting_hash(4, 4 ** 4, 4)
+ kh = khmer.CountingHash(4, 4 ** 4, 4)
kh.set_use_bigcount(True)
last_count = None
@@ -535,7 +853,7 @@ def test_maxcount_with_bigcount():
def test_maxcount_with_bigcount_save():
# hashtable should not saturate, if use_bigcount is set.
- kh = khmer.new_counting_hash(4, 4 ** 4, 4)
+ kh = khmer.CountingHash(4, 4 ** 4, 4)
kh.set_use_bigcount(True)
for i in range(0, 1000):
@@ -545,8 +863,11 @@ def test_maxcount_with_bigcount_save():
savepath = utils.get_temp_filename('tempcountingsave.ht')
kh.save(savepath)
- kh = khmer.new_counting_hash(1, 1, 1)
- kh.load(savepath)
+ kh = khmer.CountingHash(1, 1, 1)
+ try:
+ kh.load(savepath)
+ except IOError as err:
+ assert 0, "Should not produce an IOError: " + str(err)
c = kh.get('AAAA')
assert c == 1000, "should be able to count to 1000: %d" % c
@@ -555,14 +876,17 @@ def test_maxcount_with_bigcount_save():
def test_bigcount_save():
# hashtable should not saturate, if use_bigcount is set.
- kh = khmer.new_counting_hash(4, 4 ** 4, 4)
+ kh = khmer.CountingHash(4, 4 ** 4, 4)
kh.set_use_bigcount(True)
savepath = utils.get_temp_filename('tempcountingsave.ht')
kh.save(savepath)
- kh = khmer.new_counting_hash(1, 1, 1)
- kh.load(savepath)
+ kh = khmer.CountingHash(1, 1, 1)
+ try:
+ kh.load(savepath)
+ except IOError as err:
+ assert 0, "Should not produce an IOError: " + str(err)
# set_use_bigcount should still be True after load (i.e. should be saved)
@@ -576,14 +900,17 @@ def test_bigcount_save():
def test_nobigcount_save():
- kh = khmer.new_counting_hash(4, 4 ** 4, 4)
+ kh = khmer.CountingHash(4, 4 ** 4, 4)
# kh.set_use_bigcount(False) <-- this is the default
savepath = utils.get_temp_filename('tempcountingsave.ht')
kh.save(savepath)
- kh = khmer.new_counting_hash(1, 1, 1)
- kh.load(savepath)
+ kh = khmer.CountingHash(1, 1, 1)
+ try:
+ kh.load(savepath)
+ except IOError as err:
+ assert 0, 'Should not produce an IOError: ' + str(err)
# set_use_bigcount should still be False after load (i.e. should be saved)
@@ -597,8 +924,8 @@ def test_nobigcount_save():
def test_bigcount_abund_dist():
- kh = khmer.new_counting_hash(18, 1e2, 4)
- tracking = khmer.new_hashbits(18, 1e2, 4)
+ kh = khmer.CountingHash(18, 1e2, 4)
+ tracking = khmer.Hashbits(18, 1e2, 4)
kh.set_use_bigcount(True)
seqpath = utils.get_test_data('test-abund-read-2.fa')
@@ -606,15 +933,15 @@ def test_bigcount_abund_dist():
kh.consume_fasta(seqpath)
dist = kh.abundance_distribution(seqpath, tracking)
- print kh.get('GGTTGACGGGGCTCAGGG')
+ print(kh.get('GGTTGACGGGGCTCAGGG'))
pdist = [(i, dist[i]) for i in range(len(dist)) if dist[i]]
- assert dist[1001] == 1, pdist
+ assert dist[1002] == 1, pdist
def test_bigcount_abund_dist_2():
- kh = khmer.new_counting_hash(18, 1e7, 4)
- tracking = khmer.new_hashbits(18, 1e7, 4)
+ kh = khmer.CountingHash(18, 1e7, 4)
+ tracking = khmer.Hashbits(18, 1e7, 4)
kh.set_use_bigcount(True)
seqpath = utils.get_test_data('test-abund-read.fa')
@@ -624,14 +951,14 @@ def test_bigcount_abund_dist_2():
kh.count('GGTTGACGGGGCTCAGGG')
dist = kh.abundance_distribution(seqpath, tracking)
- print kh.get('GGTTGACGGGGCTCAGGG')
+ print(kh.get('GGTTGACGGGGCTCAGGG'))
pdist = [(i, dist[i]) for i in range(len(dist)) if dist[i]]
assert dist[1001] == 1, pdist
def test_bigcount_overflow():
- kh = khmer.new_counting_hash(18, 1e7, 4)
+ kh = khmer.CountingHash(18, 1e7, 4)
kh.set_use_bigcount(True)
for i in range(0, 70000):
@@ -641,18 +968,18 @@ def test_bigcount_overflow():
def test_get_ksize():
- kh = khmer.new_counting_hash(22, 1, 1)
+ kh = khmer.CountingHash(22, 1, 1)
assert kh.ksize() == 22
def test_get_hashsizes():
- kh = khmer.new_counting_hash(22, 100, 4)
- assert kh.hashsizes() == [101, 103, 107, 109], kh.hashsizes()
+ kh = khmer.CountingHash(22, 100, 4)
+ assert kh.hashsizes() == [97L, 89L, 83L, 79L], kh.hashsizes()
# def test_collect_high_abundance_kmers():
# seqpath = utils.get_test_data('test-abund-read-2.fa')
#
-# kh = khmer.new_counting_hash(18, 1e6, 4)
+# kh = khmer.CountingHash(18, 1e6, 4)
# hb = kh.collect_high_abundance_kmers(seqpath, 2, 4)
@@ -662,19 +989,19 @@ def test_get_hashsizes():
def test_load_notexist_should_fail():
savepath = utils.get_temp_filename('tempcountingsave0.ht')
- hi = khmer.new_counting_hash(12, 1000)
+ hi = khmer.CountingHash(12, 1000, 2)
try:
hi.load(savepath)
assert 0, "load should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_load_truncated_should_fail():
inpath = utils.get_test_data('random-20-a.fa')
savepath = utils.get_temp_filename('tempcountingsave0.ht')
- hi = khmer.new_counting_hash(12, 1000)
+ hi = khmer.CountingHash(12, 1000, 2)
hi.consume_fasta(inpath)
hi.save(savepath)
@@ -686,30 +1013,30 @@ def test_load_truncated_should_fail():
fp.write(data[:1000])
fp.close()
- hi = khmer.new_counting_hash(12, 1)
+ hi = khmer._CountingHash(12, [1])
try:
hi.load(savepath)
assert 0, "load should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_load_gz_notexist_should_fail():
savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')
- hi = khmer.new_counting_hash(12, 1000)
+ hi = khmer.CountingHash(12, 1000, 2)
try:
hi.load(savepath)
assert 0, "load should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_load_gz_truncated_should_fail():
inpath = utils.get_test_data('random-20-a.fa')
savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')
- hi = khmer.new_counting_hash(12, 1000)
+ hi = khmer.CountingHash(12, 1000, 2)
hi.consume_fasta(inpath)
hi.save(savepath)
@@ -721,16 +1048,16 @@ def test_load_gz_truncated_should_fail():
fp.write(data[:1000])
fp.close()
- hi = khmer.new_counting_hash(12, 1)
+ hi = khmer._CountingHash(12, [1])
try:
hi.load(savepath)
assert 0, "load should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_counting_file_version_check():
- ht = khmer.new_counting_hash(12, 1, 1)
+ ht = khmer.CountingHash(12, 1, 1)
inpath = utils.get_test_data('badversion-k12.ct')
@@ -738,11 +1065,11 @@ def test_counting_file_version_check():
ht.load(inpath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_counting_gz_file_version_check():
- ht = khmer.new_counting_hash(12, 1, 1)
+ ht = khmer.CountingHash(12, 1, 1)
inpath = utils.get_test_data('badversion-k12.ct.gz')
@@ -750,161 +1077,161 @@ def test_counting_gz_file_version_check():
ht.load(inpath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_counting_file_type_check():
inpath = utils.get_test_data('goodversion-k12.ht')
- kh = khmer.new_counting_hash(12, 1, 1)
+ kh = khmer.CountingHash(12, 1, 1)
try:
kh.load(inpath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_counting_gz_file_type_check():
- ht = khmer.new_hashbits(12, 1, 1)
+ ht = khmer.Hashbits(12, 1, 1)
inpath = utils.get_test_data('goodversion-k12.ht.gz')
- kh = khmer.new_counting_hash(12, 1, 1)
+ kh = khmer.CountingHash(12, 1, 1)
try:
kh.load(inpath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_counting_bad_primes_list():
try:
- ht = khmer.CountingHash(12, ["a", "b", "c"], 1)
+ ht = khmer._CountingHash(12, ["a", "b", "c"], 1)
assert 0, "bad list of primes should fail"
except TypeError as e:
- print str(e)
+ print(str(e))
def test_bad_use_bigcount():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
countingtable.set_use_bigcount(True)
assert countingtable.get_use_bigcount()
try:
countingtable.get_use_bigcount(True)
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
def test_consume_absentfasta():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.consume_fasta("absent_file.fa")
assert 0, "This should fail"
except IOError as err:
- print str(err)
+ print(str(err))
def test_consume_absentfasta_with_reads_parser():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.consume_fasta_with_reads_parser()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
try:
readparser = ReadParser(utils.get_test_data('empty-file'))
countingtable.consume_fasta_with_reads_parser(readparser)
assert 0, "this should fail"
except IOError as err:
- print str(err)
+ print(str(err))
except ValueError as err:
- print str(err)
+ print(str(err))
def test_badconsume():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.consume()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
try:
countingtable.consume("AAA")
assert 0, "this should fail"
except ValueError as err:
- print str(err)
+ print(str(err))
def test_get_badmin_count():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.get_min_count()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
try:
countingtable.get_min_count("AAA")
assert 0, "this should fail"
except ValueError as err:
- print str(err)
+ print(str(err))
def test_get_badmax_count():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.get_max_count()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
try:
countingtable.get_max_count("AAA")
assert 0, "this should fail"
except ValueError as err:
- print str(err)
+ print(str(err))
def test_get_badmedian_count():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.get_median_count()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
try:
countingtable.get_median_count("AAA")
assert 0, "this should fail"
except ValueError as err:
- print str(err)
+ print(str(err))
def test_get_badkadian_count():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.get_kadian_count()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
try:
countingtable.get_kadian_count("AAA")
assert 0, "this should fail"
except ValueError as err:
- print str(err)
+ print(str(err))
def test_badget():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.get()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
def test_badget_2():
- countingtable = khmer.new_counting_hash(6, 1e6)
+ countingtable = khmer.CountingHash(6, 1e6, 2)
countingtable.consume(DNA)
@@ -916,98 +1243,98 @@ def test_badget_2():
countingtable.get("AGCTT")
assert 0, "this should fail"
except ValueError as err:
- print str(err)
+ print(str(err))
def test_badtrim():
- countingtable = khmer.new_counting_hash(6, 1e6, 2)
+ countingtable = khmer.CountingHash(6, 1e6, 2)
countingtable.consume(DNA)
try:
countingtable.trim_on_abundance()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
countingtable.trim_on_abundance("AAAAAA", 1)
def test_badfasta_count_kmers_by_position():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.fasta_count_kmers_by_position()
except TypeError as err:
- print str(err)
+ print(str(err))
filename = utils.get_test_data("test-short.fa")
try:
countingtable.fasta_count_kmers_by_position(filename, -1, 0)
assert 0, "this should fail"
except ValueError as err:
- print str(err)
+ print(str(err))
try:
countingtable.fasta_count_kmers_by_position(filename, 0, -1)
assert 0, "this should fail"
except ValueError as err:
- print str(err)
+ print(str(err))
def test_badload():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.load()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
def test_badsave():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.save()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
def test_badksize():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.ksize(True)
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
def test_badhashsizes():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.hashsizes(True)
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
def test_badconsume_and_tag():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.consume_and_tag()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
def test_consume_fasta_and_tag():
- countingtable = khmer.new_counting_hash(4, 4 ** 4, 4)
+ countingtable = khmer.CountingHash(4, 4 ** 4, 4)
try:
countingtable.consume_fasta_and_tag()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
countingtable.consume_fasta_and_tag(utils.get_test_data("test-graph2.fa"))
def test_consume_and_retrieve_tags_1():
- ct = khmer.new_counting_hash(4, 4 ** 4, 4)
+ ct = khmer.CountingHash(4, 4 ** 4, 4)
# first, for each sequence, build tags.
for record in screed.open(utils.get_test_data('test-graph2.fa')):
@@ -1030,7 +1357,7 @@ def test_consume_and_retrieve_tags_1():
def test_consume_and_retrieve_tags_empty():
- ct = khmer.new_counting_hash(4, 4 ** 4, 4)
+ ct = khmer.CountingHash(4, 4 ** 4, 4)
# load each sequence but do not build tags - everything should be empty.
for record in screed.open(utils.get_test_data('test-graph2.fa')):
@@ -1054,7 +1381,7 @@ def test_consume_and_retrieve_tags_empty():
def test_find_all_tags_list_error():
- ct = khmer.new_counting_hash(4, 4 ** 4, 4)
+ ct = khmer.CountingHash(4, 4 ** 4, 4)
# load each sequence but do not build tags - everything should be empty.
for record in screed.open(utils.get_test_data('test-graph2.fa')):
@@ -1071,3 +1398,46 @@ def test_find_all_tags_list_error():
assert False, "a ValueError should be raised for incorrect k-mer size"
except ValueError:
pass
+
+
+def test_abund_dist_gz_bigcount():
+ infile = utils.get_temp_filename('test.fa')
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
+ outfile = utils.get_temp_filename('test_ct.gz')
+ script = 'load-into-counting.py'
+ htfile = utils.get_temp_filename('test_ct')
+ args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
+ utils.runscript(script, args) # create a bigcount table
+ assert os.path.exists(htfile)
+ data = open(htfile, 'rb').read()
+ f_out = gzip.open(outfile, 'wb') # compress the created bigcount table
+ f_out.write(data)
+ f_out.close()
+ # load the compressed bigcount table
+ try:
+ counting_hash = khmer.load_counting_hash(outfile)
+ except IOError as err:
+ assert 0, 'Should not produce IOError: ' + str(err)
+ hashsizes = counting_hash.hashsizes()
+ kmer_size = counting_hash.ksize()
+ tracking = khmer._Hashbits(kmer_size, hashsizes)
+ abundances = counting_hash.abundance_distribution(infile, tracking)
+ # calculate abundance distribution for compressed bigcount table
+ flag = False
+ # check if abundance is > 255
+ # if ok gzipped bigcount was loaded correctly
+ for _, i in enumerate(abundances):
+ print(_, i)
+ if _ > 255 and i > 0:
+ flag = True
+ break
+ assert flag
+
+
+def test_counting_load_bigcount():
+ count_table = khmer.CountingHash(10, 1e5, 4)
+ count_table.set_use_bigcount(True)
+ for i in range(500):
+ print(i, count_table.count('ATATATATAT'))
+ count = count_table.get('ATATATATAT')
+ assert count == 500
diff --git a/tests/test_counting_single.py b/tests/test_counting_single.py
index 7fd348a..a76e63c 100644
--- a/tests/test_counting_single.py
+++ b/tests/test_counting_single.py
@@ -1,6 +1,8 @@
+from __future__ import print_function
+from __future__ import absolute_import
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -8,14 +10,14 @@
# pylint: disable=C0111,C0103
import khmer
-import khmer_tst_utils as utils
+from . import khmer_tst_utils as utils
from nose.plugins.attrib import attr
MAX_COUNT = 255
def test_no_collision():
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [5])
kh.count('AAAA')
assert kh.get('AAAA') == 1
@@ -24,17 +26,17 @@ def test_no_collision():
assert kh.get('TTTT') == 2
-@attr('linux')
+@attr('huge')
def test_toobig():
try:
- ct = khmer.new_hashtable(4, 1000000000000)
+ ct = khmer.CountingHash(4, 1000000000000, 1)
assert 0, "this should fail"
except MemoryError as err:
- print str(err)
+ print(str(err))
def test_collision():
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [5])
kh.count('AAAA')
assert kh.get('AAAA') == 1
@@ -44,30 +46,30 @@ def test_collision():
def test_badcount():
- countingtable = khmer.new_hashtable(4, 4)
+ countingtable = khmer._CountingHash(4, [5])
try:
countingtable.count()
assert 0, "count should require one argument"
except TypeError as err:
- print str(err)
+ print(str(err))
try:
countingtable.count('ABCDE')
assert 0, "count should require k-mer size to be equal"
except ValueError as err:
- print str(err)
+ print(str(err))
def test_hashtable_n_entries():
- countingtable = khmer.new_hashtable(4, 4)
+ countingtable = khmer._CountingHash(4, [5])
try:
countingtable.n_entries("nope")
assert 0, "n_entries should accept no arguments"
except TypeError as err:
- print str(err)
+ print(str(err))
def test_complete_no_collision():
- kh = khmer.new_hashtable(4, 4 ** 2)
+ kh = khmer._CountingHash(4, [4**4])
for i in range(0, kh.n_entries()):
s = khmer.reverse_hash(i, 4)
@@ -87,13 +89,13 @@ def test_complete_no_collision():
n_fwd_filled += 1
assert n_rc_filled == kh.n_entries(), n_rc_filled
- assert n_palindromes == 16, n_palindromes # @CTB check this
+ assert n_palindromes == 16, n_palindromes
assert n_fwd_filled == kh.n_entries() // 2 + n_palindromes // 2, \
n_fwd_filled
def test_complete_2_collision():
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [5])
for i in range(0, kh.n_entries()):
s = khmer.reverse_hash(i, 4)
@@ -114,7 +116,7 @@ def test_complete_2_collision():
def test_complete_4_collision():
- kh = khmer.new_hashtable(4, 2)
+ kh = khmer._CountingHash(4, [3])
for i in range(0, kh.n_entries()):
s = khmer.reverse_hash(i, 4)
@@ -136,14 +138,14 @@ def test_complete_4_collision():
def test_maxcount():
# hashtable should saturate at some point so as not to overflow counter
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [5])
last_count = None
for _ in range(0, 10000):
kh.count('AAAA')
c = kh.get('AAAA')
- print last_count, c
+ print(last_count, c)
if c == last_count:
break
last_count = c
@@ -154,7 +156,7 @@ def test_maxcount():
def test_maxcount_with_bigcount():
# hashtable should not saturate, if use_bigcount is set.
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [5])
kh.set_use_bigcount(True)
last_count = None
@@ -162,7 +164,7 @@ def test_maxcount_with_bigcount():
kh.count('AAAA')
c = kh.get('AAAA')
- print last_count, c
+ print(last_count, c)
if c == last_count:
break
last_count = c
@@ -172,7 +174,7 @@ def test_maxcount_with_bigcount():
def test_consume_uniqify_first():
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [5])
s = "TTTT"
s_rc = "AAAA"
@@ -184,7 +186,7 @@ def test_consume_uniqify_first():
def test_maxcount_consume():
# hashtable should saturate at some point so as not to overflow counter
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [5])
s = "A" * 10000
kh.consume(s)
@@ -195,7 +197,7 @@ def test_maxcount_consume():
def test_maxcount_consume_with_bigcount():
# use the bigcount hack to avoid saturating the hashtable.
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [5])
kh.set_use_bigcount(True)
s = "A" * 10000
@@ -206,21 +208,21 @@ def test_maxcount_consume_with_bigcount():
def test_get_mincount():
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [5])
s = "AAAAACGT"
kh.consume(s)
x = kh.get_min_count(s)
- assert x == 1
+ assert x == 1, x
kh.consume(s)
x = kh.get_min_count(s)
- assert x == 2
+ assert x == 2, x
def test_get_maxcount():
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [7])
s = "AAAAACGT"
kh.consume(s)
@@ -234,29 +236,29 @@ def test_get_maxcount():
def test_get_maxcount_rc():
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [7])
s = "AAAAACGT"
src = "ACGTTTTT"
kh.consume(s)
x = kh.get_max_count(s)
- assert x == 2
+ assert x == 2, x
kh.consume(src)
x = kh.get_max_count(s)
- assert x == 4
+ assert x == 4, x
def test_get_mincount_rc():
- kh = khmer.new_hashtable(4, 4)
+ kh = khmer._CountingHash(4, [5])
s = "AAAAACGT"
src = "ACGTTTTT"
kh.consume(s)
x = kh.get_min_count(s)
- assert x == 1
+ assert x == 1, x
kh.consume(src)
x = kh.get_min_count(s)
@@ -264,7 +266,7 @@ def test_get_mincount_rc():
def test_badget():
- kh = khmer.new_hashtable(6, 4 ** 10)
+ kh = khmer.CountingHash(6, 4 ** 10, 1)
DNA = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG"
@@ -278,11 +280,11 @@ def test_badget():
kh.get("AGCTT")
assert 0, "this should fail"
except ValueError as err:
- print str(err)
+ print(str(err))
def test_64bitshift():
- kh = khmer.new_hashtable(25, 4)
+ kh = khmer.CountingHash(25, 4, 1)
fullstr = "GTATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGCCGCGATGCGTCGGCG"
substr = "ATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGC"
@@ -291,7 +293,7 @@ def test_64bitshift():
def test_64bitshift_2():
- kh = khmer.new_hashtable(25, 4)
+ kh = khmer.CountingHash(25, 4, 1)
fullstr = "GTATGCCAGCTCCAACTGGGCCGGTACGAGCAGGCCATTGCCTCTTGCCGCGATGCGTCGGCG"
kh.consume(fullstr)
@@ -302,12 +304,12 @@ def test_64bitshift_2():
def test_very_short_read():
short_filename = utils.get_test_data('test-short.fa')
- kh = khmer.new_hashtable(9, 4)
+ kh = khmer.CountingHash(9, 4, 1)
n_reads, n_kmers = kh.consume_fasta(short_filename)
assert n_reads == 1, n_reads
assert n_kmers == 0, n_kmers
- kh = khmer.new_hashtable(8, 4)
+ kh = khmer.CountingHash(8, 4, 1)
n_reads, n_kmers = kh.consume_fasta(short_filename)
assert n_reads == 1, n_reads
assert n_kmers == 1, n_kmers
@@ -316,7 +318,7 @@ def test_very_short_read():
class Test_ConsumeString(object):
def setup(self):
- self.kh = khmer.new_hashtable(4, 4 ** 4)
+ self.kh = khmer._CountingHash(4, [4**4])
def test_n_occupied(self):
assert self.kh.n_occupied() == 0
@@ -328,7 +330,7 @@ class Test_ConsumeString(object):
self.kh.n_occupied("MU", 1, 3)
assert 0, "n_occupied shouldn't accept three arguments"
except TypeError as err:
- print str(err)
+ print(str(err))
def test_abundance_by_pos(self):
kh = self.kh
@@ -373,7 +375,7 @@ class Test_ConsumeString(object):
assert self.kh.n_occupied() == 0
self.kh.consume('AAAA')
assert self.kh.n_occupied(0, 1) == 1
- assert self.kh.n_occupied(1, 4 ** 4) == 0
+ assert self.kh.n_occupied(1, 4 ** 4) == 0, self.kh.n_occupied()
hashvalue = khmer.forward_hash('AACT', 4)
self.kh.consume('AACT')
@@ -412,14 +414,14 @@ class Test_ConsumeString(object):
class Test_AbundanceDistribution(object):
def setup(self):
- self.kh = khmer.new_hashtable(4, 4)
+ self.kh = khmer._CountingHash(4, [5])
A_filename = utils.get_test_data('all-A.fa')
self.kh.consume_fasta(A_filename)
def test_count_A(self):
A_filename = utils.get_test_data('all-A.fa')
- tracking = khmer.new_hashbits(4, 4, 1)
+ tracking = khmer._Hashbits(4, [5])
dist = self.kh.abundance_distribution(A_filename, tracking)
assert sum(dist) == 1
diff --git a/tests/test_filter.py b/tests/test_filter.py
index 28b87ae..2ff9091 100644
--- a/tests/test_filter.py
+++ b/tests/test_filter.py
@@ -1,6 +1,8 @@
+from __future__ import print_function
+from __future__ import absolute_import
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -8,7 +10,7 @@ import khmer
from screed.fasta import fasta_iter
from nose.plugins.attrib import attr
-import khmer_tst_utils as utils
+from . import khmer_tst_utils as utils
def teardown():
@@ -25,7 +27,7 @@ def load_fa_seq_names(filename):
class Test_Filter(object):
def test_abund(self):
- ht = khmer.new_hashtable(10, 4 ** 10)
+ ht = khmer.CountingHash(10, 4 ** 10, 1)
filename = utils.get_test_data('test-abund-read.fa')
outname = utils.get_temp_filename('test_abund.out')
@@ -35,18 +37,18 @@ class Test_Filter(object):
ht.consume_fasta()
assert 0, "should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
try:
ht.consume_fasta("nonexistent")
assert 0, "should fail"
except IOError as err:
- print str(err)
+ print(str(err))
ht.output_fasta_kmer_pos_freq(filename, outname)
try:
ht.output_fasta_kmer_pos_freq()
assert 0, "should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
fd = open(outname, "r")
diff --git a/tests/test_functions.py b/tests/test_functions.py
index edf0c8d..32f3f99 100644
--- a/tests/test_functions.py
+++ b/tests/test_functions.py
@@ -1,17 +1,20 @@
+from __future__ import print_function
+from __future__ import absolute_import
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
import khmer
-from nose.plugins.attrib import attr
import os
-import khmer_tst_utils as utils
+import sys
import collections
+from . import khmer_tst_utils as utils
from khmer.utils import (check_is_pair, broken_paired_reader, check_is_left,
check_is_right)
from khmer.kfile import check_input_files
+from cStringIO import StringIO
def test_forward_hash():
@@ -76,15 +79,37 @@ def test_get_primes():
assert primes == [19, 17, 13, 11, 7, 5, 3]
+def test_get_primes_fal():
+ try:
+ primes = khmer.get_n_primes_near_x(5, 5)
+ assert 0, "previous statement should fail"
+ except AssertionError:
+ raise
+ except Exception as err:
+ assert "unable to find 5 prime numbers < 5" in str(err)
+
+
+def test_extract_countinghash_info_badfile():
+ try:
+ khmer.extract_countinghash_info(
+ utils.get_test_data('test-abund-read-2.fa'))
+ assert 0, 'this should fail'
+ except ValueError:
+ pass
+
+
def test_extract_countinghash_info():
fn = utils.get_temp_filename('test_extract_counting.ct')
for size in [1e6, 2e6, 5e6, 1e7]:
- ht = khmer.new_counting_hash(25, size, 4)
+ ht = khmer.CountingHash(25, size, 4)
ht.save(fn)
- info = khmer.extract_countinghash_info(fn)
+ try:
+ info = khmer.extract_countinghash_info(fn)
+ except ValueError as err:
+ assert 0, 'Should not throw a ValueErorr: ' + str(err)
ksize, table_size, n_tables, _, _, _ = info
- print ksize, table_size, n_tables
+ print(ksize, table_size, n_tables)
assert(ksize) == 25
assert table_size == size
@@ -92,8 +117,17 @@ def test_extract_countinghash_info():
try:
os.remove(fn)
- except OSError as e:
- print >>sys.stder, '...failed to remove {fn}'.format(fn)
+ except OSError as err:
+ assert 0, '...failed to remove ' + fn + str(err)
+
+
+def test_extract_hashbits_info_badfile():
+ try:
+ khmer.extract_hashbits_info(
+ utils.get_test_data('test-abund-read-2.fa'))
+ assert 0, 'this should fail'
+ except ValueError:
+ pass
def test_extract_hashbits_info():
@@ -104,7 +138,7 @@ def test_extract_hashbits_info():
info = khmer.extract_hashbits_info(fn)
ksize, table_size, n_tables, _, _ = info
- print ksize, table_size, n_tables
+ print(ksize, table_size, n_tables)
assert(ksize) == 25
assert table_size == size
@@ -112,26 +146,40 @@ def test_extract_hashbits_info():
try:
os.remove(fn)
- except OSError as e:
- print >>sys.stderr, '...failed to remove {fn}'.format(fn)
+ except OSError as err:
+ print('...failed to remove {fn}'.format(fn) + str(err),
+ file=sys.stderr)
def test_check_file_status_kfile():
fn = utils.get_temp_filename('thisfiledoesnotexist')
check_file_status_exited = False
+
+ old_stderr = sys.stderr
+ sys.stderr = capture = StringIO()
+
try:
check_input_files(fn, False)
except SystemExit:
- check_file_status_exited = True
- assert check_file_status_exited
+ assert "does not exist" in capture.getvalue(), capture.getvalue()
+ finally:
+ sys.stderr = old_stderr
def test_check_file_status_kfile_force():
fn = utils.get_temp_filename('thisfiledoesnotexist')
+
+ old_stderr = sys.stderr
+ sys.stderr = capture = StringIO()
+
try:
check_input_files(fn, True)
- except OSError as e:
+ except OSError:
assert False
+ finally:
+ sys.stderr = old_stderr
+
+ assert "does not exist" in capture.getvalue(), capture.getvalue()
FakeFQRead = collections.namedtuple('Read', ['name', 'quality', 'sequence'])
diff --git a/tests/test_graph.py b/tests/test_graph.py
index 9b05c48..5afcb92 100644
--- a/tests/test_graph.py
+++ b/tests/test_graph.py
@@ -1,13 +1,15 @@
+from __future__ import print_function
+from __future__ import absolute_import
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
import khmer
import screed
-import khmer_tst_utils as utils
+from . import khmer_tst_utils as utils
from nose.plugins.attrib import attr
@@ -19,7 +21,7 @@ def teardown():
class Test_ExactGraphFu(object):
def setup(self):
- self.ht = khmer.new_hashbits(12, 1e4)
+ self.ht = khmer.Hashbits(12, 1e4, 2)
def test_counts(self):
ht = self.ht
@@ -113,7 +115,7 @@ class Test_ExactGraphFu(object):
class Test_InexactGraphFu(object):
def setup(self):
- self.ht = khmer.new_hashbits(12, 4 ** 2 + 1)
+ self.ht = khmer.Hashbits(12, 4 ** 3 + 1, 2)
def test_graph_links_next_a(self):
ht = self.ht
@@ -131,7 +133,7 @@ class Test_InexactGraphFu(object):
ht.consume(word[1:] + "C")
x = ht.calc_connected_graph_size(word)
- assert x == 2
+ assert x == 2, x
def test_graph_links_next_g(self):
ht = self.ht
@@ -197,7 +199,7 @@ class Test_Partitioning(object):
filename = utils.get_test_data('random-20-a.fa')
- ht = khmer.new_hashbits(21, 4, 4)
+ ht = khmer._Hashbits(21, [5, 7, 11, 13])
ht.consume_fasta_and_tag(filename)
output_file = utils.get_temp_filename('part0test')
@@ -214,7 +216,7 @@ class Test_Partitioning(object):
filename = utils.get_test_data('random-20-a.fa')
- ht = khmer.new_hashbits(21, 4, 4)
+ ht = khmer._Hashbits(21, [5, 7, 11, 13])
ht.consume_fasta_and_tag(filename)
output_file = utils.get_temp_filename('parttest')
@@ -229,7 +231,7 @@ class Test_Partitioning(object):
def test_output_fq(self):
filename = utils.get_test_data('random-20-a.fq')
- ht = khmer.new_hashbits(20, 1e4, 4)
+ ht = khmer.Hashbits(20, 1e4, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
ht.merge_subset(subset)
@@ -237,7 +239,7 @@ class Test_Partitioning(object):
output_file = utils.get_temp_filename('parttest')
ht.output_partitions(filename, output_file, False)
- print open(output_file).read()
+ print(open(output_file).read())
x = set([r.quality for r in screed.open(output_file)])
assert x, x
@@ -245,7 +247,7 @@ class Test_Partitioning(object):
def test_disconnected_20_a(self):
filename = utils.get_test_data('random-20-a.fa')
- ht = khmer.new_hashbits(21, 1e5, 4)
+ ht = khmer.Hashbits(21, 1e5, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -255,7 +257,7 @@ class Test_Partitioning(object):
def test_connected_20_a(self):
filename = utils.get_test_data('random-20-a.fa')
- ht = khmer.new_hashbits(20, 1e4, 4)
+ ht = khmer.Hashbits(20, 1e4, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -265,7 +267,7 @@ class Test_Partitioning(object):
def test_disconnected_20_b(self):
filename = utils.get_test_data('random-20-b.fa')
- ht = khmer.new_hashbits(21, 1e4, 4)
+ ht = khmer.Hashbits(21, 1e4, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -275,7 +277,7 @@ class Test_Partitioning(object):
def test_connected_20_b(self):
filename = utils.get_test_data('random-20-b.fa')
- ht = khmer.new_hashbits(20, 1e4, 4)
+ ht = khmer.Hashbits(20, 1e4, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -285,7 +287,7 @@ class Test_Partitioning(object):
def test_disconnected_31_c(self):
filename = utils.get_test_data('random-31-c.fa')
- ht = khmer.new_hashbits(32, 1e6, 4)
+ ht = khmer.Hashbits(32, 1e6, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -295,7 +297,7 @@ class Test_Partitioning(object):
def test_connected_31_c(self):
filename = utils.get_test_data('random-31-c.fa')
- ht = khmer.new_hashbits(31, 1e5, 4)
+ ht = khmer.Hashbits(31, 1e5, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -308,13 +310,13 @@ class Test_Partitioning(object):
class Test_PythonAPI(object):
def test_find_all_tags_kmersize(self):
- ht = khmer.new_hashbits(20, 4 ** 4 + 1)
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
a = "ATTGGGACTCTGGGAGCACTTATCATGGAGAT"
b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT"
c = "GGAGCACTTATCATGGAGATATATCCCGTGCTTAAACATCGCACTTTAACCCTGCAGAGT"
- print ht.consume(a)
+ print(ht.consume(a))
try:
ppi = ht.find_all_tags(c[:19])
assert False, "should raise a ValueError for wrong k-mer size"
@@ -328,23 +330,23 @@ class Test_PythonAPI(object):
pass
def test_ordered_connect(self):
- ht = khmer.new_hashbits(20, 4 ** 4 + 1)
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
a = "ATTGGGACTCTGGGAGCACTTATCATGGAGAT"
b = "GAGCACTTTAACCCTGCAGAGTGGCCAAGGCT"
c = "GGAGCACTTATCATGGAGATATATCCCGTGCTTAAACATCGCACTTTAACCCTGCAGAGT"
- print ht.consume(a)
+ print(ht.consume(a))
ppi = ht.find_all_tags(a[:20])
pid = ht.assign_partition_id(ppi)
assert pid == 0, pid
- print ht.consume(b)
+ print(ht.consume(b))
ppi = ht.find_all_tags(b[:20])
pid = ht.assign_partition_id(ppi)
assert pid == 0, pid
- print ht.consume(c)
+ print(ht.consume(c))
ppi = ht.find_all_tags(c[:20])
pid = ht.assign_partition_id(ppi)
assert pid == 2, pid
diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index 40b8d71..12a4f09 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -1,6 +1,8 @@
+from __future__ import print_function
+from __future__ import absolute_import
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -11,7 +13,7 @@ from khmer import ReadParser
from screed.fasta import fasta_iter
import screed
-import khmer_tst_utils as utils
+from . import khmer_tst_utils as utils
from nose.plugins.attrib import attr
@@ -19,8 +21,17 @@ def teardown():
utils.cleanup()
+@attr('huge')
+def test_toobig():
+ try:
+ pt = khmer.Hashbits(32, 1e13, 1)
+ assert 0, "This should fail"
+ except MemoryError as err:
+ print(str(err))
+
+
def test__get_set_tag_density():
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer._Hashbits(32, [1])
orig = ht._get_tag_density()
assert orig != 2
@@ -28,6 +39,76 @@ def test__get_set_tag_density():
assert ht._get_tag_density() == 2
+def test_update_from():
+ ht = khmer.Hashbits(5, 1000, 4)
+ ht2 = khmer.Hashbits(5, 1000, 4)
+
+ assert ht.get('AAAAA') == 0
+ assert ht.get('GCGCG') == 0
+ assert ht2.get('AAAAA') == 0
+ assert ht2.get('GCGCG') == 0
+
+ ht2.count('AAAAA')
+
+ assert ht.get('AAAAA') == 0
+ assert ht.get('GCGCG') == 0
+ assert ht2.get('AAAAA') == 1
+ assert ht2.get('GCGCG') == 0
+
+ ht.count('GCGCG')
+
+ assert ht.get('AAAAA') == 0
+ assert ht.get('GCGCG') == 1
+ assert ht2.get('AAAAA') == 1
+ assert ht2.get('GCGCG') == 0
+
+ ht.update(ht2)
+
+ assert ht.get('AAAAA') == 1
+ assert ht.get('GCGCG') == 1
+ assert ht2.get('AAAAA') == 1
+ assert ht2.get('GCGCG') == 0
+
+
+def test_update_from_diff_ksize_2():
+ ht = khmer.Hashbits(5, 1000, 4)
+ ht2 = khmer.Hashbits(4, 1000, 4)
+
+ try:
+ ht.update(ht2)
+ assert 0, "should not be reached"
+ except ValueError as err:
+ print(str(err))
+
+ try:
+ ht2.update(ht)
+ assert 0, "should not be reached"
+ except ValueError as err:
+ print(str(err))
+
+
+def test_update_from_diff_tablesize():
+ ht = khmer.Hashbits(5, 100, 4)
+ ht2 = khmer.Hashbits(5, 1000, 4)
+
+ try:
+ ht.update(ht2)
+ assert 0, "should not be reached"
+ except ValueError as err:
+ print(str(err))
+
+
+def test_update_from_diff_num_tables():
+ ht = khmer.Hashbits(5, 1000, 3)
+ ht2 = khmer.Hashbits(5, 1000, 4)
+
+ try:
+ ht.update(ht2)
+ assert 0, "should not be reached"
+ except ValueError as err:
+ print(str(err))
+
+
def test_n_occupied_1():
filename = utils.get_test_data('random-20-a.fa')
@@ -36,13 +117,13 @@ def test_n_occupied_1():
N_HT = 1 # number of hashtables
# test modified c++ n_occupied code
- ht1 = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht1 = khmer.Hashbits(K, HT_SIZE, N_HT)
for n, record in enumerate(fasta_iter(open(filename))):
ht1.consume(record['sequence'])
# this number calculated independently
- assert ht1.n_occupied() == 3877
+ assert ht1.n_occupied() == 3884, ht1.n_occupied()
def test_bloom_python_1():
@@ -53,7 +134,7 @@ def test_bloom_python_1():
HT_SIZE = 100000 # size of hashtable
N_HT = 3 # number of hashtables
- ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht2 = khmer.Hashbits(K, HT_SIZE, N_HT)
n_unique = 0
for n, record in enumerate(fasta_iter(open(filename))):
@@ -66,8 +147,10 @@ def test_bloom_python_1():
ht2.count(kmer)
assert n_unique == 3960
- assert ht2.n_occupied() == 3882
- assert ht2.n_unique_kmers() == 3960 # this number equals to n_unique
+ assert ht2.n_occupied() == 3885, ht2.n_occupied()
+
+ # this number equals n_unique
+ assert ht2.n_unique_kmers() == 3960, ht2.n_unique_kmers()
def test_bloom_c_1():
@@ -79,12 +162,12 @@ def test_bloom_c_1():
HT_SIZE = 100000 # size of hashtable
N_HT = 3 # number of hashtables
- ht3 = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht3 = khmer.Hashbits(K, HT_SIZE, N_HT)
for n, record in enumerate(fasta_iter(open(filename))):
ht3.consume(record['sequence'])
- assert ht3.n_occupied() == 3882
+ assert ht3.n_occupied() == 3885
assert ht3.n_unique_kmers() == 3960
@@ -93,7 +176,7 @@ def test_n_occupied_2(): # simple one
HT_SIZE = 10 # use 11
N_HT = 1
- ht1 = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht1 = khmer._Hashbits(K, [11])
ht1.count('AAAA') # 00 00 00 00 = 0
assert ht1.n_occupied() == 1
@@ -104,17 +187,14 @@ def test_n_occupied_2(): # simple one
assert ht1.n_occupied() == 2
ht1.count('AGAC') # 00 11 00 10 # collision 2
- assert ht1.n_occupied() == 2
+ assert ht1.n_occupied() == 2, ht1.n_occupied()
def test_bloom_c_2(): # simple one
K = 4
- HT_SIZE = 10 # use 11
- N_HT1 = 1 # hashtable size = 11
- N_HT2 = 2 # hashtable size = 11,13
# use only 1 hashtable, no bloom filter
- ht1 = khmer.new_hashbits(K, HT_SIZE, N_HT1)
+ ht1 = khmer._Hashbits(K, [11])
ht1.count('AAAA') # 00 00 00 00 = 0
ht1.count('ACTG') # 00 10 01 11 =
assert ht1.n_unique_kmers() == 2
@@ -124,7 +204,7 @@ def test_bloom_c_2(): # simple one
assert ht1.n_unique_kmers() == 2
# use two hashtables with 11,13
- ht2 = khmer.new_hashbits(K, HT_SIZE, N_HT2)
+ ht2 = khmer._Hashbits(K, [11, 13])
ht2.count('AAAA') # 00 00 00 00 = 0
ht2.count('ACTG') # 00 10 01 11 = 2*16 +4 +3 = 39
@@ -138,7 +218,7 @@ def test_bloom_c_2(): # simple one
def test_filter_if_present():
- ht = khmer.new_hashbits(32, 2, 2)
+ ht = khmer._Hashbits(32, [3, 5])
maskfile = utils.get_test_data('filter-test-A.fa')
inputfile = utils.get_test_data('filter-test-B.fa')
@@ -154,7 +234,7 @@ def test_filter_if_present():
def test_combine_pe():
inpfile = utils.get_test_data('combine_parts_1.fa')
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer._Hashbits(32, [1])
ht.consume_partitioned_fasta(inpfile)
assert ht.count_partitions() == (2, 0)
@@ -179,7 +259,7 @@ def test_combine_pe():
def test_load_partitioned():
inpfile = utils.get_test_data('combine_parts_1.fa')
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer._Hashbits(32, [1])
ht.consume_partitioned_fasta(inpfile)
assert ht.count_partitions() == (2, 0)
@@ -196,9 +276,9 @@ def test_load_partitioned():
def test_count_within_radius_simple():
inpfile = utils.get_test_data('all-A.fa')
- ht = khmer.new_hashbits(4, 2, 2)
+ ht = khmer._Hashbits(4, [3, 5])
- print ht.consume_fasta(inpfile)
+ print(ht.consume_fasta(inpfile))
n = ht.count_kmers_within_radius('AAAA', 1)
assert n == 1
@@ -208,13 +288,13 @@ def test_count_within_radius_simple():
def test_count_within_radius_big():
inpfile = utils.get_test_data('random-20-a.fa')
- ht = khmer.new_hashbits(20, 1e5, 4)
+ ht = khmer.Hashbits(20, 1e5, 4)
ht.consume_fasta(inpfile)
n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
- assert n == 3960
+ assert n == 3961, n
- ht = khmer.new_hashbits(21, 1e5, 4)
+ ht = khmer.Hashbits(21, 1e5, 4)
ht.consume_fasta(inpfile)
n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
assert n == 39
@@ -222,7 +302,7 @@ def test_count_within_radius_big():
def test_count_kmer_degree():
inpfile = utils.get_test_data('all-A.fa')
- ht = khmer.new_hashbits(4, 2, 2)
+ ht = khmer._Hashbits(4, [3, 5])
ht.consume_fasta(inpfile)
assert ht.kmer_degree('AAAA') == 2
@@ -232,7 +312,7 @@ def test_count_kmer_degree():
def test_save_load_tagset():
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer._Hashbits(32, [1])
outfile = utils.get_temp_filename('tagset')
@@ -250,11 +330,11 @@ def test_save_load_tagset():
fp = open(outfile, 'rb')
data = fp.read()
fp.close()
- assert len(data) == 26, len(data)
+ assert len(data) == 30, len(data)
def test_save_load_tagset_noclear():
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer._Hashbits(32, [1])
outfile = utils.get_temp_filename('tagset')
@@ -272,7 +352,7 @@ def test_save_load_tagset_noclear():
fp = open(outfile, 'rb')
data = fp.read()
fp.close()
- assert len(data) == 34, len(data)
+ assert len(data) == 38, len(data)
def test_stop_traverse():
@@ -282,7 +362,7 @@ def test_stop_traverse():
HT_SIZE = 1e4 # size of hashtable
N_HT = 3 # number of hashtables
- ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht = khmer.Hashbits(K, HT_SIZE, N_HT)
# without tagging/joining across consume, this breaks into two partition;
# with, it is one partition.
@@ -303,7 +383,7 @@ def test_tag_across_stoptraverse():
HT_SIZE = 1e4 # size of hashtable
N_HT = 3 # number of hashtables
- ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht = khmer.Hashbits(K, HT_SIZE, N_HT)
# without tagging/joining across consume, this breaks into two partition;
# with, it is one partition.
@@ -331,7 +411,7 @@ def test_notag_across_stoptraverse():
HT_SIZE = 1e4 # size of hashtable
N_HT = 3 # number of hashtables
- ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht = khmer.Hashbits(K, HT_SIZE, N_HT)
# connecting k-mer at the beginning/end of a read: breaks up into two.
ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
@@ -346,7 +426,7 @@ def test_notag_across_stoptraverse():
def test_find_stoptags():
- ht = khmer.new_hashbits(5, 1, 1)
+ ht = khmer._Hashbits(5, [1])
ht.add_stop_tag("AAAAA")
assert ht.identify_stoptags_by_position("AAAAA") == [0]
@@ -356,7 +436,7 @@ def test_find_stoptags():
def test_find_stoptags2():
- ht = khmer.new_hashbits(4, 1, 1)
+ ht = khmer._Hashbits(4, [1])
ht.add_stop_tag("ATGC")
x = ht.identify_stoptags_by_position("ATGCATGCGCAT")
@@ -364,17 +444,17 @@ def test_find_stoptags2():
def test_get_ksize():
- kh = khmer.new_hashbits(22, 1, 1)
+ kh = khmer._Hashbits(22, [1])
assert kh.ksize() == 22
def test_get_hashsizes():
- kh = khmer.new_hashbits(22, 100, 4)
- assert kh.hashsizes() == [101, 103, 107, 109], kh.hashsizes()
+ kh = khmer.Hashbits(22, 100, 4)
+ assert kh.hashsizes() == [97L, 89L, 83L, 79L], kh.hashsizes()
def test_extract_unique_paths_0():
- kh = khmer.new_hashbits(10, 4, 4)
+ kh = khmer._Hashbits(10, [5, 7, 11, 13])
x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG']
@@ -385,36 +465,36 @@ def test_extract_unique_paths_0():
def test_extract_unique_paths_1():
- kh = khmer.new_hashbits(10, 4, 4)
+ kh = khmer._Hashbits(10, [5, 7, 11, 13])
kh.consume('AGTGGCGATG')
x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
+ print(x)
assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGAT'] # all but the last k-mer
def test_extract_unique_paths_2():
- kh = khmer.new_hashbits(10, 4, 4)
+ kh = khmer._Hashbits(10, [5, 7, 11, 13])
kh.consume('ATGGAGAGAC')
x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
+ print(x)
assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG'] # all but the 1st k-mer
def test_extract_unique_paths_3():
- kh = khmer.new_hashbits(10, 4, 4)
+ kh = khmer._Hashbits(10, [5, 7, 11, 13])
kh.consume('ATGGAGAGAC')
kh.consume('AGTGGCGATG')
x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
+ print(x)
# all but the 1st/last k-mer
assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT']
def test_extract_unique_paths_4():
- kh = khmer.new_hashbits(10, 4, 4)
+ kh = khmer.Hashbits(10, 1e6, 4)
kh.consume('ATGGAGAGAC')
kh.consume('AGTGGCGATG')
@@ -422,7 +502,7 @@ def test_extract_unique_paths_4():
kh.consume('ATAGACAGGA')
x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
+ print(x)
assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT']
@@ -434,7 +514,7 @@ def test_find_unpart():
HT_SIZE = 1e4 # size of hashtable
N_HT = 3 # number of hashtables
- ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht = khmer.Hashbits(K, HT_SIZE, N_HT)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -456,7 +536,7 @@ def test_find_unpart_notraverse():
HT_SIZE = 1e4 # size of hashtable
N_HT = 3 # number of hashtables
- ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht = khmer.Hashbits(K, HT_SIZE, N_HT)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -478,7 +558,7 @@ def test_find_unpart_fail():
HT_SIZE = 1e4 # size of hashtable
N_HT = 3 # number of hashtables
- ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+ ht = khmer.Hashbits(K, HT_SIZE, N_HT)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -493,24 +573,24 @@ def test_find_unpart_fail():
def test_simple_median():
- hi = khmer.new_hashbits(6, 2, 2)
+ hi = khmer.Hashbits(6, 1e5, 2)
(median, average, stddev) = hi.get_median_count("AAAAAA")
- print median, average, stddev
+ print(median, average, stddev)
assert median == 0
assert average == 0.0
assert stddev == 0.0
hi.consume("AAAAAA")
(median, average, stddev) = hi.get_median_count("AAAAAA")
- print median, average, stddev
+ print(median, average, stddev)
assert median == 1
assert average == 1.0
assert stddev == 0.0
def test_badget():
- hbts = khmer.new_hashbits(6, 1e6, 1)
+ hbts = khmer.Hashbits(6, 1e6, 1)
dna = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG"
@@ -521,10 +601,16 @@ def test_badget():
assert hbts.get("GATGAG") == 0
try:
- hbts.get("AGCTT")
+ hbts.get(b"AGCTT")
+ assert 0, "this should fail"
+ except ValueError as err:
+ print(str(err))
+
+ try:
+ hbts.get(u"AGCTT")
assert 0, "this should fail"
except ValueError as err:
- print str(err)
+ print(str(err))
#
@@ -533,7 +619,7 @@ def test_badget():
def test_load_notexist_should_fail():
savepath = utils.get_temp_filename('temphashbitssave0.ht')
- hi = khmer.new_counting_hash(12, 2)
+ hi = khmer._CountingHash(12, [1])
try:
hi.load(savepath)
assert 0, "load should fail"
@@ -545,7 +631,8 @@ def test_load_truncated_should_fail():
inpath = utils.get_test_data('random-20-a.fa')
savepath = utils.get_temp_filename('temphashbitssave0.ct')
- hi = khmer.new_counting_hash(12, 1000)
+ hi = khmer.CountingHash(12, 1000, 2)
+
hi.consume_fasta(inpath)
hi.save(savepath)
@@ -557,27 +644,27 @@ def test_load_truncated_should_fail():
fp.write(data[:1000])
fp.close()
- hi = khmer.new_counting_hash(12, 1)
+ hi = khmer._CountingHash(12, [1])
try:
hi.load(savepath)
assert 0, "load should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_save_load_tagset_notexist():
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer._Hashbits(32, [1])
outfile = utils.get_temp_filename('tagset')
try:
ht.load_tagset(outfile)
assert 0, "this test should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_save_load_tagset_trunc():
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer._Hashbits(32, [1])
outfile = utils.get_temp_filename('tagset')
@@ -590,16 +677,18 @@ def test_save_load_tagset_trunc():
data = fp.read()
fp.close()
- fp = open(outfile, 'wb')
- fp.write(data[:26])
- fp.close()
+ for i in range(len(data)):
+ fp = open(outfile, 'wb')
+ fp.write(data[:i])
+ fp.close()
+
+ # try loading it...
+ try:
+ ht.load_tagset(outfile)
+ assert 0, "this test should fail"
+ except IOError as err:
+ print(str(err), i)
- # try loading it...
- try:
- ht.load_tagset(outfile)
- assert 0, "this test should fail"
- except IOError:
- pass
# to build the test files used below, add 'test' to this function
# and then look in /tmp. You will need to tweak the version info in
@@ -610,13 +699,13 @@ def _build_testfiles():
# hashbits file
inpath = utils.get_test_data('random-20-a.fa')
- hi = khmer.new_hashbits(12, 2)
+ hi = khmer.Hashbits(12, 2)
hi.consume_fasta(inpath)
hi.save('/tmp/goodversion-k12.ht')
# tagset file
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer._Hashbits(32, [1])
ht.add_tag('A' * 32)
ht.add_tag('G' * 32)
@@ -626,7 +715,7 @@ def _build_testfiles():
fakelump_fa = utils.get_test_data('fakelump.fa')
- ht = khmer.new_hashbits(32, 4, 4)
+ ht = khmer.Hashbits(32, 4, 4)
ht.consume_fasta_and_tag(fakelump_fa)
subset = ht.do_subset_partition(0, 0)
@@ -635,7 +724,7 @@ def _build_testfiles():
EXCURSION_DISTANCE = 40
EXCURSION_KMER_THRESHOLD = 82
EXCURSION_KMER_COUNT_THRESHOLD = 1
- counting = khmer.new_counting_hash(32, 4, 4)
+ counting = khmer.CountingHash(32, 4, 4)
ht.repartition_largest_partition(None, counting,
EXCURSION_DISTANCE,
@@ -646,7 +735,7 @@ def _build_testfiles():
def test_hashbits_file_version_check():
- ht = khmer.new_hashbits(12, 1, 1)
+ ht = khmer._Hashbits(12, [1])
inpath = utils.get_test_data('badversion-k12.ht')
@@ -654,25 +743,25 @@ def test_hashbits_file_version_check():
ht.load(inpath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_hashbits_file_type_check():
- kh = khmer.new_counting_hash(12, 1, 1)
+ kh = khmer._CountingHash(12, [1])
savepath = utils.get_temp_filename('tempcountingsave0.ct')
kh.save(savepath)
- ht = khmer.new_hashbits(12, 1, 1)
+ ht = khmer._Hashbits(12, [1])
try:
ht.load(savepath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_stoptags_file_version_check():
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer._Hashbits(32, [1])
inpath = utils.get_test_data('badversion-k32.stoptags')
@@ -680,33 +769,33 @@ def test_stoptags_file_version_check():
ht.load_stop_tags(inpath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_stoptags_ksize_check():
- ht = khmer.new_hashbits(31, 1, 1)
+ ht = khmer._Hashbits(31, [1])
inpath = utils.get_test_data('goodversion-k32.stoptags')
try:
ht.load_stop_tags(inpath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_stop_tags_filetype_check():
- ht = khmer.new_hashbits(31, 1, 1)
+ ht = khmer._Hashbits(31, [1])
inpath = utils.get_test_data('goodversion-k32.tagset')
try:
ht.load_stop_tags(inpath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_tagset_file_version_check():
- ht = khmer.new_hashbits(32, 1, 1)
+ ht = khmer._Hashbits(32, [1])
inpath = utils.get_test_data('badversion-k32.tagset')
@@ -714,29 +803,48 @@ def test_tagset_file_version_check():
ht.load_tagset(inpath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
+
+
+def test_stop_tags_truncate_check():
+ ht = khmer._Hashbits(32, [1])
+
+ inpath = utils.get_test_data('goodversion-k32.tagset')
+ data = open(inpath, 'rb').read()
+
+ truncpath = utils.get_temp_filename('zzz')
+ for i in range(len(data)):
+ fp = open(truncpath, 'wb')
+ fp.write(data[:i])
+ fp.close()
+
+ try:
+ ht.load_stop_tags(truncpath)
+ assert 0, "expect failure of previous command"
+ except IOError as e:
+ print(i, str(e))
def test_tagset_ksize_check():
- ht = khmer.new_hashbits(31, 1, 1)
+ ht = khmer._Hashbits(31, [1])
inpath = utils.get_test_data('goodversion-k32.tagset')
try:
ht.load_tagset(inpath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_tagset_filetype_check():
- ht = khmer.new_hashbits(31, 1, 1)
+ ht = khmer._Hashbits(31, [1])
inpath = utils.get_test_data('goodversion-k32.stoptags')
try:
ht.load_tagset(inpath)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_bad_primes_list():
@@ -744,21 +852,42 @@ def test_bad_primes_list():
coutingtable = khmer._Hashbits(31, ["a", "b", "c"], 1)
assert 0, "Bad primes list should fail"
except TypeError as e:
- print str(e)
+ print(str(e))
def test_consume_absentfasta_with_reads_parser():
- presencetable = khmer.new_hashbits(31, 1, 1)
+ presencetable = khmer._Hashbits(31, [1])
try:
presencetable.consume_fasta_with_reads_parser()
assert 0, "this should fail"
except TypeError as err:
- print str(err)
+ print(str(err))
try:
readparser = ReadParser(utils.get_test_data('empty-file'))
presencetable.consume_fasta_with_reads_parser(readparser)
assert 0, "this should fail"
except IOError as err:
- print str(err)
+ print(str(err))
except ValueError as err:
- print str(err)
+ print(str(err))
+
+
+def test_bad_primes():
+ try:
+ countingtable = khmer._Hashbits.__new__(
+ khmer._Hashbits, 6, ["a", "b", "c"])
+ assert 0, "this should fail"
+ except TypeError as e:
+ print(str(e))
+
+
+def test_consume_fasta_and_tag_with_badreads_parser():
+ presencetable = khmer.Hashbits(6, 1e6, 2)
+ try:
+ readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa"))
+ presencetable.consume_fasta_and_tag_with_reads_parser(readsparser)
+ assert 0, "this should fail"
+ except IOError as e:
+ print(str(e))
+ except ValueError as e:
+ print(str(e))
diff --git a/tests/test_hashbits_obj.py b/tests/test_hashbits_obj.py
deleted file mode 100644
index 2d3ebd1..0000000
--- a/tests/test_hashbits_obj.py
+++ /dev/null
@@ -1,563 +0,0 @@
-#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
-# the three-clause BSD license; see LICENSE.
-# Contact: khmer-project at idyll.org
-#
-# pylint: disable=missing-docstring,protected-access
-
-#
-# This is an exact copy of test_hashbits, with all invocations of
-# khmer.new_hashbits replaced by khmer.Hashbits constructor calls
-#
-
-import khmer
-from khmer import Hashbits
-
-from screed.fasta import fasta_iter
-import screed
-
-import khmer_tst_utils as utils
-from nose.plugins.attrib import attr
-
-
-def teardown():
- utils.cleanup()
-
-
-@attr('linux')
-def test_toobig():
- try:
- pt = khmer.Hashbits(32, 1e13, 1)
- assert 0, "This should fail"
- except MemoryError as err:
- print str(err)
-
-
-def test__get_set_tag_density():
- ht = khmer.Hashbits(32, 1, 1)
-
- orig = ht._get_tag_density()
- assert orig != 2
- ht._set_tag_density(2)
- assert ht._get_tag_density() == 2
-
-
-def test_n_occupied_1():
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 1 # number of hashtables
-
- # test modified c++ n_occupied code
- ht1 = khmer.Hashbits(K, HT_SIZE, N_HT)
-
- for n, record in enumerate(fasta_iter(open(filename))):
- ht1.consume(record['sequence'])
-
- # this number calculated independently
- assert ht1.n_occupied() == 3877
-
-
-def test_bloom_python_1():
- # test python code to count unique kmers using bloom filter
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht2 = khmer.Hashbits(K, HT_SIZE, N_HT)
-
- n_unique = 0
- for n, record in enumerate(fasta_iter(open(filename))):
- sequence = record['sequence']
- seq_len = len(sequence)
- for n in range(0, seq_len + 1 - K):
- kmer = sequence[n:n + K]
- if (not ht2.get(kmer)):
- n_unique += 1
- ht2.count(kmer)
-
- assert n_unique == 3960
- assert ht2.n_occupied() == 3882
- assert ht2.n_unique_kmers() == 3960 # this number equals to n_unique
-
-
-def test_bloom_c_1():
- # test c++ code to count unique kmers using bloom filter
-
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht3 = khmer.Hashbits(K, HT_SIZE, N_HT)
-
- for n, record in enumerate(fasta_iter(open(filename))):
- ht3.consume(record['sequence'])
-
- assert ht3.n_occupied() == 3882
- assert ht3.n_unique_kmers() == 3960
-
-
-def test_n_occupied_2(): # simple one
- K = 4
- HT_SIZE = 10 # use 11
- N_HT = 1
-
- ht1 = khmer.Hashbits(K, HT_SIZE, N_HT)
- ht1.count('AAAA') # 00 00 00 00 = 0
- assert ht1.n_occupied() == 1
-
- ht1.count('ACTG') # 00 10 01 11 =
- assert ht1.n_occupied() == 2
-
- ht1.count('AACG') # 00 00 10 11 = 11 # collision 1
-
- assert ht1.n_occupied() == 2
- ht1.count('AGAC') # 00 11 00 10 # collision 2
- assert ht1.n_occupied() == 2
-
-
-def test_bloom_c_2(): # simple one
- K = 4
- HT_SIZE = 10 # use 11
- N_HT1 = 1 # hashtable size = 11
- N_HT2 = 2 # hashtable size = 11,13
-
- # use only 1 hashtable, no bloom filter
- ht1 = khmer.Hashbits(K, HT_SIZE, N_HT1)
- ht1.count('AAAA') # 00 00 00 00 = 0
- ht1.count('ACTG') # 00 10 01 11 =
- assert ht1.n_unique_kmers() == 2
- ht1.count('AACG') # 00 00 10 11 = 11 # collision with 1st kmer
- assert ht1.n_unique_kmers() == 2
- ht1.count('AGAC') # 00 11 00 10 # collision with 2nd kmer
- assert ht1.n_unique_kmers() == 2
-
- # use two hashtables with 11,13
- ht2 = khmer.Hashbits(K, HT_SIZE, N_HT2)
- ht2.count('AAAA') # 00 00 00 00 = 0
-
- ht2.count('ACTG') # 00 10 01 11 = 2*16 +4 +3 = 39
- assert ht2.n_unique_kmers() == 2
- ht2.count('AACG') # 00 00 10 11 = 11 # collision with only 1st kmer
- assert ht2.n_unique_kmers() == 3
- ht2.count('AGAC') # 00 11 00 10 3*16 +2 = 50
- # collision with both 2nd and 3rd kmers
-
- assert ht2.n_unique_kmers() == 3
-
-
-def test_filter_if_present():
- ht = khmer.Hashbits(32, 1e6, 2)
-
- maskfile = utils.get_test_data('filter-test-A.fa')
- inputfile = utils.get_test_data('filter-test-B.fa')
- outfile = utils.get_temp_filename('filter')
-
- ht.consume_fasta(maskfile)
- ht.filter_if_present(inputfile, outfile)
-
- records = list(fasta_iter(open(outfile)))
- assert len(records) == 1
- assert records[0]['name'] == '3'
-
-
-def test_combine_pe():
- inpfile = utils.get_test_data('combine_parts_1.fa')
- ht = khmer.Hashbits(32, 1, 1)
-
- ht.consume_partitioned_fasta(inpfile)
- assert ht.count_partitions() == (2, 0)
-
- s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
- pid1 = ht.get_partition_id(s1)
-
- s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
- pid2 = ht.get_partition_id(s2)
-
- assert pid1 == 2
- assert pid2 == 80293
-
- ht.join_partitions(pid1, pid2)
-
- pid1 = ht.get_partition_id(s1)
- pid2 = ht.get_partition_id(s2)
-
- assert pid1 == pid2
- assert ht.count_partitions() == (1, 0)
-
-
-def test_load_partitioned():
- inpfile = utils.get_test_data('combine_parts_1.fa')
- ht = khmer.Hashbits(32, 1, 1)
-
- ht.consume_partitioned_fasta(inpfile)
- assert ht.count_partitions() == (2, 0)
-
- s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
- assert ht.get(s1)
-
- s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
- assert ht.get(s2)
-
- s3 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]
- assert ht.get(s3)
-
-
-def test_count_within_radius_simple():
- inpfile = utils.get_test_data('all-A.fa')
- ht = khmer.Hashbits(4, 1e6, 2)
-
- print ht.consume_fasta(inpfile)
- n = ht.count_kmers_within_radius('AAAA', 1)
- assert n == 1
-
- n = ht.count_kmers_within_radius('AAAA', 10)
- assert n == 1
-
-
-def test_count_within_radius_big():
- inpfile = utils.get_test_data('random-20-a.fa')
- ht = khmer.Hashbits(20, 1e6, 4)
-
- ht.consume_fasta(inpfile)
- n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
- assert n == 3960
-
- ht = khmer.Hashbits(21, 1e6, 4)
- ht.consume_fasta(inpfile)
- n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
- assert n == 39
-
-
-def test_count_kmer_degree():
- inpfile = utils.get_test_data('all-A.fa')
- ht = khmer.Hashbits(4, 1e6, 2)
- ht.consume_fasta(inpfile)
-
- assert ht.kmer_degree('AAAA') == 2
- assert ht.kmer_degree('AAAT') == 1
- assert ht.kmer_degree('AATA') == 0
- assert ht.kmer_degree('TAAA') == 1
-
-
-def test_save_load_tagset():
- ht = khmer.Hashbits(32, 1, 1)
-
- outfile = utils.get_temp_filename('tagset')
-
- ht.add_tag('A' * 32)
- ht.save_tagset(outfile)
-
- ht.add_tag('G' * 32)
-
- ht.load_tagset(outfile) # implicitly => clear_tags=True
- ht.save_tagset(outfile)
-
- # if tags have been cleared, then the new tagfile will be larger (34 bytes)
- # else smaller (26 bytes).
-
- fp = open(outfile, 'rb')
- data = fp.read()
- fp.close()
- assert len(data) == 26, len(data)
-
-
-def test_save_load_tagset_noclear():
- ht = khmer.Hashbits(32, 1, 1)
-
- outfile = utils.get_temp_filename('tagset')
-
- ht.add_tag('A' * 32)
- ht.save_tagset(outfile)
-
- ht.add_tag('G' * 32)
-
- ht.load_tagset(outfile, False) # set clear_tags => False; zero tags
- ht.save_tagset(outfile)
-
- # if tags have been cleared, then the new tagfile will be large (34 bytes);
- # else small (26 bytes).
-
- fp = open(outfile, 'rb')
- data = fp.read()
- fp.close()
- assert len(data) == 34, len(data)
-
-
-def test_stop_traverse():
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
-
- # without tagging/joining across consume, this breaks into two partition;
- # with, it is one partition.
- ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
-
- ht.consume_fasta_and_tag(filename) # DO NOT join reads across stoptags
- subset = ht.do_subset_partition(0, 0, True)
- ht.merge_subset(subset)
-
- n, _ = ht.count_partitions()
- assert n == 2, n
-
-
-def test_tag_across_stoptraverse():
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
-
- # without tagging/joining across consume, this breaks into two partition;
- # with, it is one partition.
- ht.add_stop_tag('CCGAATATATAACAGCGACG')
-
- ht.consume_fasta_and_tag_with_stoptags(filename) # DO join reads across
-
- subset = ht.do_subset_partition(0, 0)
- n, _ = ht.count_partitions()
- assert n == 99 # reads only connected by traversal...
-
- n, _ = ht.subset_count_partitions(subset)
- assert n == 2 # but need main to cross stoptags.
-
- ht.merge_subset(subset)
-
- n, _ = ht.count_partitions() # ta-da!
- assert n == 1, n
-
-
-def test_notag_across_stoptraverse():
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
-
- # connecting k-mer at the beginning/end of a read: breaks up into two.
- ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
-
- ht.consume_fasta_and_tag_with_stoptags(filename)
-
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
-
- n, _ = ht.count_partitions()
- assert n == 2, n
-
-
-def test_find_stoptags():
- ht = khmer.Hashbits(5, 1, 1)
- ht.add_stop_tag("AAAAA")
-
- assert ht.identify_stoptags_by_position("AAAAA") == [0]
- assert ht.identify_stoptags_by_position("AAAAAA") == [0, 1]
- assert ht.identify_stoptags_by_position("TTTTT") == [0]
- assert ht.identify_stoptags_by_position("TTTTTT") == [0, 1]
-
-
-def test_find_stoptags2():
- ht = khmer.Hashbits(4, 1, 1)
- ht.add_stop_tag("ATGC")
-
- x = ht.identify_stoptags_by_position("ATGCATGCGCAT")
- assert x == [0, 2, 4, 8], x
-
-
-def test_get_ksize():
- kh = khmer.Hashbits(22, 1, 1)
- assert kh.ksize() == 22
-
-
-def test_get_hashsizes():
- kh = khmer.Hashbits(22, 100, 4)
- assert kh.hashsizes() == [101, 103, 107, 109], kh.hashsizes()
-
-
-def test_extract_unique_paths_0():
- kh = khmer.Hashbits(10, 1e5, 4)
-
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG']
-
- kh.consume('ATGGAGAGACACAGATAGACAGGAGTGGCGATG')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- assert not x
-
-
-def test_extract_unique_paths_1():
- kh = khmer.Hashbits(10, 1e5, 4)
-
- kh.consume('AGTGGCGATG')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
- assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGAT'] # all but the last k-mer
-
-
-def test_extract_unique_paths_2():
- kh = khmer.Hashbits(10, 1e5, 4)
-
- kh.consume('ATGGAGAGAC')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
- assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG'] # all but the 1st k-mer
-
-
-def test_extract_unique_paths_3():
- kh = khmer.Hashbits(10, 1e5, 4)
-
- kh.consume('ATGGAGAGAC')
- kh.consume('AGTGGCGATG')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
- # all but the 1st/last k-mer
- assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT']
-
-
-def test_extract_unique_paths_4():
- kh = khmer.Hashbits(10, 1e5, 4)
-
- kh.consume('ATGGAGAGAC')
- kh.consume('AGTGGCGATG')
-
- kh.consume('ATAGACAGGA')
-
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
- assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT']
-
-
-def test_find_unpart():
- filename = utils.get_test_data('random-20-a.odd.fa')
- filename2 = utils.get_test_data('random-20-a.even.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
- ht.consume_fasta_and_tag(filename)
-
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
-
- n, _ = ht.count_partitions()
- assert n == 49
-
- ht.find_unpart(filename2, True, False)
- n, _ = ht.count_partitions()
- assert n == 1, n # all sequences connect
-
-
-def test_find_unpart_notraverse():
- filename = utils.get_test_data('random-20-a.odd.fa')
- filename2 = utils.get_test_data('random-20-a.even.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
- ht.consume_fasta_and_tag(filename)
-
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
-
- n, _ = ht.count_partitions()
- assert n == 49
-
- ht.find_unpart(filename2, False, False) # <-- don't traverse
- n, _ = ht.count_partitions()
- assert n == 99, n # all sequences disconnected
-
-
-def test_find_unpart_fail():
- filename = utils.get_test_data('random-20-a.odd.fa')
- filename2 = utils.get_test_data('random-20-a.odd.fa') # <- switch to odd
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
- ht.consume_fasta_and_tag(filename)
-
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
-
- n, _ = ht.count_partitions()
- assert n == 49
-
- ht.find_unpart(filename2, True, False)
- n, _ = ht.count_partitions()
- assert n == 49, n # only 49 sequences worth of tags
-
-
-def test_simple_median():
- hi = khmer.Hashbits(6, 1e6, 2)
-
- (median, average, stddev) = hi.get_median_count("AAAAAA")
- print median, average, stddev
- assert median == 0
- assert average == 0.0
- assert stddev == 0.0
-
- hi.consume("AAAAAA")
- (median, average, stddev) = hi.get_median_count("AAAAAA")
- print median, average, stddev
- assert median == 1
- assert average == 1.0
- assert stddev == 0.0
-
-
-def test_badget():
- hbts = khmer.Hashbits(6, 1e6, 1)
-
- dna = "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAG"
-
- hbts.consume(dna)
-
- assert hbts.get("AGCTTT") == 1
-
- assert hbts.get("GATGAG") == 0
-
- try:
- hbts.get("AGCTT")
- assert 0, "this should fail"
- except ValueError as err:
- print str(err)
-
-
-def test_bad_primes():
- try:
- countingtable = khmer._Hashbits.__new__(
- khmer._Hashbits, 6, ["a", "b", "c"])
- assert 0, "this should fail"
- except TypeError as e:
- print str(e)
-
-
-def test_consume_fasta_and_tag_with_badreads_parser():
- presencetable = khmer.Hashbits(6, 1e6, 2)
- try:
- readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa"))
- presencetable.consume_fasta_and_tag_with_reads_parser(readsparser)
- assert 0, "this should fail"
- except IOError as e:
- print str(e)
- except ValueError as e:
- print str(e)
diff --git a/tests/test_hll.py b/tests/test_hll.py
index 266ce5d..ebde12d 100644
--- a/tests/test_hll.py
+++ b/tests/test_hll.py
@@ -1,5 +1,7 @@
+from __future__ import division, print_function, unicode_literals
+from __future__ import absolute_import
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2014-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -7,20 +9,19 @@
# pylint: disable=missing-docstring,protected-access
import math
-import string
import khmer
from screed.fasta import fasta_iter
-import khmer_tst_utils as utils
+from . import khmer_tst_utils as utils
from nose.tools import assert_raises
-TT = string.maketrans('ACGT', 'TGCA')
K = 20 # size of kmer
ERR_RATE = 0.01
N_UNIQUE = 3960
+TRANSLATE = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C'}
def teardown():
@@ -41,7 +42,7 @@ def test_hll_add_python():
seq_len = len(sequence)
for n in range(0, seq_len + 1 - K):
kmer = sequence[n:n + K]
- rc = kmer[::-1].translate(TT)
+ rc = "".join(TRANSLATE[c] for c in kmer[::-1])
hllcpp.add(kmer)
@@ -61,9 +62,12 @@ def test_hll_consume_string():
filename = utils.get_test_data('random-20-a.fa')
hllcpp = khmer.HLLCounter(ERR_RATE, K)
- for n, record in enumerate(fasta_iter(open(filename))):
- hllcpp.consume_string(record['sequence'])
+ n_consumed = 0
+ for n, record in enumerate(fasta_iter(open(filename)), 1):
+ n_consumed += hllcpp.consume_string(record['sequence'])
+ assert n == 99
+ assert n_consumed == 3960
assert abs(1 - float(hllcpp.estimate_cardinality()) / N_UNIQUE) < ERR_RATE
@@ -79,8 +83,10 @@ def test_hll_consume_fasta():
filename = utils.get_test_data('random-20-a.fa')
hllcpp = khmer.HLLCounter(ERR_RATE, K)
- hllcpp.consume_fasta(filename)
+ n, n_consumed = hllcpp.consume_fasta(filename)
+ assert n == 99
+ assert n_consumed == 3960
assert abs(1 - float(hllcpp.estimate_cardinality()) / N_UNIQUE) < ERR_RATE
@@ -90,10 +96,12 @@ def test_hll_consume_fasta_ep():
filename = utils.get_test_data('paired-mixed.fa')
hll = khmer.HLLCounter(0.36, 32)
- hll.consume_fasta(filename)
+ n, n_consumed = hll.consume_fasta(filename)
assert all(c != 0 for c in hll.counters)
assert len(hll) == 236
+ assert n == 11
+ assert n_consumed == 575
def test_hll_consume_fasta_estimate_bias():
@@ -104,17 +112,21 @@ def test_hll_consume_fasta_estimate_bias():
filename = utils.get_test_data("test-abund-read-3.fa")
hll = khmer.HLLCounter(0.36, K)
- hll.consume_fasta(filename)
+ n, n_consumed = hll.consume_fasta(filename)
assert all(c != 0 for c in hll.counters)
assert len(hll) == 79
+ assert n == 21
+ assert n_consumed == 1176
def test_hll_len():
filename = utils.get_test_data('random-20-a.fa')
hllcpp = khmer.HLLCounter(ERR_RATE, K)
- hllcpp.consume_fasta(filename)
+ n, n_consumed = hllcpp.consume_fasta(filename)
+ assert n == 99
+ assert n_consumed == 3960
assert hllcpp.estimate_cardinality() == len(hllcpp)
@@ -225,7 +237,7 @@ def test_hll_change_ksize():
hllcpp.ksize = 24
assert hllcpp.ksize == 24
- hllcpp.ksize = 12L
+ hllcpp.ksize = 12
assert hllcpp.ksize == 12
with assert_raises(ValueError):
@@ -248,3 +260,43 @@ def test_hll_get_counters():
counters = hll.counters
assert len(counters) == 2 ** 4
assert all(c == 0 for c in counters)
+
+
+def test_hll_merge_1():
+ hll = khmer.HLLCounter(0.36, K)
+ hll2 = khmer.HLLCounter(0.36, K - 1)
+
+ try:
+ hll.merge(hll2)
+ assert 0, "previous statement should fail with a ValueError"
+ except ValueError as err:
+ print(str(err))
+
+
+def test_hll_merge_2():
+ hll = khmer.HLLCounter(0.10, K)
+ hll2 = khmer.HLLCounter(0.36, K)
+
+ try:
+ hll.merge(hll2)
+ assert 0, "previous statement should fail with a ValueError"
+ except ValueError as err:
+ print(str(err))
+
+
+def test_hll_merge_3():
+ hll = khmer.HLLCounter(0.36, 32)
+ hll2 = khmer.HLLCounter(0.36, 32)
+
+ filename = utils.get_test_data('paired-mixed.fa')
+ hll = khmer.HLLCounter(0.36, 32)
+ hll.consume_fasta(filename)
+
+ hll2 = khmer.HLLCounter(0.36, 32)
+ hll2.consume_fasta(filename)
+
+ assert len(hll) == 236
+ assert len(hll2) == 236
+
+ hll.merge(hll2)
+ assert len(hll) == 236
diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py
index 36c9e7b..59a56b0 100644
--- a/tests/test_labelhash.py
+++ b/tests/test_labelhash.py
@@ -1,16 +1,19 @@
+from __future__ import print_function
+from __future__ import absolute_import
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
# pylint: disable=missing-docstring,protected-access
+import os
import khmer
-from khmer import LabelHash
+from khmer import LabelHash, CountingLabelHash
from screed.fasta import fasta_iter
import screed
-import khmer_tst_utils as utils
+from . import khmer_tst_utils as utils
from nose.plugins.attrib import attr
@@ -22,13 +25,22 @@ def teardown():
# * thread-safety
-@attr('linux')
+@attr('huge')
def test_toobig():
try:
lh = LabelHash(20, 1e13, 1)
assert 0, "This should fail."
except MemoryError as err:
- print str(err)
+ print(str(err))
+
+
+def test_error_create():
+ from khmer import _LabelHash
+ try:
+ lh = _LabelHash(None)
+ assert 0, "This should fail."
+ except ValueError as err:
+ print(str(err))
def test_n_labels():
@@ -36,7 +48,7 @@ def test_n_labels():
filename = utils.get_test_data('test-labels.fa')
lh.consume_fasta_and_tag_with_labels(filename)
- print lh.n_labels()
+ print(lh.n_labels())
assert lh.n_labels() == 4
@@ -53,6 +65,100 @@ def test_get_label_dict():
assert a_label in expected
+def test_get_label_dict_save_load():
+ lb_pre = LabelHash(20, 1e7, 4)
+ filename = utils.get_test_data('test-labels.fa')
+ lb_pre.consume_fasta_and_tag_with_labels(filename)
+
+ # save labels to a file
+ savepath = utils.get_temp_filename('saved.labels')
+ lb_pre.save_labels_and_tags(savepath)
+
+ # trash the old LabelHash
+ del lb_pre
+
+ # create new, load labels & tags
+ lb = LabelHash(20, 1e7, 4)
+ lb.load_labels_and_tags(savepath)
+
+ labels = lb.get_label_dict()
+ expected = [0, 1, 2, 3]
+ for e_label in expected:
+ assert e_label in labels
+ for a_label in labels:
+ assert a_label in expected
+
+
+def test_get_label_dict_save_load_wrong_ksize():
+ lb_pre = LabelHash(19, 1e7, 4)
+ filename = utils.get_test_data('test-labels.fa')
+ lb_pre.consume_fasta_and_tag_with_labels(filename)
+
+ # save labels to a file
+ savepath = utils.get_temp_filename('saved.labels')
+ lb_pre.save_labels_and_tags(savepath)
+
+ # trash the old LabelHash
+ del lb_pre
+
+ # create new, load labels & tags
+ lb = LabelHash(20, 1e7, 4)
+ try:
+ lb.load_labels_and_tags(savepath)
+ assert 0, "this should not succeed - different ksize"
+ except IOError as err:
+ print(str(err))
+ assert "Incorrect k-mer size 19" in str(err)
+
+
+def test_save_load_corrupted():
+ lb_pre = LabelHash(20, 1e7, 4)
+ filename = utils.get_test_data('test-labels.fa')
+ lb_pre.consume_fasta_and_tag_with_labels(filename)
+
+ # save labels to a file
+ savepath = utils.get_temp_filename('saved.labels')
+ lb_pre.save_labels_and_tags(savepath)
+
+ # trash the old LabelHash
+ del lb_pre
+
+ lb = LabelHash(20, 1e7, 4)
+
+ # produce all possible truncated versions of this file
+ data = open(savepath, 'rb').read()
+ for i in range(len(data)):
+ truncated = utils.get_temp_filename('trunc.labels')
+ fp = open(truncated, 'wb')
+ fp.write(data[:i])
+ fp.close()
+
+ try:
+ lb.load_labels_and_tags(truncated)
+ assert 0, "this should not succeed -- truncated file len %d" % (i,)
+ except IOError as err:
+ print('expected failure for', i, ': ', str(err))
+
+
+def test_save_fail_readonly():
+ lb_pre = LabelHash(20, 1e7, 4)
+ filename = utils.get_test_data('test-labels.fa')
+ lb_pre.consume_fasta_and_tag_with_labels(filename)
+
+ # save labels to a file
+ savepath = utils.get_temp_filename('saved.labels')
+ fp = open(savepath, 'w')
+ fp.close()
+
+ os.chmod(savepath, 0x444)
+
+ try:
+ lb_pre.save_labels_and_tags(savepath)
+ assert 0, "this should fail: read-only file"
+ except IOError as err:
+ print(str(err))
+
+
def test_get_tag_labels():
lb = LabelHash(20, 1e7, 4)
filename = utils.get_test_data('single-read.fq')
@@ -70,22 +176,22 @@ def test_consume_fasta_and_tag_with_labels():
filename = utils.get_test_data('test-transcript.fa')
total_reads, n_consumed = lb.consume_fasta_and_tag_with_labels(filename)
- print "doing get"
- assert lb.get(read_1[:20])
+ print("doing get")
+ assert lb.graph.get(read_1[:20])
assert total_reads == 3
- print "doing n_labels"
- print lb.n_labels()
- print "doing label dict"
- print lb.get_label_dict()
- print "get tagset"
- for tag in lb.get_tagset():
- print "forward hash"
- print tag, khmer.forward_hash(tag, 20)
+ print("doing n_labels")
+ print(lb.n_labels())
+ print("doing label dict")
+ print(lb.get_label_dict())
+ print("get tagset")
+ for tag in lb.graph.get_tagset():
+ print("forward hash")
+ print(tag, khmer.forward_hash(tag, 20))
for record in screed.open(filename):
- print "Sweeping tags"
- print lb.sweep_tag_neighborhood(record.sequence, 40)
- print "Sweeping labels..."
- print lb.sweep_label_neighborhood(record.sequence, 40)
+ print("Sweeping tags")
+ print(lb.sweep_tag_neighborhood(record.sequence, 40))
+ print("Sweeping labels...")
+ print(lb.sweep_label_neighborhood(record.sequence, 40))
assert lb.n_labels() == 3
@@ -99,8 +205,8 @@ def test_consume_partitioned_fasta_and_tag_with_labels():
for record in screed.open(filename):
seq = record.sequence
labels.update(lb.sweep_label_neighborhood(seq, 0, False, False))
- # print lb.n_labels()
- # print labels
+ # print(lb.n_labels())
+ # print(labels)
assert len(labels) == 1
assert labels.pop() == 2
assert lb.n_labels() == 1
@@ -122,7 +228,7 @@ def test_consume_sequence_and_tag_with_labels():
def test_sweep_tag_neighborhood():
lb = LabelHash(20, 1e7, 4)
filename = utils.get_test_data('single-read.fq')
- lb.consume_fasta_and_tag(filename)
+ lb.graph.consume_fasta_and_tag(filename)
tags = lb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
assert len(tags) == 1
@@ -156,11 +262,11 @@ def test_label_tag_correctness():
labels = lb.sweep_label_neighborhood(
'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
- print lb.sweep_tag_neighborhood(
+ print(lb.sweep_tag_neighborhood(
'TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
- 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')
- print labels
- print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19
+ 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT'))
+ print(labels)
+ print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19)
assert len(labels) == 2
assert 0 in labels
assert 1 in labels
@@ -169,7 +275,7 @@ def test_label_tag_correctness():
labels = lb.sweep_label_neighborhood(
'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG'
'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
- print labels
+ print(labels)
assert len(labels) == 3
assert 0 in labels
assert 1 in labels
@@ -180,7 +286,7 @@ def test_label_tag_correctness():
'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG'
'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA'
'ACAACACATACA')
- print labels
+ print(labels)
assert len(labels) == 2
assert 1 in labels
assert 2 in labels
@@ -188,508 +294,144 @@ def test_label_tag_correctness():
# read D
labels = lb.sweep_label_neighborhood(
'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
- print labels
+ print(labels)
assert len(labels) == 1
assert 3 in labels
-#
-# Begin Hashbits tests
-#
-
-
-def test__get_set_tag_density():
- ht = khmer.LabelHash(32, 1, 1)
-
- orig = ht._get_tag_density()
- assert orig != 2
- ht._set_tag_density(2)
- assert ht._get_tag_density() == 2
-
-
-def test_n_occupied_1():
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 1 # number of hashtables
-
- # test modified c++ n_occupied code
- ht1 = khmer.LabelHash(K, HT_SIZE, N_HT)
-
- for n, record in enumerate(fasta_iter(open(filename))):
- ht1.consume(record['sequence'])
-
- # this number calculated independently
- assert ht1.n_occupied() == 3877
-
-
-def test_bloom_python_1():
- # test python code to count unique kmers using bloom filter
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht2 = khmer.LabelHash(K, HT_SIZE, N_HT)
-
- n_unique = 0
- for n, record in enumerate(fasta_iter(open(filename))):
- sequence = record['sequence']
- seq_len = len(sequence)
- for n in range(0, seq_len + 1 - K):
- kmer = sequence[n:n + K]
- if (not ht2.get(kmer)):
- n_unique += 1
- ht2.count(kmer)
-
- assert n_unique == 3960
- assert ht2.n_occupied() == 3882
- assert ht2.n_unique_kmers() == 3960 # this number equals to n_unique
-
-
-def test_bloom_c_1():
- # test c++ code to count unique kmers using bloom filter
-
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht3 = khmer.LabelHash(K, HT_SIZE, N_HT)
-
- for n, record in enumerate(fasta_iter(open(filename))):
- ht3.consume(record['sequence'])
-
- assert ht3.n_occupied() == 3882
- assert ht3.n_unique_kmers() == 3960
-
-
-def test_n_occupied_2(): # simple one
- K = 4
- HT_SIZE = 10 # use 11
- N_HT = 1
-
- ht1 = khmer.LabelHash(K, HT_SIZE, N_HT)
- ht1.count('AAAA') # 00 00 00 00 = 0
- assert ht1.n_occupied() == 1
-
- ht1.count('ACTG') # 00 10 01 11 =
- assert ht1.n_occupied() == 2
-
- ht1.count('AACG') # 00 00 10 11 = 11 # collision 1
-
- assert ht1.n_occupied() == 2
- ht1.count('AGAC') # 00 11 00 10 # collision 2
- assert ht1.n_occupied() == 2
-
-
-def test_bloom_c_2(): # simple one
- K = 4
- HT_SIZE = 10 # use 11
- N_HT1 = 1 # hashtable size = 11
- N_HT2 = 2 # hashtable size = 11,13
-
- # use only 1 hashtable, no bloom filter
- ht1 = khmer.LabelHash(K, HT_SIZE, N_HT1)
- ht1.count('AAAA') # 00 00 00 00 = 0
- ht1.count('ACTG') # 00 10 01 11 =
- assert ht1.n_unique_kmers() == 2
- ht1.count('AACG') # 00 00 10 11 = 11 # collision with 1st kmer
- assert ht1.n_unique_kmers() == 2
- ht1.count('AGAC') # 00 11 00 10 # collision with 2nd kmer
- assert ht1.n_unique_kmers() == 2
-
- # use two hashtables with 11,13
- ht2 = khmer.LabelHash(K, HT_SIZE, N_HT2)
- ht2.count('AAAA') # 00 00 00 00 = 0
-
- ht2.count('ACTG') # 00 10 01 11 = 2*16 +4 +3 = 39
- assert ht2.n_unique_kmers() == 2
- ht2.count('AACG') # 00 00 10 11 = 11 # collision with only 1st kmer
- assert ht2.n_unique_kmers() == 3
- ht2.count('AGAC') # 00 11 00 10 3*16 +2 = 50
- # collision with both 2nd and 3rd kmers
-
- assert ht2.n_unique_kmers() == 3
-
-
-def test_filter_if_present():
- ht = khmer.LabelHash(32, 1e6, 2)
-
- maskfile = utils.get_test_data('filter-test-A.fa')
- inputfile = utils.get_test_data('filter-test-B.fa')
- outfile = utils.get_temp_filename('filter')
-
- ht.consume_fasta(maskfile)
- ht.filter_if_present(inputfile, outfile)
-
- records = list(fasta_iter(open(outfile)))
- assert len(records) == 1
- assert records[0]['name'] == '3'
-
-
-def test_combine_pe():
- inpfile = utils.get_test_data('combine_parts_1.fa')
- ht = khmer.LabelHash(32, 1, 1)
-
- ht.consume_partitioned_fasta(inpfile)
- assert ht.count_partitions() == (2, 0)
-
- s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
- pid1 = ht.get_partition_id(s1)
-
- s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
- pid2 = ht.get_partition_id(s2)
-
- assert pid1 == 2
- assert pid2 == 80293
-
- ht.join_partitions(pid1, pid2)
-
- pid1 = ht.get_partition_id(s1)
- pid2 = ht.get_partition_id(s2)
-
- assert pid1 == pid2
- assert ht.count_partitions() == (1, 0)
-
-
-def test_load_partitioned():
- inpfile = utils.get_test_data('combine_parts_1.fa')
- ht = khmer.LabelHash(32, 1, 1)
-
- ht.consume_partitioned_fasta(inpfile)
- assert ht.count_partitions() == (2, 0)
-
- s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
- assert ht.get(s1)
-
- s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
- assert ht.get(s2)
-
- s3 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]
- assert ht.get(s3)
-
-
-def test_count_within_radius_simple():
- inpfile = utils.get_test_data('all-A.fa')
- ht = khmer.LabelHash(4, 1e6, 2)
-
- print ht.consume_fasta(inpfile)
- n = ht.count_kmers_within_radius('AAAA', 1)
- assert n == 1
-
- n = ht.count_kmers_within_radius('AAAA', 10)
- assert n == 1
-
-
-def test_count_within_radius_big():
- inpfile = utils.get_test_data('random-20-a.fa')
- ht = khmer.LabelHash(20, 1e6, 4)
-
- ht.consume_fasta(inpfile)
- n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
- assert n == 3960
-
- ht = khmer.LabelHash(21, 1e6, 4)
- ht.consume_fasta(inpfile)
- n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
- assert n == 39
-
-
-def test_count_kmer_degree():
- inpfile = utils.get_test_data('all-A.fa')
- ht = khmer.LabelHash(4, 1e6, 2)
- ht.consume_fasta(inpfile)
-
- assert ht.kmer_degree('AAAA') == 2
- assert ht.kmer_degree('AAAT') == 1
- assert ht.kmer_degree('AATA') == 0
- assert ht.kmer_degree('TAAA') == 1
-
-
-def test_save_load_tagset():
- ht = khmer.LabelHash(32, 1, 1)
-
- outfile = utils.get_temp_filename('tagset')
-
- ht.add_tag('A' * 32)
- ht.save_tagset(outfile)
-
- ht.add_tag('G' * 32)
-
- ht.load_tagset(outfile) # implicitly => clear_tags=True
- ht.save_tagset(outfile)
-
- # if tags have been cleared, then the new tagfile will be larger (34 bytes)
- # else smaller (26 bytes).
-
- fp = open(outfile, 'rb')
- data = fp.read()
- fp.close()
- assert len(data) == 26, len(data)
-
-
-def test_save_load_tagset_noclear():
- ht = khmer.LabelHash(32, 1, 1)
-
- outfile = utils.get_temp_filename('tagset')
-
- ht.add_tag('A' * 32)
- ht.save_tagset(outfile)
-
- ht.add_tag('G' * 32)
-
- ht.load_tagset(outfile, False) # set clear_tags => False; zero tags
- ht.save_tagset(outfile)
-
- # if tags have been cleared, then the new tagfile will be large (34 bytes);
- # else small (26 bytes).
-
- fp = open(outfile, 'rb')
- data = fp.read()
- fp.close()
- assert len(data) == 34, len(data)
-
-
-def test_stop_traverse():
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht = khmer.LabelHash(K, HT_SIZE, N_HT)
-
- # without tagging/joining across consume, this breaks into two partition;
- # with, it is one partition.
- ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
-
- ht.consume_fasta_and_tag(filename) # DO NOT join reads across stoptags
- subset = ht.do_subset_partition(0, 0, True)
- ht.merge_subset(subset)
-
- n, _ = ht.count_partitions()
- assert n == 2, n
-
-
-def test_tag_across_stoptraverse():
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht = khmer.LabelHash(K, HT_SIZE, N_HT)
-
- # without tagging/joining across consume, this breaks into two partition;
- # with, it is one partition.
- ht.add_stop_tag('CCGAATATATAACAGCGACG')
-
- ht.consume_fasta_and_tag_with_stoptags(filename) # DO join reads across
-
- subset = ht.do_subset_partition(0, 0)
- n, _ = ht.count_partitions()
- assert n == 99 # reads only connected by traversal...
-
- n, _ = ht.subset_count_partitions(subset)
- assert n == 2 # but need main to cross stoptags.
-
- ht.merge_subset(subset)
-
- n, _ = ht.count_partitions() # ta-da!
- assert n == 1, n
-
-
-def test_notag_across_stoptraverse():
- filename = utils.get_test_data('random-20-a.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht = khmer.LabelHash(K, HT_SIZE, N_HT)
-
- # connecting k-mer at the beginning/end of a read: breaks up into two.
- ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
-
- ht.consume_fasta_and_tag_with_stoptags(filename)
-
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
-
- n, _ = ht.count_partitions()
- assert n == 2, n
-
-
-def test_find_stoptags():
- ht = khmer.LabelHash(5, 1, 1)
- ht.add_stop_tag("AAAAA")
-
- assert ht.identify_stoptags_by_position("AAAAA") == [0]
- assert ht.identify_stoptags_by_position("AAAAAA") == [0, 1]
- assert ht.identify_stoptags_by_position("TTTTT") == [0]
- assert ht.identify_stoptags_by_position("TTTTTT") == [0, 1]
-
-
-def test_find_stoptags2():
- ht = khmer.LabelHash(4, 1, 1)
- ht.add_stop_tag("ATGC")
-
- x = ht.identify_stoptags_by_position("ATGCATGCGCAT")
- assert x == [0, 2, 4, 8], x
-
-
-def test_get_ksize():
- kh = khmer.LabelHash(22, 1, 1)
- assert kh.ksize() == 22
-
-
-def test_get_hashsizes():
- kh = khmer.LabelHash(22, 100, 4)
- assert kh.hashsizes() == [101, 103, 107, 109], kh.hashsizes()
-
-
-def test_extract_unique_paths_0():
- kh = khmer.LabelHash(10, 1e5, 4)
-
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG']
-
- kh.consume('ATGGAGAGACACAGATAGACAGGAGTGGCGATG')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- assert not x
-
-
-def test_extract_unique_paths_1():
- kh = khmer.LabelHash(10, 1e5, 4)
-
- kh.consume('AGTGGCGATG')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
- assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGAT'] # all but the last k-mer
-
-
-def test_extract_unique_paths_2():
- kh = khmer.LabelHash(10, 1e5, 4)
-
- kh.consume('ATGGAGAGAC')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
- assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG'] # all but the 1st k-mer
-
-
-def test_extract_unique_paths_3():
- kh = khmer.LabelHash(10, 1e5, 4)
-
- kh.consume('ATGGAGAGAC')
- kh.consume('AGTGGCGATG')
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
- # all but the 1st/last k-mer
- assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT']
-
-
-def test_extract_unique_paths_4():
- kh = khmer.LabelHash(10, 1e5, 4)
-
- kh.consume('ATGGAGAGAC')
- kh.consume('AGTGGCGATG')
-
- kh.consume('ATAGACAGGA')
-
- x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
- print x
- assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT']
-
-
-def test_find_unpart():
- filename = utils.get_test_data('random-20-a.odd.fa')
- filename2 = utils.get_test_data('random-20-a.even.fa')
-
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
-
- ht = khmer.LabelHash(K, HT_SIZE, N_HT)
- ht.consume_fasta_and_tag(filename)
-
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
-
- n, _ = ht.count_partitions()
- assert n == 49
-
- ht.find_unpart(filename2, True, False)
- n, _ = ht.count_partitions()
- assert n == 1, n # all sequences connect
+def test_counting_label_tag_correctness():
+ lb = CountingLabelHash(20, 1e7, 4)
+ filename = utils.get_test_data('test-labels.fa')
+ lb.consume_fasta_and_tag_with_labels(filename)
-def test_find_unpart_notraverse():
- filename = utils.get_test_data('random-20-a.odd.fa')
- filename2 = utils.get_test_data('random-20-a.even.fa')
+ # read A
+ labels = lb.sweep_label_neighborhood(
+ 'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
+ 'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
+ print(lb.sweep_tag_neighborhood(
+ 'TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
+ 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT'))
+ print(labels)
+ print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19)
+ assert len(labels) == 2
+ assert 0 in labels
+ assert 1 in labels
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
+ # read B
+ labels = lb.sweep_label_neighborhood(
+ 'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG'
+ 'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
+ print(labels)
+ assert len(labels) == 3
+ assert 0 in labels
+ assert 1 in labels
+ assert 2 in labels
- ht = khmer.LabelHash(K, HT_SIZE, N_HT)
- ht.consume_fasta_and_tag(filename)
+ # read C
+ labels = lb.sweep_label_neighborhood(
+ 'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG'
+ 'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA'
+ 'ACAACACATACA')
+ print(labels)
+ assert len(labels) == 2
+ assert 1 in labels
+ assert 2 in labels
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
+ # read D
+ labels = lb.sweep_label_neighborhood(
+ 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
+ print(labels)
+ assert len(labels) == 1
+ assert 3 in labels
- n, _ = ht.count_partitions()
- assert n == 49
- ht.find_unpart(filename2, False, False) # <-- don't traverse
- n, _ = ht.count_partitions()
- assert n == 99, n # all sequences disconnected
+def test_label_tag_correctness_save_load():
+ lb_pre = LabelHash(20, 1e7, 4)
+ filename = utils.get_test_data('test-labels.fa')
+ lb_pre.consume_fasta_and_tag_with_labels(filename)
+ # save labels to a file
+ savepath = utils.get_temp_filename('saved.labels')
+ lb_pre.save_labels_and_tags(savepath)
-def test_find_unpart_fail():
- filename = utils.get_test_data('random-20-a.odd.fa')
- filename2 = utils.get_test_data('random-20-a.odd.fa') # <- switch to odd
+ # trash the old LabelHash
+ del lb_pre
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
+ # create new, load labels & tags
+ lb = LabelHash(20, 1e7, 4)
+ lb.load_labels_and_tags(savepath)
- ht = khmer.LabelHash(K, HT_SIZE, N_HT)
- ht.consume_fasta_and_tag(filename)
+ # read A
+ labels = lb.sweep_label_neighborhood(
+ 'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
+ 'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
+ print(lb.sweep_tag_neighborhood(
+ 'TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
+ 'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT'))
+ print(labels)
+ print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19)
+ assert len(labels) == 2
+ assert 0 in labels
+ assert 1 in labels
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
+ # read B
+ labels = lb.sweep_label_neighborhood(
+ 'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG'
+ 'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
+ print(labels)
+ assert len(labels) == 3
+ assert 0 in labels
+ assert 1 in labels
+ assert 2 in labels
- n, _ = ht.count_partitions()
- assert n == 49
+ # read C
+ labels = lb.sweep_label_neighborhood(
+ 'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG'
+ 'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA'
+ 'ACAACACATACA')
+ print(labels)
+ assert len(labels) == 2
+ assert 1 in labels
+ assert 2 in labels
- ht.find_unpart(filename2, True, False)
- n, _ = ht.count_partitions()
- assert n == 49, n # only 49 sequences worth of tags
+ # read D
+ labels = lb.sweep_label_neighborhood(
+ 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
+ print(labels)
+ assert len(labels) == 1
+ assert 3 in labels
-def test_simple_median():
- hi = khmer.LabelHash(6, 1e6, 2)
+def test_load_wrong_filetype():
+ lb = LabelHash(20, 1e7, 4)
- (median, average, stddev) = hi.get_median_count("AAAAAA")
- print median, average, stddev
- assert median == 0
- assert average == 0.0
- assert stddev == 0.0
+ # try to load a tagset
+ filename = utils.get_test_data('goodversion-k32.tagset')
+ try:
+ lb.load_labels_and_tags(filename)
+ assert 0, "this should not succeed - bad file type"
+ except IOError as err:
+ print(str(err))
+ assert "Incorrect file format type" in str(err)
+
+ # try to load a nonsense file
+ filename = utils.get_test_data('all-A.fa')
+ try:
+ lb.load_labels_and_tags(filename)
+ assert 0, "this should not succeed - bad file signature"
+ except IOError as err:
+ print(str(err))
+ assert "Incorrect file signature" in str(err)
- hi.consume("AAAAAA")
- (median, average, stddev) = hi.get_median_count("AAAAAA")
- print median, average, stddev
- assert median == 1
- assert average == 1.0
- assert stddev == 0.0
+def test_load_wrong_fileversion():
+ lb = LabelHash(20, 1e7, 4)
-def test_bad_primes():
+ # try to load a tagset from an old version
+ filename = utils.get_test_data('badversion-k32.tagset')
try:
- hi = khmer._LabelHash.__new__(khmer.LabelHash, 6, ["a", "b", "c"])
- assert 0, "Non number prime list should fail"
- except TypeError as e:
- print str(e)
+ lb.load_labels_and_tags(filename)
+ assert 0, "this should not succeed - bad file type"
+ except IOError as err:
+ print(str(err))
+ assert "Incorrect file format version" in str(err)
diff --git a/tests/test_lump.py b/tests/test_lump.py
index 3d82471..c7eeb0d 100644
--- a/tests/test_lump.py
+++ b/tests/test_lump.py
@@ -1,6 +1,7 @@
+from __future__ import absolute_import
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -8,7 +9,7 @@
import khmer
import screed
-import khmer_tst_utils as utils
+from . import khmer_tst_utils as utils
from nose.plugins.attrib import attr
# Below, 'fakelump.fa' is an artificial data set of 3x1 kb sequences in
@@ -18,7 +19,7 @@ from nose.plugins.attrib import attr
def test_fakelump_together():
fakelump_fa = utils.get_test_data('fakelump.fa')
- ht = khmer.new_hashbits(32, 1e5, 4)
+ ht = khmer.Hashbits(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
subset = ht.do_subset_partition(0, 0)
@@ -34,7 +35,7 @@ def test_fakelump_stop():
fakelump_fa = utils.get_test_data('fakelump.fa')
fakelump_stoptags_txt = utils.get_test_data('fakelump.fa.stoptags.txt')
- ht = khmer.new_hashbits(32, 1e5, 4)
+ ht = khmer.Hashbits(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
for line in open(fakelump_stoptags_txt):
@@ -52,7 +53,7 @@ def test_fakelump_stop():
def test_fakelump_stop2():
fakelump_fa = utils.get_test_data('fakelump.fa')
- ht = khmer.new_hashbits(32, 1e5, 4)
+ ht = khmer.Hashbits(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
ht.add_stop_tag('GGGGAGGGGTGCAGTTGTGACTTGCTCGAGAG')
@@ -70,7 +71,7 @@ def test_fakelump_repartitioning():
fakelump_fa = utils.get_test_data('fakelump.fa')
fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')
- ht = khmer.new_hashbits(32, 1e5, 4)
+ ht = khmer.Hashbits(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
subset = ht.do_subset_partition(0, 0)
@@ -87,7 +88,7 @@ def test_fakelump_repartitioning():
EXCURSION_DISTANCE = 40
EXCURSION_KMER_THRESHOLD = 82
EXCURSION_KMER_COUNT_THRESHOLD = 1
- counting = khmer.new_counting_hash(32, 1e4, 4)
+ counting = khmer.CountingHash(32, 1e5, 4)
ht.repartition_largest_partition(None, counting,
EXCURSION_DISTANCE,
@@ -98,7 +99,7 @@ def test_fakelump_repartitioning():
# ok, now re-do everything with these stop tags, specifically.
- ht = khmer.new_hashbits(32, 1e5, 4)
+ ht = khmer.Hashbits(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
ht.load_stop_tags(fakelump_fa_foo)
@@ -106,14 +107,14 @@ def test_fakelump_repartitioning():
ht.merge_subset(subset)
(n_partitions, n_singletons) = ht.count_partitions()
- assert n_partitions == 3, n_partitions
+ assert n_partitions == 6, n_partitions
def test_fakelump_load_stop_tags_trunc():
fakelump_fa = utils.get_test_data('fakelump.fa')
fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')
- ht = khmer.new_hashbits(32, 1e5, 4)
+ ht = khmer.Hashbits(32, 1e5, 4)
ht.consume_fasta_and_tag(fakelump_fa)
subset = ht.do_subset_partition(0, 0)
@@ -130,7 +131,7 @@ def test_fakelump_load_stop_tags_trunc():
EXCURSION_DISTANCE = 40
EXCURSION_KMER_THRESHOLD = 82
EXCURSION_KMER_COUNT_THRESHOLD = 1
- counting = khmer.new_counting_hash(32, 4, 4)
+ counting = khmer._CountingHash(32, [5, 7, 11, 13])
ht.repartition_largest_partition(None, counting,
EXCURSION_DISTANCE,
@@ -138,14 +139,14 @@ def test_fakelump_load_stop_tags_trunc():
EXCURSION_KMER_COUNT_THRESHOLD)
ht.save_stop_tags(fakelump_fa_foo)
- data = open(fakelump_fa_foo).read()
+ data = open(fakelump_fa_foo, 'rb').read()
fp = open(fakelump_fa_foo, 'wb')
fp.write(data[:10])
fp.close()
# ok, now try loading these stop tags; should fail.
- ht = khmer.new_hashbits(32, 4, 4)
+ ht = khmer._Hashbits(32, [5, 7, 11, 13])
ht.consume_fasta_and_tag(fakelump_fa)
try:
@@ -159,7 +160,7 @@ def test_fakelump_load_stop_tags_notexist():
fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')
# ok, now try loading these stop tags; should fail.
- ht = khmer.new_hashbits(32, 4, 4)
+ ht = khmer._Hashbits(32, [5, 7, 11, 13])
try:
ht.load_stop_tags(fakelump_fa_foo)
diff --git a/tests/test_normalize_by_median.py b/tests/test_normalize_by_median.py
new file mode 100644
index 0000000..abdcc58
--- /dev/null
+++ b/tests/test_normalize_by_median.py
@@ -0,0 +1,627 @@
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import unicode_literals
+#
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
+# the three-clause BSD license; see LICENSE.
+# Contact: khmer-project at idyll.org
+#
+
+import os
+import shutil
+import threading
+import io
+
+import screed
+import khmer
+
+from . import khmer_tst_utils as utils
+from nose.plugins.attrib import attr
+from .test_scripts import _make_counting
+
+
+def test_normalize_by_median_indent():
+ infile = utils.get_test_data('paired-mixed.fa.pe')
+ hashfile = utils.get_test_data('normC20k20.ct')
+ outfile = utils.get_temp_filename('paired-mixed.fa.pe.keep')
+ script = 'normalize-by-median.py'
+ args = ['--loadtable', hashfile, '-o', outfile, infile]
+ (status, out, err) = utils.runscript(script, args)
+ assert status == 0, (out, err)
+ assert os.path.exists(outfile)
+
+
+def test_normalize_by_median():
+ CUTOFF = '1'
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-k', '17', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ assert 'Total number of unique k-mers: 98' in err, err
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+ seqs = [r.sequence for r in screed.open(outfile)]
+ assert len(seqs) == 1, seqs
+ assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
+ assert "IOErrors" not in err
+
+
+def test_normalize_by_median_unpaired_final_read():
+ CUTOFF = '1'
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('single-read.fq'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-k', '17', '-p', infile]
+ try:
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ raise Exception("Shouldn't get to this")
+ except AssertionError as e:
+ out = str(e)
+ assert "ERROR: Unpaired reads when require_paired" in out, out
+
+
+def test_normalize_by_median_unforced_badfile():
+ CUTOFF = '1'
+
+ infile = utils.get_temp_filename("potatoes")
+ outfile = infile + '.keep'
+ in_dir = os.path.dirname(infile)
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-k', '17', infile]
+ try:
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ raise Exception("Shouldn't get to this")
+ except AssertionError as e:
+ out = str(e)
+ assert "ERROR: [Errno 2] No such file or directory:" in out, out
+
+ if os.path.exists(outfile):
+ assert False, '.keep file should have been removed: '
+
+
+def test_normalize_by_median_contradictory_args():
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+ outfile = utils.get_temp_filename('report.out')
+
+ shutil.copyfile(utils.get_test_data('test-large.fa'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', '1', '-k', '17', '--force-single', '-p', '-R',
+ outfile, infile]
+ try:
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ raise Exception("Shouldn't get to this")
+ except AssertionError as e:
+ out = str(e)
+ assert "cannot both be set" in out, out
+
+
+def test_normalize_by_median_stdout_3():
+ CUTOFF = '1'
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-k', '17', infile, '--out', '-']
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ assert 'Total number of unique k-mers: 98' in err, err
+ assert 'in /dev/stdout' in err, err
+ assert "IOErrors" not in err
+
+
+@attr('known_failing')
+def test_normalize_by_median_known_good():
+ CUTOFF = '2'
+
+ infile = utils.get_temp_filename('test.fa.gz')
+ in_dir = os.path.dirname(infile)
+ shutil.copyfile(utils.get_test_data('100k-filtered.fa.gz'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-k', '20', '-x', '4e6', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+ iter_known = screed.open(utils.get_test_data('100k-filtered.fa.keep.gz'))
+ iter_out = screed.open(outfile)
+ try:
+ for rknown, rout in zip(iter_known, iter_out):
+ assert rknown.name == rout.name
+ except Exception as e:
+ print(e)
+ assert False
+
+
+@attr('huge')
+def test_normalize_by_median_report_fp():
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+ outfile = utils.get_temp_filename('report.out')
+
+ shutil.copyfile(utils.get_test_data('test-large.fa'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', '1', '-k', '17', '-R', outfile, infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ assert "fp rate estimated to be 0.623" in err, err
+ report = open(outfile, 'r')
+ line = report.readline()
+ assert "100000 25261 0.25261" in line, line
+
+
+def test_normalize_by_median_unpaired_and_paired():
+ CUTOFF = '1'
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-abund-read-paired.fa'), infile)
+
+ unpairedfile = utils.get_temp_filename('test1.fa', tempdir=in_dir)
+ shutil.copyfile(utils.get_test_data('random-20-a.fa'), unpairedfile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-k', '17', '-u', unpairedfile, '-p', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ assert 'Total number of unique k-mers: 4030' in err, err
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+
+def test_normalize_by_median_count_kmers_PE():
+ CUTOFF = '1'
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+ # The test file has one pair of identical read except for the last base
+ # The 2nd read should be discarded in the unpaired mode
+ # but kept in the paired end mode adding only one more unique kmer
+ shutil.copyfile(utils.get_test_data('paired_one.base.dif.fa'), infile)
+ script = 'normalize-by-median.py'
+
+ args = ['-C', CUTOFF, '-k', '17', '--force-single', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ assert 'Total number of unique k-mers: 98' in err, err
+ assert 'kept 1 of 2 or 50%' in err, err
+
+ args = ['-C', CUTOFF, '-k', '17', '-p', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ assert 'Total number of unique k-mers: 99' in err, err
+ assert 'kept 2 of 2 or 100%' in err, err
+
+
+def test_normalize_by_median_double_file_name():
+ infile = utils.get_temp_filename('test-abund-read-2.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
+
+ script = 'normalize-by-median.py'
+ args = [utils.get_test_data('test-abund-read-2.fa'), infile]
+
+ try:
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ except AssertionError as e:
+ assert "Duplicate filename--Cannot handle this!" in str(e), str(e)
+
+
+def test_normalize_by_median_overwrite():
+ outfile = utils.get_temp_filename('test.fa.keep')
+ shutil.copyfile(utils.get_test_data('test-abund-read.fa'), outfile)
+ in_dir = os.path.dirname(outfile)
+
+ CUTOFF = '1'
+ infile = utils.get_temp_filename('test.fa', in_dir)
+ shutil.copyfile(utils.get_test_data('test-abund-read-3.fa'), infile)
+ script = 'normalize-by-median.py'
+
+ args = ['-C', CUTOFF, '-k', '17', '-o', outfile, infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ assert os.path.exists(outfile), outfile
+ seqs = [r.sequence for r in screed.open(outfile)]
+ assert len(seqs) == 1, seqs
+ assert 'GACAGCgtgCCGCA' in seqs[0], seqs
+
+
+def test_normalize_by_median_version():
+ script = 'normalize-by-median.py'
+ args = ['--version']
+ status, out, err = utils.runscript(script, args)
+
+ errlines = err.splitlines()
+ for err in errlines:
+ if err.startswith('||') or \
+ not err.strip():
+ continue
+ break
+
+ print(errlines)
+ print(err)
+
+ assert err.startswith('khmer ')
+
+
+def test_normalize_by_median_2():
+ CUTOFF = '2'
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-k', '17', infile]
+ utils.runscript(script, args, in_dir)
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+ seqs = [r.sequence for r in screed.open(outfile)]
+ assert len(seqs) == 2, seqs
+ assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
+ assert seqs[1] == 'GGTTGACGGGGCTCAGGG', seqs
+
+
+def test_normalize_by_median_paired():
+ CUTOFF = '1'
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-abund-read-paired.fa'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-p', '-k', '17', infile]
+ utils.runscript(script, args, in_dir)
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+ seqs = [r.sequence for r in screed.open(outfile)]
+ assert len(seqs) == 2, seqs
+ assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
+ assert seqs[1].startswith('GGTTGACGGGGCTCAGGG'), seqs
+
+
+def test_normalize_by_median_paired_fq():
+ CUTOFF = '20'
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-abund-read-paired.fq'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-p', '-k', '17', infile]
+ _, out, err = utils.runscript(script, args, in_dir)
+ print(out)
+ print(err)
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+ seqs = [r.sequence for r in screed.open(outfile)]
+ assert len(seqs) == 6, len(seqs)
+ assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
+ assert seqs[1].startswith('GGTTGACGGGGCTCAGGG'), seqs
+
+ names = [r.name for r in screed.open(outfile, parse_description=False)]
+ assert len(names) == 6, names
+ assert '895:1:37:17593:9954 1::FOO' in names, names
+ assert '895:1:37:17593:9954 2::FOO' in names, names
+
+
+def test_normalize_by_median_impaired():
+ CUTOFF = '1'
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-abund-read-impaired.fa'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-p', '-k', '17', infile]
+ _, out, err = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert 'ERROR: Unpaired reads ' in err, err
+
+
+def test_normalize_by_median_force():
+ CUTOFF = '1'
+
+ corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
+ good_infile = utils.get_temp_filename('test-good.fq',
+ tempdir=os.path.dirname(
+ corrupt_infile))
+
+ in_dir = os.path.dirname(good_infile)
+
+ shutil.copyfile(utils.get_test_data('test-error-reads.fq'), corrupt_infile)
+ shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]
+
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ assert '*** Skipping' in err
+ assert '** IOErrors' in err
+
+
+def test_normalize_by_median_no_bigcount():
+ infile = utils.get_temp_filename('test.fa')
+ hashfile = utils.get_temp_filename('test-out.ct')
+ outfile = infile + '.keep'
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
+ counting_ht = _make_counting(infile, K=8)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', '1000', '-k 8', '--savetable', hashfile, infile]
+
+ (status, out, err) = utils.runscript(script, args, in_dir)
+ assert status == 0, (out, err)
+ print((out, err))
+
+ assert os.path.exists(hashfile), hashfile
+ kh = khmer.load_counting_hash(hashfile)
+
+ assert kh.get('GGTTGACG') == 255
+
+
+def test_normalize_by_median_empty():
+ CUTOFF = '1'
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-empty.fa'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-k', '17', infile]
+ utils.runscript(script, args, in_dir)
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+
+def test_normalize_by_median_emptycountingtable():
+ CUTOFF = '1'
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-empty.fa'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '--loadtable', infile, infile]
+ (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert 'ValueError' in err, (status, out, err)
+
+
+def test_normalize_by_median_fpr():
+ MAX_TABLESIZE_PARAM = 12
+
+ infile = utils.get_temp_filename('test-fpr.fq')
+ in_dir = os.path.dirname(infile)
+ shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-f', '-k 17', '-x ' + str(MAX_TABLESIZE_PARAM), infile]
+
+ (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
+
+ print(out)
+ print(err)
+
+ assert os.path.exists(infile + '.keep'), infile
+ assert '** ERROR: the graph structure is too small' in err, err
+
+
+def write_by_chunks(infile, outfile, CHUNKSIZE=8192):
+ ifile = io.open(infile, 'rb')
+ ofile = io.open(outfile, 'wb')
+ chunk = ifile.read(CHUNKSIZE)
+ while len(chunk) > 0:
+ ofile.write(chunk)
+ chunk = ifile.read(CHUNKSIZE)
+ ifile.close()
+ ofile.close()
+
+
+def test_normalize_by_median_streaming():
+ CUTOFF = '20'
+
+ infile = utils.get_test_data('100-reads.fq.gz')
+ in_dir = os.path.dirname(infile)
+ fifo = utils.get_temp_filename('fifo')
+ outfile = utils.get_temp_filename('outfile')
+
+ # Use a fifo to copy stdout to a file for checking
+ os.mkfifo(fifo)
+ thread = threading.Thread(target=write_by_chunks, args=(fifo, outfile))
+ thread.start()
+
+ # Execute diginorm
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-k', '17', '-o', fifo, infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ # Merge the thread
+ thread.join()
+
+ assert os.path.exists(outfile), outfile
+ with open(outfile) as fp:
+ linecount = sum(1 for _ in fp)
+ assert linecount == 400
+
+
+def test_diginorm_basic_functionality_1():
+ # each of these pairs has both a multicopy sequence ('ACTTCA...') and
+ # a random sequence. With 'C=1' and '-p', all should be kept.
+ CUTOFF = ['-C', '1']
+ PAIRING = ['-p']
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('dn-test-all-paired-all-keep.fa'),
+ infile)
+
+ script = 'normalize-by-median.py'
+ args = list(CUTOFF) + list(PAIRING) + ['-k', '15', infile]
+ _, out, err = utils.runscript(script, args, in_dir)
+ print(out)
+ print(err)
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+ seqs = set([r.name for r in screed.open(outfile)])
+
+ assert seqs == set(['a/1', 'a/2',
+ 'b/1', 'b/2',
+ 'c/1', 'c/2',
+ 'd/1', 'd/2']), seqs
+
+
+def test_diginorm_basic_functionality_2():
+ # each of these pairs has both a multicopy sequence ('ACTTCA...')
+ # and a random sequence ('G...'). With 'C=1' and '--force-
+ # single', only random seqs should be kept, together with one copy
+ # of the multicopy sequence.
+ CUTOFF = ['-C', '1']
+ PAIRING = ['--force-single']
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('dn-test-all-paired-all-keep.fa'),
+ infile)
+
+ script = 'normalize-by-median.py'
+ args = list(CUTOFF) + list(PAIRING) + ['-k', '15', infile]
+ _, out, err = utils.runscript(script, args, in_dir)
+ print(out)
+ print(err)
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+ seqs = set([r.name for r in screed.open(outfile)])
+
+ assert seqs == set(['a/1', 'a/2',
+ 'b/2',
+ 'c/1',
+ 'd/2']), seqs
+
+
+def test_diginorm_basic_functionality_3():
+ # This data is entirely unpaired, but with one duplicate ('A...').
+ # and a random sequence ('G...'). With 'C=1' only three seqs should
+ # be left, with no other complaints.
+
+ CUTOFF = ['-C', '1']
+ PAIRING = []
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('dn-test-none-paired.fa'),
+ infile)
+
+ script = 'normalize-by-median.py'
+ args = list(CUTOFF) + list(PAIRING) + ['-k', '15', infile]
+ _, out, err = utils.runscript(script, args, in_dir)
+ print(out)
+ print(err)
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+ seqs = set([r.name for r in screed.open(outfile)])
+
+ assert seqs == set(['a/1',
+ 'b/2',
+ 'd/1']), seqs
+
+
+def test_diginorm_basic_functionality_4():
+ # This data is mixed paired/unpaired, but with one duplicate ('A...').
+ # and a random sequence ('G...'). With 'C=2' all of the sequences
+ # should be kept.
+
+ CUTOFF = ['-C', '1']
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('dn-test-some-paired-all-keep.fa'),
+ infile)
+
+ script = 'normalize-by-median.py'
+ args = list(CUTOFF) + ['-k', '15', infile]
+
+ _, out, err = utils.runscript(script, args, in_dir)
+ print(out)
+ print(err)
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+ seqs = set([r.name for r in screed.open(outfile)])
+
+ assert seqs == set(['a/1', 'a/2',
+ 'b/2',
+ 'c/1', 'c/2',
+ 'd/2']), seqs
+
+
+def test_diginorm_basic_functionality_5():
+ # each of these pairs has both a multicopy sequence ('ACTTCA...') and
+ # a random sequence. With 'C=1' and '-p', all should be
+ CUTOFF = ['-C', '1']
+ PAIRING = ['-p']
+
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('dn-test-all-paired-all-keep.fa'),
+ infile)
+
+ script = 'normalize-by-median.py'
+ args = list(CUTOFF) + list(PAIRING) + ['-k', '15', infile]
+ _, out, err = utils.runscript(script, args, in_dir)
+ print(out)
+ print(err)
+
+ outfile = infile + '.keep'
+ assert os.path.exists(outfile), outfile
+
+ seqs = set([r.name for r in screed.open(outfile)])
+
+ assert seqs == set(['a/1', 'a/2',
+ 'b/1', 'b/2',
+ 'c/1', 'c/2',
+ 'd/1', 'd/2']), seqs
diff --git a/tests/test_oxli_functions.py b/tests/test_oxli_functions.py
new file mode 100644
index 0000000..63ad48c
--- /dev/null
+++ b/tests/test_oxli_functions.py
@@ -0,0 +1,60 @@
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import unicode_literals
+#
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
+# the three-clause BSD license; see LICENSE.
+# Contact: khmer-project at idyll.org
+#
+
+# pylint: disable=C0111,C0103,E1103,W0612
+
+from . import khmer_tst_utils as utils
+import khmer
+from oxli import functions
+
+
+def test_estimate_functions_1():
+ res = functions.estimate_optimal_with_N_and_M(99, 1024)
+ assert res[0] == 7, res[0]
+ assert res[1] == 146, res[1]
+ assert res[2] == 1022, res[2]
+ assert abs(.008 - res[3]) < .001, res[3]
+
+ res = functions.estimate_optimal_with_N_and_f(99, 0.00701925498897)
+ assert res[0] == 7, res[0]
+ assert res[1] == 145, res[1]
+ assert res[2] == 1015, res[2]
+ assert abs(.008 - res[3]) < .002, res[3]
+
+ res = functions.estimate_optimal_with_N_and_M(1024, 2)
+ assert res[0] == 1, res[0]
+ assert res[1] == 2, res[1]
+ assert res[2] == 2, res[2]
+ assert res[3] == 1.0, res[3]
+
+ # using a crazy high FP rate just for coverage
+ res = functions.estimate_optimal_with_N_and_f(1024, 0.7)
+ assert res[0] == 1, res[0]
+ assert res[1] == 850, res[1]
+ assert res[2] == 850, res[2]
+ assert abs(.7 - res[3]) < 0.0022, abs(.7 - res[3])
+
+
+def test_estimate_functions_namedtup():
+ res = functions.estimate_optimal_with_N_and_M(99, 1024)
+ assert res.num_htables == 7, res[0]
+ assert res.htable_size == 146, res[1]
+ assert res.mem_use == 1022, res[2]
+ assert abs(.008 - res.fp_rate) < .001, res[3]
+
+ res = functions.estimate_optimal_with_N_and_f(99, 0.00701925498897)
+ assert res.num_htables == 7, res[0]
+ assert res.htable_size == 145, res[1]
+ assert res.mem_use == 1015, res[2]
+ assert abs(.008 - res.fp_rate) < .002, res[3]
+
+
+def test_output_gen():
+ res = functions.optimal_args_output_gen(99, 0.00701925498897)
diff --git a/tests/test_read_aligner.py b/tests/test_read_aligner.py
index b5ce9b5..0fa9eec 100644
--- a/tests/test_read_aligner.py
+++ b/tests/test_read_aligner.py
@@ -1,8 +1,10 @@
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE. Contact: ctb at msu.edu
#
+from __future__ import print_function
+
import khmer
from nose.tools import assert_almost_equals
@@ -13,7 +15,7 @@ def eq_(v1, v2):
def test_alignnocov():
- ch = khmer.new_counting_hash(10, 1048576, 1)
+ ch = khmer.CountingHash(10, 1048576, 1)
read = "ACCTAGGTTCGACATGTACC"
aligner = khmer.ReadAligner(ch, 0, 0)
for i in range(20):
@@ -27,7 +29,7 @@ def test_alignnocov():
def test_simple_readalign():
- ch = khmer.new_counting_hash(10, 1048576, 1)
+ ch = khmer.CountingHash(10, 1048576, 1)
aligner = khmer.ReadAligner(ch, 2, 0)
for i in range(20):
ch.consume("AGAGGGAAAGCTAGGTTCGACATGTCCTTGACAGAT")
@@ -46,7 +48,7 @@ def test_simple_readalign():
def test_readalign():
- ch = khmer.new_counting_hash(10, 1048576, 1)
+ ch = khmer.CountingHash(10, 1048576, 1)
aligner = khmer.ReadAligner(ch, 1, 0)
for i in range(20):
ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
@@ -207,15 +209,15 @@ queries = [
def test_readalign_new():
- ch = khmer.new_counting_hash(32, 1048576, 1)
+ ch = khmer.CountingHash(32, 1048576, 1)
aligner = khmer.ReadAligner(ch, 1, 0)
for seq in ht_seqs:
ch.consume(seq)
for query in queries:
score, graphAlign, readAlign, trunc = aligner.align(query["seq"])
- print graphAlign
- print readAlign
+ print(graphAlign)
+ print(readAlign)
eq_(graphAlign, query["graph_aln"])
eq_(readAlign, query["read_aln"])
eq_(trunc, query["truncated"])
diff --git a/tests/test_read_parsers.py b/tests/test_read_parsers.py
index f2ae329..c785772 100644
--- a/tests/test_read_parsers.py
+++ b/tests/test_read_parsers.py
@@ -1,6 +1,8 @@
+from __future__ import print_function
+from __future__ import absolute_import
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -10,7 +12,7 @@
import khmer
from khmer import ReadParser
-import khmer_tst_utils as utils
+from . import khmer_tst_utils as utils
from nose.plugins.attrib import attr
from functools import reduce
@@ -68,7 +70,7 @@ def test_num_reads_threads():
n_threads = 4
threads = []
rparser = ReadParser(utils.get_test_data("100-reads.fq.gz"))
- for _ in xrange(n_threads):
+ for _ in range(n_threads):
thr = threading.Thread(target=count_reads, args=[rparser, ])
threads.append(thr)
thr.start()
@@ -108,7 +110,7 @@ def test_gzip_decompression_truncated():
pass
assert 0, "this should fail"
except IOError as err:
- print str(err)
+ print(str(err))
def test_gzip_decompression_truncated_pairiter():
@@ -119,7 +121,7 @@ def test_gzip_decompression_truncated_pairiter():
pass
assert 0, "this should fail"
except IOError as err:
- print str(err)
+ print(str(err))
def test_bzip2_decompression():
@@ -140,7 +142,7 @@ def test_bzip2_decompression_truncated():
pass
assert 0, "this should fail"
except IOError as err:
- print str(err)
+ print(str(err))
def test_bzip2_decompression_truncated_pairiter():
@@ -151,7 +153,7 @@ def test_bzip2_decompression_truncated_pairiter():
pass
assert 0, "this should fail"
except IOError as err:
- print str(err)
+ print(str(err))
def test_badbzip2():
@@ -161,9 +163,9 @@ def test_badbzip2():
pass
assert 0, "this should fail"
except IOError as err:
- print str(err)
+ print(str(err))
except ValueError as err:
- print str(err)
+ print(str(err))
@attr('multithread')
@@ -184,7 +186,7 @@ def test_with_multiple_threads(testfile="test-reads.fq.bz2"):
threads = []
reads_counts_per_thread = [0] * N_THREADS
rparser = ReadParser(utils.get_test_data(testfile))
- for tnum in xrange(N_THREADS):
+ for tnum in range(N_THREADS):
t = \
threading.Thread(
target=count_reads,
@@ -308,10 +310,10 @@ def test_read_pair_iterator_in_error_mode():
in rparser.iter_read_pairs(ReadParser.PAIR_MODE_ERROR_ON_UNPAIRED):
read_pairs_2.append([read_1, read_2])
matches = \
- map(
+ list(map(
lambda rp1, rp2: rp1[0].name == rp2[0].name,
read_pairs_1, read_pairs_2
- )
+ ))
assert all(matches) # Assert ALL the matches. :-]
@@ -353,12 +355,12 @@ def test_constructor():
assert 0, ("ReadParser's constructor shouldn't accept a character for "
"the number of threads")
except TypeError as err:
- print str(err)
+ print(str(err))
try:
rparser = ReadParser("non-existent-file-name")
assert 0, "ReadParser shouldn't accept a non-existant file name"
except ValueError as err:
- print str(err)
+ print(str(err))
def test_iternext():
@@ -369,7 +371,7 @@ def test_iternext():
read_pairs.append(read_1, read_2)
assert 0, "Shouldn't be able to iterate over non FASTA file"
except IOError as err:
- print str(err)
+ print(str(err))
except ValueError as err:
- print str(err)
+ print(str(err))
# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
diff --git a/tests/test_sandbox_scripts.py b/tests/test_sandbox_scripts.py
index c2ab6fc..ef85a82 100644
--- a/tests/test_sandbox_scripts.py
+++ b/tests/test_sandbox_scripts.py
@@ -1,5 +1,7 @@
+from __future__ import print_function
+from __future__ import absolute_import
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -11,13 +13,14 @@ import sys
import os
import os.path
import shutil
-from cStringIO import StringIO
+from io import StringIO
import traceback
import nose
+from nose.plugins.attrib import attr
import glob
import imp
-import khmer_tst_utils as utils
+from . import khmer_tst_utils as utils
import khmer
import screed
@@ -54,7 +57,7 @@ class _checkImportSucceeds(object):
try:
mod = imp.load_source('__zzz', self.filename)
except:
- print traceback.format_exc()
+ print(traceback.format_exc())
raise AssertionError("%s cannot be imported" % (self.filename,))
#
@@ -72,9 +75,10 @@ class _checkImportSucceeds(object):
exec(
compile(open(self.filename).read(), self.filename, 'exec'),
global_dict)
- except (ImportError, SyntaxError):
- print traceback.format_exc()
- raise AssertionError("%s cannot be exec'd" % (self.filename,))
+ except (ImportError, SyntaxError) as err:
+ print("{0}".format(err))
+ raise AssertionError("%s cannot be exec'd" % (self.filename),
+ "{0}".format(traceback))
except:
pass # other failures are expected :)
finally:
@@ -107,7 +111,7 @@ def test_sweep_reads():
mout = os.path.join(in_dir, 'test_multi.fa')
oout = os.path.join(in_dir, 'test_orphaned.fa')
- print os.listdir(in_dir)
+ print(os.listdir(in_dir))
assert os.path.exists(out1)
assert os.path.exists(out2)
@@ -118,10 +122,10 @@ def test_sweep_reads():
seqsm = set([r.name for r in screed.open(mout)])
seqso = set([r.name for r in screed.open(oout)])
- print seqs1
- print seqs2
- print seqsm
- print seqso
+ print(seqs1)
+ print(seqs2)
+ print(seqsm)
+ print(seqso)
assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0'])
assert seqs2 == set(['read3_p1\t1'])
assert (seqsm == set(['read4_multi\t0\t1']) or
@@ -157,19 +161,19 @@ def test_sweep_reads_fq():
assert os.path.exists(out2)
assert os.path.exists(mout)
assert os.path.exists(oout)
- print open(out1).read()
+ print(open(out1).read())
- print os.listdir(in_dir)
+ print(os.listdir(in_dir))
seqs1 = set([r.name for r in screed.open(out1)])
seqs2 = set([r.name for r in screed.open(out2)])
seqsm = set([r.name for r in screed.open(mout)])
seqso = set([r.name for r in screed.open(oout)])
- print seqs1
- print seqs2
- print seqsm
- print seqso
+ print(seqs1)
+ print(seqs2)
+ print(seqsm)
+ print(seqso)
assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0'])
assert seqs2 == set(['read3_p1\t1'])
assert (seqsm == set(['read4_multi\t0\t1']) or
@@ -194,9 +198,9 @@ def test_sweep_reads_2():
'test', '--label-by-seq', inref, infile]
status, out, err = utils.runscript(script, args, wdir, sandbox=True)
- for i in xrange(99):
+ for i in range(99):
p = os.path.join(wdir, 'test_{i}.fa'.format(i=i))
- print p, err, out
+ print(p, err, out)
assert os.path.exists(p)
os.remove(p)
assert os.path.exists(os.path.join(wdir, 'test.counts.csv'))
@@ -214,9 +218,9 @@ def test_sweep_reads_3():
'test', '--label-by-group', '10', infile, infile]
status, out, err = utils.runscript(script, args, wdir, sandbox=True)
- for i in xrange(10):
+ for i in range(10):
p = os.path.join(wdir, 'test_{i}.fa'.format(i=i))
- print p, err, out
+ print(p, err, out)
assert os.path.exists(p)
os.remove(p)
@@ -229,3 +233,25 @@ def test_sweep_reads_3():
assert os.path.exists(counts_fn)
assert os.path.exists(os.path.join(wdir, 'test.dist.txt'))
assert not os.path.exists(os.path.join(wdir, 'test_multi.fa'))
+
+
+def test_collect_reads():
+ outfile = utils.get_temp_filename('out.graph')
+ infile = utils.get_test_data('test-reads.fa')
+ script = 'collect-reads.py'
+ args = ['-M', '1e7', outfile, infile]
+
+ status, out, err = utils.runscript(script, args, sandbox=True)
+
+ assert status == 0
+ assert os.path.exists(outfile)
+
+
+def test_saturate_by_median():
+ infile = utils.get_test_data('test-reads.fa')
+ script = 'saturate-by-median.py'
+ args = ['-M', '1e7', infile]
+
+ status, out, err = utils.runscript(script, args, sandbox=True)
+
+ assert status == 0
diff --git a/tests/test_script_arguments.py b/tests/test_script_arguments.py
index bc0e132..e7cdc8d 100644
--- a/tests/test_script_arguments.py
+++ b/tests/test_script_arguments.py
@@ -1,5 +1,5 @@
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2014-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -7,36 +7,46 @@
"""
Tests for various argument-handling code.
"""
+from __future__ import print_function, unicode_literals
+from __future__ import absolute_import
import sys
-import cStringIO
-import khmer_tst_utils as utils
+import io
+import collections
+from . import khmer_tst_utils as utils
+import argparse
import khmer.kfile
+from khmer import khmer_args
+from cStringIO import StringIO
def test_check_space():
fakelump_fa = utils.get_test_data('fakelump.fa')
- save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO()
+ save_stderr, sys.stderr = sys.stderr, io.StringIO()
try:
khmer.kfile.check_space(
[fakelump_fa], force=False, _testhook_free_space=0)
assert 0, "this should fail"
except SystemExit as e:
- print str(e)
+ print(str(e))
finally:
sys.stderr = save_stderr
def test_check_tablespace():
- save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO()
+ save_stderr, sys.stderr = sys.stderr, io.StringIO()
+
+ parser = khmer_args.build_counting_args()
+ args = parser.parse_args(['-M', '1e9'])
+
try:
- khmer.kfile.check_space_for_hashtable(
- 1e9, force=False, _testhook_free_space=0)
+ khmer.kfile.check_space_for_hashtable(args, 'countgraph', force=False,
+ _testhook_free_space=0)
assert 0, "this should fail"
except SystemExit as e:
- print str(e)
+ print(str(e))
finally:
sys.stderr = save_stderr
@@ -44,36 +54,208 @@ def test_check_tablespace():
def test_check_space_force():
fakelump_fa = utils.get_test_data('fakelump.fa')
- save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO()
+ save_stderr, sys.stderr = sys.stderr, io.StringIO()
try:
khmer.kfile.check_space(
[fakelump_fa], force=True, _testhook_free_space=0)
assert True, "this should pass"
except SystemExit as e:
- print str(e)
+ print(str(e))
finally:
sys.stderr = save_stderr
def test_check_tablespace_force():
- save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO()
+ save_stderr, sys.stderr = sys.stderr, io.StringIO()
+
+ parser = khmer_args.build_counting_args()
+ args = parser.parse_args(['-M', '1e9'])
+
try:
- khmer.kfile.check_space_for_hashtable(
- 1e9, force=True, _testhook_free_space=0)
+ khmer.kfile.check_space_for_hashtable(args, 'countgraph', True,
+ _testhook_free_space=0)
assert True, "this should pass"
except SystemExit as e:
- print str(e)
+ print(str(e))
finally:
sys.stderr = save_stderr
def test_invalid_file_warn():
- save_stderr, sys.stderr = sys.stderr, cStringIO.StringIO()
+ save_stderr, sys.stderr = sys.stderr, io.StringIO()
try:
khmer.kfile.check_valid_file_exists(["nonexistent", "nonexistent2"])
assert sys.stderr.getvalue().count("\n") == 2, \
"Should produce two warning lines"
- except SystemExit, e:
- print str(e)
+ except SystemExit as e:
+ print(str(e))
finally:
sys.stderr = save_stderr
+
+
+FakeArgparseObject = collections.namedtuple('FakeArgs',
+ ['ksize', 'n_tables',
+ 'max_tablesize',
+ 'max_memory_usage'])
+
+
+def test_create_countgraph_1():
+ ksize = khmer_args.DEFAULT_K
+ n_tables = khmer_args.DEFAULT_N_TABLES
+ max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
+ max_mem = 1e7
+
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+
+ countgraph = khmer_args.create_countgraph(args)
+ assert countgraph.hashsizes() == [2499997L, 2499989L, 2499983L, 2499967L]
+ assert sum(countgraph.hashsizes()) < max_mem, sum(countgraph.hashsizes())
+
+
+def test_create_countgraph_2():
+ # tests overriding ksize by passing into create_countgraph explicitly.
+
+ ksize = khmer_args.DEFAULT_K
+ n_tables = khmer_args.DEFAULT_N_TABLES
+ max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
+ max_mem = 1e7
+
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+
+ countgraph = khmer_args.create_countgraph(args, ksize=15)
+ assert countgraph.ksize() == 15
+
+
+def test_create_countgraph_3():
+ # tests too-big ksize
+
+ ksize = khmer_args.DEFAULT_K
+ n_tables = khmer_args.DEFAULT_N_TABLES
+ max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
+ max_mem = 1e7
+
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+
+ old_stderr = sys.stderr
+ sys.stderr = capture = StringIO()
+
+ try:
+ countgraph = khmer_args.create_countgraph(args, ksize=35)
+ assert 0, "should not reach this"
+ except SystemExit:
+ err = capture.getvalue()
+ assert 'khmer only supports k-mer sizes <= 32.' in err, err
+ finally:
+ sys.stderr = old_stderr
+
+
+def test_create_countgraph_4_multiplier():
+ ksize = khmer_args.DEFAULT_K
+ n_tables = khmer_args.DEFAULT_N_TABLES
+ max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
+ max_mem = 1e7
+
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+
+ countgraph = khmer_args.create_countgraph(args, multiplier=2.0)
+ assert sum(countgraph.hashsizes()) < max_mem / 2.0, \
+ sum(countgraph.hashsizes())
+
+
+def test_create_nodegraph_1():
+ ksize = khmer_args.DEFAULT_K
+ n_tables = khmer_args.DEFAULT_N_TABLES
+ max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
+ max_mem = 1e7
+
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+
+ nodegraph = khmer_args.create_nodegraph(args)
+ assert nodegraph.hashsizes() == [19999999L, 19999981L,
+ 19999963L, 19999927L]
+
+ assert sum(nodegraph.hashsizes())/8.0 < max_mem, sum(nodegraph.hashsizes())
+
+
+def test_create_nodegraph_2():
+ # tests overriding ksize by passing into create_nodegraph explicitly.
+
+ ksize = khmer_args.DEFAULT_K
+ n_tables = khmer_args.DEFAULT_N_TABLES
+ max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
+ max_mem = 1e7
+
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+
+ nodegraph = khmer_args.create_nodegraph(args, ksize=15)
+ assert nodegraph.ksize() == 15
+
+
+def test_create_nodegraph_3():
+ # tests too-big ksize
+
+ ksize = khmer_args.DEFAULT_K
+ n_tables = khmer_args.DEFAULT_N_TABLES
+ max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
+ max_mem = 1e7
+
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+
+ old_stderr = sys.stderr
+ sys.stderr = capture = StringIO()
+
+ try:
+ nodegraph = khmer_args.create_nodegraph(args, ksize=35)
+ assert 0, "should not reach this"
+ except SystemExit:
+ err = capture.getvalue()
+ assert 'khmer only supports k-mer sizes <= 32.' in err, err
+
+
+def test_create_nodegraph_4_multiplier():
+ ksize = khmer_args.DEFAULT_K
+ n_tables = khmer_args.DEFAULT_N_TABLES
+ max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
+ max_mem = 1e7
+
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+
+ nodegraph = khmer_args.create_nodegraph(args, multiplier=2.0)
+ assert sum(nodegraph.hashsizes())/8.0 < max_mem / 2.0, \
+ sum(nodegraph.hashsizes())
+
+
+def test_report_on_config_bad_hashtype():
+ ksize = khmer_args.DEFAULT_K
+ n_tables = khmer_args.DEFAULT_N_TABLES
+ max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
+ max_mem = 1e7
+
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+
+ try:
+ khmer_args.report_on_config(args, 'foograph')
+ assert 0, "the previous statement should raise an exception"
+ except AssertionError:
+ raise
+ except Exception as err:
+ assert "unknown graph type: foograph" in str(err), str(err)
+
+
+def test_fail_calculate_foograph_size():
+ # tests unknown graph type
+
+ ksize = khmer_args.DEFAULT_K
+ n_tables = khmer_args.DEFAULT_N_TABLES
+ max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
+ max_mem = 1e7
+
+ args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
+
+ try:
+ nodegraph = khmer_args._calculate_tablesize(args, 'foograph')
+ assert 0, "previous statement should fail"
+ except AssertionError:
+ raise
+ except Exception as err:
+ assert "unknown graph type: foograph" in str(err), str(err)
diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 2f4a75d..ffbbb81 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -1,5 +1,8 @@
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import unicode_literals
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
@@ -12,7 +15,7 @@ import sys
import os
import stat
import shutil
-from cStringIO import StringIO
+from io import StringIO
import traceback
from nose.plugins.attrib import attr
import subprocess
@@ -20,16 +23,12 @@ import threading
import bz2
import io
-import khmer_tst_utils as utils
+from . import khmer_tst_utils as utils
import khmer
import khmer.kfile
import screed
-def scriptpath(script):
- return script
-
-
def teardown():
utils.cleanup()
@@ -42,7 +41,7 @@ def test_check_space():
def test_load_into_counting():
- script = scriptpath('load-into-counting.py')
+ script = 'load-into-counting.py'
args = ['-x', '1e3', '-N', '2', '-k', '20', '-t']
outfile = utils.get_temp_filename('out.ct')
@@ -51,12 +50,65 @@ def test_load_into_counting():
args.extend([outfile, infile])
(status, out, err) = utils.runscript(script, args)
- assert 'Total number of unique k-mers: 89' in err, err
+ assert 'Total number of unique k-mers: 83' in err, err
+ assert os.path.exists(outfile)
+
+
+def test_load_into_counting_tablesize_warning():
+ script = 'load-into-counting.py'
+ args = ['-k', '20', '-t']
+
+ outfile = utils.get_temp_filename('out.ct')
+ infile = utils.get_test_data('test-abund-read-2.fa')
+
+ args.extend([outfile, infile])
+
+ (status, out, err) = utils.runscript(script, args)
+ assert os.path.exists(outfile)
+ assert "WARNING: tablesize is default!" in err
+
+
+def test_load_into_counting_max_memory_usage_parameter():
+ script = 'load-into-counting.py'
+ args = ['-M', '2e3', '-k', '20', '-t']
+
+ outfile = utils.get_temp_filename('out.ct')
+ infile = utils.get_test_data('test-abund-read-2.fa')
+
+ args.extend([outfile, infile])
+
+ (status, out, err) = utils.runscript(script, args)
+ assert os.path.exists(outfile)
+ assert "WARNING: tablesize is default!" not in err
+
+ kh = khmer.load_counting_hash(outfile)
+ assert sum(kh.hashsizes()) < 3e8
+
+
+def test_load_into_counting_abundance_dist_nobig():
+ script = 'load-into-counting.py'
+ args = ['-x', '1e3', '-N', '2', '-k', '20', '-t', '-b']
+
+ outfile = utils.get_temp_filename('out.ct')
+ infile = utils.get_test_data('test-abund-read-2.fa')
+
+ args.extend([outfile, infile])
+
+ (status, out, err) = utils.runscript(script, args)
+ assert 'Total number of unique k-mers: 83' in err, err
assert os.path.exists(outfile)
+ htfile = outfile
+ outfile = utils.get_temp_filename('out')
+ script2 = 'abundance-dist.py'
+ args = ['-z', htfile, infile, outfile]
+ (status, out, err) = utils.runscript(script2, args)
+ assert 'WARNING: The loaded graph has bigcount' in err, err
+ assert 'bigcount' in err, err
+
def test_load_into_counting_nonwritable():
- script = scriptpath('load-into-counting.py')
+ script = 'load-into-counting.py'
args = ['-x', '1e3', '-N', '2', '-k', '20', '-t']
outfile = utils.get_temp_filename('test-nonwritable')
@@ -73,9 +125,9 @@ def test_load_into_counting_nonwritable():
assert status == 1, status
- at attr('linux')
+ at attr('huge')
def test_load_into_counting_toobig():
- script = scriptpath('load-into-counting.py')
+ script = 'load-into-counting.py'
args = ['-x', '1e12', '-N', '2', '-k', '20', '-t', '--force']
outfile = utils.get_temp_filename('out.kh')
@@ -89,7 +141,7 @@ def test_load_into_counting_toobig():
def test_load_into_counting_fail():
- script = scriptpath('load-into-counting.py')
+ script = 'load-into-counting.py'
args = ['-x', '1e2', '-N', '2', '-k', '20'] # use small HT
outfile = utils.get_temp_filename('out.ct')
@@ -99,12 +151,12 @@ def test_load_into_counting_fail():
(status, out, err) = utils.runscript(script, args, fail_ok=True)
assert status == 1, status
- print err
+ print(err)
assert "** ERROR: the graph structure is too small" in err
def test_load_into_counting_multifile():
- script = scriptpath('load-into-counting.py')
+ script = 'load-into-counting.py'
args = ['-x', '1e7', '-N', '2', '-k', '20', '-t']
outfile = utils.get_temp_filename('out.kh')
@@ -119,7 +171,7 @@ def test_load_into_counting_multifile():
def test_load_into_counting_tsv():
- script = scriptpath('load-into-counting.py')
+ script = 'load-into-counting.py'
args = ['-x', '1e7', '-N', '2', '-k', '20', '-t', '-s', 'tsv']
outfile = utils.get_temp_filename('out.ct')
@@ -142,7 +194,7 @@ def test_load_into_counting_tsv():
def test_load_into_counting_json():
- script = scriptpath('load-into-counting.py')
+ script = 'load-into-counting.py'
args = ['-x', '1e7', '-N', '2', '-k', '20', '-t', '-s', 'json']
outfile = utils.get_temp_filename('out.ct')
@@ -159,20 +211,21 @@ def test_load_into_counting_json():
with open(jsonfile) as jsonfh:
got_json = json.load(jsonfh)
outbase = os.path.basename(outfile)
+
expected_json = {
- "files": [infile],
- "ht_name": outbase,
- "num_kmers": 95,
- "num_reads": 1001,
- "fpr": 9.024965705097741e-11,
- "mrinfo_version": "0.2.0",
+ u"files": [infile],
+ u"ht_name": outbase,
+ u"num_kmers": 95,
+ u"num_reads": 1001,
+ u"fpr": 9.025048735197377e-11,
+ u"mrinfo_version": "0.2.0",
}
assert got_json == expected_json, got_json
def test_load_into_counting_bad_summary_fmt():
- script = scriptpath('load-into-counting.py')
+ script = 'load-into-counting.py'
args = ['-x', '1e7', '-N', '2', '-k', '20', '-s', 'badfmt']
outfile = utils.get_temp_filename('out.ct')
@@ -186,7 +239,7 @@ def test_load_into_counting_bad_summary_fmt():
def _make_counting(infilename, SIZE=1e7, N=2, K=20, BIGCOUNT=True):
- script = scriptpath('load-into-counting.py')
+ script = 'load-into-counting.py'
args = ['-x', str(SIZE), '-N', str(N), '-k', str(K)]
if not BIGCOUNT:
@@ -203,7 +256,7 @@ def _make_counting(infilename, SIZE=1e7, N=2, K=20, BIGCOUNT=True):
def test_filter_abund_1():
- script = scriptpath('filter-abund.py')
+ script = 'filter-abund.py'
infile = utils.get_temp_filename('test.fa')
n_infile = utils.get_temp_filename('test-fastq-n-reads.fq')
@@ -249,7 +302,7 @@ def test_filter_abund_2():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
counting_ht = _make_counting(infile, K=17)
- script = scriptpath('filter-abund.py')
+ script = 'filter-abund.py'
args = ['-C', '1', counting_ht, infile, infile]
utils.runscript(script, args, in_dir)
@@ -270,7 +323,7 @@ def test_filter_abund_3_fq_retained():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fq'), infile)
counting_ht = _make_counting(infile, K=17)
- script = scriptpath('filter-abund.py')
+ script = 'filter-abund.py'
args = ['-C', '1', counting_ht, infile, infile]
utils.runscript(script, args, in_dir)
@@ -298,7 +351,7 @@ def test_filter_abund_4_fq_casava_18():
infile)
counting_ht = _make_counting(infile, K=17)
- script = scriptpath('filter-abund.py')
+ script = 'filter-abund.py'
args = [counting_ht, infile, infile]
utils.runscript(script, args, in_dir)
@@ -315,7 +368,7 @@ def test_filter_abund_1_singlefile():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- script = scriptpath('filter-abund-single.py')
+ script = 'filter-abund-single.py'
args = ['-x', '1e7', '-N', '2', '-k', '17', '-t', infile]
(status, out, err) = utils.runscript(script, args, in_dir)
@@ -336,7 +389,7 @@ def test_filter_abund_2_singlefile():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- script = scriptpath('filter-abund-single.py')
+ script = 'filter-abund-single.py'
args = ['-x', '1e7', '-N', '2', '-k', '17', '-t', '--savetable',
tabfile, infile]
(status, out, err) = utils.runscript(script, args, in_dir)
@@ -358,7 +411,7 @@ def test_filter_abund_2_singlefile_fq_casava_18():
shutil.copyfile(utils.get_test_data('test-abund-read-2.paired2.fq'),
infile)
- script = scriptpath('filter-abund-single.py')
+ script = 'filter-abund-single.py'
args = ['-x', '1e7', '-N', '2', '-k', '17', infile]
(status, out, err) = utils.runscript(script, args, in_dir)
@@ -377,7 +430,7 @@ def test_filter_abund_4_retain_low_abund():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
counting_ht = _make_counting(infile, K=17)
- script = scriptpath('filter-abund.py')
+ script = 'filter-abund.py'
args = ['-V', counting_ht, infile]
utils.runscript(script, args, in_dir)
@@ -388,17 +441,16 @@ def test_filter_abund_4_retain_low_abund():
assert len(seqs) == 2, seqs
assert 'GGTTGACGGGGCTCAGGG' in seqs
-# test that the -V option *does* trim sequences that are low abundance
-
def test_filter_abund_5_trim_high_abund():
+ # test that the -V option *does* trim sequences that are high abundance
infile = utils.get_temp_filename('test.fa')
in_dir = os.path.dirname(infile)
shutil.copyfile(utils.get_test_data('test-abund-read-3.fa'), infile)
counting_ht = _make_counting(infile, K=17)
- script = scriptpath('filter-abund.py')
+ script = 'filter-abund.py'
args = ['-V', counting_ht, infile]
utils.runscript(script, args, in_dir)
@@ -411,17 +463,18 @@ def test_filter_abund_5_trim_high_abund():
# trimmed sequence @ error
assert 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGC' in seqs
-# test that -V/-Z setting - should not trip if -Z is set high enough.
-
def test_filter_abund_6_trim_high_abund_Z():
+ # test that -V/-Z settings interact properly -
+ # trimming should not happen if -Z is set high enough.
+
infile = utils.get_temp_filename('test.fa')
in_dir = os.path.dirname(infile)
shutil.copyfile(utils.get_test_data('test-abund-read-3.fa'), infile)
counting_ht = _make_counting(infile, K=17)
- script = scriptpath('filter-abund.py')
+ script = 'filter-abund.py'
args = ['-V', '-Z', '25', counting_ht, infile]
utils.runscript(script, args, in_dir)
@@ -437,6 +490,72 @@ def test_filter_abund_6_trim_high_abund_Z():
assert badseq in seqs # should be there, untrimmed
+def test_filter_abund_7_retain_Ns():
+ # check that filter-abund retains sequences with Ns, and treats them as As.
+
+ infile = utils.get_temp_filename('test.fq')
+ in_dir = os.path.dirname(infile)
+
+ # copy test file over to test.fq & load into counting table
+ shutil.copyfile(utils.get_test_data('test-filter-abund-Ns.fq'), infile)
+ counting_ht = _make_counting(infile, K=17)
+
+ script = 'filter-abund.py'
+ args = ['-C', '3', counting_ht, infile]
+ utils.runscript(script, args, in_dir)
+
+ outfile = infile + '.abundfilt'
+ assert os.path.exists(outfile), outfile
+
+ # test for a sequence with an 'N' in it --
+ names = set([r.name for r in screed.open(outfile, parse_description=0)])
+ assert '895:1:37:17593:9954 1::FOO_withN' in names, names
+
+ # check to see if that 'N' was properly changed to an 'A'
+ seqs = set([r.sequence for r in screed.open(outfile)])
+ assert 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAG' not in seqs, seqs
+
+ # ...and that an 'N' remains in the output sequences
+ found_N = False
+ for s in seqs:
+ if 'N' in s:
+ found_N = True
+ assert found_N, seqs
+
+
+def test_filter_abund_single_8_retain_Ns():
+ # check that filter-abund-single retains
+ # sequences with Ns, and treats them as As.
+
+ infile = utils.get_temp_filename('test.fq')
+ in_dir = os.path.dirname(infile)
+
+ # copy test file over to test.fq & load into counting table
+ shutil.copyfile(utils.get_test_data('test-filter-abund-Ns.fq'), infile)
+
+ script = 'filter-abund-single.py'
+ args = ['-k', '17', '-x', '1e7', '-N', '2', '-C', '3', infile]
+ utils.runscript(script, args, in_dir)
+
+ outfile = infile + '.abundfilt'
+ assert os.path.exists(outfile), outfile
+
+ # test for a sequence with an 'N' in it --
+ names = set([r.name for r in screed.open(outfile, parse_description=0)])
+ assert '895:1:37:17593:9954 1::FOO_withN' in names, names
+
+ # check to see if that 'N' was properly changed to an 'A'
+ seqs = set([r.sequence for r in screed.open(outfile)])
+ assert 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAG' not in seqs, seqs
+
+ # ...and that an 'N' remains in the output sequences
+ found_N = False
+ for s in seqs:
+ if 'N' in s:
+ found_N = True
+ assert found_N, seqs
+
+
def test_filter_stoptags():
infile = utils.get_temp_filename('test.fa')
in_dir = os.path.dirname(infile)
@@ -447,13 +566,13 @@ def test_filter_stoptags():
# now, create a file with some stop tags in it --
K = 18
- kh = khmer.new_hashbits(K, 1, 1)
+ kh = khmer._Hashbits(K, [1])
kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
kh.save_stop_tags(stopfile)
del kh
# finally, run filter-stoptags.
- script = scriptpath('filter-stoptags.py')
+ script = 'filter-stoptags.py'
args = ['-k', str(K), stopfile, infile, infile]
utils.runscript(script, args, in_dir)
@@ -478,13 +597,13 @@ def test_filter_stoptags_fq():
# now, create a file with some stop tags in it --
K = 18
- kh = khmer.new_hashbits(K, 1, 1)
+ kh = khmer._Hashbits(K, [1])
kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
kh.save_stop_tags(stopfile)
del kh
# finally, run filter-stoptags.
- script = scriptpath('filter-stoptags.py')
+ script = 'filter-stoptags.py'
args = ['-k', str(K), stopfile, infile, infile]
utils.runscript(script, args, in_dir)
@@ -504,356 +623,6 @@ def test_filter_stoptags_fq():
assert 'seq 1::BAR' in names
-def test_normalize_by_median_indent():
- infile = utils.get_test_data('paired-mixed.fa.pe')
- hashfile = utils.get_test_data('normC20k20.ct')
- outfile = utils.get_temp_filename('paired-mixed.fa.pe.keep')
- script = scriptpath('normalize-by-median.py')
- args = ['--loadtable', hashfile, '-o', outfile, infile]
- (status, out, err) = utils.runscript(script, args)
- assert status == 0, (out, err)
- assert os.path.exists(outfile)
-
-
-def test_normalize_by_median():
- CUTOFF = '1'
-
- infile = utils.get_temp_filename('test.fa')
- in_dir = os.path.dirname(infile)
-
- shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-C', CUTOFF, '-k', '17', '-t', infile]
- (status, out, err) = utils.runscript(script, args, in_dir)
-
- assert 'Total number of unique k-mers: 98' in err, err
-
- outfile = infile + '.keep'
- assert os.path.exists(outfile), outfile
-
- seqs = [r.sequence for r in screed.open(outfile)]
- assert len(seqs) == 1, seqs
- assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
-
-
-def test_normalize_by_median_unpaired_and_paired():
- CUTOFF = '1'
-
- infile = utils.get_temp_filename('test.fa')
- in_dir = os.path.dirname(infile)
-
- shutil.copyfile(utils.get_test_data('test-abund-read-paired.fa'), infile)
-
- unpairedfile = utils.get_temp_filename('test1.fa', tempdir=in_dir)
- shutil.copyfile(utils.get_test_data('random-20-a.fa'), unpairedfile)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-C', CUTOFF, '-k', '17', '-t', '-u', unpairedfile, '-p', infile]
- (status, out, err) = utils.runscript(script, args, in_dir)
-
- assert 'Total number of unique k-mers: 4029' in err, err
-
- outfile = infile + '.keep'
- assert os.path.exists(outfile), outfile
-
-
-def test_normalize_by_median_double_file_name():
- infile = utils.get_temp_filename('test-abund-read-2.fa')
- in_dir = os.path.dirname(infile)
-
- shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
-
- script = scriptpath('normalize-by-median.py')
- args = [utils.get_test_data('test-abund-read-2.fa'), infile]
- (status, out, err) = utils.runscript(script, args, in_dir)
-
- assert "WARNING: At least two input files are named" in err, err
-
-
-def test_normalize_by_median_overwrite():
- outfile = utils.get_temp_filename('test.fa.keep')
- shutil.copyfile(utils.get_test_data('test-abund-read.fa'), outfile)
- in_dir = os.path.dirname(outfile)
-
- CUTOFF = '1'
- infile = utils.get_temp_filename('test.fa', in_dir)
- shutil.copyfile(utils.get_test_data('test-abund-read-3.fa'), infile)
- script = scriptpath('normalize-by-median.py')
-
- args = ['-C', CUTOFF, '-k', '17', '-t', '-o', outfile, infile]
- (status, out, err) = utils.runscript(script, args, in_dir)
- assert os.path.exists(outfile), outfile
- seqs = [r.sequence for r in screed.open(outfile)]
- assert len(seqs) == 1, seqs
- assert 'GACAGCgtgCCGCA' in seqs[0], seqs
-
-
-def test_normalize_by_median_version():
- script = scriptpath('normalize-by-median.py')
- args = ['--version']
- status, out, err = utils.runscript(script, args)
-
- errlines = err.splitlines()
- for err in errlines:
- if err.startswith('||') or \
- not err.strip():
- continue
- break
-
- print errlines
- print err
-
- assert err.startswith('khmer ')
-
-
-def test_normalize_by_median_2():
- CUTOFF = '2'
-
- infile = utils.get_temp_filename('test.fa')
- in_dir = os.path.dirname(infile)
-
- shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-C', CUTOFF, '-k', '17', infile]
- utils.runscript(script, args, in_dir)
-
- outfile = infile + '.keep'
- assert os.path.exists(outfile), outfile
-
- seqs = [r.sequence for r in screed.open(outfile)]
- assert len(seqs) == 2, seqs
- assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
- assert seqs[1] == 'GGTTGACGGGGCTCAGGG', seqs
-
-
-def test_normalize_by_median_paired():
- CUTOFF = '1'
-
- infile = utils.get_temp_filename('test.fa')
- in_dir = os.path.dirname(infile)
-
- shutil.copyfile(utils.get_test_data('test-abund-read-paired.fa'), infile)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-C', CUTOFF, '-p', '-k', '17', infile]
- utils.runscript(script, args, in_dir)
-
- outfile = infile + '.keep'
- assert os.path.exists(outfile), outfile
-
- seqs = [r.sequence for r in screed.open(outfile)]
- assert len(seqs) == 2, seqs
- assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
- assert seqs[1].startswith('GGTTGACGGGGCTCAGGG'), seqs
-
-
-def test_normalize_by_median_paired_fq():
- CUTOFF = '20'
-
- infile = utils.get_temp_filename('test.fa')
- in_dir = os.path.dirname(infile)
-
- shutil.copyfile(utils.get_test_data('test-abund-read-paired.fq'), infile)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-C', CUTOFF, '-p', '-k', '17', infile]
- _, out, err = utils.runscript(script, args, in_dir)
- print out
- print err
-
- outfile = infile + '.keep'
- assert os.path.exists(outfile), outfile
-
- seqs = [r.sequence for r in screed.open(outfile)]
- assert len(seqs) == 6, len(seqs)
- assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
- assert seqs[1].startswith('GGTTGACGGGGCTCAGGG'), seqs
-
- names = [r.name for r in screed.open(outfile, parse_description=False)]
- assert len(names) == 6, names
- assert '895:1:37:17593:9954 1::FOO' in names, names
- assert '895:1:37:17593:9954 2::FOO' in names, names
-
-
-def test_normalize_by_median_impaired():
- CUTOFF = '1'
-
- infile = utils.get_temp_filename('test.fa')
- in_dir = os.path.dirname(infile)
-
- shutil.copyfile(utils.get_test_data('test-abund-read-impaired.fa'), infile)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-C', CUTOFF, '-p', '-k', '17', infile]
- _, out, err = utils.runscript(script, args, in_dir, fail_ok=True)
- assert '** ERROR: Error: Improperly interleaved pairs ' in err
-
-
-def test_normalize_by_median_force():
- CUTOFF = '1'
-
- corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
- good_infile = utils.get_temp_filename('test-good.fq',
- tempdir=os.path.dirname(
- corrupt_infile))
-
- in_dir = os.path.dirname(good_infile)
-
- shutil.copyfile(utils.get_test_data('test-error-reads.fq'), corrupt_infile)
- shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]
-
- (status, out, err) = utils.runscript(script, args, in_dir)
-
- test_ht = khmer.load_counting_hash(corrupt_infile + '.ct.failed')
- test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
- test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
- assert test_ht.count(test_good_read[:17]) > 0
- assert test_ht.count(test_good_read2[:17]) > 0
- assert os.path.exists(corrupt_infile + '.ct.failed')
- assert '*** Skipping' in err
- assert '** IOErrors' in err
-
-
-def test_normalize_by_median_no_bigcount():
- infile = utils.get_temp_filename('test.fa')
- hashfile = utils.get_temp_filename('test-out.ct')
- outfile = infile + '.keep'
- in_dir = os.path.dirname(infile)
-
- shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- counting_ht = _make_counting(infile, K=8)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-C', '1000', '-k 8', '--savetable', hashfile, infile]
-
- (status, out, err) = utils.runscript(script, args, in_dir)
- assert status == 0, (out, err)
- print(out, err)
-
- assert os.path.exists(hashfile), hashfile
- kh = khmer.load_counting_hash(hashfile)
-
- assert kh.get('GGTTGACG') == 255
-
-
-def test_normalize_by_median_dumpfrequency():
- CUTOFF = '1'
-
- infiles = [utils.get_temp_filename('test-0.fq')]
- in_dir = os.path.dirname(infiles[0])
- for x in range(1, 5):
- infiles.append(utils.get_temp_filename('test-{x}.fq'.format(x=x),
- tempdir=in_dir))
-
- for infile in infiles:
- shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-d', '2', '-C', CUTOFF, '-k', '17']
- args.extend(infiles)
-
- (status, out, err) = utils.runscript(script, args, in_dir)
-
- test_ht = khmer.load_counting_hash(os.path.join(in_dir, 'backup.ct'))
- test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
- test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
- assert test_ht.count(test_good_read[:17]) > 0
- assert test_ht.count(test_good_read2[:17]) > 0
-
- assert os.path.exists(os.path.join(in_dir, 'backup.ct'))
- assert out.count('Backup: Saving') == 2
- assert 'Nothing' in out
-
-
-def test_normalize_by_median_empty():
- CUTOFF = '1'
-
- infile = utils.get_temp_filename('test.fa')
- in_dir = os.path.dirname(infile)
-
- shutil.copyfile(utils.get_test_data('test-empty.fa'), infile)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-C', CUTOFF, '-k', '17', infile]
- utils.runscript(script, args, in_dir)
-
- outfile = infile + '.keep'
- assert os.path.exists(outfile), outfile
-
-
-def test_normalize_by_median_emptycountingtable():
- CUTOFF = '1'
-
- infile = utils.get_temp_filename('test.fa')
- in_dir = os.path.dirname(infile)
-
- shutil.copyfile(utils.get_test_data('test-empty.fa'), infile)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-C', CUTOFF, '--loadtable', infile, infile]
- (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
- assert 'ValueError' in err, (status, out, err)
-
-
-def test_normalize_by_median_fpr():
- MIN_TABLESIZE_PARAM = 1
-
- infile = utils.get_temp_filename('test-fpr.fq')
- in_dir = os.path.dirname(infile)
- shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)
-
- script = scriptpath('normalize-by-median.py')
- args = ['-f', '-k 17', '-x ' + str(MIN_TABLESIZE_PARAM), infile]
-
- (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
-
- assert os.path.exists(infile + '.keep')
- assert '** ERROR: the graph structure is too small' in err, err
-
-
-def write_by_chunks(infile, outfile, CHUNKSIZE=8192):
- ifile = io.open(infile, 'rb')
- ofile = io.open(outfile, 'wb')
- chunk = ifile.read(CHUNKSIZE)
- while len(chunk) > 0:
- ofile.write(chunk)
- chunk = ifile.read(CHUNKSIZE)
- ifile.close()
- ofile.close()
-
-
-def test_normalize_by_median_stdout():
- CUTOFF = '20'
-
- infile = utils.get_test_data('100-reads.fq.gz')
- in_dir = os.path.dirname(infile)
- fifo = utils.get_temp_filename('fifo')
- outfile = utils.get_temp_filename('outfile')
-
- # Use a fifo to copy stdout to a file for checking
- os.mkfifo(fifo)
- thread = threading.Thread(target=write_by_chunks, args=(fifo, outfile))
- thread.start()
-
- # Execute diginorm
- script = scriptpath('normalize-by-median.py')
- args = ['-C', CUTOFF, '-k', '17', '-o', fifo, infile]
- (status, out, err) = utils.runscript(script, args, in_dir)
-
- # Merge the thread
- thread.join()
-
- assert os.path.exists(outfile), outfile
- with open(outfile) as fp:
- linecount = sum(1 for _ in fp)
- assert linecount == 400
-
-
def test_count_median():
infile = utils.get_temp_filename('test.fa')
outfile = infile + '.counts'
@@ -861,7 +630,7 @@ def test_count_median():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
counting_ht = _make_counting(infile, K=8)
- script = scriptpath('count-median.py')
+ script = 'count-median.py'
args = [counting_ht, infile, outfile]
utils.runscript(script, args)
@@ -881,7 +650,7 @@ def test_count_median_fq():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fq'), infile)
counting_ht = _make_counting(infile, K=8)
- script = scriptpath('count-median.py')
+ script = 'count-median.py'
args = [counting_ht, infile, outfile]
utils.runscript(script, args)
@@ -901,7 +670,7 @@ def test_count_median_fq_csv():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fq'), infile)
counting_ht = _make_counting(infile, K=8)
- script = scriptpath('count-median.py')
+ script = 'count-median.py'
args = ['--csv', counting_ht, infile, outfile]
utils.runscript(script, args)
@@ -917,12 +686,43 @@ def test_count_median_fq_csv():
names = set([line.split(',')[0] for line in data])
assert '895:1:37:17593:9954 1::FOO' in names, names
-#
-
def test_load_graph():
- script = scriptpath('load-graph.py')
- args = ['-x', '1e7', '-N', '2', '-k', '20', '-t']
+ script = 'load-graph.py'
+ args = ['-x', '1e7', '-N', '2', '-k', '20']
+
+ outfile = utils.get_temp_filename('out')
+ infile = utils.get_test_data('random-20-a.fa')
+
+ args.extend([outfile, infile])
+
+ (status, out, err) = utils.runscript(script, args)
+
+ assert 'Total number of unique k-mers: 3960' in err, err
+
+ ht_file = outfile + '.pt'
+ assert os.path.exists(ht_file), ht_file
+
+ tagset_file = outfile + '.tagset'
+ assert os.path.exists(tagset_file), tagset_file
+
+ try:
+ ht = khmer.load_hashbits(ht_file)
+ except IOError as err:
+ assert 0, str(err)
+ ht.load_tagset(tagset_file)
+
+ # check to make sure we get the expected result for this data set
+ # upon partitioning (all in one partition). This is kind of a
+ # roundabout way of checking that load-graph worked :)
+ subset = ht.do_subset_partition(0, 0)
+ x = ht.subset_count_partitions(subset)
+ assert x == (1, 0), x
+
+
+def test_oxli_build_graph():
+ script = 'oxli'
+ args = ['build-graph', '-x', '1e7', '-N', '2', '-k', '20']
outfile = utils.get_temp_filename('out')
infile = utils.get_test_data('random-20-a.fa')
@@ -951,7 +751,7 @@ def test_load_graph():
def test_load_graph_no_tags():
- script = scriptpath('load-graph.py')
+ script = 'load-graph.py'
args = ['-x', '1e7', '-N', '2', '-k', '20', '-n']
outfile = utils.get_temp_filename('out')
@@ -973,8 +773,31 @@ def test_load_graph_no_tags():
# loading the ht file...
+def test_oxli_build_graph_no_tags():
+ script = 'oxli'
+ args = ['build-graph', '-x', '1e7', '-N', '2', '-k', '20', '-n']
+
+ outfile = utils.get_temp_filename('out')
+ infile = utils.get_test_data('random-20-a.fa')
+
+ args.extend([outfile, infile])
+
+ utils.runscript(script, args)
+
+ ht_file = outfile + '.pt'
+ assert os.path.exists(ht_file), ht_file
+
+ tagset_file = outfile + '.tagset'
+ assert not os.path.exists(tagset_file), tagset_file
+
+ assert khmer.load_hashbits(ht_file)
+
+ # can't think of a good way to make sure this worked, beyond just
+ # loading the ht file...
+
+
def test_load_graph_fail():
- script = scriptpath('load-graph.py')
+ script = 'load-graph.py'
args = ['-x', '1e3', '-N', '2', '-k', '20'] # use small HT
outfile = utils.get_temp_filename('out')
@@ -987,9 +810,46 @@ def test_load_graph_fail():
assert "** ERROR: the graph structure is too small" in err
+def test_oxli_build_graph_fail():
+ script = 'oxli'
+ args = ['build-graph', '-x', '1e3', '-N', '2', '-k', '20'] # use small HT
+
+ outfile = utils.get_temp_filename('out')
+ infile = utils.get_test_data('random-20-a.fa')
+
+ args.extend([outfile, infile])
+
+ (status, out, err) = utils.runscript(script, args, fail_ok=True)
+ assert status == 1, status
+ assert "** ERROR: the graph structure is too small" in err
+
+
def test_load_graph_write_fp():
- script = scriptpath('load-graph.py')
- args = ['-x', '1e5', '-N', '2', '-k', '20', '-w'] # use small HT
+ script = 'load-graph.py'
+ args = ['-x', '1e5', '-N', '2', '-k', '20'] # use small HT
+
+ outfile = utils.get_temp_filename('out')
+ infile = utils.get_test_data('random-20-a.fa')
+
+ args.extend([outfile, infile])
+
+ (status, out, err) = utils.runscript(script, args)
+
+ ht_file = outfile + '.pt'
+ assert os.path.exists(ht_file), ht_file
+
+ info_file = outfile + '.info'
+ assert os.path.exists(info_file), info_file
+ data = [x.strip() for x in open(info_file)]
+ data = set(data)
+ assert '3959 unique k-mers' in data, data
+ assert 'false positive rate estimated to be 0.002' in data
+
+
+def test_oxli_build_graph_write_fp():
+ script = 'oxli'
+ # use small HT
+ args = ['build-graph', '-x', '1e5', '-N', '2', '-k', '20']
outfile = utils.get_temp_filename('out')
infile = utils.get_test_data('random-20-a.fa')
@@ -1010,7 +870,7 @@ def test_load_graph_write_fp():
def test_load_graph_multithread():
- script = scriptpath('load-graph.py')
+ script = 'load-graph.py'
outfile = utils.get_temp_filename('test')
infile = utils.get_test_data('test-reads.fa')
@@ -1020,11 +880,46 @@ def test_load_graph_multithread():
(status, out, err) = utils.runscript(script, args)
+def test_oxli_build_graph_multithread():
+ script = 'oxli'
+
+ outfile = utils.get_temp_filename('test')
+ infile = utils.get_test_data('test-reads.fa')
+
+ args = ['build-graph', '-N', '4', '-x', '1e7', '-T', '8', outfile, infile]
+
+ (status, out, err) = utils.runscript(script, args)
+
+
+def test_load_graph_max_memory_usage_parameter():
+ script = 'load-graph.py'
+ args = ['-M', '2e7', '-k', '20', '-n']
+
+ outfile = utils.get_temp_filename('out')
+ infile = utils.get_test_data('random-20-a.fa')
+
+ args.extend([outfile, infile])
+
+ (status, out, err) = utils.runscript(script, args)
+
+ assert 'Total number of unique k-mers: 3960' in err, err
+
+ ht_file = outfile + '.pt'
+ assert os.path.exists(ht_file), ht_file
+
+ try:
+ ht = khmer.load_hashbits(ht_file)
+ except IOError as err:
+ assert 0, str(err)
+
+ assert (sum(ht.hashsizes()) / 8.) < 2e7, ht.hashsizes()
+
+
def _make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20,
do_partition=False,
annotate_partitions=False,
stop_big_traverse=False):
- script = scriptpath('load-graph.py')
+ script = 'load-graph.py'
args = ['-x', str(min_hashsize), '-N', str(n_hashes), '-k', str(ksize)]
outfile = utils.get_temp_filename('out')
@@ -1041,13 +936,13 @@ def _make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20,
assert os.path.exists(tagset_file), tagset_file
if do_partition:
- script = scriptpath('partition-graph.py')
+ script = 'partition-graph.py'
args = [outfile]
if stop_big_traverse:
args.insert(0, '--no-big-traverse')
utils.runscript(script, args)
- script = scriptpath('merge-partitions.py')
+ script = 'merge-partitions.py'
args = [outfile, '-k', str(ksize)]
utils.runscript(script, args)
@@ -1055,7 +950,7 @@ def _make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20,
assert os.path.exists(final_pmap_file)
if annotate_partitions:
- script = scriptpath('annotate-partitions.py')
+ script = 'annotate-partitions.py'
args = ["-k", str(ksize), outfile, infilename]
in_dir = os.path.dirname(outfile)
@@ -1071,7 +966,7 @@ def _DEBUG_make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20,
do_partition=False,
annotate_partitions=False,
stop_big_traverse=False):
- script = scriptpath('load-graph.py')
+ script = 'load-graph.py'
args = ['-x', str(min_hashsize), '-N', str(n_hashes), '-k', str(ksize)]
outfile = utils.get_temp_filename('out')
@@ -1088,15 +983,15 @@ def _DEBUG_make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20,
assert os.path.exists(tagset_file), tagset_file
if do_partition:
- print ">>>> DEBUG: Partitioning <<<"
- script = scriptpath('partition-graph.py')
+ print(">>>> DEBUG: Partitioning <<<")
+ script = 'partition-graph.py'
args = [outfile]
if stop_big_traverse:
args.insert(0, '--no-big-traverse')
utils.runscript(script, args)
- print ">>>> DEBUG: Merging Partitions <<<"
- script = scriptpath('merge-partitions.py')
+ print(">>>> DEBUG: Merging Partitions <<<")
+ script = 'merge-partitions.py'
args = [outfile, '-k', str(ksize)]
utils.runscript(script, args)
@@ -1104,8 +999,8 @@ def _DEBUG_make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20,
assert os.path.exists(final_pmap_file)
if annotate_partitions:
- print ">>>> DEBUG: Annotating Partitions <<<"
- script = scriptpath('annotate-partitions.py')
+ print(">>>> DEBUG: Annotating Partitions <<<")
+ script = 'annotate-partitions.py'
args = ["-k", str(ksize), outfile, infilename]
in_dir = os.path.dirname(outfile)
@@ -1120,12 +1015,12 @@ def _DEBUG_make_graph(infilename, min_hashsize=1e7, n_hashes=2, ksize=20,
def test_partition_graph_1():
graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))
- script = scriptpath('partition-graph.py')
+ script = 'partition-graph.py'
args = [graphbase]
utils.runscript(script, args)
- script = scriptpath('merge-partitions.py')
+ script = 'merge-partitions.py'
args = [graphbase, '-k', str(20)]
utils.runscript(script, args)
@@ -1144,12 +1039,12 @@ def test_partition_graph_nojoin_k21():
# test with K=21
graphbase = _make_graph(utils.get_test_data('random-20-a.fa'), ksize=21)
- script = scriptpath('partition-graph.py')
+ script = 'partition-graph.py'
args = [graphbase]
utils.runscript(script, args)
- script = scriptpath('merge-partitions.py')
+ script = 'merge-partitions.py'
args = [graphbase, '-k', str(21)]
utils.runscript(script, args)
@@ -1176,12 +1071,12 @@ def test_partition_graph_nojoin_stoptags():
del ht
# run script with stoptags option
- script = scriptpath('partition-graph.py')
+ script = 'partition-graph.py'
args = ['--stoptags', stoptags_file, graphbase]
utils.runscript(script, args)
- script = scriptpath('merge-partitions.py')
+ script = 'merge-partitions.py'
args = [graphbase, '-k', str(20)]
utils.runscript(script, args)
@@ -1227,6 +1122,22 @@ def test_partition_graph_no_big_traverse():
assert x[0] == 4, x # should be four partitions, broken at knot.
+def test_partition_find_knots_execute():
+ graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))
+
+ script = 'partition-graph.py'
+ args = [graphbase]
+
+ utils.runscript(script, args)
+
+ script = 'find-knots.py'
+ args = [graphbase]
+ utils.runscript(script, args)
+
+ stoptags_file = graphbase + '.stoptags'
+ assert os.path.exists(stoptags_file)
+
+
def test_annotate_partitions():
seqfile = utils.get_test_data('random-20-a.fa')
graphbase = _make_graph(seqfile, do_partition=True)
@@ -1236,7 +1147,7 @@ def test_annotate_partitions():
final_pmap_file = graphbase + '.pmap.merged'
assert os.path.exists(final_pmap_file)
- script = scriptpath('annotate-partitions.py')
+ script = 'annotate-partitions.py'
args = ["-k", "20", graphbase, seqfile]
utils.runscript(script, args, in_dir)
@@ -1259,7 +1170,7 @@ def test_annotate_partitions_2():
final_pmap_file = graphbase + '.pmap.merged'
assert os.path.exists(final_pmap_file)
- script = scriptpath('annotate-partitions.py')
+ script = 'annotate-partitions.py'
args = ["-k", "21", graphbase, seqfile]
utils.runscript(script, args, in_dir)
@@ -1267,7 +1178,7 @@ def test_annotate_partitions_2():
parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
parts = set(parts)
- print parts
+ print(parts)
assert len(parts) == 99, len(parts)
@@ -1281,7 +1192,7 @@ def test_extract_partitions():
partfile = os.path.join(in_dir, 'random-20-a.fa.part')
# ok, now run extract-partitions.
- script = scriptpath('extract-partitions.py')
+ script = 'extract-partitions.py'
args = ['extracted', partfile]
utils.runscript(script, args, in_dir)
@@ -1310,7 +1221,7 @@ def test_extract_partitions_header_whitespace():
partfile = os.path.join(in_dir, 'test-overlap2.fa.part')
# ok, now run extract-partitions.
- script = scriptpath('extract-partitions.py')
+ script = 'extract-partitions.py'
args = ['extracted', partfile]
utils.runscript(script, args, in_dir)
@@ -1321,13 +1232,13 @@ def test_extract_partitions_header_whitespace():
assert os.path.exists(groupfile)
dist = open(distfile).readline()
- assert dist.strip() == '1 11957 11957 11957'
+ assert dist.strip() == '1 11960 11960 11960', dist.strip()
parts = [r.name.split('\t')[1]
for r in screed.open(partfile, parse_description=False)]
assert len(parts) == 13538, len(parts)
parts = set(parts)
- assert len(parts) == 12601, len(parts)
+ assert len(parts) == 12602, len(parts)
def test_extract_partitions_fq():
@@ -1340,7 +1251,7 @@ def test_extract_partitions_fq():
partfile = os.path.join(in_dir, 'random-20-a.fq.part')
# ok, now run extract-partitions.
- script = scriptpath('extract-partitions.py')
+ script = 'extract-partitions.py'
args = ['extracted', partfile]
utils.runscript(script, args, in_dir)
@@ -1380,7 +1291,7 @@ def test_extract_partitions_output_unassigned():
partfile = os.path.join(in_dir, 'random-20-a.fa.part')
# ok, now run extract-partitions.
- script = scriptpath('extract-partitions.py')
+ script = 'extract-partitions.py'
args = ['-U', 'extracted', partfile]
utils.runscript(script, args, in_dir)
@@ -1411,7 +1322,7 @@ def test_extract_partitions_no_output_groups():
partfile = os.path.join(in_dir, 'random-20-a.fq.part')
# ok, now run extract-partitions.
- script = scriptpath('extract-partitions.py')
+ script = 'extract-partitions.py'
args = ['-n', 'extracted', partfile]
# We expect a sys.exit -> we need the test to be tolerant
@@ -1430,7 +1341,7 @@ def test_extract_partitions_pid_0():
in_dir = os.path.dirname(partfile)
# ok, now run extract-partitions.
- script = scriptpath('extract-partitions.py')
+ script = 'extract-partitions.py'
args = ['-U', 'extracted', partfile]
utils.runscript(script, args, in_dir)
@@ -1455,7 +1366,7 @@ def test_extract_partitions_multi_groups():
in_dir = os.path.dirname(partfile)
# ok, now run extract-partitions.
- script = scriptpath('extract-partitions.py')
+ script = 'extract-partitions.py'
args = ['-m', '1', '-X', '1', 'extracted', partfile]
utils.runscript(script, args, in_dir)
@@ -1477,7 +1388,7 @@ def test_extract_partitions_no_groups():
in_dir = os.path.dirname(empty_file)
# ok, now run extract-partitions.
- script = scriptpath('extract-partitions.py')
+ script = 'extract-partitions.py'
args = ['extracted', empty_file]
_, _, err = utils.runscript(script, args, in_dir, fail_ok=True)
@@ -1497,27 +1408,27 @@ def test_abundance_dist():
htfile = _make_counting(infile, K=17)
- script = scriptpath('abundance-dist.py')
+ script = 'abundance-dist.py'
args = ['-z', htfile, infile, outfile]
utils.runscript(script, args, in_dir)
- fp = iter(open(outfile))
- line = fp.next().strip()
- assert line == '1 96 96 0.98', line
- line = fp.next().strip()
- assert line == '1001 2 98 1.0', line
+ with open(outfile) as fp:
+ line = fp.readline().strip()
+ assert line == '1 96 96 0.98', line
+ line = fp.readline().strip()
+ assert line == '1001 2 98 1.0', line
os.remove(outfile)
args = ['-z', '--csv', htfile, infile, outfile]
utils.runscript(script, args, in_dir)
- fp = iter(open(outfile))
- line = fp.next().strip()
- assert (line == 'abundance,count,cumulative,cumulative_fraction'), line
- line = fp.next().strip()
- assert line == '1,96,96,0.98', line
- line = fp.next().strip()
- assert line == '1001,2,98,1.0', line
+ with open(outfile) as fp:
+ line = fp.readline().strip()
+ assert (line == 'abundance,count,cumulative,cumulative_fraction'), line
+ line = fp.readline().strip()
+ assert line == '1,96,96,0.98', line
+ line = fp.readline().strip()
+ assert line == '1001,2,98,1.0', line
def test_abundance_dist_nobigcount():
@@ -1527,17 +1438,17 @@ def test_abundance_dist_nobigcount():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- htfile = _make_counting(infile, K=17, BIGCOUNT=False)
+ htfile = _make_counting(infile, K=17)
- script = scriptpath('abundance-dist.py')
- args = ['-z', htfile, infile, outfile]
+ script = 'abundance-dist.py'
+ args = ['-b', '-z', htfile, infile, outfile]
utils.runscript(script, args, in_dir)
- fp = iter(open(outfile))
- line = fp.next().strip()
- assert line == '1 96 96 0.98', line
- line = fp.next().strip()
- assert line == '255 2 98 1.0', line
+ with open(outfile) as fp:
+ line = fp.readline().strip()
+ assert line == '1 96 96 0.98', line
+ line = fp.readline().strip()
+ assert line == '255 2 98 1.0', line
def test_abundance_dist_single():
@@ -1547,18 +1458,18 @@ def test_abundance_dist_single():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- script = scriptpath('abundance-dist-single.py')
+ script = 'abundance-dist-single.py'
args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-t', infile,
outfile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert 'Total number of unique k-mers: 98' in err, err
- fp = iter(open(outfile))
- line = fp.next().strip()
- assert line == '1 96 96 0.98', line
- line = fp.next().strip()
- assert line == '1001 2 98 1.0', line
+ with open(outfile) as fp:
+ line = fp.readline().strip()
+ assert line == '1 96 96 0.98', line
+ line = fp.readline().strip()
+ assert line == '1001 2 98 1.0', line
def test_abundance_dist_threaded():
@@ -1568,18 +1479,18 @@ def test_abundance_dist_threaded():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- script = scriptpath('abundance-dist-single.py')
+ script = 'abundance-dist-single.py'
args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-t', '--threads', '18',
infile, outfile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert 'Total number of unique k-mers: 98' in err, err
- fp = iter(open(outfile))
- line = fp.next().strip()
- assert line == '1 96 96 0.98', line
- line = fp.next().strip()
- assert line == '1001 2 98 1.0', line
+ with open(outfile) as fp:
+ line = fp.readline().strip()
+ assert line == '1 96 96 0.98', line
+ line = fp.readline().strip()
+ assert line == '1001 2 98 1.0', line
def test_abundance_dist_single_csv():
@@ -1589,18 +1500,18 @@ def test_abundance_dist_single_csv():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- script = scriptpath('abundance-dist-single.py')
+ script = 'abundance-dist-single.py'
args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '--csv', infile,
outfile]
(status, out, err) = utils.runscript(script, args, in_dir)
- fp = iter(open(outfile))
- line = fp.next().strip()
- assert (line == 'abundance,count,cumulative,cumulative_fraction'), line
- line = fp.next().strip()
- assert line == '1,96,96,0.98', line
- line = fp.next().strip()
- assert line == '1001,2,98,1.0', line
+ with open(outfile) as fp:
+ line = fp.readline().strip()
+ assert (line == 'abundance,count,cumulative,cumulative_fraction'), line
+ line = fp.readline().strip()
+ assert line == '1,96,96,0.98', line
+ line = fp.readline().strip()
+ assert line == '1001,2,98,1.0', line
def test_abundance_dist_single_nobigcount():
@@ -1610,15 +1521,15 @@ def test_abundance_dist_single_nobigcount():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- script = scriptpath('abundance-dist-single.py')
+ script = 'abundance-dist-single.py'
args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-b', infile, outfile]
utils.runscript(script, args, in_dir)
- fp = iter(open(outfile))
- line = fp.next().strip()
- assert line == '1 96 96 0.98', line
- line = fp.next().strip()
- assert line == '255 2 98 1.0', line
+ with open(outfile) as fp:
+ line = fp.readline().strip()
+ assert line == '1 96 96 0.98', line
+ line = fp.readline().strip()
+ assert line == '255 2 98 1.0', line
def test_abundance_dist_single_nosquash():
@@ -1628,15 +1539,15 @@ def test_abundance_dist_single_nosquash():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- script = scriptpath('abundance-dist-single.py')
+ script = 'abundance-dist-single.py'
args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-t', infile, outfile]
utils.runscript(script, args, in_dir)
- fp = iter(open(outfile))
- line = fp.next().strip()
- assert line == '1 96 96 0.98', line
- line = fp.next().strip()
- assert line == '1001 2 98 1.0', line
+ with open(outfile) as fp:
+ line = fp.readline().strip()
+ assert line == '1 96 96 0.98', line
+ line = fp.readline().strip()
+ assert line == '1001 2 98 1.0', line
def test_abundance_dist_single_savetable():
@@ -1647,16 +1558,16 @@ def test_abundance_dist_single_savetable():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- script = scriptpath('abundance-dist-single.py')
+ script = 'abundance-dist-single.py'
args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-t', '--savetable',
tabfile, infile, outfile]
utils.runscript(script, args, in_dir)
- fp = iter(open(outfile))
- line = fp.next().strip()
- assert line == '1 96 96 0.98', line
- line = fp.next().strip()
- assert line == '1001 2 98 1.0', line
+ with open(outfile) as fp:
+ line = fp.readline().strip()
+ assert line == '1 96 96 0.98', line
+ line = fp.readline().strip()
+ assert line == '1001 2 98 1.0', line
def test_do_partition():
@@ -1664,7 +1575,7 @@ def test_do_partition():
graphbase = utils.get_temp_filename('out')
in_dir = os.path.dirname(graphbase)
- script = scriptpath('do-partition.py')
+ script = 'do-partition.py'
args = ["-k", "20", graphbase, seqfile]
utils.runscript(script, args, in_dir)
@@ -1683,7 +1594,7 @@ def test_do_partition_2():
graphbase = utils.get_temp_filename('out')
in_dir = os.path.dirname(graphbase)
- script = scriptpath('do-partition.py')
+ script = 'do-partition.py'
args = ["-k", "21", graphbase, seqfile]
utils.runscript(script, args, in_dir)
@@ -1702,7 +1613,7 @@ def test_do_partition_2_fq():
graphbase = utils.get_temp_filename('out')
in_dir = os.path.dirname(graphbase)
- script = scriptpath('do-partition.py')
+ script = 'do-partition.py'
args = ["-k", "21", graphbase, seqfile]
utils.runscript(script, args, in_dir)
@@ -1715,6 +1626,30 @@ def test_do_partition_2_fq():
assert '46 1::FIZ' in names
+def test_interleave_read_seq1_fq():
+ # create input files
+ infile1 = utils.get_test_data('paired-slash1.fq.1')
+ infile2 = utils.get_test_data('paired-slash1.fq.2')
+
+ # correct output
+ ex_outfile = utils.get_test_data('paired-slash1.fq')
+
+ # actual output file
+ outfile = utils.get_temp_filename('out.fq')
+
+ script = 'interleave-reads.py'
+ args = [infile1, infile2, '-o', outfile]
+
+ utils.runscript(script, args)
+
+ n = 0
+ for r, q in zip(screed.open(ex_outfile), screed.open(outfile)):
+ n += 1
+ assert r.name == q.name
+ assert r.sequence == q.sequence
+ assert n > 0
+
+
def test_interleave_reads_1_fq():
# test input files
infile1 = utils.get_test_data('paired.fq.1')
@@ -1726,7 +1661,7 @@ def test_interleave_reads_1_fq():
# actual output file
outfile = utils.get_temp_filename('out.fq')
- script = scriptpath('interleave-reads.py')
+ script = 'interleave-reads.py'
args = [infile1, infile2, '-o', outfile]
utils.runscript(script, args)
@@ -1745,7 +1680,7 @@ def test_interleave_reads_broken_fq():
# actual output file
outfile = utils.get_temp_filename('out.fq')
- script = scriptpath('interleave-reads.py')
+ script = 'interleave-reads.py'
args = [infile1, infile2, '-o', outfile]
status, out, err = utils.runscript(script, args, fail_ok=True)
@@ -1761,7 +1696,7 @@ def test_interleave_reads_broken_fq_2():
# actual output file
outfile = utils.get_temp_filename('out.fq')
- script = scriptpath('interleave-reads.py')
+ script = 'interleave-reads.py'
args = [infile1, infile2, '-o', outfile]
status, out, err = utils.runscript(script, args, fail_ok=True)
@@ -1777,7 +1712,7 @@ def test_interleave_reads_broken_fq_3():
# actual output file
outfile = utils.get_temp_filename('out.fq')
- script = scriptpath('interleave-reads.py')
+ script = 'interleave-reads.py'
args = [infile1, infile2, '-o', outfile]
status, out, err = utils.runscript(script, args, fail_ok=True)
@@ -1792,7 +1727,7 @@ def test_interleave_reads_broken_fq_4():
# actual output file
outfile = utils.get_temp_filename('out.fq')
- script = scriptpath('interleave-reads.py')
+ script = 'interleave-reads.py'
args = [infile1, '-o', outfile]
status, out, err = utils.runscript(script, args, fail_ok=True)
@@ -1811,7 +1746,7 @@ def test_interleave_reads_2_fa():
# actual output file
outfile = utils.get_temp_filename('out.fa')
- script = scriptpath('interleave-reads.py')
+ script = 'interleave-reads.py'
args = [infile1, infile2, '-o', outfile]
utils.runscript(script, args)
@@ -1833,8 +1768,8 @@ def test_make_initial_stoptags():
shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile)
in_dir = os.path.dirname(bzinfile)
- genscript = scriptpath('load-graph.py')
- genscriptargs = ['-t', 'test-reads', 'test-reads.fq.bz2']
+ genscript = 'load-graph.py'
+ genscriptargs = ['test-reads', 'test-reads.fq.bz2']
utils.runscript(genscript, genscriptargs, in_dir)
# test input file gen'd by load-graphs
@@ -1847,7 +1782,7 @@ def test_make_initial_stoptags():
# actual output file
outfile1 = utils.get_temp_filename('test-reads.stoptags', in_dir)
- script = scriptpath('make-initial-stoptags.py')
+ script = 'make-initial-stoptags.py'
# make-initial-stoptags has weird file argument syntax
# read the code before modifying
args = ['test-reads']
@@ -1868,7 +1803,7 @@ def test_extract_paired_reads_1_fa():
in_dir = os.path.dirname(outfile1)
outfile2 = utils.get_temp_filename('paired-mixed.fa.se', in_dir)
- script = scriptpath('extract-paired-reads.py')
+ script = 'extract-paired-reads.py'
args = [infile]
utils.runscript(script, args, in_dir)
@@ -1903,7 +1838,7 @@ def test_extract_paired_reads_2_fq():
in_dir = os.path.dirname(outfile1)
outfile2 = utils.get_temp_filename('paired-mixed.fq.se', in_dir)
- script = scriptpath('extract-paired-reads.py')
+ script = 'extract-paired-reads.py'
args = [infile]
utils.runscript(script, args, in_dir)
@@ -1930,6 +1865,145 @@ def test_extract_paired_reads_2_fq():
assert n > 0
+def test_extract_paired_reads_3_output_dir():
+ # test input file
+ infile = utils.get_test_data('paired-mixed.fa')
+
+ ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
+ ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')
+
+ # output directory
+ out_dir = utils.get_temp_filename('output')
+
+ script = 'extract-paired-reads.py'
+ args = [infile, '-o', out_dir]
+
+ utils.runscript(script, args)
+
+ outfile1 = os.path.join(out_dir, 'paired-mixed.fa.pe')
+ outfile2 = os.path.join(out_dir, 'paired-mixed.fa.se')
+ assert os.path.exists(outfile1), outfile1
+ assert os.path.exists(outfile2), outfile2
+
+ n = 0
+ for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
+ n += 1
+ assert r.name == q.name
+ assert r.sequence == q.sequence
+ assert n > 0
+
+ n = 0
+ for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
+ n += 1
+ assert r.name == q.name
+ assert r.sequence == q.sequence
+ assert n > 0
+
+
+def test_extract_paired_reads_4_output_files():
+ # test input file
+ infile = utils.get_test_data('paired-mixed.fa')
+
+ ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
+ ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')
+
+ # actual output files...
+ outfile1 = utils.get_temp_filename('out_pe')
+ outfile2 = utils.get_temp_filename('out_se')
+
+ script = 'extract-paired-reads.py'
+ args = [infile, '-p', outfile1, '-s', outfile2]
+
+ utils.runscript(script, args)
+
+ assert os.path.exists(outfile1), outfile1
+ assert os.path.exists(outfile2), outfile2
+
+ n = 0
+ for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
+ n += 1
+ assert r.name == q.name
+ assert r.sequence == q.sequence
+ assert n > 0
+
+ n = 0
+ for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
+ n += 1
+ assert r.name == q.name
+ assert r.sequence == q.sequence
+ assert n > 0
+
+
+def test_extract_paired_reads_5_stdin_error():
+ script = 'extract-paired-reads.py'
+ args = ['-f', '/dev/stdin']
+
+ status, out, err = utils.runscript(script, args, fail_ok=True)
+ assert status == 1
+ assert "output filenames must be provided." in err
+
+
+def execute_extract_paired_streaming(ifilename):
+ fifo = utils.get_temp_filename('fifo')
+ in_dir = os.path.dirname(fifo)
+ outfile1 = utils.get_temp_filename('paired.pe')
+ outfile2 = utils.get_temp_filename('paired.se')
+ script = 'extract-paired-reads.py'
+ args = [fifo, '-p', outfile1, '-s', outfile2]
+
+ # make a fifo to simulate streaming
+ os.mkfifo(fifo)
+
+ thread = threading.Thread(target=utils.runscript,
+ args=(script, args, in_dir))
+ thread.start()
+ ifile = open(ifilename, 'r')
+ fifofile = open(fifo, 'w')
+ chunk = ifile.read(4)
+ while len(chunk) > 0:
+ fifofile.write(chunk)
+ chunk = ifile.read(4)
+ fifofile.close()
+ thread.join()
+ assert os.path.exists(outfile1), outfile1
+ assert os.path.exists(outfile2), outfile2
+
+
+def test_extract_paired_streaming():
+ testinput = utils.get_test_data('paired-mixed.fa')
+ o = execute_extract_paired_streaming(testinput)
+
+
+def execute_split_paired_streaming(ifilename):
+ fifo = utils.get_temp_filename('fifo')
+ in_dir = os.path.dirname(fifo)
+ outfile1 = utils.get_temp_filename('paired-1.fa')
+ outfile2 = utils.get_temp_filename('paired-2.fa')
+ script = 'split-paired-reads.py'
+ args = [fifo, '-1', outfile1, '-2', outfile2]
+
+ # make a fifo to simulate streaming
+ os.mkfifo(fifo)
+
+ thread = threading.Thread(target=utils.runscript,
+ args=(script, args, in_dir))
+ thread.start()
+ ifile = open(ifilename, 'r')
+ fifofile = open(fifo, 'w')
+ chunk = ifile.read(4)
+ while len(chunk) > 0:
+ fifofile.write(chunk)
+ chunk = ifile.read(4)
+ fifofile.close()
+ thread.join()
+ assert os.path.exists(outfile1), outfile1
+ assert os.path.exists(outfile2), outfile2
+
+
+def test_split_paired_streaming():
+ o = execute_split_paired_streaming(utils.get_test_data('paired.fa'))
+
+
def test_split_paired_reads_1_fa():
# test input file
infile = utils.get_test_data('paired.fa')
@@ -1942,7 +2016,7 @@ def test_split_paired_reads_1_fa():
in_dir = os.path.dirname(outfile1)
outfile2 = utils.get_temp_filename('paired.fa.2', in_dir)
- script = scriptpath('split-paired-reads.py')
+ script = 'split-paired-reads.py'
args = [infile]
utils.runscript(script, args, in_dir)
@@ -1977,7 +2051,7 @@ def test_split_paired_reads_2_fq():
in_dir = os.path.dirname(outfile1)
outfile2 = utils.get_temp_filename('paired.fq.2', in_dir)
- script = scriptpath('split-paired-reads.py')
+ script = 'split-paired-reads.py'
args = [infile]
utils.runscript(script, args, in_dir)
@@ -2008,7 +2082,7 @@ def test_split_paired_reads_2_mixed_fq_require_pair():
shutil.copyfile(utils.get_test_data('paired-mixed.fq'), infile)
in_dir = os.path.dirname(infile)
- script = scriptpath('split-paired-reads.py')
+ script = 'split-paired-reads.py'
args = ['-p', infile]
status, out, err = utils.runscript(script, args, in_dir, fail_ok=True)
@@ -2022,7 +2096,7 @@ def test_split_paired_reads_2_mixed_fq():
shutil.copyfile(utils.get_test_data('paired-mixed-2.fq'), infile)
in_dir = os.path.dirname(infile)
- script = scriptpath('split-paired-reads.py')
+ script = 'split-paired-reads.py'
args = [infile]
status, out, err = utils.runscript(script, args, in_dir)
@@ -2036,7 +2110,7 @@ def test_split_paired_reads_2_mixed_fq_broken_pairing_format():
shutil.copyfile(utils.get_test_data('paired-mixed-broken.fq'), infile)
in_dir = os.path.dirname(infile)
- script = scriptpath('split-paired-reads.py')
+ script = 'split-paired-reads.py'
args = [infile]
status, out, err = utils.runscript(script, args, in_dir, fail_ok=True)
@@ -2056,7 +2130,7 @@ def test_split_paired_reads_3_output_dir():
output_dir = os.path.dirname(outfile1)
outfile2 = utils.get_temp_filename('paired.fq.2', output_dir)
- script = scriptpath('split-paired-reads.py')
+ script = 'split-paired-reads.py'
args = ['--output-dir', output_dir, infile]
utils.runscript(script, args)
@@ -2093,7 +2167,7 @@ def test_split_paired_reads_3_output_files():
output_dir = os.path.dirname(outfile1)
outfile2 = utils.get_temp_filename('yyy', output_dir)
- script = scriptpath('split-paired-reads.py')
+ script = 'split-paired-reads.py'
args = ['-1', outfile1, '-2', outfile2, infile]
utils.runscript(script, args)
@@ -2130,7 +2204,7 @@ def test_split_paired_reads_3_output_files_left():
output_dir = os.path.dirname(outfile1)
outfile2 = utils.get_temp_filename('paired.fq.2', output_dir)
- script = scriptpath('split-paired-reads.py')
+ script = 'split-paired-reads.py'
args = ['-o', output_dir, '-1', outfile1, infile]
utils.runscript(script, args)
@@ -2167,7 +2241,7 @@ def test_split_paired_reads_3_output_files_right():
output_dir = os.path.dirname(outfile1)
outfile2 = utils.get_temp_filename('yyy', output_dir)
- script = scriptpath('split-paired-reads.py')
+ script = 'split-paired-reads.py'
args = ['-2', outfile2, '-o', output_dir, infile]
utils.runscript(script, args)
@@ -2198,7 +2272,7 @@ def test_sample_reads_randomly():
shutil.copyfile(utils.get_test_data('test-reads.fa'), infile)
- script = scriptpath('sample-reads-randomly.py')
+ script = 'sample-reads-randomly.py'
# fix random number seed for reproducibility
args = ['-N', '10', '-M', '12000', '-R', '1']
args.append(infile)
@@ -2208,18 +2282,32 @@ def test_sample_reads_randomly():
assert os.path.exists(outfile), outfile
seqs = set([r.name for r in screed.open(outfile)])
- print list(sorted(seqs))
-
- assert seqs == set(['850:2:1:1859:11742/1', '850:2:1:1859:11742/2',
- '850:2:1:2131:17360/1', '850:2:1:2131:17360/2',
- '850:2:1:2416:7565/1', '850:2:1:2416:7565/2',
- '850:2:1:2490:13491/1', '850:2:1:2490:13491/2',
- '850:2:1:2962:3999/1', '850:2:1:2962:3999/2',
- '850:2:1:3096:20321/1', '850:2:1:3096:20321/2',
- '850:2:1:3164:6414/1', '850:2:1:3164:6414/2',
- '850:2:1:3206:13876/1', '850:2:1:3206:13876/2',
- '850:2:1:3631:20919/1', '850:2:1:3631:20919/2',
- '850:2:1:3655:15581/1', '850:2:1:3655:15581/2'])
+ print(list(sorted(seqs)))
+
+ if sys.version_info.major == 2:
+ answer = {'850:2:1:1859:11742/1', '850:2:1:1859:11742/2',
+ '850:2:1:2131:17360/1', '850:2:1:2131:17360/2',
+ '850:2:1:2416:7565/1', '850:2:1:2416:7565/2',
+ '850:2:1:2490:13491/1', '850:2:1:2490:13491/2',
+ '850:2:1:2962:3999/1', '850:2:1:2962:3999/2',
+ '850:2:1:3096:20321/1', '850:2:1:3096:20321/2',
+ '850:2:1:3164:6414/1', '850:2:1:3164:6414/2',
+ '850:2:1:3206:13876/1', '850:2:1:3206:13876/2',
+ '850:2:1:3631:20919/1', '850:2:1:3631:20919/2',
+ '850:2:1:3655:15581/1', '850:2:1:3655:15581/2'}
+ else:
+ answer = {'850:2:1:1257:3404/1', '850:2:1:1257:3404/2',
+ '850:2:1:1362:19357/1', '850:2:1:1362:19357/2',
+ '850:2:1:1396:5659/1', '850:2:1:1396:5659/2',
+ '850:2:1:2063:11124/1', '850:2:1:2063:11124/2',
+ '850:2:1:2121:12070/1', '850:2:1:2121:12070/2',
+ '850:2:1:2528:15779/1', '850:2:1:2528:15779/2',
+ '850:2:1:2581:12886/1', '850:2:1:2581:12886/2',
+ '850:2:1:2864:8505/1', '850:2:1:2864:8505/2',
+ '850:2:1:3000:2015/1', '850:2:1:3000:2015/2',
+ '850:2:1:3302:5025/1', '850:2:1:3302:5025/2'}
+
+ assert seqs == answer
def test_sample_reads_randomly_force_single():
@@ -2228,7 +2316,7 @@ def test_sample_reads_randomly_force_single():
shutil.copyfile(utils.get_test_data('test-reads.fa'), infile)
- script = scriptpath('sample-reads-randomly.py')
+ script = 'sample-reads-randomly.py'
# fix random number seed for reproducibility
args = ['-N', '10', '-M', '12000', '-R', '1', '--force_single']
args.append(infile)
@@ -2238,17 +2326,32 @@ def test_sample_reads_randomly_force_single():
assert os.path.exists(outfile), outfile
seqs = set([r.name for r in screed.open(outfile)])
- print list(sorted(seqs))
- assert seqs == set(['850:2:1:2399:20086/2',
- '850:2:1:2273:13309/1',
- '850:2:1:2065:16816/1',
- '850:2:1:1984:7162/2',
- '850:2:1:2691:14602/1',
- '850:2:1:1762:5439/1',
- '850:2:1:2503:4494/2',
- '850:2:1:2263:11143/2',
- '850:2:1:1792:15774/2',
- '850:2:1:2084:17145/1'])
+ print(list(sorted(seqs)))
+
+ if sys.version_info.major == 2:
+ answer = {'850:2:1:2399:20086/2',
+ '850:2:1:2273:13309/1',
+ '850:2:1:2065:16816/1',
+ '850:2:1:1984:7162/2',
+ '850:2:1:2691:14602/1',
+ '850:2:1:1762:5439/1',
+ '850:2:1:2503:4494/2',
+ '850:2:1:2263:11143/2',
+ '850:2:1:1792:15774/2',
+ '850:2:1:2084:17145/1'}
+ else:
+ answer = {'850:2:1:1199:4197/1',
+ '850:2:1:1251:16575/2',
+ '850:2:1:1267:6790/2',
+ '850:2:1:1601:4443/1',
+ '850:2:1:1625:19325/1',
+ '850:2:1:1832:14607/2',
+ '850:2:1:1946:20852/2',
+ '850:2:1:2401:4896/2',
+ '850:2:1:2562:1308/1',
+ '850:2:1:3123:15968/2'}
+
+ assert seqs == answer
def test_sample_reads_randomly_fq():
@@ -2257,7 +2360,7 @@ def test_sample_reads_randomly_fq():
shutil.copyfile(utils.get_test_data('test-reads.fq.gz'), infile)
- script = scriptpath('sample-reads-randomly.py')
+ script = 'sample-reads-randomly.py'
# fix random number seed for reproducibility
args = ['-N', '10', '-M', '12000', '-R', '1']
args.append(infile)
@@ -2266,25 +2369,38 @@ def test_sample_reads_randomly_fq():
outfile = infile + '.subset'
assert os.path.exists(outfile), outfile
+ if sys.version_info.major == 2:
+ answer = {'850:2:1:2399:20086/2',
+ '850:2:1:1762:5439 1::FOO',
+ '850:2:1:2065:16816/1',
+ '850:2:1:2263:11143/2',
+ '850:2:1:1792:15774/2',
+ '850:2:1:2691:14602/1',
+ '850:2:1:2503:4494 1::FOO',
+ '850:2:1:2084:17145/1',
+ '850:2:1:1984:7162 1::FOO',
+ '850:2:1:2273:13309 1::FOO'}
+ else:
+ answer = {'850:2:1:1199:4197 1::FOO',
+ '850:2:1:1251:16575/2',
+ '850:2:1:1267:6790/2',
+ '850:2:1:1601:4443 1::FOO',
+ '850:2:1:1625:1932 1::FOO1',
+ '850:2:1:1832:14607 1::FOO',
+ '850:2:1:1946:20852 1::FOO',
+ '850:2:1:2401:4896/2',
+ '850:2:1:2562:1308/1',
+ '850:2:1:3123:15968/2'}
+
seqs = set([r.name for r in screed.open(outfile,
parse_description=False)])
-
- print list(sorted(seqs))
- assert seqs == set(['850:2:1:2399:20086/2',
- '850:2:1:1762:5439 1::FOO',
- '850:2:1:2065:16816/1',
- '850:2:1:2263:11143/2',
- '850:2:1:1792:15774/2',
- '850:2:1:2691:14602/1',
- '850:2:1:2503:4494 1::FOO',
- '850:2:1:2084:17145/1',
- '850:2:1:1984:7162 1::FOO',
- '850:2:1:2273:13309 1::FOO'])
+ print(list(sorted(seqs)))
+ assert seqs == answer
def test_fastq_to_fasta():
- script = scriptpath('fastq-to-fasta.py')
+ script = 'fastq-to-fasta.py'
clean_infile = utils.get_temp_filename('test-clean.fq')
n_infile = utils.get_temp_filename('test-n.fq')
@@ -2334,7 +2450,7 @@ def test_fastq_to_fasta():
def test_extract_long_sequences_fa():
- script = scriptpath('extract-long-sequences.py')
+ script = 'extract-long-sequences.py'
fa_infile = utils.get_temp_filename('test.fa')
shutil.copyfile(utils.get_test_data('paired-mixed.fa'), fa_infile)
@@ -2356,7 +2472,7 @@ def test_extract_long_sequences_fa():
def test_extract_long_sequences_fq():
- script = scriptpath('extract-long-sequences.py')
+ script = 'extract-long-sequences.py'
fq_infile = utils.get_temp_filename('test.fq')
shutil.copyfile(utils.get_test_data('paired-mixed.fq'), fq_infile)
@@ -2382,7 +2498,7 @@ def test_sample_reads_randomly_S():
shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)
- script = scriptpath('sample-reads-randomly.py')
+ script = 'sample-reads-randomly.py'
# fix random number seed for reproducibility
args = ['-N', '10', '-R', '1', '-S', '3']
@@ -2400,38 +2516,63 @@ def test_sample_reads_randomly_S():
outfile = infile + '.subset.0'
assert os.path.exists(outfile), outfile
- seqs = set([r.name for r in screed.open(outfile)])
- print list(sorted(seqs))
-
- assert seqs == set(['895:1:1:1303:14389', '895:1:1:1347:3237',
- '895:1:1:1295:6189', '895:1:1:1308:20421',
- '895:1:1:1320:11648', '895:1:1:1352:5369',
- '895:1:1:1318:10532', '895:1:1:1363:11839',
- '895:1:1:1355:13535', '895:1:1:1349:15165'])
+ seqs = set([r.name for r in screed.open(outfile, parse_description=True)])
+ print(list(sorted(seqs)))
+
+ print(seqs)
+ if sys.version_info.major == 2:
+ answer = {'895:1:1:1303:14389', '895:1:1:1347:3237',
+ '895:1:1:1295:6189', '895:1:1:1308:20421',
+ '895:1:1:1320:11648', '895:1:1:1352:5369',
+ '895:1:1:1318:10532', '895:1:1:1363:11839',
+ '895:1:1:1355:13535', '895:1:1:1349:15165'}
+ else:
+ answer = {'895:1:1:1290:11501', '895:1:1:1303:14389',
+ '895:1:1:1307:4308', '895:1:1:1308:2539',
+ '895:1:1:1331:1766', '895:1:1:1333:2512',
+ '895:1:1:1347:3237', '895:1:1:1363:11839',
+ '895:1:1:1378:18986', '895:1:1:1383:3089'}
+
+ assert seqs == answer
outfile = infile + '.subset.1'
assert os.path.exists(outfile), outfile
- seqs = set([r.name for r in screed.open(outfile)])
- print list(sorted(seqs))
-
- assert seqs == set(['895:1:1:1303:14389', '895:1:1:1373:4848',
- '895:1:1:1357:19736', '895:1:1:1347:3237',
- '895:1:1:1338:7557', '895:1:1:1388:11093',
- '895:1:1:1296:1784', '895:1:1:1290:11501',
- '895:1:1:1355:13535', '895:1:1:1303:6251'])
-
- outfile = infile + '.subset.2'
- assert os.path.exists(outfile), outfile
-
- seqs = set([r.name for r in screed.open(outfile)])
- print list(sorted(seqs))
-
- assert seqs == set(['895:1:1:1298:13380', '895:1:1:1348:18672',
- '895:1:1:1309:4153', '895:1:1:1252:19493',
- '895:1:1:1368:4434', '895:1:1:1348:1257',
- '895:1:1:1383:3089', '895:1:1:1355:13535',
- '895:1:1:1303:6251', '895:1:1:1349:15165'])
+ seqs = set([r.name for r in screed.open(outfile, parse_description=True)])
+ print(list(sorted(seqs)))
+
+ if sys.version_info.major == 2:
+ answer = set(['895:1:1:1303:14389', '895:1:1:1373:4848',
+ '895:1:1:1357:19736', '895:1:1:1347:3237',
+ '895:1:1:1338:7557', '895:1:1:1388:11093',
+ '895:1:1:1296:1784', '895:1:1:1290:11501',
+ '895:1:1:1355:13535', '895:1:1:1303:6251'])
+ else:
+ answer = {'895:1:1:1255:18861', '895:1:1:1276:16426',
+ '895:1:1:1303:6251', '895:1:1:1308:20421',
+ '895:1:1:1314:10430', '895:1:1:1351:14718',
+ '895:1:1:1355:13535', '895:1:1:1358:4953',
+ '895:1:1:1362:3983', '895:1:1:1363:9988'}
+ assert seqs == answer
+
+ seqs = set([r.name for r in screed.open(outfile, parse_description=True)])
+ print(list(sorted(seqs)))
+
+ if sys.version_info.major == 2:
+ answer = {'895:1:1:1303:14389', '895:1:1:1373:4848',
+ '895:1:1:1357:19736', '895:1:1:1347:3237',
+ '895:1:1:1338:7557', '895:1:1:1388:11093',
+ '895:1:1:1296:1784', '895:1:1:1290:11501',
+ '895:1:1:1355:13535', '895:1:1:1303:6251'}
+
+ else:
+ answer = {'895:1:1:1362:3983', '895:1:1:1363:9988',
+ '895:1:1:1314:10430', '895:1:1:1255:18861',
+ '895:1:1:1308:20421', '895:1:1:1358:4953',
+ '895:1:1:1351:14718', '895:1:1:1303:6251',
+ '895:1:1:1276:16426', '895:1:1:1355:13535'}
+
+ assert seqs == answer
def test_count_overlap_invalid_datafile():
@@ -2440,11 +2581,14 @@ def test_count_overlap_invalid_datafile():
shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
htfile = _make_graph(seqfile1, ksize=20)
outfile = utils.get_temp_filename('overlap.out', in_dir)
- script = scriptpath('count-overlap.py')
- args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize', '10000000',
+ script = 'count-overlap.py'
+ args = ['--ksize', '20', '--n_tables', '2', '--max-tablesize', '10000000',
htfile + '.pt', htfile + '.pt', outfile]
(status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
- assert "IOError" in err
+ if sys.version_info.major == 2:
+ assert "IOError" in err
+ else:
+ assert "OSError" in err
def test_count_overlap():
@@ -2456,22 +2600,22 @@ def test_count_overlap():
shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
htfile = _make_graph(seqfile1, ksize=20)
- script = scriptpath('count-overlap.py')
- args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize', '10000000',
+ script = 'count-overlap.py'
+ args = ['--ksize', '20', '--n_tables', '2', '--max-tablesize', '10000000',
htfile + '.pt', seqfile2, outfile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert status == 0
assert os.path.exists(outfile), outfile
data = [x.strip() for x in open(outfile)]
data = set(data)
- assert '# of unique k-mers in dataset2: 759047' in data
- assert '# of overlap unique k-mers: 245621' in data
+ assert '# of unique k-mers in dataset2: 759020' in data, data
+ assert '# of overlap unique k-mers: 245547' in data
assert os.path.exists(curvefile), curvefile
data = [x.strip() for x in open(curvefile)]
data = set(data)
- assert '178633 1155' in data
- assert '496285 2970' in data
- assert '752053 238627' in data
+ assert '178630 1134' in data, data
+ assert '496280 2904' in data
+ assert '752031 238558' in data
def test_count_overlap_csv():
@@ -2483,22 +2627,22 @@ def test_count_overlap_csv():
shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
htfile = _make_graph(seqfile1, ksize=20)
- script = scriptpath('count-overlap.py')
- args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize',
+ script = 'count-overlap.py'
+ args = ['--ksize', '20', '--n_tables', '2', '--max-tablesize',
'10000000', '--csv', htfile + '.pt', seqfile2, outfile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert status == 0
assert os.path.exists(outfile), outfile
data = [x.strip() for x in open(outfile)]
data = set(data)
- assert '# of unique k-mers in dataset2: 759047' in data
- assert '# of overlap unique k-mers: 245621' in data
+ assert '# of unique k-mers in dataset2: 759020' in data
+ assert '# of overlap unique k-mers: 245547' in data
assert os.path.exists(curvefile), curvefile
data = [x.strip() for x in open(curvefile)]
data = set(data)
- assert '178633,1155' in data
- assert '496285,2970' in data
- assert '752053,238627' in data
+ assert '178630,1134' in data, data
+ assert '496280,2904' in data
+ assert '752031,238558' in data
def execute_streaming_diginorm(ifilename):
@@ -2510,7 +2654,7 @@ def execute_streaming_diginorm(ifilename):
# Get temp filenames, etc.
fifo = utils.get_temp_filename('fifo')
in_dir = os.path.dirname(fifo)
- script = scriptpath('normalize-by-median.py')
+ script = 'normalize-by-median.py'
args = ['-C', '1', '-k', '17', '-o', 'outfile', fifo]
# make a fifo to simulate streaming
@@ -2543,8 +2687,8 @@ def execute_load_graph_streaming(filename):
This is not directly executed but is run by the tests themselves
'''
- script = scriptpath('load-graph.py')
- args = '-x 1e7 -N 2 -k 20 -t out -'
+ script = 'load-graph.py'
+ args = '-x 1e7 -N 2 -k 20 out -'
infile = utils.get_temp_filename('temp')
in_dir = os.path.dirname(infile)
@@ -2553,9 +2697,9 @@ def execute_load_graph_streaming(filename):
if status != 0:
for line in out:
- print out
+ print(out)
for line in err:
- print err
+ print(err)
assert status == 0, status
err.seek(0)
err = err.read()
@@ -2578,7 +2722,6 @@ def execute_load_graph_streaming(filename):
assert x == (1, 0), x
-@attr('known_failing')
def test_screed_streaming_ufa():
# uncompressed fa
o = execute_streaming_diginorm(utils.get_test_data('test-abund-read-2.fa'))
@@ -2589,7 +2732,6 @@ def test_screed_streaming_ufa():
assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG')
-@attr('known_failing')
def test_screed_streaming_ufq():
# uncompressed fq
o = execute_streaming_diginorm(utils.get_test_data('test-fastq-reads.fq'))
@@ -2598,7 +2740,6 @@ def test_screed_streaming_ufq():
assert seqs[0].startswith('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
-@attr('known_failing')
def test_screed_streaming_bzipfq():
# bzip compressed fq
o = execute_streaming_diginorm(utils.get_test_data('100-reads.fq.bz2'))
@@ -2607,7 +2748,6 @@ def test_screed_streaming_bzipfq():
assert seqs[0].startswith('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'), seqs
-@attr('known_failing')
def test_screed_streaming_bzipfa():
# bzip compressed fa
o = execute_streaming_diginorm(
@@ -2930,15 +3070,15 @@ def test_trim_low_abund_trimtest():
for record in screed.open(outfile):
if record.name == 'seqtrim/1':
- print record.name, record.sequence
+ print(record.name, record.sequence)
assert record.sequence == \
'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCC'
elif record.name == 'seqtrim/2':
- print record.name, record.sequence
+ print(record.name, record.sequence)
assert record.sequence == \
'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGC'
elif record.name == 'seqtrim2/1':
- print record.name, record.sequence
+ print(record.name, record.sequence)
assert record.sequence == \
'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCA'
@@ -2962,15 +3102,15 @@ def test_trim_low_abund_trimtest_after_load():
for record in screed.open(outfile):
if record.name == 'seqtrim/1':
- print record.name, record.sequence
+ print(record.name, record.sequence)
assert record.sequence == \
'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCC'
elif record.name == 'seqtrim/2':
- print record.name, record.sequence
+ print(record.name, record.sequence)
assert record.sequence == \
'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGC'
elif record.name == 'seqtrim2/1':
- print record.name, record.sequence
+ print(record.name, record.sequence)
assert record.sequence == \
'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCA'
@@ -2993,15 +3133,15 @@ def test_trim_low_abund_trimtest_savetable():
for record in screed.open(outfile):
if record.name == 'seqtrim/1':
- print record.name, record.sequence
+ print(record.name, record.sequence)
assert record.sequence == \
'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCC'
elif record.name == 'seqtrim/2':
- print record.name, record.sequence
+ print(record.name, record.sequence)
assert record.sequence == \
'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGC'
elif record.name == 'seqtrim2/1':
- print record.name, record.sequence
+ print(record.name, record.sequence)
assert record.sequence == \
'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCA'
diff --git a/tests/test_subset_graph.py b/tests/test_subset_graph.py
index 97f9ba6..b5a4209 100644
--- a/tests/test_subset_graph.py
+++ b/tests/test_subset_graph.py
@@ -1,6 +1,8 @@
+from __future__ import print_function
+from __future__ import absolute_import
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -8,8 +10,8 @@
import khmer
import screed
-import khmer_tst_utils as utils
import os
+from . import khmer_tst_utils as utils
def teardown():
@@ -19,7 +21,7 @@ def teardown():
class Test_RandomData(object):
def test_3_merge_013(self):
- ht = khmer.new_hashbits(20, 4 ** 4 + 1)
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
@@ -41,7 +43,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions # combined.
def test_3_merge_023(self):
- ht = khmer.new_hashbits(20, 4 ** 4 + 1)
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -62,7 +64,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions # combined.
def test_5_merge_046(self):
- ht = khmer.new_hashbits(20, 4 ** 4 + 1)
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph5.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -81,7 +83,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions # combined.
def test_random_20_a_succ(self):
- ht = khmer.new_hashbits(20, 4 ** 7 + 1)
+ ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
outfile = utils.get_temp_filename('out')
@@ -100,7 +102,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions
def test_random_20_a_succ_II(self):
- ht = khmer.new_hashbits(20, 4 ** 7 + 1)
+ ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
outfile = utils.get_temp_filename('out')
@@ -119,7 +121,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions
def test_random_20_a_succ_III(self):
- ht = khmer.new_hashbits(20, 4 ** 7 + 1)
+ ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
outfile = utils.get_temp_filename('out')
@@ -142,7 +144,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions
def test_random_20_a_succ_IV(self):
- ht = khmer.new_hashbits(20, 4 ** 7 + 1)
+ ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
outfile = utils.get_temp_filename('out')
@@ -162,7 +164,7 @@ class Test_RandomData(object):
assert n_partitions == 1, n_partitions
def test_random_20_a_succ_IV_save(self):
- ht = khmer.new_hashbits(20, 4 ** 7 + 1)
+ ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
savefile_ht = utils.get_temp_filename('ht')
@@ -175,7 +177,7 @@ class Test_RandomData(object):
ht.save_tagset(savefile_tags)
del ht
- ht = khmer.new_hashbits(20, 4 ** 7 + 1)
+ ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
ht.load(savefile_ht)
ht.load_tagset(savefile_tags)
@@ -198,15 +200,15 @@ class Test_RandomData(object):
class Test_SaveLoadPmap(object):
def test_save_load_merge(self):
- ht = khmer.new_hashbits(20, 4 ** 4 + 1)
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
assert total_reads == 3, total_reads
divvy = ht.divide_tags_into_subsets(1)
- print divvy
- assert len(divvy) is 3
+ print(divvy)
+ assert len(divvy) == 3
(a, b, c) = divvy
outfile1 = utils.get_temp_filename('x.pmap')
@@ -230,8 +232,45 @@ class Test_SaveLoadPmap(object):
n_partitions = ht.output_partitions(filename, outfile)
assert n_partitions == 1, n_partitions # combined.
+ def test_save_load_merge_truncate(self):
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ filename = utils.get_test_data('test-graph2.fa')
+
+ (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
+ assert total_reads == 3, total_reads
+
+ divvy = ht.divide_tags_into_subsets(1)
+ print(divvy)
+ assert len(divvy) is 3
+ (a, b, c) = divvy
+
+ outfile1 = utils.get_temp_filename('x.pmap')
+ outfile2 = utils.get_temp_filename('y.pmap')
+
+ x = ht.do_subset_partition(a, b)
+ ht.save_subset_partitionmap(x, outfile1)
+ del x
+
+ y = ht.do_subset_partition(b, 0)
+ ht.save_subset_partitionmap(y, outfile2)
+ del y
+
+ outfile3 = utils.get_temp_filename('z.pmap')
+ data = open(outfile1, 'rb').read()
+
+ for i in range(len(data)):
+ fp = open(outfile3, 'wb')
+ fp.write(data[:i])
+ fp.close()
+
+ try:
+ a = ht.load_subset_partitionmap(outfile3)
+ assert 0, "this should not pass"
+ except IOError as err:
+ print(str(err), i)
+
def test_save_load_merge_2(self):
- ht = khmer.new_hashbits(20, 4 ** 8 + 1)
+ ht = khmer.Hashbits(20, 4 ** 8 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -263,23 +302,22 @@ class Test_SaveLoadPmap(object):
assert n_partitions == 1, n_partitions # combined.
def test_save_load_merge_nexist(self):
- ht = khmer.new_hashbits(20, 1)
+ ht = khmer._Hashbits(20, [1])
try:
a = ht.load_subset_partitionmap('this does not exist')
assert 0, "this should not succeed"
except IOError as e:
- print str(e)
+ print(str(e))
def test_save_merge_from_disk(self):
- ht = khmer.new_hashbits(20, 4 ** 4 + 1)
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
assert total_reads == 3, total_reads
divvy = ht.divide_tags_into_subsets(1)
- print divvy
- assert len(divvy) is 3
+ print(divvy)
(a, b, c) = divvy
outfile1 = utils.get_temp_filename('x.pmap')
@@ -301,7 +339,7 @@ class Test_SaveLoadPmap(object):
assert n_partitions == 1, n_partitions # combined.
def test_save_merge_from_disk_2(self):
- ht = khmer.new_hashbits(20, 4 ** 7 + 1)
+ ht = khmer.Hashbits(20, 4 ** 7 + 1, 2)
filename = utils.get_test_data('random-20-a.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
@@ -330,15 +368,14 @@ class Test_SaveLoadPmap(object):
assert n_partitions == 1, n_partitions # combined.
def test_save_merge_from_disk_file_not_exist(self):
- ht = khmer.new_hashbits(20, 4 ** 4 + 1)
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
assert total_reads == 3, total_reads
divvy = ht.divide_tags_into_subsets(1)
- print divvy
- assert len(divvy) is 3
+ print(divvy)
(a, b, c) = divvy
outfile1 = utils.get_temp_filename('x.pmap')
@@ -349,38 +386,37 @@ class Test_SaveLoadPmap(object):
ht.merge_subset_from_disk(outfile1)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_merge_from_disk_file_bad_type(self):
- ht = khmer.new_hashbits(20, 4 ** 4 + 1)
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
infile = utils.get_test_data('goodversion-k12.ht')
try:
ht.merge_subset_from_disk(infile)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_merge_from_disk_file_version(self):
- ht = khmer.new_hashbits(20, 4 ** 4 + 1)
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
infile = utils.get_test_data('badversion-k12.ht')
try:
ht.merge_subset_from_disk(infile)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
def test_save_merge_from_disk_ksize(self):
- ht = khmer.new_hashbits(20, 4 ** 4 + 1)
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
filename = utils.get_test_data('test-graph2.fa')
(total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
assert total_reads == 3, total_reads
divvy = ht.divide_tags_into_subsets(1)
- print divvy
- assert len(divvy) is 3
+ print(divvy)
(a, b, c) = divvy
outfile1 = utils.get_temp_filename('x.pmap')
@@ -388,18 +424,89 @@ class Test_SaveLoadPmap(object):
ht.save_subset_partitionmap(x, outfile1)
del x
- ht = khmer.new_hashbits(19, 1, 1)
+ ht = khmer._Hashbits(19, [1])
try:
ht.merge_subset_from_disk(outfile1)
assert 0, "this should fail"
except IOError as e:
- print str(e)
+ print(str(e))
+
+
+def test_save_load_merge_on_graph():
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ filename = utils.get_test_data('test-graph2.fa')
+
+ (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
+ assert total_reads == 3, total_reads
+
+ divvy = ht.divide_tags_into_subsets(1)
+ print(divvy)
+ assert len(divvy) is 3
+ (a, b, c) = divvy
+
+ outfile1 = utils.get_temp_filename('x.pmap')
+ outfile2 = utils.get_temp_filename('y.pmap')
+
+ x = ht.do_subset_partition(a, b)
+ ht.save_subset_partitionmap(x, outfile1)
+ del x
+
+ y = ht.do_subset_partition(b, 0)
+ ht.save_subset_partitionmap(y, outfile2)
+ del y
+
+ a = ht.load_partitionmap(outfile1) # <-- this is different
+ b = ht.load_subset_partitionmap(outfile2)
+
+ ht.merge_subset(b)
+
+ outfile = utils.get_temp_filename('out.part')
+ n_partitions = ht.output_partitions(filename, outfile)
+ assert n_partitions == 1, n_partitions # combined.
+
+
+def test_save_load_on_graph_truncate():
+ ht = khmer.Hashbits(20, 4 ** 4 + 1, 2)
+ filename = utils.get_test_data('test-graph2.fa')
+
+ (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
+ assert total_reads == 3, total_reads
+
+ divvy = ht.divide_tags_into_subsets(1)
+ print(divvy)
+ assert len(divvy) is 3
+ (a, b, c) = divvy
+
+ outfile1 = utils.get_temp_filename('x.pmap')
+ outfile2 = utils.get_temp_filename('y.pmap')
+
+ x = ht.do_subset_partition(a, b)
+ ht.save_subset_partitionmap(x, outfile1)
+ del x
+
+ y = ht.do_subset_partition(b, 0)
+ ht.save_subset_partitionmap(y, outfile2)
+ del y
+
+ outfile3 = utils.get_temp_filename('z.pmap')
+ data = open(outfile1, 'rb').read()
+
+ for i in range(len(data)):
+ fp = open(outfile3, 'wb')
+ fp.write(data[:i])
+ fp.close()
+
+ try:
+ a = ht.load_partitionmap(outfile3)
+ assert 0, "this should not pass"
+ except IOError as err:
+ print(str(err), i)
def test_output_partitions():
filename = utils.get_test_data('test-output-partitions.fa')
- ht = khmer.new_hashbits(10, 1, 1)
+ ht = khmer._Hashbits(10, [1])
ht.set_partition_id('TTAGGACTGC', 2)
ht.set_partition_id('TGCGTTTCAA', 3)
ht.set_partition_id('ATACTGTAAA', 4)
@@ -424,7 +531,7 @@ test_output_partitions.runme = True
def test_tiny_real_partitions():
filename = utils.get_test_data('real-partition-tiny.fa')
- ht = khmer.new_hashbits(32, 8e1, 4)
+ ht = khmer.Hashbits(32, 8e2, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -451,7 +558,7 @@ def test_tiny_real_partitions():
def test_small_real_partitions():
filename = utils.get_test_data('real-partition-small.fa')
- ht = khmer.new_hashbits(32, 2e2, 4)
+ ht = khmer.Hashbits(32, 2e3, 4)
ht.consume_fasta_and_tag(filename)
subset = ht.do_subset_partition(0, 0)
@@ -491,14 +598,14 @@ CCTCGGGCCTTTCCGTTCCGTTGCCGCCCAAGCTCTCTAGCATCGAATCGGTCAAGCGGT\
def test_partition_on_abundance_1():
- print(a,)
- print(b,)
- kh = khmer.new_counting_hash(20, 1e3, 4)
+ print((a,))
+ print((b,))
+ kh = khmer.CountingHash(20, 1e3, 4)
for i in range(10):
- print kh.consume_and_tag(a)
+ print(kh.consume_and_tag(a))
for i in range(10):
- print kh.consume_and_tag(b)
+ print(kh.consume_and_tag(b))
# all paths in 'a' and 'b'
p = kh.do_subset_partition_with_abundance(10, 50)
@@ -507,12 +614,12 @@ def test_partition_on_abundance_1():
def test_partition_on_abundance_2():
- kh = khmer.new_counting_hash(20, 1e3, 4)
+ kh = khmer.CountingHash(20, 1e3, 4)
for i in range(10):
- print kh.consume_and_tag(a)
+ print(kh.consume_and_tag(a))
for i in range(5):
- print kh.consume_and_tag(b)
+ print(kh.consume_and_tag(b))
# all paths in 'a'
p = kh.do_subset_partition_with_abundance(10, 50)
@@ -521,12 +628,12 @@ def test_partition_on_abundance_2():
def test_partition_on_abundance_3():
- kh = khmer.new_counting_hash(20, 1e4, 4)
+ kh = khmer.CountingHash(20, 1e4, 4)
for i in range(10):
- print kh.consume_and_tag(a)
+ print(kh.consume_and_tag(a))
for i in range(5):
- print kh.consume_and_tag(b)
+ print(kh.consume_and_tag(b))
# this will get paths only in 'a'
p = kh.do_subset_partition_with_abundance(10, 50)
@@ -535,12 +642,12 @@ def test_partition_on_abundance_3():
p = kh.do_subset_partition_with_abundance(5, 10)
x = p.count_partitions()
- print x
+ print(x)
assert x == (2, 2) # two partitions, two ignored tags
def test_partition_overlap_1():
- kh = khmer.new_counting_hash(20, 1e3, 4)
+ kh = khmer.CountingHash(20, 1e3, 4)
for i in range(10):
kh.consume_and_tag(a)
@@ -561,7 +668,7 @@ def test_partition_overlap_1():
def test_partition_overlap_2():
- kh = khmer.new_counting_hash(20, 1e4, 4)
+ kh = khmer.CountingHash(20, 1e4, 4)
for i in range(10):
kh.consume_and_tag(a)
diff --git a/tests/test_threaded_sequence_processor.py b/tests/test_threaded_sequence_processor.py
index 5aac0f4..854a3fc 100644
--- a/tests/test_threaded_sequence_processor.py
+++ b/tests/test_threaded_sequence_processor.py
@@ -1,11 +1,16 @@
import sys
from khmer.thread_utils import ThreadedSequenceProcessor, SequenceGroup
-from cStringIO import StringIO
+from io import StringIO
from screed.fasta import fasta_iter
from screed.fastq import fastq_iter
-import Queue
from nose.plugins.attrib import attr
+# stdlib queue module was renamed on Python 3
+try:
+ import queue
+except ImportError:
+ import Queue as queue
+
def load_records(stringio_fp):
records = list(fasta_iter(StringIO(stringio_fp.getvalue())))
@@ -111,7 +116,7 @@ def test_paired_2thread():
while not self.done or not inq.empty():
try:
g = inq.get(True, 1)
- except Queue.Empty:
+ except queue.Empty:
continue
assert len(g.seqlist) == 2
@@ -160,7 +165,7 @@ def test_paired_2thread_more_seq():
while not self.done or not inq.empty():
try:
g = inq.get(True, 1)
- except Queue.Empty:
+ except queue.Empty:
continue
if len(g.seqlist) == 2:
diff --git a/tests/test_version.py b/tests/test_version.py
index 775edbe..f8a8901 100644
--- a/tests/test_version.py
+++ b/tests/test_version.py
@@ -1,6 +1,7 @@
+from __future__ import print_function, unicode_literals
#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2014. It is licensed under
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2014-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
@@ -12,8 +13,8 @@ from nose.plugins.attrib import attr
def test_python_and_c_match():
# checks c++ compiler option version against versioneer version
# (respectively)
- print 'c++ version {0}:'.format(khmer.__version_cpp__())
- print 'versioneer (python) version: {0}'.format(khmer.__version__)
+ print('c++ version {0}:'.format(khmer.__version_cpp__()))
+ print('versioneer (python) version: {0}'.format(khmer.__version__))
assert khmer.__version_cpp__() == khmer.__version__
@@ -22,9 +23,9 @@ def test_python_and_c_match_base():
# it's a hash based on git commits which can get out-of-sync too easily
cppver = '-'.join(khmer.__version_cpp__().split('-')[0:2])
pyver = '-'.join(khmer.__version__.split('-')[0:2])
- print 'c++ version {0}'.format(cppver)
- print 'python version: {0}'.format(pyver)
- print 'if you are seeing this, the version compiled into your cpp'
- print 'objects and your versioneer stuff is out-of-sync.'
- print 'try doing: make clean; make'
+ print('c++ version {0}'.format(cppver))
+ print('python version: {0}'.format(pyver))
+ print('if you are seeing this, the version compiled into your cpp')
+ print('objects and your versioneer stuff is out-of-sync.')
+ print('try doing: make clean; make')
assert cppver == pyver
diff --git a/versioneer.py b/versioneer.py
index c00770f..f624309 100644
--- a/versioneer.py
+++ b/versioneer.py
@@ -280,6 +280,7 @@ public domain. The `_version.py` that it creates is also in the public
domain.
"""
+from __future__ import print_function
import errno
import os
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/khmer.git
More information about the debian-med-commit
mailing list