[med-svn] [khmer] 01/01: Imported Upstream version 2.0~rc2+dfsg
Michael Crusoe
misterc-guest at moszumanska.debian.org
Fri Jul 31 16:33:14 UTC 2015
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to branch upstream
in repository khmer.
commit abe825085829670f6e5756c5c063f4d9afcb01d6
Author: Michael R. Crusoe <michael.crusoe at gmail.com>
Date: Fri Jul 31 07:08:12 2015 -0700
Imported Upstream version 2.0~rc2+dfsg
---
.mailmap | 47 ++
.ycm_extra_conf.py | 105 ----
CITATION | 20 +
ChangeLog | 244 ++++++++
MANIFEST.in | 1 +
Makefile | 63 +-
doc/dev/getting-started.rst | 25 +
doc/release-notes/release-1.4.rst | 2 +-
doc/user/install.rst | 5 +-
doc/user/scripts.rst | 3 +
doc/whats-new-2.0.rst | 30 +
jenkins-build.sh | 21 +-
khmer/__init__.py | 113 +++-
khmer/_khmer.cc | 671 ++++++++++++++-------
khmer/_version.py | 4 +-
khmer/kfile.py | 35 +-
khmer/khmer_args.py | 19 +-
khmer/thread_utils.py | 2 +-
khmer/utils.py | 12 +-
lib/Makefile | 220 +++----
lib/counting.cc | 85 ++-
lib/counting.hh | 23 +-
lib/get_version.py | 1 +
lib/hashbits.cc | 30 +-
lib/hashbits.hh | 6 +
lib/hashtable.cc | 132 ++--
lib/hashtable.hh | 55 +-
lib/hllcounter.cc | 35 +-
lib/hllcounter.hh | 11 +-
lib/ht-diff.cc | 149 -----
lib/khmer_exception.hh | 32 +-
lib/kmer_hash.cc | 10 +-
lib/labelhash.cc | 33 +-
lib/labelhash.hh | 16 +-
lib/{khmer.pc.in => oxli.pc.in} | 4 +-
lib/perf_metrics.cc | 35 --
lib/perf_metrics.hh | 75 ---
lib/read_aligner.cc | 235 ++++++--
lib/read_aligner.hh | 37 +-
lib/read_parsers.cc | 25 +-
lib/read_parsers.hh | 43 +-
lib/subset.cc | 40 +-
lib/subset.hh | 10 +-
lib/test-HashTables.cc | 150 -----
lib/test-Parser.cc | 145 -----
lib/test-compile.cc | 5 +-
lib/trace_logger.cc | 70 ---
lib/trace_logger.hh | 76 ---
oxli/__init__.py | 6 +-
oxli/build_graph.py | 12 +-
oxli/functions.py | 129 +++-
sandbox/Makefile.read_aligner_training | 26 +
sandbox/README.rst | 4 +-
sandbox/build-sparse-graph.py | 2 +-
sandbox/calc-best-assembly.py | 17 +-
sandbox/collect-reads.py | 9 +-
sandbox/collect-variants.py | 6 +-
sandbox/correct-errors.py | 219 -------
.../trim-low-abund.py => sandbox/correct-reads.py | 246 ++++----
sandbox/count-kmers-single.py | 103 ++++
sandbox/count-kmers.py | 80 +++
sandbox/error-correct-pass2.py | 94 +++
sandbox/estimate_optimal_hash.py | 11 +-
sandbox/extract-single-partition.py | 2 +-
sandbox/optimal_args_hashbits.py | 2 +-
sandbox/readaligner_pairhmm_train.py | 205 +++++++
sandbox/saturate-by-median.py | 4 +-
sandbox/sweep-files.py | 8 +-
sandbox/sweep-reads.py | 8 +-
scripts/abundance-dist-single.py | 34 +-
scripts/abundance-dist.py | 32 +-
scripts/count-median.py | 36 +-
scripts/count-overlap.py | 20 +-
scripts/extract-long-sequences.py | 9 +-
scripts/extract-paired-reads.py | 4 +-
scripts/extract-partitions.py | 3 +-
scripts/fastq-to-fasta.py | 7 +-
scripts/filter-abund-single.py | 13 +-
scripts/filter-abund.py | 18 +-
scripts/interleave-reads.py | 31 +-
scripts/load-graph.py | 12 +-
scripts/load-into-counting.py | 21 +-
scripts/make-initial-stoptags.py | 4 +-
scripts/normalize-by-median.py | 214 ++++---
scripts/oxli | 16 +
scripts/partition-graph.py | 4 +-
scripts/readstats.py | 18 +-
scripts/sample-reads-randomly.py | 12 +-
scripts/split-paired-reads.py | 12 +-
scripts/trim-low-abund.py | 30 +-
{sandbox => scripts}/unique-kmers.py | 60 +-
setup.py | 2 +-
tests/khmer_tst_utils.py | 108 ++--
tests/test-data/empty-file.bz2 | Bin 0 -> 14 bytes
tests/test-data/empty-file.gz | Bin 0 -> 32 bytes
tests/test-data/paired-broken4.fq.1 | 4 +
tests/test-data/paired-broken4.fq.2 | 4 +
tests/test-data/paired.fq.2 | 1 -
tests/test-data/readaligner-default.json | 50 ++
tests/test-data/readaligner-k12.json | 50 ++
tests/test-data/test-fastq-reads.fa | 200 ++++++
tests/test_counting_hash.py | 76 ++-
tests/test_counting_single.py | 4 +-
tests/test_filter.py | 2 +-
tests/test_functions.py | 9 +-
tests/test_hashbits.py | 568 ++++++++---------
tests/test_hll.py | 2 +-
tests/test_labelhash.py | 12 +-
tests/test_lump.py | 4 +-
tests/test_normalize_by_median.py | 212 +++++--
tests/test_oxli_functions.py | 40 +-
tests/test_read_aligner.py | 502 +++++++++++++--
tests/test_read_parsers.py | 42 +-
tests/test_sandbox_scripts.py | 40 +-
tests/test_script_arguments.py | 59 +-
tests/test_scripts.py | 398 +++++++-----
tests/test_streaming_io.py | 451 ++++++++++++++
tests/test_subset_graph.py | 16 +-
118 files changed, 5034 insertions(+), 2835 deletions(-)
diff --git a/.mailmap b/.mailmap
new file mode 100644
index 0000000..4e1b7d4
--- /dev/null
+++ b/.mailmap
@@ -0,0 +1,47 @@
+Michael R. Crusoe <mcrusoe at msu.edu>
+Michael R. Crusoe <mcrusoe at msu.edu> <michael.crusoe at gmail.com>
+Michael R. Crusoe <mcrusoe at msu.edu> <mcruseo at msu.edu>
+Michael R. Crusoe <mcrusoe at msu.edu> <mcrusoe at athyra.(none)>
+Camille Scott <camille.scott.w at gmail.com> <cs.welcher at gmail.com>
+Tamer Mansour <drtamermansour at gmail.com>
+Rhys Kidd <rhyskidd at gmail.com>
+Susan Steinman <susan.steinman at gmail.com> <steinman.tutoring at gmail.com>
+Adina Howe <adina at iastate.edu> <howead at msu.edu>
+Elmar Bucher <buchere at ohsu.edu> <elmbeech at zoho.com>
+Luiz Irber <luiz.irber at gmail.com> <luizirber at users.noreply.github.com>
+Luiz Irber <luiz.irber at gmail.com> <irberlui at msu.edu>
+Qingpeng Zhang <qingpeng at gmail.com> <qingpeng at msu.edu>
+Reed Cartwright <cartwright at asu.edu>
+Reed Cartwright <cartwright at asu.edu> <reed at cartwrig.ht>
+Jacob Fenton <bocajnotnef at gmail.com>
+Michael Wright <wrigh517 at gmail.com> <narrows at 13-67-33.client.wireless.msu.edu>
+Eric McDonald <em at msu.edu> <emcd.msu at gmail.com>
+Jared Simpson <js18 at sanger.ac.uk> <jared.simpson at gmail.com>
+Benjamin Taylor <taylo886 at msu.edu> <taylo886 at cse.msu.edu>
+Kaben Nanlohy <kaben.nanlohy at gmail.com> <kaben at idyll.org>
+Ramakrishnan Srinivasan <ramrs at nyu.edu>
+Rodney Pickett <pickett.rodney at gmail.com>
+Sarah Guermond <sarah.guermond at gmail.com> <sguermond at users.noreply.github.com>
+Sarah Guermond <sarah.guermond at gmail.com> <s.guermond at gmail.com>
+Hussien F. Alameldin <hussien at msu.edu> <hussienfotoh at gmail.com>
+Brian Wyss <wyssbria at msu.edu>
+Heather L. Wiencko <wienckhl at tcd.ie> <timiat at yahoo.com>
+Jiarong Guo <guojiaro at gmail.com>
+Josiah Seaman <josiah at dnaskittle.com> <josiah.seaman at gmail.com>
+Leonor Garcia-Gutierrez <l.garcia-gutierrez at warwick.ac.uk>
+Ryan R. Boyce <boycerya at msu.edu>
+en zyme <en_zyme at outlook.com> <enzyme at bu.edu>
+Scott Sievert <sieve121 at umn.edu>
+Joshua R. Nahum <joshnahum at gmail.com>
+Jonathan Gluck <jdg at cs.umd.edu> <jonathangluck08854 at gmail.com>
+Joshua R. Herr <joshua.r.herr at gmail.com>
+Bede Constantinides <bede.constantinides at manchester.ac.uk> <bedeabc at gmail.com>
+Kevin D. Murray <kevin.murray at anu.edu.au> <spam at kdmurray.id.au>
+James A. Stapleton <jas at msu.edu>
+Scott Fay <scott.a.fay at gmail.com> <scott.fay at invitae.com>
+Iván González <igonzalez at mailaps.org> <iglpdc at gmail.com>
+Sherine Awad <drmahmoud at ucdavis.edu> <sherine.awad at gmail.com>
+Alexander Johan Nederbragt <lex.nederbragt at ibv.uio.no>
+Charles Pepe-Ranney <chuck.peperanney at gmail.com>
+Jeramia Ory <Jeramia.Ory at stlcop.edu> <jeramia.ory at gmail.com>
+<jared.simpson at oicr.on.ca> <js18 at sanger.ac.uk>
diff --git a/.ycm_extra_conf.py b/.ycm_extra_conf.py
deleted file mode 100644
index fcc7939..0000000
--- a/.ycm_extra_conf.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# This file is NOT licensed under the GPLv3, which is the license for the rest
-# of YouCompleteMe.
-#
-# Here's the license text for this file:
-#
-# This is free and unencumbered software released into the public domain.
-#
-# Anyone is free to copy, modify, publish, use, compile, sell, or
-# distribute this software, either in source code form or as a compiled
-# binary, for any purpose, commercial or non-commercial, and by any
-# means.
-#
-# In jurisdictions that recognize copyright laws, the author or authors
-# of this software dedicate any and all copyright interest in the
-# software to the public domain. We make this dedication for the benefit
-# of the public at large and to the detriment of our heirs and
-# successors. We intend this dedication to be an overt act of
-# relinquishment in perpetuity of all present and future rights to this
-# software under copyright law.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-# OTHER DEALINGS IN THE SOFTWARE.
-#
-# For more information, please refer to <http://unlicense.org/>
-
-import os
-import ycm_core
-
-SOURCE_EXTENSIONS = ['.cpp', '.cxx', '.cc', '.c', '.m', '.mm']
-
-database = ycm_core.CompilationDatabase(
- os.path.dirname(os.path.abspath(__file__)))
-
-def MakeRelativePathsInFlagsAbsolute(flags, working_directory):
- if not working_directory:
- return list(flags)
- new_flags = []
- make_next_absolute = False
- path_flags = ['-isystem', '-I', '-iquote', '--sysroot=']
- for flag in flags:
- new_flag = flag
-
- if make_next_absolute:
- make_next_absolute = False
- if not flag.startswith('/'):
- new_flag = os.path.join(working_directory, flag)
-
- for path_flag in path_flags:
- if flag == path_flag:
- make_next_absolute = True
- break
-
- if flag.startswith(path_flag):
- path = flag[len(path_flag):]
- new_flag = path_flag + os.path.join(working_directory, path)
- break
-
- if new_flag:
- new_flags.append(new_flag)
- return new_flags
-
-
-def IsHeaderFile(filename):
- extension = os.path.splitext(filename)[1]
- return extension in ['.h', '.hxx', '.hpp', '.hh']
-
-
-def GetCompilationInfoForFile(filename):
- # The compilation_commands.json file generated by CMake does not have entries
- # for header files. So we do our best by asking the db for flags for a
- # corresponding source file, if any. If one exists, the flags for that file
- # should be good enough.
- if IsHeaderFile(filename):
- basename = os.path.splitext(filename)[0]
- for extension in SOURCE_EXTENSIONS:
- replacement_file = basename + extension
- if os.path.exists(replacement_file):
- compilation_info = database.GetCompilationInfoForFile(
- replacement_file)
- if compilation_info.compiler_flags_:
- return compilation_info
- return None
- return database.GetCompilationInfoForFile(filename)
-
-
-def FlagsForFile(filename, **kwargs):
- # Bear in mind that compilation_info.compiler_flags_ does NOT return a
- # python list, but a "list-like" StringVec object
- compilation_info = GetCompilationInfoForFile(filename)
- if not compilation_info:
- return None
-
- final_flags = MakeRelativePathsInFlagsAbsolute(
- compilation_info.compiler_flags_,
- compilation_info.compiler_working_dir_)
-
- return {
- 'flags': final_flags,
- 'do_cache': True
- }
diff --git a/CITATION b/CITATION
index 3b4b3c6..a1b09f3 100644
--- a/CITATION
+++ b/CITATION
@@ -107,6 +107,26 @@ the digital normalization algorithm, described in:
url = "http://arxiv.org/abs/1203.4802",
}
+Efficient k-mer error trimming
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The script trim-low-abund.py is described in:
+
+ Crossing the streams: a framework for streaming analysis of short DNA
+ sequencing reads
+ Zhang Q, Awad S, Brown CT
+ https://dx.doi.org/10.7287/peerj.preprints.890v1
+
+.. code-block:: tex
+
+ @unpublished{semistream,
+ author = "Qingpeng Zhang and Sherine Awad and C. Titus Brown",
+ title = "Crossing the streams: a framework for streaming analysis of short DNA sequencing reads",
+ year = "2015",
+ eprint = "PeerJ Preprints 3:e1100",
+ url = "https://dx.doi.org/10.7287/peerj.preprints.890v1"
+ }
+
K-mer counting
^^^^^^^^^^^^^^
diff --git a/ChangeLog b/ChangeLog
index 2aeb3db..cd69867 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,247 @@
+2015-07-31 Kevin Murray <spam at kdmurray.id.au>
+
+	* lib/Makefile,Makefile,lib/*.pc.in,lib/test-compile.cc: Misc Debian-based
+	compatibility changes
+ * lib/get_version.py: Add crunchbang, chmod +x
+
+2015-07-29 Michael R. Crusoe <crusoe at ucdavis.edu>
+
+ * khmer/_khmer.cc: add more CPyChecker inspired fixes
+ * lib/*.{cc,hh}: clean up includes and forward declarations
+
+2015-07-29 Luiz Irber <khmer at luizirber.org>
+
+ * Makefile: Adapt Makefile rules for py3 changes.
+	* jenkins-build.sh: Read PYTHON_EXECUTABLE and TESTATTR from environment.
+
+2015-07-29 Amanda Charbonneau <charbo24 at msu.edu>
+
+ * scripts/fastq-to-fasta.py: Changed '-n' default description to match
+ behaviour
+
+2015-07-29 Luiz Irber <khmer at luizirber.org>
+
+ * tests/test_{scripts,streaming_io}.py: Fix the build + add a test
+
+2015-07-28 Titus Brown <titus at idyll.org>
+
+ * tests/test_streaming_io.py: new shell cmd tests for streaming/piping.
+ * tests/khmer_tst_utils.py: refactor/replace runtestredirect(...) with
+ scriptpath(...) and run_shell_cmd(...).
+	* tests/test_scripts.py: remove test_interleave_reads_broken_fq_4, since
+	interleave-reads.py no longer accepts only one input file; replace
+	runscriptredirect call with run_shell_cmd.
+ * scripts/interleave-reads.py: force exactly two input files.
+ * scripts/split-paired-reads.py: fix print statement; clarify output.
+ * scripts/{normalize-by-median.py,sample-reads-randomly.py,
+ trim-low-abund.py}: if stdin is supplied for input, check that -o
+ specifies output file.
+ * scripts/filter-abund.py: if stdin is supplied for input, check that -o
+ specifies output file; switched -o to use argparse.FileType.
+ * scripts/extract-long-sequences.py: switched -o to use argparse.FileType.
+ * scripts/{abundance-dist,count-median}.py: added '-' handling for output.
+ * khmer/kfile.py: change 'check_input_files' to no longer warn that
+	'-' doesn't exist.
+ * tests/test-data/paired.fq.2: removed extraneous newline from end.
+ * tests/{test_normalize_by_median,test_script_arguments,test_scripts}.py:
+ added tests for new code.
+ * scripts/oxli: added script for running tests in development directory.
+ * khmer/{__init__,khmer_args}.py,tests/{test_normalize_by_median,
+ test_script_arguments}.py: refactored out use of AssertionError by not
+ throwing plain Exceptions when a ValueError or RuntimeError would do.
+ * oxli/__init__.py: give default help instead of an error when `oxli` is
+ called with no arguments.
+ * tests/test_{normalize_by_median,sandbox_scripts,scripts,streaming_io}.py:
+ always check status code if calling `runscripts` with `fail_ok=True`.
+
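
For reference, the -o/argparse.FileType switch in the entry above is what
makes '-' map to stdout. A minimal sketch of the pattern, using only the
standard library (the option name here is illustrative, not copied from
the scripts):

    import argparse
    import sys

    parser = argparse.ArgumentParser()
    # FileType('w') opens the named file for writing; argparse itself
    # translates the special name '-' into sys.stdout.
    parser.add_argument('-o', '--output', metavar='FILE',
                        type=argparse.FileType('w'), default=sys.stdout)

    args = parser.parse_args(['-o', '-'])
    print(args.output is sys.stdout)  # True: '-' selected stdout
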
+2015-07-28 Luiz Irber <khmer at luizirber.org>
+
+ * sandbox/unique-kmers.py: moved to scripts.
+ * scripts/unique-kmers.py: fix import bug and initialize to_print earlier.
+ * tests/test_scripts.py: add tests for unique-kmers.py.
+ * doc/user/scripts.rst: added unique-kmers.py to script page
+
+2015-07-28 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * scripts/abundance-dist.py: disallowed forcing on the input file check for
+ the counting table file
+
+2015-07-28 Michael R. Crusoe <crusoe at ucdavis.edu>
+
+ * .mailmap, Makefile: generate a list of authors
+
+2015-07-28 Kevin Murray <spam at kdmurray.id.au>
+ Titus Brown <titus at idyll.org>
+
+ * khmer/utils.py: added fix for SRA-style FASTQ output.
+ * tests/test_scripts.py: tested against a broken version of SRA format.
+ * tests/test-data/paired-broken4.fq.{1,2}: added test files.
+
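
The SRA fix above has to reconcile two FASTQ read-naming conventions: the
old Illumina "name/1" and "name/2" suffixes versus SRA/Casava-style
"name 1:N:0:..." comments. A hedged sketch of the kind of name
normalization involved (an illustrative helper, not the actual code in
khmer/utils.py):

    def base_read_name(record_name):
        # SRA/Casava style keeps the pair number in a whitespace-separated
        # comment ("id 1:N:0:ACGT"): drop everything after the first space.
        name = record_name.split()[0]
        # Old Illumina style encodes the pair as a '/1' or '/2' suffix.
        if name.endswith('/1') or name.endswith('/2'):
            name = name[:-2]
        return name

    # Both spellings of the same pair reduce to one base name:
    assert base_read_name('read1/1') == base_read_name('read1 2:N:0:ACGT')
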
+2015-07-28 Michael R. Crusoe <crusoe at ucdavis.edu>
+ Titus Brown <titus at idyll.org>
+
+ * lib/read_aligner.{cc,hh},tests/{test_read_aligner.py,
+ test-data/readaligner-{default,k12}.json},khmer/__init__.py: refactor,
+ read aligner parameters are now configurable & save/load-able. Can do
+ whole-genome variant finding.
+ * khmer/_khmer.cc,tests/test_read_aligner.py: ReadAligner.align_forward
+ method added
+ * sandbox/correct-errors.py -> sandbox/correct-reads.py: total rewrite
+ * sandbox/error-correct-pass2.py: new script
+ * sandbox/readaligner_pairhmm_train.py: new script
+ * tests/test_sandbox_scripts.py, doc/release-notes/release-1.4.rst:
+ spelling fixes, import re-arrangement
+ * sandbox/{Makefile.read_aligner_training,readaligner_pairhmm_train.py}:
+ Added script to train the aligner
+
+2015-07-27 Titus Brown <titus at idyll.org>
+
+ * khmer/khmer_args.py,CITATION: added entry for PeerJ paper on
+ semi-streaming to citations.
+ * scripts/{abundance-dist-single.py,abundance-dist.py,count-median.py,
+ count-overlap.py,filter-abund-single.py,load-into-counting.py}: changed
+ default behavior to output data in CSV format and report total k-mers.
+ * tests/test_scripts.py: updated/removed tests for CSV.
+ * doc/whats-new-2.0.rst: added information about change in columnar output,
+ along with other minor corrections.
+ * scripts/normalize-by-median.py: corrected epilog.
+ * khmer/thread_utils.py,
+ sandbox/{calc-best-assembly.py,extract-single-partition.py},
+ scripts/{count-median.py,extract-long-sequences.py,extract-paired-reads.py,
+ extract-partitions.py,fastq-to-fasta.py,
+ interleave-reads.py,normalize-by-median.py,readstats.py,
+ sample-reads-randomly.py,split-paired-reads.py,trim-low-abund.py},
+ tests/{test_normalize_by_median.py,test_scripts.py}: remove explicit
+ 'parse_description' from screed open calls.
+ * khmer/_khmer.cc,lib/Makefile,lib/hashtable.{cc,hh},setup.py: removed
+ WITH_INTERNAL_METRICS and trace_logger/perf_metrics references.
+ * lib/perf_metrics.{cc,hh},lib/trace_logger.{cc,hh}: removed unused files.
+
+2015-07-24 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * doc/dev/getting-started.rst: added instructions for second contribution
+
+2015-07-22 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * tests/test_read_parsers.py: added workaround for bug in OSX Python
+ * Makefile: respect that workaround when running the tests
+
+2015-07-21 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * khmer/{kfile,khmer_args}.py: refactored information passing, made it so
+ space checks happen in the right directory.
+ * oxli/build_graph.py,sandbox/collect-reads.py,scripts/{
+ abundance-dist-single,filter-abund-single,load-into-counting,
+ normalize-by-median,trim-low-abund}.py,tests/test_script_arguments.py:
+ changed to use new arg structure for checking hashtable save space.
+ * oxli/functions.py,scripts/saturate-by-median.py: updated error message
+ to mention --force option.
+ * scripts/{count-overlap,load-into-counting,make-initial-stoptags,
+ partition-graph,sample-reads-randomly}.py: removed unnecessary call to
+ check_space.
+
+2015-07-20 Titus Brown <titus at idyll.org>
+
+ * khmer/__init__.py: cleaned up FP rate reporting.
+ * scripts/normalize-by-median.py: corrected epilog; refactored reporting
+ to be a bit cleaner; use CSV for reporting file;
+ added --report-frequency arg.
+ * tests/test_normalize_by_median.py: updated/added tests for reporting.
+
+2015-07-17 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * oxli/{functions,build_graph}.py,scripts/{load-graph,normalize-by-median,
+ abundance-dist}.py,tests/test_{normalize_by_median,subset_graph,hashbits,
+	oxli_functions}.py: pylint cleanup.
+
+2015-07-17 Michael R. Crusoe <crusoe at ucdavis.edu>
+
+ * Makefile, tests/test_read_aligner.py: import khmer when pylinting.
+
+2015-07-17 Michael R. Crusoe <crusoe at ucdavis.edu>
+
+	* lib/read_parsers.{cc,hh}: use std::string everywhere to match existing
+ exceptions.
+
+2015-07-10 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * khmer/kfile.py: changed check_valid_file_exists to recognize fifos as
+ non-empty.
+ * tests/test_normalize_by_median.py: added test.
+
+2015-07-10 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * oxli/functions.py: changed estimate functions to use correct letter
+ abbreviations.
+ * sandbox/estimate_optimal_hash.py: changed to use renamed estimate
+ functions.
+ * sandbox/unique-kmers.py: changed to not output recommended HT args by
+ default.
+ * tests/test_oxli_functions.py: changed to use renamed estimate functions.
+
+2015-07-10 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * oxli/functions.py: added '--force' check to sanity check.
+
+2015-07-10 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * oxli/functions.py: moved optimization/sanity check func to oxli.
+ * scripts/normalize-by-median.py,oxli/build_graph.py: added
+ optimization/sanity checking via oxli estimation funcs.
+ * tests/test_normalize_by_median.py: updated tests to cover estimation
+ functions.
+
+2015-07-08 Luiz Irber <khmer at luizirber.org>
+
+ * lib/{counting,hashbits,hashtable,labelhash,subset}.cc: print hexadecimal
+ representation of the signature read from the file.
+
+2015-07-06 Luiz Irber <khmer at luizirber.org>
+
+ * sandbox/collect-reads.py: Set a default value for coverage based
+ on the docstring.
+ * sandbox/count-kmers-single.py, tests/test_{functions,script_arguments}.py:
+ Replace xrange and cStringIO (not Python 3 compatible).
+ * lib/*.{hh,cc}, oxli/functions.py, tests/*.py: make format.
+
+2015-07-05 Jacob Fenton <bocajnotnef at gmail.com>
+
+ * doc/whats-new-2.0.rst: added in normalize-by-median.py broken paired
+ updates.
+
+2015-07-05 Michael R. Crusoe <crusoe at ucdavis.edu>
+
+ * Makefile: fix cppcheck invocation.
+ * khmer/_khmer.cc: switch to prefix increment for non-primitive objects,
+ use a C++ cast, adjust scope.
+ * lib/hashtable.{hh,cc}: make copy constructor no-op explicit. adjust scope
+ * lib/{ht-diff,test-HashTables,test-Parser}.cc: remove unused test code.
+ * lib/labelhash.cc,hllcounter.cc: astyle reformatting.
+ * lib/read_parsers.hh: more explicit constructors.
+
+2015-07-05 Michael R. Crusoe <crusoe at ucdavis.edu>
+
+ * sandbox/{collect-variants,optimal_args_hashbits,sweep-files}.py:
+ update API usage.
+
+2015-07-05 Titus Brown <titus at idyll.org>
+
+ * sandbox/{count-kmers.py,count-kmers-single.py}: added scripts to output
+ k-mer counts.
+ * tests/test_sandbox_scripts.py: added tests for count-kmers.py and
+ count-kmers-single.py.
+ * sandbox/README.rst: added count-kmers.py and count-kmers-single.py to
+ sandbox/README.
+
+2015-07-05 Kevin Murray <spam at kdmurray.id.au>
+
+ * lib/*.{cc,hh},sandbox/*.py,khmer/_khmer.cc,tests/test_*.py: Simplify
+ exception hierarchy, and ensure all C++ exceptions are converted to python
+ errors.
+ * scripts/normalize-by-median.py: Clarify error message.
+	* tests/khmer_tst_utils.py: Add longify function, which converts int =>
+	long on py2 and passes the list through unmodified on py3.
+
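
A minimal sketch of what the longify helper described above could look
like (behavior taken from the entry, not from the shipped code):

    import sys

    def longify(listofints):
        # py2 tests compare against long literals, so promote each int;
        # py3 has no separate long type, so pass the list through unchanged.
        if sys.version_info[0] == 2:
            return [long(x) for x in listofints]  # noqa: F821 -- py2 only
        return listofints
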
2015-06-30 Jacob Fenton <bocajnotnef at gmail.com>
* tests/{test_script_arguments,test_functions}.py: changed tests to use
diff --git a/MANIFEST.in b/MANIFEST.in
index 2d49b98..31c1bf4 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -10,6 +10,7 @@ include third-party/zlib/zconf.h.in third-party/zlib/zlib.pc.in
exclude third-party/zlib/Makefile third-party/zlib/zconf.h
recursive-include scripts filter-abund.xml normalize-by-median.xml README.txt
graft tests
+include scripts/oxli
global-exclude *.orig
global-exclude *.pyc
diff --git a/Makefile b/Makefile
index 616efdc..9a1d378 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@
# and documentation
# make coverage-report to check coverage of the python scripts by the tests
-CPPSOURCES=$(wildcard lib/*.cc lib/*.hh khmer/_khmermodule.cc)
+CPPSOURCES=$(wildcard lib/*.cc lib/*.hh khmer/_khmer.cc)
PYSOURCES=$(wildcard khmer/*.py scripts/*.py)
SOURCES=$(PYSOURCES) $(CPPSOURCES) setup.py
DEVPKGS=pep8==1.5.7 diff_cover autopep8 pylint coverage gcovr nose pep257 \
@@ -12,19 +12,27 @@ DEVPKGS=pep8==1.5.7 diff_cover autopep8 pylint coverage gcovr nose pep257 \
GCOVRURL=git+https://github.com/nschum/gcovr.git@never-executed-branches
VERSION=$(shell git describe --tags --dirty | sed s/v//)
-CPPCHECK=ls lib/*.cc khmer/_khmermodule.cc | grep -v test | cppcheck -DNDEBUG \
- -DVERSION=0.0.cppcheck -UNO_UNIQUE_RC --enable=all \
- --file-list=- --platform=unix64 --std=c++03 --inline-suppr \
+CPPCHECK=ls lib/*.cc khmer/_khmer.cc | grep -v test | cppcheck -DNDEBUG \
+ -DVERSION=0.0.cppcheck -DSEQAN_HAS_BZIP2=1 -DSEQAN_HAS_ZLIB=1 \
+ -UNO_UNIQUE_RC --enable=all --suppress='*:/usr/*' \
+ --file-list=- --platform=unix64 --std=c++11 --inline-suppr \
--quiet -Ilib -Ithird-party/bzip2 -Ithird-party/zlib \
- -Ithird-party/smhasher
+ -Ithird-party/smhasher -I/usr/include/python3.4m -DHAVE_SSIZE_T \
+ -D__linux__ -D__x86_64__ -D__LP64__ -I/usr/include \
+ -I/usr/include/x86_64-linux-gnu/ -I/usr/include/linux \
+ -I/usr/lib/gcc/x86_64-linux-gnu/4.9/include/
UNAME := $(shell uname)
ifeq ($(UNAME),Linux)
- TESTATTR='!known_failing,!jenkins,!huge'
+ TESTATTR ?= '!known_failing,!jenkins,!huge'
else
- TESTATTR='!known_failing,!jenkins,!huge'
+ TESTATTR ?= '!known_failing,!jenkins,!huge,!linux'
endif
+
+MODEXT=$(shell python -c "import sysconfig;print(sysconfig.get_config_var('SO'))")
+EXTENSION_MODULE = khmer/_khmer$(MODEXT)
+
## all : default task; compile C++ code, build shared object library
all: sharedobj
@@ -40,9 +48,9 @@ install-dependencies:
pip install --upgrade --requirement doc/requirements.txt
## sharedobj : build khmer shared object file
-sharedobj: khmer/_khmermodule.so
+sharedobj: $(EXTENSION_MODULE)
-khmer/_khmermodule.so: $(CPPSOURCES)
+$(EXTENSION_MODULE): $(CPPSOURCES)
./setup.py build_ext --inplace
coverage-debug: $(CPPSOURCES)
@@ -64,7 +72,7 @@ dist/khmer-$(VERSION).tar.gz: $(SOURCES)
clean: FORCE
cd lib && ${MAKE} clean || true
cd tests && rm -rf khmertest_* || true
- rm -f khmer/_khmermodule.so
+ rm -f $(EXTENSION_MODULE)
rm -f khmer/*.pyc lib/*.pyc
./setup.py clean --all || true
rm -f coverage-debug
@@ -141,6 +149,7 @@ format: astyle autopep8
## pylint : run static code analysis on Python code
pylint: $(PYSOURCES) $(wildcard tests/*.py)
pylint --msg-template="{path}:{line}: [{msg_id}({symbol}), {obj}] {msg}" \
+ --extension-pkg-whitelist=khmer \
setup.py khmer/[!_]*.py khmer/__init__.py scripts/*.py tests \
oxli/*.py || true
@@ -155,9 +164,9 @@ diff_pylint_report: pylint_report.txt
# We need to get coverage to look at our scripts. Since they aren't in a
# python module we can't tell nosetests to look for them (via an import
# statement). So we run nose inside of coverage.
-.coverage: $(PYSOURCES) $(wildcard tests/*.py) khmer/_khmermodule.so
+.coverage: $(PYSOURCES) $(wildcard tests/*.py) $(EXTENSION_MODULE)
coverage run --branch --source=scripts,khmer,oxli --omit=khmer/_version.py \
- -m nose --with-xunit --attr=\!known_failing --processes=0
+        -m nose --with-xunit --attr $(TESTATTR) --processes=0
coverage.xml: .coverage
coverage xml
@@ -173,7 +182,8 @@ coverage-report: .coverage
coverage-gcovr.xml: coverage-debug .coverage
gcovr --root=. --branches --output=coverage-gcovr.xml --xml \
- --gcov-exclude='.*zlib.*|.*bzip2.*|.*smhasher.*|.*seqan.*'
+ --gcov-exclude='.*zlib.*|.*bzip2.*|.*smhasher.*|.*seqan.*' \
+ --exclude-unreachable-branches
diff-cover: coverage-gcovr.xml coverage.xml
diff-cover coverage-gcovr.xml coverage.xml
@@ -207,13 +217,13 @@ libtest: FORCE
$(MAKE) all && \
$(MAKE) install PREFIX=../install_target
test -d install_target/include
- test -f install_target/include/khmer.hh
+ test -f install_target/include/oxli/khmer.hh
test -d install_target/lib
- test -f install_target/lib/libkhmer.a
+ test -f install_target/lib/liboxli.a
$(CXX) -o install_target/test-prog-static -I install_target/include \
- lib/test-compile.cc install_target/lib/libkhmer.a
+ lib/test-compile.cc install_target/lib/liboxli.a
$(CXX) -o install_target/test-prog-dynamic -I install_target/include \
- -L install_target/lib lib/test-compile.cc -lkhmer
+ -L install_target/lib lib/test-compile.cc -loxli
rm -rf install_target
## test : run the khmer test suite
@@ -276,4 +286,23 @@ convert-release-notes:
pandoc --from=markdown --to=rst $${file} > $${file%%.md}.rst; \
done
+list-authors:
+ @echo '\author[1]{Michael R. Crusoe}'
+ @git log --format='\author[]{%aN}' | sort -uk2 | \
+ grep -v 'root\|crusoe\|titus'
+ @echo '\author[]{C. Titus Brown}'
+ @echo '\affil[1]{mcrusoe at msu.edu}'
+ @git log --format='\author[]{%aN} \affil[]{%aE}' | sort -uk2 | \
+ awk -F\\ '{print "\\"$$3}' | grep -v \
+ 'root\|crusoe\|titus\|waffle\|boyce\|pickett.rodney'
+ # R. Boyce requested to be removed 2015/05/21
+ # via pers correspondence to MRC
+ # P Rodney requested to be removed 2015/06/22 via pers correspondence
+ # to MRC
+ @echo '\affil[]{titus at idyll.org}'
+
+list-author-emails:
+ @echo 'name, E-Mail Address'
+ @git log --format='%aN,%aE' | sort -u | grep -v 'root\|waffle\|boyce'
+
FORCE:
diff --git a/doc/dev/getting-started.rst b/doc/dev/getting-started.rst
index a07c743..8434c0e 100644
--- a/doc/dev/getting-started.rst
+++ b/doc/dev/getting-started.rst
@@ -7,6 +7,8 @@
Getting started with khmer development
======================================
+.. contents::
+
This document is for people who would like to contribute to khmer. It
walks first-time contributors through making their own copy of khmer,
building it, and submitting changes for review and merge into the master
@@ -332,3 +334,26 @@ Here are a few suggestions:
* You can also help other people out by watching for new issues or
looking at pull requests. Remember to be nice and polite!
+
+Your second contribution...
+---------------------------
+
+Here are a few pointers on getting started on your second (or third,
+or fourth, or nth contribution).
+
+So, assuming you've found an issue you'd like to work on, there are a
+couple of things to do to make sure your local copy of the repository is
+ready for a new issue--specifically, we need to make sure it's in sync
+with the remote repository so you aren't working on an old copy. So::
+
+ git checkout master
+ git fetch --all
+ git pull
+
+This puts you on the latest master branch and fetches the updates that
+have landed on GitHub since your last contribution (usually including
+the merge of your last contribution), then merges those changes into
+your local copy of the master branch.
+
+Now, you can go back to `Claiming an issue and starting to develop`_.
diff --git a/doc/release-notes/release-1.4.rst b/doc/release-notes/release-1.4.rst
index c257299..b9abf6c 100644
--- a/doc/release-notes/release-1.4.rst
+++ b/doc/release-notes/release-1.4.rst
@@ -83,7 +83,7 @@ correctly #781 @drtamermansour
``split-paired-reads.py``: added ``-o`` option to allow specification of
an output directory #752 @bede
-Fixed a string formatting and a boundry error in
+Fixed a string formatting and a boundary error in
``sample-reads-randomly.py`` #773 @qingpeng #995 @ctb
CSV output added to ``abundance-dist.py``, ``abundance-dist-single.py``,
diff --git a/doc/user/install.rst b/doc/user/install.rst
index e43709c..c3b49db 100644
--- a/doc/user/install.rst
+++ b/doc/user/install.rst
@@ -4,9 +4,8 @@
Installing and running khmer
============================
-You'll need a 64-bit operating system, Python 2.7.x and internet access.
-
-The khmer project currently works with Python 2.6 but we target Python 2.7.x.
+You'll need a 64-bit operating system, internet access, and Python
+2.7.x OR Python 3.3 or greater.
Build requirements
------------------
diff --git a/doc/user/scripts.rst b/doc/user/scripts.rst
index 3bc0cbb..8f2b327 100644
--- a/doc/user/scripts.rst
+++ b/doc/user/scripts.rst
@@ -58,6 +58,9 @@ k-mer counting and abundance filtering
.. autoprogram:: count-overlap:get_parser()
:prog: count-overlap.py
+.. autoprogram:: unique-kmers:get_parser()
+ :prog: unique-kmers.py
+
.. _scripts-partitioning:
Partitioning
diff --git a/doc/whats-new-2.0.rst b/doc/whats-new-2.0.rst
index 2a7f0c7..dc9cc69 100644
--- a/doc/whats-new-2.0.rst
+++ b/doc/whats-new-2.0.rst
@@ -3,6 +3,27 @@
What's New In khmer 2.0?
########################
+New behavior
+============
+
+Digital normalization script now supports mixed paired and unpaired read input
+------------------------------------------------------------------------------
+
+`normalize-by-median.py` now supports mixed paired and unpaired (or
+"broken-paired") input. Behavior can be forced to either treat all
+reads as singletons or to require all reads be properly paired using
+:option:`--force-single` or :option:`--paired`, respectively. If
+:option:`--paired` is set, :option:`--unpaired-reads` can be used to
+include a file of unpaired reads. The unpaired reads will be examined
+after all of the other sequence files.
+
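
A "broken-paired" stream interleaves proper pairs with singletons. A
hedged sketch of the grouping idea, keyed on read names (an illustrative
generator, not khmer's actual reader):

    def broken_paired_groups(names):
        # Yield (read1, read2) for adjacent records whose base names
        # match, and (read, None) for singletons.
        prev = None
        for name in names:
            if prev is not None and prev.split('/')[0] == name.split('/')[0]:
                yield prev, name
                prev = None
            else:
                if prev is not None:
                    yield prev, None
                prev = name
        if prev is not None:
            yield prev, None

    print(list(broken_paired_groups(['a/1', 'a/2', 'b/1'])))
    # [('a/1', 'a/2'), ('b/1', None)]
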
+Reservoir sampling script extracts paired reads by default
+----------------------------------------------------------
+
+`sample-reads-randomly.py` now retains pairs in the output, by
+default. This can be overridden to match previous behavior
+with :option:`--force_single`.
+
Incompatible changes
====================
@@ -27,3 +48,12 @@ this project.
Files of the above types made in previous versions of khmer are not compatible
with v2.0; the reverse is also true.
+
+Scripts now output columnar data in CSV format by default
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+All scripts that output any kind of columnar data now do so in CSV format,
+with headers. Previously this had to be enabled with :option:`--csv`.
+(Affects `abundance-dist-single.py`, `abundance-dist.py`, `count-median.py`,
+and `count-overlap.py`.) `normalize-by-median.py` also now outputs CSV
+when :option:`-R` is used.
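
Because the output now carries a header row, downstream consumers can
read it with nothing but the standard library; a small sketch (the file
and column names are hypothetical):

    import csv

    with open('reads.fa.abund.csv') as fp:
        for row in csv.DictReader(fp):  # the header row supplies the keys
            print(row)
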
diff --git a/jenkins-build.sh b/jenkins-build.sh
index 94f7d78..54042e9 100755
--- a/jenkins-build.sh
+++ b/jenkins-build.sh
@@ -4,11 +4,13 @@ make clean
rm -Rf .env dist cov-int
-if type python2> /dev/null 2>&1
-then
- PYTHON_EXECUTABLE=$(which python2)
-else
- PYTHON_EXECUTABLE=$(which python)
+if [ -z "${PYTHON_EXECUTABLE}" ]; then
+ if type python2> /dev/null 2>&1
+ then
+ PYTHON_EXECUTABLE=$(which python2)
+ else
+ PYTHON_EXECUTABLE=$(which python)
+ fi
fi
virtualenv -p ${PYTHON_EXECUTABLE} .env
@@ -32,7 +34,7 @@ then
export CFLAGS="-pg -fprofile-arcs -ftest-coverage"
python setup.py build_ext --build-temp $PWD --debug --inplace \
--libraries gcov develop
- make coverage-gcovr.xml coverage.xml
+ make coverage-gcovr.xml coverage.xml TESTATTR='!known_failing,!huge'
./setup.py install
else
echo "gcov was not found (or we are on OSX), skipping coverage check"
@@ -70,5 +72,10 @@ fi
# takes too long to run on every build
#bash -ex -c 'cd examples/stamps/; ./do.sh' || { echo examples/stamps/do.sh no longer runs; /bin/false; }
-make lib
+unset CFLAGS
+unset LDFLAGS
+unset CPPFLAGS
+unset CXXFLAGS
+
+# Don't do lib too, as we already compile as part of libtest
make libtest
diff --git a/khmer/__init__.py b/khmer/__init__.py
index 032ca7b..b0b48e9 100644
--- a/khmer/__init__.py
+++ b/khmer/__init__.py
@@ -7,12 +7,14 @@
"""This is khmer; please see http://khmer.readthedocs.org/."""
from __future__ import print_function
+from math import log
+import json
from khmer._khmer import CountingHash as _CountingHash
from khmer._khmer import LabelHash as _LabelHash
from khmer._khmer import Hashbits as _Hashbits
from khmer._khmer import HLLCounter as _HLLCounter
-from khmer._khmer import ReadAligner
+from khmer._khmer import ReadAligner as _ReadAligner
from khmer._khmer import forward_hash # figuregen/*.py
# tests/test_{functions,counting_hash,labelhash,counting_single}.py
@@ -147,7 +149,8 @@ def extract_countinghash_info(filename):
def calc_expected_collisions(hashtable, force=False, max_false_pos=.2):
"""Do a quick & dirty expected collision rate calculation on a hashtable.
- Check to see that collision rate is within threshold.
+
+ Also check to see that collision rate is within threshold.
Keyword argument:
hashtable: the hashtable object to inspect
@@ -171,8 +174,8 @@ def calc_expected_collisions(hashtable, force=False, max_false_pos=.2):
print("** Do not use these results!!", file=sys.stderr)
print("**", file=sys.stderr)
print("** (estimated false positive rate of %.3f;" % fp_all,
- file=sys.stderr)
- print("max allowable %.3f" % max_false_pos, file=sys.stderr)
+ file=sys.stderr, end=' ')
+ print("max recommended %.3f)" % max_false_pos, file=sys.stderr)
print("**", file=sys.stderr)
if not force:
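
The false-positive figure reported here follows the usual Bloom-filter
occupancy argument: if each of the N tables is a fraction p occupied, an
unseen k-mer registers as present in all of them with probability about
p**N. A sketch of that estimate, assuming the table exposes n_occupied()
and hashsizes() as khmer's tables do:

    def approx_expected_collisions(hashtable):
        # Occupancy relative to the smallest table approximates the
        # per-table hit probability for a novel k-mer; every table must
        # report a hit to produce a false positive.
        fp_one = hashtable.n_occupied() / float(min(hashtable.hashsizes()))
        return fp_one ** len(hashtable.hashsizes())
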
@@ -218,8 +221,8 @@ def get_n_primes_near_x(number, target):
i -= 2
if len(primes) != number:
- raise Exception("unable to find %d prime numbers < %d" % (number,
- target))
+ raise RuntimeError("unable to find %d prime numbers < %d" % (number,
+ target))
return primes
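
For context, get_n_primes_near_x collects `number` primes by stepping
downward through odd candidates below `target`; the change above only
swaps the bare Exception for a RuntimeError when the supply runs out. A
sketch of that loop (is_prime is assumed to be defined elsewhere in the
module):

    def get_n_primes_near_x(number, target):
        primes = []
        i = target - 1
        if i % 2 == 0:
            i -= 1  # even numbers > 2 can't be prime; test odds only
        while len(primes) != number and i > 0:
            if is_prime(i):  # assumed helper
                primes.append(i)
            i -= 2
        if len(primes) != number:
            raise RuntimeError("unable to find %d prime numbers < %d" %
                               (number, target))
        return primes
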
@@ -287,3 +290,101 @@ class HLLCounter(_HLLCounter):
def __len__(self):
return self.estimate_cardinality()
+
+
+class ReadAligner(_ReadAligner):
+
+ """Sequence to graph aligner.
+
+ ReadAligner uses a CountingHash (the counts of k-mers in the target DNA
+ sequences) as an implicit De Bruijn graph. Input DNA sequences are aligned
+ to this graph via a paired Hidden Markov Model.
+
+    The HMM is configured upon class instantiation; default parameters for the
+    HMM are provided in 'defaultTransitionProbabilities' and
+ 'defaultScoringMatrix'.
+
+ The main method is 'align'.
+ """
+
+ defaultTransitionProbabilities = ( # _M, _Ir, _Ig, _Mu, _Iru, _Igu
+ (log(0.9848843, 2), log(0.0000735, 2), log(0.0000334, 2),
+ log(0.0150068, 2), log(0.0000017, 2), log(0.0000003, 2)), # M_
+ (log(0.5196194, 2), log(0.4647955, 2), log(0.0059060, 2),
+ log(0.0096792, 2)), # Ir_
+ (log(0.7611255, 2), log(0.2294619, 2), log(0.0072673, 2),
+ log(0.0021453, 2)), # Ig_
+ (log(0.0799009, 2), log(0.0000262, 2), log(0.0001836, 2),
+ log(0.9161349, 2), log(0.0033370, 2), log(0.0004173, 2)), # Mu_
+ (log(0.1434529, 2), log(0.0036995, 2), log(0.2642928, 2),
+ log(0.5885548, 2)), # Iru_
+ (log(0.1384551, 2), log(0.0431328, 2), log(0.6362921, 2),
+ log(0.1821200, 2)) # Igu_
+ )
+
+ defaultScoringMatrix = [
+ log(0.955, 2), log(0.04, 2), log(0.004, 2), log(0.001, 2)]
+
+ def __new__(cls, counting_table, trusted_cov_cutoff, bits_theta,
+ **kwargs):
+
+ if 'filename' in kwargs:
+ with open(kwargs.pop('filename')) as paramfile:
+ params = json.load(paramfile)
+ scoring_matrix = params['scoring_matrix']
+ transition_probabilities = params['transition_probabilities']
+ else:
+ if 'scoring_matrix' in kwargs:
+ scoring_matrix = kwargs.pop('scoring_matrix')
+ else:
+ scoring_matrix = ReadAligner.defaultScoringMatrix
+ if 'transition_probabilities' in kwargs:
+ transition_probabilities = kwargs.pop(
+ 'transition_probabilities')
+ else:
+ transition_probabilities = \
+ ReadAligner.defaultTransitionProbabilities
+ r = _ReadAligner.__new__(cls, counting_table, trusted_cov_cutoff,
+ bits_theta, scoring_matrix,
+ transition_probabilities)
+ r.graph = counting_table
+ return r
+
+ def __init__(self, *args, **kwargs):
+ """
+ ReadAligner initialization.
+
+ HMM state notation abbreviations:
+ M_t - trusted match; M_u - untrusted match
+ Ir_t - trusted read insert; Ir_u - untrusted read insert
+ Ig_t - trusted graph insert; Ig_u - untrusted graph insert
+
+ Keyword arguments:
+ filename - a path to a JSON encoded file providing the scoring matrix
+ for the HMM in an entry named 'scoring_matrix' and the transition
+        probabilities for the HMM in an entry named
+        'transition_probabilities'. If provided, the remaining keyword
+ arguments are ignored. (default: None)
+ scoring_matrix - a list of floats: trusted match, trusted mismatch,
+        untrusted match, untrusted mismatch. (default:
+ ReadAligner.defaultScoringMatrix)
+ transition_probabilities - A sparse matrix as a tuple of six tuples.
+ The inner tuples contain 6, 4, 4, 6, 4, and 4 floats respectively.
+        Transitions are notated as 'StartState-NextState':
+ (
+ ( M_t-M_t, M_t-Ir_t, M_t-Ig_t, M_t-M_u, M_t-Ir_u, M_t-Ig_u),
+ (Ir_t-M_t, Ir_t-Ir_t, Ir_t-M_u, Ir_t-Ir_u ),
+ (Ig_t-M_t, , Ig_t-Ig_t, Ig_t-M_u, Ig_t-Ig_u),
+ ( M_u-M_t, M_u-Ir_t, M_u-Ig_t, M_u-M_u, M_u-Ir_u, M_u-Ig_u),
+ (Ir_u-M_t, Ir_u-Ir_t, Ir_u-M_u, Ir_u-Ir_u ),
+ (Ig_u-M_t, , Ig_u-Ig_t, Ig_u-M_u, Ig_u-Ig_u)
+ )
+ (default: ReadAligner.defaultTransitionProbabilities)
+
+
+ Note: the underlying CPython implementation creates the ReadAligner
+ during the __new__ process and so the class initialization actually
+        occurs there. Instantiation is documented here in __init__ as this is
+ the traditional way.
+ """
+ _ReadAligner.__init__(self)
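
Putting the wrapper together, a hedged usage sketch (file names and
counting-table parameters here are illustrative, not taken from the khmer
docs):

    import khmer

    ct = khmer.CountingHash(21, int(1e7), 4)  # k=21, four tables
    ct.consume_fasta('reference-reads.fa')    # hypothetical input file

    # HMM defaults from defaultScoringMatrix/defaultTransitionProbabilities;
    # 2 = trusted-coverage cutoff, 1 = bits_theta.
    aligner = khmer.ReadAligner(ct, 2, 1)

    # Parameters can instead be loaded from JSON, e.g. the shipped
    # tests/test-data/readaligner-default.json:
    trained = khmer.ReadAligner(
        ct, 2, 1, filename='tests/test-data/readaligner-default.json')

    score, graph_aln, read_aln, truncated = aligner.align('ACCTAGGTTCGACATG')
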
diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc
index d58e832..d41d438 100644
--- a/khmer/_khmer.cc
+++ b/khmer/_khmer.cc
@@ -75,57 +75,6 @@ extern "C" {
MOD_INIT(_khmer);
}
-// Configure module logging.
-//#define WITH_INTERNAL_TRACING
-namespace khmer
-{
-
-namespace python
-{
-
-#ifdef WITH_INTERNAL_TRACING
-#warning "Internal tracing of Python extension module is enabled."
-static uint8_t const _MODULE_TRACE_LEVEL = TraceLogger:: TLVL_DEBUG9;
-static void _trace_logger(
- uint8_t level, char const * format, ...
-)
-{
- static FILE * _stream_handle = NULL;
-
- if (NULL == _stream_handle) {
- _stream_handle = fopen( "pymod.log", "w" );
- }
-
- va_list varargs;
-
- if (_MODULE_TRACE_LEVEL <= level) {
- va_start( varargs, format );
- vfprintf( _stream_handle, format, varargs );
- va_end( varargs );
- fflush( _stream_handle );
- }
-
-}
-#endif
-
-
-} // namespace python
-
-} // namespace khmer
-
-
-class _khmer_exception
-{
-private:
- std::string _message;
-public:
- _khmer_exception(std::string message) : _message(message) { };
- inline const std::string get_message() const
- {
- return _message;
- };
-};
-
/***********************************************************************/
//
@@ -218,7 +167,7 @@ static PyGetSetDef khmer_Read_accessors [ ] = {
static PyTypeObject khmer_Read_Type = {
PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */
- "_khmer.Read", /* tp_name */
+ "_khmer.Read", /* tp_name */
sizeof(khmer_Read_Object), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)khmer_Read_dealloc, /* tp_dealloc */
@@ -314,8 +263,8 @@ _ReadParser_new( PyTypeObject * subtype, PyObject * args, PyObject * kwds )
try {
myself->parser =
IParser:: get_parser( ifile_name );
- } catch (InvalidStreamHandle &exc) {
- PyErr_SetString( PyExc_ValueError, exc.what() );
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString( PyExc_OSError, exc.what() );
return NULL;
}
return self;
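
From the Python side, the net effect of mapping khmer_file_exception to
PyExc_OSError is that file-level problems now surface as OSError; a
hedged sketch (the path is hypothetical):

    import khmer

    try:
        parser = khmer.ReadParser('no-such-file.fq')
    except OSError as err:  # formerly raised as ValueError here
        print('could not open input:', err)
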
@@ -329,12 +278,13 @@ _ReadParser_iternext( PyObject * self )
khmer_ReadParser_Object * myself = (khmer_ReadParser_Object *)self;
IParser * parser = myself->parser;
- bool stop_iteration = false;
- char const * exc = NULL;
- Read * the_read_PTR;
+ bool stop_iteration = false;
+ const char *value_exception = NULL;
+ const char *file_exception = NULL;
+ Read *the_read_PTR = NULL;
try {
the_read_PTR = new Read( );
- } catch (std::bad_alloc &e) {
+ } catch (std::bad_alloc &exc) {
return PyErr_NoMemory();
}
@@ -343,13 +293,13 @@ _ReadParser_iternext( PyObject * self )
if (!stop_iteration) {
try {
parser->imprint_next_read( *the_read_PTR );
- } catch (NoMoreReadsAvailable &e) {
+ } catch (NoMoreReadsAvailable &exc) {
stop_iteration = true;
- } catch (StreamReadError &e) {
- exc = e.what();
- } catch (InvalidRead &e) {
- exc = e.what();
- }
+ } catch (khmer_file_exception &exc) {
+ file_exception = exc.what();
+ } catch (khmer_value_exception &exc) {
+ value_exception = exc.what();
+ }
}
Py_END_ALLOW_THREADS
@@ -360,9 +310,14 @@ _ReadParser_iternext( PyObject * self )
return NULL;
}
- if (exc != NULL) {
+ if (file_exception != NULL) {
delete the_read_PTR;
- PyErr_SetString(PyExc_IOError, exc);
+ PyErr_SetString(PyExc_OSError, file_exception);
+ return NULL;
+ }
+ if (value_exception != NULL) {
+ delete the_read_PTR;
+ PyErr_SetString(PyExc_ValueError, value_exception);
return NULL;
}
@@ -377,42 +332,39 @@ PyObject *
_ReadPairIterator_iternext(khmer_ReadPairIterator_Object * myself)
{
khmer_ReadParser_Object * parent = (khmer_ReadParser_Object*)myself->parent;
- IParser * parser = parent->parser;
- uint8_t pair_mode = myself->pair_mode;
+ IParser *parser = parent->parser;
+ uint8_t pair_mode = myself->pair_mode;
ReadPair the_read_pair;
- bool stop_iteration = false;
- const char * value_error_what = NULL;
- const char * io_error_what = NULL;
+ bool stop_iteration = false;
+ const char *value_exception = NULL;
+ const char *file_exception = NULL;
Py_BEGIN_ALLOW_THREADS
stop_iteration = parser->is_complete( );
- if (!stop_iteration)
+ if (!stop_iteration) {
try {
parser->imprint_next_read_pair( the_read_pair, pair_mode );
- } catch (UnknownPairReadingMode &exc) {
- value_error_what = exc.what();
- } catch (InvalidRead &exc) {
- io_error_what = exc.what();
- } catch (InvalidReadPair &exc) {
- io_error_what = exc.what();
- } catch (StreamReadError &exc) {
- io_error_what = "Input file error.";
} catch (NoMoreReadsAvailable &exc) {
stop_iteration = true;
+ } catch (khmer_file_exception &exc) {
+ file_exception = exc.what();
+ } catch (khmer_value_exception &exc) {
+ value_exception = exc.what();
}
+ }
Py_END_ALLOW_THREADS
// Note: Can return NULL instead of setting the StopIteration exception.
if (stop_iteration) {
return NULL;
}
- if (value_error_what != NULL) {
- PyErr_SetString(PyExc_ValueError, value_error_what);
+ if (file_exception != NULL) {
+ PyErr_SetString(PyExc_OSError, file_exception);
return NULL;
}
- if (io_error_what != NULL) {
- PyErr_SetString( PyExc_IOError, io_error_what);
+ if (value_exception != NULL) {
+ PyErr_SetString(PyExc_ValueError, value_exception);
return NULL;
}
@@ -534,7 +486,9 @@ static PyGetSetDef khmer_ReadParser_accessors[] = {
{NULL, NULL, NULL, NULL, NULL} /* Sentinel */
};
-static PyTypeObject khmer_ReadParser_Type = {
+static PyTypeObject khmer_ReadParser_Type
+CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_ReadParser_Object")
+= {
PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */
"_khmer.ReadParser", /* tp_name */
sizeof(khmer_ReadParser_Object), /* tp_basicsize */
@@ -587,6 +541,10 @@ void _init_ReadParser_Type_constants()
int result;
PyObject * value = PyLong_FromLong( IParser:: PAIR_MODE_ALLOW_UNPAIRED );
+ if (value == NULL) {
+ Py_DECREF(cls_attrs_DICT);
+ return;
+ }
result = PyDict_SetItemString(cls_attrs_DICT,
"PAIR_MODE_ALLOW_UNPAIRED", value);
Py_XDECREF(value);
@@ -596,6 +554,10 @@ void _init_ReadParser_Type_constants()
}
value = PyLong_FromLong( IParser:: PAIR_MODE_IGNORE_UNPAIRED );
+ if (value == NULL) {
+ Py_DECREF(cls_attrs_DICT);
+ return;
+ }
result = PyDict_SetItemString(cls_attrs_DICT,
"PAIR_MODE_IGNORE_UNPAIRED", value );
Py_XDECREF(value);
@@ -605,6 +567,10 @@ void _init_ReadParser_Type_constants()
}
value = PyLong_FromLong( IParser:: PAIR_MODE_ERROR_ON_UNPAIRED );
+ if (value == NULL) {
+ Py_DECREF(cls_attrs_DICT);
+ return;
+ }
result = PyDict_SetItemString(cls_attrs_DICT,
"PAIR_MODE_ERROR_ON_UNPAIRED", value);
Py_XDECREF(value);
@@ -856,8 +822,11 @@ hashtable_consume_fasta(khmer_KHashtable_Object * me, PyObject * args)
unsigned int total_reads = 0;
try {
hashtable->consume_fasta(filename, total_reads, n_consumed);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
return NULL;
}
@@ -881,12 +850,30 @@ hashtable_consume_fasta_with_reads_parser(khmer_KHashtable_Object * me,
_PyObject_to_khmer_ReadParser( rparser_obj );
// call the C++ function, and trap signals => Python
- unsigned long long n_consumed = 0;
- unsigned int total_reads = 0;
+ unsigned long long n_consumed = 0;
+ unsigned int total_reads = 0;
+ const char *value_exception = NULL;
+ const char *file_exception = NULL;
+
Py_BEGIN_ALLOW_THREADS
- hashtable->consume_fasta(rparser, total_reads, n_consumed);
+ try {
+ hashtable->consume_fasta(rparser, total_reads, n_consumed);
+ } catch (khmer_file_exception &exc) {
+ file_exception = exc.what();
+ } catch (khmer_value_exception &exc) {
+ value_exception = exc.what();
+ }
Py_END_ALLOW_THREADS
+ if (file_exception != NULL) {
+ PyErr_SetString(PyExc_OSError, file_exception);
+ return NULL;
+ }
+ if (value_exception != NULL) {
+ PyErr_SetString(PyExc_ValueError, value_exception);
+ return NULL;
+ }
+
return Py_BuildValue("IK", total_reads, n_consumed);
}
@@ -974,7 +961,7 @@ hashtable_load(khmer_KHashtable_Object * me, PyObject * args)
try {
hashtable->load(filename);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -996,7 +983,7 @@ hashtable_save(khmer_KHashtable_Object * me, PyObject * args)
try {
hashtable->save(filename);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -1146,7 +1133,15 @@ hashtable_consume_fasta_and_tag(khmer_KHashtable_Object * me, PyObject * args)
unsigned long long n_consumed;
unsigned int total_reads;
- hashtable->consume_fasta_and_tag(filename, total_reads, n_consumed);
+ try {
+ hashtable->consume_fasta_and_tag(filename, total_reads, n_consumed);
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
+ return NULL;
+ }
return Py_BuildValue("IK", total_reads, n_consumed);
}
@@ -1272,7 +1267,7 @@ hashtable_load_stop_tags(khmer_KHashtable_Object * me, PyObject * args)
try {
hashtable->load_stop_tags(filename, clear_tags);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -1295,7 +1290,7 @@ hashtable_save_stop_tags(khmer_KHashtable_Object * me, PyObject * args)
try {
hashtable->save_stop_tags(filename);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -1450,11 +1445,18 @@ hashtable_do_subset_partition(khmer_KHashtable_Object * me, PyObject * args)
khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\
PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type);
+
+ if (subset_obj == NULL) {
+ delete subset_p;
+ return NULL;
+ }
+
subset_obj->subset = subset_p;
- return (PyObject *)subset_obj;
+ return (PyObject *) subset_obj;
}
+
static
PyObject *
hashtable_join_partitions_by_path(khmer_KHashtable_Object * me, PyObject * args)
@@ -1477,12 +1479,13 @@ hashtable_merge_subset(khmer_KHashtable_Object * me, PyObject * args)
{
Hashtable * hashtable = me->hashtable;
- khmer_KSubsetPartition_Object * subset_obj;
- if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) {
+ khmer_KSubsetPartition_Object * subset_obj = NULL;
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type,
+ &subset_obj)) {
return NULL;
}
- SubsetPartition * subset_p;
- subset_p = subset_obj->subset;
+
+ SubsetPartition * subset_p = subset_obj->subset;
hashtable->partition->merge(subset_p);
@@ -1503,7 +1506,7 @@ hashtable_merge_from_disk(khmer_KHashtable_Object * me, PyObject * args)
try {
hashtable->partition->merge_from_disk(filename);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -1527,22 +1530,28 @@ hashtable_consume_fasta_and_tag_with_reads_parser(khmer_KHashtable_Object * me,
read_parsers:: IParser * rparser = rparser_obj-> parser;
// call the C++ function, and trap signals => Python
- unsigned long long n_consumed = 0;
- unsigned int total_reads = 0;
- char const * exc = NULL;
+ const char *value_exception = NULL;
+ const char *file_exception = NULL;
+ unsigned long long n_consumed = 0;
+ unsigned int total_reads = 0;
+
Py_BEGIN_ALLOW_THREADS
try {
- hashtable->consume_fasta_and_tag(
- rparser, total_reads, n_consumed
- );
- } catch (khmer::read_parsers::NoMoreReadsAvailable &e) {
- exc = e.what();
+ hashtable->consume_fasta_and_tag(rparser, total_reads, n_consumed);
+ } catch (khmer_file_exception &exc) {
+ file_exception = exc.what();
+ } catch (khmer_value_exception &exc) {
+ value_exception = exc.what();
}
Py_END_ALLOW_THREADS
- if (exc != NULL) {
- PyErr_SetString(PyExc_IOError, exc);
+
+ if (file_exception != NULL) {
+ PyErr_SetString(PyExc_OSError, file_exception);
return NULL;
}
+    if (value_exception != NULL) {
+        PyErr_SetString(PyExc_ValueError, value_exception);
+        return NULL;
+    }
return Py_BuildValue("IK", total_reads, n_consumed);
}
@@ -1567,14 +1576,16 @@ hashtable_consume_fasta_and_tag_with_stoptags(khmer_KHashtable_Object * me,
// call the C++ function, and trap signals => Python
- unsigned long long n_consumed;
- unsigned int total_reads;
-
+ unsigned long long n_consumed;
+ unsigned int total_reads;
try {
hashtable->consume_fasta_and_tag_with_stoptags(filename,
total_reads, n_consumed);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
return NULL;
}
@@ -1601,8 +1612,11 @@ hashtable_consume_partitioned_fasta(khmer_KHashtable_Object * me,
try {
hashtable->consume_partitioned_fasta(filename, total_reads, n_consumed);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
return NULL;
}
@@ -1724,7 +1738,7 @@ hashtable_get_stop_tags(khmer_KHashtable_Object * me, PyObject * args)
PyObject * x = PyList_New(hashtable->stop_tags.size());
unsigned long long i = 0;
for (si = hashtable->stop_tags.begin(); si != hashtable->stop_tags.end();
- si++) {
+ ++si) {
std::string s = _revhash(*si, k);
PyList_SET_ITEM(x, i, Py_BuildValue("s", s.c_str()));
i++;
@@ -1748,7 +1762,8 @@ hashtable_get_tagset(khmer_KHashtable_Object * me, PyObject * args)
PyObject * x = PyList_New(hashtable->all_tags.size());
unsigned long long i = 0;
- for (si = hashtable->all_tags.begin(); si != hashtable->all_tags.end(); si++) {
+ for (si = hashtable->all_tags.begin(); si != hashtable->all_tags.end();
+ ++si) {
std::string s = _revhash(*si, k);
PyList_SET_ITEM(x, i, Py_BuildValue("s", s.c_str()));
i++;
@@ -1785,7 +1800,10 @@ hashtable_output_partitions(khmer_KHashtable_Object * me, PyObject * args)
output,
output_unassigned);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
return NULL;
}
@@ -1811,9 +1829,17 @@ hashtable_find_unpart(khmer_KHashtable_Object * me, PyObject * args)
bool stop_big_traversals = PyObject_IsTrue(stop_big_traversals_o);
unsigned int n_singletons = 0;
- SubsetPartition * subset_p = hashtable->partition;
- n_singletons = subset_p->find_unpart(filename, traverse,
- stop_big_traversals);
+ try {
+ SubsetPartition * subset_p = hashtable->partition;
+ n_singletons = subset_p->find_unpart(filename, traverse,
+ stop_big_traversals);
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
+ return NULL;
+ }
return PyLong_FromLong(n_singletons);
}
@@ -1831,7 +1857,15 @@ hashtable_filter_if_present(khmer_KHashtable_Object * me, PyObject * args)
return NULL;
}
- hashtable->filter_if_present(filename, output);
+ try {
+ hashtable->filter_if_present(filename, output);
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
+ return NULL;
+ }
Py_RETURN_NONE;
}
@@ -1851,7 +1885,7 @@ hashtable_save_partitionmap(khmer_KHashtable_Object * me, PyObject * args)
try {
hashtable->partition->save_partitionmap(filename);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -1873,7 +1907,7 @@ hashtable_load_partitionmap(khmer_KHashtable_Object * me, PyObject * args)
try {
hashtable->partition->load_partitionmap(filename);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -1918,10 +1952,12 @@ hashtable_subset_count_partitions(khmer_KHashtable_Object * me, PyObject * args)
{
khmer_KSubsetPartition_Object * subset_obj = NULL;
- if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) {
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type,
+ &subset_obj)) {
return NULL;
}
+
size_t n_partitions = 0, n_unassigned = 0;
subset_obj->subset->count_partitions(n_partitions, n_unassigned);
@@ -1935,12 +1971,12 @@ hashtable_subset_partition_size_distribution(khmer_KHashtable_Object * me,
PyObject * args)
{
khmer_KSubsetPartition_Object * subset_obj = NULL;
- if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) {
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type,
+ &subset_obj)) {
return NULL;
}
- SubsetPartition * subset_p;
- subset_p = subset_obj->subset;
+ SubsetPartition * subset_p = subset_obj->subset;
PartitionCountDistribution d;
@@ -1954,7 +1990,7 @@ hashtable_subset_partition_size_distribution(khmer_KHashtable_Object * me,
PartitionCountDistribution::iterator di;
unsigned int i;
- for (i = 0, di = d.begin(); di != d.end(); di++, i++) {
+ for (i = 0, di = d.begin(); di != d.end(); ++di, i++) {
PyObject * value = Py_BuildValue("KK", di->first, di->second);
if (value == NULL) {
Py_DECREF(x);
@@ -1995,7 +2031,7 @@ hashtable_load_tagset(khmer_KHashtable_Object * me, PyObject * args)
try {
hashtable->load_tagset(filename, clear_tags);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -2017,7 +2053,7 @@ hashtable_save_tagset(khmer_KHashtable_Object * me, PyObject * args)
try {
hashtable->save_tagset(filename);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -2032,19 +2068,19 @@ hashtable_save_subset_partitionmap(khmer_KHashtable_Object * me,
const char * filename = NULL;
khmer_KSubsetPartition_Object * subset_obj = NULL;
- if (!PyArg_ParseTuple(args, "O!s", &khmer_KSubsetPartition_Type, &subset_obj, &filename)) {
+ if (!PyArg_ParseTuple(args, "O!s", &khmer_KSubsetPartition_Type,
+ &subset_obj, &filename)) {
return NULL;
}
- SubsetPartition * subset_p;
- subset_p = subset_obj->subset;
+ SubsetPartition * subset_p = subset_obj->subset;
Py_BEGIN_ALLOW_THREADS
try {
subset_p->save_partitionmap(filename);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -2073,30 +2109,33 @@ hashtable_load_subset_partitionmap(khmer_KHashtable_Object * me,
return PyErr_NoMemory();
}
- bool fail = false;
- std::string err;
+ const char *file_exception = NULL;
Py_BEGIN_ALLOW_THREADS
-
try {
subset_p->load_partitionmap(filename);
- } catch (khmer_file_exception &e) {
- fail = true;
- err = e.what();
+ } catch (khmer_file_exception &exc) {
+ file_exception = exc.what();
}
-
Py_END_ALLOW_THREADS
- if (fail) {
- PyErr_SetString(PyExc_IOError, err.c_str());
+ if (file_exception != NULL) {
+ PyErr_SetString(PyExc_OSError, file_exception);
delete subset_p;
return NULL;
- } else {
- khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\
- PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type);
- subset_obj->subset = subset_p;
- return (PyObject*) subset_obj;
}
+
+ khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\
+ PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type);
+
+ if (subset_obj == NULL) {
+ delete subset_p;
+ return NULL;
+ }
+
+ subset_obj->subset = subset_p;
+
+ return (PyObject *) subset_obj;
}
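Note the ordering here: the load runs with the GIL released (between Py_BEGIN_ALLOW_THREADS and Py_END_ALLOW_THREADS), and PyErr_SetString may only be called once the GIL is held again, so the handler merely records the message and raising is deferred. One caveat: the saved const char * points into the caught exception object, so copying the message into a std::string before leaving the catch block would be the more defensive pattern. From Python a failed load simply raises OSError; a sketch, assuming the method is exposed as load_subset_partitionmap (filename hypothetical):

    import khmer

    ht = khmer.Hashbits(32, 1e6, 4)
    try:
        subset = ht.load_subset_partitionmap("graph.pmap")
    except OSError as err:
        print("could not load partition map:", err)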
static
@@ -2137,11 +2176,14 @@ hashtable__validate_subset_partitionmap(khmer_KHashtable_Object * me,
{
khmer_KSubsetPartition_Object * subset_obj = NULL;
- if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, &subset_obj)) {
+ if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type,
+ &subset_obj)) {
return NULL;
}
- subset_obj->subset->_validate_pmap();
+ SubsetPartition * subset_p = subset_obj->subset;
+
+ subset_p->_validate_pmap();
Py_RETURN_NONE;
}
@@ -2243,7 +2285,7 @@ hashtable_divide_tags_into_subsets(khmer_KHashtable_Object * me,
PyObject * x = PyList_New(divvy.size());
unsigned int i = 0;
for (SeenSet::const_iterator si = divvy.begin(); si != divvy.end();
- si++, i++) {
+ ++si, i++) {
PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(*si));
}
@@ -2681,7 +2723,7 @@ count_find_spectral_error_positions(khmer_KCountingHash_Object * me,
{
khmer::CountingHash * counting = me->counting;
- char * seq = NULL;
+ const char * seq = NULL;
khmer::BoundedCounterType max_count = 0; // unsigned short int
if (!PyArg_ParseTuple(args, "sH", &seq, &max_count)) {
@@ -2724,8 +2766,16 @@ count_fasta_dump_kmers_by_abundance(khmer_KCountingHash_Object * me,
return NULL;
}
- counting->fasta_dump_kmers_by_abundance(inputfile,
- limit_by);
+ try {
+ counting->fasta_dump_kmers_by_abundance(inputfile,
+ limit_by);
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
+ return NULL;
+ }
Py_RETURN_NONE;
}
@@ -2914,8 +2964,17 @@ count_fasta_count_kmers_by_position(khmer_KCountingHash_Object * me,
max_read_len = (unsigned int) max_read_len_long;
unsigned long long * counts;
- counts = counting->fasta_count_kmers_by_position(inputfile, max_read_len,
- (unsigned short) limit_by_count_int);
+ try {
+ counts = counting->fasta_count_kmers_by_position(inputfile,
+ max_read_len,
+ (unsigned short) limit_by_count_int);
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
+ return NULL;
+ }
PyObject * x = PyList_New(max_read_len);
if (x == NULL) {
@@ -2951,15 +3010,31 @@ count_abundance_distribution_with_reads_parser(khmer_KCountingHash_Object * me,
return NULL;
}
- read_parsers:: IParser * rparser = rparser_obj->parser;
- Hashbits * hashbits = tracking_obj->hashbits;
-
- HashIntoType * dist = NULL;
+ read_parsers::IParser *rparser = rparser_obj->parser;
+ Hashbits *hashbits = tracking_obj->hashbits;
+ HashIntoType *dist = NULL;
+ const char *value_exception = NULL;
+ const char *file_exception = NULL;
Py_BEGIN_ALLOW_THREADS
- dist = counting->abundance_distribution(rparser, hashbits);
+ try {
+ dist = counting->abundance_distribution(rparser, hashbits);
+ } catch (khmer_file_exception &exc) {
+ file_exception = exc.what();
+ } catch (khmer_value_exception &exc) {
+ value_exception = exc.what();
+ }
Py_END_ALLOW_THREADS
+ if (file_exception != NULL) {
+ PyErr_SetString(PyExc_OSError, file_exception);
+ return NULL;
+ }
+ if (value_exception != NULL) {
+ PyErr_SetString(PyExc_ValueError, value_exception);
+ return NULL;
+ }
+
PyObject * x = PyList_New(MAX_BIGCOUNT + 1);
if (x == NULL) {
delete[] dist;
@@ -2986,35 +3061,49 @@ count_abundance_distribution(khmer_KCountingHash_Object * me, PyObject * args)
return NULL;
}
- Hashbits * hashbits = tracking_obj->hashbits;
- HashIntoType * dist;
-
- char const * result = "";
- bool exception = false;
+ Hashbits *hashbits = tracking_obj->hashbits;
+ HashIntoType *dist = NULL;
+ const char *value_exception = NULL;
+ const char *file_exception = NULL;
Py_BEGIN_ALLOW_THREADS
try {
dist = counting->abundance_distribution(filename, hashbits);
- } catch (khmer_file_exception &e) {
- exception = true;
- result = e.what();
+ } catch (khmer_file_exception &exc) {
+ file_exception = exc.what();
+ } catch (khmer_value_exception &exc) {
+ value_exception = exc.what();
}
Py_END_ALLOW_THREADS
- if (exception) {
- PyErr_SetString(PyExc_IOError, result);
+ if (file_exception != NULL) {
+ PyErr_SetString(PyExc_OSError, file_exception);
+ if (dist != NULL) {
+ delete []dist;
+ }
+ return NULL;
+ }
+ if (value_exception != NULL) {
+ PyErr_SetString(PyExc_ValueError, value_exception);
+ if (dist != NULL) {
+ delete []dist;
+ }
return NULL;
}
PyObject * x = PyList_New(MAX_BIGCOUNT + 1);
if (x == NULL) {
- delete[] dist;
+ if (dist != NULL) {
+ delete []dist;
+ }
return NULL;
}
for (int i = 0; i < MAX_BIGCOUNT + 1; i++) {
PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(dist[i]));
}
- delete[] dist;
+ if (dist != NULL) {
+ delete []dist;
+ }
return x;
}
@@ -3223,7 +3312,7 @@ static PyObject* _new_counting_hash(PyTypeObject * type, PyObject * args,
Py_DECREF(self);
return PyErr_NoMemory();
}
- self->khashtable.hashtable = (Hashtable *) self->counting;
+ self->khashtable.hashtable = dynamic_cast<Hashtable*>(self->counting);
}
return (PyObject *) self;
@@ -3247,14 +3336,18 @@ hashbits_count_overlap(khmer_KHashbits_Object * me, PyObject * args)
// call the C++ function, and trap signals => Python
- unsigned long long n_consumed;
- unsigned int total_reads;
- HashIntoType curve[2][100];
+ HashIntoType curve[2][100];
try {
- hashbits->consume_fasta_overlap(filename, curve, *ht2, total_reads, n_consumed);
- } catch (InvalidStreamHandle &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ unsigned long long n_consumed;
+ unsigned int total_reads;
+ hashbits->consume_fasta_overlap(filename, curve, *ht2, total_reads,
+ n_consumed);
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
return NULL;
}
@@ -3438,7 +3531,7 @@ subset_partition_size_distribution(khmer_KSubsetPartition_Object * me,
PartitionCountDistribution::iterator di;
unsigned int i;
- for (i = 0, di = d.begin(); di != d.end(); di++, i++) {
+ for (i = 0, di = d.begin(); di != d.end(); ++di, i++) {
PyObject * tup = Py_BuildValue("KK", di->first, di->second);
if (tup != NULL) {
PyList_SET_ITEM(x, i, tup);
@@ -3472,7 +3565,7 @@ subset_partition_sizes(khmer_KSubsetPartition_Object * me, PyObject * args)
unsigned int i = 0;
PartitionCountMap::const_iterator mi;
- for (mi = cm.begin(); mi != cm.end(); mi++) {
+ for (mi = cm.begin(); mi != cm.end(); ++mi) {
if (mi->second >= min_size) {
i++;
}
@@ -3484,7 +3577,7 @@ subset_partition_sizes(khmer_KSubsetPartition_Object * me, PyObject * args)
}
// this should probably be a dict. @CTB
- for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) {
+ for (i = 0, mi = cm.begin(); mi != cm.end(); ++mi) {
if (mi->second >= min_size) {
PyObject * tup = Py_BuildValue("II", mi->first, mi->second);
if (tup != NULL) {
@@ -3525,7 +3618,7 @@ subset_partition_average_coverages(khmer_KSubsetPartition_Object * me,
}
// this should probably be a dict. @CTB
- for (i = 0, mi = cm.begin(); mi != cm.end(); mi++, i++) {
+ for (i = 0, mi = cm.begin(); mi != cm.end(); ++mi, i++) {
PyObject * tup = Py_BuildValue("II", mi->first, mi->second);
if (tup != NULL) {
PyList_SET_ITEM(x, i, tup);
@@ -3673,32 +3766,37 @@ labelhash_consume_fasta_and_tag_with_labels(khmer_KLabelHash_Object * me,
{
LabelHash * hb = me->labelhash;
- std::ofstream outfile;
-
const char * filename;
if (!PyArg_ParseTuple(args, "s", &filename)) {
return NULL;
}
- unsigned long long n_consumed;
- unsigned int total_reads;
- char const * exc = NULL;
+ const char *value_exception = NULL;
+ const char *file_exception = NULL;
+ unsigned long long n_consumed = 0;
+ unsigned int total_reads = 0;
//Py_BEGIN_ALLOW_THREADS
try {
hb->consume_fasta_and_tag_with_labels(filename, total_reads,
n_consumed);
- } catch (khmer_file_exception &e) {
- exc = e.what();
+ } catch (khmer_file_exception &exc) {
+ file_exception = exc.what();
+ } catch (khmer_value_exception &exc) {
+ value_exception = exc.what();
}
//Py_END_ALLOW_THREADS
- if (exc != NULL) {
- PyErr_SetString(PyExc_IOError, exc);
+
+ if (file_exception != NULL) {
+ PyErr_SetString(PyExc_OSError, file_exception);
+ return NULL;
+ }
+ if (value_exception != NULL) {
+ PyErr_SetString(PyExc_ValueError, value_exception);
return NULL;
}
return Py_BuildValue("IK", total_reads, n_consumed);
-
}
static
@@ -3716,16 +3814,20 @@ labelhash_consume_partitioned_fasta_and_tag_with_labels(
// call the C++ function, and trap signals => Python
- unsigned long long n_consumed;
- unsigned int total_reads;
+ unsigned long long n_consumed = 0;
+ unsigned int total_reads = 0;
try {
labelhash->consume_partitioned_fasta_and_tag_with_labels(filename,
total_reads, n_consumed);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
return NULL;
}
+
return Py_BuildValue("IK", total_reads, n_consumed);
}
@@ -3742,6 +3844,7 @@ labelhash_consume_sequence_and_tag_with_labels(khmer_KLabelHash_Object * me,
}
unsigned long long n_consumed = 0;
Label * the_label = hb->check_and_allocate_label(c);
+
hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label);
return Py_BuildValue("K", n_consumed);
}
@@ -3929,7 +4032,7 @@ labelhash_save_labels_and_tags(khmer_KLabelHash_Object * me, PyObject * args)
try {
labelhash->save_labels_and_tags(filename);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -3950,7 +4053,7 @@ labelhash_load_labels_and_tags(khmer_KLabelHash_Object * me, PyObject * args)
try {
labelhash->load_labels_and_tags(filename);
} catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ PyErr_SetString(PyExc_OSError, e.what());
return NULL;
}
@@ -4042,12 +4145,14 @@ hashtable_repartition_largest_partition(khmer_KHashtable_Object * me,
SubsetPartition * subset_p;
unsigned int distance, threshold, frequency;
- if (!PyArg_ParseTuple(args, "OO!III", &subset_o, &khmer_KCountingHash_Type,
- &counting_o, &distance, &threshold, &frequency)) {
+ if (!PyArg_ParseTuple(args, "OO!III",
+ &subset_o,
+ &khmer_KCountingHash_Type, &counting_o,
+ &distance, &threshold, &frequency)) {
return NULL;
}
- if (subset_o != Py_None) {
+ if (PyObject_TypeCheck(subset_o, &khmer_KSubsetPartition_Type)) {
subset_p = ((khmer_KSubsetPartition_Object *) subset_o)->subset;
} else {
subset_p = hashtable->partition;
@@ -4093,8 +4198,110 @@ static PyObject * readaligner_align(khmer_ReadAligner_Object * me,
return ret;
}
+static PyObject * readaligner_align_forward(khmer_ReadAligner_Object * me,
+ PyObject * args)
+{
+ ReadAligner * aligner = me->aligner;
+
+ const char * read;
+
+ if (!PyArg_ParseTuple(args, "s", &read)) {
+ return NULL;
+ }
+
+ /*if (strlen(read) < (unsigned int)aligner->ksize()) {
+ PyErr_SetString(PyExc_ValueError,
+ "string length must >= the hashtable k-mer size");
+ return NULL;
+ }*/
+
+ Alignment * aln;
+ aln = aligner->AlignForward(read);
+
+ const char* alignment = aln->graph_alignment.c_str();
+ const char* readAlignment = aln->read_alignment.c_str();
+ PyObject * x = PyList_New(aln->covs.size());
+ for (size_t i = 0; i < aln->covs.size(); i++ ){
+ PyList_SET_ITEM(x, i, PyLong_FromLong(aln->covs[i]));
+ }
+
+ PyObject * ret = Py_BuildValue("dssOO", aln->score, alignment,
+ readAlignment,
+ (aln->truncated)? Py_True : Py_False,
+ x);
+ delete aln;
+ Py_DECREF(x);
+
+ return ret;
+}
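Per the Py_BuildValue format "dssOO" above, align_forward hands back a five-element tuple: (score, graph_alignment, read_alignment, truncated, coverages), where coverages is a list of per-base counts. A usage sketch with hypothetical table parameters:

    import khmer

    ch = khmer.CountingHash(20, 1e6, 4)
    ch.consume("ATCGGCATTACGATTACGGCACGT")
    aligner = khmer.ReadAligner(ch, 1, 1.0)

    score, graph_aln, read_aln, truncated, covs = \
        aligner.align_forward("ATCGGCATTACGATTACGGCACGT")
    print(score, truncated, covs)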
+
+static PyObject* khmer_ReadAligner_get_scoring_matrix(
+ khmer_ReadAligner_Object * me, PyObject * args)
+{
+
+ if (!PyArg_ParseTuple(args, "")) {
+ return NULL;
+ }
+ ScoringMatrix matrix = me->aligner->getScoringMatrix();
+
+ return Py_BuildValue( "dddd", matrix.trusted_match, matrix.trusted_mismatch,
+ matrix.untrusted_match, matrix.untrusted_mismatch);
+}
+
+static PyObject* khmer_ReadAligner_get_transition_probabilities(
+ khmer_ReadAligner_Object * me, PyObject * args)
+{
+
+ if (!PyArg_ParseTuple(args, "")) {
+ return NULL;
+ }
+ ScoringMatrix matrix = me->aligner->getScoringMatrix();
+
+ return Py_BuildValue( "(dddddd)(dddd)(dddd)(dddddd)(dddd)(dddd)",
+ matrix.tsc[0], matrix.tsc[1], matrix.tsc[2],
+ matrix.tsc[3], matrix.tsc[4], matrix.tsc[5],
+ matrix.tsc[6], matrix.tsc[7], matrix.tsc[8],
+ matrix.tsc[9], matrix.tsc[10], matrix.tsc[11],
+ matrix.tsc[12], matrix.tsc[13], matrix.tsc[14],
+ matrix.tsc[15], matrix.tsc[16], matrix.tsc[17],
+ matrix.tsc[18], matrix.tsc[19], matrix.tsc[20],
+ matrix.tsc[21], matrix.tsc[22], matrix.tsc[23],
+ matrix.tsc[24], matrix.tsc[25], matrix.tsc[26],
+ matrix.tsc[27]);
+}
+
static PyMethodDef khmer_ReadAligner_methods[] = {
{"align", (PyCFunction)readaligner_align, METH_VARARGS, ""},
+ {"align_forward", (PyCFunction)readaligner_align_forward, METH_VARARGS, ""},
+ {
+ "get_scoring_matrix", (PyCFunction)khmer_ReadAligner_get_scoring_matrix,
+ METH_VARARGS,
+ "Get the scoring matrix in use.\n\n\
+Returns a tuple of floats: (trusted_match, trusted_mismatch, untrusted_match, \
+untrusted_mismatch)"
+ },
+ {
+ "get_transition_probabilities",
+ (PyCFunction)khmer_ReadAligner_get_transition_probabilities,
+ METH_VARARGS,
+ "Get the transition probabilties in use.\n\n\
+HMM state notation abbreviations:\n\
+ M_t - trusted match; M_u - untrusted match\n\
+ Ir_t - trusted read insert; Ir_u - untrusted read insert\n\
+ Ig_t - trusted graph insert; Ig_u - untrusted graph insert\n\
+\
+Returns a sparse matrix as a tuple of six tuples.\n\
+The inner tuples contain 6, 4, 4, 6, 4, and 4 floats respectively.\n\
+Transitions are notated as 'StartState-NextState':\n\
+(\n\
+ ( M_t-M_t, M_t-Ir_t, M_t-Ig_t, M_t-M_u, M_t-Ir_u, M_t-Ig_u),\n\
+ (Ir_t-M_t, Ir_t-Ir_t, Ir_t-M_u, Ir_t-Ir_u ),\n\
+ (Ig_t-M_t, , Ig_t-Ig_t, Ig_t-M_u, Ig_t-Ig_u),\n\
+ ( M_u-M_t, M_u-Ir_t, M_u-Ig_t, M_u-M_u, M_u-Ir_u, M_u-Ig_u),\n\
+ (Ir_u-M_t, Ir_u-Ir_t, Ir_u-M_u, Ir_u-Ir_u ),\n\
+ (Ig_u-M_t, , Ig_u-Ig_t, Ig_u-M_u, Ig_u-Ig_u)\n\
+)"
+ },
{NULL} /* Sentinel */
};
@@ -4123,15 +4330,31 @@ static PyObject* khmer_ReadAligner_new(PyTypeObject *type, PyObject * args,
khmer_KCountingHash_Object * ch = NULL;
unsigned short int trusted_cov_cutoff = 2;
double bits_theta = 1;
-
- if(!PyArg_ParseTuple(args, "O!Hd", &khmer_KCountingHash_Type, &ch,
- &trusted_cov_cutoff, &bits_theta)) {
+ double scoring_matrix[] = { 0, 0, 0, 0 };
+ double * transitions = new double[28];
+
+ if(!PyArg_ParseTuple(
+ args,
+ "O!Hd|(dddd)((dddddd)(dddd)(dddd)(dddddd)(dddd)(dddd))",
+ &khmer_KCountingHash_Type, &ch, &trusted_cov_cutoff,
+ &bits_theta, &scoring_matrix[0], &scoring_matrix[1],
+ &scoring_matrix[2], &scoring_matrix[3], &transitions[0],
+ &transitions[1], &transitions[2], &transitions[3],
+ &transitions[4], &transitions[5], &transitions[6],
+ &transitions[7], &transitions[8], &transitions[9],
+ &transitions[10], &transitions[11], &transitions[12],
+ &transitions[13], &transitions[14], &transitions[15],
+ &transitions[16], &transitions[17], &transitions[18],
+ &transitions[19], &transitions[20], &transitions[21],
+ &transitions[22], &transitions[23], &transitions[24],
+ &transitions[25], &transitions[26], &transitions[27])) {
Py_DECREF(self);
return NULL;
}
self->aligner = new ReadAligner(ch->counting, trusted_cov_cutoff,
- bits_theta);
+ bits_theta, scoring_matrix,
+ transitions);
}
return (PyObject *) self;
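The |(dddd)(...) tail added to the format string makes the scoring matrix and the 28 transition probabilities optional constructor arguments, in exactly the shapes that get_scoring_matrix and get_transition_probabilities return, so trained parameters (e.g. from the new sandbox/readaligner_pairhmm_train.py) can be fed back in. A round-trip sketch under those assumptions:

    import khmer

    ch = khmer.CountingHash(20, 1e6, 4)
    aligner = khmer.ReadAligner(ch, 2, 1.0)

    scoring = aligner.get_scoring_matrix()           # 4 floats
    trans = aligner.get_transition_probabilities()   # 6 tuples of 6,4,4,6,4,4 floats

    tuned = khmer.ReadAligner(ch, 2, 1.0, scoring, trans)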
@@ -4157,7 +4380,7 @@ static PyTypeObject khmer_ReadAlignerType = {
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
- Py_TPFLAGS_DEFAULT, /*tp_flags*/
+ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
"ReadAligner object", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
@@ -4197,9 +4420,16 @@ hashtable_consume_fasta_and_traverse(khmer_KHashtable_Object * me,
CountingHash * counting = counting_o->counting;
- hashtable->consume_fasta_and_traverse(filename, radius, big_threshold,
- transfer_threshold, *counting);
-
+ try {
+ hashtable->consume_fasta_and_traverse(filename, radius, big_threshold,
+ transfer_threshold, *counting);
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
+ return NULL;
+ }
Py_RETURN_NONE;
}
@@ -4355,8 +4585,11 @@ static PyObject * hllcounter_consume_fasta(khmer_KHLLCounter_Object * me,
unsigned int total_reads = 0;
try {
me->hllcounter->consume_fasta(filename, total_reads, n_consumed);
- } catch (khmer_file_exception &e) {
- PyErr_SetString(PyExc_IOError, e.what());
+ } catch (khmer_file_exception &exc) {
+ PyErr_SetString(PyExc_OSError, exc.what());
+ return NULL;
+ } catch (khmer_value_exception &exc) {
+ PyErr_SetString(PyExc_ValueError, exc.what());
return NULL;
}
@@ -4609,9 +4842,9 @@ static PyObject * forward_hash(PyObject * self, PyObject * args)
return NULL;
}
- PyObject * hash;
try {
- hash = PyLong_FromUnsignedLongLong(_hash(kmer, ksize));
+ PyObject * hash;
+ hash = PyLong_FromUnsignedLongLong(_hash(kmer, ksize));
return hash;
} catch (khmer_exception &e) {
PyErr_SetString(PyExc_RuntimeError, e.what());
diff --git a/khmer/_version.py b/khmer/_version.py
index 5f61635..5da9804 100644
--- a/khmer/_version.py
+++ b/khmer/_version.py
@@ -16,8 +16,8 @@ import subprocess
import sys
# these strings will be replaced by git during git-archive
-git_refnames = " (tag: v2.0-rc1)"
-git_full = "bbd38a6d3d0960f71c65dd46ecda3b61584a8b4c"
+git_refnames = " (HEAD -> master, tag: v2.0-rc2)"
+git_full = "8c2f8d33969ad402dac2c9bacbfc02197bd1ce02"
# these strings are filled in when 'setup.py versioneer' creates _version.py
tag_prefix = "v"
diff --git a/khmer/kfile.py b/khmer/kfile.py
index 9a01f59..a901833 100644
--- a/khmer/kfile.py
+++ b/khmer/kfile.py
@@ -12,7 +12,7 @@ from __future__ import print_function, unicode_literals
import os
import sys
import errno
-from stat import S_ISBLK, S_ISFIFO
+from stat import S_ISBLK, S_ISFIFO, S_ISCHR
from khmer import khmer_args
@@ -27,6 +27,7 @@ def check_input_files(file_path, force):
if file_path == '-':
return
+
try:
mode = os.stat(file_path).st_mode
except OSError:
@@ -34,25 +35,31 @@ def check_input_files(file_path, force):
file_path, file=sys.stderr)
if not force:
+ print("NOTE: This can be overridden using the --force argument",
+ file=sys.stderr)
print("Exiting", file=sys.stderr)
sys.exit(1)
else:
return
- # block devices will be nonzero
- if S_ISBLK(mode) or S_ISFIFO(mode):
+ # block devices/stdin will be nonzero
+ if S_ISBLK(mode) or S_ISFIFO(mode) or S_ISCHR(mode):
return
if not os.path.exists(file_path):
print("ERROR: Input file %s does not exist; exiting" %
file_path, file=sys.stderr)
if not force:
+ print("NOTE: This can be overridden using the --force argument",
+ file=sys.stderr)
sys.exit(1)
else:
if os.stat(file_path).st_size == 0:
print("ERROR: Input file %s is empty; exiting." %
file_path, file=sys.stderr)
if not force:
+ print("NOTE: This can be overridden using the --force"
+ " argument", file=sys.stderr)
sys.exit(1)
@@ -109,17 +116,18 @@ def check_space(in_files, force, _testhook_free_space=None):
print(" Free space: %.1f GB"
% (float(free_space) / 1e9,), file=sys.stderr)
if not force:
+ print("NOTE: This can be overridden using the --force argument",
+ file=sys.stderr)
sys.exit(1)
-def check_space_for_hashtable(args, hashtype, force,
+def check_space_for_hashtable(outfile_name, hash_size, force,
_testhook_free_space=None):
- """Check we have enough size to write a hash table."""
- hash_size = khmer_args._calculate_tablesize(args, hashtype)
+ """Check that we have enough size to write the specified hash table."""
- cwd = os.getcwd()
- dir_path = os.path.dirname(os.path.realpath(cwd))
+ dir_path = os.path.dirname(os.path.realpath(outfile_name))
target = os.statvfs(dir_path)
+
if _testhook_free_space is None:
free_space = target.f_frsize * target.f_bavail
else:
@@ -129,13 +137,15 @@ def check_space_for_hashtable(args, hashtype, force,
if size_diff > 0:
print("ERROR: Not enough free space on disk "
"for saved table files;"
- " Need at least %s GB more."
+ " Need at least %.1f GB more."
% (float(size_diff) / 1e9,), file=sys.stderr)
print(" Table size: %.1f GB"
% (float(hash_size) / 1e9,), file=sys.stderr)
print(" Free space: %.1f GB"
% (float(free_space) / 1e9,), file=sys.stderr)
if not force:
+ print("NOTE: This can be overridden using the --force argument",
+ file=sys.stderr)
sys.exit(1)
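check_space_for_hashtable now receives the output path and the already-computed table size instead of the argparse namespace, so the free-space check is made against the directory that will actually hold the file. A call sketch with hypothetical sizes (four tables of 2 GB each):

    from khmer.kfile import check_space_for_hashtable

    check_space_for_hashtable("out.ct", 4 * 2e9, force=False)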
@@ -148,8 +158,11 @@ def check_valid_file_exists(in_files):
or non-existent.
"""
for in_file in in_files:
- if os.path.exists(in_file):
- if os.stat(in_file).st_size > 0:
+ if in_file == '-':
+ pass
+ elif os.path.exists(in_file):
+ mode = os.stat(in_file).st_mode
+ if os.stat(in_file).st_size > 0 or S_ISBLK(mode) or S_ISFIFO(mode):
return
else:
print('WARNING: Input file %s is empty' %
diff --git a/khmer/khmer_args.py b/khmer/khmer_args.py
index a9e9358..946a645 100644
--- a/khmer/khmer_args.py
+++ b/khmer/khmer_args.py
@@ -60,6 +60,9 @@ def build_hash_args(descr=None, epilog=None, parser=None):
parser.add_argument('--n_tables', '-N', type=int,
default=DEFAULT_N_TABLES,
help='number of k-mer counting tables to use')
+ parser.add_argument('-U', '--unique-kmers', type=int, default=0,
+ help='approximate number of unique kmers in the input'
+ ' set')
group = parser.add_mutually_exclusive_group()
group.add_argument('--max-tablesize', '-x', type=float,
@@ -83,7 +86,6 @@ def build_counting_args(descr=None, epilog=None):
def build_hashbits_args(descr=None, epilog=None, parser=None):
"""Build an ArgumentParser with args for hashbits based scripts."""
-
parser = build_hash_args(descr=descr, epilog=epilog, parser=parser)
parser.hashtype = 'nodegraph'
@@ -130,9 +132,9 @@ def add_loadhash_args(parser):
action=LoadAction)
-def _calculate_tablesize(args, hashtype, multiplier=1.0):
+def calculate_tablesize(args, hashtype, multiplier=1.0):
if hashtype not in ('countgraph', 'nodegraph'):
- raise Exception("unknown graph type: %s" % (hashtype,))
+ raise ValueError("unknown graph type: %s" % (hashtype,))
if args.max_memory_usage:
if hashtype == 'countgraph':
@@ -154,7 +156,7 @@ def create_nodegraph(args, ksize=None, multiplier=1.0):
print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n")
sys.exit(1)
- tablesize = _calculate_tablesize(args, 'nodegraph', multiplier=multiplier)
+ tablesize = calculate_tablesize(args, 'nodegraph', multiplier)
return khmer.Hashbits(ksize, tablesize, args.n_tables)
@@ -165,7 +167,7 @@ def create_countgraph(args, ksize=None, multiplier=1.0):
print_error("\n** ERROR: khmer only supports k-mer sizes <= 32.\n")
sys.exit(1)
- tablesize = _calculate_tablesize(args, 'countgraph', multiplier=multiplier)
+ tablesize = calculate_tablesize(args, 'countgraph', multiplier=multiplier)
return khmer.CountingHash(ksize, tablesize, args.n_tables)
@@ -177,12 +179,12 @@ def report_on_config(args, hashtype='countgraph'):
"""
from khmer.utils import print_error
if hashtype not in ('countgraph', 'nodegraph'):
- raise Exception("unknown graph type: %s" % (hashtype,))
+ raise ValueError("unknown graph type: %s" % (hashtype,))
if args.quiet:
return
- tablesize = _calculate_tablesize(args, hashtype)
+ tablesize = calculate_tablesize(args, hashtype)
print_error("\nPARAMETERS:")
print_error(" - kmer size = {0} \t\t(-k)".format(args.ksize))
@@ -224,7 +226,8 @@ _algorithms = {
'software': 'MR Crusoe et al., '
'2014. http://dx.doi.org/10.6084/m9.figshare.979190',
'diginorm': 'CT Brown et al., arXiv:1203.4802 [q-bio.GN]',
- 'streaming': 'Q Zhang, S Awad, CT Brown, unpublished',
+ 'streaming': 'Q Zhang, S Awad, CT Brown, '
+ 'https://dx.doi.org/10.7287/peerj.preprints.890v1',
'graph': 'J Pell et al., http://dx.doi.org/10.1073/pnas.1121464109',
'counting': 'Q Zhang et al., '
'http://dx.doi.org/10.1371/journal.pone.0101271',
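With the underscore dropped, calculate_tablesize becomes part of the public khmer_args API, and an unknown graph type now raises ValueError rather than a bare Exception, so callers can handle it precisely. Sketch:

    from khmer import khmer_args

    parser = khmer_args.build_counting_args()
    args = parser.parse_args([])

    khmer_args.calculate_tablesize(args, 'countgraph')    # fine
    try:
        khmer_args.calculate_tablesize(args, 'spamgraph')
    except ValueError as err:
        print(err)  # unknown graph type: spamgraph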
diff --git a/khmer/thread_utils.py b/khmer/thread_utils.py
index 41c3914..df997b3 100644
--- a/khmer/thread_utils.py
+++ b/khmer/thread_utils.py
@@ -26,7 +26,7 @@ DEFAULT_GROUPSIZE = 100
def verbose_loader(filename):
"""Screed iterator that additionally prints progress info to stderr."""
- screed_iter = screed.open(filename, parse_description=False)
+ screed_iter = screed.open(filename)
for n, record in enumerate(screed_iter):
if n % 100000 == 0:
print('... filtering', n, file=sys.stderr)
diff --git a/khmer/utils.py b/khmer/utils.py
index 0e1d5e1..3abb11e 100644
--- a/khmer/utils.py
+++ b/khmer/utils.py
@@ -34,6 +34,9 @@ def check_is_pair(record1, record2):
Handles both Casava formats: seq/1 and seq/2, and 'seq::... 1::...'
and 'seq::... 2::...'.
+
+ Also handles the default format of the SRA toolkit's fastq-dump:
+ 'Accession seq/1'
"""
if hasattr(record1, 'quality') or hasattr(record2, 'quality'):
if not (hasattr(record1, 'quality') and hasattr(record2, 'quality')):
@@ -47,8 +50,7 @@ def check_is_pair(record1, record2):
subpart1 = lhs1.split('/', 1)[0]
subpart2 = lhs2.split('/', 1)[0]
- assert subpart1
- if subpart1 == subpart2:
+ if subpart1 and subpart1 == subpart2:
return True
# handle '@name 1:rst'
@@ -57,7 +59,11 @@ def check_is_pair(record1, record2):
# handle @name seq/1
elif lhs1 == lhs2 and rhs1.endswith('/1') and rhs2.endswith('/2'):
- return True
+ subpart1 = rhs1.split('/', 1)[0]
+ subpart2 = rhs2.split('/', 1)[0]
+
+ if subpart1 and subpart1 == subpart2:
+ return True
return False
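The tightened 'name seq/1' branch now also compares the part before the slash, so reads that merely end in /1 and /2 no longer pair unless the fragment names match. A sketch of the rules with stand-in records (check_is_pair only needs a .name attribute for FASTA-style input):

    from types import SimpleNamespace as Record
    from khmer.utils import check_is_pair

    # SRA fastq-dump style: 'Accession seq/1'
    r1 = Record(name="SRR1234567.1 HWI-ST745/1")
    r2 = Record(name="SRR1234567.1 HWI-ST745/2")
    assert check_is_pair(r1, r2)

    # plain Casava 1.4 style: 'seq/1' and 'seq/2'
    assert check_is_pair(Record(name="read0/1"), Record(name="read0/2"))

    # mismatched fragment names no longer pair
    assert not check_is_pair(Record(name="SRR1234567.1 ABC/1"),
                             Record(name="SRR1234567.1 XYZ/2"))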
diff --git a/lib/Makefile b/lib/Makefile
index f3518ea..2bd68d4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -1,3 +1,8 @@
+# Should we use the standard system zlib and libbz2?
+
+USE_SYSTEM_ZLIB ?= false
+USE_SYSTEM_LIBBZ2 ?= false
+
# Profile?
# Set this variable to true if you wish to profile the codes.
WANT_PROFILING=false
@@ -23,53 +28,61 @@ WANT_EXTRA_SANITY_CHECKING=false
# when optimization is turned on).
WANT_DEBUGGING=false
-# Compile with performance metrics turned on?
-# Set this variable to true if you want to use instrumentation provided
-# in the sources for performance measurement purposes
-# and are willing to accept the overhead such instrumentation introduces.
-WITH_INTERNAL_METRICS=false
-
-
PREFIX=/usr/local
### NOTE: No user-serviceable parts below this line! ###
INCLUDES= -I ../third-party/seqan/core/include/ \
- -I ../third-party/zlib/ \
- -I ../third-party/bzip2/ \
-I ../third-party/smhasher/
-CXXFLAGS=$(INCLUDES)
-CXX_WARNING_FLAGS=-Wall
-CXX_OPTIMIZATION_FLAGS=-O3
-CXX_SHARED_LIB_FLAGS=-fPIC
-CXXFLAGS+= \
- $(CXX_WARNING_FLAGS) \
- $(CXX_OPTIMIZATION_FLAGS) \
- $(CXX_SHARED_LIB_FLAGS)
+ifeq ($(USE_SYSTEM_ZLIB), false)
+INCLUDES += -I ../third-party/zlib/
+endif
+
+ifeq ($(USE_SYSTEM_LIBBZ2), false)
+INCLUDES += -I ../third-party/bzip2/
+endif
+
+# Warnings in common to C and C++
+WARNINGS=-Wall
+
+# Flags in common to C and C++
+COMMON_FLAGS=-O3 -fPIC
+SEQAN_FLAGS=-DSEQAN_HAS_ZLIB=1 -DSEQAN_HAS_BZIP2=1
-CFLAGS=$(INCLUDES)
-C_WARNING_FLAGS=-Wall
-C_OPTIMIZATION_FLAGS=-O3
-C_SHARED_LIB_FLAGS=-fPIC
-CFLAGS+= $(C_WARNING_FLAGS) $(C_OPTIMIZATION_FLAGS) $(C_SHARED_LIB_FLAGS)
+# Base C/CXXFLAGS
+CPPFLAGS ?=
+CPPFLAGS += $(SEQAN_FLAGS)
-LIBS=
+CXXFLAGS ?=
+CXXFLAGS += $(COMMON_FLAGS) $(WARNINGS)
+CXXFLAGS += -Wstrict-null-sentinel
+CXXFLAGS += $(INCLUDES) $(CPPFLAGS)
+
+CFLAGS ?=
+CFLAGS += $(COMMON_FLAGS) $(WARNINGS)
+CFLAGS += -Wshadow -Wcast-align -Wstrict-prototypes
+CFLAGS += $(INCLUDES) $(CPPFLAGS)
+
+LDFLAGS ?=
+ifneq ($(USE_SYSTEM_ZLIB), false)
+LDFLAGS += -lz
+endif
+
+ifneq ($(USE_SYSTEM_LIBBZ2), false)
+LDFLAGS += -lbz2
+endif
ifeq ($(WANT_DEBUGGING), true)
-CXX_DEBUG_FLAGS=-g
-CXXFLAGS+= $(CXX_DEBUG_FLAGS)
-CFLAGS+= $(CXX_DEBUG_FLAGS)
-else
-CXX_DEBUG_FLAGS=
+DEBUG_FLAGS=-g
+CXXFLAGS += $(DEBUG_FLAGS)
+CFLAGS += $(DEBUG_FLAGS)
endif
ifeq ($(WANT_EXTRA_SANITY_CHECKING), true)
DEFINE_KHMER_EXTRA_SANITY_CHECKS=-DKHMER_EXTRA_SANITY_CHECKS
-CXXFLAGS+= $(DEFINE_KHMER_EXTRA_SANITY_CHECKS)
-CFLAGS+= $(DEFINE_KHMER_EXTRA_SANITY_CHECKS)
-else
-DEFINE_KHMER_EXTRA_SANITY_CHECKS=
+CXXFLAGS += $(DEFINE_KHMER_EXTRA_SANITY_CHECKS)
+CFLAGS += $(DEFINE_KHMER_EXTRA_SANITY_CHECKS)
endif
ifeq ($(WANT_PROFILING), true)
@@ -77,19 +90,15 @@ ifeq ($(PROFILER_OF_CHOICE), TAU)
CXX=tau_cxx.sh
endif
ifeq ($(PROFILER_OF_CHOICE), gprof)
-PROFILING_LIBS=-pg
-CXXFLAGS+= -pg
-LIBS+= $(PROFILING_LIBS)
+CXXFLAGS += -pg
+CFLAGS += -pg
+LDFLAGS += -pg
endif
endif
-ifeq ($(WITH_INTERNAL_METRICS), true)
-CXXFLAGS+= -DWITH_INTERNAL_METRICS
-endif
-
# Place POSIX threads last in linking order, if needed.
ifneq ($(shell uname), Linux)
-LIBS+= -pthread
+LDFLAGS += -pthread
endif
@@ -99,47 +108,52 @@ HAVE_OPENMP=$(shell \
rm -f chkomp)
ifeq ($(HAVE_OPENMP), true)
- CFLAGS += -fopenmp
- CXXFLAGS += -fopenmp
+CXXFLAGS +=-fopenmp
+CFLAGS +=-fopenmp
endif
-VERSION = $(shell python get_version.py)
+ifneq ($(PACKAGE_VERSION),)
+VERSION = $(PACKAGE_VERSION)
+else
+VERSION = $(shell ./get_version.py)
+endif
+
+MAJOR_VERSION = $(shell echo $(VERSION) | sed -e 's/^\([^-\.]*\)\.\([^-\.]*\).*/\1/')
+MINOR_VERSION = $(shell echo $(VERSION) | sed -e 's/^\([^-\.]*\)\.\([^-\.]*\).*/\2/')
-LIBVERSION = $(shell python get_version.py | sed -e 's/^\([^-]*\)-.*/\1/')
-LIBKHMERSO=libkhmer.so.$(LIBVERSION)
+LIB_VERSION = $(MAJOR_VERSION).$(MINOR_VERSION)
-CXXFLAGS+= -DVERSION=$(VERSION)
+ifeq ($(shell uname), Darwin)
+SHARED_EXT = dylib
+SONAME = liboxli.$(SHARED_EXT).$(MAJOR_VERSION)
+SONAME_FLAGS = -install_name $(PREFIX)/lib/$(SONAME) -compatibility_version $(MAJOR_VERSION) -current_version $(LIB_VERSION)
+else
+SHARED_EXT = so
+SONAME = liboxli.$(SHARED_EXT).$(MAJOR_VERSION)
+SONAME_FLAGS = -Wl,-soname=$(SONAME)
+endif
+
+# The ABI version of liboxli
+LIBVERSION = 1
+LIBKHMERSO=liboxli.$(SHARED_EXT).$(LIB_VERSION)
+
+CXXFLAGS += -DVERSION=$(VERSION)
NO_UNIQUE_RC=0
-CXXFLAGS+= -DNO_UNIQUE_RC=$(NO_UNIQUE_RC)
+CXXFLAGS += -DNO_UNIQUE_RC=$(NO_UNIQUE_RC)
+CFLAGS += -DNO_UNIQUE_RC=$(NO_UNIQUE_RC)
export CXX
export CFLAGS
export CXXFLAGS
-export LIBS
+export LDFLAGS
export VERSION
#### Third party dependencies ####
-# ZLIB
+# ZLIB, use .lo not .o, so we get -fPIC and other library-related flags
ZLIB_DIR=../third-party/zlib
ZLIB_OBJS_BASE=\
- adler32.o \
- crc32.o \
- deflate.o \
- infback.o \
- inffast.o \
- inflate.o \
- inftrees.o \
- trees.o \
- zutil.o \
- compress.o \
- uncompr.o \
- gzclose.o \
- gzlib.o \
- gzread.o \
- gzwrite.o
-ZLIB_PIC_OBJS_BASE=\
adler32.lo \
crc32.lo \
deflate.lo \
@@ -157,7 +171,6 @@ ZLIB_PIC_OBJS_BASE=\
gzwrite.lo
ZLIB_OBJS=$(addprefix $(ZLIB_DIR)/, $(ZLIB_OBJS_BASE))
-ZLIB_PIC_OBJS=$(addprefix $(ZLIB_DIR)/, $(ZLIB_PIC_OBJS_BASE))
# BZ2
BZIP2_DIR=../third-party/bzip2
@@ -173,7 +186,7 @@ BZIP2_OBJS_BASE= \
BZIP2_OBJS=$(addprefix $(BZIP2_DIR)/, $(BZIP2_OBJS_BASE))
-#### khmer proper below here ####
+#### oxli proper below here ####
LIBKHMER_OBJS= \
counting.o \
@@ -182,14 +195,25 @@ LIBKHMER_OBJS= \
hllcounter.o \
kmer_hash.o \
labelhash.o \
- perf_metrics.o \
read_aligner.o \
read_parsers.o \
subset.o \
- trace_logger.o \
- murmur3.o \
- $(BZIP2_OBJS) \
- $(ZLIB_PIC_OBJS)
+ murmur3.o
+
+PRECOMPILE_OBJS ?=
+PRECLEAN_TARGS ?=
+
+ifeq ($(USE_SYSTEM_ZLIB), false)
+LIBKHMER_OBJS += $(ZLIB_OBJS)
+PRECOMPILE_OBJS += $(ZLIB_OBJS)
+PRECLEAN_TARGS += zlibclean
+endif
+
+ifeq ($(USE_SYSTEM_LIBBZ2), false)
+LIBKHMER_OBJS += $(BZIP2_OBJS)
+PRECOMPILE_OBJS += $(BZIP2_OBJS)
+PRECLEAN_TARGS += libbz2clean
+endif
KHMER_HEADERS= \
counting.hh \
@@ -199,45 +223,40 @@ KHMER_HEADERS= \
khmer.hh \
kmer_hash.hh \
labelhash.hh \
- perf_metrics.hh \
primes.hh \
read_aligner.hh \
read_parsers.hh \
subset.hh \
- trace_logger.hh
-
-TEST_PROGS = test-Colors test-read-aligner test-compile
# START OF RULES #
# The all rule comes first!
-all: $(LIBKHMERSO) libkhmer.a khmer.pc
+all: $(LIBKHMERSO) liboxli.a oxli.pc
-clean:
- rm -f *.o *.a *.so* khmer.pc $(TEST_PROGS)
+zlibclean:
(cd $(ZLIB_DIR) && make distclean)
+libbz2clean:
(cd $(BZIP2_DIR) && make -f Makefile-libbz2_so clean)
-test: $(TEST_PROGS)
+clean: $(PRECLEAN_TARGS)
+ rm -f *.o *.a *.$(SHARED_EXT)* oxli.pc $(TEST_PROGS)
-install: $(LIBKHMERSO) libkhmer.a khmer.pc $(KHMER_HEADERS)
- mkdir -p $(PREFIX)/lib $(PREFIX)/lib/pkgconfig $(PREFIX)/include/
- cp -r $(KHMER_HEADERS) \
- ../third-party/smhasher/MurmurHash3.h \
- $(PREFIX)/include/
- cp khmer.pc $(PREFIX)/lib/pkgconfig/
- cp $(LIBKHMERSO) libkhmer.a $(PREFIX)/lib
- ln -sf $(PREFIX)/lib/$(LIBKHMERSO) $(PREFIX)/lib/libkhmer.so
+install: $(LIBKHMERSO) liboxli.a oxli.pc $(KHMER_HEADERS)
+ mkdir -p $(PREFIX)/lib $(PREFIX)/lib/pkgconfig $(PREFIX)/include/oxli
+ cp -r $(KHMER_HEADERS) \
+ ../third-party/smhasher/MurmurHash3.h \
+ $(PREFIX)/include/oxli/
+ cp oxli.pc $(PREFIX)/lib/pkgconfig/
+ cp $(LIBKHMERSO) liboxli.a $(PREFIX)/lib
+ ln -sf $(PREFIX)/lib/$(LIBKHMERSO) $(PREFIX)/lib/$(SONAME)
+ ln -sf $(PREFIX)/lib/$(SONAME) $(PREFIX)/lib/liboxli.$(SHARED_EXT)
-khmer.pc: khmer.pc.in
+oxli.pc: oxli.pc.in
sed -e 's,@prefix@,$(PREFIX),' -e 's,@VERSION@,$(VERSION),' $< >$@
$(ZLIB_OBJS):
(cd $(ZLIB_DIR) && ./configure && make $(ZLIB_OBJS_BASE))
-$(ZLIB_PIC_OBJS):
- (cd $(ZLIB_DIR) && ./configure && make $(ZLIB_PIC_OBJS_BASE))
-
$(BZIP2_OBJS):
(cd $(BZIP2_DIR) && make -f Makefile-libbz2_so $(BZIP2_OBJS_BASE))
@@ -245,17 +264,14 @@ $(BZIP2_OBJS):
murmur3.o: ../third-party/smhasher/MurmurHash3.cc
$(CXX) $(CXXFLAGS) -c -o $@ $<
-%.o: %.cc $(ZLIB_OBJS) $(ZLIB_PIC_OBJS) $(BZIP2_OBJS) $(KHMER_HEADERS)
- $(CXX) $(CXXFLAGS) -c -o $@ $<
+%.o: %.cc $(PRECOMPILE_OBJS) $(KHMER_HEADERS)
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) -c -o $@ $<
$(LIBKHMERSO): $(LIBKHMER_OBJS)
- $(CXX) $(CXXFLAGS) -shared -o $@ $(LIBKHMER_OBJS)
- ln -sf $(LIBKHMERSO) libkhmer.so
+ $(CXX) $(CXXFLAGS) $(LDFLAGS) $(SONAME_FLAGS) -shared -o $@ $^
+ ln -sf $(LIBKHMERSO) $(SONAME)
+ ln -sf $(SONAME) liboxli.$(SHARED_EXT)
-libkhmer.a: $(LIBKHMER_OBJS)
- ar rcs $@ $(LIBKHMER_OBJS)
+liboxli.a: $(LIBKHMER_OBJS)
+ ar rcs $@ $^
ranlib $@
-
-# catch-all rule for test drivers
-test-%: test-%.cc libkhmer.a
- $(CXX) $(CXXFLAGS) -I . -o $@ $< libkhmer.a
diff --git a/lib/counting.cc b/lib/counting.cc
index 27b23ad..2b9b921 100644
--- a/lib/counting.cc
+++ b/lib/counting.cc
@@ -5,16 +5,17 @@
// Contact: khmer-project at idyll.org
//
-#include "hashtable.hh"
+#include <errno.h>
+#include <algorithm>
+#include <iostream>
+#include <sstream> // IWYU pragma: keep
+
#include "counting.hh"
#include "hashbits.hh"
+#include "hashtable.hh"
+#include "khmer_exception.hh"
#include "read_parsers.hh"
-
#include "zlib.h"
-#include <math.h>
-#include <algorithm>
-#include <sstream>
-#include <errno.h>
using namespace std;
using namespace khmer;
@@ -35,7 +36,11 @@ void CountingHash::output_fasta_kmer_pos_freq(
Read read;
while(!parser->is_complete()) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
seq = read.sequence;
long numPos = seq.length() - _ksize + 1;
@@ -119,31 +124,31 @@ CountingHash::abundance_distribution(
throw khmer_exception();
}
- try {
- while(!parser->is_complete()) {
+ while(!parser->is_complete()) {
+ try {
read = parser->get_next_read();
- seq = read.sequence;
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
+ seq = read.sequence;
- if (check_and_normalize_read(seq)) {
- KMerIterator kmers(seq.c_str(), _ksize);
+ if (check_and_normalize_read(seq)) {
+ KMerIterator kmers(seq.c_str(), _ksize);
- while(!kmers.done()) {
- HashIntoType kmer = kmers.next();
+ while(!kmers.done()) {
+ HashIntoType kmer = kmers.next();
- if (!tracking->get_count(kmer)) {
- tracking->count(kmer);
+ if (!tracking->get_count(kmer)) {
+ tracking->count(kmer);
- BoundedCounterType n = get_count(kmer);
- dist[n]++;
- }
+ BoundedCounterType n = get_count(kmer);
+ dist[n]++;
}
-
- name.clear();
- seq.clear();
}
+ name.clear();
+ seq.clear();
}
- } catch (NoMoreReadsAvailable) {
}
return dist;
}
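abundance_distribution now treats NoMoreReadsAvailable as the normal end of input for a single read rather than unwinding the whole loop through one big try block. The Python-facing call is unchanged; a typical use pairs the counting table with a presence table that marks k-mers already sampled (filenames hypothetical):

    import khmer

    counts = khmer.CountingHash(20, 1e6, 4)
    counts.consume_fasta("reads.fa")

    tracking = khmer.Hashbits(20, 1e6, 4)
    dist = counts.abundance_distribution("reads.fa", tracking)
    # dist[i] == number of distinct k-mers with abundance i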
@@ -180,7 +185,11 @@ HashIntoType * CountingHash::fasta_count_kmers_by_position(
unsigned long long read_num = 0;
while(!parser->is_complete()) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
seq = read.sequence;
bool valid_read = check_and_normalize_read(seq);
@@ -232,7 +241,11 @@ void CountingHash::fasta_dump_kmers_by_abundance(
unsigned long long read_num = 0;
while(!parser->is_complete()) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
seq = read.sequence;
bool valid_read = check_and_normalize_read(seq);
@@ -519,9 +532,11 @@ CountingHashFileReader::CountingHashFileReader(
infile.read((char *) &ht_type, 1);
if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
std::ostringstream err;
- err << "Does not start with signature for a khmer " <<
- "file: " << signature << " Should be: " <<
- SAVED_SIGNATURE;
+ err << "Does not start with signature for a khmer file: 0x";
+ for(size_t i=0; i < 4; ++i) {
+ err << std::hex << (int) signature[i];
+ }
+ err << " Should be: " << SAVED_SIGNATURE;
throw khmer_file_exception(err.str());
} else if (!(version == SAVED_FORMAT_VERSION)) {
std::ostringstream err;
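The rewritten message dumps the four signature bytes as hex rather than streaming a char[4] that carries no NUL terminator and could be read past its end. From Python, feeding a non-table file to a loader now reads roughly as below (load_counting_hash per khmer's 1.x/2.0 API, an assumption; the file name is hypothetical, and 0x3e is the '>' that opens a FASTA file):

    import khmer

    try:
        khmer.load_counting_hash("reads.fa")   # FASTA, not a saved table
    except OSError as err:
        print(err)  # Does not start with signature for a khmer file: 0x3e...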
@@ -898,7 +913,7 @@ CountingHashGzFileWriter::CountingHashGzFileWriter(
msg << strerror(errno);
}
gzclose(outfile);
- throw khmer_file_exception(msg.str().c_str());
+ throw khmer_file_exception(msg.str());
}
written += gz_result;
}
@@ -944,7 +959,11 @@ void CountingHash::collect_high_abundance_kmers(
bool done = false;
while(!parser->is_complete() && !done) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
currSeq = read.sequence;
// do we want to process it?
@@ -983,7 +1002,11 @@ void CountingHash::collect_high_abundance_kmers(
total_reads = 0;
while(!parser->is_complete() && total_reads != stop_at_read) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
currSeq = read.sequence;
// do we want to process it?
diff --git a/lib/counting.hh b/lib/counting.hh
index 4849870..ac8b70e 100644
--- a/lib/counting.hh
+++ b/lib/counting.hh
@@ -8,19 +8,38 @@
#ifndef COUNTING_HH
#define COUNTING_HH
-#include "hashtable.hh"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <map>
+#include <string>
+#include <utility>
#include <vector>
+#include "hashtable.hh"
+#include "khmer.hh"
+#include "kmer_hash.hh"
+
+namespace khmer
+{
+class Hashbits;
+
+namespace read_parsers
+{
+struct IParser;
+} // namespace read_parsers
+} // namespace khmer
+
namespace khmer
{
typedef std::map<HashIntoType, BoundedCounterType> KmerCountMap;
-class CountingHashIntersect;
class CountingHashFile;
class CountingHashFileReader;
class CountingHashFileWriter;
class CountingHashGzFileReader;
class CountingHashGzFileWriter;
+class CountingHashIntersect;
class CountingHash : public khmer::Hashtable
{
diff --git a/lib/get_version.py b/lib/get_version.py
old mode 100644
new mode 100755
index 5d7fa66..929a1b8
--- a/lib/get_version.py
+++ b/lib/get_version.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
from __future__ import print_function
import sys
sys.path.insert(0, '../')
diff --git a/lib/hashbits.cc b/lib/hashbits.cc
index 8119305..f103e02 100644
--- a/lib/hashbits.cc
+++ b/lib/hashbits.cc
@@ -5,14 +5,14 @@
// Contact: khmer-project at idyll.org
//
-#include <iostream>
-#include "hashtable.hh"
+#include <errno.h>
+#include <sstream> // IWYU pragma: keep
+
#include "hashbits.hh"
+#include "hashtable.hh"
+#include "khmer_exception.hh"
#include "read_parsers.hh"
-#include <sstream>
-#include <errno.h>
-
using namespace std;
using namespace khmer;
using namespace khmer:: read_parsers;
@@ -99,9 +99,11 @@ void Hashbits::load(std::string infilename)
infile.read((char *) &ht_type, 1);
if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
std::ostringstream err;
- err << "Does not start with signature for a khmer " <<
- "file: " << signature << " Should be: " <<
- SAVED_SIGNATURE;
+ err << "Does not start with signature for a khmer file: 0x";
+ for(size_t i=0; i < 4; ++i) {
+ err << std::hex << (int) signature[i];
+ }
+ err << " Should be: " << SAVED_SIGNATURE;
throw khmer_file_exception(err.str());
} else if (!(version == SAVED_FORMAT_VERSION)) {
std::ostringstream err;
@@ -187,7 +189,11 @@ void Hashbits::consume_fasta_overlap(const std::string &filename,
IParser* parser = IParser::get_parser(filename.c_str());
while(!parser->is_complete()) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
total_reads++;
}
//block size for curve
@@ -217,7 +223,11 @@ void Hashbits::consume_fasta_overlap(const std::string &filename,
//
while(!parser->is_complete()) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
currSeq = read.sequence;
unsigned int this_n_consumed;
diff --git a/lib/hashbits.hh b/lib/hashbits.hh
index 3139650..2eb2e7e 100644
--- a/lib/hashbits.hh
+++ b/lib/hashbits.hh
@@ -8,8 +8,14 @@
#ifndef HASHBITS_HH
#define HASHBITS_HH
+#include <stddef.h>
+#include <string.h>
+#include <string>
#include <vector>
+
#include "hashtable.hh"
+#include "khmer.hh"
+#include "kmer_hash.hh"
namespace khmer
{
diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index cf4c2cb..85d417d 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -5,68 +5,25 @@
// Contact: khmer-project at idyll.org
//
-#include "khmer.hh"
+#include <errno.h>
+#include <math.h>
+#include <algorithm>
+#include <deque>
+#include <fstream>
+#include <iostream>
+#include <sstream> // IWYU pragma: keep
+#include <queue>
+#include <set>
+
+#include "counting.hh"
#include "hashtable.hh"
+#include "khmer.hh"
#include "read_parsers.hh"
-#include "counting.hh"
-
-#include <algorithm>
-#include <sstream>
-#include <errno.h>
using namespace std;
using namespace khmer;
using namespace khmer:: read_parsers;
-#ifdef WITH_INTERNAL_METRICS
-HashTablePerformanceMetrics::
-HashTablePerformanceMetrics( )
- : IPerformanceMetrics( ),
- clock_nsecs_norm_read( 0 ),
- cpu_nsecs_norm_read( 0 ),
- clock_nsecs_hash_kmer( 0 ),
- cpu_nsecs_hash_kmer( 0 ),
- clock_nsecs_update_tallies( 0 ),
- cpu_nsecs_update_tallies( 0 )
-{ }
-
-
-HashTablePerformanceMetrics::
-~HashTablePerformanceMetrics( )
-{ }
-
-
-void
-HashTablePerformanceMetrics::
-accumulate_timer_deltas( uint32_t metrics_key )
-{
-
- switch (metrics_key) {
- case MKEY_TIME_NORM_READ:
- clock_nsecs_norm_read +=
- _timespec_diff_in_nsecs( _temp_clock_start, _temp_clock_stop );
- cpu_nsecs_norm_read +=
- _timespec_diff_in_nsecs( _temp_cpu_start, _temp_cpu_stop );
- break;
- case MKEY_TIME_HASH_KMER:
- clock_nsecs_hash_kmer +=
- _timespec_diff_in_nsecs( _temp_clock_start, _temp_clock_stop );
- cpu_nsecs_hash_kmer +=
- _timespec_diff_in_nsecs( _temp_cpu_start, _temp_cpu_stop );
- break;
- case MKEY_TIME_UPDATE_TALLIES:
- clock_nsecs_update_tallies +=
- _timespec_diff_in_nsecs( _temp_clock_start, _temp_clock_stop );
- cpu_nsecs_update_tallies +=
- _timespec_diff_in_nsecs( _temp_cpu_start, _temp_cpu_stop );
- break;
- default:
- throw InvalidPerformanceMetricsKey( );
- }
-
-}
-#endif
-
//
// check_and_process_read: checks for non-ACGT characters before consuming
//
@@ -146,18 +103,18 @@ consume_fasta(
// Iterate through the reads and consume their k-mers.
while (!parser->is_complete( )) {
-
+ bool is_valid;
try {
- bool is_valid;
read = parser->get_next_read( );
+ } catch (NoMoreReadsAvailable) {
+ break;
+ }
- unsigned int this_n_consumed =
- check_and_process_read(read.sequence, is_valid);
+ unsigned int this_n_consumed =
+ check_and_process_read(read.sequence, is_valid);
- __sync_add_and_fetch( &n_consumed, this_n_consumed );
- __sync_add_and_fetch( &total_reads, 1 );
- } catch (read_parsers::NoMoreReadsAvailable) {
- }
+ __sync_add_and_fetch( &n_consumed, this_n_consumed );
+ __sync_add_and_fetch( &total_reads, 1 );
} // while reads left for parser
@@ -314,20 +271,23 @@ void Hashtable::load_tagset(std::string infilename, bool clear_tags)
}
unsigned char version, ht_type;
- char signature[4];
unsigned int save_ksize = 0;
size_t tagset_size = 0;
HashIntoType * buf = NULL;
try {
+ char signature[4];
infile.read(signature, 4);
infile.read((char *) &version, 1);
infile.read((char *) &ht_type, 1);
if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
std::ostringstream err;
- err << "Incorrect file signature " << signature
- << " while reading tagset from " << infilename
+ err << "Incorrect file signature 0x";
+ for(size_t i=0; i < 4; ++i) {
+ err << std::hex << (int) signature[i];
+ }
+ err << " while reading tagset from " << infilename
<< "; should be " << SAVED_SIGNATURE;
throw khmer_file_exception(err.str());
} else if (!(version == SAVED_FORMAT_VERSION)) {
@@ -485,7 +445,12 @@ consume_fasta_and_tag(
// Iterate through the reads and consume their k-mers.
while (!parser->is_complete( )) {
- read = parser->get_next_read( );
+ try {
+ read = parser->get_next_read( );
+ } catch (NoMoreReadsAvailable &e) {
+ // Bail out if this error is raised
+ break;
+ }
if (check_and_normalize_read( read.sequence )) {
unsigned long long this_n_consumed = 0;
@@ -523,7 +488,11 @@ void Hashtable::consume_fasta_and_tag_with_stoptags(const std::string &filename,
//
while(!parser->is_complete()) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
seq = read.sequence;
read_tags.clear();
@@ -644,7 +613,11 @@ void Hashtable::consume_partitioned_fasta(const std::string &filename,
//
while(!parser->is_complete()) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
seq = read.sequence;
if (check_and_normalize_read(seq)) {
@@ -691,7 +664,11 @@ void Hashtable::consume_fasta_and_traverse(const std::string &filename,
//
while(!parser->is_complete()) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
seq = read.sequence;
if (check_and_normalize_read(seq)) { // process?
@@ -897,7 +874,11 @@ void Hashtable::filter_if_present(const std::string &infilename,
HashIntoType kmer;
while(!parser->is_complete()) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
seq = read.sequence;
if (check_and_normalize_read(seq)) {
@@ -1296,19 +1277,22 @@ void Hashtable::load_stop_tags(std::string infilename, bool clear_tags)
}
unsigned char version, ht_type;
- char signature[4];
unsigned int save_ksize = 0;
size_t tagset_size = 0;
try {
+ char signature[4];
infile.read(signature, 4);
infile.read((char *) &version, 1);
infile.read((char *) &ht_type, 1);
if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
std::ostringstream err;
- err << "Incorrect file signature " << signature
- << " while reading stoptags from " << infilename
+ err << "Incorrect file signature 0x";
+ for(size_t i=0; i < 4; ++i) {
+ err << std::hex << (int) signature[i];
+ }
+ err << " while reading stoptags from " << infilename
<< "; should be " << SAVED_SIGNATURE;
throw khmer_file_exception(err.str());
} else if (!(version == SAVED_FORMAT_VERSION)) {
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index d18be70..f0f5135 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -9,22 +9,35 @@
#define HASHTABLE_HH
-#include <vector>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <fstream>
#include <iostream>
#include <list>
-#include <queue>
-
-#include <fstream>
-#include <string>
-#include <set>
#include <map>
#include <queue>
+#include <set>
+#include <string>
+#include <vector>
#include "khmer.hh"
#include "khmer_exception.hh"
+#include "kmer_hash.hh"
#include "read_parsers.hh"
#include "subset.hh"
-#include "kmer_hash.hh"
+
+namespace khmer
+{
+class CountingHash;
+class Hashtable;
+
+namespace read_parsers
+{
+struct IParser;
+} // namespace read_parsers
+} // namespace khmer
#define MAX_KEEPER_SIZE int(1e6)
@@ -40,29 +53,6 @@
namespace khmer
{
-#ifdef WITH_INTERNAL_METRICS
-struct HashTablePerformanceMetrics : public IPerformanceMetrics {
-
- enum {
- MKEY_TIME_NORM_READ,
- MKEY_TIME_HASH_KMER,
- MKEY_TIME_UPDATE_TALLIES
- };
-
- uint64_t clock_nsecs_norm_read;
- uint64_t cpu_nsecs_norm_read;
- uint64_t clock_nsecs_hash_kmer;
- uint64_t cpu_nsecs_hash_kmer;
- uint64_t clock_nsecs_update_tallies;
- uint64_t cpu_nsecs_update_tallies;
-
- HashTablePerformanceMetrics( );
- virtual ~HashTablePerformanceMetrics( );
-
- virtual void accumulate_timer_deltas( uint32_t metrics_key );
-
-};
-#endif
//
// Sequence iterator class, test. Not really a C++ iterator yet.
@@ -185,7 +175,7 @@ protected:
HashIntoType bitmask;
unsigned int _nbits_sub_1;
- Hashtable( WordLength ksize )
+ explicit Hashtable( WordLength ksize )
: _max_count( MAX_KCOUNT ),
_max_bigcount( MAX_BIGCOUNT ),
_ksize( ksize )
@@ -241,7 +231,8 @@ protected:
uint32_t _all_tags_spin_lock;
- NONCOPYABLE(Hashtable);
+ explicit Hashtable(const Hashtable&);
+ Hashtable& operator=(const Hashtable&);
public:
SubsetPartition * partition;
diff --git a/lib/hllcounter.cc b/lib/hllcounter.cc
index 91ac9f7..c115524 100644
--- a/lib/hllcounter.cc
+++ b/lib/hllcounter.cc
@@ -5,18 +5,18 @@
// Contact: khmer-project at idyll.org
//
-#include "hllcounter.hh"
-
#include <math.h>
+#include <stdlib.h>
#include <algorithm>
+#include <map>
#include <numeric>
-#include <inttypes.h>
-#include <sstream>
+#include <utility>
+#include "hllcounter.hh"
#include "khmer.hh"
+#include "khmer_exception.hh"
#include "kmer_hash.hh"
#include "read_parsers.hh"
-#include "khmer_exception.hh"
#ifdef _OPENMP
#include <omp.h>
@@ -392,20 +392,21 @@ void HLLCounter::consume_fasta(
// Iterate through the reads and consume their k-mers.
try {
read = parser->get_next_read();
+ } catch (read_parsers::NoMoreReadsAvailable) {
+ break;
+ }
- #pragma omp task default(none) firstprivate(read) \
- shared(counters, n_consumed_partial, total_reads_partial)
- {
- bool is_valid;
- int n, t = omp_get_thread_num();
- n = counters[t]->check_and_process_read(read.sequence,
- is_valid);
- n_consumed_partial[t] += n;
- if (is_valid) {
- total_reads_partial[t] += 1;
- }
+ #pragma omp task default(none) firstprivate(read) \
+ shared(counters, n_consumed_partial, total_reads_partial)
+ {
+ bool is_valid;
+ int n, t = omp_get_thread_num();
+ n = counters[t]->check_and_process_read(read.sequence,
+ is_valid);
+ n_consumed_partial[t] += n;
+ if (is_valid) {
+ total_reads_partial[t] += 1;
}
- } catch (read_parsers::NoMoreReadsAvailable) {
}
} // while reads left for parser
diff --git a/lib/hllcounter.hh b/lib/hllcounter.hh
index f03cff0..0b64a31 100644
--- a/lib/hllcounter.hh
+++ b/lib/hllcounter.hh
@@ -8,11 +8,20 @@
#ifndef HLLCOUNTER_HH
#define HLLCOUNTER_HH
-#include <vector>
#include <string>
+#include <vector>
+#include "khmer.hh"
#include "read_parsers.hh"
+namespace khmer
+{
+namespace read_parsers
+{
+struct IParser;
+} // namespace read_parsers
+} // namespace khmer
+
namespace khmer
{
diff --git a/lib/ht-diff.cc b/lib/ht-diff.cc
deleted file mode 100644
index e6b9bed..0000000
--- a/lib/ht-diff.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-//
-// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2015. It is licensed under
-// the three-clause BSD license; see LICENSE.
-// Contact: khmer-project at idyll.org
-//
-
-// Simple C++ implementation of a diff between counting hashes.
-// Author: Eric A. McDonald
-
-// You can learn which hash bins have differing values with a simple 'cmp'
-// between two hash files (if you account for file header length),
-// but this program actually loads the tables into memory
-// and checks things such as number of hash tables and hash table sizes.
-// Also, any differences in count or bigcount values are reported in
-// human-readable form.
-
-#if (__cplusplus >= 201103L)
-# include <cstdint>
-#else
-extern "C"
-{
-# include <stdint.h>
-}
-#endif
-#include <cstring>
-#include <cstdio>
-#include <cerrno>
-#include <cstdlib>
-
-#include <string>
-
-#include "khmer.hh"
-#include "error.hh"
-#include "counting.hh"
-
-using namespace std;
-using namespace khmer;
-
-
-static const char * SHORT_OPTS = "C:R";
-
-
-int main( int argc, char * argv[ ] )
-{
- int rc = 0;
- int opt = -1;
- char * conv_residue = NULL;
- uint32_t max_count = MAX_KCOUNT;
- bool report_all = false;
- string ifile_name_1;
- string ifile_name_2;
-
- while (-1 != (opt = getopt( argc, argv, SHORT_OPTS ))) {
-
- switch (opt) {
- case 'C':
- max_count = (uint32_t)strtoul( optarg, &conv_residue, 10 );
- if (!strcmp( optarg, conv_residue )) {
- error( EINVAL, EINVAL, "Invalid count threshold" );
- }
- break;
- case 'R':
- report_all = true;
- break;
- default:
- error( 0, 0, "Skipping unknown arg, '%c'", optopt );
- }
-
- }
-
- if (optind < argc) {
- ifile_name_1 = string( argv[ optind++ ] );
- } else {
- error( EINVAL, 0, "Name of first hash table file required" );
- }
-
- if (optind < argc) {
- ifile_name_2 = string( argv[ optind++ ] );
- } else {
- error( EINVAL, 0, "Name of second hash table file required" );
- }
-
- CountingHash ht1( 20, 1 );
- CountingHash ht2( 20, 1 );
- printf( "Loading hash tables into memory....\n" );
- ht1.load( ifile_name_1 );
- ht2.load( ifile_name_2 );
-
- HashIntoType i = 0, max_ht_size = 0;
- std:: vector<HashIntoType> ht1_sizes = ht1.get_tablesizes( );
- std:: vector<HashIntoType> ht2_sizes = ht2.get_tablesizes( );
-
- // Compare number of tables.
- if (ht1_sizes.size( ) != ht2_sizes.size( )) {
- fprintf(
- stderr, "Unequal number of hashtables (%lu and %lu).\n",
- (unsigned long int)ht1_sizes.size( ),
- (unsigned long int)ht2_sizes.size( )
- );
- exit( 1 );
- } else
- printf(
- "Number of Hash Tables: %lu\n",
- (unsigned long int)ht1_sizes.size( )
- );
-
- // Compare sizes of each table.
- for (i = 0; i < ht1_sizes.size( ); ++i) {
- if (ht1_sizes[ i ] != ht2_sizes[ i ]) {
- fprintf(
- stderr, "Hash table %lu has mismatched sizes of %llu and %llu.\n",
- (unsigned long int)i, ht1_sizes[ i ], ht2_sizes[ i ]
- );
- exit( 1 );
- } else {
- printf(
- "Size of Hash Table %lu: %llu bins\n",
- (unsigned long int)i, ht1_sizes[ i ]
- );
- if (max_ht_size < ht1_sizes[ i ]) {
- max_ht_size = ht1_sizes[ i ];
- }
- }
- }
-
- printf( "Scanning hash key space....\n" );
- for (i = 0; i < max_ht_size; ++i) {
- // Truncate counts at specified saturation threshold.
- // (This accounts for the sloppy counting used for >1 threads.)
- uint32_t count1 = MIN( ht1.get_count( i ), max_count );
- uint32_t count2 = MIN( ht2.get_count( i ), max_count );
- if (count1 != count2) {
- fprintf(
- stderr, "Hash key %llu has mismatched counts of %u and %u.\n",
- i, ht1.get_count( i ), ht2.get_count( i )
- );
- if (!report_all) {
- exit( 1 );
- }
- }
- }
- // TODO: Implement bigcount checking.
-
- return rc;
-
-}
-
-// vim: set sts=4 sw=4 tw=80:
diff --git a/lib/khmer_exception.hh b/lib/khmer_exception.hh
index 95553df..5c524cd 100644
--- a/lib/khmer_exception.hh
+++ b/lib/khmer_exception.hh
@@ -17,7 +17,7 @@ namespace khmer
///
// A base class for all exceptions.
//
-// All exceptions should be derived from this base class.
+// All exceptions should be derived from this base class or a sub-class.
//
class khmer_exception : public std::exception
{
@@ -35,6 +35,9 @@ protected:
const std::string _msg;
};
+
+/////// Base Exceptions /////
+
///
// A base class for file exceptions.
//
@@ -45,15 +48,23 @@ public:
: khmer_exception(msg) { }
};
-struct InvalidStreamBuffer : public khmer_exception {
+// A base exception for value exceptions
+class khmer_value_exception : public khmer_exception
+{
+public:
+ explicit khmer_value_exception(const std::string& msg)
+ : khmer_exception(msg) { }
};
-class InvalidStreamHandle : public khmer_file_exception
+/////// Specialised Exceptions /////
+
+class InvalidStream : public khmer_file_exception
{
public:
- InvalidStreamHandle()
- : khmer_file_exception("Generic InvalidStreamHandle error") {}
- InvalidStreamHandle(const std::string& msg) : khmer_file_exception(msg) {}
+ InvalidStream()
+ : khmer_file_exception("Generic InvalidStream error") {}
+ explicit InvalidStream(const std::string& msg)
+ : khmer_file_exception(msg) {}
};
class StreamReadError : public khmer_file_exception
@@ -61,7 +72,8 @@ class StreamReadError : public khmer_file_exception
public:
StreamReadError()
: khmer_file_exception("Generic StreamReadError error") {}
- StreamReadError(const std::string& msg) : khmer_file_exception(msg) {}
+ explicit StreamReadError(const std::string& msg)
+ : khmer_file_exception(msg) {}
};
@@ -69,11 +81,11 @@ public:
// An exception for invalid arguments to functions
//
-class InvalidValue : public khmer_exception
+class InvalidValue : public khmer_value_exception
{
public:
explicit InvalidValue(const std::string& msg)
- : khmer_exception(msg) { }
+ : khmer_value_exception(msg) { }
};
///
@@ -87,7 +99,7 @@ public:
: khmer_exception(msg) { }
};
-}
+} // end namespace khmer
#endif // KHMER_EXCEPTION_HH
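With the split above, callers can catch whole families of errors instead of enumerating concrete types. A short sketch of what that enables (not from the patch; assumes khmer_exception.hh is on the include path):

    #include <iostream>
    #include <string>
    #include "khmer_exception.hh"

    void run_guarded(const std::string &label)
    {
        try {
            // ... any khmer call that may throw ...
        } catch (khmer::khmer_file_exception &e) {
            // InvalidStream, StreamReadError, NoMoreReadsAvailable, ...
            std::cerr << "file error in " << label << ": " << e.what() << "\n";
        } catch (khmer::khmer_value_exception &e) {
            // InvalidValue, InvalidRead, UnknownPairReadingMode, ...
            std::cerr << "value error in " << label << ": " << e.what() << "\n";
        }
    }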
diff --git a/lib/kmer_hash.cc b/lib/kmer_hash.cc
index 61c3741..cc7a9bd 100644
--- a/lib/kmer_hash.cc
+++ b/lib/kmer_hash.cc
@@ -5,14 +5,16 @@
// Contact: khmer-project at idyll.org
//
-#include <math.h>
-#include <string>
-#include <iostream>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
#include <algorithm>
+#include <string>
+#include "MurmurHash3.h"
#include "khmer.hh"
+#include "khmer_exception.hh"
#include "kmer_hash.hh"
-#include "MurmurHash3.h"
using namespace std;
diff --git a/lib/labelhash.cc b/lib/labelhash.cc
index b6366f5..4d801ec 100644
--- a/lib/labelhash.cc
+++ b/lib/labelhash.cc
@@ -5,10 +5,18 @@
// Contact: khmer-project at idyll.org
//
-#include "labelhash.hh"
-
-#include <sstream>
#include <errno.h>
+#include <string.h>
+#include <iostream>
+#include <sstream> // IWYU pragma: keep
+#include <set>
+
+#include "hashbits.hh"
+#include "hashtable.hh"
+#include "khmer_exception.hh"
+#include "labelhash.hh"
+#include "read_parsers.hh"
+#include "subset.hh"
#define IO_BUF_SIZE 250*1000*1000
@@ -65,7 +73,11 @@ LabelHash::consume_fasta_and_tag_with_labels(
Label * the_label;
// Iterate through the reads and consume their k-mers.
while (!parser->is_complete( )) {
- read = parser->get_next_read( );
+ try {
+ read = parser->get_next_read( );
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
if (graph->check_and_normalize_read( read.sequence )) {
// TODO: make threadsafe!
@@ -420,16 +432,19 @@ void LabelHash::load_labels_and_tags(std::string filename)
unsigned long n_labeltags = 1;
try {
unsigned int save_ksize = 0;
- char signature[4];
+ char signature[4];
unsigned char version = 0, ht_type = 0;
- infile.read(signature, 4);
+ infile.read(signature, 4);
infile.read((char *) &version, 1);
infile.read((char *) &ht_type, 1);
- if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
+ if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
std::ostringstream err;
- err << "Incorrect file signature " << signature
- << " while reading labels/tags from " << filename
+ err << "Incorrect file signature 0x";
+ for(size_t i=0; i < 4; ++i) {
+ err << std::hex << (int) signature[i];
+ }
+ err << " while reading labels/tags from " << filename
<< " Should be: " << SAVED_SIGNATURE;
throw khmer_file_exception(err.str());
} else if (!(version == SAVED_FORMAT_VERSION)) {
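One caveat about the hex loop above: char is signed on most ABIs, so (int) signature[i] sign-extends bytes at or above 0x80 (0x8c prints as ffffff8c). Casting through unsigned char avoids that; a self-contained sketch of the safer form:

    #include <cstddef>
    #include <sstream>
    #include <string>

    // sketch: hex-encode a 4-byte file signature for an error message;
    // the unsigned char cast prevents sign-extension of high bytes
    std::string sig_to_hex(const char signature[4])
    {
        std::ostringstream out;
        out << "0x";
        for (std::size_t i = 0; i < 4; ++i) {
            out << std::hex << (int)(unsigned char)signature[i];
        }
        return out.str();
    }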
diff --git a/lib/labelhash.hh b/lib/labelhash.hh
index d2f754f..da2ee47 100644
--- a/lib/labelhash.hh
+++ b/lib/labelhash.hh
@@ -8,15 +8,29 @@
#ifndef LABELHASH_HH
#define LABELHASH_HH
+#include <stddef.h>
+#include <stdint.h>
+#include <map>
#include <string>
+#include <utility>
-#include "khmer.hh"
#include "hashbits.hh"
#include "hashtable.hh"
+#include "khmer.hh"
#include "read_parsers.hh"
namespace khmer
{
+class Hashtable;
+
+namespace read_parsers
+{
+struct IParser;
+} // namespace read_parsers
+} // namespace khmer
+
+namespace khmer
+{
class LabelHash
{
diff --git a/lib/khmer.pc.in b/lib/oxli.pc.in
similarity index 83%
rename from lib/khmer.pc.in
rename to lib/oxli.pc.in
index 4c97777..6cf09fb 100644
--- a/lib/khmer.pc.in
+++ b/lib/oxli.pc.in
@@ -4,11 +4,11 @@ libdir=${exec_prefix}/lib
sharedlibdir=${libdir}
includedir=${prefix}/include
-Name: khmer
+Name: oxli
Description: The unsupported core C++ library from the khmer project
URL: http://khmer.readthedocs.org/
Version: @VERSION@
Requires:
-Libs: -L${libdir} -L${sharedlibdir} -lkhmer
+Libs: -L${libdir} -L${sharedlibdir} -loxli
Cflags: -I${includedir}
diff --git a/lib/perf_metrics.cc b/lib/perf_metrics.cc
deleted file mode 100644
index a0c348a..0000000
--- a/lib/perf_metrics.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-//
-// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2015. It is licensed under
-// the three-clause BSD license; see LICENSE.
-// Contact: khmer-project at idyll.org
-//
-
-#include "perf_metrics.hh"
-
-namespace khmer
-{
-
-#ifdef WITH_INTERNAL_METRICS
-IPerformanceMetrics::
-IPerformanceMetrics( )
-{ }
-
-
-IPerformanceMetrics::
-~IPerformanceMetrics( )
-{ }
-
-
-uint64_t const
-IPerformanceMetrics::
-_timespec_diff_in_nsecs( timespec const &start, timespec const &stop )
-{
- return
- ((stop.tv_sec * 1000000000U) + (uint64_t)stop.tv_nsec)
- - ((start.tv_sec * 1000000000U) + (uint64_t)start.tv_nsec);
-}
-#endif
-} // namespace khmer
-
-// vim: set ft=cpp sts=4 sw=4 tw=79:
diff --git a/lib/perf_metrics.hh b/lib/perf_metrics.hh
deleted file mode 100644
index 63a0e49..0000000
--- a/lib/perf_metrics.hh
+++ /dev/null
@@ -1,75 +0,0 @@
-//
-// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2015. It is licensed under
-// the three-clause BSD license; see LICENSE.
-// Contact: khmer-project at idyll.org
-//
-
-#ifndef PERF_METRICS_HH
-#define PERF_METRICS_HH
-
-
-#include <cstring>
-#include <ctime>
-
-
-#include "khmer.hh"
-
-namespace khmer
-{
-
-#ifdef WITH_INTERNAL_METRICS
-struct InvalidPerformanceMetricsKey : public khmer_exception {
-};
-
-
-struct IPerformanceMetrics {
-
- IPerformanceMetrics( );
- virtual ~IPerformanceMetrics( );
-
- inline void start_timers( )
- {
-#if defined (__linux__)
- clock_gettime( CLOCK_REALTIME, &_temp_clock_start );
- clock_gettime( CLOCK_THREAD_CPUTIME_ID, &_temp_cpu_start );
-// TODO: Create proper stopwatches for MacOS X.
-#else
- memset( &_temp_clock_start, 0, sizeof( timespec ) );
- memset( &_temp_cpu_start, 0, sizeof( timespec ) );
-#endif
- }
- inline void stop_timers( )
- {
-#if defined (__linux__)
- clock_gettime( CLOCK_THREAD_CPUTIME_ID, &_temp_cpu_stop );
- clock_gettime( CLOCK_REALTIME, &_temp_clock_stop );
-// TODO: Create proper stopwatches for MacOS X.
-#else
- memset( &_temp_cpu_stop, 0, sizeof( timespec ) );
- memset( &_temp_clock_stop, 0, sizeof( timespec ) );
-#endif
- }
- virtual void accumulate_timer_deltas( uint32_t metrics_key ) = 0;
-
- // TODO: Add a printing or log file feature.
-
-protected:
-
- timespec _temp_cpu_start;
- timespec _temp_cpu_stop;
- timespec _temp_clock_start;
- timespec _temp_clock_stop;
-
- uint64_t const _timespec_diff_in_nsecs(
- timespec const &start, timespec const &stop
- );
-
-};
-
-#endif // WITH_INTERNAL_METRICS
-
-} // namespace khmer
-#endif // PERF_METRICS_HH
-
-// vim: set ft=cpp sts=4 sw=4 tw=79:
diff --git a/lib/read_aligner.cc b/lib/read_aligner.cc
index 569cc52..2b9e400 100644
--- a/lib/read_aligner.cc
+++ b/lib/read_aligner.cc
@@ -3,12 +3,54 @@
// Copyright (C) Michigan State University, 2009-2015. It is licensed under
// the three-clause BSD license; see LICENSE. Contact: ctb at msu.edu
//
-#include "read_aligner.hh"
+#include <ctype.h>
+#include <algorithm>
+#include <limits>
+#include <map>
+#include <memory>
+#include <set>
+#include <utility>
+
+#include "hashtable.hh"
#include "khmer_exception.hh"
+#include "read_aligner.hh"
namespace khmer
{
+Alignment * _empty_alignment()
+{
+ Alignment* ret = new Alignment;
+ ret->score = -std::numeric_limits<double>::infinity();
+ ret->read_alignment = "";
+ ret->graph_alignment = "";
+ ret->truncated = true;
+ return ret;
+}
+
+static Nucl _ch_to_nucl(char base)
+{
+ base = toupper(base);
+
+ Nucl e = A;
+ switch(base) {
+ case 'A':
+ e = A;
+ break;
+ case 'C':
+ e = C;
+ break;
+ case 'G':
+ e = G;
+ break;
+ case 'T':
+ case 'U':
+ e = T;
+ break;
+ }
+ return e;
+}
+
struct del_alignment_node_t {
del_alignment_node_t& operator()(AlignmentNode* p)
{
@@ -222,6 +264,7 @@ void ReadAligner::Enumerate(
next->score = curr->score + sc + m_sm.tsc[trans];
next->trusted = (kmerCov >= m_trusted_cutoff);
+ next->cov = kmerCov;
next->h_score = hcost;
next->f_score = next->score + next->h_score;
@@ -347,6 +390,8 @@ Alignment* ReadAligner::ExtractAlignment(AlignmentNode* node,
std::string read_alignment = "";
std::string graph_alignment = "";
std::string trusted = "";
+ std::vector<BoundedCounterType> covs;
+ size_t farthest_seq_idx = node->seq_idx;
ret->score = node->score;
ret->truncated = (node->seq_idx != 0)
&& (node->seq_idx != read.length() - 1);
@@ -394,6 +439,7 @@ Alignment* ReadAligner::ExtractAlignment(AlignmentNode* node,
graph_alignment = graph_base + graph_alignment;
read_alignment = read_base + read_alignment;
trusted = ((node->trusted)? "T" : "F") + trusted;
+ covs.insert(covs.begin(), node->cov);
} else {
graph_alignment = graph_alignment + graph_base;
read_alignment = read_alignment + read_base;
@@ -405,6 +451,19 @@ Alignment* ReadAligner::ExtractAlignment(AlignmentNode* node,
ret->graph_alignment = graph_alignment;
ret->read_alignment = read_alignment;
ret->trusted = trusted;
+ ret->covs = covs;
+
+ if(ret->truncated) {
+ std::string new_graph_alignment;
+ if (forward) {
+ new_graph_alignment = graph_alignment +
+ read.substr(farthest_seq_idx + 1, std::string::npos);
+ } else {
+ new_graph_alignment = read.substr(0, node->seq_idx)
+ + graph_alignment;
+ }
+ ret->graph_alignment = new_graph_alignment;
+ }
return ret;
@@ -418,7 +477,7 @@ struct SearchStart {
Alignment* ReadAligner::Align(const std::string& read)
{
- int k = m_ch->ksize();
+ WordLength k = m_ch->ksize();
size_t num_kmers = read.length() - k + 1;
SearchStart start;
@@ -436,9 +495,13 @@ Alignment* ReadAligner::Align(const std::string& read)
}
}
- if(start.k_cov > 0) {
- HashIntoType fhash = 0, rhash = 0;
- _hash(start.kmer.c_str(), k, fhash, rhash);
+ if(start.k_cov == 0) {
+ return _empty_alignment();
+ }
+
+ HashIntoType fhash = 0, rhash = 0;
+ _hash(start.kmer.c_str(), k, fhash, rhash);
+
#if READ_ALIGNER_DEBUG
std::cerr << "Starting kmer: " << start.kmer << " "
<< _revhash(fhash, m_ch->ksize()) << " "
@@ -447,56 +510,41 @@ Alignment* ReadAligner::Align(const std::string& read)
<< start.kmer_idx + k - 1
<< " emission: " << start.kmer[k - 1] << std::endl;
#endif
- char base = toupper(start.kmer[k - 1]);
- Nucl e = A;
- switch(base) {
- case 'A':
- e = A;
- break;
- case 'C':
- e = C;
- break;
- case 'G':
- e = G;
- break;
- case 'T':
- case 'U':
- e = T;
- break;
- }
- AlignmentNode startingNode = AlignmentNode(NULL,
- e, start.kmer_idx + k - 1,
- MATCH, MM, fhash, rhash, k);
- startingNode.f_score = 0;
- startingNode.h_score = 0;
- Alignment* forward = NULL;
- Alignment* reverse = NULL;
- size_t final_length = 0;
-
- if(start.k_cov >= m_trusted_cutoff) {
- startingNode.score = k * m_sm.trusted_match + k * m_sm.tsc[MM];
- } else {
- startingNode.score = k * m_sm.untrusted_match + k * m_sm.tsc[MM];
- }
+ Nucl e = _ch_to_nucl(start.kmer[k - 1]);
+ AlignmentNode startingNode = AlignmentNode(NULL,
+ e, start.kmer_idx + k - 1,
+ MATCH, MM, fhash, rhash, k);
+ startingNode.f_score = 0;
+ startingNode.h_score = 0;
+ Alignment* forward = NULL;
+ Alignment* reverse = NULL;
+ size_t final_length = 0;
+
+ if(start.k_cov >= m_trusted_cutoff) {
+ startingNode.score = k * m_sm.trusted_match + k * m_sm.tsc[MM];
+ } else {
+ startingNode.score = k * m_sm.untrusted_match + k * m_sm.tsc[MM];
+ }
- forward = Subalign(&startingNode, read.length(), true, read);
- final_length = forward->read_alignment.length() + k;
+ forward = Subalign(&startingNode, read.length(), true, read);
+ final_length = forward->read_alignment.length() + k;
- startingNode.seq_idx = start.kmer_idx;
- reverse = Subalign(&startingNode, read.length(), false, read);
- final_length += reverse->read_alignment.length();
+ startingNode.seq_idx = start.kmer_idx;
+ reverse = Subalign(&startingNode, read.length(), false, read);
+ final_length += reverse->read_alignment.length();
+
+ Alignment* ret = new Alignment;
- Alignment* ret = new Alignment;
- //We've actually counted the starting node score
- //twice, so we need to adjust for that
- ret->score = reverse->score + forward->score - startingNode.score;
- ret->read_alignment = reverse->read_alignment +
- start.kmer + forward->read_alignment;
- ret->graph_alignment = reverse->graph_alignment +
- start.kmer + forward->graph_alignment;
- ret->score = ret->score - GetNull(final_length);
- ret->truncated = forward->truncated || reverse->truncated;
+ // We've actually counted the starting node score
+ // twice, so we need to adjust for that
+ ret->score = reverse->score + forward->score - startingNode.score;
+ ret->read_alignment = reverse->read_alignment +
+ start.kmer + forward->read_alignment;
+ ret->graph_alignment = reverse->graph_alignment +
+ start.kmer + forward->graph_alignment;
+ ret->score = ret->score - GetNull(final_length);
+ ret->truncated = forward->truncated || reverse->truncated;
#if READ_ALIGNER_DEBUG
fprintf(stderr,
@@ -509,17 +557,84 @@ Alignment* ReadAligner::Align(const std::string& read)
reverse->score, reverse->truncated);
#endif
- delete forward;
- delete reverse;
- return ret;
+ delete forward;
+ delete reverse;
+
+ return ret;
+}
+
+Alignment* ReadAligner::AlignForward(const std::string& read)
+{
+ WordLength k = m_ch->ksize();
+
+ // start with seed at position 0
+ SearchStart start;
+ start.kmer = read.substr(0, k);
+ start.kmer_idx = 0;
+ start.k_cov = m_ch->get_count(start.kmer.c_str());
+
+ if(start.k_cov == 0) {
+ return _empty_alignment();
+ }
+
+ HashIntoType fhash = 0, rhash = 0;
+ _hash(start.kmer.c_str(), k, fhash, rhash);
+
+#if READ_ALIGNER_DEBUG
+ std::cerr << "Starting kmer: " << start.kmer << " "
+ << _revhash(fhash, m_ch->ksize()) << " "
+ << _revhash(rhash, m_ch->ksize())
+ << " cov: " << start.k_cov << " idx: " << start.kmer_idx << ", "
+ << start.kmer_idx + k - 1
+ << " emission: " << start.kmer[k - 1] << std::endl;
+#endif
+
+ Nucl e = _ch_to_nucl(start.kmer[k - 1]);
+ AlignmentNode startingNode = AlignmentNode(NULL,
+ e, start.kmer_idx + k - 1,
+ MATCH, MM, fhash, rhash, k);
+ startingNode.f_score = 0;
+ startingNode.h_score = 0;
+ Alignment* forward = NULL;
+ size_t final_length = 0;
+
+ if(start.k_cov >= m_trusted_cutoff) {
+ startingNode.score = k * m_sm.trusted_match + k * m_sm.tsc[MM];
} else {
+ startingNode.score = k * m_sm.untrusted_match + k * m_sm.tsc[MM];
+ }
- Alignment* ret = new Alignment;
- ret->score = -std::numeric_limits<double>::infinity();
- ret->read_alignment = "";
- ret->graph_alignment = "";
- ret->truncated = true;
- return ret;
+ forward = Subalign(&startingNode, read.length(), true, read);
+ final_length = forward->read_alignment.length() + k;
+
+ Alignment* ret = new Alignment;
+
+ ret->score = forward->score;
+ ret->read_alignment = start.kmer + forward->read_alignment;
+ ret->graph_alignment = start.kmer + forward->graph_alignment;
+ ret->score = ret->score - GetNull(final_length);
+ ret->truncated = forward->truncated;
+
+ ret->covs = forward->covs;
+ ret->covs.insert(ret->covs.begin(), start.k_cov);
+ for (WordLength i = 0; i < k - 1; i++) {
+ ret->covs.push_back(0);
}
+
+#if READ_ALIGNER_DEBUG
+ fprintf(stderr,
+ "FORWARD\n\tread_aln:%s\n\tgraph_aln:%s\n\tscore:%f\n\ttrunc:%d\n",
+ forward->read_alignment.c_str(), forward->graph_alignment.c_str(),
+ forward->score, forward->truncated);
+#endif
+
+ delete forward;
+ return ret;
}
+
+ScoringMatrix ReadAligner::getScoringMatrix()
+{
+ return m_sm;
+}
+
}
diff --git a/lib/read_aligner.hh b/lib/read_aligner.hh
index 20b1654..05374b5 100644
--- a/lib/read_aligner.hh
+++ b/lib/read_aligner.hh
@@ -7,15 +7,19 @@
#ifndef READ_ALIGNER_HH
#define READ_ALIGNER_HH
-#include "khmer.hh"
-#include "counting.hh"
-
-#include <limits>
+#include <math.h>
+#include <stddef.h>
#include <algorithm>
+#include <limits>
+#include <memory>
+#include <queue>
#include <set>
+#include <string>
#include <vector>
-#include <queue>
-#include <memory>
+
+#include "counting.hh"
+#include "khmer.hh"
+#include "kmer_hash.hh"
#define READ_ALIGNER_DEBUG 0
@@ -100,6 +104,7 @@ struct AlignmentNode {
double f_score;
double h_score;
bool trusted;
+ BoundedCounterType cov;
size_t num_indels;
@@ -111,7 +116,7 @@ struct AlignmentNode {
:prev(_prev), base(_emission), seq_idx(_seq_idx),
state(_state), trans(_trans), fwd_hash(_fwd_hash),
rc_hash(_rc_hash), score(0), f_score(0), h_score(0), trusted(false),
- num_indels(0), length(_length) {}
+ cov(0), num_indels(0), length(_length) {}
bool operator== (const AlignmentNode& rhs) const
{
@@ -164,6 +169,7 @@ struct Alignment {
std::string graph_alignment;
std::string read_alignment;
std::string trusted;
+ std::vector<BoundedCounterType> covs;
double score;
bool truncated;
};
@@ -203,9 +209,9 @@ private:
}
return ret;
}
-
public:
Alignment* Align(const std::string&);
+ Alignment* AlignForward(const std::string&);
ReadAligner(khmer::CountingHash* ch,
BoundedCounterType trusted_cutoff, double bits_theta)
@@ -227,7 +233,20 @@ public:
<< std::endl;
#endif
}
+
+ ReadAligner(khmer::CountingHash* ch,
+ BoundedCounterType trusted_cutoff, double bits_theta,
+ double* scoring_matrix, double* transitions)
+ : bitmask(comp_bitmask(ch->ksize())),
+ rc_left_shift(ch->ksize() * 2 - 2),
+ m_ch(ch), m_sm(scoring_matrix[0], scoring_matrix[1],
+ scoring_matrix[2], scoring_matrix[3],
+ transitions),
+ m_trusted_cutoff(trusted_cutoff),
+ m_bits_theta(bits_theta) {};
+
+ ScoringMatrix getScoringMatrix();
+
};
}
-
#endif // READ_ALIGNER_HH
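A sketch of how the new parameterized constructor and AlignForward might be called; the array sizes and zeroed values are placeholders, not khmer's trained defaults (sandbox/readaligner_pairhmm_train.py is the intended source of real parameters):

    #include <string>
    #include "counting.hh"
    #include "read_aligner.hh"

    void align_with_trained_params(khmer::CountingHash *counts,
                                   const std::string &seq)
    {
        // four match/mismatch scores per the ScoringMatrix construction
        // above; the transition-array length here is an assumption
        double scores[4] = {};
        double trans[16] = {};

        khmer::ReadAligner aligner(counts, 2, 1.0, scores, trans);
        khmer::Alignment *aln = aligner.AlignForward(seq);
        // aln->covs (added in this patch) carries per-base graph coverage
        delete aln;
    }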
diff --git a/lib/read_parsers.cc b/lib/read_parsers.cc
index 1e8506c..8a9dc14 100644
--- a/lib/read_parsers.cc
+++ b/lib/read_parsers.cc
@@ -5,14 +5,13 @@
// Contact: khmer-project at idyll.org
//
-#include "read_parsers.hh"
+#include <seqan/seq_io.h> // IWYU pragma: keep
+#include <seqan/sequence.h> // IWYU pragma: keep
+#include <seqan/stream.h> // IWYU pragma: keep
+#include <fstream>
-#include <cstring>
#include "khmer_exception.hh"
-#include <seqan/sequence.h>
-#include <seqan/seq_io.h>
-#include <seqan/stream.h>
-#include <pthread.h>
+#include "read_parsers.hh"
namespace khmer
{
@@ -33,11 +32,11 @@ SeqAnParser::SeqAnParser( char const * filename ) : IParser( )
if (!seqan::isGood(_private->stream)) {
std::string message = "Could not open ";
message = message + filename + " for reading.";
- throw InvalidStreamHandle(message.c_str());
+ throw InvalidStream(message);
} else if (seqan::atEnd(_private->stream)) {
std::string message = "File ";
message = message + filename + " does not contain any sequences!";
- throw InvalidStreamHandle(message.c_str());
+ throw InvalidStream(message);
}
__asm__ __volatile__ ("" ::: "memory");
_private->seqan_spin_lock = 0;
@@ -122,7 +121,7 @@ IParser(
REG_EXTENDED | REG_NOSUB
);
if (regex_rc) {
- throw khmer_exception();
+ throw khmer_exception("Could not compile R2 nosub regex");
}
regex_rc =
regcomp(
@@ -130,7 +129,7 @@ IParser(
"^.+(/1| 1:[YN]:[[:digit:]]+:[[:alpha:]]+).{0}", REG_EXTENDED
);
if (regex_rc) {
- throw khmer_exception();
+ throw khmer_exception("Could not compile R1 regex");
}
regex_rc =
regcomp(
@@ -138,7 +137,7 @@ IParser(
"^.+(/2| 2:[YN]:[[:digit:]]+:[[:alpha:]]+).{0}", REG_EXTENDED
);
if (regex_rc) {
- throw khmer_exception();
+ throw khmer_exception("Could not compile R2 regex");
}
_num_reads = 0;
_have_qualities = false;
@@ -169,7 +168,9 @@ imprint_next_read_pair( ReadPair &the_read_pair, uint8_t mode )
_imprint_next_read_pair_in_error_mode( the_read_pair );
break;
default:
- throw UnknownPairReadingMode( );
+ std::ostringstream oss;
+ oss << "Unknown pair reading mode: " << mode;
+ throw UnknownPairReadingMode(oss.str());
}
}
diff --git a/lib/read_parsers.hh b/lib/read_parsers.hh
index b8ceadf..13588b0 100644
--- a/lib/read_parsers.hh
+++ b/lib/read_parsers.hh
@@ -9,9 +9,15 @@
#define READ_PARSERS_HH
#include <regex.h>
-#include <iostream>
+#include <stddef.h>
+#include <stdint.h>
#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <utility>
+
#include "khmer.hh"
+#include "khmer_exception.hh"
namespace khmer
{
@@ -21,32 +27,32 @@ namespace khmer
namespace read_parsers
{
-struct NoMoreReadsAvailable : public khmer_exception {
- explicit NoMoreReadsAvailable(const char *msg) :
- khmer_exception(msg) {}
+struct NoMoreReadsAvailable : public khmer_file_exception {
+ explicit NoMoreReadsAvailable(const std::string& msg) :
+ khmer_file_exception(msg) {}
NoMoreReadsAvailable() :
- khmer_exception("No more reads available in this stream.") {}
+ khmer_file_exception("No more reads available in this stream.") {}
};
-struct InvalidRead : public khmer_exception {
- explicit InvalidRead(const char *msg) :
- khmer_exception(msg) {}
+struct InvalidRead : public khmer_value_exception {
+ explicit InvalidRead(const std::string& msg) :
+ khmer_value_exception(msg) {}
InvalidRead() :
- khmer_exception("Invalid read") {}
+ khmer_value_exception("Invalid FASTA/Q read") {}
};
-struct UnknownPairReadingMode : public khmer_exception {
- explicit UnknownPairReadingMode(const char *msg) :
- khmer_exception(msg) {}
+struct UnknownPairReadingMode : public khmer_value_exception {
+ explicit UnknownPairReadingMode(const std::string& msg) :
+ khmer_value_exception(msg) {}
UnknownPairReadingMode() :
- khmer_exception("Unknown pair reading mode supplied.") {}
+ khmer_value_exception("Unknown pair reading mode supplied.") {}
};
-struct InvalidReadPair : public khmer_exception {
- explicit InvalidReadPair(const char *msg) :
- khmer_exception(msg) {}
+struct InvalidReadPair : public khmer_value_exception {
+ explicit InvalidReadPair(const std::string& msg) :
+ khmer_value_exception(msg) {}
InvalidReadPair() :
- khmer_exception("Invalid read pair detected.") {}
+ khmer_value_exception("Invalid read pair detected.") {}
};
struct Read {
@@ -138,7 +144,7 @@ class SeqAnParser : public IParser
{
public:
- SeqAnParser( const char * filename );
+ explicit SeqAnParser( const char * filename );
~SeqAnParser( );
bool is_complete( );
@@ -146,6 +152,7 @@ public:
private:
struct Handle;
+
Handle* _private;
};
diff --git a/lib/subset.cc b/lib/subset.cc
index c62ed39..d5b490b 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -5,13 +5,21 @@
// Contact: khmer-project at idyll.org
//
-#include "hashbits.hh"
-#include "subset.hh"
-#include "read_parsers.hh"
-
-#include <sstream>
-#include <errno.h>
#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <iostream>
+#include <sstream> // IWYU pragma: keep
+#include <map>
+#include <set>
+#include <utility>
+
+#include "counting.hh"
+#include "hashtable.hh"
+#include "khmer_exception.hh"
+#include "kmer_hash.hh"
+#include "read_parsers.hh"
+#include "subset.hh"
#define IO_BUF_SIZE 250*1000*1000
#define BIG_TRAVERSALS_ARE 200
@@ -99,7 +107,12 @@ size_t SubsetPartition::output_partitioned_file(
//
while(!parser->is_complete()) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
+
seq = read.sequence;
if (_ht->check_and_normalize_read(seq)) {
@@ -205,7 +218,11 @@ unsigned int SubsetPartition::find_unpart(
//
while(!parser->is_complete()) {
- read = parser->get_next_read();
+ try {
+ read = parser->get_next_read();
+ } catch (NoMoreReadsAvailable &exc) {
+ break;
+ }
seq = read.sequence;
if (_ht->check_and_normalize_read(seq)) {
@@ -1283,8 +1300,11 @@ void SubsetPartition::merge_from_disk(string other_filename)
infile.read((char *) &ht_type, 1);
if (!(std::string(signature, 4) == SAVED_SIGNATURE)) {
std::ostringstream err;
- err << "Incorrect file signature " << signature
- << " while reading subset pmap from " << other_filename
+ err << "Incorrect file signature 0x";
+ for(size_t i=0; i < 4; ++i) {
+ err << std::hex << (int) signature[i];
+ }
+ err << " while reading subset pmap from " << other_filename
<< " Should be: " << SAVED_SIGNATURE;
throw khmer_file_exception(err.str());
} else if (!(version == SAVED_FORMAT_VERSION)) {
diff --git a/lib/subset.hh b/lib/subset.hh
index a7d053e..67a5ba7 100644
--- a/lib/subset.hh
+++ b/lib/subset.hh
@@ -8,19 +8,23 @@
#ifndef SUBSET_HH
#define SUBSET_HH
+#include <stddef.h>
+#include <queue>
+#include <string>
+
#include "khmer.hh"
namespace khmer
{
class CountingHash;
-class Hashtable;
class Hashbits;
+class Hashtable;
struct pre_partition_info {
HashIntoType kmer;
SeenSet tagged_kmers;
- pre_partition_info(HashIntoType _kmer) : kmer(_kmer) {};
+ explicit pre_partition_info(HashIntoType _kmer) : kmer(_kmer) {};
};
class SubsetPartition
@@ -40,7 +44,7 @@ protected:
const HashIntoType kmer);
public:
- SubsetPartition(Hashtable * ht) : next_partition_id(2), _ht(ht)
+ explicit SubsetPartition(Hashtable * ht) : next_partition_id(2), _ht(ht)
{
;
};
diff --git a/lib/test-HashTables.cc b/lib/test-HashTables.cc
deleted file mode 100644
index baa0fd2..0000000
--- a/lib/test-HashTables.cc
+++ /dev/null
@@ -1,150 +0,0 @@
-//
-// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2015. It is licensed under
-// the three-clause BSD license; see LICENSE.
-// Contact: khmer-project at idyll.org
-//
-
-// Simple C++ implementation of the 'load-graph' Python script.
-// Author: Eric A. McDonald
-
-
-#include <cstring>
-#include <cstdio>
-#include <cerrno>
-#include <cstdlib>
-#include <unistd.h>
-#include <getopt.h>
-
-#include <omp.h>
-
-#define HASH_TYPE_TO_TEST 1 // Counting Hash
-//#define HASH_TYPE_TO_TEST 2 // Bit Hash
-
-// #define OUTPUT_HASHTABLE
-
-
-#include "error.hh"
-#include "read_parsers.hh"
-#if HASH_TYPE_TO_TEST == 1
-# include "counting.hh"
-#elif HASH_TYPE_TO_TEST == 2
-# include "hashbits.hh"
-#else
-# error "No HASH_TYPE_TO_TEST macro defined."
-#endif
-#include "primes.hh"
-
-using namespace std;
-using namespace khmer;
-using namespace khmer:: read_parsers;
-
-
-static const char * SHORT_OPTS = "k:N:x:s:";
-
-
-int main( int argc, char * argv[ ] )
-{
- unsigned long kmer_length = 32;
- float ht_size_FP = 1.0E6;
- unsigned long ht_count = 4;
- uint64_t cache_size = 4L * 1024 * 1024 * 1024;
-
- int rc = 0;
- int opt = -1;
- char * conv_residue = NULL;
- string ofile_name;
- string ifile_name;
- // FILE * ofile = NULL;
-
- while (-1 != (opt = getopt( argc, argv, SHORT_OPTS ))) {
-
- switch (opt) {
-
- case 'k':
- kmer_length = strtoul( optarg, &conv_residue, 10 );
- if (!strcmp( optarg, conv_residue )) {
- error( EINVAL, EINVAL, "Invalid kmer length" );
- }
- break;
-
- case 'N':
- ht_count = strtoul( optarg, &conv_residue, 10 );
- if (!strcmp( optarg, conv_residue )) {
- error( EINVAL, EINVAL, "Invalid number of hashtables" );
- }
- break;
-
- case 'x':
- ht_size_FP = strtof( optarg, &conv_residue );
- if (!strcmp( optarg, conv_residue )) {
- error( EINVAL, EINVAL, "Invalid hashtable size" );
- }
- break;
-
- case 's':
- cache_size = strtoull( optarg, &conv_residue, 10 );
- if (!strcmp( optarg, conv_residue )) {
- error( EINVAL, EINVAL, "Invalid cache size" );
- }
- break;
-
- default:
- error( 0, 0, "Skipping unknown arg, '%c'", optopt );
- }
-
- }
-
- if (optind < argc) {
- ofile_name = string( argv[ optind++ ] );
- } else {
- error( EINVAL, 0, "Output file name required" );
- }
-
- if (optind < argc) {
- ifile_name = string( argv[ optind++ ] );
- } else {
- error( EINVAL, 0, "Input file name required" );
- }
-
- HashIntoType ht_size = (HashIntoType)ht_size_FP;
- Primes primetab( ht_size );
- vector<HashIntoType> ht_sizes;
- for ( unsigned int i = 0; i < ht_count; ++i ) {
- ht_sizes.push_back( primetab.get_next_prime( ) );
- }
-
- unsigned int reads_total = 0;
- unsigned long long int n_consumed = 0;
-
- Config &the_config = get_active_config( );
- the_config.set_number_of_threads( omp_get_max_threads( ) );
-
-#if HASH_TYPE_TO_TEST == 1
- CountingHash ht( kmer_length, ht_sizes );
- IParser * parser = IParser:: get_parser(
- ifile_name, the_config.get_number_of_threads( ), cache_size
- );
- #pragma omp parallel shared( reads_total, n_consumed )
- {
- ht.consume_fasta( parser, reads_total, n_consumed );
- }
-#elif HASH_TYPE_TO_TEST == 2
- Hashbits ht( kmer_length, ht_sizes );
- ht.consume_fasta_and_tag( ifile_name, reads_total, n_consumed );
-#endif
-
-#ifdef OUTPUT_HASHTABLE
-#if HASH_TYPE_TO_TEST == 1
- ht.save( ofile_name + ".ht_count" );
-#elif HASH_TYPE_TO_TEST == 2
- ht.save( ofile_name + ".ht_bits" );
- ht.save_tagset( ofile_name + ".tagset" );
-#endif
-#endif
-
- return rc;
-}
-
-
-// vim: set sts=4 sw=4 tw=80:
diff --git a/lib/test-Parser.cc b/lib/test-Parser.cc
deleted file mode 100644
index 1424cef..0000000
--- a/lib/test-Parser.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-//
-// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2015. It is licensed under
-// the three-clause BSD license; see LICENSE.
-// Contact: khmer-project at idyll.org
-//
-
-// Test driver for the Parser class.
-// Author: Eric McDonald
-
-
-#include <cerrno>
-#include <cstring>
-#include <cstdio>
-#include <cstdlib>
-#include <fcntl.h>
-#include <getopt.h>
-
-#include <omp.h>
-
-#include "error.hh"
-#include "read_parsers.hh"
-
-
-// #define WRITE_SUMMARY
-
-
-using namespace khmer;
-using namespace khmer:: read_parsers;
-
-
-// s: Cache Size
-static char const * SHORT_OPTS = "s:";
-
-
-int main( int argc, char * argv[ ] )
-{
- int rc = 0;
- Config &the_config = get_active_config( );
- uint64_t cache_size =
- the_config.get_reads_input_buffer_size( );
- char * ifile_name = NULL;
-
- int opt = -1;
- char * conv_residue = NULL;
- while (-1 != (opt = getopt( argc, argv, SHORT_OPTS ))) {
-
- switch (opt) {
-
- case 's':
- cache_size = strtoull( optarg, &conv_residue, 10 );
- if (!strcmp( optarg, conv_residue )) {
- error( EINVAL, EINVAL, "Invalid cache size" );
- }
- break;
-
- default:
- error( 0, 0, "Skipping unknown arg, '%c'", optopt );
-
- } // option switch
-
- } // getopt loop
-
- if (optind < argc) {
- ifile_name = argv[ optind++ ];
- } else {
- error( EINVAL, 0, "Input file name required" );
- }
- std:: string ifile_name_STRING( ifile_name );
-
- the_config.set_input_buffer_trace_level( TraceLogger:: TLVL_ALL );
- uint32_t number_of_threads = omp_get_max_threads( );
- IParser * parser = IParser:: get_parser(
- ifile_name_STRING, number_of_threads, cache_size,
- TraceLogger:: TLVL_DEBUG6
- );
-
- #pragma omp parallel default( shared )
- {
- uint32_t thread_id = (uint32_t)omp_get_thread_num( );
- Read the_read;
- uint64_t seq_len;
- char ofile_name[ FILENAME_MAX + 1 ];
- FILE * ofile_handle = NULL;
-
-#ifdef WRITE_SUMMARY
- ofile_name[ FILENAME_MAX ] = '\0';
-#endif
-
- fprintf(
- stderr,
- "OMP thread %lu reporting for duty.\n",
- (unsigned long int)thread_id
- );
-
-#ifdef WRITE_SUMMARY
- snprintf(
- ofile_name, FILENAME_MAX, "summary-%lu.log",
- (unsigned long int)thread_id
- );
- ofile_handle = fopen( ofile_name, "w" );
- if (NULL == ofile_handle)
- // TODO: Report an error.
- ;
-#endif
-
- for (uint64_t readnum = 0; !parser->is_complete( ); ++readnum) {
-
- if (0 == readnum % 100000)
- fprintf(
- stderr,
- "OMP thread %lu is on read number %llu.\n",
- (unsigned long int)thread_id,
- (unsigned long long int)readnum
- );
-
- the_read = parser->get_next_read( );
-
-#if (1)
- printf(
- "@%s\n%s\n+\n%s\n",
- the_read.name.c_str( ),
- the_read.sequence.c_str( ),
- the_read.accuracy.c_str( )
- );
-#endif
-
-#ifdef WRITE_SUMMARY
- fflush( ofile_handle );
-#endif
-
- }
-
-#ifdef WRITE_SUMMARY
- fclose( ofile_handle );
-#endif
-
- } // parallel region
-
- delete parser;
- return rc;
-}
-
-
-// vim: set ft=cpp sts=4 sw=4 tw=80:
diff --git a/lib/test-compile.cc b/lib/test-compile.cc
index f0c4bb0..285801f 100644
--- a/lib/test-compile.cc
+++ b/lib/test-compile.cc
@@ -6,9 +6,10 @@
//
// Author: Kevin Murray, spam at kdmurray.id.au
-// This file is used to test compilation with libkhmer.a/libkhmer.so
+// This file is used to test compilation with libkhmer.a/libkhmer.so, after
+// installation
-#include <counting.hh>
+#include <oxli/counting.hh>
int main()
{
diff --git a/lib/trace_logger.cc b/lib/trace_logger.cc
deleted file mode 100644
index 9600d99..0000000
--- a/lib/trace_logger.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-//
-// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2015. It is licensed under
-// the three-clause BSD license; see LICENSE.
-// Contact: khmer-project at idyll.org
-//
-
-#include <fcntl.h>
-
-#include "trace_logger.hh"
-#include "khmer_exception.hh"
-
-namespace khmer
-{
-
-
-#ifdef WITH_INTERNAL_TRACING
-TraceLogger::
-TraceLogger( uint8_t const level, FILE * stream_handle )
- : _level( level ), _shared_stream( true ), _stream_handle( stream_handle )
-{
- if( !(NULL != stream_handle) ) {
- throw khmer_exception();
- }
-}
-#endif
-
-
-TraceLogger::
-TraceLogger( uint8_t const level, char const * const file_name_format, ... )
-#ifdef WITH_INTERNAL_TRACING
- : _level( level ), _shared_stream( false )
-{
- char tfile_name[ FILENAME_MAX + 1 ];
- va_list varargs;
-
- va_start( varargs, file_name_format );
- vsnprintf( tfile_name, FILENAME_MAX, file_name_format, varargs );
- va_end( varargs );
-
- _stream_handle = fopen( tfile_name, "w" );
- if (NULL == _stream_handle) {
- throw InvalidStreamBuffer( );
- }
-
-}
-#else // WITH_INTERNAL_TRACING
-{ }
-#endif // !WITH_INTERNAL_TRACING
-
-
-TraceLogger::
-~TraceLogger( )
-#ifdef WITH_INTERNAL_TRACING
-{
-
- if ((!_shared_stream) && (NULL != _stream_handle)) {
- fclose( _stream_handle );
- _stream_handle = NULL;
- }
-
-}
-#else // WITH_INTERNAL_TRACING
-{ }
-#endif // !WITH_INTERNAL_TRACING
-
-} // namespace khmer
-
-
-// vim: set ft=cpp sts=4 sw=4 tw=80:
diff --git a/lib/trace_logger.hh b/lib/trace_logger.hh
deleted file mode 100644
index 9c24161..0000000
--- a/lib/trace_logger.hh
+++ /dev/null
@@ -1,76 +0,0 @@
-//
-// This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2015. It is licensed under
-// the three-clause BSD license; see LICENSE.
-// Contact: khmer-project at idyll.org
-//
-
-#ifndef TRACE_LOGGER_HH
-#define TRACE_LOGGER_HH
-
-#include <cstdarg>
-#include <cstdio>
-
-#include "khmer.hh"
-
-
-namespace khmer
-{
-
-
-struct TraceLogger {
-
- enum {
- TLVL_ALL = 0,
- TLVL_DEBUG9, TLVL_DEBUG8, TLVL_DEBUG7, TLVL_DEBUG6, TLVL_DEBUG5,
- TLVL_DEBUG4, TLVL_DEBUG3, TLVL_DEBUG2, TLVL_DEBUG1, TLVL_DEBUG0,
- TLVL_INFO9, TLVL_INFO8, TLVL_INFO7, TLVL_INFO6, TLVL_INFO5,
- TLVL_INFO4, TLVL_INFO3, TLVL_INFO2, TLVL_INFO1, TLVL_INFO0,
- TLVL_WARNING = 30,
- TLVL_ERROR = 40,
- TLVL_CRITICAL = 50,
- TLVL_NONE = 255
- };
-#ifdef WITH_INTERNAL_TRACING
- TraceLogger( uint8_t const level, FILE * stream_handle );
-#endif
- TraceLogger(
- uint8_t const level, char const * const file_name_format, ...
- );
- ~TraceLogger( );
-
- inline void operator( )(
- uint8_t const level, char const * const format, ...
- ) const
-#ifdef WITH_INTERNAL_TRACING
- {
- va_list varargs;
-
- if (_level <= level) {
- va_start( varargs, format );
- vfprintf( _stream_handle, format, varargs );
- va_end( varargs );
- fflush( _stream_handle );
- }
-
- }
-#else // WITH_INTERNAL_TRACING
- { }
-#endif // !WITH_INTERNAL_TRACING
-
-private:
-#ifdef WITH_INTERNAL_TRACING
- bool _shared_stream;
- uint8_t _level;
- FILE * _stream_handle;
-#endif
-};
-
-
-
-} // namespace khmer
-
-
-#endif // TRACE_LOGGER_HH
-
-// vim: set ft=cpp sts=4 sw=4 tw=80:
diff --git a/oxli/__init__.py b/oxli/__init__.py
index 742c4fc..53e60ee 100755
--- a/oxli/__init__.py
+++ b/oxli/__init__.py
@@ -11,6 +11,7 @@ Single entry point script for khmer
"""
import argparse
+import sys
import textwrap
from khmer import khmer_args
from oxli import build_graph
@@ -48,7 +49,10 @@ def main():
"""
main function; does the parsing and kicks off the subcommand
"""
- args = get_parser().parse_args()
+ if (len(sys.argv) < 2):
+ args = get_parser().parse_args(['--help'])
+ else:
+ args = get_parser().parse_args()
args.func(args)
if __name__ == '__main__':
diff --git a/oxli/build_graph.py b/oxli/build_graph.py
index 43b5a33..adf4de6 100644
--- a/oxli/build_graph.py
+++ b/oxli/build_graph.py
@@ -5,7 +5,7 @@
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project at idyll.org
#
-# pylint: disable=invalid-name,missing-docstring
+# pylint: disable=missing-docstring
"""
Build a graph from the given sequences, save in <ptname>.
@@ -20,7 +20,8 @@ import sys
import khmer
from khmer import khmer_args
-from khmer.khmer_args import (report_on_config, info, add_threading_args)
+from khmer.khmer_args import (report_on_config, info, add_threading_args,
+ calculate_tablesize)
from khmer.kfile import check_input_files, check_space
from khmer.kfile import check_space_for_hashtable
from oxli import functions
@@ -51,8 +52,11 @@ def main(args):
for fname in args.input_filenames:
check_input_files(fname, args.force)
- check_space(args.input_filenames, args.force)
- check_space_for_hashtable(args, 'nodegraph', args.force)
+ # if optimization args are given, do optimization
+ args = functions.do_sanity_checking(args, 0.01)
+
+ tablesize = calculate_tablesize(args, 'nodegraph')
+ check_space_for_hashtable(args.output_filename, tablesize, args.force)
print('Saving k-mer presence table to %s' % base, file=sys.stderr)
print('Loading kmers from sequences in %s' %
diff --git a/oxli/functions.py b/oxli/functions.py
index e429efd..5b72be4 100644
--- a/oxli/functions.py
+++ b/oxli/functions.py
@@ -1,3 +1,7 @@
+"""
+A collection of functions for use throughout khmer/oxli
+"""
+
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
@@ -6,51 +10,72 @@
#
+from __future__ import print_function
from collections import namedtuple
import threading
import math
import khmer.utils
+import sys
+
+
+def optimal_size(num_kmers, mem_cap=None, fp_rate=None):
+ """
+ Utility function for estimating optimal counting table args where:
+ - num_kmers: number of unique kmers [required]
+ - mem_cap: the allotted amount of memory [optional, conflicts with f]
+ - fp_rate: the desired false positive rate [optional, conflicts with M]
+ """
+ if all((num_kmers is not None, mem_cap is not None, fp_rate is None)):
+ return estimate_optimal_with_K_and_M(num_kmers, mem_cap)
+ elif all((num_kmers is not None, mem_cap is None, fp_rate is not None)):
+ return estimate_optimal_with_K_and_f(num_kmers, fp_rate)
+ else:
+ raise TypeError("num_kmers and either mem_cap or fp_rate"
+ " must be defined.")
-def estimate_optimal_with_N_and_M(N, M):
+def estimate_optimal_with_K_and_M(num_kmers, mem_cap):
"""
- Utility function for estimating optimal counting table args where N is the
- number of unique kmer and M is the allotted amount of memory
+ Utility function for estimating optimal counting table args where num_kmers
+    is the number of unique kmers and mem_cap is the allotted amount of memory
"""
- Z = math.log(2)*(M/float(N))
- intZ = int(Z)
- if intZ == 0:
- intZ = 1
- H = int(M/intZ)
- M = H*intZ
- f2 = (1-math.exp(-N/float(H)))**intZ
+
+ n_tables = math.log(2) * (mem_cap / float(num_kmers))
+ int_n_tables = int(n_tables)
+ if int_n_tables == 0:
+ int_n_tables = 1
+ ht_size = int(mem_cap / int_n_tables)
+ mem_cap = ht_size * int_n_tables
+ fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables
res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
"fp_rate"])
- return res(intZ, H, M, f2)
+ return res(int_n_tables, ht_size, mem_cap, fp_rate)
-def estimate_optimal_with_N_and_f(N, f):
+def estimate_optimal_with_K_and_f(num_kmers, des_fp_rate):
"""
- Utility function for estimating optimal memory where N is the number of
- unique kmers and f is the desired false positive rate
+ Utility function for estimating optimal memory where num_kmers is the
+ number of unique kmers and des_fp_rate is the desired false positive rate
"""
- Z = math.log(f, 0.5)
- intZ = int(Z)
- if intZ == 0:
- intZ = 1
+ n_tables = math.log(des_fp_rate, 0.5)
+ int_n_tables = int(n_tables)
+ if int_n_tables == 0:
+ int_n_tables = 1
- H1 = int(-N/(math.log(1-f**(1/float(intZ)))))
- M1 = H1 * intZ
- f1 = (1-math.exp(-N/float(H1)))**intZ
+ ht_size = int(-num_kmers / (
+ math.log(1 - des_fp_rate ** (1 / float(int_n_tables)))))
+ mem_cap = ht_size * int_n_tables
+ fp_rate = (1 - math.exp(-num_kmers / float(ht_size))) ** int_n_tables
res = namedtuple("result", ["num_htables", "htable_size", "mem_use",
"fp_rate"])
- return res(intZ, H1, M1, f1)
+ return res(int_n_tables, ht_size, mem_cap, fp_rate)
def optimal_args_output_gen(unique_kmers, fp_rate):
"""
Assembles output string for optimal arg sandbox scripts
+ takes in unique_kmers and desired fp_rate
"""
to_print = []
@@ -63,8 +88,12 @@ def optimal_args_output_gen(unique_kmers, fp_rate):
'expected_memory_usage')
for fp_rate in range(1, 10):
- Z, H, M, f = estimate_optimal_with_N_and_f(unique_kmers, fp_rate/10.0)
- to_print.append('{:11.3f}\t{:19}\t{:17e}\t{:21e}'.format(f, Z, H, M))
+ num_tables, table_size, mem_cap, fp_rate = \
+ optimal_size(unique_kmers, fp_rate=fp_rate / 10.0)
+ to_print.append('{:11.3f}\t{:19}\t{:17e}\t{:21e}'.format(fp_rate,
+ num_tables,
+ table_size,
+ mem_cap))
mem_list = [1, 5, 10, 20, 50, 100, 200, 300, 400, 500, 1000, 2000, 5000]
@@ -74,15 +103,61 @@ def optimal_args_output_gen(unique_kmers, fp_rate):
'size_hashtable(H)\texpected_fp')
for mem in mem_list:
- Z, H, M, f = estimate_optimal_with_N_and_M(unique_kmers,
- mem*1000000000)
- to_print.append('{:21e}\t{:19}\t{:17e}\t{:11.3f}'.format(M, Z, H, f))
+ num_tables, table_size, mem_cap, fp_rate =\
+ optimal_size(unique_kmers, mem_cap=mem * 1000000000)
+ to_print.append('{:21e}\t{:19}\t{:17e}\t{:11.3f}'.format(mem_cap,
+ num_tables,
+ table_size,
+ fp_rate))
return "\n".join(to_print)
+def do_sanity_checking(args, desired_max_fp):
+ """
+    Simple function to check that the restrictions in the args (if there
+    are any) make sense; if not, complain. If no restrictions are given,
+    add some that make sense.
+    Takes in args and the desired max FP rate.
+ """
+ # if optimization args are given, do optimization
+ if args.unique_kmers != 0:
+ if args.max_memory_usage:
+ # verify that this is a sane memory usage restriction
+ res = estimate_optimal_with_K_and_M(args.unique_kmers,
+ args.max_memory_usage)
+ if res.fp_rate > desired_max_fp:
+ print("""
+*** ERROR: The given restrictions yield an estimated false positive rate of {0},
+*** which is above the recommended false positive ceiling of {1}!"""
+ .format(res.fp_rate, desired_max_fp), file=sys.stderr)
+ if not args.force:
+ print("NOTE: This can be overridden using the --force"
+ " argument", file=sys.stderr)
+ print("*** Aborting...!", file=sys.stderr)
+ sys.exit(1)
+ else:
+ res = estimate_optimal_with_K_and_f(args.unique_kmers,
+ desired_max_fp)
+ if args.max_tablesize and args.max_tablesize < res.htable_size:
+ print("*** Warning: The given tablesize is too small!",
+ file=sys.stderr)
+ print("*** Estimating false positive rate to be {0}".format(
+ res.fp_rate), file=sys.stderr)
+ else:
+ print("*** INFO: set memory ceiling using auto optimization.",
+ file=sys.stderr)
+ print("*** Ceiling is: {0} bytes\n".format(res.mem_use),
+ file=sys.stderr)
+                args.max_memory_usage = res.mem_use
+
+ return args
+
+
def build_graph(ifilenames, graph, num_threads=1, tags=False):
"""
Algorithm to construct a counting graph from a set of input files
+    takes in a list of input files and an existing graph;
+    optionally, the number of threads and whether tags should be added
"""
if tags:
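For reference, the renamed helpers implement the standard Bloom-filter sizing relations; restated in LaTeX (with N = num_kmers, M = mem_cap in bytes at one byte per bin, Z = number of tables, H = bins per table, f = false-positive rate):

    Z = \left\lfloor \ln 2 \cdot \frac{M}{N} \right\rfloor \ (\geq 1), \quad
    H = \left\lfloor \frac{M}{Z} \right\rfloor, \quad
    f = \left(1 - e^{-N/H}\right)^{Z}

    \text{solving instead for a target } f: \quad
    Z = \left\lfloor \log_{1/2} f \right\rfloor \ (\geq 1), \quad
    H = \left\lfloor \frac{-N}{\ln\left(1 - f^{1/Z}\right)} \right\rfloor

optimal_size() simply dispatches to one form or the other, depending on whether mem_cap or fp_rate is supplied.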
diff --git a/sandbox/Makefile.read_aligner_training b/sandbox/Makefile.read_aligner_training
new file mode 100644
index 0000000..2c2419c
--- /dev/null
+++ b/sandbox/Makefile.read_aligner_training
@@ -0,0 +1,26 @@
+KHMER= ../khmer
+
+KSIZE= 30
+HASH_SIZE= 4e8
+N_HASHES= 4
+
+all: estimated_probabilities.${KSIZE}.json
+
+mockRefG.fa:
+ wget https://github.com/dib-lab/khmer-testdata/raw/master/mockRefG.fa
+
+%.fastq.gz:
+ wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR172/SRR172903/SRR172903.fastq.gz
+ wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR172/SRR172902/SRR172902.fastq.gz
+
+combined_reads.ht: SRR172902.fastq.gz SRR172903.fastq.gz
+ python $(KHMER)/scripts/load-into-counting.py --ksize $(KSIZE) -x $(HASH_SIZE) -N $(N_HASHES) $@ $^
+
+mockRefG.1.bt2: mockRefG.fa
+	bowtie2-build mockRefG.fa mockRefG
+
+combined_reads_mapping.bam: SRR172902.fastq.gz SRR172903.fastq.gz mockRefG.1.bt2
+	bowtie2 -x mockRefG -U SRR172902.fastq.gz,SRR172903.fastq.gz | samtools view -S -F4 -b - > combined_reads_mapping.bam
+
+estimated_probabilities.${KSIZE}.json: combined_reads.ht combined_reads_mapping.bam
+ python readaligner_pairhmm_train.py --json combined_reads.ht combined_reads_mapping.bam > estimated_probabilities.${KSIZE}.json
diff --git a/sandbox/README.rst b/sandbox/README.rst
index 48af420..36ffa88 100644
--- a/sandbox/README.rst
+++ b/sandbox/README.rst
@@ -10,9 +10,11 @@ We are still triaging and documenting the various scripts.
----
-Awaiting promotion to sandbox:
+Awaiting promotion to scripts:
* calc-error-profile.py - calculate a per-base "error profile" for shotgun sequencing data, w/o a reference. (Used/tested in `2014 paper on semi-streaming algorithms <https://github.com/ged-lab/2014-streaming/blob/master/>`__)
+* count-kmers.py - output k-mer counts for multiple input files.
+* count-kmers-single.py - output k-mer counts for a single k-mer file.
* correct-errors.py - streaming error correction.
* unique-kmers.py - estimate the number of k-mers present in a file with the HyperLogLog low-memory probabilistic cardinality estimation algorithm.
diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py
index 6686d73..1a12b3b 100755
--- a/sandbox/build-sparse-graph.py
+++ b/sandbox/build-sparse-graph.py
@@ -19,7 +19,7 @@ def main():
input_fasta = sys.argv[3]
K = int(sys.argv[1])
x = float(sys.argv[2])
-
+
ht = khmer.Hashbits(K, x, 4)
sparse_graph = gt.Graph()
diff --git a/sandbox/calc-best-assembly.py b/sandbox/calc-best-assembly.py
index 931a24d..e370647 100755
--- a/sandbox/calc-best-assembly.py
+++ b/sandbox/calc-best-assembly.py
@@ -42,22 +42,23 @@ def main():
stats.append((total, filename))
if not args.quiet:
- print("assembly %s has %d bp > %d" % (filename,
- total,
- args.cutoff), file=sys.stderr)
+ print("assembly %s has %d bp > %d" % (filename, total,
+ args.cutoff),
+ file=sys.stderr)
stats.sort(reverse=True)
best_total, winner_file = stats[0]
print('----', file=sys.stderr)
print("assembly %s wins: %d total bp > %d" % (winner_file,
- best_total,
- args.cutoff), file=sys.stderr)
+ best_total,
+ args.cutoff),
+ file=sys.stderr)
if args.output_file:
- for record in screed.open(winner_file, parse_description=False):
- print('>%s\n%s' % (record.name,
- record.sequence), file=args.output_file)
+ for record in screed.open(winner_file):
+ print('>%s\n%s' % (record.name, record.sequence),
+ file=args.output_file)
print(winner_file)
diff --git a/sandbox/collect-reads.py b/sandbox/collect-reads.py
index f02c0ea..ca29727 100755
--- a/sandbox/collect-reads.py
+++ b/sandbox/collect-reads.py
@@ -21,7 +21,8 @@ import sys
import textwrap
import khmer
from khmer import khmer_args
-from khmer.khmer_args import build_counting_args, report_on_config, info
+from khmer.khmer_args import (build_counting_args, report_on_config, info,
+ calculate_tablesize)
from khmer.kfile import check_input_files, check_space
from khmer.kfile import check_space_for_hashtable
import argparse
@@ -54,7 +55,7 @@ def get_parser():
"sequence files.")
parser.add_argument('--report-total-kmers', '-t', action='store_true',
help="Prints the total number of k-mers to stderr")
- parser.add_argument('-C', '--coverage', type=int,
+ parser.add_argument('-C', '--coverage', type=int, default=50,
help='Collect reads until this coverage, then exit.')
parser.add_argument('-o', '--output', type=argparse.FileType('w'),
help='Write collect reads into this file.')
@@ -77,7 +78,9 @@ def main():
check_input_files(name, False)
check_space(args.input_sequence_filename, False)
- check_space_for_hashtable(args, 'countgraph', False)
+ tablesize = calculate_tablesize(args, 'countgraph')
+ check_space_for_hashtable(args.output_countingtable_filename, tablesize,
+ False)
print('Saving k-mer counting table to %s' % base)
print('Loading sequences from %s' % repr(filenames))
diff --git a/sandbox/collect-variants.py b/sandbox/collect-variants.py
index db368f0..57af85d 100755
--- a/sandbox/collect-variants.py
+++ b/sandbox/collect-variants.py
@@ -46,15 +46,15 @@ def main():
print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
print(' - n hashes = %d \t\t(-N)' % args.n_tables, file=sys.stderr)
print(' - min hashsize = %-5.2g \t(-x)' % \
- args.min_tablesize, file=sys.stderr)
+ args.max_tablesize, file=sys.stderr)
print('', file=sys.stderr)
print('Estimated memory usage is %.2g bytes ' \
'(n_hashes x min_hashsize)' % \
- (args.n_tables * args.min_tablesize), file=sys.stderr)
+ (args.n_tables * args.max_tablesize), file=sys.stderr)
print('-' * 8, file=sys.stderr)
K = args.ksize
- HT_SIZE = args.min_tablesize
+ HT_SIZE = args.max_tablesize
N_HT = args.n_tables
filenames = args.input_filenames
diff --git a/sandbox/correct-errors.py b/sandbox/correct-errors.py
deleted file mode 100755
index 71c6890..0000000
--- a/sandbox/correct-errors.py
+++ /dev/null
@@ -1,219 +0,0 @@
-#! /usr/bin/env python
-#
-# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2015. It is licensed under
-# the three-clause BSD license; see LICENSE.
-# Contact: khmer-project at idyll.org
-#
-"""
-Streaming error correction.
-
-% python sandbox/correct-errors.py [ <data1> [ <data2> [ ... ] ] ]
-
-Use -h for parameter help.
-
-TODO: paired support: paired reads should be kept together.
-TODO: load/save counting table.
-TODO: move output_single elsewhere
-TODO: add to sandbox/README
-TODO: change name to correct-reads?
-"""
-from __future__ import print_function
-import sys
-import screed
-import os
-import khmer
-import argparse
-import tempfile
-import shutil
-
-DEFAULT_NORMALIZE_LIMIT = 20
-DEFAULT_CUTOFF = 2
-
-DEFAULT_K = 32
-DEFAULT_N_HT = 4
-DEFAULT_MIN_HASHSIZE = 1e6
-
-
-def output_single(read, new_sequence):
- name = read.name
- sequence = new_sequence
-
- quality = None
- if hasattr(read, 'quality'):
- quality = read.quality[:len(sequence)]
-
- # in cases where sequence _lengthened_, need to truncate it to
- # match the quality score length.
- sequence = sequence[:len(quality)]
-
- if quality:
- assert len(sequence) == len(quality), (sequence, quality)
- return "@%s\n%s\n+\n%s\n" % (name, sequence, quality)
- else:
- return ">%s\n%s\n" % (name, sequence)
-
-
-def main():
- parser = argparse.ArgumentParser(description='XXX')
-
- env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
- env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
- env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE)
-
- parser.add_argument('--ksize', '-k', type=int, dest='ksize',
- default=env_ksize,
- help='k-mer size to use')
- parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
- default=env_n_hashes,
- help='number of hash tables to use')
- parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize',
- default=env_hashsize,
- help='lower bound on hashsize to use')
-
- parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
- default=DEFAULT_CUTOFF)
- parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
-
- parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
- help='base cutoff on median k-mer abundance of this',
- default=DEFAULT_NORMALIZE_LIMIT)
-
- parser.add_argument('--tempdir', '-T', type=str, dest='tempdir',
- default='./')
-
- parser.add_argument('input_filenames', nargs='+')
- args = parser.parse_args()
-
- K = args.ksize
- HT_SIZE = args.min_hashsize
- N_HT = args.n_hashes
-
- NORMALIZE_LIMIT = args.normalize_to
-
- print('making hashtable')
- ht = khmer.CountingHash(K, HT_SIZE, N_HT)
-
- aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)
-
- tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
- print('created temporary directory %s; use -T to change location' % tempdir)
-
- ###
-
- save_pass2 = 0
- n_aligned = 0
- n_corrected = 0
- total_reads = 0
-
- pass2list = []
- for filename in args.input_filenames:
- pass2filename = os.path.basename(filename) + '.pass2'
- pass2filename = os.path.join(tempdir, pass2filename)
- corrfilename = os.path.basename(filename) + '.corr'
-
- pass2list.append((filename, pass2filename, corrfilename))
-
- pass2fp = open(pass2filename, 'w')
- corrfp = open(corrfilename, 'w')
-
- for n, read in enumerate(screed.open(filename)):
- total_reads += 1
-
- if n % 10000 == 0:
- print('...', n, filename, n_aligned, n_corrected, save_pass2, \
- total_reads)
- seq = read.sequence.replace('N', 'A')
-
- # build the alignment...
- score, graph_alignment, read_alignment, truncated = \
- aligner.align(read.sequence)
-
-            # next, decide whether or not to keep it.
- output_corrected = False
- if not truncated:
- n_aligned += 1
-
- # build a better sequence -- this is the corrected one.
- if True:
- graph_seq = graph_alignment.replace("-", "")
- else:
- graph_seq = ""
- for i in range(len(graph_alignment)):
- if graph_alignment[i] == "-":
- graph_seq += read_alignment[i]
- else:
- graph_seq += graph_alignment[i]
-
- corrected = graph_seq
- if graph_seq != read.sequence:
- n_corrected += 1
-
- # get the minimum count for this new sequence
- mincount = ht.get_min_count(graph_seq)
- if mincount < args.normalize_to:
- output_corrected = True
-
- # has this portion of the graph saturated? if not,
- # consume & save => pass2.
- if output_corrected:
- corrfp.write(output_single(read, corrected))
- else: # uncorrected...
- ht.consume(read.sequence)
- pass2fp.write(output_single(read, read.sequence))
- save_pass2 += 1
-
- pass2fp.close()
- corrfp.close()
-
- print('%s: kept aside %d of %d from first pass, in %s' % \
- (filename, save_pass2, n, filename))
- print('aligned %d of %d reads so far' % (n_aligned, total_reads))
- print('changed %d of %d reads so far' % (n_corrected, total_reads))
-
- for orig_filename, pass2filename, corrfilename in pass2list:
- print('second pass: looking at sequences kept aside in %s' % \
- pass2filename)
- for n, read in enumerate(screed.open(pass2filename)):
- if n % 10000 == 0:
- print('... x 2', n, pass2filename, n_aligned, n_corrected, \
- total_reads)
-
- corrfp = open(corrfilename, 'a')
-
- # build the alignment...
- score, graph_alignment, read_alignment, truncated = \
- aligner.align(read.sequence)
-
- if truncated: # no good alignment; output original
- corrected = read.sequence
- else:
- n_aligned += 1
- # build a better sequence -- this is the corrected one.
- if True:
- graph_seq = graph_alignment.replace("-", "")
- else:
- graph_seq = ""
- for i in range(len(graph_alignment)):
- if graph_alignment[i] == "-":
- graph_seq += read_alignment[i]
- else:
- graph_seq += graph_alignment[i]
-
- corrected = graph_seq
- if corrected != read.sequence:
- n_corrected += 1
-
- corrfp.write(output_single(read, corrected))
-
- print('removing %s' % pass2filename)
- os.unlink(pass2filename)
-
- print('removing temp directory & contents (%s)' % tempdir)
- shutil.rmtree(tempdir)
-
- print('Aligned %d of %d total' % (n_aligned, total_reads))
- print('Changed %d of %d total' % (n_corrected, total_reads))
-
-if __name__ == '__main__':
- main()
diff --git a/scripts/trim-low-abund.py b/sandbox/correct-reads.py
similarity index 52%
copy from scripts/trim-low-abund.py
copy to sandbox/correct-reads.py
index 741b181..a86f9be 100755
--- a/scripts/trim-low-abund.py
+++ b/sandbox/correct-reads.py
@@ -1,20 +1,21 @@
-#! /usr/bin/env python
+#! /usr/bin/env python2
#
-# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
"""
-Trim sequences at k-mers of the given abundance, using a streaming algorithm.
+Semi-streaming error correction.
-Output sequences will be placed in 'infile.abundtrim'.
+Output sequences will be placed in 'infile.corr'.
-% python scripts/trim-low-abund.py [ <data1> [ <data2> [ ... ] ] ]
+% python sandbox/correct-reads.py [ <data1> [ <data2> [ ... ] ] ]
Use -h for parameter help.
+
+TODO: add to sandbox/README.
"""
-from __future__ import print_function
import sys
import screed
import os
@@ -24,9 +25,6 @@ import shutil
import textwrap
import argparse
-from screed import Record
-from khmer import khmer_args
-
from khmer.khmer_args import (build_counting_args, info, add_loadhash_args,
report_on_config)
from khmer.utils import write_record, write_record_pair, broken_paired_reader
@@ -37,46 +35,52 @@ DEFAULT_NORMALIZE_LIMIT = 20
DEFAULT_CUTOFF = 2
-def trim_record(read, trim_at):
- new_read = Record()
- new_read.name = read.name
- new_read.sequence = read.sequence[:trim_at]
- if hasattr(read, 'quality'):
- new_read.quality = read.quality[:trim_at]
+def correct_sequence(aligner, sequence):
+ # align to graph.
+ score, graph_alignment, read_alignment, truncated = \
+ aligner.align(sequence)
+
+ # next, decide whether or not to keep it.
+ output_corrected = False
+ if not truncated:
+ graph_seq = graph_alignment.replace("-", "")
+ return True, graph_seq
+
+ return False, sequence
+
+
+def fix_quality(record):
+ if len(record.sequence) < len(record.quality):
+ record.quality = record.quality[:len(record.sequence)]
- return new_read
+ while len(record.sequence) > len(record.quality):
+ record.quality += 'I' # @CTB hack
def get_parser():
epilog = """
- The output is one file for each input file, <input file>.abundtrim, placed
- in the current directory. This output contains the input sequences
- trimmed at low-abundance k-mers.
-
- The ``-V/--variable-coverage`` parameter will, if specified,
- prevent elimination of low-abundance reads by only trimming
- low-abundance k-mers from high-abundance reads; use this for
- non-genomic data sets that may have variable coverage.
-
- Note that the output reads will not necessarily be in the same order
- as the reads in the input files; if this is an important consideration,
- use ``load-into-counting.py`` and ``filter-abund.py``. However, read
- pairs will be kept together, in "broken-paired" format; you can use
+ The output is one file for each input file, <input file>.corr, placed
+ in the current directory. This output contains the input sequences,
+ corrected at low-abundance k-mers.
+
+ Note that the output reads will not necessarily be in the same
+ order as the reads in the input files. However, read pairs will be
+ kept together, in "broken-paired" format; you can use
``extract-paired-reads.py`` to extract read pairs and orphans.
Example::
- trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
+ correct-reads.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
"""
parser = build_counting_args(
- descr='Trim low-abundance k-mers using a streaming algorithm.',
+ descr='Correct reads using a semi-streaming algorithm.',
epilog=textwrap.dedent(epilog))
parser.add_argument('input_filenames', nargs='+')
parser.add_argument('--cutoff', '-C', type=int,
- help='remove k-mers below this abundance',
+ help='k-mers below this abundance are not trusted',
default=DEFAULT_CUTOFF)
parser.add_argument('--normalize-to', '-Z', type=int,
@@ -92,8 +96,7 @@ def get_parser():
parser.add_argument('--variable-coverage', '-V', action='store_true',
default=False,
- help='Only trim low-abundance k-mers from sequences '
- 'that have high coverage.')
+ help='Only correct sequences that have high coverage.')
add_loadhash_args(parser)
parser.add_argument('-s', '--savetable', metavar="filename", default='',
@@ -104,20 +107,21 @@ def get_parser():
parser.add_argument('--force', default=False, action='store_true')
parser.add_argument('--ignore-pairs', default=False, action='store_true')
parser.add_argument('--tempdir', '-T', type=str, default='./')
+ parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
return parser
def main():
- info('trim-low-abund.py', ['streaming'])
+ info('correct-reads.py', ['streaming'])
parser = get_parser()
args = parser.parse_args()
###
if len(set(args.input_filenames)) != len(args.input_filenames):
- print("Error: Cannot input the same filename multiple times.",
- file=sys.stderr)
+ print >>sys.stderr, \
+ "Error: Cannot input the same filename multiple times."
sys.exit(1)
###
@@ -126,22 +130,26 @@ def main():
check_valid_file_exists(args.input_filenames)
check_space(args.input_filenames, args.force)
if args.savetable:
- check_space_for_hashtable(args, 'countgraph', args.force)
+ check_space_for_hashtable(
+ args.n_tables * args.min_tablesize, args.force)
- if args.loadtable:
- print('loading countgraph from', args.loadtable, file=sys.stderr)
- ct = khmer.load_counting_hash(args.loadtable)
- else:
- print('making countgraph', file=sys.stderr)
- ct = khmer_args.create_countgraph(args)
+ K = args.ksize
- K = ct.ksize()
CUTOFF = args.cutoff
NORMALIZE_LIMIT = args.normalize_to
+ if args.loadtable:
+ print >>sys.stderr, 'loading k-mer counting table from', args.loadtable
+ ct = khmer.load_counting_hash(args.loadtable)
+ else:
+ print >>sys.stderr, 'making k-mer counting table'
+ ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)
+
tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
- print('created temporary directory %s; '
- 'use -T to change location' % tempdir, file=sys.stderr)
+ print >>sys.stderr, 'created temporary directory %s; ' \
+ 'use -T to change location' % tempdir
+
+ aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)
# ### FIRST PASS ###
@@ -151,18 +159,18 @@ def main():
n_reads = 0
written_bp = 0
written_reads = 0
- trimmed_reads = 0
+ corrected_reads = 0
pass2list = []
for filename in args.input_filenames:
pass2filename = os.path.basename(filename) + '.pass2'
pass2filename = os.path.join(tempdir, pass2filename)
if args.out is None:
- trimfp = open(os.path.basename(filename) + '.abundtrim', 'w')
+ corrfp = open(os.path.basename(filename) + '.corr', 'w')
else:
- trimfp = args.out
+ corrfp = args.out
- pass2list.append((filename, pass2filename, trimfp))
+ pass2list.append((filename, pass2filename, corrfp))
screed_iter = screed.open(filename, parse_description=False)
pass2fp = open(pass2filename, 'w')
@@ -174,8 +182,8 @@ def main():
force_single=args.ignore_pairs)
for n, is_pair, read1, read2 in paired_iter:
if n % 10000 == 0:
- print('...', n, filename, save_pass2, n_reads, n_bp,
- written_reads, written_bp, file=sys.stderr)
+ print >>sys.stderr, '...', n, filename, save_pass2, \
+ n_reads, n_bp, written_reads, written_bp
# we want to track paired reads here, to make sure that pairs
# are not split between first pass and second pass.
@@ -196,23 +204,26 @@ def main():
write_record_pair(read1, read2, pass2fp)
save_pass2 += 2
else:
- _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
- _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)
-
- if trim_at1 >= K:
- read1 = trim_record(read1, trim_at1)
-
- if trim_at2 >= K:
- read2 = trim_record(read2, trim_at2)
-
- if trim_at1 != len(seq1):
- trimmed_reads += 1
- if trim_at2 != len(seq2):
- trimmed_reads += 1
-
- write_record_pair(read1, read2, trimfp)
+ is_aligned, new_seq1 = correct_sequence(aligner, seq1)
+ if is_aligned:
+ if new_seq1 != read1.sequence:
+ corrected_reads += 1
+ read1.sequence = new_seq1
+ if hasattr(read1, 'quality'):
+ fix_quality(read1)
+
+ is_aligned, new_seq2 = correct_sequence(aligner, seq2)
+ if is_aligned:
+ if new_seq2 != read2.sequence:
+ corrected_reads += 1
+ read2.sequence = new_seq2
+ if hasattr(read2, 'quality'):
+ fix_quality(read2)
+
+ write_record_pair(read1, read2, corrfp)
written_reads += 2
- written_bp += trim_at1 + trim_at2
+ written_bp += len(read1.sequence)
+ written_bp += len(read2.sequence)
else:
n_reads += 1
n_bp += len(read1.sequence)
@@ -228,32 +239,32 @@ def main():
write_record(read1, pass2fp)
save_pass2 += 1
else: # trim!!
- _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
- if trim_at >= K:
- new_read = trim_record(read1, trim_at)
- write_record(new_read, trimfp)
+ is_aligned, new_seq = correct_sequence(aligner, seq)
+ if is_aligned:
+ if new_seq != read1.sequence:
+ corrected_reads += 1
+ read1.sequence = new_seq
+ if hasattr(read1, 'quality'):
+ fix_quality(read1)
- written_reads += 1
- written_bp += trim_at
+ write_record(read1, corrfp)
- if trim_at != len(read1.sequence):
- trimmed_reads += 1
+ written_reads += 1
+ written_bp += len(new_seq)
pass2fp.close()
- print('%s: kept aside %d of %d from first pass, in %s' %
- (filename, save_pass2, n, filename),
- file=sys.stderr)
+ print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
+ % (filename, save_pass2, n, filename)
save_pass2_total += save_pass2
# ### SECOND PASS. ###
skipped_n = 0
skipped_bp = 0
- for _, pass2filename, trimfp in pass2list:
- print('second pass: looking at sequences kept aside in %s' %
- pass2filename,
- file=sys.stderr)
+ for _, pass2filename, corrfp in pass2list:
+ print >>sys.stderr, ('second pass: looking at sequences kept aside '
+ 'in %s') % pass2filename
# note that for this second pass, we don't care about paired
# reads - they will be output in the same order they're read in,
@@ -263,73 +274,72 @@ def main():
for n, read in enumerate(screed.open(pass2filename,
parse_description=False)):
if n % 10000 == 0:
- print('... x 2', n, pass2filename,
- written_reads, written_bp, file=sys.stderr)
+ print >>sys.stderr, '... x 2', n, pass2filename, \
+ written_reads, written_bp
seq = read.sequence.replace('N', 'A')
med, _, _ = ct.get_median_count(seq)
# do we retain low-abundance components unchanged?
if med < NORMALIZE_LIMIT and args.variable_coverage:
- write_record(read, trimfp)
+ write_record(read, corrfp)
written_reads += 1
written_bp += len(read.sequence)
skipped_n += 1
skipped_bp += len(read.sequence)
- # otherwise, examine/trim/truncate.
+ # otherwise, examine/correct.
else: # med >= NORMALIZE LIMIT or not args.variable_coverage
- _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
- if trim_at >= K:
- new_read = trim_record(read, trim_at)
- write_record(new_read, trimfp)
+ is_aligned, new_seq = correct_sequence(aligner, seq)
+ if is_aligned:
+ if new_seq != read.sequence:
+ corrected_reads += 1
+ read.sequence = new_seq
+ if hasattr(read, 'quality'):
+ fix_quality(read)
+ write_record(read, corrfp)
written_reads += 1
- written_bp += trim_at
-
- if trim_at != len(read.sequence):
- trimmed_reads += 1
+ written_bp += len(new_seq)
- print('removing %s' % pass2filename, file=sys.stderr)
+ print >>sys.stderr, 'removing %s' % pass2filename
os.unlink(pass2filename)
- print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr)
+ print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
shutil.rmtree(tempdir)
n_passes = 1.0 + (float(save_pass2_total) / n_reads)
- percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
+ percent_reads_corrected = float(corrected_reads +
+ (n_reads - written_reads)) /\
n_reads * 100.0
- print('read %d reads, %d bp' % (n_reads, n_bp,))
- print('wrote %d reads, %d bp' % (written_reads, written_bp,))
- print('looked at %d reads twice (%.2f passes)' % (save_pass2_total,
- n_passes))
- print('removed %d reads and trimmed %d reads (%.2f%%)' %
- (n_reads - written_reads, trimmed_reads, percent_reads_trimmed))
- print('trimmed or removed %.2f%% of bases (%d total)' %
- ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp))
+ print >>sys.stderr, 'read %d reads, %d bp' % (n_reads, n_bp,)
+ print >>sys.stderr, 'wrote %d reads, %d bp' % (written_reads, written_bp,)
+ print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
+ (save_pass2_total, n_passes)
+ print >>sys.stderr, 'removed %d reads and corrected %d reads (%.2f%%)' % \
+ (n_reads - written_reads, corrected_reads, percent_reads_corrected)
+ print >>sys.stderr, 'removed %.2f%% of bases (%d total)' % \
+ ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)
if args.variable_coverage:
percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
- print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
- percent_reads_hicov),
- file=sys.stderr)
- print('skipped %d reads/%d bases because of low coverage' %
- (skipped_n, skipped_bp),
- file=sys.stderr)
+ print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
+ (n_reads - skipped_n, percent_reads_hicov)
+ print >>sys.stderr, ('skipped %d reads/%d bases because of low '
+ 'coverage') % (skipped_n, skipped_bp)
fp_rate = \
khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
# for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
- print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
- file=sys.stderr)
+ print >>sys.stderr, \
+ 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)
- print('output in *.abundtrim', file=sys.stderr)
+ print >>sys.stderr, 'output in *.corr'
if args.savetable:
- print("Saving k-mer counting table to",
- args.savetable, file=sys.stderr)
+ print >>sys.stderr, "Saving k-mer counting table to", args.savetable
ct.save(args.savetable)
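
Illustrative sketch (not from the commit): the two helpers above combine per
record roughly as follows, assuming the khmer ReadAligner / screed Record
interfaces used in this script; correct_record is a hypothetical name:

    def correct_record(aligner, read):
        # align against the graph; accept the alignment unless truncated
        is_aligned, new_seq = correct_sequence(aligner,
                                               read.sequence.replace('N', 'A'))
        changed = is_aligned and new_seq != read.sequence
        if changed:
            read.sequence = new_seq
            if hasattr(read, 'quality'):
                fix_quality(read)  # pad/trim quality to the new length
        return changed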
diff --git a/sandbox/count-kmers-single.py b/sandbox/count-kmers-single.py
new file mode 100755
index 0000000..7cb49c1
--- /dev/null
+++ b/sandbox/count-kmers-single.py
@@ -0,0 +1,103 @@
+#! /usr/bin/env python2
+#
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) University of California, Davis, 2015. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt.
+# Contact: khmer-project at idyll.org
+#
+# pylint: disable=missing-docstring,invalid-name
+"""
+Produce k-mer counts for all the k-mers in the given sequence file,
+building the counting table on the fly from the same reads.
+
+% python sandbox/count-kmers-single.py <fasta/fastq>
+
+Use '-h' for parameter help.
+"""
+from __future__ import print_function
+
+import sys
+import khmer
+import argparse
+import screed
+import csv
+from khmer.khmer_args import (build_counting_args, report_on_config, info,
+ add_threading_args)
+from khmer.kfile import (check_input_files, check_space,
+ check_space_for_hashtable)
+import threading
+
+
+def get_parser():
+ parser = build_counting_args(
+ descr="Output abundances of the k-mers in the sequence file.")
+ add_threading_args(parser)
+
+ parser.add_argument('input_sequence_filename', help='The input'
+ ' FAST[AQ] sequence file.')
+
+ parser.add_argument('-o', '--out', metavar="output_file",
+ dest='output_file',
+ type=argparse.FileType('w'),
+ default=None, help='output counts to this file')
+
+ return parser
+
+
+def main():
+ info('count-kmers-single.py', ['counting'])
+ args = get_parser().parse_args()
+
+ check_input_files(args.input_sequence_filename, False)
+
+ print ('making k-mer counting table', file=sys.stderr)
+ counting_hash = khmer.CountingHash(args.ksize, args.max_tablesize,
+ args.n_tables)
+ # @CTB counting_hash.set_use_bigcount(args.bigcount)
+
+ kmer_size = counting_hash.ksize()
+ hashsizes = counting_hash.hashsizes()
+ tracking = khmer._Hashbits( # pylint: disable=protected-access
+ kmer_size, hashsizes)
+
+ print ('kmer_size: %s' % counting_hash.ksize(), file=sys.stderr)
+ print ('k-mer counting table sizes: %s' % (counting_hash.hashsizes(),),
+ file=sys.stderr)
+
+ if args.output_file is None:
+ args.output_file = sys.stdout
+ writer = csv.writer(args.output_file)
+
+ # start loading
+ rparser = khmer.ReadParser(args.input_sequence_filename)
+ threads = []
+ print ('consuming input, round 1 -- %s' % (args.input_sequence_filename),
+ file=sys.stderr)
+ for _ in range(args.threads):
+ thread = \
+ threading.Thread(
+ target=counting_hash.consume_fasta_with_reads_parser,
+ args=(rparser, )
+ )
+ threads.append(thread)
+ thread.start()
+
+ for thread in threads:
+ thread.join()
+
+ for record in screed.open(args.input_sequence_filename):
+ seq = record.sequence.replace('N', 'A')
+ for i in range(len(seq) - kmer_size + 1):
+ kmer = seq[i:i+kmer_size]
+ if not tracking.get(kmer):
+ tracking.count(kmer)
+ writer.writerow([kmer, str(counting_hash.get(kmer))])
+
+ print ('Total number of unique k-mers: {0}'.format(
+ counting_hash.n_unique_kmers()), file=sys.stderr)
+
+
+if __name__ == '__main__':
+ main()
+
+# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
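
The output loop above emits each distinct k-mer exactly once by consulting a
second presence table (tracking) before writing. Illustrative sketch of the
same idiom with a plain Python set standing in for khmer._Hashbits -- exact
rather than probabilistic, but the same control flow:

    import csv
    import sys

    def write_unique_kmer_counts(seq, ksize, get_count, writer, seen):
        seq = seq.replace('N', 'A')
        for i in range(len(seq) - ksize + 1):
            kmer = seq[i:i + ksize]
            if kmer not in seen:        # tracking.get(kmer) in the script
                seen.add(kmer)          # tracking.count(kmer)
                writer.writerow([kmer, str(get_count(kmer))])

    # toy usage; real counts come from the khmer counting table
    counts = {'ACG': 2, 'CGT': 1}
    write_unique_kmer_counts('ACGT', 3, lambda k: counts.get(k, 0),
                             csv.writer(sys.stdout), set())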
diff --git a/sandbox/count-kmers.py b/sandbox/count-kmers.py
new file mode 100644
index 0000000..0d736da
--- /dev/null
+++ b/sandbox/count-kmers.py
@@ -0,0 +1,80 @@
+#! /usr/bin/env python2
+#
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) University of California, Davis, 2015. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt.
+# Contact: khmer-project at idyll.org
+#
+# pylint: disable=missing-docstring,invalid-name
+"""
+Produce k-mer counts for all the k-mers in the given sequence file,
+using the given counting table.
+
+% python sandbox/count-kmers.py <ct> <fasta/fastq> [ <fasta/fastq> ... ]
+
+Use '-h' for parameter help.
+"""
+from __future__ import print_function
+
+import sys
+import khmer
+import argparse
+import screed
+import csv
+from khmer.khmer_args import info
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ description="Output abundances of the k-mers in "
+ "the sequence files using a pre-made k-mer counting table.",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+ parser.add_argument('input_counting_table_filename', help='The name of the'
+ ' input k-mer counting table file.')
+ parser.add_argument('input_sequence_filenames', help='The input'
+ ' FAST[AQ] sequence file(s).', nargs='+')
+
+ parser.add_argument('-o', '--out', metavar="output_file",
+ dest='output_file',
+ type=argparse.FileType('w'),
+ default=None, help='output counts to this file')
+
+ return parser
+
+
+def main():
+ info('count-kmers.py', ['counting'])
+ args = get_parser().parse_args()
+
+ print ('hashtable from', args.input_counting_table_filename,
+ file=sys.stderr)
+ counting_hash = khmer.load_counting_hash(
+ args.input_counting_table_filename)
+
+ kmer_size = counting_hash.ksize()
+ hashsizes = counting_hash.hashsizes()
+ tracking = khmer._Hashbits( # pylint: disable=protected-access
+ kmer_size, hashsizes)
+
+ if args.output_file is None:
+ args.output_file = sys.stdout
+ writer = csv.writer(args.output_file)
+
+ for filename in args.input_sequence_filenames:
+ for record in screed.open(filename):
+ seq = record.sequence.replace('N', 'A')
+ for i in range(len(seq) - kmer_size + 1):
+ kmer = seq[i:i+kmer_size]
+ if not tracking.get(kmer):
+ tracking.count(kmer)
+ writer.writerow([kmer, str(counting_hash.get(kmer))])
+
+ print ('Total number of unique k-mers: {0}'.format(
+ counting_hash.n_unique_kmers()), file=sys.stderr)
+
+
+if __name__ == '__main__':
+ main()
+
+# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
diff --git a/sandbox/error-correct-pass2.py b/sandbox/error-correct-pass2.py
new file mode 100755
index 0000000..466848f
--- /dev/null
+++ b/sandbox/error-correct-pass2.py
@@ -0,0 +1,94 @@
+#! /usr/bin/env python2
+#
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt.
+# Contact: khmer-project at idyll.org
+#
+"""
+Error correct reads based on a counting hash from a diginorm step.
+Output sequences will be put in @@@.
+
+% python sandbox/error-correct-pass2.py <counting.ct> <data1> [ <data2> <...> ]
+
+Use '-h' for parameter help.
+"""
+from __future__ import print_function
+import sys
+import screed
+import os
+import khmer
+import argparse
+
+
+###
+
+DEFAULT_CUTOFF = 2
+
+def output_single(read, new_sequence):
+ name = read.name
+ sequence = new_sequence
+
+ quality = None
+ if hasattr(read, 'quality'):
+ quality = read.quality[:len(sequence)]
+ sequence = sequence[:len(quality)] # in cases where sequence _lengthened_
+
+ if quality:
+ assert len(sequence) == len(quality), (sequence, quality)
+ return "@%s\n%s\n+\n%s\n" % (name, sequence, quality)
+ else:
+ return ">%s\n%s\n" % (name, sequence)
+
+
+def main():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
+ default=DEFAULT_CUTOFF)
+ parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
+ parser.add_argument('-o', '--output', dest='output_file',
+ help="output file for histogram; defaults to "
+ "<first filename>.errhist in cwd.",
+ type=argparse.FileType('w'), default=None)
+
+ parser.add_argument('counts_table')
+ parser.add_argument('readfile')
+
+ args = parser.parse_args()
+
+ print('loading counts')
+ ht = khmer.load_counting_hash(args.counts_table)
+
+ aligner = khmer.ReadAligner(ht,
+ args.trusted_cov,
+ args.bits_theta)
+
+ print("trusted:", args.trusted_cov)
+
+ corrfp = args.output_file
+ if not corrfp:
+ outfile = os.path.basename(args.readfile) + '.corr'
+ corrfp = open(outfile, 'w')
+
+ n_corrected = 0
+ for n, read in enumerate(screed.open(args.readfile)):
+ if n % 10000 == 0:
+ print('...', n, n_corrected, file=sys.stderr)
+ seq = read.sequence.replace('N', 'A')
+
+ # build the alignment...
+ score, graph_alignment, read_alignment, truncated = \
+ aligner.align(seq)
+
+ if not truncated:
+ graph_seq = graph_alignment.replace("-", "")
+ if graph_seq != seq:
+ n_corrected += 1
+
+ seq = graph_seq
+
+ corrfp.write(output_single(read, seq))
+
+if __name__ == '__main__':
+ main()
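
output_single above reconciles sequence and quality lengths, since a graph
alignment can shorten or lengthen a read while FASTQ requires len(sequence)
== len(quality). A quick illustration of the two-way truncation:

    sequence = "ACGTACGTAA"             # alignment lengthened the read
    quality = "IIIIIIII"                # original 8-base quality string
    quality = quality[:len(sequence)]   # no-op here; trims if read shortened
    sequence = sequence[:len(quality)]  # trims lengthened read back to match
    assert len(sequence) == len(quality) == 8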
diff --git a/sandbox/estimate_optimal_hash.py b/sandbox/estimate_optimal_hash.py
index 63e4932..3f3944e 100755
--- a/sandbox/estimate_optimal_hash.py
+++ b/sandbox/estimate_optimal_hash.py
@@ -30,8 +30,7 @@ from __future__ import print_function
import argparse
import khmer, oxli
from khmer.khmer_args import info
-from oxli.functions import estimate_optimal_with_N_and_M
-from oxli.functions import estimate_optimal_with_N_and_f
+from oxli.functions import optimal_size
import textwrap
import sys
@@ -69,14 +68,14 @@ def get_parser():
+ khmer.__version__)
return parser
-
+
def main():
info('estimate_optimal_hash.py', ['counting'])
args = get_parser().parse_args()
N = args.N
if args.M:
M = args.M
- result = estimate_optimal_with_N_and_M(N,M)
+ result = optimal_size(N, M=M)
print("number of estimated distinct k-mers: ", N, file=sys.stderr)
print("size of memory available to use: ", M, file=sys.stderr)
print("optimal number of hash tables: ", result.num_htables,
@@ -87,10 +86,10 @@ def main():
file=sys.stderr)
print("estimated usage of memory: ", result.mem_use,
file=sys.stderr)
-
+
elif args.f:
f = args.f
- result = estimate_optimal_with_N_and_f(N,f)
+ result = optimal_size(N, f=f)
print("number of estimated distinct k-mers: ", N, file=sys.stderr)
print("desired maximum false positive rate: ", f, file=sys.stderr)
print("optimal number of hash tables: ", result.num_htables,
diff --git a/sandbox/extract-single-partition.py b/sandbox/extract-single-partition.py
index ccc0f28..4e64b3f 100755
--- a/sandbox/extract-single-partition.py
+++ b/sandbox/extract-single-partition.py
@@ -11,7 +11,7 @@ from screed.fasta import fasta_iter
def read_partition_file(fp):
- for n, record in enumerate(fasta_iter(fp, parse_description=False)):
+ for n, record in enumerate(fasta_iter(fp)):
name = record['name']
name, partition_id = name.rsplit('\t', 1)
yield n, name, int(partition_id), record['sequence']
diff --git a/sandbox/optimal_args_hashbits.py b/sandbox/optimal_args_hashbits.py
index 1fba596..9b866e5 100644
--- a/sandbox/optimal_args_hashbits.py
+++ b/sandbox/optimal_args_hashbits.py
@@ -52,7 +52,7 @@ def main():
print('Counting kmers from sequences in %s' % repr(filenames),
file=sys.stderr)
- htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)
+ htable = khmer.new_hashbits(args.ksize, args.max_tablesize, args.n_tables)
target_method = htable.consume_fasta_with_reads_parser
for _, filename in enumerate(filenames):
diff --git a/sandbox/readaligner_pairhmm_train.py b/sandbox/readaligner_pairhmm_train.py
new file mode 100644
index 0000000..0e60e06
--- /dev/null
+++ b/sandbox/readaligner_pairhmm_train.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+from __future__ import division
+from __future__ import print_function
+import khmer
+import argparse
+import collections
+from math import log
+import json
+try:
+ import pysam
+except ImportError:
+ pass
+
+cigar_to_state = {0: 'M', 1: 'Ir', 2: 'Ig'}
+
+
+def extract_cigar(cigar):
+ ret = []
+ for t, length in cigar:
+ for i in range(length):
+ ret.append(cigar_to_state[t])
+
+ return ret
+
+
+def trusted_str(cov, trusted_cutoff):
+ if cov < trusted_cutoff:
+ return '_u'
+ else:
+ return '_t'
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--trusted-cutoff', type=int, default=5)
+ parser.add_argument(
+ "ht", type=str, help="Counting bloom filter for the reads")
+ parser.add_argument("bam_file", type=str, help="bam read mapping file")
+ parser.add_argument("--json", action='store_true', help="output JSON")
+
+ args = parser.parse_args()
+
+ ht = khmer.load_counting_hash(args.ht)
+ samfile = pysam.Samfile(args.bam_file)
+
+ k = ht.ksize()
+ seq_cnt = 0
+ dropped_seqs = 0
+ base_cnt = {}
+ state_cnts = {}
+ trans_cnts = {}
+
+ total_bases = 0.0
+
+ for rec in samfile:
+ seq = rec.seq
+ cigar = rec.cigar
+
+ seq_cnt += 1
+ if 'N' in seq:
+ dropped_seqs += 1
+ continue
+
+ states = extract_cigar(cigar)
+
+ kmer = seq[:k]
+ state = states[k] + trusted_str(ht.count(kmer), args.trusted_cutoff)
+
+ state_cnts[state] = state_cnts.get(state, 0) + 1
+ base_cnt[kmer[-1]] = base_cnt.get(kmer[-1], 0) + 1
+
+ for i in range(1, len(seq) - k - 1):
+ total_bases += 1
+ kmer = seq[i:i + k]
+ cov = ht.get(kmer)
+
+ last_state = state
+ state = states[i] + trusted_str(cov, args.trusted_cutoff)
+
+ trans = last_state + '-' + state
+ trans_cnts[trans] = trans_cnts.get(trans, 0) + 1
+
+ state_cnts[state] = state_cnts.get(state, 0) + 1
+ base_cnt[kmer[-1]] = base_cnt.get(kmer[-1], 0) + 1
+
+ if not args.json:
+ print("kmer size=", k)
+ print("seq count=", seq_cnt, "dropped seqs=", dropped_seqs)
+ print("base counts=", base_cnt)
+ print("state counts=", state_cnts)
+ print("trans counts=", trans_cnts)
+
+
+ if not args.json:
+
+ trans_probs = collections.defaultdict(float)
+
+ for trans in sorted(trans_cnts.keys()):
+ start_state = trans.split('-')[0]
+ trans_probs[trans] = trans_cnts[
+ trans] / float(state_cnts[start_state])
+ print('{0}\t{1:0.7f}'.format(trans, trans_probs[trans]))
+
+ print('static double trans_default[] = { log2{0:0.7f}, log2{1:0.7f}, ' \
+ 'log2{2:0.7f}, log2{3:0.7f}, log2{4:0.7f}, ' \
+ 'log2{5:0.7f},'.format(trans_probs['M_t-M_t'],
+ trans_probs['M_t-Ir_t'],
+ trans_probs[
+ 'M_t-Ig_t'], trans_probs['M_t-M_u'],
+ trans_probs['M_t-Ir_u'],
+ trans_probs['M_t-Ig_u']))
+ print('log2{0:0.7f}, log2{1:0.7f}, log2{2:0.7f}, log2{3:0.7f},'.format(
+ trans_probs[
+ 'Ir_t-M_t'], trans_probs['Ir_t-Ir_t'], trans_probs['Ir_t-M_u'],
+ trans_probs['Ir_t-Ir_u']))
+ print('log2{0:0.7f}, log2{1:0.7f}, log2{2:0.7f}, log2{3:0.7f},'.format(
+ trans_probs[
+ 'Ig_t-M_t'], trans_probs['Ig_t-Ig_t'], trans_probs['Ig_t-M_u'],
+ trans_probs['Ig_t-Ig_u']))
+ print('log2{0:0.7f}, log2{1:0.7f}, log2{2:0.7f}, log2{3:0.7f}, '\
+ 'log2{4:0.7f}, log2{5:0.7f},'.format(
+ trans_probs['M_u-M_t'], trans_probs['M_u-Ir_t'],
+ trans_probs['M_u-Ig_t'], trans_probs['M_u-M_u'],
+ trans_probs['M_u-Ir_u'], trans_probs['M_u-Ig_u']))
+ print('log2{0:0.7f}, log2{1:0.7f}, log2{2:0.7f}, log2{3:0.7f},'.format(
+ trans_probs[
+ 'Ir_u-M_t'], trans_probs['Ir_u-Ir_t'], trans_probs['Ir_u-M_u'],
+ trans_probs['Ir_u-Ir_u']))
+ print('log2{0:0.7f}, log2{1:0.7f}, log2{2:0.7f}, log2{3:0.7f},'.format(
+ trans_probs[
+ 'Ig_u-M_t'], trans_probs['Ig_u-Ig_t'], trans_probs['Ig_u-M_u'],
+ trans_probs['Ig_u-Ig_u']))
+ print('};')
+ else:
+ params = {'scoring_matrix':
+ [-0.06642736173897607,
+ -4.643856189774724,
+ -7.965784284662087,
+ -9.965784284662087],
+ 'transition_probabilities': ((
+ log(trans_cnts['M_t-M_t'] / float(state_cnts['M_t']), 2),
+ log(trans_cnts['M_t-Ir_t'] /
+ float(state_cnts['M_t']), 2),
+ log(trans_cnts['M_t-Ig_t'] /
+ float(state_cnts['M_t']), 2),
+ log(trans_cnts['M_t-M_u'] / float(state_cnts['M_t']), 2),
+ log(trans_cnts['M_t-Ir_u'] /
+ float(state_cnts['M_t']), 2),
+ log(trans_cnts['M_t-Ig_u'] /
+ float(state_cnts['M_t']), 2),
+ ), (
+ log(trans_cnts['Ir_t-M_t'] /
+ float(state_cnts['Ir_t']), 2),
+ log(trans_cnts['Ir_t-Ir_t'] /
+ float(state_cnts['Ir_t']), 2),
+ log(trans_cnts['Ir_t-M_u'] /
+ float(state_cnts['Ir_t']), 2),
+ log(trans_cnts['Ir_t-Ir_u'] /
+ float(state_cnts['Ir_t']), 2),
+ ), (
+ log(trans_cnts['Ig_t-M_t'] /
+ float(state_cnts['Ig_t']), 2),
+ log(trans_cnts['Ig_t-Ig_t'] /
+ float(state_cnts['Ig_t']), 2),
+ log(trans_cnts['Ig_t-M_u'] /
+ float(state_cnts['Ig_t']), 2),
+ log(trans_cnts['Ig_t-Ig_u'] /
+ float(state_cnts['Ig_t']), 2),
+ ), (
+ log(trans_cnts['M_u-M_t'] / float(state_cnts['M_u']), 2),
+ log(trans_cnts['M_u-Ir_t'] /
+ float(state_cnts['M_u']), 2),
+ log(trans_cnts['M_u-Ig_t'] /
+ float(state_cnts['M_u']), 2),
+ log(trans_cnts['M_u-M_u'] / float(state_cnts['M_u']), 2),
+ log(trans_cnts['M_u-Ir_u'] /
+ float(state_cnts['M_u']), 2),
+ log(trans_cnts['M_u-Ig_u'] /
+ float(state_cnts['M_u']), 2),
+ ), (
+ log(trans_cnts['Ir_u-M_t'] /
+ float(state_cnts['Ir_u']), 2),
+ log(trans_cnts['Ir_u-Ir_t'] /
+ float(state_cnts['Ir_u']), 2),
+ log(trans_cnts['Ir_u-M_u'] /
+ float(state_cnts['Ir_u']), 2),
+ log(trans_cnts['Ir_u-Ir_u'] /
+ float(state_cnts['Ir_u']), 2),
+ ), (
+ log(trans_cnts['Ig_u-M_t'] /
+ float(state_cnts['Ig_u']), 2),
+ log(trans_cnts['Ig_u-Ig_t'] /
+ float(state_cnts['Ig_u']), 2),
+ log(trans_cnts['Ig_u-M_u'] /
+ float(state_cnts['Ig_u']), 2),
+ log(trans_cnts['Ig_u-Ig_u'] /
+ float(state_cnts['Ig_u']), 2),
+ )
+ )
+ }
+ print(json.dumps(params, sort_keys=True, indent=4, separators=(',', ': ')))
+
+
+if __name__ == "__main__":
+ main()
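
Both output branches above compute the same maximum-likelihood estimate:
P(s -> t) = count(s -> t) / count(s), reported in log2 for the pair-HMM
scoring tables. Compact sketch of that normalization (names illustrative):

    from math import log

    def log2_transition_probs(trans_cnts, state_cnts):
        # P(s -> t) = count(s -> t) / count(s), in log2 space
        return {trans: log(cnt / float(state_cnts[trans.split('-')[0]]), 2)
                for trans, cnt in trans_cnts.items()}

    probs = log2_transition_probs({'M_t-M_t': 90, 'M_t-Ir_t': 10},
                                  {'M_t': 100})
    print('%.4f' % probs['M_t-M_t'])   # log2(0.9) ~= -0.1520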
diff --git a/sandbox/saturate-by-median.py b/sandbox/saturate-by-median.py
index 40aee14..a47cde4 100755
--- a/sandbox/saturate-by-median.py
+++ b/sandbox/saturate-by-median.py
@@ -215,6 +215,8 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
except IOError as err:
handle_error(err, input_filename)
if not args.force:
+ print("NOTE: This can be overridden using the --force"
+ " argument", file=sys.stderr)
print('** Exiting!', file=sys.stderr)
sys.exit(1)
else:
@@ -243,7 +245,7 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
if args.force and len(corrupt_files) > 0:
print("** WARNING: Finished with errors!", file=sys.stderr)
- print("** IOErrors occurred in the following files:", file=sys.stderr)
+ print("** I/O Errors occurred in the following files:", file=sys.stderr)
print("\t", " ".join(corrupt_files), file=sys.stderr)
diff --git a/sandbox/sweep-files.py b/sandbox/sweep-files.py
index 6b44ea6..e80ca44 100755
--- a/sandbox/sweep-files.py
+++ b/sandbox/sweep-files.py
@@ -103,15 +103,15 @@ def main():
parser = get_parser()
args = parser.parse_args()
- if args.min_tablesize < MIN_HSIZE:
- args.min_tablesize = MIN_HSIZE
+ if args.max_tablesize < MIN_HSIZE:
+ args.max_tablesize = MIN_HSIZE
if args.ksize < MIN_KSIZE:
args.ksize = MIN_KSIZE
report_on_config(args, hashtype='nodegraph')
K = args.ksize
- HT_SIZE = args.min_tablesize
+ HT_SIZE = args.max_tablesize
N_HT = args.n_tables
traversal_range = args.traversal_range
@@ -137,7 +137,7 @@ def main():
ht.consume_sequence_and_tag_with_labels(record.sequence, i)
- except IOError as e:
+ except (IOError, OSError) as e:
print('!! ERROR: !!', e, file=sys.stderr)
print('...error setting up outputs. exiting...', file=sys.stderr)
diff --git a/sandbox/sweep-reads.py b/sandbox/sweep-reads.py
index fbf2ccb..7134fc3 100755
--- a/sandbox/sweep-reads.py
+++ b/sandbox/sweep-reads.py
@@ -121,7 +121,7 @@ class ReadBufferManager(object):
buf = self.buffers[buf_id]
try:
outfp = open(fpath, 'a')
- except IOError as _:
+ except (IOError, OSError) as _:
print('!! ERROR: {_} !!'.format(_=_), file=sys.stderr)
print('*** Failed to open {fn} for \
buffer flush'.format(fn=fpath), file=sys.stderr)
@@ -290,11 +290,11 @@ def main():
write_record(record, outfp)
- except IOError as e:
+ except (IOError, OSError) as e:
print('!! ERROR !!', e, file=sys.stderr)
print('...error splitting input. exiting...', file=sys.stderr)
- except IOError as e:
+ except (IOError, OSError) as e:
print('!! ERROR: !!', e, file=sys.stderr)
print('...error consuming \
{i}. exiting...'.format(i=input_fastp), file=sys.stderr)
@@ -319,7 +319,7 @@ def main():
file_t = 0.0
try:
read_fp = screed.open(read_file)
- except IOError as error:
+ except (IOError, OSError) as error:
print('!! ERROR: !!', error, file=sys.stderr)
print('*** Could not open {fn}, skipping...'.format(
fn=read_file), file=sys.stderr)
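
The repeated IOError -> (IOError, OSError) widening in these sandbox scripts
matters on Python 2, where the two are distinct classes (they were merged
only in Python 3.3). Minimal sketch of the pattern:

    from __future__ import print_function
    import sys

    def open_for_append(path):
        try:
            return open(path, 'a')
        except (IOError, OSError) as err:   # separate types on Python 2
            print('!! ERROR: {0} !!'.format(err), file=sys.stderr)
            return None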
diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py
index 10d2109..4da22b2 100755
--- a/scripts/abundance-dist-single.py
+++ b/scripts/abundance-dist-single.py
@@ -24,9 +24,8 @@ import threading
import textwrap
from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, add_threading_args,
- report_on_config, info)
-from khmer.kfile import (check_input_files, check_space,
- check_space_for_hashtable)
+ report_on_config, info, calculate_tablesize)
+from khmer.kfile import (check_input_files, check_space_for_hashtable)
def get_parser():
@@ -59,14 +58,9 @@ def get_parser():
parser.add_argument('-s', '--squash', dest='squash_output', default=False,
action='store_true',
help='Overwrite output file if it exists')
- parser.add_argument('--csv', default=False, action='store_true',
- help='Use the CSV format for the histogram. '
- 'Includes column headers.')
parser.add_argument('--savetable', default='', metavar="filename",
help="Save the k-mer counting table to the specified "
"filename.")
- parser.add_argument('--report-total-kmers', '-t', action='store_true',
- help="Prints the total number of k-mers to stderr")
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
return parser
@@ -78,10 +72,9 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
report_on_config(args)
check_input_files(args.input_sequence_filename, args.force)
- check_space([args.input_sequence_filename], args.force)
if args.savetable:
- check_space_for_hashtable(args, 'countgraph', args.force)
-
+ tablesize = calculate_tablesize(args, 'countgraph')
+ check_space_for_hashtable(args.savetable, tablesize, args.force)
if (not args.squash_output and
os.path.exists(args.output_histogram_filename)):
print('ERROR: %s exists; not squashing.' %
@@ -89,11 +82,10 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
sys.exit(1)
else:
hist_fp = open(args.output_histogram_filename, 'w')
- if args.csv:
- hist_fp_csv = csv.writer(hist_fp)
- # write headers:
- hist_fp_csv.writerow(['abundance', 'count', 'cumulative',
- 'cumulative_fraction'])
+ hist_fp_csv = csv.writer(hist_fp)
+ # write headers:
+ hist_fp_csv.writerow(['abundance', 'count', 'cumulative',
+ 'cumulative_fraction'])
print('making countgraph', file=sys.stderr)
counting_hash = khmer_args.create_countgraph(args, multiplier=1.1)
@@ -124,9 +116,8 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
for thread in threads:
thread.join()
- if args.report_total_kmers:
- print('Total number of unique k-mers: {0}'.format(
- counting_hash.n_unique_kmers()), file=sys.stderr)
+ print('Total number of unique k-mers: {0}'.format(
+ counting_hash.n_unique_kmers()), file=sys.stderr)
abundance_lists = []
@@ -176,10 +167,7 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
sofar += i
frac = sofar / float(total)
- if args.csv:
- hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])
- else:
- print(_, i, sofar, round(frac, 3), file=hist_fp)
+ hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])
if sofar == total:
break
diff --git a/scripts/abundance-dist.py b/scripts/abundance-dist.py
index 7661ec6..1e96c1e 100755
--- a/scripts/abundance-dist.py
+++ b/scripts/abundance-dist.py
@@ -5,7 +5,7 @@
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
-# pylint: disable=missing-docstring,invalid-name
+# pylint: disable=missing-docstring
"""
Produce the k-mer abundance distribution for the given file.
@@ -22,7 +22,6 @@ import argparse
import os
from khmer.kfile import check_input_files
from khmer.khmer_args import info
-from khmer.utils import write_record
def get_parser():
@@ -47,9 +46,6 @@ def get_parser():
parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
action='store_false',
help='Do not count k-mers past 255')
- parser.add_argument('--csv', default=False, action='store_true',
- help='Use the CSV format for the histogram. '
- 'Includes column headers.')
parser.add_argument('--version', action='version', version='%(prog)s ' +
khmer.__version__)
parser.add_argument('-f', '--force', default=False, action='store_true',
@@ -61,10 +57,11 @@ def get_parser():
def main():
info('abundance-dist.py', ['counting'])
args = get_parser().parse_args()
+
infiles = [args.input_counting_table_filename,
args.input_sequence_filename]
for infile in infiles:
- check_input_files(infile, args.force)
+ check_input_files(infile, False)
print('hashtable from', args.input_counting_table_filename,
file=sys.stderr)
@@ -88,7 +85,9 @@ def main():
print('HT sizes:', hashsizes, file=sys.stderr)
print('outputting to', args.output_histogram_filename, file=sys.stderr)
- if os.path.exists(args.output_histogram_filename):
+ if args.output_histogram_filename in ('-', '/dev/stdout'):
+ pass
+ elif os.path.exists(args.output_histogram_filename):
if not args.squash_output:
print('ERROR: %s exists; not squashing.' %
args.output_histogram_filename,
@@ -110,12 +109,14 @@ def main():
file=sys.stderr)
sys.exit(1)
- hash_fp = open(args.output_histogram_filename, 'w')
- if args.csv:
- hash_fp_csv = csv.writer(hash_fp)
- # write headers:
- hash_fp_csv.writerow(['abundance', 'count', 'cumulative',
- 'cumulative_fraction'])
+ if args.output_histogram_filename in ('-', '/dev/stdout'):
+ hash_fp = sys.stdout
+ else:
+ hash_fp = open(args.output_histogram_filename, 'w')
+ hash_fp_csv = csv.writer(hash_fp)
+ # write headers:
+ hash_fp_csv.writerow(['abundance', 'count', 'cumulative',
+ 'cumulative_fraction'])
sofar = 0
for _, i in enumerate(abundances):
@@ -125,10 +126,7 @@ def main():
sofar += i
frac = sofar / float(total)
- if args.csv:
- hash_fp_csv.writerow([_, i, sofar, round(frac, 3)])
- else:
- print(_, i, sofar, round(frac, 3), file=hash_fp)
+ hash_fp_csv.writerow([_, i, sofar, round(frac, 3)])
if sofar == total:
break
diff --git a/scripts/count-median.py b/scripts/count-median.py
index 19e6473..7ca052c 100755
--- a/scripts/count-median.py
+++ b/scripts/count-median.py
@@ -40,11 +40,7 @@ def get_parser():
to estimate expression levels (mRNAseq) or coverage (genomic/metagenomic).
The output file contains sequence id, median, average, stddev, and
- seq length; fields are separated by spaces. For khmer 1.x
- count-median.py will split sequence names at the first space which
- means that some sequence formats (e.g. paired FASTQ in Casava 1.8
- format) will yield uninformative names. Use :option:`--csv` to
- fix this behavior.
+ seq length, in comma-separated value (CSV) format.
Example::
@@ -61,14 +57,12 @@ def get_parser():
parser.add_argument('input', metavar='input_sequence_filename',
help='input FAST[AQ] sequence filename')
parser.add_argument('output', metavar='output_summary_filename',
- help='output summary filename')
+ help='output summary filename',
+ type=argparse.FileType('w'))
parser.add_argument('--version', action='version', version='%(prog)s ' +
khmer.__version__)
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
- parser.add_argument('--csv', default=False, action='store_true',
- help="Use the CSV format for the histogram."
- "Includes column headers.")
return parser
@@ -78,7 +72,8 @@ def main():
htfile = args.ctfile
input_filename = args.input
- output_filename = args.output
+ output = args.output
+ output_filename = output.name
infiles = [htfile, input_filename]
for infile in infiles:
@@ -89,21 +84,13 @@ def main():
print('loading k-mer counting table from', htfile, file=sys.stderr)
htable = khmer.load_counting_hash(htfile)
ksize = htable.ksize()
-
print('writing to', output_filename, file=sys.stderr)
- output = open(output_filename, 'w')
-
- if args.csv:
- output = csv.writer(output)
- # write headers:
- output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])
- parse_description = True # @legacy behavior: split seq headers
- if args.csv:
- parse_description = False # only enable if we're doing csv out
+ output = csv.writer(output)
+ # write headers:
+ output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])
- for record in screed.open(input_filename,
- parse_description=parse_description):
+ for record in screed.open(input_filename):
seq = record.sequence.upper()
if 'N' in seq:
seq = seq.replace('N', 'A')
@@ -111,10 +98,7 @@ def main():
if ksize <= len(seq):
medn, ave, stdev = htable.get_median_count(seq)
ave, stdev = [round(x, 9) for x in (ave, stdev)]
- if args.csv:
- output.writerow([record.name, medn, ave, stdev, len(seq)])
- else:
- print(record.name, medn, ave, stdev, len(seq), file=output)
+ output.writerow([record.name, medn, ave, stdev, len(seq)])
if __name__ == '__main__':
main()
diff --git a/scripts/count-overlap.py b/scripts/count-overlap.py
index a8c715d..51ffbf3 100755
--- a/scripts/count-overlap.py
+++ b/scripts/count-overlap.py
@@ -24,7 +24,7 @@ import csv
import khmer
import textwrap
from khmer import khmer_args
-from khmer.kfile import check_input_files, check_space
+from khmer.kfile import check_input_files
from khmer.khmer_args import (build_hashbits_args, report_on_config, info)
@@ -43,10 +43,6 @@ def get_parser():
help="input sequence filename")
parser.add_argument('report_filename', metavar='output_report_filename',
help='output report filename')
- parser.add_argument('--csv', default=False, action='store_true',
- help='Use the CSV format for the curve output '
- 'in ${output_report_filename}.curve, '
- 'including column headers.')
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
return parser
@@ -60,18 +56,15 @@ def main():
for infile in [args.ptfile, args.fafile]:
check_input_files(infile, args.force)
- check_space([args.ptfile, args.fafile], args.force)
-
print('loading k-mer presence table from', args.ptfile, file=sys.stderr)
ht1 = khmer.load_hashbits(args.ptfile)
kmer_size = ht1.ksize()
output = open(args.report_filename, 'w')
f_curve_obj = open(args.report_filename + '.curve', 'w')
- if args.csv:
- f_curve_obj_csv = csv.writer(f_curve_obj)
- # write headers:
- f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])
+ f_curve_obj_csv = csv.writer(f_curve_obj)
+ # write headers:
+ f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])
ht2 = khmer_args.create_nodegraph(args, ksize=kmer_size)
@@ -88,10 +81,7 @@ dataset2: %s
output.write(printout1)
for i in range(100):
- if args.csv:
- f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
- else:
- print(list_curve[100 + i], list_curve[i], file=f_curve_obj)
+ f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
print('wrote to: ' + args.report_filename, file=sys.stderr)
diff --git a/scripts/extract-long-sequences.py b/scripts/extract-long-sequences.py
index 7155070..0a83bed 100755
--- a/scripts/extract-long-sequences.py
+++ b/scripts/extract-long-sequences.py
@@ -34,7 +34,8 @@ def get_parser():
parser.add_argument('input_filenames', help='Input FAST[AQ]'
' sequence filename.', nargs='+')
parser.add_argument('-o', '--output', help='The name of the output'
- ' sequence file.', default="/dev/stdout")
+ ' sequence file.', default=sys.stdout,
+ metavar='output', type=argparse.FileType('w'))
parser.add_argument('-l', '--length', help='The minimum length of'
' the sequence file.',
type=int, default=200)
@@ -43,12 +44,12 @@ def get_parser():
def main():
args = get_parser().parse_args()
- outfp = open(args.output, 'w')
+ outfp = args.output
for filename in args.input_filenames:
- for record in screed.open(filename, parse_description=False):
+ for record in screed.open(filename):
if len(record['sequence']) >= args.length:
write_record(record, outfp)
- print('wrote to: ' + args.output, file=sys.stderr)
+ print('wrote to: ' + outfp.name, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/extract-paired-reads.py b/scripts/extract-paired-reads.py
index ea8bebf..34f8f21 100755
--- a/scripts/extract-paired-reads.py
+++ b/scripts/extract-paired-reads.py
@@ -89,7 +89,7 @@ def main():
check_space([infile], args.force)
# decide where to put output files - specific directory? or just default?
- if infile == '/dev/stdin' or infile == '-':
+ if infile in ('/dev/stdin', '-'):
if not (args.output_paired and args.output_single):
print("Accepting input from stdin; output filenames must be "
"provided.", file=sys.stderr)
@@ -124,7 +124,7 @@ def main():
n_pe = 0
n_se = 0
- screed_iter = screed.open(infile, parse_description=False)
+ screed_iter = screed.open(infile)
for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
if index % 100000 == 0 and index > 0:
print('...', index, file=sys.stderr)
diff --git a/scripts/extract-partitions.py b/scripts/extract-partitions.py
index 777db38..b4ee857 100755
--- a/scripts/extract-partitions.py
+++ b/scripts/extract-partitions.py
@@ -34,8 +34,7 @@ DEFAULT_THRESHOLD = 5
def read_partition_file(filename):
- for record_index, record in enumerate(screed.open
- (filename, parse_description=False)):
+ for record_index, record in enumerate(screed.open(filename)):
_, partition_id = record.name.rsplit('\t', 1)
yield record_index, record, int(partition_id)
diff --git a/scripts/fastq-to-fasta.py b/scripts/fastq-to-fasta.py
index ef21cda..0dc5831 100755
--- a/scripts/fastq-to-fasta.py
+++ b/scripts/fastq-to-fasta.py
@@ -33,8 +33,8 @@ def get_parser():
type=argparse.FileType('w'),
default=sys.stdout)
parser.add_argument('-n', '--n_keep', default=False, action='store_true',
- help='Option to drop reads containing \'N\'s in ' +
- 'input_sequence file.')
+ help='Option to keep reads containing \'N\'s in ' +
+ 'input_sequence file. Default is to drop reads.')
return parser
@@ -43,8 +43,7 @@ def main():
print(('fastq from ', args.input_sequence), file=sys.stderr)
n_count = 0
- for n, record in enumerate(screed.open(args.input_sequence,
- parse_description=False)):
+ for n, record in enumerate(screed.open(args.input_sequence)):
if n % 10000 == 0:
print('...', n, file=sys.stderr)
diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py
index b22a494..0aaf76b 100755
--- a/scripts/filter-abund-single.py
+++ b/scripts/filter-abund-single.py
@@ -26,7 +26,7 @@ import textwrap
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, report_on_config,
- add_threading_args, info)
+ add_threading_args, info, calculate_tablesize)
from khmer.kfile import (check_input_files, check_space,
check_space_for_hashtable)
#
@@ -58,8 +58,6 @@ def get_parser():
"k-mer counting table to")
parser.add_argument('datafile', metavar='input_sequence_filename',
help="FAST[AQ] sequence file to trim")
- parser.add_argument('--report-total-kmers', '-t', action='store_true',
- help="Prints the total number of k-mers to stderr")
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
return parser
@@ -71,7 +69,9 @@ def main():
check_input_files(args.datafile, args.force)
check_space([args.datafile], args.force)
if args.savetable:
- check_space_for_hashtable(args, 'countgraph', args.force)
+ tablesize = calculate_tablesize(args, 'countgraph')
+ check_space_for_hashtable(args.savetable, tablesize, args.force)
+
report_on_config(args)
print('making countgraph', file=sys.stderr)
@@ -93,9 +93,8 @@ def main():
for _ in threads:
_.join()
- if args.report_total_kmers:
- print('Total number of unique k-mers: {0}'.format(
- htable.n_unique_kmers()), file=sys.stderr)
+ print('Total number of unique k-mers: {0}'.format(
+ htable.n_unique_kmers()), file=sys.stderr)
fp_rate = khmer.calc_expected_collisions(htable, args.force)
print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py
index e994c60..a5fc98f 100755
--- a/scripts/filter-abund.py
+++ b/scripts/filter-abund.py
@@ -63,8 +63,9 @@ def get_parser():
help='Base the variable-coverage cutoff on this median'
' k-mer abundance.',
default=DEFAULT_NORMALIZE_LIMIT)
- parser.add_argument('-o', '--out', dest='single_output_filename',
- default='', metavar="optional_output_filename",
+ parser.add_argument('-o', '--out', dest='single_output_file',
+ type=argparse.FileType('w'),
+ metavar="optional_output_filename",
help='Output the trimmed sequences into a single file '
'with the given filename instead of creating a new '
'file for each input file.')
@@ -81,6 +82,12 @@ def main():
check_input_files(args.input_table, args.force)
infiles = args.input_filename
+ if ('-' in infiles or '/dev/stdin' in infiles) and not \
+ args.single_output_file:
+ print("Accepting input from stdin; output filename must "
+ "be provided with -o.", file=sys.stderr)
+ sys.exit(1)
+
for filename in infiles:
check_input_files(filename, args.force)
@@ -116,9 +123,9 @@ def main():
# the filtering loop
for infile in infiles:
print('filtering', infile, file=sys.stderr)
- if args.single_output_filename != '':
- outfile = args.single_output_filename
- outfp = open(outfile, 'a')
+ if args.single_output_file:
+ outfile = args.single_output_file.name
+ outfp = args.single_output_file
else:
outfile = os.path.basename(infile) + '.abundfilt'
outfp = open(outfile, 'w')
@@ -128,5 +135,6 @@ def main():
print('output in', outfile, file=sys.stderr)
+
if __name__ == '__main__':
main()
diff --git a/scripts/interleave-reads.py b/scripts/interleave-reads.py
index 94d5776..ae7d598 100755
--- a/scripts/interleave-reads.py
+++ b/scripts/interleave-reads.py
@@ -18,9 +18,6 @@ By default, output is sent to stdout; or use -o. Use '-h' for parameter help.
"""
from __future__ import print_function
-# TODO: take fa as well?
-# support gzip option?
-
import screed
import sys
import os
@@ -56,7 +53,8 @@ def get_parser():
epilog=textwrap.dedent(epilog),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('infiles', nargs='+')
+ parser.add_argument('left')
+ parser.add_argument('right')
parser.add_argument('-o', '--output', metavar="filename",
type=argparse.FileType('w'),
default=sys.stdout)
@@ -71,23 +69,12 @@ def main():
info('interleave-reads.py')
args = get_parser().parse_args()
- for _ in args.infiles:
- check_input_files(_, args.force)
-
- check_space(args.infiles, args.force)
-
- s1_file = args.infiles[0]
- if len(args.infiles) == 2:
- s2_file = args.infiles[1]
- else:
- s2_file = s1_file.replace('_R1_', '_R2_')
- if s1_file == s2_file:
- print(("ERROR: given only one filename, that "
- "doesn't contain _R1_. Exiting."), file=sys.stderr)
- sys.exit(1)
+ check_input_files(args.left, args.force)
+ check_input_files(args.right, args.force)
+ check_space([args.left, args.right], args.force)
- print(("given only one file; "
- "guessing that R2 file is %s" % s2_file), file=sys.stderr)
+ s1_file = args.left
+ s2_file = args.right
fail = False
if not os.path.exists(s1_file):
@@ -104,8 +91,8 @@ def main():
print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)
counter = 0
- screed_iter_1 = screed.open(s1_file, parse_description=False)
- screed_iter_2 = screed.open(s2_file, parse_description=False)
+ screed_iter_1 = screed.open(s1_file)
+ screed_iter_2 = screed.open(s2_file)
for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
if read1 is None or read2 is None:
print(("ERROR: Input files contain different number"
diff --git a/scripts/load-graph.py b/scripts/load-graph.py
index 999403e..0b8c334 100755
--- a/scripts/load-graph.py
+++ b/scripts/load-graph.py
@@ -5,7 +5,7 @@
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
-# pylint: disable=invalid-name,missing-docstring
+# pylint: disable=missing-docstring
"""
Build a graph from the given sequences, save in <ptname>.
@@ -13,16 +13,10 @@ Build a graph from the given sequences, save in <ptname>.
Use '-h' for parameter help.
"""
-from __future__ import print_function, unicode_literals
import sys
-import threading
-import khmer
from khmer.khmer_args import build_hashbits_args
-from khmer.khmer_args import (report_on_config, info, add_threading_args)
-from khmer.kfile import check_input_files, check_space
-from khmer.kfile import check_space_for_hashtable
from oxli import build_graph
@@ -35,9 +29,7 @@ def get_parser():
if __name__ == '__main__':
- parser = get_parser()
- args = parser.parse_args()
- build_graph.main(args)
+ build_graph.main(get_parser().parse_args())
sys.exit(0)
# vim: set ft=python ts=4 sts=4 sw=4 et tw=79:
diff --git a/scripts/load-into-counting.py b/scripts/load-into-counting.py
index f907c36..287d7de 100755
--- a/scripts/load-into-counting.py
+++ b/scripts/load-into-counting.py
@@ -22,9 +22,9 @@ import textwrap
import khmer
from khmer import khmer_args
from khmer.khmer_args import build_counting_args, report_on_config, info,\
- add_threading_args
+ add_threading_args, calculate_tablesize
from khmer.kfile import check_file_writable
-from khmer.kfile import check_input_files, check_space
+from khmer.kfile import check_input_files
from khmer.kfile import check_space_for_hashtable
@@ -64,8 +64,6 @@ def get_parser():
metavar="FORMAT", choices=[str('json'), str('tsv')],
help="What format should the machine readable run "
"summary be in? (json or tsv, disabled by default)")
- parser.add_argument('--report-total-kmers', '-t', action='store_true',
- help="Prints the total number of k-mers to stderr")
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
return parser
@@ -84,8 +82,9 @@ def main():
for name in args.input_sequence_filename:
check_input_files(name, args.force)
- check_space(args.input_sequence_filename, args.force)
- check_space_for_hashtable(args, 'countgraph', args.force)
+ tablesize = calculate_tablesize(args, 'countgraph')
+ check_space_for_hashtable(args.output_countingtable_filename, tablesize,
+ args.force)
check_file_writable(base)
check_file_writable(base + ".info")
@@ -124,7 +123,8 @@ def main():
thread.join()
if index > 0 and index % 10 == 0:
- check_space_for_hashtable(args, 'countgraph', args.force)
+ tablesize = calculate_tablesize(args, 'countgraph')
+ check_space_for_hashtable(base, tablesize, args.force)
print('mid-save', base, file=sys.stderr)
htable.save(base)
@@ -133,10 +133,9 @@ def main():
total_num_reads += rparser.num_reads
n_kmers = htable.n_unique_kmers()
- if args.report_total_kmers:
- print('Total number of unique k-mers:', n_kmers, file=sys.stderr)
- with open(base + '.info', 'a') as info_fp:
- print('Total number of unique k-mers:', n_kmers, file=info_fp)
+ print('Total number of unique k-mers:', n_kmers, file=sys.stderr)
+ with open(base + '.info', 'a') as info_fp:
+ print('Total number of unique k-mers:', n_kmers, file=info_fp)
print('saving', base, file=sys.stderr)
htable.save(base)
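The disk-space check no longer receives the whole argparse namespace: `calculate_tablesize` first turns the table parameters into a byte count, and `check_space_for_hashtable` then checks that count against a concrete output path. A hedged, POSIX-only sketch of what such a check does (khmer's real implementation in khmer/kfile.py may differ in detail):

    import os

    def check_space_for_hashtable(outfile_name, required_bytes, force=False):
        # Compare the projected table size against free space on the
        # filesystem that will hold the output file.
        target_dir = os.path.dirname(os.path.abspath(outfile_name))
        stats = os.statvfs(target_dir)
        free_bytes = stats.f_frsize * stats.f_bavail
        if required_bytes > free_bytes and not force:
            raise SystemExit('not enough free space for ' + outfile_name)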
diff --git a/scripts/make-initial-stoptags.py b/scripts/make-initial-stoptags.py
index 29a08ef..e99a690 100755
--- a/scripts/make-initial-stoptags.py
+++ b/scripts/make-initial-stoptags.py
@@ -18,7 +18,7 @@ import textwrap
import khmer
from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, info)
-from khmer.kfile import check_input_files, check_space
+from khmer.kfile import check_input_files
DEFAULT_SUBSET_SIZE = int(1e4)
DEFAULT_COUNTING_HT_SIZE = 3e6 # number of bytes
@@ -83,8 +83,6 @@ def main():
for _ in infiles:
check_input_files(_, args.force)
- check_space(infiles, args.force)
-
print('loading htable %s.pt' % graphbase, file=sys.stderr)
htable = khmer.load_hashbits(graphbase + '.pt')
diff --git a/scripts/normalize-by-median.py b/scripts/normalize-by-median.py
index 3bb2ba7..68e25b1 100755
--- a/scripts/normalize-by-median.py
+++ b/scripts/normalize-by-median.py
@@ -5,7 +5,7 @@
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
-# pylint: disable=invalid-name,missing-docstring
+# pylint: disable=missing-docstring
"""
Eliminate surplus reads.
@@ -26,72 +26,111 @@ import khmer
import textwrap
from khmer import khmer_args
from contextlib import contextmanager
-
+from oxli import functions as oxutils
from khmer.khmer_args import (build_counting_args, add_loadhash_args,
- report_on_config, info)
+ report_on_config, info, calculate_tablesize)
import argparse
from khmer.kfile import (check_space, check_space_for_hashtable,
check_valid_file_exists)
-from khmer.utils import write_record, check_is_pair, broken_paired_reader
+from khmer.utils import write_record, broken_paired_reader
DEFAULT_DESIRED_COVERAGE = 20
-def WithDiagnostics(ifilename, norm, reader, fp):
- """
- Generator/context manager to do boilerplate output of statistics using a
- Normalizer object.
+class WithDiagnostics(object):
"""
+ Generator/context manager to do boilerplate output of statistics.
- index = 0
+    Uses a Normalizer object.
+ """
- # per read diagnostic output
- for index, record in enumerate(norm(reader)):
+ def __init__(self, norm, report_fp=None, report_frequency=100000):
+ self.norm = norm
+ self.report_fp = report_fp
+ if report_fp:
+ report_fp.write('total,kept,f_kept\n')
- if norm.total % 100000 == 0:
- print('... kept {kept} of {total} or {perc:2}% so far'
- .format(kept=norm.total - norm.discarded,
- total=norm.total,
- perc=int(100. - norm.discarded /
- float(norm.total) * 100.)),
- file=sys.stderr)
+ self.total = 0
+ self.kept = 0
+
+ self.report_frequency = report_frequency
+ self.next_report_at = self.report_frequency
+ self.last_report_at = self.report_frequency
+
+ def __call__(self, reader, ifilename):
+ norm = self.norm
+ report_fp = self.report_fp
+
+ reads_start = self.total
+ total = self.total
+ kept = self.kept
+
+ try:
+ for _, is_paired, read0, read1 in reader:
+ if is_paired:
+ total += 2
+ else:
+ total += 1
+
+ # do diginorm
+ for record in norm(is_paired, read0, read1):
+ kept += 1
+ yield record
- print('... in file ' + ifilename, file=sys.stderr)
+ # report!
+ if total >= self.next_report_at:
+ self.next_report_at += self.report_frequency
+ self.last_report_at = total
+
+ perc_kept = kept / float(total)
+
+ print('... kept {kept} of {tot} or {perc_kept:.1%} so far'
+ .format(kept=kept, tot=total, perc_kept=perc_kept),
+ file=sys.stderr)
+ print('... in file ' + ifilename, file=sys.stderr)
+
+ if report_fp:
+ print("{total},{kept},{f_kept:.4}"
+ .format(total=total, f_kept=perc_kept,
+ kept=kept),
+ file=report_fp)
+ report_fp.flush()
+ finally:
+ self.total = total
+ self.kept = kept
+
+ # per file diagnostic output
+ if total == reads_start:
+ print('SKIPPED empty file ' + ifilename, file=sys.stderr)
+ else:
+ perc_kept = kept / float(total)
- yield record
+ print('DONE with {inp}; kept {kept} of {total} or {perc_kept:.1%}'
+ .format(inp=ifilename, kept=kept, total=total,
+ perc_kept=perc_kept),
+ file=sys.stderr)
- # per file diagnostic output
- if norm.total == 0:
- print('SKIPPED empty file ' + ifilename, file=sys.stderr)
- else:
- print('DONE with {inp}; kept {kept} of {total} or {perc:2}%'
- .format(inp=ifilename, kept=norm.total - norm.discarded,
- total=norm.total, perc=int(100. - norm.discarded /
- float(norm.total) * 100.)),
- file=sys.stderr)
+ # make sure there's at least one report per file, at the end of each
+ # file.
+ if report_fp and total != self.last_report_at:
+ perc_kept = kept / float(total)
- if fp:
- print("{total} {kept} {discarded}"
- .format(total=norm.total, kept=norm.total - norm.discarded,
- discarded=1. - (norm.discarded / float(norm.total))),
- file=fp)
- fp.flush()
+ print("{total},{kept},{f_kept:.4}"
+ .format(total=total, f_kept=perc_kept, kept=kept),
+ file=report_fp)
+ report_fp.flush()
class Normalizer(object):
- """
- Digital normalization algorithm.
- """
+
+ """Digital normalization algorithm."""
def __init__(self, desired_coverage, htable):
self.htable = htable
self.desired_coverage = desired_coverage
- self.total = 0
- self.discarded = 0
-
- def __call__(self, reader):
+ def __call__(self, is_paired, read0, read1):
"""
Actually does digital normalization - the core algorithm.
@@ -101,44 +140,33 @@ class Normalizer(object):
* if any read's median k-mer count is below desired coverage, keep all;
* consume and yield kept reads.
"""
-
desired_coverage = self.desired_coverage
- for index, is_paired, read0, read1 in reader:
- passed_filter = False
+ passed_filter = False
- self.total += 1
+ batch = []
+ batch.append(read0)
+ if read1 is not None:
+ batch.append(read1)
- if is_paired:
- self.total += 1
-
- batch = []
- batch.append(read0)
- if read1 is not None:
- batch.append(read1)
+ for record in batch:
+ seq = record.sequence.replace('N', 'A')
+ if not self.htable.median_at_least(seq, desired_coverage):
+ passed_filter = True
+ if passed_filter:
for record in batch:
seq = record.sequence.replace('N', 'A')
- if not self.htable.median_at_least(seq, desired_coverage):
- passed_filter = True
-
- if passed_filter:
- for record in batch:
- seq = record.sequence.replace('N', 'A')
- self.htable.consume(seq)
- yield record
- else:
- self.discarded += len(batch)
+ self.htable.consume(seq)
+ yield record
@contextmanager
-def CatchIOErrors(ifile, out, single_out, force, corrupt_files):
- """
- Context manager to do boilerplate handling of IOErrors.
- """
+def catch_io_errors(ifile, out, single_out, force, corrupt_files):
+ """Context manager to do boilerplate handling of IOErrors."""
try:
yield
- except (IOError, ValueError) as error:
+ except (IOError, OSError, ValueError) as error:
print('** ERROR: ' + str(error), file=sys.stderr)
print('** Failed on {name}: '.format(name=ifile), file=sys.stderr)
if not single_out:
@@ -172,10 +200,7 @@ def get_parser():
With :option:`-s`/:option:`--savetable`, the k-mer counting table
will be saved to the specified file after all sequences have been
- processed. With :option:`-d`, the k-mer counting table will be
- saved every d files for multifile runs; if :option:`-s` is set,
- the specified name will be used, and if not, the name `backup.ct`
- will be used. :option:`-l`/:option:`--loadtable` will load the
+ processed. :option:`-l`/:option:`--loadtable` will load the
specified k-mer counting table before processing the specified
 files. Note that these tables are in the same format as those
produced by :program:`load-into-counting.py` and consumed by
@@ -225,10 +250,13 @@ def get_parser():
help='save the k-mer counting table to disk after all'
'reads are loaded.')
parser.add_argument('-R', '--report',
- metavar='filename', type=argparse.FileType('w'))
+ metavar='report_filename', type=argparse.FileType('w'))
+ parser.add_argument('--report-frequency',
+ metavar='report_frequency', type=int,
+ default=100000)
parser.add_argument('-f', '--force', dest='force',
- help='continue on next file if read errors are \
- encountered', action='store_true')
+ help='continue past file reading errors',
+ action='store_true')
parser.add_argument('-o', '--out', metavar="filename",
dest='single_output_file',
type=argparse.FileType('w'),
@@ -243,6 +271,7 @@ def get_parser():
def main(): # pylint: disable=too-many-branches,too-many-statements
+
info('normalize-by-median.py', ['diginorm'])
args = get_parser().parse_args()
@@ -251,6 +280,9 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
report_fp = args.report
force_single = args.force_single
+ # if optimization args are given, do optimization
+ args = oxutils.do_sanity_checking(args, 0.1)
+
# check for similar filenames
# if we're using a single output file only check for identical filenames
# otherwise, check for identical BASE names as well.
@@ -274,28 +306,32 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
check_valid_file_exists(args.input_filenames)
check_space(args.input_filenames, args.force)
if args.savetable:
- check_space_for_hashtable(args, 'countgraph', args.force)
+ tablesize = calculate_tablesize(args, 'countgraph')
+ check_space_for_hashtable(args.savetable, tablesize, args.force)
# load or create counting table.
if args.loadtable:
print('loading k-mer counting table from ' + args.loadtable,
file=sys.stderr)
htable = khmer.load_counting_hash(args.loadtable)
+ if args.unique_kmers != 0:
+ print('Warning: You have specified a number of unique kmers'
+ ' but are loading a precreated counting table--'
+ 'argument optimization will NOT be done.', file=sys.stderr)
else:
print('making countgraph', file=sys.stderr)
htable = khmer_args.create_countgraph(args)
- input_filename = None
-
# create an object to handle diginorm of all files
norm = Normalizer(args.cutoff, htable)
+ with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)
# make a list of all filenames and if they're paired or not;
# if we don't know if they're paired, default to allowing but not
# forcing pairing.
files = []
- for e in filenames:
- files.append([e, args.paired])
+ for element in filenames:
+ files.append([element, args.paired])
if args.unpaired_reads:
files.append([args.unpaired_reads, False])
@@ -309,6 +345,11 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
else:
output_name = args.single_output_file.name
outfp = args.single_output_file
+ else:
+ if '-' in filenames or '/dev/stdin' in filenames:
+ print("Accepting input from stdin; output filename must "
+ "be provided with '-o'.", file=sys.stderr)
+ sys.exit(1)
#
# main loop: iterate over all files given, do diginorm.
@@ -320,16 +361,16 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
outfp = open(output_name, 'w')
# failsafe context manager in case an input file breaks
- with CatchIOErrors(filename, outfp, args.single_output_file,
- args.force, corrupt_files):
+ with catch_io_errors(filename, outfp, args.single_output_file,
+ args.force, corrupt_files):
- screed_iter = screed.open(filename, parse_description=False)
+ screed_iter = screed.open(filename)
reader = broken_paired_reader(screed_iter, min_length=args.ksize,
force_single=force_single,
require_paired=require_paired)
# actually do diginorm
- for record in WithDiagnostics(filename, norm, reader, report_fp):
+ for record in with_diagnostics(reader, filename):
if record is not None:
write_record(record, outfp)
@@ -348,7 +389,7 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
htable.save(args.savetable)
fp_rate = \
- khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
+ khmer.calc_expected_collisions(htable, False, max_false_pos=.8)
# for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
@@ -356,7 +397,8 @@ def main(): # pylint: disable=too-many-branches,too-many-statements
if args.force and len(corrupt_files) > 0:
print("** WARNING: Finished with errors!", file=sys.stderr)
- print("** IOErrors occurred in the following files:", file=sys.stderr)
+ print("** I/O Errors occurred in the following files:",
+ file=sys.stderr)
print("\t", " ".join(corrupt_files), file=sys.stderr)
diff --git a/scripts/oxli b/scripts/oxli
new file mode 100755
index 0000000..74d0bf5
--- /dev/null
+++ b/scripts/oxli
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+#
+# This file is a part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) The Regents of the University of California, 2015. It is
+# licensed under the three-clause BSD license; see doc/LICENSE.txt.
+# Contact: khmer-project at idyll.org
+#
+
+__requires__ = 'khmer'
+import sys
+from pkg_resources import load_entry_point
+
+if __name__ == '__main__':
+ sys.exit(
+ load_entry_point('khmer', 'console_scripts', 'oxli')()
+ )
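The new scripts/oxli file is a stock pkg_resources console-script launcher: it looks up the 'oxli' entry point that khmer's packaging registers and runs it. For context, the matching registration looks roughly like this (the target `oxli:main` is an assumption for illustration; the authoritative stanza lives in setup.py):

    # Hypothetical setup() excerpt that a launcher like this resolves.
    from setuptools import setup

    setup(
        name='khmer',
        entry_points={
            'console_scripts': ['oxli = oxli:main'],
        },
    )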
diff --git a/scripts/partition-graph.py b/scripts/partition-graph.py
index 73666e2..bb03c0b 100755
--- a/scripts/partition-graph.py
+++ b/scripts/partition-graph.py
@@ -24,7 +24,7 @@ import argparse
import khmer
import sys
from khmer.khmer_args import (add_threading_args, info)
-from khmer.kfile import check_input_files, check_space
+from khmer.kfile import check_input_files
# Debugging Support
import re
@@ -111,8 +111,6 @@ def main():
for _ in filenames:
check_input_files(_, args.force)
- check_space(filenames, args.force)
-
print('--', file=sys.stderr)
print('SUBSET SIZE', args.subset_size, file=sys.stderr)
print('N THREADS', args.threads, file=sys.stderr)
diff --git a/scripts/readstats.py b/scripts/readstats.py
index d8e995f..570d8cf 100755
--- a/scripts/readstats.py
+++ b/scripts/readstats.py
@@ -47,9 +47,10 @@ def get_parser():
return parser
-class StatisticsOutput(object):
- # pylint: disable=too-few-public-methods
- """Output statistics for several data files.
+class StatisticsOutput(object): # pylint: disable=too-few-public-methods
+
+ """
+ Output statistics for several data files.
The format of the output is determined by the formatter used.
All statistics are aggregated and a summary is added to the data.
@@ -63,8 +64,7 @@ class StatisticsOutput(object):
return self
def append(self, basepairs, seqs, filename):
- """Append a new line for the given basepair number, sequences and file.
- """
+ """Append a new line for the given basepair num, sequences and file."""
self.formatter.append(
basepairs, seqs, basepairs / float(seqs), filename)
@@ -74,7 +74,9 @@ class StatisticsOutput(object):
class CsvFormatter(object):
+
"""Format the statistis information as CSV."""
+
headers = ['bp', 'seqs', 'avg_len', 'filename']
def __init__(self, underlying_file):
@@ -89,12 +91,12 @@ class CsvFormatter(object):
self.file.writerow([basepairs, seqs, "%.1f" % avg_len, filename])
def finalize(self):
- """No statistics since the CSV data is supposed to be processed further.
- """
+ """No statistics since the CSV data is to be processed further."""
pass
class StdFormatter(object):
+
"""Format the statistics in a human readable string."""
def __init__(self, underlying_file):
@@ -128,7 +130,7 @@ def analyze_file(filename):
"""Run over the given file and count base pairs and sequences."""
bps = 0
seqs = 0
- input_iter = screed.open(filename, parse_description=False)
+ input_iter = screed.open(filename)
for record in input_iter:
if seqs % 100000 == 0:
print('...', filename, seqs, file=sys.stderr)
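StatisticsOutput aggregates per-file counts and delegates presentation to whichever formatter it is constructed with; CsvFormatter and StdFormatter share the append/finalize interface. A usage sketch assuming the constructor takes the formatter directly, with placeholder values:

    import sys

    # The formatter is injected; swapping CsvFormatter for StdFormatter
    # changes only the presentation.
    with StatisticsOutput(CsvFormatter(sys.stdout)) as stats:
        stats.append(1200, 10, 'reads.fa')  # bp, seqs, filename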
diff --git a/scripts/sample-reads-randomly.py b/scripts/sample-reads-randomly.py
index 79b4777..72b5bed 100755
--- a/scripts/sample-reads-randomly.py
+++ b/scripts/sample-reads-randomly.py
@@ -27,7 +27,7 @@ import textwrap
import sys
import khmer
-from khmer.kfile import check_input_files, check_space
+from khmer.kfile import check_input_files
from khmer.khmer_args import info
from khmer.utils import write_record, broken_paired_reader
@@ -85,8 +85,6 @@ def main():
for _ in args.filenames:
check_input_files(_, args.force)
- check_space(args.filenames, args.force)
-
# seed the random number generator?
if args.random_seed:
random.seed(args.random_seed)
@@ -104,10 +102,16 @@ def main():
sys.stderr.write(
"Error: cannot specify -o with more than one sample.")
if not args.force:
+ print("NOTE: This can be overridden using the --force"
+ " argument", file=sys.stderr)
sys.exit(1)
output_filename = output_file.name
else:
filename = args.filenames[0]
+ if filename in ('/dev/stdin', '-'):
+ print("Accepting input from stdin; output filename must "
+ "be provided with '-o'.", file=sys.stderr)
+ sys.exit(1)
output_filename = os.path.basename(filename) + '.subset'
if num_samples == 1:
@@ -131,7 +135,7 @@ def main():
# read through all the sequences and load/resample the reservoir
for filename in args.filenames:
print('opening', filename, 'for reading', file=sys.stderr)
- screed_iter = screed.open(filename, parse_description=False)
+ screed_iter = screed.open(filename)
for count, (_, ispair, rcrd1, rcrd2) in enumerate(broken_paired_reader(
screed_iter,
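The loop above feeds read pairs into a reservoir sampler: the first N records fill the reservoir, after which record i replaces a random slot with probability N/i. The technique in isolation (plain Python, independent of khmer):

    import random

    def reservoir_sample(items, num_samples):
        reservoir = []
        for count, item in enumerate(items):
            if count < num_samples:
                reservoir.append(item)
            else:
                # item survives with probability num_samples / (count + 1)
                guess = random.randint(0, count)
                if guess < num_samples:
                    reservoir[guess] = item
        return reservoir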
diff --git a/scripts/split-paired-reads.py b/scripts/split-paired-reads.py
index e9eac94..b77144a 100755
--- a/scripts/split-paired-reads.py
+++ b/scripts/split-paired-reads.py
@@ -98,10 +98,10 @@ def main():
check_space(filenames, args.force)
# decide where to put output files - specific directory? or just default?
- if infile == '/dev/stdin' or infile == '-':
+ if infile in ('/dev/stdin', '-'):
if not (args.output_first and args.output_second):
- print >>sys.stderr, ("Accepting input from stdin; "
- "output filenames must be provided.")
+ print("Accepting input from stdin; output filenames must "
+ "be provided.", file=sys.stderr)
sys.exit(1)
elif args.output_directory:
if not os.path.exists(args.output_directory):
@@ -130,7 +130,7 @@ def main():
counter2 = 0
index = None
- screed_iter = screed.open(infile, parse_description=False)
+ screed_iter = screed.open(infile)
# walk through all the reads in broken-paired mode.
paired_iter = broken_paired_reader(screed_iter)
@@ -165,8 +165,8 @@ def main():
print("DONE; split %d sequences (%d left, %d right)" %
(counter1 + counter2, counter1, counter2), file=sys.stderr)
- print("/1 reads in %s" % out1, file=sys.stderr)
- print("/2 reads in %s" % out2, file=sys.stderr)
+ print("left (/1) reads in %s" % out1, file=sys.stderr)
+ print("right (/2) reads in %s" % out2, file=sys.stderr)
if __name__ == '__main__':
main()
diff --git a/scripts/trim-low-abund.py b/scripts/trim-low-abund.py
index 741b181..ddfc43a 100755
--- a/scripts/trim-low-abund.py
+++ b/scripts/trim-low-abund.py
@@ -28,7 +28,7 @@ from screed import Record
from khmer import khmer_args
from khmer.khmer_args import (build_counting_args, info, add_loadhash_args,
- report_on_config)
+ report_on_config, calculate_tablesize)
from khmer.utils import write_record, write_record_pair, broken_paired_reader
from khmer.kfile import (check_space, check_space_for_hashtable,
check_valid_file_exists)
@@ -126,7 +126,14 @@ def main():
check_valid_file_exists(args.input_filenames)
check_space(args.input_filenames, args.force)
if args.savetable:
- check_space_for_hashtable(args, 'countgraph', args.force)
+ tablesize = calculate_tablesize(args, 'countgraph')
+ check_space_for_hashtable(args.savetable, tablesize, args.force)
+
+ if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
+ and not args.out:
+ print("Accepting input from stdin; output filename must "
+ "be provided with -o.", file=sys.stderr)
+ sys.exit(1)
if args.loadtable:
print('loading countgraph from', args.loadtable, file=sys.stderr)
@@ -164,7 +171,7 @@ def main():
pass2list.append((filename, pass2filename, trimfp))
- screed_iter = screed.open(filename, parse_description=False)
+ screed_iter = screed.open(filename)
pass2fp = open(pass2filename, 'w')
save_pass2 = 0
@@ -260,8 +267,7 @@ def main():
# so pairs will stay together if not orphaned. This is in contrast
# to the first loop.
- for n, read in enumerate(screed.open(pass2filename,
- parse_description=False)):
+ for n, read in enumerate(screed.open(pass2filename)):
if n % 10000 == 0:
print('... x 2', n, pass2filename,
written_reads, written_bp, file=sys.stderr)
@@ -301,14 +307,18 @@ def main():
percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
n_reads * 100.0
- print('read %d reads, %d bp' % (n_reads, n_bp,))
- print('wrote %d reads, %d bp' % (written_reads, written_bp,))
+ print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
+ print('wrote %d reads, %d bp' % (written_reads, written_bp,),
+ file=sys.stderr)
print('looked at %d reads twice (%.2f passes)' % (save_pass2_total,
- n_passes))
+ n_passes),
+ file=sys.stderr)
print('removed %d reads and trimmed %d reads (%.2f%%)' %
- (n_reads - written_reads, trimmed_reads, percent_reads_trimmed))
+ (n_reads - written_reads, trimmed_reads, percent_reads_trimmed),
+ file=sys.stderr)
print('trimmed or removed %.2f%% of bases (%d total)' %
- ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp))
+ ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
+ file=sys.stderr)
if args.variable_coverage:
percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
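Routing the run statistics to stderr keeps stdout free for sequence data, so the trimmer can sit inside a shell pipeline like the ones shown for unique-kmers.py above. An illustration only — whether '-o -' maps to stdout depends on the script's argparse setup, so treat the exact flags as an assumption:

    # stats on stderr, trimmed reads on stdout:
    trim-low-abund.py -k 20 -o - reads.fq | gzip > reads.trimmed.fq.gz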
diff --git a/sandbox/unique-kmers.py b/scripts/unique-kmers.py
similarity index 63%
rename from sandbox/unique-kmers.py
rename to scripts/unique-kmers.py
index aa78d88..ddf854c 100755
--- a/sandbox/unique-kmers.py
+++ b/scripts/unique-kmers.py
@@ -5,11 +5,12 @@
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
-# pylint: disable=invalid-name,missing-docstring
+# pylint: disable=invalid-name,missing-docstring,no-member
"""
Estimate number of unique k-mers, with precision <= ERROR_RATE.
-% python sandbox/unique-kmers.py [ -k <k size> ] [ -e <ERROR_RATE> ] <data1> <data2> ...
+% python scripts/unique-kmers.py [ -k <k size> ] [ -e <ERROR_RATE> ] <data1>
+<data2> ...
Use '-h' for parameter help.
"""
@@ -22,24 +23,35 @@ import sys
import textwrap
import khmer
-from khmer.khmer_args import DEFAULT_K, info, ComboFormatter
+from khmer.khmer_args import (DEFAULT_K, info, ComboFormatter,
+ _VersionStdErrAction)
+from khmer.utils import write_record
from oxli.functions import optimal_args_output_gen as output_gen
from khmer import __version__
import screed
+
def get_parser():
descr = "Estimate number of unique k-mers, with precision <= ERROR_RATE."
epilog = ("""
A HyperLogLog counter is used to do cardinality estimation. Since this counter
is based on a tradeoff between precision and memory consumption,
- :option:`-e`/:option:`--error-rate` can be used to control how much
+    the :option:`-e`/:option:`--error-rate` option can be used to control how much
memory will be used. In practice the memory footprint is small even
at low error rates (< 0.01).
:option:`-k`/:option:`--ksize` should be set to the desired k-mer size.
- Output is sent to STDOUT, but a report file can be generated with
- :option:`-R`/:option:`--report`.
+ Informational output is sent to STDERR, but a report file can be generated
+ with :option:`-R`/:option:`--report`.
+
+ :option:`--stream-out` will write the sequences taken in to STDOUT.
+ This is useful for workflows: count unique kmers in a stream, then do
+ digital normalization.
+
+ :option:`--diagnostics` will provide detailed options for tablesize
+ and memory limitations for various false positive rates. This is useful for
+ configuring other khmer scripts. This will be written to STDERR.
Example::
@@ -47,15 +59,26 @@ def get_parser():
Example::
-""" " unique-kmers.py -R unique_count -k 30 tests/test-data/test-abund-read-paired.fa") # noqa
+ unique-kmers.py -k 17 --diagnostics tests/test-data/test-abund-read.fa
+
+ Example::
+
+ unique-kmers.py --stream-out -k 17 tests/test-data/test-reads.fa | \\
+ normalize-by-median.py -k 17 -o normalized /dev/stdin
+
+ Example::
+
+ unique-kmers.py -R unique_count -k 30 \\
+ tests/test-data/test-abund-read-paired.fa""") # noqa
parser = argparse.ArgumentParser(
description=descr, epilog=textwrap.dedent(epilog),
formatter_class=ComboFormatter)
env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
- parser.add_argument('--version', action='version',
+ parser.add_argument('--version', action=_VersionStdErrAction,
version='khmer {v}'.format(v=__version__))
+
parser.add_argument('-q', '--quiet', dest='quiet', default=False,
action='store_true')
@@ -65,15 +88,21 @@ def get_parser():
parser.add_argument('--error-rate', '-e', type=float, default=0.01,
help='Acceptable error rate')
- parser.add_argument('-R', '--report',
- metavar='filename', type=argparse.FileType('w'))
+ parser.add_argument('--report', '-R',
+ metavar='filename', type=argparse.FileType('w'),
+ help='generate informational report and write to'
+ ' filename')
parser.add_argument('--stream-out', '-S', default=False,
- action='store_true')
+ action='store_true',
+ help='write input sequences to STDOUT')
- parser.add_argument('input_filenames', metavar='input_sequence_filename',
- help='Input FAST[AQ] sequence filename.', nargs='+')
+ parser.add_argument('--diagnostics', default=False, action='store_true',
+ help='print out recommended tablesize arguments and '
+ 'restrictions')
+ parser.add_argument('input_filenames', metavar='input_sequence_filename',
+ help='Input FAST[AQ] sequence filename(s).', nargs='+')
return parser
@@ -108,9 +137,10 @@ def main():
print('Total estimated number of unique {0}-mers: {1}'.format(
args.ksize, cardinality),
file=sys.stderr)
-
+
to_print = output_gen(cardinality, args.error_rate)
- print(to_print)
+ if args.diagnostics:
+ print(to_print, file=sys.stderr)
if report_fp:
print(cardinality, args.ksize, 'total', file=report_fp)
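unique-kmers.py estimates distinct k-mer counts with khmer's HyperLogLog counter, trading a bounded error rate for near-constant memory. A sketch of the underlying API — constructor and method names as I recall the khmer 2.0 bindings, so treat the exact signatures as assumptions:

    import khmer

    # ~1% target error rate, k = 20
    hll = khmer.HLLCounter(0.01, 20)
    hll.consume_string('ACGTACGTACGTACGTACGTACGT')
    print(hll.estimate_cardinality())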
diff --git a/setup.py b/setup.py
index c9d785e..d69c7b8 100755
--- a/setup.py
+++ b/setup.py
@@ -106,7 +106,7 @@ BUILD_DEPENDS.extend(path_join("lib", bn + ".hh") for bn in [
SOURCES = ["khmer/_khmer.cc"]
SOURCES.extend(path_join("lib", bn + ".cc") for bn in [
- "trace_logger", "perf_metrics", "read_parsers", "kmer_hash", "hashtable",
+ "read_parsers", "kmer_hash", "hashtable",
"hashbits", "labelhash", "counting", "subset", "read_aligner",
"hllcounter"])
diff --git a/tests/khmer_tst_utils.py b/tests/khmer_tst_utils.py
index 0cbe36c..13edeb3 100644
--- a/tests/khmer_tst_utils.py
+++ b/tests/khmer_tst_utils.py
@@ -15,6 +15,7 @@ import traceback
import subprocess
from io import open
+
try:
from StringIO import StringIO
except ImportError:
@@ -52,7 +53,27 @@ def cleanup():
cleanup_list = []
+def scriptpath(scriptname='interleave-reads.py'):
+ "Return the path to the scripts, in both dev and install situations."
+
+ # note - it doesn't matter what the scriptname is here, as long as
+ # it's some khmer script present in this version of khmer.
+
+ path = os.path.join(os.path.dirname(__file__), "../scripts")
+
+ if os.path.exists(os.path.join(path, scriptname)):
+ return path
+
+ for path in os.environ['PATH'].split(':'):
+ if os.path.exists(os.path.join(path, scriptname)):
+ return path
+
+
def _runscript(scriptname, sandbox=False):
+ """
+ Find & run a script with exec (i.e. not via os.system or subprocess).
+ """
+
import pkg_resources
ns = {"__name__": "__main__"}
ns['sys'] = globals()['sys']
@@ -63,17 +84,16 @@ def _runscript(scriptname, sandbox=False):
return 0
except pkg_resources.ResolutionError as err:
if sandbox:
- paths = [os.path.join(os.path.dirname(__file__), "../sandbox")]
+ path = os.path.join(os.path.dirname(__file__), "../sandbox")
else:
- paths = [os.path.join(os.path.dirname(__file__),
- "../scripts")]
- paths.extend(os.environ['PATH'].split(':'))
- for path in paths:
- scriptfile = os.path.join(path, scriptname)
+ path = scriptpath()
+
+ scriptfile = os.path.join(path, scriptname)
+ if os.path.isfile(scriptfile):
if os.path.isfile(scriptfile):
exec(compile(open(scriptfile).read(), scriptfile, 'exec'), ns)
return 0
- if sandbox:
+ elif sandbox:
raise nose.SkipTest("sandbox tests are only run in a repository.")
return -1
@@ -84,7 +104,8 @@ def runscript(scriptname, args, in_directory=None,
"""Run a Python script using exec().
Run the given Python script, with the given args, in the given directory,
- using 'execfile'.
+ using 'exec'. Mimic proper shell functionality with argv, and capture
+ stdout and stderr.
When using :attr:`fail_ok`=False in tests, specify the expected error.
"""
@@ -131,48 +152,37 @@ def runscript(scriptname, args, in_directory=None,
return status, out, err
-def runscriptredirect(scriptname, args, stdinfilename, in_directory=None,
- fail_ok=False, sandbox=False):
- """Run a Python script using subprocess().
-
- Run the given Python script, with the given args, in the given directory,
- using 'subprocess'.
- """
+def run_shell_cmd(cmd, fail_ok=False, in_directory=None):
cwd = os.getcwd()
+ if in_directory:
+ os.chdir(in_directory)
- status = -1
+ print('running: ', cmd)
+ try:
+ p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ (out, err) = p.communicate()
- if sandbox:
- paths = [os.path.join(os.path.dirname(__file__), "../sandbox")]
- else:
- paths = [os.path.join(os.path.dirname(__file__), "../scripts")]
- paths.extend(os.environ['PATH'].split(':'))
- for path in paths:
- scriptfile = os.path.join(path, scriptname)
- if os.path.isfile(scriptfile):
- if in_directory:
- os.chdir(in_directory)
- sysargs = 'cat ' + stdinfilename + ' | python ' + scriptfile + \
- " " + args
- out = open(
- os.path.join(in_directory, "out"), 'w+', encoding='utf-8')
- err = open(
- os.path.join(in_directory, "err"), 'w+', encoding='utf-8')
- print('running:', scriptname, 'in:', in_directory)
- print('arguments', sysargs)
- status = subprocess.call(args=sysargs, stdout=out, stderr=err,
- shell=True)
- os.chdir(cwd)
- if status != 0 and not fail_ok:
- out.seek(0)
- out = out.read()
- err.seek(0)
- err = err.read()
- print(out)
- print(err)
- assert False, (status, out, err)
-
- return status, out, err
+ out = out.decode('utf-8')
+ err = err.decode('utf-8')
- if sandbox:
- raise nose.SkipTest("sandbox tests are only run in a repository.")
+ if p.returncode != 0 and not fail_ok:
+ print('out:', out)
+ print('err:', err)
+ raise AssertionError("exit code is non zero: %d" % p.returncode)
+
+ return (p.returncode, out, err)
+ finally:
+ os.chdir(cwd)
+
+
+def longify(listofints):
+ """List of ints => list of longs, only on py2.
+
+ Takes a list of numeric types, and returns longs on python2, or the
+ original list on python3.
+ """
+ # For map(long, [list of ints]) cross-version hackery
+ if sys.version_info.major < 3:
+ return map(long, listofints)
+ return listofints
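The old runscriptredirect helper gives way to a general-purpose run_shell_cmd that shells out through subprocess, decodes both streams, and restores the working directory even on failure. Typical use in a test:

    # A trivially portable command keeps the example self-checking.
    status, out, err = run_shell_cmd('echo hello')
    assert status == 0
    assert out.strip() == 'hello'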
diff --git a/tests/test-data/empty-file.bz2 b/tests/test-data/empty-file.bz2
new file mode 100644
index 0000000..b56f3b9
Binary files /dev/null and b/tests/test-data/empty-file.bz2 differ
diff --git a/tests/test-data/empty-file.gz b/tests/test-data/empty-file.gz
new file mode 100644
index 0000000..0fad667
Binary files /dev/null and b/tests/test-data/empty-file.gz differ
diff --git a/tests/test-data/paired-broken4.fq.1 b/tests/test-data/paired-broken4.fq.1
new file mode 100644
index 0000000..a13d9b4
--- /dev/null
+++ b/tests/test-data/paired-broken4.fq.1
@@ -0,0 +1,4 @@
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:2000/1
+NCTACCAAAAAAATGCCCGATAATTCTGACCATTCCTTCCTCATTCTCGTCTGGCGTTTGGTCACGACGCACGATACCTTCTGCACTTGTCAAGACAGCGG
++
+#00@#################################################################################################
diff --git a/tests/test-data/paired-broken4.fq.2 b/tests/test-data/paired-broken4.fq.2
new file mode 100644
index 0000000..b37161b
--- /dev/null
+++ b/tests/test-data/paired-broken4.fq.2
@@ -0,0 +1,4 @@
+@SRR797058.3 HWI-ST600:227:C0WR4ACXX:7:1101:17167:9999/2
+CTTGACAAGAGCAGAAGTTATCTTGCCTCGGGACCAAACGCCAGACGAGCACGAGGGAGCGATCGTCCGCATTAGCCGGCATTCTTTTGCTAGCAGATCGG
++
+=?###################################################################################################
diff --git a/tests/test-data/paired.fq.2 b/tests/test-data/paired.fq.2
index 4b0ed7b..4ff9afa 100644
--- a/tests/test-data/paired.fq.2
+++ b/tests/test-data/paired.fq.2
@@ -10,4 +10,3 @@ GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCC
GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGCAGCCGCAGCTGTCGTCAGGGGATTTCCGGGGCGGAGGCCGCAGACGCGAGTGGTGGAGGGAGAAGGCCTGACG
+
##################################################################################################################
-
diff --git a/tests/test-data/readaligner-default.json b/tests/test-data/readaligner-default.json
new file mode 100644
index 0000000..c97e8da
--- /dev/null
+++ b/tests/test-data/readaligner-default.json
@@ -0,0 +1,50 @@
+{
+ "scoring_matrix": [
+ -0.06642736173897607,
+ -4.643856189774724,
+ -7.965784284662087,
+ -9.965784284662087
+ ],
+ "transition_probabilities": [
+ [
+ -0.021973842014145723,
+ -13.73189622448781,
+ -14.869792371737484,
+ -6.058239815501842,
+ -19.166033822961197,
+ -21.66853416349038
+ ],
+ [
+ -0.9444728000497686,
+ -1.105331993785005,
+ -7.40360292819022,
+ -6.690896473084504
+ ],
+ [
+ -0.3937937393192493,
+ -2.123673467366609,
+ -7.104364821496794,
+ -8.864604875515933
+ ],
+ [
+ -3.645644436080496,
+ -15.220073662674086,
+ -12.411146320797728,
+ -0.1263680454390087,
+ -8.227232598141855,
+ -11.226627458948142
+ ],
+ [
+ -2.8013509614037972,
+ -8.078453985883888,
+ -1.9197909720107271,
+ -0.7647513448614925
+ ],
+ [
+ -2.8525098984653257,
+ -4.535070816966942,
+ -0.6522388852285496,
+ -2.457038730417613
+ ]
+ ]
+}
\ No newline at end of file
diff --git a/tests/test-data/readaligner-k12.json b/tests/test-data/readaligner-k12.json
new file mode 100644
index 0000000..96e594f
--- /dev/null
+++ b/tests/test-data/readaligner-k12.json
@@ -0,0 +1,50 @@
+{
+ "scoring_matrix": [
+ -0.06642736173897607,
+ -4.643856189774724,
+ -7.965784284662087,
+ -9.965784284662087
+ ],
+ "transition_probabilities": [
+ [
+ -0.026682951271565506,
+ -9.914804535510426,
+ -12.141787036607257,
+ -10.397090021035718,
+ -18.93814224957318,
+ -21.192698501482667
+ ],
+ [
+ -1.3469970008036818,
+ -0.8268851828081922,
+ -10.50575629769457,
+ -9.368461466993008
+ ],
+ [
+ -0.4341897229177519,
+ -2.230759666245372,
+ -9.828796523112176,
+ -11.63615144516978
+ ],
+ [
+ -0.015678000180557785,
+ -13.181473677755502,
+ -14.67136517171856,
+ -6.70130751161339,
+ -15.726469771159012,
+ -18.409279595278313
+ ],
+ [
+ -0.03221351929242559,
+ -6.011510033472284,
+ -8.44845478869957,
+ -8.089881097025156
+ ],
+ [
+ -0.026272943646111175,
+ -6.421067096717085,
+ -7.773052425460627,
+ -9.561548320266915
+ ]
+ ]
+}
diff --git a/tests/test-data/test-fastq-reads.fa b/tests/test-data/test-fastq-reads.fa
new file mode 100644
index 0000000..4bd6eca
--- /dev/null
+++ b/tests/test-data/test-fastq-reads.fa
@@ -0,0 +1,200 @@
+>895:1:1:1246:14654 1:N:0:NNNNN
+CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT
+>895:1:1:1248:9583 1:N:0:NNNNN
+ACTGGGCGTAGACGGTGTCCTCATCGGCACCAGC
+>895:1:1:1252:19493 1:N:0:NNNNN
+CCGGCGTGGTTGGTGAGGTCACTGAGCTTCATGTC
+>895:1:1:1255:18861 1:N:0:NNNNN
+ACGACGAGAAGCTGATCTACCGCGCCGAGCGCATC
+>895:1:1:1264:15854 1:N:0:NNNNN
+CGTGATGATGTGCTTGCGGCCGGAGGGCCTGTTGCCCAGG
+>895:1:1:1265:2265 1:N:0:NNNNN
+TATAGCGTGAGGCGATGACGTTGCTGTCCTTGGCGCGGC
+>895:1:1:1273:17782 1:N:0:NNNNN
+TCGAAAATCACGTGGGAGATGCACTATCACGCGGTCGGTGAGGAAGTGACCGACCACACCGAGCTCGC
+>895:1:1:1274:18571 1:N:0:NNNNN
+AGCAGGCGAACAGCACGCCGAACAATACTGTCTTCATGCCAAACTGCTGAAAGCCGAGCACAGCAGAAATGCTCCAGAG
+>895:1:1:1276:16426 1:N:0:NNNNN
+GCAGGTATTGGTTTGCCTAACGTTGAAATTGCAGGATTAACG
+>895:1:1:1283:17864 1:N:0:NNNNN
+ATTCGTCAACCCGCGGCTCGAGCTGCGCATCC
+>895:1:1:1287:13756 1:N:0:NNNNN
+AGGGGAAATCCATTTCAAAGCGTTCGTGATCACGATAGACCGTCACTAAGCCACTGACTGTATGGAAGCAAAC
+>895:1:1:1290:11501 1:N:0:NNNNN
+CCAATCACTGCGATCGGCGCACGGACCTTGGAGCCGGAGCAG
+>895:1:1:1294:5882 1:N:0:NNNNN
+GGCATTGACTATGGGATCCAAGCGATTGCACATG
+>895:1:1:1295:6189 1:N:0:NNNNN
+GTCTAATCTTCGAGCAACTCCACGCTGTAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA
+>895:1:1:1295:5208 1:N:0:NNNNN
+GCGCATCGCGCGGCTTGAGGACGCTGTCTTCAAGC
+>895:1:1:1296:1784 1:N:0:NNNNN
+GATGTTGAATTTTTACATCAACCGTGCCGGC
+>895:1:1:1296:6541 1:N:0:NNNNN
+AAACAGCTCTTCGGATTTAAAGATTACGCAGGACATCCGGCAGGCACTAGCGAAAGACAGGGAGCTGTCCACGACGGCGAAGCAC
+>895:1:1:1298:13380 1:N:0:NNNNN
+AGCTGAGCGGGCACATCTTCTTCAAGCACCGCTACTTCGGCTTCGATGACGCGG
+>895:1:1:1299:3449 1:N:0:NNNNN
+TGGCGCGCGCTTCAAATACGAAAACTTGAAGCTCGGCTGGAGGGC
+>895:1:1:1300:2738 1:N:0:NNNNN
+GTCCGTGCAAAGGGCACTACGTTTGAAGTCCCAGCAATTTCGGGCAGTTCACCGGCACTTT
+>895:1:1:1303:6251 2:N:0:NNNNN
+CGAGATTACAGGTGGGATTCCCGAACCGATATTTGGTTAACCTAAGGAGGCTGAG
+>895:1:1:1303:14389 2:N:0:NNNNN
+GTCGAGTCCGACAACCCGATCATGCAGCCGCGCCATGCCCGCTTGCCAG
+>895:1:1:1306:15360 2:N:0:NNNNN
+ACTTTACCTGCTCGGCACCTCCATCGCGCGACCTCTGATTGCCAA
+>895:1:1:1307:4308 2:N:0:NNNNN
+GAGTTAAAATGCCTGCCGCGTGCCCGGCCATGACAAAAATCTTTCTGCTTCGCACATCGC
+>895:1:1:1308:20421 2:N:0:NNNNN
+CAGATGCCCGGCTGCTTTTTCCGGATCCGAAATCCTATTATGGCATTCCGCTCGATCCGC
+>895:1:1:1308:2539 1:N:0:NNNNN
+TTCCGGCGCCACGCGGACAGATCACGGACCGCAACGGAGTGCCGCTGGCG
+>895:1:1:1308:2539 2:N:0:NNNNN
+CGGCAGCTTGTCCTTGATTCCCTCGTACTCCTGCGGCGTGAGATTTGGAGCGATCTCAAACGGCATG
+>895:1:1:1309:4153 2:N:0:NNNNN
+GGACGCGCACGCCGGTGAGGAACGACCCGCCCGCGCGGGCGTGGTCGCCGGGGCCGTCC
+>895:1:1:1312:10985 2:N:0:NNNNN
+GAGCGCGTCGAGCCCCGCGTGGTCGACCGCGAGCCGTTCGAGCGACCGCCAGTGGCTCCGG
+>895:1:1:1314:10430 2:N:0:NNNNN
+CCTGACTCTCGGCCTGGCGCTTACTTTCGCGCCTGGAAGCCTCACGACACAGCAGAAGTCTGCCTGGCGCAC
+>895:1:1:1318:10532 1:N:0:NNNNN
+TACGCTATCAACGGGTTCGGTCGTATCGGGCGCAACGTGCTTCGCGCCATGACGCAAAAACAAGTCGAGCGGGTTCGCGCAATCAATGACCTCACTGACACGCGCAC
+>895:1:1:1318:10532 2:N:0:NNNNN
+ATTCGAGCACAACATCGACATCCAATTTTTTCCACGGCAAATTCGCCGGGTCGCGCTCC
+>895:1:1:1320:11648 2:N:0:NNNNN
+AACGATTCGAACACGGTAATTGCCGTTTTCGGCGGCGACTTCGAAGGCGGCG
+>895:1:1:1325:8700 1:N:0:NNNNN
+CAGCTGCAGCAATTCGCCGTCGTGGGTTTCGATTGGGGCGGCCGCGCCGC
+>895:1:1:1326:7273 1:N:0:NNNNN
+TATCAATACTGGCAACGCAGGTGTTGCTGGCGAGGCCAACACTATCCGTATCGG
+>895:1:1:1326:7273 2:N:0:NNNNN
+GTGGGATGCCTTCGGGATCAACCTCTTTTTTATAACGGAACGTGACCGGTTTGAGCGCGTGAATA
+>895:1:1:1327:13028 2:N:0:NNNNN
+ACTAAGTGGCGTATACCCTTCCATTTCAACTCGCAGTTGATGGTTACCCTGAGCGAGAGACTCCAGCCGAAGCCGGGTTTGTCCCTTGAGTTCTCCATCAATAAAAACTCTGGC
+>895:1:1:1327:15301 2:N:0:NNNNN
+TCAGTCGTCGAACGCTATACCGGTCAACTCGTCTGTCGAGCCGG
+>895:1:1:1328:3708 2:N:0:NNNNN
+CCGAGGCTGAGGGCGAAAGGCAACAATGAGATCGTCGACATCGTT
+>895:1:1:1330:9540 2:N:0:NNNNN
+TCCTCCATAGAGTGTTCCGAAGTAAGGTGCCAACGGGTCTTCCTTCGGGAGACTTGCCTGGTATTTTGCCAGCTTCTGGTGAACCGAATCC
+>895:1:1:1331:1766 1:N:0:NNNNN
+TGCTGCTGATCGGCTGTGCCCACGAACTTTCTGGC
+>895:1:1:1333:2512 1:N:0:NNNNN
+TGGCGAGAGTCGCCTCAGGGCGGCATATCTTTGAGGCGAATGGGAATCGGTTTGCCCTGTTCCT
+>895:1:1:1333:2512 2:N:0:NNNNN
+CTTACGCCTTGTCTTTGAGCGCCGTGGAGCTCGGATGGCGACGCCTGCGC
+>895:1:1:1333:2445 2:N:0:NNNNN
+AGATTGTCGGAGCGCCAGTTCGAGCTGGCGTGCTCTGCCGCCCCCTATGT
+>895:1:1:1334:19532 2:N:0:NNNNN
+AGTACTACGTCATCGAATCGCAGAATCCGAAGATGCAGGGCTTCGACGGCGCCGTGAT
+>895:1:1:1335:19932 1:N:0:NNNNN
+GACTCACCTCACAGTCTTCCAGGCCCGCGAAAGCAATGATAGGGCAATCGAGAGGCTCTTCAGTCGCATAAGTATACGTCTCACAGATCG
+>895:1:1:1335:19932 2:N:0:NNNNN
+CTATCCACCAGTTACCCACAGCCGAGTTCGTGGGAGAACTGCGTCGTCTCAATGGTACATCAGAG
+>895:1:1:1336:4956 1:N:0:NNNNN
+GCTCGGCGAGGTGTTCCGCCAGCGCACGACTGCGGAATGGGTCGATCTCCTC
+>895:1:1:1336:4956 2:N:0:NNNNN
+CGGTACTCGATCGGCGTGCCCGAGAAGCGCATCGGGCTCGC
+>895:1:1:1338:15407 1:N:0:NNNNN
+AGAAGAAGTCCCAGACGTCGCCCACCACCGGCACCGAACCGCC
+>895:1:1:1338:7557 2:N:0:NNNNN
+ATGTTAAACTCCGGTCGAACGGCCTTGGCACGGGCG
+>895:1:1:1338:6614 2:N:0:NNNNN
+CACAGGTCACTCAGCGGCTTATACAGCTTGCTCCGCCTTTCGCTCGGTCGAGAACACGATCTTTCCGTCACGCCTCATGG
+>895:1:1:1340:19387 2:N:0:NNNNN
+ATTTCGCGATTCTTGTGCTGGCTGCGCTCCATTTGGCAGGCTACCACCACGCCAGTGGGAAGATGCGTGATGCGCAC
+>895:1:1:1342:20695 2:N:0:NNNNN
+CGAGATCATAGGTGCGTTCGGCTTGATCTGGGCGAGCTGGCTTTCGAGATAGAACTTGCGCTCGTCG
+>895:1:1:1342:11001 1:N:0:NNNNN
+ATAACGTTATTGGCTGTCGTGACACCGCTGCCGGC
+>895:1:1:1342:11001 2:N:0:NNNNN
+TCACTTGTGGAGCGAACACGGCCAGCGGTGCTCGCGCACTCGCTAGCAACACAAGCGGTGCCAAAA
+>895:1:1:1344:1968 1:N:0:NNNNN
+TGACCTTTTGTTTGAGCAGGGTTCCGACTGCCTGGACGTCCTGCTTCTTGACCGCCTGGATCAATGAAGACTCGGCCG
+>895:1:1:1347:8723 1:N:0:NNNNN
+ACGGATCGATGCCGCGAGCGAGGAGGCGATCGACGAGCCGGTCATAGAAGTCCAGGCCTGCCGGGTTCACCGCGCCC
+>895:1:1:1347:8723 2:N:0:NNNNN
+GACGGCCGAGGGCCCTGCATCTGGGACAACTTTGT
+>895:1:1:1347:3237 1:N:0:NNNNN
+ACCACGTTCTTGACCTGCGCCTTGCCAACAGCCG
+>895:1:1:1347:3237 2:N:0:NNNNN
+TGGCGGTCGATCCCAAGCTCTACGAGGGCAGCTGGAGCGG
+>895:1:1:1348:18672 2:N:0:NNNNN
+AGAATAATTTCATAATCGGGATAGTTCAGCCTGCCTAAAGAATCCAGGC
+>895:1:1:1348:1257 1:N:0:NNNNN
+ATCGAATCGGATCTGGCAGCTTTTGCCAGTTGGTCGGCGG
+>895:1:1:1349:15165 2:N:0:NNNNN
+GAGCCGGGGGCCGAAGAAGCTGCGTTCCGTCGCCGGCAGTGGCTC
+>895:1:1:1349:21156 1:N:0:NNNNN
+ACCGCAAGTCGGCGACCGGCAAGTACGAAAAGACC
+>895:1:1:1349:13984 1:N:0:NNNNN
+GAAAGGATCAACTGCCGCTTCAAGTTCCAGAAACGCCGTCAGCTTTTCGTCGGCGCGCACAACGCAACGCTTCCC
+>895:1:1:1351:14718 1:N:0:NNNNN
+AGCGAACGAAGAGCCAAAAAAGCTCCACCTCTCGACAAG
+>895:1:1:1351:14718 2:N:0:NNNNN
+TTCAAAAGTTTCAGTGTGCAAGCGCCGTCGGAATAGCCACCACCTAAACCGGCACCTACCGGTATCACTTTATGTACATG
+>895:1:1:1352:5369 2:N:0:NNNNN
+ATCAAACCCCTCAATCCGCACCGCGGGCTTGCTGCCAACATTGGAAAGATGGCATCCTCCTTTCCTGGGGGATGGGGAG
+>895:1:1:1353:6642 1:N:0:NNNNN
+TGATTATGTTGCGAATGGGTCTGACGCTGCTGG
+>895:1:1:1355:13535 1:N:0:NNNNN
+AAGCGCGATGATCGCAACCGTGCTTTATTCCGACCTTTCCGGTGGCCGG
+>895:1:1:1355:13535 2:N:0:NNNNN
+CACTTTCACCATGAGCGCGTTCTTGCGTCGACAAATGGAGCAGTCGCAGGTGGTCAGTTCCGGGAAATCCGTATCGATCTCGAAGGTGACCGCGCCGCAG
+>895:1:1:1357:19736 1:N:0:NNNNN
+AAAAGCATTACCCAACCGAACACACCGGCTGCAAATAGCCCAACCGCAGCGAGGCCGTTTAATCGTTCATTCCGAC
+>895:1:1:1357:19736 2:N:0:NNNNN
+TTGGCCGCTTTTTAATTTGTTCGGATCGGTTTGCCAAAACGGGGATATTTGTCAAGCGGGAAACTTAGGAAAAATTTCTTAAGACTCATGCCTCCGTGTC
+>895:1:1:1357:4413 2:N:0:NNNNN
+CCCCGCCGATGACCGACAGGTTCTGTCCAGGTGCT
+>895:1:1:1358:4953 1:N:0:NNNNN
+AGAAGTTCGACCGCATGGTGAGCCGCTATGACCGCCAGGCAGATGGCTCGCTCAAAGAAGAACCGCGGG
+>895:1:1:1362:3983 2:N:0:NNNNN
+TCGATATCGCCATCTTTTAAACAGGCGATCGGCACAACTTTGAAACCCGCCATAACCGCGCTCGC
+>895:1:1:1363:9988 2:N:0:NNNNN
+TCTCGAAGTACCAACCCATCGAGCCGTAGGTGTGGCCGGGCAGGCGCAGGGCCTTGATCTTCATCCCC
+>895:1:1:1363:11839 1:N:0:NNNNN
+ACAAGTACACCCTGCGCACCGCGCGCGGGATGACGGTGGCGATC
+>895:1:1:1365:7390 2:N:0:NNNNN
+GCCCCAGCCCTTGAAGTCCGTGTGCCAGTGCGTGCGCGTCCAGATCGGCGGACC
+>895:1:1:1365:12527 2:N:0:NNNNN
+TCGTTGTCGTAGTCGCCCCAGATGCCCGGGCCGCCCACCTTGGCGACGCTCGTAAAGCGCTTGCCCATCTCGTT
+>895:1:1:1368:4434 2:N:0:NNNNN
+CCAAGGAAGAATTTAGAGGAGTTACGAGTCATTCTTCCTCCGGCGCCTTCTGCAACAGCTCGTGCAACAGCAACCTTGCTTTGCTCCAGTCCC
+>895:1:1:1373:4848 1:N:0:NNNNN
+CGGTTGCGACGAGCGAGTCGGAGCCGACACCGTCGAGGATCGTC
+>895:1:1:1373:13994 1:N:0:NNNNN
+GATTCGACATTGTTGACGGCGGTCGGGCAGCCGAACAGGCCGACATTGGCCGG
+>895:1:1:1373:13994 2:N:0:NNNNN
+AGGCTGCCGTCGATCAGGCTTATGAGGCCAAGCTGATCGGCAAGGACAATATCAACGGCTGGCCGTTCGAC
+>895:1:1:1376:16513 2:N:0:NNNNN
+CGAACGATTTATCGACCACGACGGCACGCGTTGGTACCACACCGGCGATCGCGTGCGCCG
+>895:1:1:1378:18986 1:N:0:NNNNN
+GACCAGCGGAACAACGGCAAAACTGAGCATCAAACTCAGGATCACCTGGCTAAGGATCAATAATTGAGCGGTTCCGCTTTCTCCCATCATGGCCG
+>895:1:1:1378:18986 2:N:0:NNNNN
+GCGGTCGCTCTCCTGGCCTCGGGTCAGAATTCCTCTCTGACCGGAACACTTGCCGGGCAG
+>895:1:1:1381:7062 2:N:0:NNNNN
+GCCGGCTCCCCACCGACGACAGCACGTACCCCGGCG
+>895:1:1:1381:4958 1:N:0:NNNNN
+ATCGAGGTGCACACCGCGAGCTTCCGCACGACGCGCGG
+>895:1:1:1381:4958 2:N:0:NNNNN
+AAGTGCTCGCGATAGGCCTCCCACAAGACGCCGCGGCGCGCGTAGGGCGATGAGATCCCGAGCAGCAGCGCG
+>895:1:1:1382:5012 2:N:0:NNNNN
+CCGATGTCGACGTCGCCGGTGTCGGCGGGCAGCC
+>895:1:1:1383:3089 2:N:0:NNNNN
+GGTCCGGTCTAAATCTTGTCCGGAGCCCAGATGATGAAATTGTCCCGGTTCGGCATCTTCACTTGCGGCA
+>895:1:1:1383:14728 1:N:0:NNNNN
+TGATCAACTTTGCTCTCCAGCCCGACCAGCAG
+>895:1:1:1383:6776 1:N:0:NNNNN
+CAATACGAACAAGTTCGTTGTTGGAGATACCGCGGAAGTCTACGACACGACTTCGCTCAACGTCCGCG
+>895:1:1:1384:20217 1:N:0:NNNNN
+CAGTGGCGACGACAAAAGCAAAGGGCCACGAGTTGTACGCCTGTTTGTTTTTGTCTCGGAATCCGGTGTGCATGATGTGTGT
+>895:1:1:1384:20217 2:N:0:NNNNN
+CCACATTCGTCCCAGTGAGAGACAAACCAAAAACCAAACGAACCTTTTGAGCCAGTTTGTGCCG
+>895:1:1:1386:7536 1:N:0:NNNNN
+CAGGCGGCGTAAGCCCGGCGTCGCGGTCACTGCGACGGCGCCGACGACGAGCGTGAGGGCGGCGTCGAGCGGC
+>895:1:1:1386:14753 1:N:0:NNNNN
+CAAGCCCATGCTCTACGCCGGCGGCTCATTCGTGCTCATGCCCGGCTACG
+>895:1:1:1388:11093 2:N:0:NNNNN
+AACCATGAGCAACCGGTTCGAGTGCGAGATCAGCAAAGTCGAAAA
diff --git a/tests/test_counting_hash.py b/tests/test_counting_hash.py
index 5e08aba..2e21119 100644
--- a/tests/test_counting_hash.py
+++ b/tests/test_counting_hash.py
@@ -19,12 +19,12 @@ import screed
import nose
from nose.plugins.attrib import attr
+from nose.tools import assert_raises
+
MAX_COUNT = 255
MAX_BIGCOUNT = 65535
-#
-
# from http://www.rsok.com/~jrm/printprimes.html
PRIMES_1m = [1000003, 1009837]
PRIMES_100m = [100009979, 100000007]
@@ -574,7 +574,7 @@ def test_save_load_large():
inpath = utils.get_test_data('random-20-a.fa')
savepath = utils.get_temp_filename(ctfile)
- sizes = khmer.get_n_primes_near_x(1, 2**31 + 1000)
+ sizes = khmer.get_n_primes_near_x(1, 2 ** 31 + 1000)
orig = khmer._CountingHash(12, sizes)
orig.consume_fasta(inpath)
@@ -605,8 +605,8 @@ def test_save_load():
ht = khmer._CountingHash(12, sizes)
try:
ht.load(savepath)
- except IOError as err:
- assert 0, 'Should not produce an IOError: ' + str(err)
+ except OSError as err:
+ assert 0, 'Should not produce an OSError: ' + str(err)
tracking = khmer._Hashbits(12, sizes)
x = hi.abundance_distribution(inpath, tracking)
@@ -638,7 +638,7 @@ def test_load_truncated():
try:
ht = khmer.load_counting_hash(truncpath)
assert 0, "this should not be reached!"
- except IOError as err:
+ except OSError as err:
print(str(err))
@@ -667,8 +667,8 @@ def test_load_gz():
ht = khmer._CountingHash(12, sizes)
try:
ht.load(loadpath)
- except IOError as err:
- assert 0, "Should not produce an IOError: " + str(err)
+ except OSError as err:
+ assert 0, "Should not produce an OSError: " + str(err)
tracking = khmer._Hashbits(12, sizes)
x = hi.abundance_distribution(inpath, tracking)
@@ -694,8 +694,8 @@ def test_save_load_gz():
ht = khmer._CountingHash(12, sizes)
try:
ht.load(savepath)
- except IOError as err:
- assert 0, 'Should not produce an IOError: ' + str(err)
+ except OSError as err:
+ assert 0, 'Should not produce an OSError: ' + str(err)
tracking = khmer._Hashbits(12, sizes)
x = hi.abundance_distribution(inpath, tracking)
@@ -707,6 +707,17 @@ def test_save_load_gz():
assert x == y, (x, y)
+def test_load_empty_files():
+ def do_load_ct(fname):
+ with assert_raises(OSError):
+ ct = khmer.load_counting_hash(fname)
+
+ # Check empty files, compressed or not
+ for ext in ['', '.gz']:
+ fn = utils.get_test_data('empty-file' + ext)
+ do_load_ct(fn)
+
+
def test_trim_full():
hi = khmer.CountingHash(6, 1e6, 2)
@@ -866,8 +877,8 @@ def test_maxcount_with_bigcount_save():
kh = khmer.CountingHash(1, 1, 1)
try:
kh.load(savepath)
- except IOError as err:
- assert 0, "Should not produce an IOError: " + str(err)
+ except OSError as err:
+ assert 0, "Should not produce an OSError: " + str(err)
c = kh.get('AAAA')
assert c == 1000, "should be able to count to 1000: %d" % c
@@ -885,8 +896,8 @@ def test_bigcount_save():
kh = khmer.CountingHash(1, 1, 1)
try:
kh.load(savepath)
- except IOError as err:
- assert 0, "Should not produce an IOError: " + str(err)
+ except OSError as err:
+ assert 0, "Should not produce an OSError: " + str(err)
# set_use_bigcount should still be True after load (i.e. should be saved)
@@ -909,8 +920,8 @@ def test_nobigcount_save():
kh = khmer.CountingHash(1, 1, 1)
try:
kh.load(savepath)
- except IOError as err:
- assert 0, 'Should not produce an IOError: ' + str(err)
+ except OSError as err:
+ assert 0, 'Should not produce an OSError: ' + str(err)
# set_use_bigcount should still be False after load (i.e. should be saved)
@@ -974,7 +985,11 @@ def test_get_ksize():
def test_get_hashsizes():
kh = khmer.CountingHash(22, 100, 4)
- assert kh.hashsizes() == [97L, 89L, 83L, 79L], kh.hashsizes()
+ # Py2/3 hack, longify converts to long in py2, remove once py2 isn't
+ # supported any longer.
+ expected = utils.longify([97, 89, 83, 79])
+ assert kh.hashsizes() == expected, kh.hashsizes()
+
# def test_collect_high_abundance_kmers():
# seqpath = utils.get_test_data('test-abund-read-2.fa')
@@ -983,9 +998,6 @@ def test_get_hashsizes():
# hb = kh.collect_high_abundance_kmers(seqpath, 2, 4)
-#
-
-
def test_load_notexist_should_fail():
savepath = utils.get_temp_filename('tempcountingsave0.ht')
@@ -993,7 +1005,7 @@ def test_load_notexist_should_fail():
try:
hi.load(savepath)
assert 0, "load should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
@@ -1017,7 +1029,7 @@ def test_load_truncated_should_fail():
try:
hi.load(savepath)
assert 0, "load should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
@@ -1028,7 +1040,7 @@ def test_load_gz_notexist_should_fail():
try:
hi.load(savepath)
assert 0, "load should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
@@ -1052,7 +1064,7 @@ def test_load_gz_truncated_should_fail():
try:
hi.load(savepath)
assert 0, "load should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
@@ -1064,7 +1076,7 @@ def test_counting_file_version_check():
try:
ht.load(inpath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
@@ -1076,7 +1088,7 @@ def test_counting_gz_file_version_check():
try:
ht.load(inpath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
@@ -1088,7 +1100,7 @@ def test_counting_file_type_check():
try:
kh.load(inpath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
@@ -1102,7 +1114,7 @@ def test_counting_gz_file_type_check():
try:
kh.load(inpath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
@@ -1130,7 +1142,7 @@ def test_consume_absentfasta():
try:
countingtable.consume_fasta("absent_file.fa")
assert 0, "This should fail"
- except IOError as err:
+ except OSError as err:
print(str(err))
@@ -1145,7 +1157,7 @@ def test_consume_absentfasta_with_reads_parser():
readparser = ReadParser(utils.get_test_data('empty-file'))
countingtable.consume_fasta_with_reads_parser(readparser)
assert 0, "this should fail"
- except IOError as err:
+ except OSError as err:
print(str(err))
except ValueError as err:
print(str(err))
@@ -1416,8 +1428,8 @@ def test_abund_dist_gz_bigcount():
# load the compressed bigcount table
try:
counting_hash = khmer.load_counting_hash(outfile)
- except IOError as err:
- assert 0, 'Should not produce IOError: ' + str(err)
+ except OSError as err:
+ assert 0, 'Should not produce OSError: ' + str(err)
hashsizes = counting_hash.hashsizes()
kmer_size = counting_hash.ksize()
tracking = khmer._Hashbits(kmer_size, hashsizes)
diff --git a/tests/test_counting_single.py b/tests/test_counting_single.py
index a76e63c..b7256c5 100644
--- a/tests/test_counting_single.py
+++ b/tests/test_counting_single.py
@@ -69,7 +69,7 @@ def test_hashtable_n_entries():
def test_complete_no_collision():
- kh = khmer._CountingHash(4, [4**4])
+ kh = khmer._CountingHash(4, [4 ** 4])
for i in range(0, kh.n_entries()):
s = khmer.reverse_hash(i, 4)
@@ -318,7 +318,7 @@ def test_very_short_read():
class Test_ConsumeString(object):
def setup(self):
- self.kh = khmer._CountingHash(4, [4**4])
+ self.kh = khmer._CountingHash(4, [4 ** 4])
def test_n_occupied(self):
assert self.kh.n_occupied() == 0
diff --git a/tests/test_filter.py b/tests/test_filter.py
index 2ff9091..3af64ff 100644
--- a/tests/test_filter.py
+++ b/tests/test_filter.py
@@ -41,7 +41,7 @@ class Test_Filter(object):
try:
ht.consume_fasta("nonexistent")
assert 0, "should fail"
- except IOError as err:
+ except OSError as err:
print(str(err))
ht.output_fasta_kmer_pos_freq(filename, outname)
try:
diff --git a/tests/test_functions.py b/tests/test_functions.py
index 32f3f99..bcc6739 100644
--- a/tests/test_functions.py
+++ b/tests/test_functions.py
@@ -14,7 +14,10 @@ from . import khmer_tst_utils as utils
from khmer.utils import (check_is_pair, broken_paired_reader, check_is_left,
check_is_right)
from khmer.kfile import check_input_files
-from cStringIO import StringIO
+try:
+ from StringIO import StringIO
+except ImportError:
+ from io import StringIO
def test_forward_hash():
@@ -83,9 +86,7 @@ def test_get_primes_fal():
try:
primes = khmer.get_n_primes_near_x(5, 5)
assert 0, "previous statement should fail"
- except AssertionError:
- raise
- except Exception as err:
+ except RuntimeError as err:
assert "unable to find 5 prime numbers < 5" in str(err)
diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index 12a4f09..47b5928 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -1,12 +1,12 @@
from __future__ import print_function
from __future__ import absolute_import
#
-# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2015. It is licensed under
+# This file is part of khmer, htabletps://github.com/dib-lab/khmer/, and is
+# Copyrightable (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
-# pylint: disable=missing-docstring,protected-access
+# pylint: disable=missing-docstring,protected-access,no-member,
import khmer
from khmer import ReadParser
@@ -31,79 +31,79 @@ def test_toobig():
def test__get_set_tag_density():
- ht = khmer._Hashbits(32, [1])
+ htableable = khmer._Hashbits(32, [1])
- orig = ht._get_tag_density()
+ orig = htableable._get_tag_density()
assert orig != 2
- ht._set_tag_density(2)
- assert ht._get_tag_density() == 2
+ htableable._set_tag_density(2)
+ assert htableable._get_tag_density() == 2
def test_update_from():
- ht = khmer.Hashbits(5, 1000, 4)
- ht2 = khmer.Hashbits(5, 1000, 4)
+ htableable = khmer.Hashbits(5, 1000, 4)
+ other_htableable = khmer.Hashbits(5, 1000, 4)
- assert ht.get('AAAAA') == 0
- assert ht.get('GCGCG') == 0
- assert ht2.get('AAAAA') == 0
- assert ht2.get('GCGCG') == 0
+ assert htableable.get('AAAAA') == 0
+ assert htableable.get('GCGCG') == 0
+ assert other_htableable.get('AAAAA') == 0
+ assert other_htableable.get('GCGCG') == 0
- ht2.count('AAAAA')
+ other_htableable.count('AAAAA')
- assert ht.get('AAAAA') == 0
- assert ht.get('GCGCG') == 0
- assert ht2.get('AAAAA') == 1
- assert ht2.get('GCGCG') == 0
+ assert htableable.get('AAAAA') == 0
+ assert htableable.get('GCGCG') == 0
+ assert other_htableable.get('AAAAA') == 1
+ assert other_htableable.get('GCGCG') == 0
- ht.count('GCGCG')
+ htableable.count('GCGCG')
- assert ht.get('AAAAA') == 0
- assert ht.get('GCGCG') == 1
- assert ht2.get('AAAAA') == 1
- assert ht2.get('GCGCG') == 0
+ assert htableable.get('AAAAA') == 0
+ assert htableable.get('GCGCG') == 1
+ assert other_htableable.get('AAAAA') == 1
+ assert other_htableable.get('GCGCG') == 0
- ht.update(ht2)
+ htableable.update(other_htableable)
- assert ht.get('AAAAA') == 1
- assert ht.get('GCGCG') == 1
- assert ht2.get('AAAAA') == 1
- assert ht2.get('GCGCG') == 0
+ assert htableable.get('AAAAA') == 1
+ assert htableable.get('GCGCG') == 1
+ assert other_htableable.get('AAAAA') == 1
+ assert other_htableable.get('GCGCG') == 0
def test_update_from_diff_ksize_2():
- ht = khmer.Hashbits(5, 1000, 4)
- ht2 = khmer.Hashbits(4, 1000, 4)
+ htableable = khmer.Hashbits(5, 1000, 4)
+ other_htableable = khmer.Hashbits(4, 1000, 4)
try:
- ht.update(ht2)
+ htableable.update(other_htableable)
assert 0, "should not be reached"
except ValueError as err:
print(str(err))
try:
- ht2.update(ht)
+ other_htableable.update(htableable)
assert 0, "should not be reached"
except ValueError as err:
print(str(err))
def test_update_from_diff_tablesize():
- ht = khmer.Hashbits(5, 100, 4)
- ht2 = khmer.Hashbits(5, 1000, 4)
+ htableable = khmer.Hashbits(5, 100, 4)
+ other_htableable = khmer.Hashbits(5, 1000, 4)
try:
- ht.update(ht2)
+ htableable.update(other_htableable)
assert 0, "should not be reached"
except ValueError as err:
print(str(err))
def test_update_from_diff_num_tables():
- ht = khmer.Hashbits(5, 1000, 3)
- ht2 = khmer.Hashbits(5, 1000, 4)
+ htableable = khmer.Hashbits(5, 1000, 3)
+ other_htableable = khmer.Hashbits(5, 1000, 4)
try:
- ht.update(ht2)
+ htableable.update(other_htableable)
assert 0, "should not be reached"
except ValueError as err:
print(str(err))
@@ -112,45 +112,45 @@ def test_update_from_diff_num_tables():
def test_n_occupied_1():
filename = utils.get_test_data('random-20-a.fa')
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 1 # number of hashtables
+ ksize = 20 # size of kmer
+ htable_size = 100000 # size of hashtableable
+ num_htableables = 1 # number of hashtableables
# test modified c++ n_occupied code
- ht1 = khmer.Hashbits(K, HT_SIZE, N_HT)
+ htableable = khmer.Hashbits(ksize, htable_size, num_htableables)
- for n, record in enumerate(fasta_iter(open(filename))):
- ht1.consume(record['sequence'])
+ for _, record in enumerate(fasta_iter(open(filename))):
+ htableable.consume(record['sequence'])
# this number calculated independently
- assert ht1.n_occupied() == 3884, ht1.n_occupied()
+ assert htableable.n_occupied() == 3884, htableable.n_occupied()
def test_bloom_python_1():
# test python code to count unique kmers using bloom filter
filename = utils.get_test_data('random-20-a.fa')
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
+ ksize = 20 # size of kmer
+ htable_size = 100000 # size of hashtable
+ num_htableables = 3 # number of hashtables
- ht2 = khmer.Hashbits(K, HT_SIZE, N_HT)
+ htableable = khmer.Hashbits(ksize, htable_size, num_htableables)
n_unique = 0
- for n, record in enumerate(fasta_iter(open(filename))):
+ for _, record in enumerate(fasta_iter(open(filename))):
sequence = record['sequence']
seq_len = len(sequence)
- for n in range(0, seq_len + 1 - K):
- kmer = sequence[n:n + K]
- if (not ht2.get(kmer)):
+ for n in range(0, seq_len + 1 - ksize):
+ kmer = sequence[n:n + ksize]
+ if not htableable.get(kmer):
n_unique += 1
- ht2.count(kmer)
+ htableable.count(kmer)
assert n_unique == 3960
- assert ht2.n_occupied() == 3885, ht2.n_occupied()
+ assert htableable.n_occupied() == 3885, htableable.n_occupied()
# this number equals n_unique
- assert ht2.n_unique_kmers() == 3960, ht2.n_unique_kmers()
+ assert htableable.n_unique_kmers() == 3960, htableable.n_unique_kmers()
def test_bloom_c_1():
@@ -158,74 +158,76 @@ def test_bloom_c_1():
filename = utils.get_test_data('random-20-a.fa')
- K = 20 # size of kmer
- HT_SIZE = 100000 # size of hashtable
- N_HT = 3 # number of hashtables
+ ksize = 20 # size of kmer
+ htable_size = 100000 # size of hashtable
+ num_htableables = 3 # number of hashtables
- ht3 = khmer.Hashbits(K, HT_SIZE, N_HT)
+ htableable = khmer.Hashbits(ksize, htable_size, num_htableables)
- for n, record in enumerate(fasta_iter(open(filename))):
- ht3.consume(record['sequence'])
+ for _, record in enumerate(fasta_iter(open(filename))):
+ htableable.consume(record['sequence'])
- assert ht3.n_occupied() == 3885
- assert ht3.n_unique_kmers() == 3960
+ assert htableable.n_occupied() == 3885
+ assert htableable.n_unique_kmers() == 3960
def test_n_occupied_2(): # simple one
- K = 4
- HT_SIZE = 10 # use 11
- N_HT = 1
+ ksize = 4
+ htable_size = 10 # use 11
+ num_htableables = 1
- ht1 = khmer._Hashbits(K, [11])
- ht1.count('AAAA') # 00 00 00 00 = 0
- assert ht1.n_occupied() == 1
+ htableable = khmer._Hashbits(ksize, [11])
+ htableable.count('AAAA') # 00 00 00 00 = 0
+ assert htableable.n_occupied() == 1
- ht1.count('ACTG') # 00 10 01 11 =
- assert ht1.n_occupied() == 2
+ htableable.count('ACTG') # 00 10 01 11 =
+ assert htableable.n_occupied() == 2
- ht1.count('AACG') # 00 00 10 11 = 11 # collision 1
+ htableable.count('AACG') # 00 00 10 11 = 11 # collision 1
- assert ht1.n_occupied() == 2
- ht1.count('AGAC') # 00 11 00 10 # collision 2
- assert ht1.n_occupied() == 2, ht1.n_occupied()
+ assert htableable.n_occupied() == 2
+ htableable.count('AGAC') # 00 11 00 10 # collision 2
+ assert htableable.n_occupied() == 2, htableable.n_occupied()
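
The collision comments above can be checked by hand: they imply two-bit base codes of A=00, T=01, C=10, G=11 (most significant base first), with the single table of size 11 bucketing by value mod 11. A sketch of that arithmetic, ignoring khmer's reverse-complement canonicalization (which does not change these particular values):

    CODE = {'A': 0, 'T': 1, 'C': 2, 'G': 3}

    def forward_hash(kmer):
        h = 0
        for base in kmer:
            h = (h << 2) | CODE[base]
        return h

    assert forward_hash('AAAA') == 0
    assert forward_hash('ACTG') == 39  # 2*16 + 4 + 3
    assert forward_hash('AACG') == 11  # 11 % 11 == 0 % 11: collides with 'AAAA'
    assert forward_hash('AGAC') == 50  # 3*16 + 2; 50 % 11 == 39 % 11: collides with 'ACTG'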
def test_bloom_c_2(): # simple one
- K = 4
-
- # use only 1 hashtable, no bloom filter
- ht1 = khmer._Hashbits(K, [11])
- ht1.count('AAAA') # 00 00 00 00 = 0
- ht1.count('ACTG') # 00 10 01 11 =
- assert ht1.n_unique_kmers() == 2
- ht1.count('AACG') # 00 00 10 11 = 11 # collision with 1st kmer
- assert ht1.n_unique_kmers() == 2
- ht1.count('AGAC') # 00 11 00 10 # collision with 2nd kmer
- assert ht1.n_unique_kmers() == 2
-
- # use two hashtables with 11,13
- ht2 = khmer._Hashbits(K, [11, 13])
- ht2.count('AAAA') # 00 00 00 00 = 0
-
- ht2.count('ACTG') # 00 10 01 11 = 2*16 +4 +3 = 39
- assert ht2.n_unique_kmers() == 2
- ht2.count('AACG') # 00 00 10 11 = 11 # collision with only 1st kmer
- assert ht2.n_unique_kmers() == 3
- ht2.count('AGAC') # 00 11 00 10 3*16 +2 = 50
+ ksize = 4
+
+ # use only 1 hashtable, no bloom filter
+ htableable = khmer._Hashbits(ksize, [11])
+ htableable.count('AAAA') # 00 00 00 00 = 0
+ htableable.count('ACTG') # 00 10 01 11 =
+ assert htableable.n_unique_kmers() == 2
+ htableable.count('AACG') # 00 00 10 11 = 11 # collision with 1st kmer
+ assert htableable.n_unique_kmers() == 2
+ htableable.count('AGAC') # 00 11 00 10 # collision with 2nd kmer
+ assert htableable.n_unique_kmers() == 2
+
+ # use two hashtables with 11,13
+ other_htableable = khmer._Hashbits(ksize, [11, 13])
+ other_htableable.count('AAAA') # 00 00 00 00 = 0
+
+ other_htableable.count('ACTG') # 00 10 01 11 = 2*16 +4 +3 = 39
+ assert other_htableable.n_unique_kmers() == 2
+ # 00 00 10 11 = 11 # collision with only 1st kmer
+ other_htableable.count('AACG')
+ assert other_htableable.n_unique_kmers() == 3
+ other_htableable.count('AGAC')
+ # 00 11 00 10 3*16 +2 = 50
# collision with both 2nd and 3rd kmers
- assert ht2.n_unique_kmers() == 3
+ assert other_htableable.n_unique_kmers() == 3
def test_filter_if_present():
- ht = khmer._Hashbits(32, [3, 5])
+ htable = khmer._Hashbits(32, [3, 5])
maskfile = utils.get_test_data('filter-test-A.fa')
inputfile = utils.get_test_data('filter-test-B.fa')
outfile = utils.get_temp_filename('filter')
- ht.consume_fasta(maskfile)
- ht.filter_if_present(inputfile, outfile)
+ htable.consume_fasta(maskfile)
+ htable.filter_if_present(inputfile, outfile)
records = list(fasta_iter(open(outfile)))
assert len(records) == 1
@@ -234,95 +236,95 @@ def test_filter_if_present():
def test_combine_pe():
inpfile = utils.get_test_data('combine_parts_1.fa')
- ht = khmer._Hashbits(32, [1])
+ htable = khmer._Hashbits(32, [1])
- ht.consume_partitioned_fasta(inpfile)
- assert ht.count_partitions() == (2, 0)
+ htable.consume_partitioned_fasta(inpfile)
+ assert htable.count_partitions() == (2, 0)
- s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
- pid1 = ht.get_partition_id(s1)
+ first_seq = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
+ pid1 = htable.get_partition_id(first_seq)
- s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
- pid2 = ht.get_partition_id(s2)
+ second_seq = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
+ pid2 = htable.get_partition_id(second_seq)
assert pid1 == 2
assert pid2 == 80293
- ht.join_partitions(pid1, pid2)
+ htable.join_partitions(pid1, pid2)
- pid1 = ht.get_partition_id(s1)
- pid2 = ht.get_partition_id(s2)
+ pid1 = htable.get_partition_id(first_seq)
+ pid2 = htable.get_partition_id(second_seq)
assert pid1 == pid2
- assert ht.count_partitions() == (1, 0)
+ assert htable.count_partitions() == (1, 0)
def test_load_partitioned():
inpfile = utils.get_test_data('combine_parts_1.fa')
- ht = khmer._Hashbits(32, [1])
+ htable = khmer._Hashbits(32, [1])
- ht.consume_partitioned_fasta(inpfile)
- assert ht.count_partitions() == (2, 0)
+ htable.consume_partitioned_fasta(inpfile)
+ assert htable.count_partitions() == (2, 0)
- s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
- assert ht.get(s1)
+ first_seq = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
+ assert htable.get(first_seq)
- s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
- assert ht.get(s2)
+ second_seq = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
+ assert htable.get(second_seq)
- s3 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]
- assert ht.get(s3)
+ third_s = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]
+ assert htable.get(third_s)
def test_count_within_radius_simple():
inpfile = utils.get_test_data('all-A.fa')
- ht = khmer._Hashbits(4, [3, 5])
+ htable = khmer._Hashbits(4, [3, 5])
- print(ht.consume_fasta(inpfile))
- n = ht.count_kmers_within_radius('AAAA', 1)
+ print(htable.consume_fasta(inpfile))
+ n = htable.count_kmers_within_radius('AAAA', 1)
assert n == 1
- n = ht.count_kmers_within_radius('AAAA', 10)
+ n = htable.count_kmers_within_radius('AAAA', 10)
assert n == 1
def test_count_within_radius_big():
inpfile = utils.get_test_data('random-20-a.fa')
- ht = khmer.Hashbits(20, 1e5, 4)
+ htable = khmer.Hashbits(20, 1e5, 4)
- ht.consume_fasta(inpfile)
- n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
+ htable.consume_fasta(inpfile)
+ n = htable.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
assert n == 3961, n
- ht = khmer.Hashbits(21, 1e5, 4)
- ht.consume_fasta(inpfile)
- n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
+ htable = khmer.Hashbits(21, 1e5, 4)
+ htable.consume_fasta(inpfile)
+ n = htable.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
assert n == 39
def test_count_kmer_degree():
inpfile = utils.get_test_data('all-A.fa')
- ht = khmer._Hashbits(4, [3, 5])
- ht.consume_fasta(inpfile)
+ htable = khmer._Hashbits(4, [3, 5])
+ htable.consume_fasta(inpfile)
- assert ht.kmer_degree('AAAA') == 2
- assert ht.kmer_degree('AAAT') == 1
- assert ht.kmer_degree('AATA') == 0
- assert ht.kmer_degree('TAAA') == 1
+ assert htable.kmer_degree('AAAA') == 2
+ assert htable.kmer_degree('AAAT') == 1
+ assert htable.kmer_degree('AATA') == 0
+ assert htable.kmer_degree('TAAA') == 1
def test_save_load_tagset():
- ht = khmer._Hashbits(32, [1])
+ htable = khmer._Hashbits(32, [1])
outfile = utils.get_temp_filename('tagset')
- ht.add_tag('A' * 32)
- ht.save_tagset(outfile)
+ htable.add_tag('A' * 32)
+ htable.save_tagset(outfile)
- ht.add_tag('G' * 32)
+ htable.add_tag('G' * 32)
- ht.load_tagset(outfile) # implicitly => clear_tags=True
- ht.save_tagset(outfile)
+ htable.load_tagset(outfile) # implicitly => clear_tags=True
+ htable.save_tagset(outfile)
# if tags have been cleared, then the new tagfile will be larger (34 bytes)
# else smaller (26 bytes).
@@ -334,17 +336,17 @@ def test_save_load_tagset():
def test_save_load_tagset_noclear():
- ht = khmer._Hashbits(32, [1])
+ htable = khmer._Hashbits(32, [1])
outfile = utils.get_temp_filename('tagset')
- ht.add_tag('A' * 32)
- ht.save_tagset(outfile)
+ htable.add_tag('A' * 32)
+ htable.save_tagset(outfile)
- ht.add_tag('G' * 32)
+ htable.add_tag('G' * 32)
- ht.load_tagset(outfile, False) # set clear_tags => False; zero tags
- ht.save_tagset(outfile)
+ htable.load_tagset(outfile, False) # set clear_tags => False; zero tags
+ htable.save_tagset(outfile)
# if tags have been cleared, then the new tagfile will be large (34 bytes);
# else small (26 bytes).
@@ -358,88 +360,89 @@ def test_save_load_tagset_noclear():
def test_stop_traverse():
filename = utils.get_test_data('random-20-a.fa')
- K = 20 # size of kmer
- HT_SIZE = 1e4 # size of hashtable
- N_HT = 3 # number of hashtables
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_htableables = 3 # number of hashtables
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+ htable = khmer.Hashbits(ksize, htable_size, num_htableables)
# without tagging/joining across consume, this breaks into two partitions;
# with, it is one partition.
- ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
+ htable.add_stop_tag('TTGCATACGTTGAGCCAGCG')
- ht.consume_fasta_and_tag(filename) # DO NOT join reads across stoptags
- subset = ht.do_subset_partition(0, 0, True)
- ht.merge_subset(subset)
+ # DO NOT join reads across stoptags
+ htable.consume_fasta_and_tag(filename)
+ subset = htable.do_subset_partition(0, 0, True)
+ htable.merge_subset(subset)
- n, _ = ht.count_partitions()
+ n, _ = htable.count_partitions()
assert n == 2, n
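
Stop tags are the mechanism under test here: a tagged k-mer halts graph traversal, so a read set that would otherwise form one partition splits in two when a bridging 20-mer is tagged before consumption. A sketch of the pattern, using only calls that appear above (the input path is hypothetical):

    htable = khmer.Hashbits(20, 1e4, 3)
    htable.add_stop_tag('TTGCATACGTTGAGCCAGCG')   # bridging k-mer
    htable.consume_fasta_and_tag('reads.fa')      # hypothetical input file
    subset = htable.do_subset_partition(0, 0, True)
    htable.merge_subset(subset)
    n_partitions, _ = htable.count_partitions()   # 2: traversal stops at the tag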
def test_tag_across_stoptraverse():
filename = utils.get_test_data('random-20-a.fa')
- K = 20 # size of kmer
- HT_SIZE = 1e4 # size of hashtable
- N_HT = 3 # number of hashtables
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_htableables = 3 # number of hashtables
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+ htable = khmer.Hashbits(ksize, htable_size, num_htableables)
# without tagging/joining across consume, this breaks into two partitions;
# with, it is one partition.
- ht.add_stop_tag('CCGAATATATAACAGCGACG')
+ htable.add_stop_tag('CCGAATATATAACAGCGACG')
- ht.consume_fasta_and_tag_with_stoptags(filename) # DO join reads across
-
- subset = ht.do_subset_partition(0, 0)
- n, _ = ht.count_partitions()
+ # DO join reads across
+ htable.consume_fasta_and_tag_with_stoptags(filename)
+ subset = htable.do_subset_partition(0, 0)
+ n, _ = htable.count_partitions()
assert n == 99 # reads only connected by traversal...
- n, _ = ht.subset_count_partitions(subset)
+ n, _ = htable.subset_count_partitions(subset)
assert n == 2 # but need main to cross stoptags.
- ht.merge_subset(subset)
+ htable.merge_subset(subset)
- n, _ = ht.count_partitions() # ta-da!
+ n, _ = htable.count_partitions() # ta-da!
assert n == 1, n
def test_notag_across_stoptraverse():
filename = utils.get_test_data('random-20-a.fa')
- K = 20 # size of kmer
- HT_SIZE = 1e4 # size of hashtable
- N_HT = 3 # number of hashtables
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_htableables = 3 # number of hashtables
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+ htable = khmer.Hashbits(ksize, htable_size, num_htableables)
# connecting k-mer at the beginning/end of a read: breaks up into two.
- ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
+ htable.add_stop_tag('TTGCATACGTTGAGCCAGCG')
- ht.consume_fasta_and_tag_with_stoptags(filename)
+ htable.consume_fasta_and_tag_with_stoptags(filename)
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
+ subset = htable.do_subset_partition(0, 0)
+ htable.merge_subset(subset)
- n, _ = ht.count_partitions()
+ n, _ = htable.count_partitions()
assert n == 2, n
def test_find_stoptags():
- ht = khmer._Hashbits(5, [1])
- ht.add_stop_tag("AAAAA")
+ htable = khmer._Hashbits(5, [1])
+ htable.add_stop_tag("AAAAA")
- assert ht.identify_stoptags_by_position("AAAAA") == [0]
- assert ht.identify_stoptags_by_position("AAAAAA") == [0, 1]
- assert ht.identify_stoptags_by_position("TTTTT") == [0]
- assert ht.identify_stoptags_by_position("TTTTTT") == [0, 1]
+ assert htable.identify_stoptags_by_position("AAAAA") == [0]
+ assert htable.identify_stoptags_by_position("AAAAAA") == [0, 1]
+ assert htable.identify_stoptags_by_position("TTTTT") == [0]
+ assert htable.identify_stoptags_by_position("TTTTTT") == [0, 1]
-def test_find_stoptags2():
- ht = khmer._Hashbits(4, [1])
- ht.add_stop_tag("ATGC")
+def test_find_stoptags_2():
+ htable = khmer._Hashbits(4, [1])
+ htable.add_stop_tag("ATGC")
- x = ht.identify_stoptags_by_position("ATGCATGCGCAT")
+ x = htable.identify_stoptags_by_position("ATGCATGCGCAT")
assert x == [0, 2, 4, 8], x
@@ -450,7 +453,10 @@ def test_get_ksize():
def test_get_hashsizes():
kh = khmer.Hashbits(22, 100, 4)
- assert kh.hashsizes() == [97L, 89L, 83L, 79L], kh.hashsizes()
+ # Py2/3 hack: longify converts ints to longs on py2; remove once
+ # py2 is no longer supported.
+ expected = utils.longify([97, 89, 83, 79])
+ assert kh.hashsizes() == expected, kh.hashsizes()
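
utils.longify exists only to bridge the Python 2 int/long split noted in the comment. A plausible sketch of such a helper (hypothetical implementation; the real one lives in tests/khmer_tst_utils.py):

    import sys

    def longify(values):
        # On py2, hashsizes() comes back as longs, so coerce the expected
        # list to match; on py3 there is only int and this is a no-op.
        if sys.version_info[0] == 2:
            return [long(v) for v in values]  # noqa: F821 -- py2 only
        return list(values)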
def test_extract_unique_paths_0():
@@ -510,21 +516,21 @@ def test_find_unpart():
filename = utils.get_test_data('random-20-a.odd.fa')
filename2 = utils.get_test_data('random-20-a.even.fa')
- K = 20 # size of kmer
- HT_SIZE = 1e4 # size of hashtable
- N_HT = 3 # number of hashtables
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_htableables = 3 # number of hashtables
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
- ht.consume_fasta_and_tag(filename)
+ htable = khmer.Hashbits(ksize, htable_size, num_htableables)
+ htable.consume_fasta_and_tag(filename)
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
+ subset = htable.do_subset_partition(0, 0)
+ htable.merge_subset(subset)
- n, _ = ht.count_partitions()
+ n, _ = htable.count_partitions()
assert n == 49
- ht.find_unpart(filename2, True, False)
- n, _ = ht.count_partitions()
+ htable.find_unpart(filename2, True, False)
+ n, _ = htable.count_partitions()
assert n == 1, n # all sequences connect
@@ -532,21 +538,21 @@ def test_find_unpart_notraverse():
filename = utils.get_test_data('random-20-a.odd.fa')
filename2 = utils.get_test_data('random-20-a.even.fa')
- K = 20 # size of kmer
- HT_SIZE = 1e4 # size of hashtable
- N_HT = 3 # number of hashtables
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_htableables = 3 # number of hashtables
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
- ht.consume_fasta_and_tag(filename)
+ htable = khmer.Hashbits(ksize, htable_size, num_htableables)
+ htable.consume_fasta_and_tag(filename)
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
+ subset = htable.do_subset_partition(0, 0)
+ htable.merge_subset(subset)
- n, _ = ht.count_partitions()
+ n, _ = htable.count_partitions()
assert n == 49
- ht.find_unpart(filename2, False, False) # <-- don't traverse
- n, _ = ht.count_partitions()
+ htable.find_unpart(filename2, False, False) # <-- don't traverse
+ n, _ = htable.count_partitions()
assert n == 99, n # all sequences disconnected
@@ -554,21 +560,21 @@ def test_find_unpart_fail():
filename = utils.get_test_data('random-20-a.odd.fa')
filename2 = utils.get_test_data('random-20-a.odd.fa') # <- switch to odd
- K = 20 # size of kmer
- HT_SIZE = 1e4 # size of hashtable
- N_HT = 3 # number of hashtables
+ ksize = 20 # size of kmer
+ htable_size = 1e4 # size of hashtable
+ num_htableables = 3 # number of hashtables
- ht = khmer.Hashbits(K, HT_SIZE, N_HT)
- ht.consume_fasta_and_tag(filename)
+ htable = khmer.Hashbits(ksize, htable_size, num_htableables)
+ htable.consume_fasta_and_tag(filename)
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
+ subset = htable.do_subset_partition(0, 0)
+ htable.merge_subset(subset)
- n, _ = ht.count_partitions()
+ n, _ = htable.count_partitions()
assert n == 49
- ht.find_unpart(filename2, True, False)
- n, _ = ht.count_partitions()
+ htable.find_unpart(filename2, True, False)
+ n, _ = htable.count_partitions()
assert n == 49, n # only 49 sequences worth of tags
@@ -617,13 +623,13 @@ def test_badget():
def test_load_notexist_should_fail():
- savepath = utils.get_temp_filename('temphashbitssave0.ht')
+ savepath = utils.get_temp_filename('temphashbitssave0.htable')
hi = khmer._CountingHash(12, [1])
try:
hi.load(savepath)
assert 0, "load should fail"
- except IOError:
+ except OSError:
pass
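
The blanket IOError-to-OSError change in this and the following hunks is a Python 3 modernization: since Python 3.3, IOError has been a plain alias of OSError, so on Python 3 the new handlers catch exactly what the old ones did.

    # Python 3.3+:
    assert IOError is OSError
    assert issubclass(FileNotFoundError, OSError)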
@@ -648,29 +654,29 @@ def test_load_truncated_should_fail():
try:
hi.load(savepath)
assert 0, "load should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_save_load_tagset_notexist():
- ht = khmer._Hashbits(32, [1])
+ htable = khmer._Hashbits(32, [1])
outfile = utils.get_temp_filename('tagset')
try:
- ht.load_tagset(outfile)
+ htable.load_tagset(outfile)
assert 0, "this test should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_save_load_tagset_trunc():
- ht = khmer._Hashbits(32, [1])
+ htable = khmer._Hashbits(32, [1])
outfile = utils.get_temp_filename('tagset')
- ht.add_tag('A' * 32)
- ht.add_tag('G' * 32)
- ht.save_tagset(outfile)
+ htable.add_tag('A' * 32)
+ htable.add_tag('G' * 32)
+ htable.save_tagset(outfile)
# truncate tagset file...
fp = open(outfile, 'rb')
@@ -684,11 +690,17 @@ def test_save_load_tagset_trunc():
# try loading it...
try:
- ht.load_tagset(outfile)
+ htable.load_tagset(outfile)
assert 0, "this test should fail"
- except IOError as err:
+ except OSError as err:
print(str(err), i)
+ # try loading it...
+ try:
+ htable.load_tagset(outfile)
+ assert 0, "this test should fail"
+ except OSError:
+ pass
# to build the test files used below, add 'test' to this function
# and then look in /tmp. You will need to tweak the version info in
@@ -701,48 +713,48 @@ def _build_testfiles():
inpath = utils.get_test_data('random-20-a.fa')
hi = khmer.Hashbits(12, 2)
hi.consume_fasta(inpath)
- hi.save('/tmp/goodversion-k12.ht')
+ hi.save('/tmp/goodversion-k12.htable')
# tagset file
- ht = khmer._Hashbits(32, [1])
+ htable = khmer._Hashbits(32, [1])
- ht.add_tag('A' * 32)
- ht.add_tag('G' * 32)
- ht.save_tagset('/tmp/goodversion-k32.tagset')
+ htable.add_tag('A' * 32)
+ htable.add_tag('G' * 32)
+ htable.save_tagset('/tmp/goodversion-k32.tagset')
# stoptags file
fakelump_fa = utils.get_test_data('fakelump.fa')
- ht = khmer.Hashbits(32, 4, 4)
- ht.consume_fasta_and_tag(fakelump_fa)
+ htable = khmer.Hashbits(32, 4, 4)
+ htable.consume_fasta_and_tag(fakelump_fa)
- subset = ht.do_subset_partition(0, 0)
- ht.merge_subset(subset)
+ subset = htable.do_subset_partition(0, 0)
+ htable.merge_subset(subset)
EXCURSION_DISTANCE = 40
- EXCURSION_KMER_THRESHOLD = 82
- EXCURSION_KMER_COUNT_THRESHOLD = 1
+ EXCURSION_KMER_THRESHOLD = 82
+ EXCURSION_KMER_COUNT_THRESHOLD = 1
counting = khmer.CountingHash(32, 4, 4)
- ht.repartition_largest_partition(None, counting,
- EXCURSION_DISTANCE,
- EXCURSION_KMER_THRESHOLD,
- EXCURSION_KMER_COUNT_THRESHOLD)
+ htable.repartition_largest_partition(None, counting,
+ EXCURSION_DISTANCE,
+ EXCURSION_KMER_THRESHOLD,
+ EXCURSION_KMER_COUNT_THRESHOLD)
- ht.save_stop_tags('/tmp/goodversion-k32.stoptags')
+ htable.save_stop_tags('/tmp/goodversion-k32.stoptags')
def test_hashbits_file_version_check():
- ht = khmer._Hashbits(12, [1])
+ htable = khmer._Hashbits(12, [1])
- inpath = utils.get_test_data('badversion-k12.ht')
+ inpath = utils.get_test_data('badversion-k12.htable')
try:
- ht.load(inpath)
+ htable.load(inpath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
@@ -751,63 +763,63 @@ def test_hashbits_file_type_check():
savepath = utils.get_temp_filename('tempcountingsave0.ct')
kh.save(savepath)
- ht = khmer._Hashbits(12, [1])
+ htable = khmer._Hashbits(12, [1])
try:
- ht.load(savepath)
+ htable.load(savepath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_stoptags_file_version_check():
- ht = khmer._Hashbits(32, [1])
+ htable = khmer._Hashbits(32, [1])
inpath = utils.get_test_data('badversion-k32.stoptags')
try:
- ht.load_stop_tags(inpath)
+ htable.load_stop_tags(inpath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_stoptags_ksize_check():
- ht = khmer._Hashbits(31, [1])
+ htable = khmer._Hashbits(31, [1])
inpath = utils.get_test_data('goodversion-k32.stoptags')
try:
- ht.load_stop_tags(inpath)
+ htable.load_stop_tags(inpath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_stop_tags_filetype_check():
- ht = khmer._Hashbits(31, [1])
+ htable = khmer._Hashbits(31, [1])
inpath = utils.get_test_data('goodversion-k32.tagset')
try:
- ht.load_stop_tags(inpath)
+ htable.load_stop_tags(inpath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_tagset_file_version_check():
- ht = khmer._Hashbits(32, [1])
+ htable = khmer._Hashbits(32, [1])
inpath = utils.get_test_data('badversion-k32.tagset')
try:
- ht.load_tagset(inpath)
+ htable.load_tagset(inpath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_stop_tags_truncate_check():
- ht = khmer._Hashbits(32, [1])
+ htable = khmer._Hashbits(32, [1])
inpath = utils.get_test_data('goodversion-k32.tagset')
data = open(inpath, 'rb').read()
@@ -819,31 +831,31 @@ def test_stop_tags_truncate_check():
fp.close()
try:
- ht.load_stop_tags(truncpath)
+ htable.load_stop_tags(truncpath)
assert 0, "expect failure of previous command"
- except IOError as e:
+ except OSError as e:
print(i, str(e))
def test_tagset_ksize_check():
- ht = khmer._Hashbits(31, [1])
+ htable = khmer._Hashbits(31, [1])
inpath = utils.get_test_data('goodversion-k32.tagset')
try:
- ht.load_tagset(inpath)
+ htable.load_tagset(inpath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_tagset_filetype_check():
- ht = khmer._Hashbits(31, [1])
+ htable = khmer._Hashbits(31, [1])
inpath = utils.get_test_data('goodversion-k32.stoptags')
try:
- ht.load_tagset(inpath)
+ htable.load_tagset(inpath)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
@@ -866,7 +878,7 @@ def test_consume_absentfasta_with_reads_parser():
readparser = ReadParser(utils.get_test_data('empty-file'))
presencetable.consume_fasta_with_reads_parser(readparser)
assert 0, "this should fail"
- except IOError as err:
+ except OSError as err:
print(str(err))
except ValueError as err:
print(str(err))
@@ -887,7 +899,7 @@ def test_consume_fasta_and_tag_with_badreads_parser():
readsparser = khmer.ReadParser(utils.get_test_data("test-empty.fa"))
presencetable.consume_fasta_and_tag_with_reads_parser(readsparser)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
except ValueError as e:
print(str(e))
diff --git a/tests/test_hll.py b/tests/test_hll.py
index ebde12d..297374a 100644
--- a/tests/test_hll.py
+++ b/tests/test_hll.py
@@ -74,7 +74,7 @@ def test_hll_consume_string():
def test_hll_empty_fasta():
filename = utils.get_test_data('test-empty.fa')
hll = khmer.HLLCounter(ERR_RATE, K)
- with assert_raises(IOError):
+ with assert_raises(OSError):
hll.consume_fasta(filename)
diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py
index 59a56b0..c3567cd 100644
--- a/tests/test_labelhash.py
+++ b/tests/test_labelhash.py
@@ -106,7 +106,7 @@ def test_get_label_dict_save_load_wrong_ksize():
try:
lb.load_labels_and_tags(savepath)
assert 0, "this should not succeed - different ksize"
- except IOError as err:
+ except OSError as err:
print(str(err))
assert "Incorrect k-mer size 19" in str(err)
@@ -136,7 +136,7 @@ def test_save_load_corrupted():
try:
lb.load_labels_and_tags(truncated)
assert 0, "this should not succeed -- truncated file len %d" % (i,)
- except IOError as err:
+ except OSError as err:
print('expected failure for', i, ': ', str(err))
@@ -155,7 +155,7 @@ def test_save_fail_readonly():
try:
lb_pre.save_labels_and_tags(savepath)
assert 0, "this should fail: read-only file"
- except IOError as err:
+ except OSError as err:
print(str(err))
@@ -410,7 +410,7 @@ def test_load_wrong_filetype():
try:
lb.load_labels_and_tags(filename)
assert 0, "this should not succeed - bad file type"
- except IOError as err:
+ except OSError as err:
print(str(err))
assert "Incorrect file format type" in str(err)
@@ -419,7 +419,7 @@ def test_load_wrong_filetype():
try:
lb.load_labels_and_tags(filename)
assert 0, "this should not succeed - bad file signature"
- except IOError as err:
+ except OSError as err:
print(str(err))
assert "Incorrect file signature" in str(err)
@@ -432,6 +432,6 @@ def test_load_wrong_fileversion():
try:
lb.load_labels_and_tags(filename)
assert 0, "this should not succeed - bad file type"
- except IOError as err:
+ except OSError as err:
print(str(err))
assert "Incorrect file format version" in str(err)
diff --git a/tests/test_lump.py b/tests/test_lump.py
index c7eeb0d..511ee00 100644
--- a/tests/test_lump.py
+++ b/tests/test_lump.py
@@ -152,7 +152,7 @@ def test_fakelump_load_stop_tags_trunc():
try:
ht.load_stop_tags(fakelump_fa_foo)
assert 0, "this test should fail"
- except IOError:
+ except OSError:
pass
@@ -165,5 +165,5 @@ def test_fakelump_load_stop_tags_notexist():
try:
ht.load_stop_tags(fakelump_fa_foo)
assert 0, "this test should fail"
- except IOError:
+ except OSError:
pass
diff --git a/tests/test_normalize_by_median.py b/tests/test_normalize_by_median.py
index abdcc58..41d22b4 100644
--- a/tests/test_normalize_by_median.py
+++ b/tests/test_normalize_by_median.py
@@ -6,7 +6,7 @@ from __future__ import unicode_literals
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project@idyll.org
-#
+# pylint: disable=missing-docstring,invalid-name,unused-variable
import os
import shutil
@@ -32,6 +32,20 @@ def test_normalize_by_median_indent():
assert os.path.exists(outfile)
+def test_normalize_by_median_empty_file():
+ infile = utils.get_temp_filename('empty')
+ shutil.copyfile(utils.get_test_data('empty-file'), infile)
+ script = 'normalize-by-median.py'
+ in_dir = os.path.dirname(infile)
+
+ args = [infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ assert 'WARNING:' in err, err
+ assert 'is empty' in err, err
+ assert 'SKIPPED' in err, err
+
+
def test_normalize_by_median():
CUTOFF = '1'
@@ -52,7 +66,7 @@ def test_normalize_by_median():
seqs = [r.sequence for r in screed.open(outfile)]
assert len(seqs) == 1, seqs
assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
- assert "IOErrors" not in err
+ assert "I/O Errors" not in err
def test_normalize_by_median_unpaired_final_read():
@@ -64,13 +78,50 @@ def test_normalize_by_median_unpaired_final_read():
shutil.copyfile(utils.get_test_data('single-read.fq'), infile)
script = 'normalize-by-median.py'
- args = ['-C', CUTOFF, '-k', '17', '-p', infile]
- try:
- (status, out, err) = utils.runscript(script, args, in_dir)
- raise Exception("Shouldn't get to this")
- except AssertionError as e:
- out = str(e)
- assert "ERROR: Unpaired reads when require_paired" in out, out
+ args = ['-C', CUTOFF, '-k', '17', '-p', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert status != 0
+ assert "ERROR: Unpaired reads when require_paired" in err, err
+
+
+def test_normalize_by_median_sanity_check_0():
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('single-read.fq'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-U', '1024', '--max-mem', '60', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert status != 0
+ assert "recommended false positive ceiling of 0.1!" in err, err
+
+
+def test_normalize_by_median_sanity_check_1():
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-filter-abund-Ns.fq'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-U', '83', '--max-tablesize', '17', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert status != 0
+ assert "Warning: The given tablesize is too small!" in err, err
+
+
+def test_normalize_by_median_sanity_check_2():
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-filter-abund-Ns.fq'), infile)
+
+ script = 'normalize-by-median.py'
+ args = ['-U', '83', infile]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ assert "*** INFO: set memory ceiling using auto optimization." in err, err
+ assert "*** Ceiling is: 399 bytes" in err, err
def test_normalize_by_median_unforced_badfile():
@@ -81,12 +132,9 @@ def test_normalize_by_median_unforced_badfile():
in_dir = os.path.dirname(infile)
script = 'normalize-by-median.py'
args = ['-C', CUTOFF, '-k', '17', infile]
- try:
- (status, out, err) = utils.runscript(script, args, in_dir)
- raise Exception("Shouldn't get to this")
- except AssertionError as e:
- out = str(e)
- assert "ERROR: [Errno 2] No such file or directory:" in out, out
+ (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert status != 0
+ assert "ERROR: [Errno 2] No such file or directory:" in err, err
if os.path.exists(outfile):
assert False, '.keep file should have been removed: '
@@ -102,12 +150,9 @@ def test_normalize_by_median_contradictory_args():
script = 'normalize-by-median.py'
args = ['-C', '1', '-k', '17', '--force-single', '-p', '-R',
outfile, infile]
- try:
- (status, out, err) = utils.runscript(script, args, in_dir)
- raise Exception("Shouldn't get to this")
- except AssertionError as e:
- out = str(e)
- assert "cannot both be set" in out, out
+ (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert status != 0
+ assert "cannot both be set" in err, err
def test_normalize_by_median_stdout_3():
@@ -124,7 +169,7 @@ def test_normalize_by_median_stdout_3():
assert 'Total number of unique k-mers: 98' in err, err
assert 'in /dev/stdout' in err, err
- assert "IOErrors" not in err
+ assert "I/O Errors" not in err
@attr('known_failing')
@@ -151,8 +196,62 @@ def test_normalize_by_median_known_good():
assert False
-@attr('huge')
def test_normalize_by_median_report_fp():
+ # this tests basic reporting of diginorm stats => report.out, including
+ # a test of aggregate stats for two input files.
+
+ infile = utils.get_temp_filename('test.fa')
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
+ infile2 = utils.get_temp_filename('test2.fa')
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile2)
+
+ in_dir = os.path.dirname(infile)
+ outfile = utils.get_temp_filename('report.out')
+
+ script = 'normalize-by-median.py'
+ args = ['-C', '1', '-k', '17', '-R', outfile, infile, infile2]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ assert os.path.exists(outfile)
+ report = open(outfile, 'r')
+ line = report.readline().strip()
+ assert line == 'total,kept,f_kept', line
+ line = report.readline().strip()
+ assert line == '1001,1,0.000999', line
+ line = report.readline().strip()
+ assert line == '2002,1,0.0004995', line
+
+
+def test_normalize_by_median_report_fp_hifreq():
+ # this tests high-frequency reporting of diginorm stats for a single
+ # file => report.out.
+
+ infile = utils.get_temp_filename('test.fa')
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
+
+ in_dir = os.path.dirname(infile)
+ outfile = utils.get_temp_filename('report.out')
+
+ script = 'normalize-by-median.py'
+ args = ['-C', '1', '-k', '17', '-R', outfile, infile,
+ '--report-frequency', '100']
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ assert os.path.exists(outfile)
+ report = open(outfile, 'r')
+ line = report.readline().strip()
+ assert line == 'total,kept,f_kept', line
+ line = report.readline().strip()
+ assert line == '100,1,0.01', line
+ line = report.readline().strip()
+ assert line == '200,1,0.005', line
+
+
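
The expected report lines follow from the new CSV header: f_kept is kept/total. A quick check of the figures asserted above (rounding here only mirrors however the script truncates the printed ratio):

    # total,kept,f_kept  =>  f_kept == kept / total
    assert round(1 / 1001.0, 6) == 0.000999    # first file: 1001 reads, 1 kept
    assert round(1 / 2002.0, 7) == 0.0004995   # running total over both files
    assert 1 / 100.0 == 0.01 and 1 / 200.0 == 0.005   # --report-frequency 100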
+@attr('huge')
+def test_normalize_by_median_report_fp_huge():
+ # this tests reporting of diginorm stats => report.out for a large
+ # file, with the default reporting interval of once every 100k.
+
infile = utils.get_temp_filename('test.fa')
in_dir = os.path.dirname(infile)
outfile = utils.get_temp_filename('report.out')
@@ -165,8 +264,9 @@ def test_normalize_by_median_report_fp():
assert "fp rate estimated to be 0.623" in err, err
report = open(outfile, 'r')
+ line = report.readline() # skip header
line = report.readline()
- assert "100000 25261 0.25261" in line, line
+ assert "100000,25261,0.2526" in line, line
def test_normalize_by_median_unpaired_and_paired():
@@ -203,12 +303,12 @@ def test_normalize_by_median_count_kmers_PE():
args = ['-C', CUTOFF, '-k', '17', '--force-single', infile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert 'Total number of unique k-mers: 98' in err, err
- assert 'kept 1 of 2 or 50%' in err, err
+ assert 'kept 1 of 2 or 50.0%' in err, err
args = ['-C', CUTOFF, '-k', '17', '-p', infile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert 'Total number of unique k-mers: 99' in err, err
- assert 'kept 2 of 2 or 100%' in err, err
+ assert 'kept 2 of 2 or 100.0%' in err, err
def test_normalize_by_median_double_file_name():
@@ -220,10 +320,21 @@ def test_normalize_by_median_double_file_name():
script = 'normalize-by-median.py'
args = [utils.get_test_data('test-abund-read-2.fa'), infile]
- try:
- (status, out, err) = utils.runscript(script, args, in_dir)
- except AssertionError as e:
- assert "Duplicate filename--Cannot handle this!" in str(e), str(e)
+ (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert status != 0
+ assert "Duplicate filename--Cannot handle this!" in err, err
+
+
+def test_normalize_by_median_stdin_no_out():
+ infile = utils.get_temp_filename('test-abund-read-2.fa')
+ in_dir = os.path.dirname(infile)
+
+ script = 'normalize-by-median.py'
+ args = ["-"]
+
+ (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert status != 0
+ assert "Accepting input from stdin; output filename" in err, err
def test_normalize_by_median_overwrite():
@@ -326,7 +437,7 @@ def test_normalize_by_median_paired_fq():
assert seqs[0].startswith('GGTTGACGGGGCTCAGGGGG'), seqs
assert seqs[1].startswith('GGTTGACGGGGCTCAGGG'), seqs
- names = [r.name for r in screed.open(outfile, parse_description=False)]
+ names = [r.name for r in screed.open(outfile)]
assert len(names) == 6, names
assert '895:1:37:17593:9954 1::FOO' in names, names
assert '895:1:37:17593:9954 2::FOO' in names, names
@@ -342,7 +453,8 @@ def test_normalize_by_median_impaired():
script = 'normalize-by-median.py'
args = ['-C', CUTOFF, '-p', '-k', '17', infile]
- _, out, err = utils.runscript(script, args, in_dir, fail_ok=True)
+ status, out, err = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert status != 0
assert 'ERROR: Unpaired reads ' in err, err
@@ -365,7 +477,7 @@ def test_normalize_by_median_force():
(status, out, err) = utils.runscript(script, args, in_dir)
assert '*** Skipping' in err
- assert '** IOErrors' in err
+ assert '** I/O Errors' in err
def test_normalize_by_median_no_bigcount():
@@ -417,6 +529,7 @@ def test_normalize_by_median_emptycountingtable():
script = 'normalize-by-median.py'
args = ['-C', CUTOFF, '--loadtable', infile, infile]
(status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert status != 0
assert 'ValueError' in err, (status, out, err)
@@ -431,10 +544,7 @@ def test_normalize_by_median_fpr():
args = ['-f', '-k 17', '-x ' + str(MAX_TABLESIZE_PARAM), infile]
(status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
-
- print(out)
- print(err)
-
+ assert status != 0
assert os.path.exists(infile + '.keep'), infile
assert '** ERROR: the graph structure is too small' in err, err
@@ -450,7 +560,7 @@ def write_by_chunks(infile, outfile, CHUNKSIZE=8192):
ofile.close()
-def test_normalize_by_median_streaming():
+def test_normalize_by_median_streaming_0():
CUTOFF = '20'
infile = utils.get_test_data('100-reads.fq.gz')
@@ -477,6 +587,32 @@ def test_normalize_by_median_streaming():
assert linecount == 400
+def test_normalize_by_median_streaming_1():
+ CUTOFF = '20'
+
+ infile = utils.get_test_data('test-filter-abund-Ns.fq')
+ in_dir = os.path.dirname(infile)
+ fifo = utils.get_temp_filename('fifo')
+ outfile = utils.get_temp_filename('outfile')
+
+ # Use a fifo to copy stdout to a file for checking
+ os.mkfifo(fifo)
+ thread = threading.Thread(target=write_by_chunks, args=(infile, fifo))
+ thread.start()
+
+ # Execute diginorm
+ script = 'normalize-by-median.py'
+ args = ['-C', CUTOFF, '-k', '17', '-o', outfile, fifo]
+ (status, out, err) = utils.runscript(script, args, in_dir)
+
+ # Merge the thread
+ thread.join()
+
+ assert os.path.exists(outfile), outfile
+ assert 'Total number of unique k-mers: 98' in err, err
+ assert 'fifo is empty' not in err, err
+
+
def test_diginorm_basic_functionality_1():
# each of these pairs has both a multicopy sequence ('ACTTCA...') and
# a random sequence. With 'C=1' and '-p', all should be kept.
diff --git a/tests/test_oxli_functions.py b/tests/test_oxli_functions.py
index 63ad48c..22e4371 100644
--- a/tests/test_oxli_functions.py
+++ b/tests/test_oxli_functions.py
@@ -16,26 +16,26 @@ from oxli import functions
def test_estimate_functions_1():
- res = functions.estimate_optimal_with_N_and_M(99, 1024)
+ res = functions.estimate_optimal_with_K_and_M(99, 1024)
assert res[0] == 7, res[0]
assert res[1] == 146, res[1]
assert res[2] == 1022, res[2]
assert abs(.008 - res[3]) < .001, res[3]
- res = functions.estimate_optimal_with_N_and_f(99, 0.00701925498897)
+ res = functions.estimate_optimal_with_K_and_f(99, 0.00701925498897)
assert res[0] == 7, res[0]
assert res[1] == 145, res[1]
assert res[2] == 1015, res[2]
assert abs(.008 - res[3]) < .002, res[3]
- res = functions.estimate_optimal_with_N_and_M(1024, 2)
+ res = functions.estimate_optimal_with_K_and_M(1024, 2)
assert res[0] == 1, res[0]
assert res[1] == 2, res[1]
assert res[2] == 2, res[2]
assert res[3] == 1.0, res[3]
# using a crazy high FP rate just for coverage
- res = functions.estimate_optimal_with_N_and_f(1024, 0.7)
+ res = functions.estimate_optimal_with_K_and_f(1024, 0.7)
assert res[0] == 1, res[0]
assert res[1] == 850, res[1]
assert res[2] == 850, res[2]
@@ -43,18 +43,46 @@ def test_estimate_functions_1():
def test_estimate_functions_namedtup():
- res = functions.estimate_optimal_with_N_and_M(99, 1024)
+ res = functions.estimate_optimal_with_K_and_M(99, 1024)
assert res.num_htables == 7, res[0]
assert res.htable_size == 146, res[1]
assert res.mem_use == 1022, res[2]
assert abs(.008 - res.fp_rate) < .001, res[3]
- res = functions.estimate_optimal_with_N_and_f(99, 0.00701925498897)
+ res = functions.estimate_optimal_with_K_and_f(99, 0.00701925498897)
assert res.num_htables == 7, res[0]
assert res.htable_size == 145, res[1]
assert res.mem_use == 1015, res[2]
assert abs(.008 - res.fp_rate) < .002, res[3]
+def test_optimal_size_function():
+ res = functions.optimal_size(99, mem_cap=1024)
+ assert res.num_htables == 7, res[0]
+ assert res.htable_size == 146, res[1]
+ assert res.mem_use == 1022, res[2]
+ assert abs(.008 - res.fp_rate) < .001, res[3]
+
+ res = functions.optimal_size(99, fp_rate=0.00701925498897)
+ assert res.num_htables == 7, res[0]
+ assert res.htable_size == 145, res[1]
+ assert res.mem_use == 1015, res[2]
+ assert abs(.008 - res.fp_rate) < .002, res[3]
+
+ try:
+ functions.optimal_size(99, mem_cap=1024, fp_rate=0.00701925498897)
+ assert 0, "this should fail"
+ except TypeError as err:
+ print(str(err))
+ assert "num_kmers and either mem_cap or fp_rate" in str(err)
+
+ try:
+ functions.optimal_size(99)
+ assert 0, "this should fail"
+ except TypeError as err:
+ print(str(err))
+ assert "num_kmers and either mem_cap or fp_rate" in str(err)
+
+
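
optimal_size() is the new front door over the two renamed estimators: pass num_kmers plus exactly one of mem_cap or fp_rate and the same namedtuple comes back. Typical use, with values taken from the assertions above:

    from oxli import functions

    res = functions.optimal_size(99, mem_cap=1024)
    # res.num_htables == 7, res.htable_size == 146,
    # res.mem_use == 1022, res.fp_rate ~= 0.008
    res = functions.optimal_size(99, fp_rate=0.00701925498897)
    # res.num_htables == 7, res.htable_size == 145, res.mem_use == 1015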
def test_output_gen():
res = functions.optimal_args_output_gen(99, 0.00701925498897)
diff --git a/tests/test_read_aligner.py b/tests/test_read_aligner.py
index 0fa9eec..adbf24e 100644
--- a/tests/test_read_aligner.py
+++ b/tests/test_read_aligner.py
@@ -1,17 +1,70 @@
+from __future__ import print_function
+from __future__ import absolute_import
#
# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
# the three-clause BSD license; see LICENSE. Contact: ctb@msu.edu
#
-from __future__ import print_function
import khmer
-from nose.tools import assert_almost_equals
+from . import khmer_tst_utils as utils
+from nose.tools import assert_almost_equals  # used by check_query below
+
+
+def pretty_compare(a, b):
+ print(len(a), len(b))
+
+ line1 = []
+ line2 = []
+ line3 = []
+ for (x, y) in zip(a, b):
+ line1.append(x)
+ line2.append(y)
+ if x == y:
+ line3.append('|')
+ else:
+ line3.append('x')
+
+ for i in range(0, len(line1), 60):
+ print("".join(line1[i:i + 60]))
+ print("".join(line3[i:i + 60]))
+ print("".join(line2[i:i + 60]))
-# DISABLING TESTS until model probabilities are finalized
def eq_(v1, v2):
- return True
+ assert len(v1)
+ if v1 != v2:
+ pretty_compare(v1, v2)
+ assert v1 == v2, (v1, v2)
+
+
+def neq_(v1, v2):
+ assert len(v1)
+ if v1 == v2:
+ pretty_compare(v1, v2)
+ assert v1 != v2, (v1, v2)
+
+
+def test_graph_attribute():
+ ch = khmer.CountingHash(10, 1048576, 1)
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ assert aligner.graph is ch
+
+
+def test_align_nothing():
+ ch = khmer.CountingHash(10, 1048576, 1)
+ read = "ACCAAGGCTCGAGATTTACC"
+
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(20):
+ ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
+ score, graphAlign, readAlign, trunc = aligner.align(read)
+
+ print(score, graphAlign, readAlign)
+
+ assert trunc
+ assert len(graphAlign) == 0
+ assert len(readAlign) == 0
def test_alignnocov():
@@ -26,9 +79,284 @@ def test_alignnocov():
# should be the same
eq_(readAlign, 'ACCTAGGTTCGACATGTACC')
eq_(graphAlign, 'ACCTAGGTTCGACATGTACC')
+ assert not trunc
+
+
+def test_align_middle():
+ ch = khmer.CountingHash(10, 1048576, 1)
+ read = "TCGACAAGTCCTTGACAGAT"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(20):
+ ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
+ ch.consume(read)
+ score, graphAlign, readAlign, trunc = aligner.align(read)
+
+ # should be the same
+ eq_(readAlign, read)
+ eq_(graphAlign, read)
+ assert not trunc
+
+
+def test_align_middle_trunc():
+ return # @CTB
+
+ ch = khmer.CountingHash(10, 1048576, 1)
+ read = "TCGACAAGTCCTTGACAGATGGGGGG"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(20):
+ ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
+
+ # omit suffix from graph
+ ch.consume(read[:-5])
+ score, graphAlign, readAlign, trunc = aligner.align(read)
+
+ # should not be the same...
+ neq_(readAlign, read)
+ neq_(graphAlign, read)
+
+ eq_(readAlign, read[:-5])
+ eq_(graphAlign, read[:-5])
+
+ # ...but truncated
+ assert trunc
+
+
+def test_align_middle_trunc_2():
+ return # @CTB
+
+ ch = khmer.CountingHash(10, 1048576, 1)
+ read = "GGGGGGGGGGGGTCGACAAGTCCTTGACAGAT"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(20):
+ ch.consume("AAAAAAAAAAAATCGACAAGTCCTTGACAGAT")
+
+ # omit prefix from graph
+ ch.consume(read[12:])
+ score, graphAlign, readAlign, trunc = aligner.align(read)
+
+ # here, the alignment must start not at the beginning
+ print(readAlign)
+ print(graphAlign)
+
+ eq_(readAlign, read[12:])
+ eq_(graphAlign, read[12:])
+
+ # ...but truncated
+ assert trunc
+
+
+def test_align_fwd_nothing():
+ ch = khmer.CountingHash(10, 1048576, 1)
+ read = "ACCAAGGCTCGAGATTTACC"
+
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(20):
+ ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
+ score, graphAlign, readAlign, trunc, _ = aligner.align_forward(read)
+
+ print(score, graphAlign, readAlign)
+
+ assert trunc
+ assert len(graphAlign) == 0
+ assert len(readAlign) == 0
+
+
+def test_align_fwd_nocov():
+ ch = khmer.CountingHash(10, 1048576, 1)
+ read = "ACCTAGGTTCGACATGTACC"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(20):
+ ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
+ ch.consume("ACCTAGGTTCGACATGTACC")
+ score, graphAlign, readAlign, trunc, _ = aligner.align_forward(read)
+
+ # should be the same
+ eq_(readAlign, 'ACCTAGGTTCGACATGTACC')
+ eq_(graphAlign, 'ACCTAGGTTCGACATGTACC')
+ assert not trunc
+
+
+def test_align_fwd_middle():
+ ch = khmer.CountingHash(10, 1048576, 1)
+ read = "TCGACAAGTCCTTGACAGAT"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(20):
+ ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
+ ch.consume(read)
+ score, graphAlign, readAlign, trunc, _ = aligner.align_forward(read)
+
+ # should be the same
+ eq_(readAlign, read)
+ eq_(graphAlign, read)
+ assert not trunc
+
+
+def test_align_fwd_middle_trunc():
+ return # @CTB
+ ch = khmer.CountingHash(10, 1048576, 1)
+ read = "TCGACAAGTCCTTGACAGATGGGGGG"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(20):
+ ch.consume("AGAGGGAAAGCTAGGTTCGACAAGTCCTTGACAGAT")
+
+ # omit suffix from graph
+ ch.consume(read[:-5])
+ score, graphAlign, readAlign, trunc, _ = aligner.align_forward(read)
+
+ # should not be the same...
+ neq_(readAlign, read)
+ neq_(graphAlign, read)
+
+ eq_(readAlign, read[:-5])
+ eq_(graphAlign, read[:-5])
+
+ # ...but truncated
+ assert trunc
+
+
+def test_align_fwd_middle_trunc_2():
+ ch = khmer.CountingHash(10, 1048576, 1)
+ read = "GGGGGGGGGGGGTCGACAAGTCCTTGACAGAT"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(20):
+ ch.consume("AAAAAAAAAAAATCGACAAGTCCTTGACAGAT")
+
+ # omit prefix from graph
+ ch.consume(read[12:])
+ score, graphAlign, readAlign, trunc, _ = aligner.align_forward(read)
+
+ # this will fail, because align_forward chooses the first kmer as the
+ # seed.
+ assert not readAlign
+ assert not graphAlign
+ assert trunc
+
+
+def test_align_fwd_covs_1():
+ K = 10
+
+ ch = khmer.CountingHash(K, 1048576, 1)
+ read = "GTCGACAAGTCCTTGACAGAT"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(19):
+ ch.consume(read)
+
+ ch.consume("CTCGACAAGTCCTTGACAGAT")
+ # ^
+ score, g, r, is_t, covs = aligner.align_forward(read)
+
+ for start in range(0, len(read) - K + 1):
+ print(ch.get(read[start:start + K]), end=' ')
+ print('')
+
+ assert len(covs) == len(read)
+ assert covs[0] == 19
+ assert min(covs[1:-K]) == 20, covs
+ assert max(covs) == 20, covs
+
+
+def test_align_fwd_covs_2():
+ K = 10
+
+ ch = khmer.CountingHash(K, 1048576, 1)
+ read = "GTCGACAAGTCCTTGACAGAT"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(19):
+ ch.consume(read)
+
+ ch.consume("GACGACAAGTCCTTGACAGAT")
+ # ^
+ score, g, r, is_t, covs = aligner.align_forward(read)
+
+ print(covs, g)
+ for start in range(0, len(read) - K + 1):
+ print(ch.get(read[start:start + K]), end=' ')
+ print('')
+
+ assert len(covs) == len(read)
+ assert covs[0] == 19
+ assert covs[1] == 19
+ assert min(covs[2:-K]) == 20, covs
+ assert max(covs) == 20, covs
+
+
+def test_align_fwd_covs_3():
+ K = 10
+
+ ch = khmer.CountingHash(K, 1048576, 1)
+ read = "GTCGACAAGTCCTTGACAGAT"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(19):
+ ch.consume(read)
+
+ ch.consume("GTAGACAAGTCCTTGACAGAT")
+ # ^
+ score, g, r, is_t, covs = aligner.align_forward(read)
+
+ print(covs, g)
+ for start in range(0, len(read) - K + 1):
+ print(ch.get(read[start:start + K]), end=' ')
+ print('')
+
+ assert len(covs) == len(read)
+ assert covs[0] == 19
+ assert covs[1] == 19
+ assert covs[2] == 19
+ assert min(covs[3:-K]) == 20, covs
+ assert max(covs) == 20, covs
+
+
+def test_align_fwd_covs_4():
+ K = 10
+
+ ch = khmer.CountingHash(K, 1048576, 1)
+ read = "GTCGACAAGTCCTTGACAGAT"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(19):
+ ch.consume(read)
+
+ ch.consume("GTCGACAAGTCCTTGACAGAG")
+ # ^
+ score, g, r, is_t, covs = aligner.align_forward(read)
+
+ print(covs, g)
+ for start in range(0, len(read) - K + 1):
+ print(ch.get(read[start:start + K]), end=' ')
+ print('')
+
+ assert len(covs) == len(read)
+ assert covs[-K] == 19
+ assert min(covs[:-K]) == 20, covs
+ assert max(covs) == 20, covs
+
+
+def test_align_fwd_covs_5():
+ K = 10
+
+ ch = khmer.CountingHash(K, 1048576, 1)
+ read = "GTCGACAAGTCCTTGACAGAT"
+ aligner = khmer.ReadAligner(ch, 0, 0)
+ for i in range(19):
+ ch.consume(read)
+
+ ch.consume("GTCGACAAGTCCTTGACAGCT")
+ # ^
+ score, g, r, is_t, covs = aligner.align_forward(read)
+
+ print(covs, g)
+ for start in range(0, len(read) - K + 1):
+ print(ch.get(read[start:start + K]), end=' ')
+ print('')
+
+ assert len(covs) == len(read)
+ assert covs[-K] == 19
+ assert covs[-K - 1] == 19
+ assert min(covs[:-K - 1]) == 20, covs
+ assert max(covs) == 20, covs
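
Taken together, the five covs tests fix the shape of align_forward's fifth return value: a coverage vector as long as the read, where a single divergent copy out of twenty depresses exactly the k-mers overlapping the mismatch (entries past the last k-mer start position are left unasserted). A sketch of the clean case, under the same assumptions as the tests above:

    K = 10
    ch = khmer.CountingHash(K, 1048576, 1)
    aligner = khmer.ReadAligner(ch, 0, 0)
    read = "GTCGACAAGTCCTTGACAGAT"
    for _ in range(20):                # twenty identical copies this time
        ch.consume(read)
    score, g, r, is_trunc, covs = aligner.align_forward(read)
    assert len(covs) == len(read)
    n_starts = len(read) - K + 1       # one count per k-mer start position
    # with no divergent copy, every k-mer should sit at count 20
    assert min(covs[:n_starts]) == max(covs[:n_starts]) == 20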
def test_simple_readalign():
+ return # @CTB
ch = khmer.CountingHash(10, 1048576, 1)
aligner = khmer.ReadAligner(ch, 2, 0)
for i in range(20):
@@ -43,11 +371,12 @@ def test_simple_readalign():
# AGCTAGGTTCGACAAGT CCT
# ACCTAGGTTCGACAAGTaCC
# --CTAGGTTCGACATGT-CC
- eq_(graphAlign, 'AGCTAGGTTCGACATGTCC-')
- eq_(readAlign, 'ACCTAGGTTCGACAAGTACc')
+ eq_(graphAlign, 'AGCTAGGTTCGACATGTCCT')
+ eq_(readAlign, 'ACCTAGGTTCGACAAGTACC')
def test_readalign():
+ return # @CTB
ch = khmer.CountingHash(10, 1048576, 1)
aligner = khmer.ReadAligner(ch, 1, 0)
for i in range(20):
@@ -59,8 +388,8 @@ def test_readalign():
score, graphAlign, readAlign, trunc = aligner.align(read)
- eq_(readAlign, 'ACCTAGGTTCGACATGTACc')
- eq_(graphAlign, 'AGCTAGGTTCGACAAGTCC-')
+ eq_(readAlign, 'ACCTAGGTTCGACATGTACC')
+ eq_(graphAlign, 'AGCTAGGTTCGACAAGTCCT')
ht_seqs = ["TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGCTTTAACTGG"
@@ -81,7 +410,7 @@ queries = [
"seq": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGCTTTAA"
"CTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTGTTGCAATCTTAACAA"
"CCTCTTTAC",
- "score": 278.376028204,
+ "score": 274.76338282696173,
"graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCG"
"CTTTAACTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTGTTGCAATCT"
"TAACAACCTCTTTAC",
@@ -94,9 +423,9 @@ queries = [
"seq": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGCTTTAA"
"CTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTATTGCAATCTTAACAA"
"CCTCTTTAC",
- "score": 271.753976385,
+ "score": 274.76338282696173,
"graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCG"
- "CTTTAACTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTGTTGCAATCT"
+ "CTTTAACTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTATTGCAATCT"
"TAACAACCTCTTTAC",
"read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGC"
"TTTAACTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTATTGCAATCTT"
@@ -107,7 +436,7 @@ queries = [
"seq": "TAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGCTTTAAC"
"TGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTGTTGCAATCTTAACAAC"
"CTCTTTAC",
- "score": 276.416710585,
+ "score": 272.841515695261,
"graph_aln": "TAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGC"
"TTTAACTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTGTTGCAATCTT"
"AACAACCTCTTTAC",
@@ -120,7 +449,7 @@ queries = [
"seq": "TAAATGCGCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGCTTTAAC"
"TGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTGTTGCAATCTTAACAAC"
"CTCTTTAC",
- "score": 269.794658765,
+ "score": 268.2640868672253,
"graph_aln": "TAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGC"
"TTTAACTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTGTTGCAATCTT"
"AACAACCTCTTTAC",
@@ -131,42 +460,42 @@ queries = [
},
{
"seq": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAA",
- "score": 97.5386525659,
+ "score": 97.37145206396536,
"graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAA",
"read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAA",
"truncated": False
},
{
"seq": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTAGATGTTTGATTATCAA",
- "score": 90.9166007464,
+ "score": 92.79402323592961,
"graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAA",
"read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTAGATGTTTGATTATCAA",
"truncated": False
},
{
"seq": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTATTGATTATCAA",
- "score": 92.9385894977,
+ "score": 84.74620322710143,
"graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGT-TTGATTATCAA",
"read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTaTTGATTATCAA",
"truncated": False
},
{
"seq": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATTGTTTGATTATCAA",
- "score": 84.3383420486,
- "graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATAtGTTTGATTATCAA",
- "read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATT-GTTTGATTATCAA",
+ "score": 82.2182409986759,
+ "graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATaTGTTTGATTATCAA",
+ "read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTAT-TGTTTGATTATCAA",
"truncated": False
},
{
"seq": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTATTGATTATCAA",
- "score": 92.9385894977,
+ "score": 84.74620322710143,
"graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGT-TTGATTATCAA",
"read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTaTTGATTATCAA",
"truncated": False
},
{
"seq": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTATAGATTATCAA",
- "score": 86.3165376783,
+ "score": 80.1687743990657,
"graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGT-TTGATTATCAA",
"read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTaTAGATTATCAA",
"truncated": False
@@ -175,11 +504,11 @@ queries = [
"seq": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATAATTTTGCCGCTTTAAC"
"TGGGTCTAGTTTCTACTGCAAACTTTCCACCAACTAGTTTTTCTGCATCCTTTGTTGCAATCTTAACAA"
"CCTCTTTAC",
- "score": 236.115256507,
- "graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAaTT-TtGCC"
+ "score": 237.81111469018322,
+ "graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATcAATTTTGCC"
"GCTTTAACTGGGTCT-GTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTGTTGCAAT"
"CTTAACAACCTCTTTAC",
- "read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATAA-TTtT-GCCG"
+ "read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTAT-AATTTTGCCG"
"CTTTAACTGGGTCTaGTTTCTACTGCAAACTTTCCACCAACTAGTTTTTCTGCATCCTTTGTTGCAATC"
"TTAACAACCTCTTTAC",
"truncated": False
@@ -187,38 +516,135 @@ queries = [
{
"seq": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGAAAATAATTAAAAAAAAAAAAA"
"AAAAAAAAAAAAAAAAAAAAAAAAAA",
- "score": 44.7543247314,
- "graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATatgtt",
- "read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTAT-----",
- "truncated": True
+ "score": 5.331560863368736,
+ "graph_aln":
+ "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGCTTTAACTGGGTC"
+ "TGTTTCTACTGCAAACTTT",
+ "read_aln":
+ "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGAAAATAATTAAAAAAAAAAAAAAAAAAAA"
+ "AAAAAAAAAAAAAAAAAAA",
+ "truncated": False
},
{
"seq": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGCTTTAA"
"CTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGAAAAATGTCATCCTGTATTGCAATCTTAACAA"
"CCTCTTTAC",
- "score": 227.446444943,
+ "score": 274.76338282696173,
"graph_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCG"
- "CTTTAACTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGTtTTTCTG-CATCCTGTGTTGCAATC"
+ "CTTTAACTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGAAAAATGTCATCCTGTATTGCAATC"
"TTAACAACCTCTTTAC",
"read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGC"
- "TTTAACTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGA-AAAATGtCATCCTGTATTGCAATCT"
+ "TTTAACTGGGTCTGTTTCTACTGCAAACTTTCCACCAACAAGAAAAATGTCATCCTGTATTGCAATCT"
"TAACAACCTCTTTAC",
"truncated": False
+ },
+ { # the motif of 32 bases is an identical match to the HT seqs; the rest
+ # is random. "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTAT" is from the HT seqs
+ "seq":
+ "ACAAGGCCATTTGTTCGCATTCTGAAGCCGGCTTCCACCATGGTACTGGGAAACTGTCGGAATATTAAA"
+ "TGCCCAATTTTTCCCTCTTTTCTTCTATCCGCAGTATGGACACTGTTTTCCTGAATTTCATTGACAGTT"
+ "TAATTTACTGCGGTCACGCGGAACT",
+ "score": 68.17022311739733,
+ "graph_aln":
+ "ACAAGGCCATTTGTTCGCATTCTGAAGCCGGCTTCCACCATGGTACTGGGAAACTGTCGGAATATTAAA"
+ "TGCCCAATTTTTCCCTCTTTTCTTCTATCCGCAGTATGGACACTGTTTTCCTGAATTTCATTGACAGTT"
+ "TAATTTACTGCGGTCACGCGGAACT",
+ "read_aln": "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTAT",
+ "truncated": True,
+ "description": "truncated-alignment-bc-missing-kmers"
+ },
+ { # Testing for min distance between correctable SNPs
+ # 1st SNP is at position 2+K from beginning, 2nd SNP at position 2+K+K
+ "seq":
+ "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATACGTTTGATTATCAATTTTGCCGCTTTAACTGG"
+ "ATCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTATTGCAATCTTAACAAC"
+ "CTCTTTAC",
+ "score": 265.608525171,
+ "graph_aln":
+ "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGCTTTAACTGG"
+ "GTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTATTGCAATCTTAACAAC"
+ "CTCTTTAC",
+ "read_aln":
+ "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATACGTTTGATTATCAATTTTGCCGCTTTAACTGG"
+ "ATCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTATTGCAATCTTAACAAC"
+ "CTCTTTAC",
+ "truncated": False,
+ "description": "2 SNPs, one K apart",
+ },
+ { # Testing for min distance between correctable SNPs
+ # 1st SNP is at position 2+K from beginning, 2nd SNP at position
+ # 2+K+K-1
+ "seq":
+ "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATACCTTTGATTATCAATTTTGCCGCTTTAACTGG"
+ "GTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTATTGCAATCTTAACAAC"
+ "CTCTTTAC",
+ "score": 265.608525171,
+ "graph_aln":
+ "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATATGTTTGATTATCAATTTTGCCGCTTTAACTGG"
+ "GTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTATTGCAATCTTAACAAC"
+ "CTCTTTAC",
+ "read_aln":
+ "TTAAATGCCCAATTTTTCCCTCTTTTCTTCTATACGTTTGATTATCAATTTTGCCGCTTTAACTAG"
+ "GTCTGTTTCTACTGCAAACTTTCCACCAACAAGTTTTTCTGCATCCTGTATTGCAATCTTAACAAC"
+ "CTCTTTAC",
+ "truncated": False,
+ "description": "2 SNPs, K-2 apart",
}
+
]
+def check_query(aligner, query):
+ score, graphAlign, readAlign, trunc = aligner.align(query["seq"])
+ print(query["seq"])
+ print(graphAlign, query["graph_aln"])
+ print(readAlign, query["read_aln"])
+ print(trunc, query["truncated"])
+ print(score, query["score"])
+ assert graphAlign == query["graph_aln"], "\n%r != \n%r" % \
+ (graphAlign, query["graph_aln"])
+ assert readAlign == query["read_aln"], "\n%r != \n%r" % \
+ (readAlign, query["read_aln"])
+ eq_(trunc, query["truncated"])
+ if query["score"] > 0:
+ assert_almost_equals(score, query["score"])
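
A note on the score comparison: nose's assert_almost_equals, like unittest's
assertAlmostEqual, rounds the difference to 7 decimal places by default, which
is why the long float literals above still compare cleanly. A khmer-independent
illustration:

from nose.tools import assert_almost_equals

# passes: |a - b| is ~2.3e-8, which rounds to 0 at 7 decimal places
assert_almost_equals(84.74620322710143, 84.74620325)

# would fail: a difference of 7e-7 is still visible at 7 places
# assert_almost_equals(84.7462032, 84.7462039)
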
+
+
def test_readalign_new():
+ return # @CTB
ch = khmer.CountingHash(32, 1048576, 1)
aligner = khmer.ReadAligner(ch, 1, 0)
for seq in ht_seqs:
ch.consume(seq)
for query in queries:
- score, graphAlign, readAlign, trunc = aligner.align(query["seq"])
- print(graphAlign)
- print(readAlign)
- eq_(graphAlign, query["graph_aln"])
- eq_(readAlign, query["read_aln"])
- eq_(trunc, query["truncated"])
- # assert_almost_equals(score, query["score"])
+ if "description" in query:
+ check_query.description = query["description"]
+ yield check_query, aligner, query
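
The yield in this loop uses nose's test-generator protocol: each yielded
(function, args...) tuple becomes its own test case, and nose reads the
description attribute off the yielded callable to name it. A minimal,
self-contained sketch; check_positive and the sample values are hypothetical,
only the yield/.description mechanics mirror test_readalign_new above:

def check_positive(value):
    # one generated test case per value
    assert value > 0, value

def test_values():
    for value in (1, 2, 3):
        # nose uses this attribute as the displayed test name
        check_positive.description = "value-%d-is-positive" % value
        yield check_positive, value
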
+
+
+def test_readaligner_load():
+ ct = khmer.CountingHash(32, 1048576, 1)
+ parameters_json = utils.get_test_data('readaligner-default.json')
+ a_aligner = khmer.ReadAligner(ct, 0, 0, filename=parameters_json)
+ a_scoring_matrix = a_aligner.get_scoring_matrix()
+ a_transition_probabilities = a_aligner.get_transition_probabilities()
+ assert a_scoring_matrix[0] == -0.06642736173897607, a_scoring_matrix[0]
+ assert a_transition_probabilities[0][0] == -0.021973842014145723, (
+ a_transition_probabilities[0][0])
+
+ for seq in ht_seqs:
+ ct.consume(seq)
+
+ for query in queries:
+ a_aligner.align(query['seq'])
+
+ b_aligner = khmer.ReadAligner(
+ ct, 0, 0, transition_probabilities=a_transition_probabilities,
+ scoring_matrix=a_scoring_matrix)
+ b_scoring_matrix = b_aligner.get_scoring_matrix()
+ b_transition_probabilities = b_aligner.get_transition_probabilities()
+ assert b_scoring_matrix == a_scoring_matrix, (
+ a_scoring_matrix, b_scoring_matrix)
+ assert b_transition_probabilities == a_transition_probabilities, (
+ a_transition_probabilities, b_transition_probabilities)
diff --git a/tests/test_read_parsers.py b/tests/test_read_parsers.py
index c785772..c55d17a 100644
--- a/tests/test_read_parsers.py
+++ b/tests/test_read_parsers.py
@@ -87,7 +87,7 @@ def test_num_reads_truncated():
try:
for read in rparser:
n_reads += 1
- except IOError as err:
+ except ValueError as err:
assert "Sequence is empty" in str(err), str(err)
assert rparser.num_reads == 1, "%d valid reads in file, got %d" % (
n_reads, rparser.num_reads)
@@ -109,7 +109,7 @@ def test_gzip_decompression_truncated():
for read in rparser:
pass
assert 0, "this should fail"
- except IOError as err:
+ except OSError as err:
print(str(err))
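
The IOError-to-OSError changes in this file track PEP 3151: from Python 3.3
on, IOError is a built-in alias of OSError, so catching OSError covers both.
For instance:

# On Python 3.3+ the two names refer to the same class (PEP 3151)
assert IOError is OSError

try:
    open('/no/such/file')
except OSError as err:  # also catches what Python 2 called IOError
    print(err)
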
@@ -120,7 +120,9 @@ def test_gzip_decompression_truncated_pairiter():
for read in rparser.iter_read_pairs():
pass
assert 0, "this should fail"
- except IOError as err:
+ except OSError as err:
+ print(str(err))
+ except ValueError as err:
print(str(err))
@@ -141,7 +143,9 @@ def test_bzip2_decompression_truncated():
for read in rparser:
pass
assert 0, "this should fail"
- except IOError as err:
+ except OSError as err:
+ print(str(err))
+ except ValueError as err:
print(str(err))
@@ -152,7 +156,9 @@ def test_bzip2_decompression_truncated_pairiter():
for read in rparser.iter_read_pairs():
pass
assert 0, "this should fail"
- except IOError as err:
+ except OSError as err:
+ print(str(err))
+ except ValueError as err:
print(str(err))
@@ -162,7 +168,7 @@ def test_badbzip2():
for read in rparser:
pass
assert 0, "this should fail"
- except IOError as err:
+ except OSError as err:
print(str(err))
except ValueError as err:
print(str(err))
@@ -269,7 +275,7 @@ def test_read_truncated():
for read in rparser:
pass
assert 0, "No exception raised on a truncated file"
- except IOError as err:
+ except ValueError as err:
assert "Sequence is empty" in str(err), str(err)
@@ -317,6 +323,7 @@ def test_read_pair_iterator_in_error_mode():
assert all(matches) # Assert ALL the matches. :-]
+@attr('linux')
def test_read_pair_iterator_in_error_mode_xfail():
rparser = \
@@ -327,7 +334,22 @@ def test_read_pair_iterator_in_error_mode_xfail():
for rpair in rparser.iter_read_pairs():
pass
failed = False
- except IOError as exc:
+ except ValueError as exc:
+ assert "Invalid read pair" in str(exc), str(exc)
+ assert failed
+
+
+def test_read_pair_iterator_in_error_mode_xfail_osxsafe():
+
+ rparser = \
+ ReadParser(utils.get_test_data("test-abund-read-impaired.fa"))
+
+ failed = True
+ try:
+ for rpair in rparser.iter_read_pairs():
+ pass
+ failed = False
+ except ValueError as exc:
pass
assert failed
@@ -361,6 +383,8 @@ def test_constructor():
assert 0, "ReadParser shouldn't accept a non-existant file name"
except ValueError as err:
print(str(err))
+ except OSError as err:
+ print(str(err))
def test_iternext():
@@ -370,7 +394,7 @@ def test_iternext():
for read_1, read_2 in rparser.iter_read_pairs():
read_pairs.append(read_1, read_2)
assert 0, "Shouldn't be able to iterate over non FASTA file"
- except IOError as err:
+ except OSError as err:
print(str(err))
except ValueError as err:
print(str(err))
diff --git a/tests/test_sandbox_scripts.py b/tests/test_sandbox_scripts.py
index ef85a82..c968e5a 100644
--- a/tests/test_sandbox_scripts.py
+++ b/tests/test_sandbox_scripts.py
@@ -1,5 +1,3 @@
-from __future__ import print_function
-from __future__ import absolute_import
#
# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2015. It is licensed under
@@ -9,6 +7,10 @@ from __future__ import absolute_import
# pylint: disable=C0111,C0103,E1103,W0612
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
import sys
import os
import os.path
@@ -23,6 +25,7 @@ import imp
from . import khmer_tst_utils as utils
import khmer
import screed
+from .test_scripts import _make_counting
def scriptpath(script):
@@ -100,7 +103,7 @@ def test_sweep_reads():
contigfile, readfile, 'junkfile.fa']
status, out, err = utils.runscript(
- script, args, in_dir, fail_ok=True, sandbox=True)
+ script, args, in_dir, sandbox=True)
# check if the bad file was skipped without issue
assert 'ERROR' in err, err
@@ -146,7 +149,7 @@ def test_sweep_reads_fq():
contigfile, readfile, 'junkfile.fa']
status, out, err = utils.runscript(
- script, args, in_dir, fail_ok=True, sandbox=True)
+ script, args, in_dir, sandbox=True)
# check if the bad file was skipped without issue
assert 'ERROR' in err, err
@@ -255,3 +258,32 @@ def test_saturate_by_median():
status, out, err = utils.runscript(script, args, sandbox=True)
assert status == 0
+
+
+def test_count_kmers_1():
+ infile = utils.get_temp_filename('input.fa')
+ shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
+ ctfile = _make_counting(infile)
+
+ script = scriptpath('count-kmers.py')
+ args = [ctfile, infile]
+
+ status, out, err = utils.runscript(script, args, os.path.dirname(infile),
+ sandbox=True)
+
+ out = out.splitlines()
+ assert 'TTGTAACCTGTGTGGGGTCG,1' in out
+
+
+def test_count_kmers_2_single():
+ infile = utils.get_temp_filename('input.fa')
+ shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
+
+ script = scriptpath('count-kmers-single.py')
+ args = ['-x', '1e7', '-k', '20', '-N', '2', infile]
+
+ status, out, err = utils.runscript(script, args, os.path.dirname(infile),
+ sandbox=True)
+
+ out = out.splitlines()
+ assert 'TTGTAACCTGTGTGGGGTCG,1' in out
diff --git a/tests/test_script_arguments.py b/tests/test_script_arguments.py
index e7cdc8d..25c6b01 100644
--- a/tests/test_script_arguments.py
+++ b/tests/test_script_arguments.py
@@ -18,7 +18,17 @@ from . import khmer_tst_utils as utils
import argparse
import khmer.kfile
from khmer import khmer_args
-from cStringIO import StringIO
+try:
+ from StringIO import StringIO
+except ImportError:
+ from io import StringIO
+
+import sys
+
+
+# For map(long, [list of ints]) cross-version hackery
+if sys.version_info.major > 2:
+ long = int
def test_check_space():
@@ -36,14 +46,16 @@ def test_check_space():
def test_check_tablespace():
+ outfile = utils.get_test_data('truncated.fq')
save_stderr, sys.stderr = sys.stderr, io.StringIO()
parser = khmer_args.build_counting_args()
args = parser.parse_args(['-M', '1e9'])
try:
- khmer.kfile.check_space_for_hashtable(args, 'countgraph', force=False,
- _testhook_free_space=0)
+ tablesize = khmer_args.calculate_tablesize(args, 'countgraph')
+ khmer.kfile.check_space_for_hashtable(outfile, tablesize,
+ False, _testhook_free_space=0)
assert 0, "this should fail"
except SystemExit as e:
print(str(e))
@@ -68,12 +80,15 @@ def test_check_space_force():
def test_check_tablespace_force():
save_stderr, sys.stderr = sys.stderr, io.StringIO()
+ outfile = utils.get_test_data('truncated')
+
parser = khmer_args.build_counting_args()
args = parser.parse_args(['-M', '1e9'])
try:
- khmer.kfile.check_space_for_hashtable(args, 'countgraph', True,
- _testhook_free_space=0)
+ tablesize = khmer_args.calculate_tablesize(args, 'countgraph')
+ khmer.kfile.check_space_for_hashtable(outfile, tablesize,
+ True, _testhook_free_space=0)
assert True, "this should pass"
except SystemExit as e:
print(str(e))
@@ -93,6 +108,18 @@ def test_invalid_file_warn():
sys.stderr = save_stderr
+def test_check_valid_stdin_nowarn():
+ save_stderr, sys.stderr = sys.stderr, io.StringIO()
+ try:
+ khmer.kfile.check_valid_file_exists(["-"])
+ err = sys.stderr.getvalue()
+ assert err.count("\n") == 0, err
+ except SystemExit as e:
+ print(str(e))
+ finally:
+ sys.stderr = save_stderr
+
+
FakeArgparseObject = collections.namedtuple('FakeArgs',
['ksize', 'n_tables',
'max_tablesize',
@@ -108,7 +135,8 @@ def test_create_countgraph_1():
args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
countgraph = khmer_args.create_countgraph(args)
- assert countgraph.hashsizes() == [2499997L, 2499989L, 2499983L, 2499967L]
+ expected_hashsz = utils.longify([2499997, 2499989, 2499983, 2499967])
+ assert countgraph.hashsizes() == expected_hashsz, countgraph.hashsizes()
assert sum(countgraph.hashsizes()) < max_mem, sum(countgraph.hashsizes())
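
utils.longify lives in tests/khmer_tst_utils.py, which this section of the
diff does not show; one plausible minimal shape, offered only as a reading aid
and not the actual implementation:

import sys

def longify(values):
    # hashsizes() yields longs on Python 2 but plain ints on Python 3,
    # so normalize the expected values to the running interpreter
    if sys.version_info[0] == 2:
        return [long(v) for v in values]  # noqa: F821 -- py2 only
    return list(values)
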
@@ -171,10 +199,11 @@ def test_create_nodegraph_1():
args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
nodegraph = khmer_args.create_nodegraph(args)
- assert nodegraph.hashsizes() == [19999999L, 19999981L,
- 19999963L, 19999927L]
+ expected_hashsz = utils.longify([19999999, 19999981, 19999963, 19999927])
+ assert nodegraph.hashsizes() == expected_hashsz, nodegraph.hashsizes()
- assert sum(nodegraph.hashsizes())/8.0 < max_mem, sum(nodegraph.hashsizes())
+ assert sum(nodegraph.hashsizes()) / \
+ 8.0 < max_mem, sum(nodegraph.hashsizes())
def test_create_nodegraph_2():
@@ -221,7 +250,7 @@ def test_create_nodegraph_4_multiplier():
args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
nodegraph = khmer_args.create_nodegraph(args, multiplier=2.0)
- assert sum(nodegraph.hashsizes())/8.0 < max_mem / 2.0, \
+ assert sum(nodegraph.hashsizes()) / 8.0 < max_mem / 2.0, \
sum(nodegraph.hashsizes())
@@ -236,9 +265,7 @@ def test_report_on_config_bad_hashtype():
try:
khmer_args.report_on_config(args, 'foograph')
assert 0, "the previous statement should raise an exception"
- except AssertionError:
- raise
- except Exception as err:
+ except ValueError as err:
assert "unknown graph type: foograph" in str(err), str(err)
@@ -253,9 +280,7 @@ def test_fail_calculate_foograph_size():
args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem)
try:
- nodegraph = khmer_args._calculate_tablesize(args, 'foograph')
+ nodegraph = khmer_args.calculate_tablesize(args, 'foograph')
assert 0, "previous statement should fail"
- except AssertionError:
- raise
- except Exception as err:
+ except ValueError as err:
assert "unknown graph type: foograph" in str(err), str(err)
diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index ffbbb81..ace3d89 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -1,6 +1,3 @@
-from __future__ import print_function
-from __future__ import absolute_import
-from __future__ import unicode_literals
#
# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2015. It is licensed under
@@ -8,6 +5,10 @@ from __future__ import unicode_literals
# Contact: khmer-project at idyll.org
#
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
# pylint: disable=C0111,C0103,E1103,W0612
import json
@@ -18,7 +19,6 @@ import shutil
from io import StringIO
import traceback
from nose.plugins.attrib import attr
-import subprocess
import threading
import bz2
import io
@@ -42,7 +42,7 @@ def test_check_space():
def test_load_into_counting():
script = 'load-into-counting.py'
- args = ['-x', '1e3', '-N', '2', '-k', '20', '-t']
+ args = ['-x', '1e3', '-N', '2', '-k', '20']
outfile = utils.get_temp_filename('out.ct')
infile = utils.get_test_data('test-abund-read-2.fa')
@@ -56,7 +56,7 @@ def test_load_into_counting():
def test_load_into_counting_tablesize_warning():
script = 'load-into-counting.py'
- args = ['-k', '20', '-t']
+ args = ['-k', '20']
outfile = utils.get_temp_filename('out.ct')
infile = utils.get_test_data('test-abund-read-2.fa')
@@ -70,7 +70,7 @@ def test_load_into_counting_tablesize_warning():
def test_load_into_counting_max_memory_usage_parameter():
script = 'load-into-counting.py'
- args = ['-M', '2e3', '-k', '20', '-t']
+ args = ['-M', '2e3', '-k', '20']
outfile = utils.get_temp_filename('out.ct')
infile = utils.get_test_data('test-abund-read-2.fa')
@@ -87,7 +87,7 @@ def test_load_into_counting_max_memory_usage_parameter():
def test_load_into_counting_abundance_dist_nobig():
script = 'load-into-counting.py'
- args = ['-x', '1e3', '-N', '2', '-k', '20', '-t', '-b']
+ args = ['-x', '1e3', '-N', '2', '-k', '20', '-b']
outfile = utils.get_temp_filename('out.ct')
infile = utils.get_test_data('test-abund-read-2.fa')
@@ -109,7 +109,7 @@ def test_load_into_counting_abundance_dist_nobig():
def test_load_into_counting_nonwritable():
script = 'load-into-counting.py'
- args = ['-x', '1e3', '-N', '2', '-k', '20', '-t']
+ args = ['-x', '1e3', '-N', '2', '-k', '20']
outfile = utils.get_temp_filename('test-nonwritable')
with open(outfile, 'w') as fout:
@@ -128,7 +128,7 @@ def test_load_into_counting_nonwritable():
@attr('huge')
def test_load_into_counting_toobig():
script = 'load-into-counting.py'
- args = ['-x', '1e12', '-N', '2', '-k', '20', '-t', '--force']
+ args = ['-x', '1e12', '-N', '2', '-k', '20', '--force']
outfile = utils.get_temp_filename('out.kh')
infile = utils.get_test_data('test-abund-read-2.fa')
@@ -157,7 +157,7 @@ def test_load_into_counting_fail():
def test_load_into_counting_multifile():
script = 'load-into-counting.py'
- args = ['-x', '1e7', '-N', '2', '-k', '20', '-t']
+ args = ['-x', '1e7', '-N', '2', '-k', '20']
outfile = utils.get_temp_filename('out.kh')
infile = utils.get_test_data('test-abund-read-2.fa')
@@ -172,7 +172,7 @@ def test_load_into_counting_multifile():
def test_load_into_counting_tsv():
script = 'load-into-counting.py'
- args = ['-x', '1e7', '-N', '2', '-k', '20', '-t', '-s', 'tsv']
+ args = ['-x', '1e7', '-N', '2', '-k', '20', '-s', 'tsv']
outfile = utils.get_temp_filename('out.ct')
tabfile = outfile + '.info.tsv'
@@ -195,7 +195,7 @@ def test_load_into_counting_tsv():
def test_load_into_counting_json():
script = 'load-into-counting.py'
- args = ['-x', '1e7', '-N', '2', '-k', '20', '-t', '-s', 'json']
+ args = ['-x', '1e7', '-N', '2', '-k', '20', '-s', 'json']
outfile = utils.get_temp_filename('out.ct')
jsonfile = outfile + '.info.json'
@@ -313,6 +313,21 @@ def test_filter_abund_2():
assert len(seqs) == 2, seqs
assert 'GGTTGACGGGGCTCAGGG' in seqs
+
+def test_filter_abund_2_stdin():
+ infile = utils.get_temp_filename('test.fa')
+ in_dir = os.path.dirname(infile)
+
+ shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
+ counting_ht = _make_counting(infile, K=17)
+
+ script = 'filter-abund.py'
+ args = ['-C', '1', counting_ht, '-']
+ (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
+ assert status == 1
+ assert "Accepting input from stdin; output filename must be provided" \
+ in str(err)
+
# make sure that FASTQ records are retained.
@@ -358,7 +373,7 @@ def test_filter_abund_4_fq_casava_18():
outfile = infile + '.abundfilt'
assert os.path.exists(outfile), outfile
- seqs = set([r.name for r in screed.open(outfile, parse_description=False)])
+ seqs = set([r.name for r in screed.open(outfile)])
assert 'pair:foo 1::N' in seqs, seqs
@@ -369,7 +384,7 @@ def test_filter_abund_1_singlefile():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
script = 'filter-abund-single.py'
- args = ['-x', '1e7', '-N', '2', '-k', '17', '-t', infile]
+ args = ['-x', '1e7', '-N', '2', '-k', '17', infile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert 'Total number of unique k-mers: 98' in err, err
@@ -390,7 +405,7 @@ def test_filter_abund_2_singlefile():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
script = 'filter-abund-single.py'
- args = ['-x', '1e7', '-N', '2', '-k', '17', '-t', '--savetable',
+ args = ['-x', '1e7', '-N', '2', '-k', '17', '--savetable',
tabfile, infile]
(status, out, err) = utils.runscript(script, args, in_dir)
@@ -418,7 +433,7 @@ def test_filter_abund_2_singlefile_fq_casava_18():
outfile = infile + '.abundfilt'
assert os.path.exists(outfile), outfile
- seqs = set([r.name for r in screed.open(outfile, parse_description=False)])
+ seqs = set([r.name for r in screed.open(outfile)])
assert 'pair:foo 1::N' in seqs, seqs
@@ -508,7 +523,7 @@ def test_filter_abund_7_retain_Ns():
assert os.path.exists(outfile), outfile
# test for a sequence with an 'N' in it --
- names = set([r.name for r in screed.open(outfile, parse_description=0)])
+ names = set([r.name for r in screed.open(outfile)])
assert '895:1:37:17593:9954 1::FOO_withN' in names, names
# check to see if that 'N' was properly changed to an 'A'
@@ -541,7 +556,7 @@ def test_filter_abund_single_8_retain_Ns():
assert os.path.exists(outfile), outfile
# test for a sequence with an 'N' in it --
- names = set([r.name for r in screed.open(outfile, parse_description=0)])
+ names = set([r.name for r in screed.open(outfile)])
assert '895:1:37:17593:9954 1::FOO_withN' in names, names
# check to see if that 'N' was properly changed to an 'A'
@@ -618,7 +633,7 @@ def test_filter_stoptags_fq():
assert 'GGTTGACGGGGCTCAGGG' in seqs, seqs
# make sure that record names are carried through unparsed
- names = [r.name for r in screed.open(outfile, parse_description=False)]
+ names = [r.name for r in screed.open(outfile)]
names = set(names)
assert 'seq 1::BAR' in names
@@ -636,15 +651,15 @@ def test_count_median():
assert os.path.exists(outfile), outfile
- data = [x.strip() for x in open(outfile)]
+ data = [x.strip() for x in open(outfile).readlines()[1:]]
data = set(data)
assert len(data) == 2, data
- assert 'seq 1001 1001.0 0.0 18' in data
- assert '895:1:37:17593:9954/1 1 103.803741455 303.702941895 114' in data
+ assert 'seq,1001,1001.0,0.0,18' in data, data
+ assert '895:1:37:17593:9954/1,1,103.803741455,303.702941895,114' in data
-def test_count_median_fq():
- infile = utils.get_temp_filename('test.fa')
+def test_count_median_fq_csv():
+ infile = utils.get_temp_filename('test.fq')
outfile = infile + '.counts'
shutil.copyfile(utils.get_test_data('test-abund-read-2.fq'), infile)
@@ -658,33 +673,28 @@ def test_count_median_fq():
data = [x.strip() for x in open(outfile)]
data = set(data)
- assert len(data) == 2, data
- assert 'seq 1001 1001.0 0.0 18' in data
- assert '895:1:37:17593:9954 1 103.803741455 303.702941895 114' in data
+ assert len(data) == 4, data
+ assert 'name,median,average,stddev,seqlen' in data
+ assert 'seq,1001,1001.0,0.0,18' in data
+ # verify that sequence names remain unparsed
+ names = set([line.split(',')[0] for line in data])
+ assert '895:1:37:17593:9954 1::FOO' in names, names
-def test_count_median_fq_csv():
- infile = utils.get_temp_filename('test.fa')
- outfile = infile + '.counts'
+
+def test_count_median_fq_csv_stdout():
+ infile = utils.get_temp_filename('test.fq')
+ outfile = '-'
shutil.copyfile(utils.get_test_data('test-abund-read-2.fq'), infile)
counting_ht = _make_counting(infile, K=8)
script = 'count-median.py'
- args = ['--csv', counting_ht, infile, outfile]
- utils.runscript(script, args)
-
- assert os.path.exists(outfile), outfile
-
- data = [x.strip() for x in open(outfile)]
- data = set(data)
- assert len(data) == 4, data
- assert 'name,median,average,stddev,seqlen' in data
- assert 'seq,1001,1001.0,0.0,18' in data
+ args = [counting_ht, infile, outfile]
+ (status, out, err) = utils.runscript(script, args)
- # verify that sequence names remain unparsed with '--csv'
- names = set([line.split(',')[0] for line in data])
- assert '895:1:37:17593:9954 1::FOO' in names, names
+ assert 'name,median,average,stddev,seqlen' in out
+ assert 'seq,1001,1001.0,0.0,18' in out
def test_load_graph():
@@ -708,7 +718,7 @@ def test_load_graph():
try:
ht = khmer.load_hashbits(ht_file)
- except IOError as err:
+ except OSError as err:
assert 0, str(err)
ht.load_tagset(tagset_file)
@@ -750,6 +760,13 @@ def test_oxli_build_graph():
assert x == (1, 0), x
+def test_oxli_nocommand():
+ script = 'oxli'
+
+ (status, out, err) = utils.runscript(script, [])
+ assert status == 0
+
+
def test_load_graph_no_tags():
script = 'load-graph.py'
args = ['-x', '1e7', '-N', '2', '-k', '20', '-n']
@@ -909,7 +926,7 @@ def test_load_graph_max_memory_usage_parameter():
try:
ht = khmer.load_hashbits(ht_file)
- except IOError as err:
+ except OSError as err:
assert 0, str(err)
assert (sum(ht.hashsizes()) / 8.) < 2e7, ht.hashsizes()
@@ -1235,7 +1252,7 @@ def test_extract_partitions_header_whitespace():
assert dist.strip() == '1 11960 11960 11960', dist.strip()
parts = [r.name.split('\t')[1]
- for r in screed.open(partfile, parse_description=False)]
+ for r in screed.open(partfile)]
assert len(parts) == 13538, len(parts)
parts = set(parts)
assert len(parts) == 12602, len(parts)
@@ -1264,12 +1281,12 @@ def test_extract_partitions_fq():
dist = open(distfile).readline()
assert dist.strip() == '99 1 1 99'
- screed_iter = screed.open(partfile, parse_description=False)
+ screed_iter = screed.open(partfile)
names = [r.name.split('\t')[0] for r in screed_iter]
assert '35 1::FOO' in names
assert '46 1::FIZ' in names
- screed_iter = screed.open(partfile, parse_description=False)
+ screed_iter = screed.open(partfile)
parts = [r.name.split('\t')[1] for r in screed_iter]
assert len(parts) == 99, len(parts)
@@ -1326,7 +1343,7 @@ def test_extract_partitions_no_output_groups():
args = ['-n', 'extracted', partfile]
# We expect a sys.exit -> we need the test to be tolerant
- _, out, err = utils.runscript(script, args, in_dir, fail_ok=True)
+ status, out, err = utils.runscript(script, args, in_dir)
assert "NOT outputting groups! Beware!" in err
# Group files are created after output_groups is
# checked. They should not exist in this scenario
@@ -1391,8 +1408,9 @@ def test_extract_partitions_no_groups():
script = 'extract-partitions.py'
args = ['extracted', empty_file]
- _, _, err = utils.runscript(script, args, in_dir, fail_ok=True)
+ status, _, err = utils.runscript(script, args, in_dir, fail_ok=True)
assert "ERROR: Input file", "is empty; Exiting." in err
+ assert status != 0
# No group files should be created
groupfile = os.path.join(in_dir, 'extracted.group0000.fa')
@@ -1414,16 +1432,6 @@ def test_abundance_dist():
with open(outfile) as fp:
line = fp.readline().strip()
- assert line == '1 96 96 0.98', line
- line = fp.readline().strip()
- assert line == '1001 2 98 1.0', line
-
- os.remove(outfile)
- args = ['-z', '--csv', htfile, infile, outfile]
- utils.runscript(script, args, in_dir)
-
- with open(outfile) as fp:
- line = fp.readline().strip()
assert (line == 'abundance,count,cumulative,cumulative_fraction'), line
line = fp.readline().strip()
assert line == '1,96,96,0.98', line
@@ -1431,9 +1439,8 @@ def test_abundance_dist():
assert line == '1001,2,98,1.0', line
-def test_abundance_dist_nobigcount():
+def test_abundance_dist_stdout():
infile = utils.get_temp_filename('test.fa')
- outfile = utils.get_temp_filename('test.dist')
in_dir = os.path.dirname(infile)
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
@@ -1441,35 +1448,32 @@ def test_abundance_dist_nobigcount():
htfile = _make_counting(infile, K=17)
script = 'abundance-dist.py'
- args = ['-b', '-z', htfile, infile, outfile]
- utils.runscript(script, args, in_dir)
+ args = ['-z', htfile, infile, "-"]
+ (status, out, err) = utils.runscript(script, args, in_dir)
- with open(outfile) as fp:
- line = fp.readline().strip()
- assert line == '1 96 96 0.98', line
- line = fp.readline().strip()
- assert line == '255 2 98 1.0', line
+ assert '1,96,96,0.98' in out, out
+ assert '1001,2,98,1.0' in out, out
-def test_abundance_dist_single():
+def test_abundance_dist_nobigcount():
infile = utils.get_temp_filename('test.fa')
outfile = utils.get_temp_filename('test.dist')
in_dir = os.path.dirname(infile)
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
- script = 'abundance-dist-single.py'
- args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-t', infile,
- outfile]
- (status, out, err) = utils.runscript(script, args, in_dir)
+ htfile = _make_counting(infile, K=17)
- assert 'Total number of unique k-mers: 98' in err, err
+ script = 'abundance-dist.py'
+ args = ['-b', '-z', htfile, infile, outfile]
+ utils.runscript(script, args, in_dir)
with open(outfile) as fp:
+ line = fp.readline().strip() # skip header
line = fp.readline().strip()
- assert line == '1 96 96 0.98', line
+ assert line == '1,96,96,0.98', line
line = fp.readline().strip()
- assert line == '1001 2 98 1.0', line
+ assert line == '255,2,98,1.0', line
def test_abundance_dist_threaded():
@@ -1480,17 +1484,18 @@ def test_abundance_dist_threaded():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
script = 'abundance-dist-single.py'
- args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-t', '--threads', '18',
+ args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '--threads', '18',
infile, outfile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert 'Total number of unique k-mers: 98' in err, err
with open(outfile) as fp:
+ line = fp.readline().strip() # skip header
line = fp.readline().strip()
- assert line == '1 96 96 0.98', line
+ assert line == '1,96,96,0.98', line
line = fp.readline().strip()
- assert line == '1001 2 98 1.0', line
+ assert line == '1001,2,98,1.0', line
def test_abundance_dist_single_csv():
@@ -1501,7 +1506,7 @@ def test_abundance_dist_single_csv():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
script = 'abundance-dist-single.py'
- args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '--csv', infile,
+ args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', infile,
outfile]
(status, out, err) = utils.runscript(script, args, in_dir)
@@ -1526,10 +1531,11 @@ def test_abundance_dist_single_nobigcount():
utils.runscript(script, args, in_dir)
with open(outfile) as fp:
+ line = fp.readline().strip() # skip header
line = fp.readline().strip()
- assert line == '1 96 96 0.98', line
+ assert line == '1,96,96,0.98', line
line = fp.readline().strip()
- assert line == '255 2 98 1.0', line
+ assert line == '255,2,98,1.0', line
def test_abundance_dist_single_nosquash():
@@ -1540,14 +1546,15 @@ def test_abundance_dist_single_nosquash():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
script = 'abundance-dist-single.py'
- args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-t', infile, outfile]
+ args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', infile, outfile]
utils.runscript(script, args, in_dir)
with open(outfile) as fp:
+ line = fp.readline().strip() # skip header
line = fp.readline().strip()
- assert line == '1 96 96 0.98', line
+ assert line == '1,96,96,0.98', line
line = fp.readline().strip()
- assert line == '1001 2 98 1.0', line
+ assert line == '1001,2,98,1.0', line
def test_abundance_dist_single_savetable():
@@ -1559,15 +1566,16 @@ def test_abundance_dist_single_savetable():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
script = 'abundance-dist-single.py'
- args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-t', '--savetable',
+ args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '--savetable',
tabfile, infile, outfile]
utils.runscript(script, args, in_dir)
with open(outfile) as fp:
+ line = fp.readline().strip() # skip header
line = fp.readline().strip()
- assert line == '1 96 96 0.98', line
+ assert line == '1,96,96,0.98', line
line = fp.readline().strip()
- assert line == '1001 2 98 1.0', line
+ assert line == '1001,2,98,1.0', line
def test_do_partition():
@@ -1620,7 +1628,7 @@ def test_do_partition_2_fq():
partfile = os.path.join(in_dir, 'random-20-a.fq.part')
- screed_iter = screed.open(partfile, parse_description=False)
+ screed_iter = screed.open(partfile)
names = [r.name.split('\t')[0] for r in screed_iter]
assert '35 1::FOO' in names
assert '46 1::FIZ' in names
@@ -1720,19 +1728,20 @@ def test_interleave_reads_broken_fq_3():
assert "ERROR: This doesn't look like paired data!" in err
-def test_interleave_reads_broken_fq_4():
+def test_interleave_reads_broken_fq_5():
# test input files
- infile1 = utils.get_test_data('paired-mixed-broken.fq')
+ infile1 = utils.get_test_data('paired-broken4.fq.1')
+ infile2 = utils.get_test_data('paired-broken4.fq.2')
# actual output file
outfile = utils.get_temp_filename('out.fq')
script = 'interleave-reads.py'
- args = [infile1, '-o', outfile]
+ args = [infile1, infile2, '-o', outfile]
status, out, err = utils.runscript(script, args, fail_ok=True)
assert status == 1
- assert "ERROR: given only one filename, that doesn't contain _R1_" in err
+ assert "ERROR: This doesn't look like paired data!" in err
def test_interleave_reads_2_fa():
@@ -1847,8 +1856,8 @@ def test_extract_paired_reads_2_fq():
assert os.path.exists(outfile2), outfile2
n = 0
- for r, q in zip(screed.open(ex_outfile1, parse_description=False),
- screed.open(outfile1, parse_description=False)):
+ for r, q in zip(screed.open(ex_outfile1),
+ screed.open(outfile1)):
n += 1
assert r.name == q.name, (r.name, q.name, n)
assert r.sequence == q.sequence
@@ -1856,8 +1865,8 @@ def test_extract_paired_reads_2_fq():
assert n > 0
n = 0
- for r, q in zip(screed.open(ex_outfile2, parse_description=False),
- screed.open(outfile2, parse_description=False)):
+ for r, q in zip(screed.open(ex_outfile2),
+ screed.open(outfile2)):
n += 1
assert r.name == q.name
assert r.sequence == q.sequence
@@ -2090,6 +2099,15 @@ def test_split_paired_reads_2_mixed_fq_require_pair():
assert "is not part of a pair" in err
+def test_split_paired_reads_2_stdin_no_out():
+ script = 'split-paired-reads.py'
+ args = ['-']
+
+ status, out, err = utils.runscript(script, args, fail_ok=True)
+ assert status == 1
+ assert "Accepting input from stdin; output filenames must " in err
+
+
def test_split_paired_reads_2_mixed_fq():
# test input file
infile = utils.get_temp_filename('test.fq')
@@ -2392,12 +2410,20 @@ def test_sample_reads_randomly_fq():
'850:2:1:2562:1308/1',
'850:2:1:3123:15968/2'}
- seqs = set([r.name for r in screed.open(outfile,
- parse_description=False)])
+ seqs = set([r.name for r in screed.open(outfile)])
print(list(sorted(seqs)))
assert seqs == answer
+def test_sample_reads_randomly_stdin_no_out():
+ script = 'sample-reads-randomly.py'
+ args = ['-']
+
+ (status, out, err) = utils.runscript(script, args, fail_ok=True)
+ assert status != 0
+ assert "Accepting input from stdin; output filename" in err, err
+
+
def test_fastq_to_fasta():
script = 'fastq-to-fasta.py'
@@ -2418,8 +2444,7 @@ def test_fastq_to_fasta():
assert len(out.splitlines()) == 2, len(out.splitlines())
assert "No lines dropped" in err
- names = [r.name for r in screed.open(clean_outfile,
- parse_description=False)]
+ names = [r.name for r in screed.open(clean_outfile)]
assert '895:1:1:1246:14654 1:N:0:NNNNN' in names, names
args = [n_infile, '-n', '-o', n_outfile]
@@ -2465,7 +2490,7 @@ def test_extract_long_sequences_fa():
countlines = sum(1 for line in open(fa_outfile))
assert countlines == 22, countlines
- names = [r.name for r in screed.open(fa_outfile, parse_description=False)]
+ names = [r.name for r in screed.open(fa_outfile)]
assert "895:1:37:17593:9954/1" in names
assert "895:1:37:17593:9954/2" in names
@@ -2487,7 +2512,7 @@ def test_extract_long_sequences_fq():
countlines = sum(1 for line in open(fq_outfile))
assert countlines == 44, countlines
- names = [r.name for r in screed.open(fq_outfile, parse_description=False)]
+ names = [r.name for r in screed.open(fq_outfile)]
assert "895:1:37:17593:9954 1::foo" in names
assert "895:1:37:17593:9954 2::foo" in names
@@ -2585,37 +2610,8 @@ def test_count_overlap_invalid_datafile():
args = ['--ksize', '20', '--n_tables', '2', '--max-tablesize', '10000000',
htfile + '.pt', htfile + '.pt', outfile]
(status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)
- if sys.version_info.major == 2:
- assert "IOError" in err
- else:
- assert "OSError" in err
-
-
-def test_count_overlap():
- seqfile1 = utils.get_temp_filename('test-overlap1.fa')
- in_dir = os.path.dirname(seqfile1)
- seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
- outfile = utils.get_temp_filename('overlap.out', in_dir)
- curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
- shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
- shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
- htfile = _make_graph(seqfile1, ksize=20)
- script = 'count-overlap.py'
- args = ['--ksize', '20', '--n_tables', '2', '--max-tablesize', '10000000',
- htfile + '.pt', seqfile2, outfile]
- (status, out, err) = utils.runscript(script, args, in_dir)
- assert status == 0
- assert os.path.exists(outfile), outfile
- data = [x.strip() for x in open(outfile)]
- data = set(data)
- assert '# of unique k-mers in dataset2: 759020' in data, data
- assert '# of overlap unique k-mers: 245547' in data
- assert os.path.exists(curvefile), curvefile
- data = [x.strip() for x in open(curvefile)]
- data = set(data)
- assert '178630 1134' in data, data
- assert '496280 2904' in data
- assert '752031 238558' in data
+ assert status != 0
+ assert "OSError" in err
def test_count_overlap_csv():
@@ -2629,7 +2625,7 @@ def test_count_overlap_csv():
htfile = _make_graph(seqfile1, ksize=20)
script = 'count-overlap.py'
args = ['--ksize', '20', '--n_tables', '2', '--max-tablesize',
- '10000000', '--csv', htfile + '.pt', seqfile2, outfile]
+ '10000000', htfile + '.pt', seqfile2, outfile]
(status, out, err) = utils.runscript(script, args, in_dir)
assert status == 0
assert os.path.exists(outfile), outfile
@@ -2680,29 +2676,30 @@ def execute_streaming_diginorm(ifilename):
return in_dir + '/outfile'
-def execute_load_graph_streaming(filename):
+def _execute_load_graph_streaming(filename):
'''Helper function for the matrix of streaming tests using screed via
filter-abund-single, i.e. uncompressed fasta, gzip fasta, bz2 fasta,
uncompressed fastq, etc.
This is not directly executed but is run by the tests themselves
'''
- script = 'load-graph.py'
- args = '-x 1e7 -N 2 -k 20 out -'
-
+ scripts = utils.scriptpath()
infile = utils.get_temp_filename('temp')
in_dir = os.path.dirname(infile)
shutil.copyfile(utils.get_test_data(filename), infile)
- (status, out, err) = utils.runscriptredirect(script, args, infile, in_dir)
+
+ args = '-x 1e7 -N 2 -k 20 out -'
+
+ cmd = 'cat {infile} | {scripts}/load-graph.py {args}'.format(
+ infile=infile, scripts=scripts, args=args)
+
+ (status, out, err) = utils.run_shell_cmd(cmd, in_directory=in_dir)
if status != 0:
- for line in out:
- print(out)
- for line in err:
- print(err)
+ print(out)
+ print(err)
assert status == 0, status
- err.seek(0)
- err = err.read()
+
assert 'Total number of unique k-mers: 3960' in err, err
ht_file = os.path.join(in_dir, 'out.pt')
@@ -2778,34 +2775,34 @@ def test_screed_streaming_gzipfa():
def test_read_parser_streaming_ufa():
# uncompressed FASTA
- execute_load_graph_streaming(utils.get_test_data('random-20-a.fa'))
+ _execute_load_graph_streaming(utils.get_test_data('random-20-a.fa'))
def test_read_parser_streaming_ufq():
# uncompressed FASTQ
- execute_load_graph_streaming(utils.get_test_data('random-20-a.fq'))
+ _execute_load_graph_streaming(utils.get_test_data('random-20-a.fq'))
@attr('known_failing')
def test_read_parser_streaming_bzfq():
# bzip compressed FASTQ
- execute_load_graph_streaming(utils.get_test_data('random-20-a.fq.bz2'))
+ _execute_load_graph_streaming(utils.get_test_data('random-20-a.fq.bz2'))
def test_read_parser_streaming_gzfq():
# gzip compressed FASTQ
- execute_load_graph_streaming(utils.get_test_data('random-20-a.fq.gz'))
+ _execute_load_graph_streaming(utils.get_test_data('random-20-a.fq.gz'))
@attr('known_failing')
def test_read_parser_streaming_bzfa():
# bzip compressed FASTA
- execute_load_graph_streaming(utils.get_test_data('random-20-a.fa.bz2'))
+ _execute_load_graph_streaming(utils.get_test_data('random-20-a.fa.bz2'))
def test_read_parser_streaming_gzfa():
# gzip compressed FASTA
- execute_load_graph_streaming(utils.get_test_data('random-20-a.fa.gz'))
+ _execute_load_graph_streaming(utils.get_test_data('random-20-a.fa.gz'))
def test_readstats():
@@ -2891,12 +2888,20 @@ def test_trim_low_abund_1_duplicate_filename_err():
shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
args = ["-k", "17", "-x", "1e7", "-N", "2", '-C', '1', infile, infile]
- try:
- utils.runscript('trim-low-abund.py', args, in_dir)
- raise Exception("should not reach this")
- except AssertionError:
- # an error should be raised by passing 'infile' twice.
- pass
+ (status, out, err) = utils.runscript('trim-low-abund.py', args, in_dir,
+ fail_ok=True)
+ assert status == 1
+ assert "Error: Cannot input the same filename multiple times." in str(err)
+
+
+def test_trim_low_abund_1_stdin_err():
+ args = ["-"]
+
+ (status, out, err) = utils.runscript('trim-low-abund.py', args,
+ fail_ok=True)
+ assert status == 1
+ assert "Accepting input from stdin; output filename must be provided" \
+ in str(err)
def test_trim_low_abund_2():
@@ -3037,7 +3042,7 @@ def test_trim_low_abund_keep_paired_casava18():
outfile = infile + '.abundtrim'
assert os.path.exists(outfile), outfile
- seqs = [r.name for r in screed.open(outfile, parse_description=False)]
+ seqs = [r.name for r in screed.open(outfile)]
assert seqs[-2:] == ['pair:foo 1::N', 'pair:foo 2::N'], seqs
@@ -3233,3 +3238,74 @@ def test_roundtrip_commented_format():
r = open(infile).read()
r2 = open(outfile).read()
assert r == r2, (r, r2)
+
+
+def test_unique_kmers_defaults():
+ infile = utils.get_temp_filename('random-20-a.fa')
+ shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
+
+ args = ['-k', '20', '-e', '0.01', infile]
+
+ _, out, err = utils.runscript('unique-kmers.py', args,
+ os.path.dirname(infile))
+
+ err = err.splitlines()
+ assert ('Estimated number of unique 20-mers in {0}: 3950'.format(infile)
+ in err)
+ assert 'Total estimated number of unique 20-mers: 3950' in err
+
+
+def test_unique_kmers_report_fp():
+ infile = utils.get_temp_filename('random-20-a.fa')
+ shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
+ outfile = utils.get_temp_filename('report.unique')
+
+ args = ['-k', '20', '-e', '0.01', '-R', outfile, infile]
+
+ _, out, err = utils.runscript('unique-kmers.py', args,
+ os.path.dirname(infile))
+
+ err = err.splitlines()
+ assert ('Estimated number of unique 20-mers in {0}: 3950'.format(infile)
+ in err)
+ assert 'Total estimated number of unique 20-mers: 3950' in err
+
+ with open(outfile, 'r') as report_fp:
+ outf = report_fp.read().splitlines()
+ assert '3950 20 (total)' in outf
+ assert '3950 20 total' in outf
+
+
+def test_unique_kmers_diagnostics():
+ infile = utils.get_temp_filename('random-20-a.fa')
+ shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
+
+ args = ['-k', '20', '-e', '0.01', '--diagnostics', infile]
+
+ _, out, err = utils.runscript('unique-kmers.py', args,
+ os.path.dirname(infile))
+
+ out = out.splitlines()
+ assert ('expected_fp\tnumber_hashtable(Z)\t'
+ 'size_hashtable(H)\texpected_memory_usage' in err)
+
+
+def test_unique_kmers_multiple_inputs():
+ infiles = []
+ for fname in ('random-20-a.fa', 'paired-mixed.fa'):
+ infile = utils.get_temp_filename(fname)
+ shutil.copyfile(utils.get_test_data(fname), infile)
+ infiles.append(infile)
+
+ args = ['-k', '20', '-e', '0.01']
+ args += infiles
+
+ _, out, err = utils.runscript('unique-kmers.py', args,
+ os.path.dirname(infile))
+
+ err = err.splitlines()
+ assert ('Estimated number of unique 20-mers in {0}: 3950'
+ .format(infiles[0]) in err)
+ assert ('Estimated number of unique 20-mers in {0}: 232'.format(infiles[1])
+ in err)
+ assert 'Total estimated number of unique 20-mers: 4170' in err
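
These estimates come from khmer's HyperLogLog counter, which backs
unique-kmers.py. A stand-alone sketch of the same estimate, assuming the
HLLCounter API as khmer 2.0 exposes it (error rate first, then k-mer size);
the ~3950 figure is taken from the assertions above:

import khmer
import screed

hll = khmer.HLLCounter(0.01, 20)  # error rate, ksize
for record in screed.open('random-20-a.fa'):
    hll.consume_string(record.sequence)

# prints roughly 3950 for the random-20-a.fa data set
print(hll.estimate_cardinality())
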
diff --git a/tests/test_streaming_io.py b/tests/test_streaming_io.py
new file mode 100644
index 0000000..6ba7ef9
--- /dev/null
+++ b/tests/test_streaming_io.py
@@ -0,0 +1,451 @@
+#
+# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2015. It is licensed under
+# the three-clause BSD license; see LICENSE.
+# Contact: khmer-project at idyll.org
+#
+
+# important note -- these tests do not contribute to code coverage, because
+# of the use of subprocess to execute. Most script tests should go into
+# test_scripts.py for this reason.
+
+from __future__ import print_function
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
+import khmer
+import screed
+from . import khmer_tst_utils as utils
+from .khmer_tst_utils import scriptpath, run_shell_cmd
+from .test_scripts import _make_counting
+import os.path
+import difflib
+
+
+def files_are_equal(a, b):
+ al = open(a).readlines()
+ bl = open(b).readlines()
+
+ return al == bl
+
+
+def diff_files(a, b):
+ al = open(a).readlines()
+ bl = open(b).readlines()
+
+ results = "\n".join(difflib.context_diff(al, bl, fromfile=a, tofile=b))
+ return results
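
The tests below all shell out through run_shell_cmd, imported above from
khmer_tst_utils; its definition is not part of this diff. A plausible minimal
shape, offered as a reading aid rather than the actual helper:

import subprocess

def run_shell_cmd(cmd, fail_ok=False, in_directory=None):
    # run the pipeline through a shell, optionally in a working directory
    proc = subprocess.Popen(cmd, shell=True, cwd=in_directory,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    out, err = proc.communicate()
    out, err = out.decode('utf-8'), err.decode('utf-8')
    if proc.returncode != 0 and not fail_ok:
        raise AssertionError('%r failed:\n%s' % (cmd, err))
    return proc.returncode, out, err
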
+
+
+def test_interleave_split_1():
+ in1 = utils.get_test_data('paired.fq.1')
+ in2 = utils.get_test_data('paired.fq.2')
+
+ out1 = utils.get_temp_filename('a.fa')
+ out2 = utils.get_temp_filename('b.fa')
+
+ cmd = """
+ {scripts}/interleave-reads.py {in1} {in2} -o - |
+ {scripts}/split-paired-reads.py -1 {out1} -2 {out2} -
+ """
+
+ cmd = cmd.format(scripts=scriptpath(),
+ in1=in1, in2=in2,
+ out1=out1, out2=out2)
+
+ run_shell_cmd(cmd)
+
+ assert files_are_equal(in1, out1), diff_files(in1, out1)
+ assert files_are_equal(in2, out2), diff_files(in2, out2)
+
+
+def test_interleave_split_2_fail():
+ in1 = utils.get_test_data('paired.fq.1')
+ in2 = utils.get_test_data('paired.fq.2')
+
+ out1 = utils.get_temp_filename('a.fa')
+ out2 = utils.get_temp_filename('b.fa')
+
+ cmd = """
+ {scripts}/interleave-reads.py {in1} {in2} -o - |
+ {scripts}/split-paired-reads.py -
+ """
+
+ cmd = cmd.format(scripts=scriptpath(),
+ in1=in1, in2=in2,
+ out1=out1, out2=out2)
+
+ (status, out, err) = run_shell_cmd(cmd, fail_ok=True)
+ assert status != 0
+ assert "Accepting input from stdin; output filenames must be provided." \
+ in err, err
+
+
+def test_extract_paired_pe():
+ in1 = utils.get_test_data('paired-mixed.fq')
+ out_test = utils.get_test_data('paired-mixed.fq.pe')
+ out1 = utils.get_temp_filename('a.fq')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/extract-paired-reads.py - -p - -s /dev/null > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ run_shell_cmd(cmd)
+
+ assert files_are_equal(out1, out_test), diff_files(out1, out_test)
+
+
+def test_extract_paired_se():
+ in1 = utils.get_test_data('paired-mixed.fq')
+ out_test = utils.get_test_data('paired-mixed.fq.se')
+ out1 = utils.get_temp_filename('a.fq')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/extract-paired-reads.py - -p /dev/null -s - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ run_shell_cmd(cmd)
+
+ assert files_are_equal(out1, out_test), diff_files(out1, out_test)
+
+
+def test_extract_paired_se_fail():
+ in1 = utils.get_test_data('paired-mixed.fq')
+ out_test = utils.get_test_data('paired-mixed.fq.se')
+ out1 = utils.get_temp_filename('a.fq')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/extract-paired-reads.py -p /dev/null - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ (status, out, err) = run_shell_cmd(cmd, fail_ok=True)
+ assert status != 0
+ assert "Accepting input from stdin; output filenames must be provided." \
+ in err, err
+
+
+def test_norm_by_median_1():
+ in1 = utils.get_test_data('paired-mixed.fq')
+ out_test = utils.get_test_data('paired-mixed.fq.pe')
+ out1 = utils.get_temp_filename('a.fq')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/extract-paired-reads.py - -p - -s /dev/null |
+ {scripts}/normalize-by-median.py - -o - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ run_shell_cmd(cmd)
+
+ assert files_are_equal(out1, out_test), diff_files(out1, out_test)
+
+
+def test_norm_by_median_2_fail():
+ in1 = utils.get_test_data('paired-mixed.fq')
+ out_test = utils.get_test_data('paired-mixed.fq.pe')
+ out1 = utils.get_temp_filename('a.fq')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/extract-paired-reads.py - -p - -s /dev/null |
+ {scripts}/normalize-by-median.py -p - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ (status, out, err) = run_shell_cmd(cmd, fail_ok=True)
+ assert status != 0
+ assert "Accepting input from stdin; output filename must be provided with"\
+ in err, err
+
+
+def test_sample_reads_randomly_1():
+ in1 = utils.get_test_data('paired-mixed.fq')
+ out1 = utils.get_temp_filename('a.fq')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/sample-reads-randomly.py - -o - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ run_shell_cmd(cmd)
+
+ assert files_are_equal(in1, out1), diff_files(in1, out1)
+
+
+def test_sample_reads_randomly_2_fail():
+ in1 = utils.get_test_data('paired-mixed.fq')
+ out1 = utils.get_temp_filename('a.fq')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/sample-reads-randomly.py - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ (status, out, err) = run_shell_cmd(cmd, fail_ok=True)
+ assert status != 0
+ assert "Accepting input from stdin; output filename must be provided with"\
+ in err, err
+
+
+def test_extract_long_sequences_1():
+ in1 = utils.get_test_data('paired-mixed.fa')
+ out1 = utils.get_temp_filename('a.fa')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/extract-long-sequences.py - -l 10 > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ run_shell_cmd(cmd)
+
+ countlines = sum(1 for line in open(out1))
+ assert countlines == 22, countlines
+
+
+def test_fastq_to_fasta_1():
+ in1 = utils.get_test_data('test-fastq-reads.fq')
+ out1 = utils.get_temp_filename('clean.fa')
+ out_test = utils.get_test_data('test-fastq-reads.fa')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/fastq-to-fasta.py - -o - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ run_shell_cmd(cmd)
+ assert files_are_equal(out1, out_test), diff_files(out1, out_test)
+
+
+def test_load_into_counting_1():
+ in1 = utils.get_test_data('test-abund-read-2.fa')
+ out1 = utils.get_temp_filename('out.ct')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/load-into-counting.py -x 1e3 -N 2 -k 20 {out1} - \
+ 2> /dev/null
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+ print(cmd)
+
+ (status, out, err) = run_shell_cmd(cmd)
+ assert os.path.exists(out1)
+ khmer.load_counting_hash(out1)
+
+
+def test_load_graph_1():
+ in1 = utils.get_test_data('test-abund-read-2.fa')
+ out1 = utils.get_temp_filename('out.ct')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/load-graph.py -x 1e3 -N 2 -k 20 {out1} - \
+ 2> /dev/null
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+ print(cmd)
+
+ (status, out, err) = run_shell_cmd(cmd)
+ assert os.path.exists(out1 + '.pt')
+ khmer.load_hashbits(out1 + '.pt')
+
+
+def test_filter_abund_1():
+ in1 = utils.get_test_data('test-abund-read-2.fa')
+ out1 = utils.get_temp_filename('out.abundfilt')
+
+ countgraph = _make_counting(in1, K=17)
+
+ cmd = """
+ cat {in1} |
+ {scripts}/filter-abund.py {countgraph} - -o - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1,
+ countgraph=countgraph)
+
+ run_shell_cmd(cmd)
+
+ assert os.path.exists(out1)
+ seqs = set([r.sequence for r in screed.open(out1)])
+
+ assert len(seqs) == 1, seqs
+ assert 'GGTTGACGGGGCTCAGGG' in seqs
+
+
+def test_filter_abund_2_fail():
+ in1 = utils.get_test_data('test-abund-read-2.fa')
+ out1 = utils.get_temp_filename('out.abundfilt')
+
+ countgraph = _make_counting(in1, K=17)
+
+ cmd = """
+ cat {in1} |
+ {scripts}/filter-abund.py {countgraph} - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1,
+ countgraph=countgraph)
+
+ (status, out, err) = run_shell_cmd(cmd, fail_ok=True)
+ assert status != 0
+ assert "Accepting input from stdin; output filename must be provided with"\
+ in err, err
+
+
+def test_abundance_dist_1():
+ in1 = utils.get_test_data('test-abund-read-2.fa')
+ out1 = utils.get_temp_filename('out.dist')
+
+ countgraph = _make_counting(in1, K=17)
+ assert os.path.exists(countgraph)
+
+ cmd = """
+ cat {in1} |
+ {scripts}/abundance-dist.py -z {countgraph} - - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1,
+ countgraph=countgraph)
+
+ run_shell_cmd(cmd)
+
+ assert os.path.exists(out1)
+ with open(out1) as fp:
+ line = fp.readline().strip()
+ line = fp.readline().strip()
+ assert line == '1,96,96,0.98', line
+ line = fp.readline().strip()
+ assert line == '1001,2,98,1.0', line
+
+
+def test_trim_low_abund_1():
+ in1 = utils.get_test_data('test-abund-read-2.fa')
+ out1 = utils.get_temp_filename('out.abundtrim')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/trim-low-abund.py -k 17 -x 1e7 -N 2 - -o - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ run_shell_cmd(cmd)
+
+ assert os.path.exists(out1)
+ seqs = set([r.sequence for r in screed.open(out1)])
+
+ assert len(seqs) == 1, seqs
+ assert 'GGTTGACGGGGCTCAGGG' in seqs
+
+
+def test_trim_low_abund_2_fail():
+ in1 = utils.get_test_data('test-abund-read-2.fa')
+ out1 = utils.get_temp_filename('out.abundtrim')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/trim-low-abund.py -k 17 -x 1e7 -N 2 - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ (status, out, err) = run_shell_cmd(cmd, fail_ok=True)
+ assert status != 0
+ assert "Accepting input from stdin; output filename must be provided with"\
+ in err, err
+
+
+def test_count_median_1():
+ in1 = utils.get_test_data('test-abund-read-2.fa')
+ out1 = utils.get_temp_filename('out.counts')
+
+ countgraph = _make_counting(in1, K=8)
+ cmd = """
+ cat {in1} |
+ {scripts}/count-median.py {countgraph} - - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), countgraph=countgraph,
+ in1=in1, out1=out1)
+
+ run_shell_cmd(cmd)
+
+ assert os.path.exists(out1), out1
+ data = [x.strip() for x in open(out1)]
+ data = set(data)
+ assert len(data) == 3, data
+ assert 'seq,1001,1001.0,0.0,18' in data
+ assert '895:1:37:17593:9954/1,1,103.803741455,303.702941895,114' in data
+
+
+def test_readstats_1():
+ in1 = utils.get_test_data('test-abund-read-2.fa')
+ out1 = utils.get_temp_filename('out.stats')
+
+ cmd = """
+ cat {in1} |
+ {scripts}/readstats.py --csv - > {out1}
+ """
+
+ cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
+
+ run_shell_cmd(cmd)
+ assert '18114,1001,18.1,-' in open(out1).read(), open(out1).read()
+
+
+def test_unique_kmers_stream_out_fasta():
+ infile = utils.get_test_data('random-20-a.fa')
+
+ cmd = "{scripts}/unique-kmers.py -k 20 -e 0.01 --stream-out {infile}"
+ cmd = cmd.format(scripts=scriptpath(), infile=infile)
+
+ (status, out, err) = run_shell_cmd(cmd)
+
+ expected = ('Estimated number of unique 20-mers in {infile}: 3950'
+ .format(infile=infile))
+ assert expected in err
+ assert 'Total estimated number of unique 20-mers: 3950' in err
+
+ assert '>45' in out
+ assert "ATACGCCACTCGACTTGGCTCGCCCTCGATCTAAAATAGCGGTCGTGTTGGGTTAACAA" in out
+
+
+def test_unique_kmers_stream_out_fastq_with_N():
+ infile = utils.get_test_data('test-filter-abund-Ns.fq')
+
+ cmd = "{scripts}/unique-kmers.py -k 20 -e 0.01 --stream-out {infile}"
+ cmd = cmd.format(scripts=scriptpath(), infile=infile)
+
+ (status, out, err) = run_shell_cmd(cmd)
+
+ expected = ('Estimated number of unique 20-mers in {infile}: 94'
+ .format(infile=infile))
+ assert expected in err
+ assert 'Total estimated number of unique 20-mers: 94' in err
+
+ assert '@895:1:37:17593:9954 1::FOO_withN' in out
+ assert "GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGNGACAGCAGCCGCAGCTGTCGTCA" in out
+ assert "##########################################################" in out
diff --git a/tests/test_subset_graph.py b/tests/test_subset_graph.py
index b5a4209..44f7569 100644
--- a/tests/test_subset_graph.py
+++ b/tests/test_subset_graph.py
@@ -6,7 +6,7 @@ from __future__ import absolute_import
# the three-clause BSD license; see LICENSE.
# Contact: khmer-project at idyll.org
#
-# pylint: disable=missing-docstring
+# pylint: disable=missing-docstring,invalid-name,unused-variable,no-member
import khmer
import screed
@@ -266,7 +266,7 @@ class Test_SaveLoadPmap(object):
try:
a = ht.load_subset_partitionmap(outfile3)
assert 0, "this should not pass"
- except IOError as err:
+ except OSError as err:
print(str(err), i)
def test_save_load_merge_2(self):
@@ -306,7 +306,7 @@ class Test_SaveLoadPmap(object):
try:
a = ht.load_subset_partitionmap('this does not exist')
assert 0, "this should not succeed"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_save_merge_from_disk(self):
@@ -385,7 +385,7 @@ class Test_SaveLoadPmap(object):
try:
ht.merge_subset_from_disk(outfile1)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_merge_from_disk_file_bad_type(self):
@@ -395,7 +395,7 @@ class Test_SaveLoadPmap(object):
try:
ht.merge_subset_from_disk(infile)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_merge_from_disk_file_version(self):
@@ -405,7 +405,7 @@ class Test_SaveLoadPmap(object):
try:
ht.merge_subset_from_disk(infile)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
def test_save_merge_from_disk_ksize(self):
@@ -428,7 +428,7 @@ class Test_SaveLoadPmap(object):
try:
ht.merge_subset_from_disk(outfile1)
assert 0, "this should fail"
- except IOError as e:
+ except OSError as e:
print(str(e))
@@ -499,7 +499,7 @@ def test_save_load_on_graph_truncate():
try:
a = ht.load_partitionmap(outfile3)
assert 0, "this should not pass"
- except IOError as err:
+ except OSError as err:
print(str(err), i)