[med-svn] [python-pyfasta] 04/06: New upstream version 0.5.2

Andreas Tille tille at debian.org
Wed Dec 27 16:58:51 UTC 2017


This is an automated email from the git hooks/post-receive script.

tille pushed a commit to branch master
in repository python-pyfasta.

commit 752c646fd46fca4f3cf9f469fcbd3a19a53cc965
Author: Andreas Tille <tille at debian.org>
Date:   Wed Dec 27 17:57:48 2017 +0100

    New upstream version 0.5.2
---
 CHANGELOG.txt                         |  77 +++++++++
 MANIFEST.in                           |   4 +
 PKG-INFO                              | 302 ++++++++++++++++++++++++++++++++++
 README.rst                            | 212 ++++++++++++++++++++++++
 debian/README.source                  |  19 ---
 debian/changelog                      |   5 -
 debian/compat                         |   1 -
 debian/control                        |  48 ------
 debian/copyright                      |  31 ----
 debian/rules                          |  25 ---
 debian/source/format                  |   1 -
 debian/watch                          |   2 -
 pyfasta.egg-info/PKG-INFO             | 302 ++++++++++++++++++++++++++++++++++
 pyfasta.egg-info/SOURCES.txt          |  20 +++
 pyfasta.egg-info/dependency_links.txt |   1 +
 pyfasta.egg-info/entry_points.txt     |   3 +
 pyfasta.egg-info/not-zip-safe         |   1 +
 pyfasta.egg-info/top_level.txt        |   1 +
 pyfasta/__init__.py                   | 141 ++++++++++++++++
 pyfasta/fasta.py                      | 233 ++++++++++++++++++++++++++
 pyfasta/records.py                    | 295 +++++++++++++++++++++++++++++++++
 pyfasta/split_fasta.py                | 231 ++++++++++++++++++++++++++
 setup.cfg                             |  13 ++
 setup.py                              |  40 +++++
 tests/bench.py                        |  51 ++++++
 tests/data/dups.fasta                 |   4 +
 tests/data/key.fasta                  |   6 +
 tests/data/three_chrs.fasta           |  30 ++++
 tests/test_all.py                     | 255 ++++++++++++++++++++++++++++
 29 files changed, 2222 insertions(+), 132 deletions(-)

diff --git a/CHANGELOG.txt b/CHANGELOG.txt
new file mode 100644
index 0000000..ad4b80d
--- /dev/null
+++ b/CHANGELOG.txt
@@ -0,0 +1,77 @@
+Changes
+=======
+0.5.2
+-----
+fix complement (@mruffalo)
+
+0.5.0
+-----
+python 3 compatibility thanks to mruffalo
+
+0.4.5
+-----
+pyfasta split can handle > 52 files. (thanks Devtulya)
+
+0.4.4
+-----
+fix pyfasta extract
+
+0.4.3
+-----
+Add 0 or 1-based intervals in sequence() thanks @jamescasbon
+
+0.4.2
+-----
+update for latest numpy (can't close memmap)
+
+0.4.1
+-----
+check for duplicate headers.
+
+0.4.0
+-----
+* add key_fn kwarg to constuctor
+
+0.3.9
+-----
+* only require 'r' (not r+) for memory map.
+
+0.3.8
+-----
+* clean up logic for mixing inplace/non-inplace flattened files.
+  if the inplace is available, it is always used. 
+
+0.3.6/7
+-------
+* dont re-flatten the file every time!
+* allow spaces before and after the header in the orginal fasta.
+
+0.3.5
+-----
+
+* update docs in README.txt for new CLI stuff.
+* allow flattening inplace.
+* get rid of memmap (results in faster parsing).
+
+0.3.4
+-----
+
+* restore python2.5 compatiblity.
+* CLI: add ability to exclude sequence from extract
+* CLI: allow spliting based on header.
+
+0.3.3
+-----
+
+* include this file in the tar ball (thanks wen h.)
+
+0.3.2
+-----
+
+* separate out backends into records.py
+
+* use nosetests (python setup.py nosetests)
+
+* add a TCRecord backend for next-gen sequencing availabe if tc is (easy-)installed.
+
+* improve test coverage.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..ac8e293
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+include CHANGELOG.txt
+include README.rst
+include tests/*.py
+include tests/data/*.fasta
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..7b308bd
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,302 @@
+Metadata-Version: 1.1
+Name: pyfasta
+Version: 0.5.2
+Summary: fast, memory-efficient, pythonic (and command-line) access to fasta sequence files
+Home-page: http://github.com/brentp/pyfasta/
+Author: brentp
+Author-email: bpederse at gmail.com
+License: MIT
+Description: ==================================================
+        pyfasta: pythonic access to fasta sequence files.
+        ==================================================
+        
+        
+        :Author: Brent Pedersen (brentp)
+        :Email: bpederse at gmail.com
+        :License: MIT
+        
+        .. contents ::
+        
+        Implementation
+        ==============
+        
+        Requires Python >= 2.6. Stores a flattened version of the fasta file without
+        spaces or headers and uses either a mmap of numpy binary format or fseek/fread so the
+        *sequence data is never read into memory*. Saves a pickle (.gdx) of the start, stop 
+        (for fseek/mmap) locations of each header in the fasta file for internal use.
+        
+        Usage
+        =====
+        ::
+          
+            >>> from pyfasta import Fasta
+        
+            >>> f = Fasta('tests/data/three_chrs.fasta')
+            >>> sorted(f.keys())
+            ['chr1', 'chr2', 'chr3']
+        
+            >>> f['chr1']
+            NpyFastaRecord(0..80)
+        
+        
+        
+        Slicing
+        -------
+        ::
+        
+            # get full the sequence:
+            >>> a = str(f['chr1'])
+            >>> b = f['chr1'][:]
+            >>> a == b
+            True
+        
+            >>> f['chr1'][:10]
+            'ACTGACTGAC'
+        
+            # get the 1st basepair in every codon (it's python yo)
+            >>> f['chr1'][::3]
+            'AGTCAGTCAGTCAGTCAGTCAGTCAGT'
+        
+            # can query by a 'feature' dictionary (note this is one based coordinates)
+            >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9})
+            'CTGACTGA'
+        
+            # same as:
+            >>> f['chr1'][1:9]
+            'CTGACTGA'
+        
+            # use python, zero based coords
+            >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9}, one_based=False)
+            'TGACTGA'
+        
+            # with reverse complement (automatic for - strand)
+            >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9, 'strand': '-'})
+            'TCAGTCAG'
+        
+        Key Function
+        ------------
+        Sometimes your fasta will have a long header like: "AT1G51370.2 | Symbols:  | F-box family protein | chr1:19045615-19046748 FORWARD" when you only want to key off: "AT1G51370.2". In this case, specify the key_fn argument to the constructor:
+        
+        ::
+        
+            >>> fkey = Fasta('tests/data/key.fasta', key_fn=lambda key: key.split()[0])
+            >>> sorted(fkey.keys())
+            ['a', 'b', 'c']
+        
+        Numpy
+        =====
+        
+        The default is to use a memmaped numpy array as the backend. In which case it's possible to
+        get back an array directly...
+        ::
+        
+            >>> f['chr1'].as_string = False
+            >>> f['chr1'][:10] # doctest: +NORMALIZE_WHITESPACE
+            memmap(['A', 'C', 'T', 'G', 'A', 'C', 'T', 'G', 'A', 'C'], dtype='|S1')
+        
+            >>> import numpy as np
+            >>> a = np.array(f['chr2'])
+            >>> a.shape[0] == len(f['chr2'])
+            True
+        
+            >>> a[10:14] # doctest: +NORMALIZE_WHITESPACE
+            array(['A', 'A', 'A', 'A'], dtype='|S1')
+        
+        mask a sub-sequence
+        ::
+        
+            >>> a[11:13] = np.array('N', dtype='S1')
+            >>> a[10:14].tostring()
+            'ANNA'
+        
+        
+        Backends (Record class)
+        =======================
+        It's also possible to specify another record class as the underlying work-horse
+        for slicing and reading. Currently, there's just the default: 
+        
+          * NpyFastaRecord which uses numpy memmap
+          * FastaRecord, which uses using fseek/fread
+          * MemoryRecord which reads everything into memory and must reparse the original
+            fasta every time.
+          * TCRecord which is identical to NpyFastaRecord except that it saves the index
+            in a TokyoCabinet hash database, for cases when there are enough records that
+            loading the entire index from a pickle into memory is unwise. (NOTE: that the
+            sequence is not loaded into memory in either case).
+        
+        It's possible to specify the class used with the `record_class` kwarg to the `Fasta`
+        constructor:
+        ::
+        
+            >>> from pyfasta import FastaRecord # default is NpyFastaRecord
+            >>> f = Fasta('tests/data/three_chrs.fasta', record_class=FastaRecord)
+            >>> f['chr1']
+            FastaRecord('tests/data/three_chrs.fasta.flat', 0..80)
+        
+        other than the repr, it should behave exactly like the Npy record class backend
+        
+        it's possible to create your own using a sub-class of FastaRecord. see the source 
+        in pyfasta/records.py for details.
+        
+        Flattening
+        ==========
+        In order to efficiently access the sequence content, pyfasta saves a separate, flattened file with all newlines and headers removed from the sequence. In the case of large fasta files, one may not wish to save 2 copies of a 5GG+ file. In that case, it's possible to flatten the file "inplace", keeping all the headers, and retaining the validity of the fasta file -- with the only change being that the new-lines are removed from each sequence. This can be specified via `flatten_inpl [...]
+        ::
+            
+            >>> import os
+            >>> os.unlink('tests/data/three_chrs.fasta.gdx') # cleanup non-inplace idx
+            >>> f = Fasta('tests/data/three_chrs.fasta', flatten_inplace=True)
+            >>> f['chr1']  # note the difference in the output from above.
+            NpyFastaRecord(6..86)
+        
+            # sequence from is same as when requested from non-flat file above.
+            >>> f['chr1'][1:9]
+            'CTGACTGA'
+        
+            # the flattened file is kept as a place holder without the sequence data.
+            >>> open('tests/data/three_chrs.fasta.flat').read()
+            '@flattened@'
+        
+        
+        Command Line Interface
+        ======================
+        there's also a command line interface to manipulate / view fasta files.
+        the `pyfasta` executable is installed via setuptools, running it will show
+        help text.
+        
+        split a fasta file into 6 new files of relatively even size:
+        
+          $ pyfasta **split** -n 6 original.fasta
+        
+        split the fasta file into one new file per header with "%(seqid)s" being filled into each filename.:
+          
+          $ pyfasta **split** --header "%(seqid)s.fasta" original.fasta
+        
+        create 1 new fasta file with the sequence split into 10K-mers:
+        
+          $ pyfasta **split** -n 1 -k 10000 original.fasta
+        
+        2 new fasta files with the sequence split into 10K-mers with 2K overlap:
+        
+          $ pyfasta **split** -n 2 -k 10000 -o 2000 original.fasta
+        
+        
+        show some info about the file (and show gc content):
+        
+          $ pyfasta **info** --gc test/data/three_chrs.fasta
+        
+        
+        **extract** sequence from the file. use the header flag to make
+        a new fasta file. the args are a list of sequences to extract.
+        
+          $ pyfasta **extract** --header --fasta test/data/three_chrs.fasta seqa seqb seqc
+        
+        **extract** sequence from a file using a file containing the headers *not* wanted in the new file:
+        
+          $ pyfasta extract --header --fasta input.fasta --exclude --file seqids_to_exclude.txt
+        
+        **extract** sequence from a fasta file with complex keys where we only want to lookup based on the part before the space.
+        
+          $ pyfasta extract --header --fasta input.with.keys.fasta --space --file seqids.txt
+        
+        **flatten** a file inplace, for faster later use by pyfasta, and without creating another copy. (`Flattening`_)
+        
+          $ pyfasta flatten input.fasta 
+        
+        cleanup 
+        =======
+        (though for real use these will remain for faster access)
+        ::
+        
+            >>> os.unlink('tests/data/three_chrs.fasta.gdx')
+            >>> os.unlink('tests/data/three_chrs.fasta.flat')
+        
+        Testing
+        =======
+        there is currently > 99% test coverage for the 2 modules and all included 
+        record classes. to run the tests:
+        ::
+        
+          $ python setup.py nosetests
+        
+        Changes
+        =======
+        0.5.2
+        -----
+        fix complement (@mruffalo)
+        
+        0.5.0
+        -----
+        python 3 compatibility thanks to mruffalo
+        
+        0.4.5
+        -----
+        pyfasta split can handle > 52 files. (thanks Devtulya)
+        
+        0.4.4
+        -----
+        fix pyfasta extract
+        
+        0.4.3
+        -----
+        Add 0 or 1-based intervals in sequence() thanks @jamescasbon
+        
+        0.4.2
+        -----
+        update for latest numpy (can't close memmap)
+        
+        0.4.1
+        -----
+        check for duplicate headers.
+        
+        0.4.0
+        -----
+        * add key_fn kwarg to constuctor
+        
+        0.3.9
+        -----
+        * only require 'r' (not r+) for memory map.
+        
+        0.3.8
+        -----
+        * clean up logic for mixing inplace/non-inplace flattened files.
+          if the inplace is available, it is always used. 
+        
+        0.3.6/7
+        -------
+        * dont re-flatten the file every time!
+        * allow spaces before and after the header in the orginal fasta.
+        
+        0.3.5
+        -----
+        
+        * update docs in README.txt for new CLI stuff.
+        * allow flattening inplace.
+        * get rid of memmap (results in faster parsing).
+        
+        0.3.4
+        -----
+        
+        * restore python2.5 compatiblity.
+        * CLI: add ability to exclude sequence from extract
+        * CLI: allow spliting based on header.
+        
+        0.3.3
+        -----
+        
+        * include this file in the tar ball (thanks wen h.)
+        
+        0.3.2
+        -----
+        
+        * separate out backends into records.py
+        
+        * use nosetests (python setup.py nosetests)
+        
+        * add a TCRecord backend for next-gen sequencing availabe if tc is (easy-)installed.
+        
+        * improve test coverage.
+        
+Keywords: bioinformatics blast fasta
+Platform: UNKNOWN
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..e6946aa
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,212 @@
+==================================================
+pyfasta: pythonic access to fasta sequence files.
+==================================================
+
+
+:Author: Brent Pedersen (brentp)
+:Email: bpederse at gmail.com
+:License: MIT
+
+.. contents ::
+
+Implementation
+==============
+
+Requires Python >= 2.6. Stores a flattened version of the fasta file without
+spaces or headers and uses either a mmap of numpy binary format or fseek/fread so the
+*sequence data is never read into memory*. Saves a pickle (.gdx) of the start, stop 
+(for fseek/mmap) locations of each header in the fasta file for internal use.
+
+Usage
+=====
+::
+  
+    >>> from pyfasta import Fasta
+
+    >>> f = Fasta('tests/data/three_chrs.fasta')
+    >>> sorted(f.keys())
+    ['chr1', 'chr2', 'chr3']
+
+    >>> f['chr1']
+    NpyFastaRecord(0..80)
+
+
+
+Slicing
+-------
+::
+
+    # get full the sequence:
+    >>> a = str(f['chr1'])
+    >>> b = f['chr1'][:]
+    >>> a == b
+    True
+
+    >>> f['chr1'][:10]
+    'ACTGACTGAC'
+
+    # get the 1st basepair in every codon (it's python yo)
+    >>> f['chr1'][::3]
+    'AGTCAGTCAGTCAGTCAGTCAGTCAGT'
+
+    # can query by a 'feature' dictionary (note this is one based coordinates)
+    >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9})
+    'CTGACTGA'
+
+    # same as:
+    >>> f['chr1'][1:9]
+    'CTGACTGA'
+
+    # use python, zero based coords
+    >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9}, one_based=False)
+    'TGACTGA'
+
+    # with reverse complement (automatic for - strand)
+    >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9, 'strand': '-'})
+    'TCAGTCAG'
+
+Key Function
+------------
+Sometimes your fasta will have a long header like: "AT1G51370.2 | Symbols:  | F-box family protein | chr1:19045615-19046748 FORWARD" when you only want to key off: "AT1G51370.2". In this case, specify the key_fn argument to the constructor:
+
+::
+
+    >>> fkey = Fasta('tests/data/key.fasta', key_fn=lambda key: key.split()[0])
+    >>> sorted(fkey.keys())
+    ['a', 'b', 'c']
+
+Numpy
+=====
+
+The default is to use a memmaped numpy array as the backend. In which case it's possible to
+get back an array directly...
+::
+
+    >>> f['chr1'].as_string = False
+    >>> f['chr1'][:10] # doctest: +NORMALIZE_WHITESPACE
+    memmap(['A', 'C', 'T', 'G', 'A', 'C', 'T', 'G', 'A', 'C'], dtype='|S1')
+
+    >>> import numpy as np
+    >>> a = np.array(f['chr2'])
+    >>> a.shape[0] == len(f['chr2'])
+    True
+
+    >>> a[10:14] # doctest: +NORMALIZE_WHITESPACE
+    array(['A', 'A', 'A', 'A'], dtype='|S1')
+
+mask a sub-sequence
+::
+
+    >>> a[11:13] = np.array('N', dtype='S1')
+    >>> a[10:14].tostring()
+    'ANNA'
+
+
+Backends (Record class)
+=======================
+It's also possible to specify another record class as the underlying work-horse
+for slicing and reading. Currently, there's just the default: 
+
+  * NpyFastaRecord which uses numpy memmap
+  * FastaRecord, which uses using fseek/fread
+  * MemoryRecord which reads everything into memory and must reparse the original
+    fasta every time.
+  * TCRecord which is identical to NpyFastaRecord except that it saves the index
+    in a TokyoCabinet hash database, for cases when there are enough records that
+    loading the entire index from a pickle into memory is unwise. (NOTE: that the
+    sequence is not loaded into memory in either case).
+
+It's possible to specify the class used with the `record_class` kwarg to the `Fasta`
+constructor:
+::
+
+    >>> from pyfasta import FastaRecord # default is NpyFastaRecord
+    >>> f = Fasta('tests/data/three_chrs.fasta', record_class=FastaRecord)
+    >>> f['chr1']
+    FastaRecord('tests/data/three_chrs.fasta.flat', 0..80)
+
+other than the repr, it should behave exactly like the Npy record class backend
+
+it's possible to create your own using a sub-class of FastaRecord. see the source 
+in pyfasta/records.py for details.
+
+Flattening
+==========
+In order to efficiently access the sequence content, pyfasta saves a separate, flattened file with all newlines and headers removed from the sequence. In the case of large fasta files, one may not wish to save 2 copies of a 5GG+ file. In that case, it's possible to flatten the file "inplace", keeping all the headers, and retaining the validity of the fasta file -- with the only change being that the new-lines are removed from each sequence. This can be specified via `flatten_inplace` = True
+::
+    
+    >>> import os
+    >>> os.unlink('tests/data/three_chrs.fasta.gdx') # cleanup non-inplace idx
+    >>> f = Fasta('tests/data/three_chrs.fasta', flatten_inplace=True)
+    >>> f['chr1']  # note the difference in the output from above.
+    NpyFastaRecord(6..86)
+
+    # sequence from is same as when requested from non-flat file above.
+    >>> f['chr1'][1:9]
+    'CTGACTGA'
+
+    # the flattened file is kept as a place holder without the sequence data.
+    >>> open('tests/data/three_chrs.fasta.flat').read()
+    '@flattened@'
+
+
+Command Line Interface
+======================
+there's also a command line interface to manipulate / view fasta files.
+the `pyfasta` executable is installed via setuptools, running it will show
+help text.
+
+split a fasta file into 6 new files of relatively even size:
+
+  $ pyfasta **split** -n 6 original.fasta
+
+split the fasta file into one new file per header with "%(seqid)s" being filled into each filename.:
+  
+  $ pyfasta **split** --header "%(seqid)s.fasta" original.fasta
+
+create 1 new fasta file with the sequence split into 10K-mers:
+
+  $ pyfasta **split** -n 1 -k 10000 original.fasta
+
+2 new fasta files with the sequence split into 10K-mers with 2K overlap:
+
+  $ pyfasta **split** -n 2 -k 10000 -o 2000 original.fasta
+
+
+show some info about the file (and show gc content):
+
+  $ pyfasta **info** --gc test/data/three_chrs.fasta
+
+
+**extract** sequence from the file. use the header flag to make
+a new fasta file. the args are a list of sequences to extract.
+
+  $ pyfasta **extract** --header --fasta test/data/three_chrs.fasta seqa seqb seqc
+
+**extract** sequence from a file using a file containing the headers *not* wanted in the new file:
+
+  $ pyfasta extract --header --fasta input.fasta --exclude --file seqids_to_exclude.txt
+
+**extract** sequence from a fasta file with complex keys where we only want to lookup based on the part before the space.
+
+  $ pyfasta extract --header --fasta input.with.keys.fasta --space --file seqids.txt
+
+**flatten** a file inplace, for faster later use by pyfasta, and without creating another copy. (`Flattening`_)
+
+  $ pyfasta flatten input.fasta 
+
+cleanup 
+=======
+(though for real use these will remain for faster access)
+::
+
+    >>> os.unlink('tests/data/three_chrs.fasta.gdx')
+    >>> os.unlink('tests/data/three_chrs.fasta.flat')
+
+Testing
+=======
+there is currently > 99% test coverage for the 2 modules and all included 
+record classes. to run the tests:
+::
+
+  $ python setup.py nosetests
diff --git a/debian/README.source b/debian/README.source
deleted file mode 100644
index 5026a7e..0000000
--- a/debian/README.source
+++ /dev/null
@@ -1,19 +0,0 @@
-As documented in the original ITP
-
-   https://bugs.debian.org/801721
-
-this package showed problems in the unit tests.  Due to the upstream author
-
-  Date: Wed, 14 Oct 2015 15:04:31 -0600
-  From: Brent Pedersen <bpederse at gmail.com>
-  To: Andreas Tille <andreas at an3as.eu>
-  Subject: Re: Test failures with pyfasta
-
-  hmm, that's not good. but still, I recommend to use pyfaidx as I'm no
-  longer working on pyfaidx.
-
-rather pyfaidx should be packaged.  So there is no intent to upload this
-package for the moment but the packaging should work when switching of
-the test suite.
-
- -- Andreas Tille <tille at debian.org>  Thu, 15 Oct 2015 08:24:03 +0200
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index f866d87..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-python-pyfasta (0.5.2-1) NONE; urgency=medium
-
-  * No intend to upload - see README.source
-
- -- Andreas Tille <tille at debian.org>  Thu, 15 Oct 2015 08:24:03 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index d9ab5fd..0000000
--- a/debian/control
+++ /dev/null
@@ -1,48 +0,0 @@
-Source: python-pyfasta
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Section: python
-Priority: optional
-Build-Depends: debhelper (>= 9),
-               dh-python,
-               python-all (>= 2.7),
-               python-coverage,
-               python-setuptools,
-               python-nose,
-               python-numpy,
-               python3-all,
-               python3-setuptools,
-               python3-nose,
-               python3-numpy
-Standards-Version: 3.9.6
-Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/python-pyfasta/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/python-pyfasta/trunk/
-Homepage: https://pypi.python.org/pypi/pyfasta/
-
-Package: python-pyfasta
-Architecture: all
-Depends: ${misc:Depends},
-         ${python:Depends}
-Description: fast, memory-efficient Python 2 access to fasta sequence files
- In bioinformatics, FASTA format is a text-based format for representing
- either nucleotide sequences or peptide sequences, in which nucleotides
- or amino acids are represented using single-letter codes. The format
- also allows for sequence names and comments to precede the sequences.
- The format originates from the FASTA software package, but has now
- become a standard in the field of bioinformatics.
- .
- This package provides the Python 2 modules to access fasta files.
-
-Package: python3-pyfasta
-Architecture: all
-Depends: ${misc:Depends},
-         ${python3:Depends}
-Description: fast, memory-efficient Python 3 access to fasta sequence files
- In bioinformatics, FASTA format is a text-based format for representing
- either nucleotide sequences or peptide sequences, in which nucleotides
- or amino acids are represented using single-letter codes. The format
- also allows for sequence names and comments to precede the sequences.
- The format originates from the FASTA software package, but has now
- become a standard in the field of bioinformatics.
- .
- This package provides the Python 3 modules to access fasta files.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index d2d428e..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,31 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: pyfasta
-Upstream-Contact: Brent Pedersen <bpederse at gmail.com>
-Source: https://pypi.python.org/pypi/pyfasta
-
-Files: *
-Copyright: 2011-2015 Brent Pedersen <bpederse at gmail.com>
-License: MIT
-
-Files: debian/*
-Copyright: 2015 Andreas Tille <tille at debian.org>
-License: MIT
-
-License: MIT
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- .
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- .
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 6c7f5fb..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/make -f
-# -*- makefile -*-
-
-# Uncomment this to turn on verbose mode.
-#export DH_VERBOSE=1
-
-export PYBUILD_NAME=pyfasta
-export PYBUILD_TEST_ARGS={dir}/tests
-
-override_dh_auto_test:
-	LC_ALL=C.UTF-8 dh_auto_test -- --test --system=custom \
-		--test-args='set -e; \
-			     set -x; \
-                             cp -a tests {build_dir}; \
-                             {interpreter} run_tests.py --offline'
-
-run_tests:
-	cp -a tests/data/key.fasta tests/data/key.fasta.orig
-	cp -a tests/data/three_chrs.fasta tests/data/three_chrs.fasta.orig
-	python -m nose tests
-
-
-%:
-	dh $@ --with python2,python3 --buildsystem=pybuild
-
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index bda404f..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,2 +0,0 @@
-version=3
-https://pypi.python.org/pypi/pyfasta .*/pyfasta-([0-9.]+).tar.[xgb]z2?
diff --git a/pyfasta.egg-info/PKG-INFO b/pyfasta.egg-info/PKG-INFO
new file mode 100644
index 0000000..7b308bd
--- /dev/null
+++ b/pyfasta.egg-info/PKG-INFO
@@ -0,0 +1,302 @@
+Metadata-Version: 1.1
+Name: pyfasta
+Version: 0.5.2
+Summary: fast, memory-efficient, pythonic (and command-line) access to fasta sequence files
+Home-page: http://github.com/brentp/pyfasta/
+Author: brentp
+Author-email: bpederse at gmail.com
+License: MIT
+Description: ==================================================
+        pyfasta: pythonic access to fasta sequence files.
+        ==================================================
+        
+        
+        :Author: Brent Pedersen (brentp)
+        :Email: bpederse at gmail.com
+        :License: MIT
+        
+        .. contents ::
+        
+        Implementation
+        ==============
+        
+        Requires Python >= 2.6. Stores a flattened version of the fasta file without
+        spaces or headers and uses either a mmap of numpy binary format or fseek/fread so the
+        *sequence data is never read into memory*. Saves a pickle (.gdx) of the start, stop 
+        (for fseek/mmap) locations of each header in the fasta file for internal use.
+        
+        Usage
+        =====
+        ::
+          
+            >>> from pyfasta import Fasta
+        
+            >>> f = Fasta('tests/data/three_chrs.fasta')
+            >>> sorted(f.keys())
+            ['chr1', 'chr2', 'chr3']
+        
+            >>> f['chr1']
+            NpyFastaRecord(0..80)
+        
+        
+        
+        Slicing
+        -------
+        ::
+        
+            # get full the sequence:
+            >>> a = str(f['chr1'])
+            >>> b = f['chr1'][:]
+            >>> a == b
+            True
+        
+            >>> f['chr1'][:10]
+            'ACTGACTGAC'
+        
+            # get the 1st basepair in every codon (it's python yo)
+            >>> f['chr1'][::3]
+            'AGTCAGTCAGTCAGTCAGTCAGTCAGT'
+        
+            # can query by a 'feature' dictionary (note this is one based coordinates)
+            >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9})
+            'CTGACTGA'
+        
+            # same as:
+            >>> f['chr1'][1:9]
+            'CTGACTGA'
+        
+            # use python, zero based coords
+            >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9}, one_based=False)
+            'TGACTGA'
+        
+            # with reverse complement (automatic for - strand)
+            >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9, 'strand': '-'})
+            'TCAGTCAG'
+        
+        Key Function
+        ------------
+        Sometimes your fasta will have a long header like: "AT1G51370.2 | Symbols:  | F-box family protein | chr1:19045615-19046748 FORWARD" when you only want to key off: "AT1G51370.2". In this case, specify the key_fn argument to the constructor:
+        
+        ::
+        
+            >>> fkey = Fasta('tests/data/key.fasta', key_fn=lambda key: key.split()[0])
+            >>> sorted(fkey.keys())
+            ['a', 'b', 'c']
+        
+        Numpy
+        =====
+        
+        The default is to use a memmaped numpy array as the backend. In which case it's possible to
+        get back an array directly...
+        ::
+        
+            >>> f['chr1'].as_string = False
+            >>> f['chr1'][:10] # doctest: +NORMALIZE_WHITESPACE
+            memmap(['A', 'C', 'T', 'G', 'A', 'C', 'T', 'G', 'A', 'C'], dtype='|S1')
+        
+            >>> import numpy as np
+            >>> a = np.array(f['chr2'])
+            >>> a.shape[0] == len(f['chr2'])
+            True
+        
+            >>> a[10:14] # doctest: +NORMALIZE_WHITESPACE
+            array(['A', 'A', 'A', 'A'], dtype='|S1')
+        
+        mask a sub-sequence
+        ::
+        
+            >>> a[11:13] = np.array('N', dtype='S1')
+            >>> a[10:14].tostring()
+            'ANNA'
+        
+        
+        Backends (Record class)
+        =======================
+        It's also possible to specify another record class as the underlying work-horse
+        for slicing and reading. Currently, there's just the default: 
+        
+          * NpyFastaRecord which uses numpy memmap
+          * FastaRecord, which uses using fseek/fread
+          * MemoryRecord which reads everything into memory and must reparse the original
+            fasta every time.
+          * TCRecord which is identical to NpyFastaRecord except that it saves the index
+            in a TokyoCabinet hash database, for cases when there are enough records that
+            loading the entire index from a pickle into memory is unwise. (NOTE: that the
+            sequence is not loaded into memory in either case).
+        
+        It's possible to specify the class used with the `record_class` kwarg to the `Fasta`
+        constructor:
+        ::
+        
+            >>> from pyfasta import FastaRecord # default is NpyFastaRecord
+            >>> f = Fasta('tests/data/three_chrs.fasta', record_class=FastaRecord)
+            >>> f['chr1']
+            FastaRecord('tests/data/three_chrs.fasta.flat', 0..80)
+        
+        other than the repr, it should behave exactly like the Npy record class backend
+        
+        it's possible to create your own using a sub-class of FastaRecord. see the source 
+        in pyfasta/records.py for details.
+        
+        Flattening
+        ==========
+        In order to efficiently access the sequence content, pyfasta saves a separate, flattened file with all newlines and headers removed from the sequence. In the case of large fasta files, one may not wish to save 2 copies of a 5GG+ file. In that case, it's possible to flatten the file "inplace", keeping all the headers, and retaining the validity of the fasta file -- with the only change being that the new-lines are removed from each sequence. This can be specified via `flatten_inpl [...]
+        ::
+            
+            >>> import os
+            >>> os.unlink('tests/data/three_chrs.fasta.gdx') # cleanup non-inplace idx
+            >>> f = Fasta('tests/data/three_chrs.fasta', flatten_inplace=True)
+            >>> f['chr1']  # note the difference in the output from above.
+            NpyFastaRecord(6..86)
+        
+            # sequence from is same as when requested from non-flat file above.
+            >>> f['chr1'][1:9]
+            'CTGACTGA'
+        
+            # the flattened file is kept as a place holder without the sequence data.
+            >>> open('tests/data/three_chrs.fasta.flat').read()
+            '@flattened@'
+        
+        
+        Command Line Interface
+        ======================
+        there's also a command line interface to manipulate / view fasta files.
+        the `pyfasta` executable is installed via setuptools, running it will show
+        help text.
+        
+        split a fasta file into 6 new files of relatively even size:
+        
+          $ pyfasta **split** -n 6 original.fasta
+        
+        split the fasta file into one new file per header with "%(seqid)s" being filled into each filename.:
+          
+          $ pyfasta **split** --header "%(seqid)s.fasta" original.fasta
+        
+        create 1 new fasta file with the sequence split into 10K-mers:
+        
+          $ pyfasta **split** -n 1 -k 10000 original.fasta
+        
+        2 new fasta files with the sequence split into 10K-mers with 2K overlap:
+        
+          $ pyfasta **split** -n 2 -k 10000 -o 2000 original.fasta
+        
+        
+        show some info about the file (and show gc content):
+        
+          $ pyfasta **info** --gc test/data/three_chrs.fasta
+        
+        
+        **extract** sequence from the file. use the header flag to make
+        a new fasta file. the args are a list of sequences to extract.
+        
+          $ pyfasta **extract** --header --fasta test/data/three_chrs.fasta seqa seqb seqc
+        
+        **extract** sequence from a file using a file containing the headers *not* wanted in the new file:
+        
+          $ pyfasta extract --header --fasta input.fasta --exclude --file seqids_to_exclude.txt
+        
+        **extract** sequence from a fasta file with complex keys where we only want to lookup based on the part before the space.
+        
+          $ pyfasta extract --header --fasta input.with.keys.fasta --space --file seqids.txt
+        
+        **flatten** a file inplace, for faster later use by pyfasta, and without creating another copy. (`Flattening`_)
+        
+          $ pyfasta flatten input.fasta 
+        
+        cleanup 
+        =======
+        (though for real use these will remain for faster access)
+        ::
+        
+            >>> os.unlink('tests/data/three_chrs.fasta.gdx')
+            >>> os.unlink('tests/data/three_chrs.fasta.flat')
+        
+        Testing
+        =======
+        there is currently > 99% test coverage for the 2 modules and all included 
+        record classes. to run the tests:
+        ::
+        
+          $ python setup.py nosetests
+        
+        Changes
+        =======
+        0.5.2
+        -----
+        fix complement (@mruffalo)
+        
+        0.5.0
+        -----
+        python 3 compatibility thanks to mruffalo
+        
+        0.4.5
+        -----
+        pyfasta split can handle > 52 files. (thanks Devtulya)
+        
+        0.4.4
+        -----
+        fix pyfasta extract
+        
+        0.4.3
+        -----
+        Add 0 or 1-based intervals in sequence() thanks @jamescasbon
+        
+        0.4.2
+        -----
+        update for latest numpy (can't close memmap)
+        
+        0.4.1
+        -----
+        check for duplicate headers.
+        
+        0.4.0
+        -----
+        * add key_fn kwarg to constuctor
+        
+        0.3.9
+        -----
+        * only require 'r' (not r+) for memory map.
+        
+        0.3.8
+        -----
+        * clean up logic for mixing inplace/non-inplace flattened files.
+          if the inplace is available, it is always used. 
+        
+        0.3.6/7
+        -------
+        * dont re-flatten the file every time!
+        * allow spaces before and after the header in the orginal fasta.
+        
+        0.3.5
+        -----
+        
+        * update docs in README.txt for new CLI stuff.
+        * allow flattening inplace.
+        * get rid of memmap (results in faster parsing).
+        
+        0.3.4
+        -----
+        
+        * restore python2.5 compatiblity.
+        * CLI: add ability to exclude sequence from extract
+        * CLI: allow spliting based on header.
+        
+        0.3.3
+        -----
+        
+        * include this file in the tar ball (thanks wen h.)
+        
+        0.3.2
+        -----
+        
+        * separate out backends into records.py
+        
+        * use nosetests (python setup.py nosetests)
+        
+        * add a TCRecord backend for next-gen sequencing availabe if tc is (easy-)installed.
+        
+        * improve test coverage.
+        
+Keywords: bioinformatics blast fasta
+Platform: UNKNOWN
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
diff --git a/pyfasta.egg-info/SOURCES.txt b/pyfasta.egg-info/SOURCES.txt
new file mode 100644
index 0000000..f3fe98f
--- /dev/null
+++ b/pyfasta.egg-info/SOURCES.txt
@@ -0,0 +1,20 @@
+CHANGELOG.txt
+MANIFEST.in
+README.rst
+setup.cfg
+setup.py
+pyfasta/__init__.py
+pyfasta/fasta.py
+pyfasta/records.py
+pyfasta/split_fasta.py
+pyfasta.egg-info/PKG-INFO
+pyfasta.egg-info/SOURCES.txt
+pyfasta.egg-info/dependency_links.txt
+pyfasta.egg-info/entry_points.txt
+pyfasta.egg-info/not-zip-safe
+pyfasta.egg-info/top_level.txt
+tests/bench.py
+tests/test_all.py
+tests/data/dups.fasta
+tests/data/key.fasta
+tests/data/three_chrs.fasta
\ No newline at end of file
diff --git a/pyfasta.egg-info/dependency_links.txt b/pyfasta.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pyfasta.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/pyfasta.egg-info/entry_points.txt b/pyfasta.egg-info/entry_points.txt
new file mode 100644
index 0000000..245e264
--- /dev/null
+++ b/pyfasta.egg-info/entry_points.txt
@@ -0,0 +1,3 @@
+[console_scripts]
+pyfasta = pyfasta:main
+
diff --git a/pyfasta.egg-info/not-zip-safe b/pyfasta.egg-info/not-zip-safe
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pyfasta.egg-info/not-zip-safe
@@ -0,0 +1 @@
+
diff --git a/pyfasta.egg-info/top_level.txt b/pyfasta.egg-info/top_level.txt
new file mode 100644
index 0000000..909ccbd
--- /dev/null
+++ b/pyfasta.egg-info/top_level.txt
@@ -0,0 +1 @@
+pyfasta
diff --git a/pyfasta/__init__.py b/pyfasta/__init__.py
new file mode 100644
index 0000000..f1cdf95
--- /dev/null
+++ b/pyfasta/__init__.py
@@ -0,0 +1,141 @@
+from __future__ import print_function
+import sys
+from fasta import Fasta, complement, DuplicateHeaderException
+from records import *
+from split_fasta import split
+import optparse
+
+def main():
+    help = """
+    available actions:
+        `extract`: extract sequences from a fasta file
+        `info`: show info about the fasta file and exit.
+        `split`: split a large fasta file into separate files
+                 and/or into K-mers.
+        `flatten`: flatten a fasta file inplace so that later
+                   command-line (and programmattic) access via
+                   pyfasta will use the inplace flattened version
+                   rather than creating another .flat copy of the
+                   sequence.
+
+    to view the help for a particular action, use:
+        pyfasta [action] --help
+    e.g.:
+        pyfasta extract --help
+    """
+    if len(sys.argv) == 1:
+        print(help)
+        sys.exit()
+
+    action = sys.argv[1]
+
+    sglobals = globals()
+    if not action in sglobals:
+        print("%s not a valid action" % action)
+        print(help)
+        sys.exit()
+
+    globals()[action](sys.argv[2:])
+
+def info(args):
+    """
+    >>> info(['tests/data/three_chrs.fasta'])
+    <BLANKLINE>
+    tests/data/three_chrs.fasta
+    ===========================
+    >chr3 length:3600
+    >chr2 length:80
+    >chr1 length:80
+    <BLANKLINE>
+    3760 basepairs in 3 sequences
+    """
+    parser = optparse.OptionParser("""\
+   print headers and lengths of the given fasta file in order of length. e.g.:
+        pyfasta info --gc some.fasta""")
+
+    parser.add_option("-n", "--n", type="int", dest="nseqs",
+                      help="max number of records to print. use -1 for all",
+                      default=20)
+    parser.add_option("--gc", dest="gc", help="show gc content",
+                      action="store_true", default=False)
+    options, fastas = parser.parse_args(args)
+    if not (fastas):
+        sys.exit(parser.print_help())
+    import operator
+
+    for fasta in fastas:
+        f = Fasta(fasta)
+        info = [(k, len(seq)) for k, seq in f.iteritems()]
+
+        total_len = sum(l for k, l in info)
+        nseqs = len(f)
+        if options.nseqs > -1:
+            info = sorted(info,  key=operator.itemgetter(1, 0), reverse=True)
+            info = info[:options.nseqs]
+        else:
+            info.sort()
+
+        print("\n" + fasta)
+        print("=" * len(fasta))
+        for k, l in info:
+            gc = ""
+            if options.gc:
+                seq = str(f[k]).upper()
+                g = seq.count('G')
+                c = seq.count('C')
+                gc = 100.0 * (g + c) / float(l)
+                gc = "gc:%.2f%%" % gc
+            print((">%s length:%i" % (k, l)) + gc)
+
+        if total_len > 1000000:
+            total_len = "%.3fM" % (total_len / 1000000.)
+        print()
+        print("%s basepairs in %i sequences" % (total_len, nseqs))
+
+def flatten(args):
+    """
+    >>> flatten(['tests/data/three_chrs.fasta'])
+    """
+    parser = optparse.OptionParser("""flatten a fasta file *inplace* so all later access by pyfasta will use that flattend (but still viable) fasta file""")
+    _, fasta = parser.parse_args(args)
+    for fa in fasta:
+        f = Fasta(fa, flatten_inplace=True)
+
+def extract(args):
+    """
+    >>> extract(['--fasta', 'tests/data/three_chrs.fasta', 'chr2'])
+    TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT
+    """
+
+    parser = optparse.OptionParser("""extract some sequences from a fasta file. e.g.:
+               pyfasta extract --fasta some.fasta --header at2g26540 at3g45640""")
+    parser.add_option("--fasta", dest="fasta", help="path to the fasta file")
+    parser.add_option("--header", dest="header", help="include headers", action="store_true", default=False)
+    parser.add_option("--exclude", dest="exclude", help="extract all sequences EXCEPT those listed", action="store_true", default=False)
+    parser.add_option("--file", dest="file", help=\
+                      "if this flag is used, the sequences to extract" \
+                      " are read from the file specified in args"
+                      , action="store_true", default=False)
+    parser.add_option("--space", dest="space", action="store_true", help=\
+                      "use the fasta identifier only up to the space as the key",
+                      default=False)
+    options, seqs = parser.parse_args(args)
+    if not (options.fasta and len(seqs)):
+        sys.exit(parser.print_help())
+
+    key_fn = (lambda k: k.split()[0]) if options.space else None
+    f = Fasta(options.fasta, key_fn=key_fn)
+    if options.file:
+        seqs = (x.strip() for x in open(seqs[0]))
+    if options.exclude:
+        seqs = sorted(frozenset(f.iterkeys()).difference(seqs))
+
+    for seqname in seqs:
+        seq = f[seqname]
+        if options.header:
+            print(">%s" % seqname)
+        print(seq)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyfasta/fasta.py b/pyfasta/fasta.py
new file mode 100644
index 0000000..05d1fa1
--- /dev/null
+++ b/pyfasta/fasta.py
@@ -0,0 +1,233 @@
+from __future__ import print_function
+import string
+import os.path
+from collections import Mapping
+import sys
+import numpy as np
+
+from records import NpyFastaRecord
+
+# string.maketrans is bytes.maketrans in Python 3, but
+# we want to deal with strings instead of bytes
+try:
+    # 2.x
+    maketrans = string.maketrans
+except AttributeError:
+    # 3.x
+    maketrans = str.maketrans
+
+_complement = maketrans('ATCGatcgNnXx', 'TAGCtagcNnXx')
+# Python 2: string.maketrans returns a bytes object of length 256,
+#   that is used as a lookup table to translate bytes to other bytes.
+# Python 3: str.maketrans returns a dict mapping Unicode code points
+#   to other Unicode code points. Can't use a fully-allocated lookup
+#   table since it would have to be of size `sys.maxunicode`, which
+#   is equal to 1114111 on wide builds of <= 3.2 and all builds of
+#   Python >= 3.3.
+# In Python 2, it's safe to use a unicode object as the translation
+# table; this causes str.translate to return a unicode object instead
+# of a str. This is safe as long as the string that you're translating
+# can be decoded as ASCII, and will fail with a UnicodeDecodeError
+# otherwise.
+if sys.version_info[0] < 3:
+    _complement = _complement.decode('latin-1')
+complement = lambda s: s.translate(_complement)
+
+class FastaNotFound(Exception): pass
+
+class DuplicateHeaderException(Exception):
+    def __init__(self, header):
+        Exception.__init__(self, 'headers must be unique: %s is duplicated' % header)
+
+class Fasta(Mapping):
+    def __init__(self, fasta_name, record_class=NpyFastaRecord,
+                flatten_inplace=False, key_fn=None):
+        """
+            >>> from pyfasta import Fasta, FastaRecord
+
+            >>> f = Fasta('tests/data/three_chrs.fasta',
+            ...                          record_class=FastaRecord)
+            >>> sorted(f.keys())
+            ['chr1', 'chr2', 'chr3']
+
+        slicing returns an object.
+            >>> f['chr1']
+            FastaRecord('tests/data/three_chrs.fasta.flat', 0..80)
+
+        extract sequence with normal python syntax
+            >>> print(f['chr1'][:10])
+            ACTGACTGAC
+
+        take the first basepair in each codon...
+            >>> print(f['chr1'][0:10:3])
+            AGTC
+
+        """
+        if not os.path.exists(fasta_name):
+            raise FastaNotFound('"' + fasta_name + '"')
+        self.fasta_name = fasta_name
+        self.record_class = record_class
+        self.key_fn = key_fn
+        self.index, self.prepared = self.record_class.prepare(self,
+                                              self.gen_seqs_with_headers(key_fn),
+                                              flatten_inplace)
+
+        self.chr = {}
+
+    @classmethod
+    def as_kmers(klass, seq, k, overlap=0):
+        kmax = len(seq)
+        assert overlap < k, ('overlap must be < kmer length')
+        i = 0
+        while i < kmax:
+            yield i, seq[i:i + k]
+            i += k - overlap
+
+    def gen_seqs_with_headers(self, key_fn=None):
+        """remove all newlines from the sequence in a fasta file
+        and generate starts, stops to be used by the record class"""
+        fh = open(self.fasta_name, 'r')
+        # do the flattening (remove newlines)
+        # check of unique-ness of headers.
+        seen_headers = set()
+        header = None
+        seqs = None
+        for line in fh:
+            line = line.rstrip()
+            if not line: continue
+            if line[0] == ">":
+                if seqs is not None:
+                    if header in seen_headers:
+                        raise DuplicateHeaderException(header)
+                    seen_headers.add(header)
+                    yield header, "".join(seqs)
+
+                header = line[1:].strip()
+                if key_fn is not None:
+                    header = key_fn(header)
+                seqs = []
+            else:
+                seqs.append(line)
+
+        if seqs:
+            if header in seen_headers:
+                raise DuplicateHeaderException(header)
+            yield header, "".join(seqs)
+        fh.close()
+
+    def __len__(self):
+        # might not work for all backends?
+        return len(self.index)
+
+    def __iter__(self):
+        return iter(self.index)
+
+    def __getitem__(self, i):
+        if i in self.chr:
+            return self.chr[i]
+        c = self.index[i]
+        self.chr[i] = self.record_class(self.prepared, c[0], c[1])
+        return self.chr[i]
+
+    def sequence(self, f, asstring=True, auto_rc=True
+            , exon_keys=None, one_based=True):
+        """
+        take a feature and use the start/stop or exon_keys to return
+        the sequence from the assocatied fasta file:
+        f: a feature
+        asstring: if true, return the sequence as a string
+                : if false, return as a numpy array
+        auto_rc : if True and the strand of the feature == -1, return
+                  the reverse complement of the sequence
+        one_based: if true, query is using 1 based closed intervals, if false
+                    semi-open zero based intervals
+
+            >>> from pyfasta import Fasta
+            >>> f = Fasta('tests/data/three_chrs.fasta')
+            >>> print(f.sequence({'start':1, 'stop':2, 'strand':1, 'chr': 'chr1'}))
+            AC
+
+            >>> print(f.sequence({'start':1, 'stop':2, 'strand': -1, 'chr': 'chr1'}))
+            GT
+
+            >>> sorted(f.index.items())
+            [('chr1', (0, 80)), ('chr2', (80, 160)), ('chr3', (160, 3760))]
+
+        NOTE: these 2 are reverse-complement-ary because of strand
+        #>>> f.sequence({'start':10, 'stop':12, 'strand': -1, 'chr': 'chr1'})
+            'CAG'
+            >>> print(f.sequence({'start':10, 'stop':12, 'strand': 1, 'chr': 'chr1'}))
+            CTG
+
+
+            >>> print(f.sequence({'start':10, 'stop':12, 'strand': -1, 'chr': 'chr3'}))
+            TGC
+            >>> print(f.sequence({'start':10, 'stop':12, 'strand': 1, 'chr': 'chr3'}))
+            GCA
+
+            >>> print(f['chr3'][:][-10:])
+            CGCACGCTAC
+
+
+        a feature can have exons:
+            >>> feat = dict(start=9, stop=19, strand=1, chr='chr1'
+            ...    , exons=[(9,11), (13, 15), (17, 19)])
+
+        by default, it just returns the full sequence between start
+        and stop.
+            >>> print(f.sequence(feat))
+            ACTGACTGACT
+
+        but if exon_keys is set to an iterable, it will search for
+        those keys and will use the first to create a sequence and
+        return the concatenated result.
+            >>> print(f.sequence(feat, exon_keys=('rnas', 'exons')))
+            ACTACTACT
+
+        Note that sequence is 2 characters shorter than the entire
+        feature, to account for the introns at base-pairs 12 and 16.
+
+        Also note, it looks for an item with key of 'rnas', and didn't
+        fine one, so it continued on to 'exons'. If it doesn't find
+        any of the exon keys, it will fall back on the start, stop of
+        the feature:
+            >>> print(f.sequence(feat, exon_keys=('fake', 'also_fake')))
+            ACTGACTGACT
+        """
+        assert 'chr' in f and f['chr'] in self, (f, f['chr'], self.keys())
+        fasta    = self[f['chr']]
+        sequence = None
+        if not exon_keys is None:
+            sequence = self._seq_from_keys(f, fasta, exon_keys, one_based=one_based)
+
+        if sequence is None:
+            start = f['start'] - int(one_based)
+            sequence = fasta[start: f['stop']]
+
+        if auto_rc and f.get('strand') in (-1, '-1', '-'):
+            sequence = complement(sequence)[::-1]
+
+        if asstring: return sequence
+        return np.array(sequence, dtype='c')
+
+    def _seq_from_keys(self, f, fasta, exon_keys, base='locations', one_based=True):
+        """Internal:
+        f: a feature dict
+        fasta: a Fasta object
+        exon_keys: an iterable of keys, to look for start/stop
+                   arrays to get sequence.
+        base: if base ('locations') exists, look there fore the
+        exon_keys, not in the base of the object:
+            {'name': 'OS11G42690', 'stop': 25210251, 'locations':
+            {'CDS': [(25210018, 25210251)]}, 'start': 25210018, 'chr':
+            '11', 'strand': -1} set(['TRNA', 'CDS'])
+        """
+        fbase = f.get(base, f)
+        for ek in exon_keys:
+            if not ek in fbase: continue
+            locs = fbase[ek]
+            seq = ""
+            for start, stop in locs:
+                seq += fasta[start - int(one_based):stop]
+            return seq
+        return None
diff --git a/pyfasta/records.py b/pyfasta/records.py
new file mode 100644
index 0000000..677c17a
--- /dev/null
+++ b/pyfasta/records.py
@@ -0,0 +1,295 @@
+import cPickle
+import numpy as np
+import sys
+import os
+
+__all__ = ['FastaRecord', 'NpyFastaRecord', 'MemoryRecord']
+
+MAGIC = "@flattened@"
+
+def is_up_to_date(a, b):
+    return os.path.exists(a) and os.stat(a).st_mtime >= os.stat(b).st_mtime
+
+
+def ext_is_flat(ext):
+    with open(ext) as fh:
+        t = fh.read(len(MAGIC))
+    return MAGIC == t
+
+class FastaRecord(object):
+    __slots__ = ('fh', 'start', 'stop')
+    ext = ".flat"
+    idx = ".gdx"
+
+    @classmethod
+    def is_current(klass, fasta_name):
+        utd = is_up_to_date(fasta_name + klass.idx, fasta_name)
+        if not utd: return False
+        return is_up_to_date(fasta_name + klass.ext, fasta_name)
+
+    def __init__(self, fh, start, stop):
+
+        self.fh      = fh
+        self.stop    = stop
+        self.start   = start
+
+    def __len__(self):
+        return self.stop - self.start
+
+    @classmethod
+    def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace):
+        """
+        returns the __getitem__'able index. and the thing to get the seqs from.
+        """
+        f = fasta_obj.fasta_name
+        if klass.is_current(f):
+            with open(f + klass.idx, 'rb') as fh:
+                idx = cPickle.load(fh)
+            if flatten_inplace or ext_is_flat(f + klass.ext): flat = klass.modify_flat(f)
+            else: flat = klass.modify_flat(f + klass.ext)
+            if flatten_inplace and not ext_is_flat(f + klass.ext):
+                del flat
+            else:
+                return idx, flat
+
+        idx = {}
+        with open(f + klass.ext, 'w') as flatfh:
+            for i, (seqid, seq) in enumerate(seqinfo_generator):
+                if flatten_inplace:
+                    if i == 0:
+                        flatfh.write('>%s\n' % seqid)
+                    else:
+                        flatfh.write('\n>%s\n' % seqid)
+                start = flatfh.tell()
+                flatfh.write(seq)
+                stop = flatfh.tell()
+                idx[seqid] = (start, stop)
+
+        if flatten_inplace:
+            klass.copy_inplace(flatfh.name, f)
+            with open(f + klass.idx, 'wb') as fh:
+                cPickle.dump(idx, fh, -1)
+            return idx, klass.modify_flat(f)
+
+        with open(f + klass.idx, 'wb') as fh:
+            cPickle.dump(idx, fh, -1)
+        return idx, klass.modify_flat(f + klass.ext)
+
+    @classmethod
+    def copy_inplace(klass, flat_name, fasta_name):
+        """
+        so they requested flatten_inplace, so we
+        overwrite the .fasta file with the .fasta.flat
+        file and save something in the .flat as a place-holder.
+        """
+        os.rename(flat_name, fasta_name)
+        # still need the flattend file to show
+        # it's current.
+        with open(fasta_name + klass.ext, 'w') as flatfh:
+            flatfh.write(MAGIC)
+
+
+
+    @classmethod
+    def modify_flat(klass, flat_file):
+        return open(flat_file, 'r')
+
+    def _adjust_slice(self, islice):
+        l = len(self)
+        if not islice.start is None and islice.start < 0:
+            istart = self.stop + islice.start
+        else:
+            if islice.start is None:
+                istart = self.start
+            else:
+                istart = self.start + islice.start
+
+
+        if not islice.stop is None and islice.stop < 0:
+            istop = self.stop + islice.stop
+        else:
+            istop = islice.stop is None and self.stop or (self.start + islice.stop)
+
+        # this will give empty string
+        if istart > self.stop: return self.stop, self.stop 
+        if istart < self.start: istart = self.start
+
+        if istop  < self.start: istop = self.start
+        elif istop > self.stop: istop = self.stop
+
+        return istart, istop
+
+    def __getitem__(self, islice):
+        fh = self.fh
+        fh.seek(self.start)
+
+
+        if isinstance(islice, (int, long)):
+            if islice < 0:
+                if -islice > self.stop - self.start:
+                    raise IndexError
+                fh.seek(self.stop + islice)
+            else:
+                fh.seek(self.start + islice)
+            return fh.read(1)
+
+        # [:]
+        if islice.start in (0, None) and islice.stop in (None, sys.maxint):
+            if islice.step in (1, None):
+                return fh.read(self.stop - self.start)
+            return fh.read(self.stop - self.start)[::islice.step]
+
+        istart, istop = self._adjust_slice(islice)
+        if istart is None: return u""
+        l = istop - istart
+        if l == 0: return u""
+
+
+        fh.seek(istart)
+
+        if islice.step in (1, None):
+            return fh.read(l)
+
+        return fh.read(l)[::islice.step]
+
+
+    def __str__(self):
+        return self[:]
+
+    def __repr__(self):
+        return "%s('%s', %i..%i)" % (self.__class__.__name__, self.fh.name,
+                                   self.start, self.stop)
+
+    @property
+    def __array_interface__(self):
+        return {
+            'shape': (len(self), ),
+            'typestr': '|S1',
+            'version': 3,
+            'data': buffer(self)
+        }
+
+
+class NpyFastaRecord(FastaRecord):
+    __slots__ = ('start', 'stop', 'mm', 'as_string')
+
+    def __init__(self, mm, start, stop, as_string=True):
+        self.mm = mm
+        self.start = start
+        self.stop = stop
+        self.as_string = as_string
+
+    def __repr__(self):
+        return "%s(%i..%i)" % (self.__class__.__name__,
+                                   self.start, self.stop)
+
+    @classmethod
+    def modify_flat(klass, flat_file):
+        mm = np.memmap(flat_file, dtype="S1", mode="r")
+        return mm
+
+    def getdata(self, islice):
+        if isinstance(islice, (int, long)):
+            if islice >= 0:
+                islice += self.start
+            else:
+                islice += self.stop
+                if islice < 0: raise IndexError
+            return self.mm[islice]
+
+        start, stop = self._adjust_slice(islice)
+        return self.mm[start:stop:islice.step]
+
+    def __getitem__(self, islice):
+        d = self.getdata(islice)
+        return d.tostring().decode() if self.as_string else d
+
+    @property
+    def __array_interface__(self):
+        old_as_string = self.as_string
+        self.as_string = False
+        data = self[:]
+        self.as_string = old_as_string
+        return {
+            'shape': (len(self), ),
+            'typestr': '|S1',
+            'version': 3,
+            'data': data,
+        }
+
+
+class MemoryRecord(FastaRecord):
+    """
+    dont write anything to disk, just read the whole thing
+    into memory
+    """
+    @classmethod
+    def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace=False):
+        f = fasta_obj.fasta_name
+        seqs = {}
+        idx = {}
+        for seqid, seq in seqinfo_generator:
+            seqs[seqid] = (seq, None)
+
+        return seqs, seqs
+
+    def __init__(self, _, seq, _none):
+        self.seq = seq
+
+    def __getitem__(self, slice):
+        return self.seq.__getitem__(slice)
+
+    def __len__(self):
+        return len(self.seq)
+
+
+
+try:
+    import tc
+    class HDB(tc.HDB):
+        def __getitem__(self, k):
+            return cPickle.loads(tc.HDB.get(self, k))
+        def __setitem__(self, k, v):
+            tc.HDB.put(self, k, cPickle.dumps(v, -1))
+        def __del__(self):
+            tc.HDB.close(self)
+
+    class TCRecord(NpyFastaRecord):
+        idx = ".tct"
+
+        @classmethod
+        def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace):
+            f = fasta_obj.fasta_name
+            if klass.is_current(f):
+                idx = HDB()
+                idx.open(f + klass.idx, tc.HDBOREADER)
+                if flatten_inplace or ext_is_flat(f + klass.ext): flat = klass.modify_flat(f)
+                else: flat = klass.modify_flat(f + klass.ext)
+                return idx, flat
+
+
+            db = HDB(f + klass.idx, tc.HDBOWRITER | tc.HDBOCREAT)
+            flatfh = open(f + klass.ext, 'w')
+            for i, (seqid, seq) in enumerate(seqinfo_generator):
+                if flatten_inplace:
+                    if i == 0:
+                        flatfh.write('>%s\n' % seqid)
+                    else:
+                        flatfh.write('\n>%s\n' % seqid)
+                start = flatfh.tell()
+                flatfh.write(seq)
+                stop = flatfh.tell() 
+                db[seqid] = (start, stop)
+
+            db.sync()
+            flatfh.close()
+
+            if flatten_inplace:
+                klass.copy_inplace(flatfh.name, f)
+                return db, klass.modify_flat(f)
+            return db, klass.modify_flat(f + klass.ext)
+
+
+    __all__.append('TCRecord')
+except ImportError:
+    pass
diff --git a/pyfasta/split_fasta.py b/pyfasta/split_fasta.py
new file mode 100644
index 0000000..c1dcc81
--- /dev/null
+++ b/pyfasta/split_fasta.py
@@ -0,0 +1,231 @@
+from __future__ import print_function
+from pyfasta import Fasta
+import operator
+import collections
+import string
+import sys
+import optparse
+from cStringIO import StringIO
+
+
+def newnames(oldname, n, kmers=None, overlap=None, header=None):
+    """
+    >>> newnames('some.fasta', 1)
+    ['some.split.fasta']
+
+    >>> newnames('some.fasta', 2)
+    ['some.0.fasta', 'some.1.fasta']
+
+    >>> newnames('some', 2)
+    ['some.0', 'some.1']
+
+    >>> newnames('some.fasta', 2, kmers=1000)
+    ['some.0.1Kmer.fasta', 'some.1.1Kmer.fasta']
+
+    >>> newnames('some.fasta', 2, kmers=10000, overlap=2000)
+    ['some.0.10Kmer.2Koverlap.fasta', 'some.1.10Kmer.2Koverlap.fasta']
+
+    >>> newnames('some.fasta', 1, kmers=100000, overlap=2000)
+    ['some.split.100Kmer.2Koverlap.fasta']
+
+    """
+    if kmers and kmers % 1000 == 0: kmers = "%iK" % (kmers/1000)
+    if overlap and overlap % 1000 == 0: overlap = "%iK" % (overlap/1000)
+
+    p = oldname.rfind("fa")
+    kstr = kmers is not None and ("%smer." % kmers) or ""
+    ostr = overlap is not None and ("%soverlap." % overlap) or ""
+    if p != -1:
+        pattern = oldname[:p] + "%s." + kstr + ostr + oldname[p:]
+    else:
+        pattern = oldname + kstr + ostr + ".%s"
+
+    
+    
+
+    if n == 1:
+        names = [pattern % "split"]
+    else:
+        width = len(str(n))
+        names = [pattern % str(i).rjust(width, '0') for i in range(n)]
+    print("creating new files:", file=sys.stderr)
+    print("\n".join(names), file=sys.stderr)
+    return names
+
+
+def print_to_fh(fh, fasta, lens, seqinfo):
+    key, seqlen = seqinfo
+    lens[fh.name] += seqlen
+    f = fasta
+    assert len(str(f[key])) == seqlen, (key, seqlen, len(str(f[key])))
+    print(">%s" % key, file=fh)
+    print(str(f[key]), file=fh)
+
+
+def format_kmer(seqid, start):
+    """
+    prints out a header with 1-based indexing.
+
+    >>> format_kmer('chr3', 1000)
+    'chr3_1001'
+    """
+    return "%s_%i" % (seqid, start + 1)
+
+def split(args):
+    parser = optparse.OptionParser("""\
+   split a fasta file into separated files.
+        pyfasta split -n 6 [-k 5000 ] some.fasta
+    the output will be some.0.fasta, some.1.fasta ... some.6.fasta
+    the sizes will be as even as reasonable.
+   """)
+    parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
+       help="""this overrides all other options. if specified, it will
+               split the file into a separate file for each header. it
+               will be a template specifying the file name for each new file.
+               e.g.:    "%(fasta)s.%(seqid)s.fasta"
+               where 'fasta' is the basename of the input fasta file and seqid
+               is the header of each entry in the fasta file.""" ,default=None)
+
+    parser.add_option("-n", "--n", type="int", dest="nsplits", 
+                            help="number of new files to create")
+    parser.add_option("-o", "--overlap", type="int", dest="overlap", 
+                            help="overlap in basepairs", default=0)
+    parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
+                     help="""\
+    split big files into pieces of this size in basepairs. default
+    default of -1 means do not split the sequence up into k-mers, just
+    split based on the headers. a reasonable value would be 10Kbp""")
+    options, fasta = parser.parse_args(args)
+    if not (fasta and (options.nsplits or options.header)):
+        sys.exit(parser.print_help())
+
+    if isinstance(fasta, (tuple, list)):
+        assert len(fasta) == 1, fasta
+        fasta = fasta[0]
+
+    kmer = options.kmers if options.kmers != -1 else None
+    overlap = options.overlap if options.overlap != 0 else None
+    f = Fasta(fasta)
+    if options.header:
+        names = dict([(seqid, options.header % \
+                      dict(fasta=f.fasta_name, seqid=seqid)) \
+                                       for seqid in f.iterkeys()])
+        """
+        if len(names) > 0:
+            assert names[0][1] != names[1][1], ("problem with header format", options.header)
+        fhs = dict([(seqid, open(fn, 'wb')) for seqid, fn in names[:200]])
+        fhs.extend([(seqid, StringIO(), fn) for seqid, fn in names[200:]])
+        """
+        return with_header_names(f, names)
+    else:
+        names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap, 
+                     header=options.header)
+
+        #fhs = [open(n, 'wb') for n in names]
+    if options.kmers == -1:
+        return without_kmers(f, names)
+    else: 
+        return with_kmers(f, names, options.kmers, options.overlap)
+
+def with_header_names(f, names):
+    """
+    split the fasta into the files in fhs by headers.
+    """
+    for seqid, name in names.iteritems():
+        with open(name, 'w') as fh:
+            print(">%s" % seqid, file=fh)
+            print(str(f[seqid]), file=fh)
+
+def with_kmers(f, names, k, overlap):
+    """
+    split the sequences in Fasta object `f` into pieces of length `k`
+    with the given `overlap` the results are written to the array of files
+    `fhs`
+    """
+    fhs = [open(name, 'w') for name in names]
+    i = 0
+    for seqid in f.iterkeys():
+        seq = f[seqid]
+        for (start0, subseq) in Fasta.as_kmers(seq, k, overlap=overlap):
+
+            fh = fhs[i % len(fhs)]
+            print(">%s" % format_kmer(seqid, start0), file=fh)
+            print(subseq, file=fh)
+            i += 1
+    for fh in fhs:
+        fh.close()
+
+def without_kmers(f, names):
+    """
+    long crappy function that does not solve the bin-packing problem.
+    but attempts to distribute the sequences in Fasta object `f` evenly
+    among the file handles in fhs.
+    """
+    fhs = [open(name, 'w') for name in names]
+    name2fh = dict([(fh.name, fh) for fh in fhs])
+    items = sorted([(key, len(f[key])) for key in f.iterkeys()],
+                   key=operator.itemgetter(1))
+
+    l1 = len(items) - 1
+    l0 = 0
+    lens = collections.defaultdict(int)
+
+    n_added = 0
+    while l0 < l1:
+        fh = fhs[n_added % len(fhs)]
+        added = False
+        if n_added >= len(fhs):
+
+            while l1 > l0:
+                lmin = min(lens.itervalues())
+                lmax = max(lens.itervalues())
+                if float(lmin) / lmax < 0.80:
+                    # it's way off, so add a large (l1)
+                    name = find_name_from_len(lmin, lens)
+                    fh = name2fh[name]
+                    print_to_fh(fh, f, lens, items[l1])
+                    l1 -= 1
+                    added = True
+                    n_added += 1
+
+                elif float(lmin) / lmax < 0.94:
+                    # it's just a little off so add a small (l0)
+                    name = find_name_from_len(lmin, lens)
+                    fh = name2fh[name]
+                    print_to_fh(fh, f, lens, items[l0])
+                    l0 += 1
+                    added = True
+                    n_added += 1
+                else:
+                    break
+
+                if not added:
+                    break
+                # TODO: test this on glycine
+                #added = False
+        if added:
+            continue
+
+        print_to_fh(fh, f, lens, items[l1])
+        l1 -= 1
+        n_added += 1
+
+    if l0 == l1:
+        fh = fhs[l0 % len(fhs)]
+        print_to_fh(fh, f, lens, items[l0])
+
+    for fh in fhs:
+        fh.close()
+
+
+def find_name_from_len(lmin, lens):
+    """
+    reverse lookup, get name from dict
+    >>> lens = {'chr1': 4, 'chr2': 5, 'chr3': 4}
+    >>> find_name_from_len(5, lens)
+    'chr2'
+    """
+    for fname, l in lens.iteritems():
+        if l == lmin: 
+            return fname
+    raise Exception('name not found')
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..1e9a0d8
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,13 @@
+[nosetests]
+with-doctest = 1
+doctest-extension = .txt
+nocapture = 1
+detailed-errors = 1
+with-coverage = 1
+cover-package = pyfasta
+
+[egg_info]
+tag_build = 
+tag_date = 0
+tag_svn_revision = 0
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e165937
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,40 @@
+from setuptools import setup, find_packages
+
+
+version = '0.5.2'
+
+# Run 2to3 builder if we're on Python 3.x, from
+#   http://wiki.python.org/moin/PortingPythonToPy3k
+try:
+    from distutils.command.build_py import build_py_2to3 as build_py
+except ImportError:
+    # 2.x
+    from distutils.command.build_py import build_py
+command_classes = {'build_py': build_py}
+
+setup(name='pyfasta',
+      version=version,
+      description=\
+        "fast, memory-efficient, pythonic (and command-line) access to fasta sequence files",
+      url="http://github.com/brentp/pyfasta/",
+      long_description=open('README.rst').read() + "\n" + open('CHANGELOG.txt').read(),
+      classifiers=["Topic :: Scientific/Engineering :: Bio-Informatics"],
+      keywords='bioinformatics blast fasta',
+      author='brentp',
+      author_email='bpederse at gmail.com',
+      license='MIT',
+      packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
+      package_data={'':['CHANGELOG.txt']},
+      include_package_data=True,
+      tests_require=['nose'],
+      test_suite='nose.collector',
+      zip_safe=False,
+      install_requires=[
+          # -*- Extra requirements: -*-
+      ],
+      scripts=[],
+      entry_points={
+      'console_scripts': ['pyfasta = pyfasta:main']
+      },
+      cmdclass=command_classes,
+  )
diff --git a/tests/bench.py b/tests/bench.py
new file mode 100644
index 0000000..ab779ad
--- /dev/null
+++ b/tests/bench.py
@@ -0,0 +1,51 @@
+from __future__ import print_function
+from itertools import islice
+import sys
+import os
+sys.path.insert(0, os.path.abspath("."))
+import pyfasta
+from pyfasta import Fasta
+
+import time
+import random
+random.seed(1234)
+
+SEQLEN = 100000
+def make_long_fasta(filename="t.fasta", nrecs=2500, seqlen=SEQLEN):
+    fh = open(filename, 'w')
+        #0123456789"
+    s = "ACTGACTGAC"
+    for i in range(nrecs):
+        print(">header%i" % i, file=fh)
+        print(s * (seqlen/10), file=fh)
+
+    fh.close()
+    return filename
+
+def read(f, nreads=40000, seqlen=SEQLEN):
+
+    for k in islice(f.iterkeys(), 10):
+        for i in range(nreads):
+            start = random.randint(0, seqlen)
+            end = min(seqlen, start + random.randint(1000, 2000))
+            str(f[k][start:end])
+        
+
+
+def main():
+    fa = make_long_fasta()
+
+    t = time.time()
+    f = Fasta(fa)
+    print("flatten:", time.time() - t)
+
+    
+    t = time.time()
+    read(f)
+    print("read:", time.time() - t)
+
+     
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/data/dups.fasta b/tests/data/dups.fasta
new file mode 100644
index 0000000..435e665
--- /dev/null
+++ b/tests/data/dups.fasta
@@ -0,0 +1,4 @@
+>a
+aaa
+>a
+bbb
diff --git a/tests/data/key.fasta b/tests/data/key.fasta
new file mode 100644
index 0000000..db8f320
--- /dev/null
+++ b/tests/data/key.fasta
@@ -0,0 +1,6 @@
+>a extra
+a
+>b extra
+b
+>c extra
+c
diff --git a/tests/data/three_chrs.fasta b/tests/data/three_chrs.fasta
new file mode 100644
index 0000000..d88292e
--- /dev/null
+++ b/tests/data/three_chrs.fasta
@@ -0,0 +1,30 @@
+
+>chr1
+ACTGACTGACTGACTGACTGACTGACTGACTGACTGA
+CTGACTGACTGACTGACTGACT
+GACTGACTGACTGACTGACTG
+
+>chr2
+TAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAT
+
+
+
+
+>chr3
+ACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCAC [...]
+
+
+
+
diff --git a/tests/test_all.py b/tests/test_all.py
new file mode 100644
index 0000000..6dcc6c0
--- /dev/null
+++ b/tests/test_all.py
@@ -0,0 +1,255 @@
+from pyfasta import Fasta
+from pyfasta.records import NpyFastaRecord, MemoryRecord, FastaRecord
+record_classes = [NpyFastaRecord, MemoryRecord, FastaRecord]
+from pyfasta import DuplicateHeaderException
+
+try:
+    from pyfasta.records import TCRecord
+    record_classes.append(TCRecord)
+except ImportError:
+    pass
+
+import os
+import shutil
+from nose.tools import assert_raises
+import numpy as np
+import glob
+
+def _cleanup():
+    for f in glob.glob("tests/data/three_chrs.fasta*") + glob.glob('tests/data/dups.fasta.*'):
+        if f.endswith(".orig"): continue
+        os.unlink(f)
+    shutil.copyfile('tests/data/three_chrs.fasta.orig', 'tests/data/three_chrs.fasta')
+
+
+def test_classes():
+
+    for inplace in (True, False):
+        for klass in record_classes:
+            f = Fasta('tests/data/three_chrs.fasta', record_class=klass, flatten_inplace=inplace)
+            yield check_keys, f
+            yield check_misc, f, klass
+            yield check_contains, f
+            yield check_shape, f
+            yield check_bounds, f
+            yield check_tostring, f
+            yield check_kmers, f
+            yield check_kmer_overlap, f
+            yield check_slice_size, f
+            yield check_slice, f
+            yield check_full_slice, f
+            yield check_array_copy, f
+            yield check_array, f
+            yield check_one_based, f
+
+            fasta_name = f.fasta_name
+
+            del f
+
+            yield check_keyfn, 'tests/data/key.fasta', klass, inplace
+
+            yield check_keyfn2, 'tests/data/key.fasta', klass, inplace
+
+            yield check_reload, klass, fasta_name
+
+            yield check_duplicates, klass, inplace
+
+            _cleanup()
+
+
+def check_duplicates(klass, inplace):
+    assert_raises(DuplicateHeaderException,
+                  lambda: Fasta('tests/data/dups.fasta',
+                                record_class=klass, flatten_inplace=inplace))
+
+
+def check_keys(f):
+    assert sorted(f.keys()) == ['chr1', 'chr2', 'chr3']
+    assert sorted(f.iterkeys()) == ['chr1', 'chr2', 'chr3']
+
+def fix(path):
+    import os.path as op
+
+    for ext in (".gdx", ".flat"):
+        if op.exists(path + ext):
+            os.unlink(path + ext)
+
+    shutil.copyfile(path + ".orig", path)
+
+def check_keyfn(path, klass, inplace):
+    f = Fasta(path, record_class=klass, flatten_inplace=inplace, key_fn=lambda key: key.split()[0])
+    assert sorted(f.keys()) == ['a', 'b', 'c'], f.keys()
+    fix(path)
+    ff = Fasta(path, record_class=klass, flatten_inplace=inplace)
+    assert sorted(ff.keys()) == ['a extra', 'b extra', 'c extra'], (ff.keys(), klass)
+    fix(path)
+
+def check_keyfn2(path, klass, inplace):
+    f = Fasta(path, record_class=klass, flatten_inplace=inplace, key_fn=lambda
+            key: "-".join(key.split()))
+
+    assert sorted(f.keys()) == ['a-extra', 'b-extra', 'c-extra'], f.keys()
+
+    assert f['a-extra']
+    fix(path)
+
+
+def check_reload(klass, fasta_name):
+
+    m = ""
+    if klass.__name__ in ('FastaRecord',): # 'NpyFastaRecord'):
+        m = os.stat(fasta_name + klass.ext).st_mtime 
+
+    f = Fasta(fasta_name, record_class=klass)
+    assert f
+    del f
+    if klass.__name__ in ('FastaRecord',): # 'NpyFastaRecord'):
+        assert os.stat(fasta_name + klass.ext).st_mtime  == m
+
+
+def check_full_slice(f):
+    for k in f.iterkeys():
+        assert str(f[k]) == f[k][:]
+        assert str(f[k]) == f[k][0:]
+
+
+        assert str(f[k])[::2] == f[k][0:][::2]
+        assert str(f[k])[::2] == f[k][:][::2]
+
+def check_contains(f):
+
+    assert not '_________' in f
+    assert 'chr2' in f
+    
+def check_misc(f, klass):
+    seq = f['chr2']
+    assert seq.__class__ == klass
+
+    assert (seq[0] == 'T')
+    assert (seq[-1] == 'T')
+
+    assert (f['chr3'][0:5][::-1] == 'ACGCA')
+
+
+    for i in (1, len(seq) -2):
+        assert (seq[i] == 'A')
+
+    for i in (1, len(seq) -3):
+        assert (seq[i: i + 2] == 'AA')
+    
+    assert (seq[1] == 'A')
+
+    assert (seq[-1] == 'T')
+    assert (seq[-2] == 'A')
+
+    assert (seq[0] == 'T')
+    assert (seq[6:9] == 'AAA')
+
+    seq = 'TACGCACGCTAC'
+    assert (seq == f['chr3'][-12:])
+    assert (seq[:4] == f['chr3'][-12:-8])
+
+def check_shape(f):
+    assert (len(f['chr2']) == 80)
+    assert (len(f['chr3']) == 3600)
+    assert (f['chr2'][:2] == 'TA')
+
+    assert (f['chr3'][:5] == 'ACGCA')
+
+def check_bounds(f):
+    c2 = f['chr2']
+    assert (len(str(c2[0:900])) == 80)
+
+    assert (c2[99:99] == "")
+    assert (c2[99:99] == "")
+    assert (c2[80:81] == "")
+    assert (c2[79:81] == "T")
+
+    assert (c2[800:810] == "")
+    assert_raises(IndexError, lambda: c2[-800])
+
+
+    assert (c2[-800:-810] == "")
+
+def check_array(f):
+
+    s = f.sequence({'chr': 'chr2', 'start': 1, 'stop':3}, asstring=False)
+    assert np.all(s == np.array(['T', 'A', 'A'], dtype="|S1"))
+
+
+def check_tostring(f):
+    s = 'TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT'
+    assert (str(f['chr2']) == s)
+
+def check_kmers(f):
+    seq = str(f['chr2'])
+
+    kmers = list(Fasta.as_kmers(f['chr2'], 10))
+    assert (len(kmers) == len(seq) / 10)
+    assert (kmers[0] == (0, seq[:10]))
+
+    seqs = [k[1] for k in kmers]
+    assert ("".join(seqs) == seq)
+    last_pair = kmers[-1]
+    assert (seqs[-1][-1] == 'T')
+
+    seq = str(f['chr3'])
+    kmers = list(Fasta.as_kmers(f['chr3'], 1))
+    assert (kmers[2][0] == 2)
+    seqs = [k[1] for k in kmers]
+    assert ("".join(seqs) == seq)
+
+
+def check_kmer_overlap(f):
+    chr2 = f['chr2']
+
+    kmers = Fasta.as_kmers(chr2, 10, overlap=2)
+    for i, k in enumerate(list(kmers)[:-1]):
+        assert (len(k[1]) == 10)
+        assert (k[0] == (i * (10 - 2)))
+
+    kmers = Fasta.as_kmers(chr2, 10, overlap=4)
+    seqs = [k[1] for k in kmers]
+    paired_seqs = zip(seqs[0:-1], seqs[1:])
+    for a, b in paired_seqs:
+        if len(a) < 4 or len(b) < 4: continue
+        assert (a[-4:] == b[:4])
+
+def check_slice_size(f):
+    assert (f['chr3'][:7] == 'ACGCATT')
+    # take the first basepair of each codon
+    assert (f['chr3'][0:7:3] == 'ACT')
+    # take the 2nd basepair of each codon
+    assert (f['chr3'][1:7:3] == 'CA')
+
+def check_slice(f):
+    assert (str(f['chr3'][:4]) == 'ACGC')
+
+
+    seq = 'TACGCACGCTAC'
+    assert (seq == f['chr3'][-12:])
+
+    assert (seq[:4] == f['chr3'][-12:-8])
+
+    assert (f['chr3'][0:5][::-1] == 'ACGCA')
+    assert (f['chr3'][0:5] == 'ACGCA')
+
+
+def check_array_copy(f):
+    # the array is definitely a copy...
+    a = np.array(f['chr3'], np.dtype('S1'))
+    old = f['chr3'][1:5]
+    assert (a.shape[0] == 3600)
+    a[1:5] = np.array('N', dtype='S1')
+    c = f['chr3'][1:5]
+    assert c == old
+
+    assert a[1:5].tostring().decode() == 'NNNN', a[1:5].tostring().decode()
+
+def check_one_based(f):
+    assert f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9})  == 'CTGACTGA'
+    assert f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9}, one_based=False) == 'TGACTGA'
+
+if __name__ == "__main__":
+    import nose
+    nose.main()

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pyfasta.git



More information about the debian-med-commit mailing list