[med-svn] [python-pyfasta] 04/06: New upstream version 0.5.2
Andreas Tille
tille at debian.org
Wed Dec 27 16:58:51 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository python-pyfasta.
commit 752c646fd46fca4f3cf9f469fcbd3a19a53cc965
Author: Andreas Tille <tille at debian.org>
Date: Wed Dec 27 17:57:48 2017 +0100
New upstream version 0.5.2
---
CHANGELOG.txt | 77 +++++++++
MANIFEST.in | 4 +
PKG-INFO | 302 ++++++++++++++++++++++++++++++++++
README.rst | 212 ++++++++++++++++++++++++
debian/README.source | 19 ---
debian/changelog | 5 -
debian/compat | 1 -
debian/control | 48 ------
debian/copyright | 31 ----
debian/rules | 25 ---
debian/source/format | 1 -
debian/watch | 2 -
pyfasta.egg-info/PKG-INFO | 302 ++++++++++++++++++++++++++++++++++
pyfasta.egg-info/SOURCES.txt | 20 +++
pyfasta.egg-info/dependency_links.txt | 1 +
pyfasta.egg-info/entry_points.txt | 3 +
pyfasta.egg-info/not-zip-safe | 1 +
pyfasta.egg-info/top_level.txt | 1 +
pyfasta/__init__.py | 141 ++++++++++++++++
pyfasta/fasta.py | 233 ++++++++++++++++++++++++++
pyfasta/records.py | 295 +++++++++++++++++++++++++++++++++
pyfasta/split_fasta.py | 231 ++++++++++++++++++++++++++
setup.cfg | 13 ++
setup.py | 40 +++++
tests/bench.py | 51 ++++++
tests/data/dups.fasta | 4 +
tests/data/key.fasta | 6 +
tests/data/three_chrs.fasta | 30 ++++
tests/test_all.py | 255 ++++++++++++++++++++++++++++
29 files changed, 2222 insertions(+), 132 deletions(-)
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
new file mode 100644
index 0000000..ad4b80d
--- /dev/null
+++ b/CHANGELOG.txt
@@ -0,0 +1,77 @@
+Changes
+=======
+0.5.2
+-----
+fix complement (@mruffalo)
+
+0.5.0
+-----
+python 3 compatibility thanks to mruffalo
+
+0.4.5
+-----
+pyfasta split can handle > 52 files. (thanks Devtulya)
+
+0.4.4
+-----
+fix pyfasta extract
+
+0.4.3
+-----
+Add 0 or 1-based intervals in sequence() thanks @jamescasbon
+
+0.4.2
+-----
+update for latest numpy (can't close memmap)
+
+0.4.1
+-----
+check for duplicate headers.
+
+0.4.0
+-----
+* add key_fn kwarg to constuctor
+
+0.3.9
+-----
+* only require 'r' (not r+) for memory map.
+
+0.3.8
+-----
+* clean up logic for mixing inplace/non-inplace flattened files.
+ if the inplace is available, it is always used.
+
+0.3.6/7
+-------
+* dont re-flatten the file every time!
+* allow spaces before and after the header in the orginal fasta.
+
+0.3.5
+-----
+
+* update docs in README.txt for new CLI stuff.
+* allow flattening inplace.
+* get rid of memmap (results in faster parsing).
+
+0.3.4
+-----
+
+* restore python2.5 compatiblity.
+* CLI: add ability to exclude sequence from extract
+* CLI: allow spliting based on header.
+
+0.3.3
+-----
+
+* include this file in the tar ball (thanks wen h.)
+
+0.3.2
+-----
+
+* separate out backends into records.py
+
+* use nosetests (python setup.py nosetests)
+
+* add a TCRecord backend for next-gen sequencing availabe if tc is (easy-)installed.
+
+* improve test coverage.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..ac8e293
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+include CHANGELOG.txt
+include README.rst
+include tests/*.py
+include tests/data/*.fasta
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..7b308bd
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,302 @@
+Metadata-Version: 1.1
+Name: pyfasta
+Version: 0.5.2
+Summary: fast, memory-efficient, pythonic (and command-line) access to fasta sequence files
+Home-page: http://github.com/brentp/pyfasta/
+Author: brentp
+Author-email: bpederse at gmail.com
+License: MIT
+Description: ==================================================
+ pyfasta: pythonic access to fasta sequence files.
+ ==================================================
+
+
+ :Author: Brent Pedersen (brentp)
+ :Email: bpederse at gmail.com
+ :License: MIT
+
+ .. contents ::
+
+ Implementation
+ ==============
+
+ Requires Python >= 2.6. Stores a flattened version of the fasta file without
+ spaces or headers and uses either a mmap of numpy binary format or fseek/fread so the
+ *sequence data is never read into memory*. Saves a pickle (.gdx) of the start, stop
+ (for fseek/mmap) locations of each header in the fasta file for internal use.
+
+ Usage
+ =====
+ ::
+
+ >>> from pyfasta import Fasta
+
+ >>> f = Fasta('tests/data/three_chrs.fasta')
+ >>> sorted(f.keys())
+ ['chr1', 'chr2', 'chr3']
+
+ >>> f['chr1']
+ NpyFastaRecord(0..80)
+
+
+
+ Slicing
+ -------
+ ::
+
+ # get full the sequence:
+ >>> a = str(f['chr1'])
+ >>> b = f['chr1'][:]
+ >>> a == b
+ True
+
+ >>> f['chr1'][:10]
+ 'ACTGACTGAC'
+
+ # get the 1st basepair in every codon (it's python yo)
+ >>> f['chr1'][::3]
+ 'AGTCAGTCAGTCAGTCAGTCAGTCAGT'
+
+ # can query by a 'feature' dictionary (note this is one based coordinates)
+ >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9})
+ 'CTGACTGA'
+
+ # same as:
+ >>> f['chr1'][1:9]
+ 'CTGACTGA'
+
+ # use python, zero based coords
+ >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9}, one_based=False)
+ 'TGACTGA'
+
+ # with reverse complement (automatic for - strand)
+ >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9, 'strand': '-'})
+ 'TCAGTCAG'
+
+ Key Function
+ ------------
+ Sometimes your fasta will have a long header like: "AT1G51370.2 | Symbols: | F-box family protein | chr1:19045615-19046748 FORWARD" when you only want to key off: "AT1G51370.2". In this case, specify the key_fn argument to the constructor:
+
+ ::
+
+ >>> fkey = Fasta('tests/data/key.fasta', key_fn=lambda key: key.split()[0])
+ >>> sorted(fkey.keys())
+ ['a', 'b', 'c']
+
+ Numpy
+ =====
+
+ The default is to use a memmaped numpy array as the backend. In which case it's possible to
+ get back an array directly...
+ ::
+
+ >>> f['chr1'].as_string = False
+ >>> f['chr1'][:10] # doctest: +NORMALIZE_WHITESPACE
+ memmap(['A', 'C', 'T', 'G', 'A', 'C', 'T', 'G', 'A', 'C'], dtype='|S1')
+
+ >>> import numpy as np
+ >>> a = np.array(f['chr2'])
+ >>> a.shape[0] == len(f['chr2'])
+ True
+
+ >>> a[10:14] # doctest: +NORMALIZE_WHITESPACE
+ array(['A', 'A', 'A', 'A'], dtype='|S1')
+
+ mask a sub-sequence
+ ::
+
+ >>> a[11:13] = np.array('N', dtype='S1')
+ >>> a[10:14].tostring()
+ 'ANNA'
+
+
+ Backends (Record class)
+ =======================
+ It's also possible to specify another record class as the underlying work-horse
+ for slicing and reading. Currently, there's just the default:
+
+ * NpyFastaRecord which uses numpy memmap
+ * FastaRecord, which uses using fseek/fread
+ * MemoryRecord which reads everything into memory and must reparse the original
+ fasta every time.
+ * TCRecord which is identical to NpyFastaRecord except that it saves the index
+ in a TokyoCabinet hash database, for cases when there are enough records that
+ loading the entire index from a pickle into memory is unwise. (NOTE: that the
+ sequence is not loaded into memory in either case).
+
+ It's possible to specify the class used with the `record_class` kwarg to the `Fasta`
+ constructor:
+ ::
+
+ >>> from pyfasta import FastaRecord # default is NpyFastaRecord
+ >>> f = Fasta('tests/data/three_chrs.fasta', record_class=FastaRecord)
+ >>> f['chr1']
+ FastaRecord('tests/data/three_chrs.fasta.flat', 0..80)
+
+ other than the repr, it should behave exactly like the Npy record class backend
+
+ it's possible to create your own using a sub-class of FastaRecord. see the source
+ in pyfasta/records.py for details.
+
+ Flattening
+ ==========
+ In order to efficiently access the sequence content, pyfasta saves a separate, flattened file with all newlines and headers removed from the sequence. In the case of large fasta files, one may not wish to save 2 copies of a 5GG+ file. In that case, it's possible to flatten the file "inplace", keeping all the headers, and retaining the validity of the fasta file -- with the only change being that the new-lines are removed from each sequence. This can be specified via `flatten_inpl [...]
+ ::
+
+ >>> import os
+ >>> os.unlink('tests/data/three_chrs.fasta.gdx') # cleanup non-inplace idx
+ >>> f = Fasta('tests/data/three_chrs.fasta', flatten_inplace=True)
+ >>> f['chr1'] # note the difference in the output from above.
+ NpyFastaRecord(6..86)
+
+ # sequence from is same as when requested from non-flat file above.
+ >>> f['chr1'][1:9]
+ 'CTGACTGA'
+
+ # the flattened file is kept as a place holder without the sequence data.
+ >>> open('tests/data/three_chrs.fasta.flat').read()
+ '@flattened@'
+
+
+ Command Line Interface
+ ======================
+ there's also a command line interface to manipulate / view fasta files.
+ the `pyfasta` executable is installed via setuptools, running it will show
+ help text.
+
+ split a fasta file into 6 new files of relatively even size:
+
+ $ pyfasta **split** -n 6 original.fasta
+
+ split the fasta file into one new file per header with "%(seqid)s" being filled into each filename.:
+
+ $ pyfasta **split** --header "%(seqid)s.fasta" original.fasta
+
+ create 1 new fasta file with the sequence split into 10K-mers:
+
+ $ pyfasta **split** -n 1 -k 10000 original.fasta
+
+ 2 new fasta files with the sequence split into 10K-mers with 2K overlap:
+
+ $ pyfasta **split** -n 2 -k 10000 -o 2000 original.fasta
+
+
+ show some info about the file (and show gc content):
+
+ $ pyfasta **info** --gc test/data/three_chrs.fasta
+
+
+ **extract** sequence from the file. use the header flag to make
+ a new fasta file. the args are a list of sequences to extract.
+
+ $ pyfasta **extract** --header --fasta test/data/three_chrs.fasta seqa seqb seqc
+
+ **extract** sequence from a file using a file containing the headers *not* wanted in the new file:
+
+ $ pyfasta extract --header --fasta input.fasta --exclude --file seqids_to_exclude.txt
+
+ **extract** sequence from a fasta file with complex keys where we only want to lookup based on the part before the space.
+
+ $ pyfasta extract --header --fasta input.with.keys.fasta --space --file seqids.txt
+
+ **flatten** a file inplace, for faster later use by pyfasta, and without creating another copy. (`Flattening`_)
+
+ $ pyfasta flatten input.fasta
+
+ cleanup
+ =======
+ (though for real use these will remain for faster access)
+ ::
+
+ >>> os.unlink('tests/data/three_chrs.fasta.gdx')
+ >>> os.unlink('tests/data/three_chrs.fasta.flat')
+
+ Testing
+ =======
+ there is currently > 99% test coverage for the 2 modules and all included
+ record classes. to run the tests:
+ ::
+
+ $ python setup.py nosetests
+
+ Changes
+ =======
+ 0.5.2
+ -----
+ fix complement (@mruffalo)
+
+ 0.5.0
+ -----
+ python 3 compatibility thanks to mruffalo
+
+ 0.4.5
+ -----
+ pyfasta split can handle > 52 files. (thanks Devtulya)
+
+ 0.4.4
+ -----
+ fix pyfasta extract
+
+ 0.4.3
+ -----
+ Add 0 or 1-based intervals in sequence() thanks @jamescasbon
+
+ 0.4.2
+ -----
+ update for latest numpy (can't close memmap)
+
+ 0.4.1
+ -----
+ check for duplicate headers.
+
+ 0.4.0
+ -----
+ * add key_fn kwarg to constuctor
+
+ 0.3.9
+ -----
+ * only require 'r' (not r+) for memory map.
+
+ 0.3.8
+ -----
+ * clean up logic for mixing inplace/non-inplace flattened files.
+ if the inplace is available, it is always used.
+
+ 0.3.6/7
+ -------
+ * dont re-flatten the file every time!
+ * allow spaces before and after the header in the orginal fasta.
+
+ 0.3.5
+ -----
+
+ * update docs in README.txt for new CLI stuff.
+ * allow flattening inplace.
+ * get rid of memmap (results in faster parsing).
+
+ 0.3.4
+ -----
+
+ * restore python2.5 compatiblity.
+ * CLI: add ability to exclude sequence from extract
+ * CLI: allow spliting based on header.
+
+ 0.3.3
+ -----
+
+ * include this file in the tar ball (thanks wen h.)
+
+ 0.3.2
+ -----
+
+ * separate out backends into records.py
+
+ * use nosetests (python setup.py nosetests)
+
+ * add a TCRecord backend for next-gen sequencing availabe if tc is (easy-)installed.
+
+ * improve test coverage.
+
+Keywords: bioinformatics blast fasta
+Platform: UNKNOWN
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..e6946aa
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,212 @@
+==================================================
+pyfasta: pythonic access to fasta sequence files.
+==================================================
+
+
+:Author: Brent Pedersen (brentp)
+:Email: bpederse at gmail.com
+:License: MIT
+
+.. contents ::
+
+Implementation
+==============
+
+Requires Python >= 2.6. Stores a flattened version of the fasta file without
+spaces or headers and uses either a mmap of numpy binary format or fseek/fread so the
+*sequence data is never read into memory*. Saves a pickle (.gdx) of the start, stop
+(for fseek/mmap) locations of each header in the fasta file for internal use.
+
+Usage
+=====
+::
+
+ >>> from pyfasta import Fasta
+
+ >>> f = Fasta('tests/data/three_chrs.fasta')
+ >>> sorted(f.keys())
+ ['chr1', 'chr2', 'chr3']
+
+ >>> f['chr1']
+ NpyFastaRecord(0..80)
+
+
+
+Slicing
+-------
+::
+
+ # get full the sequence:
+ >>> a = str(f['chr1'])
+ >>> b = f['chr1'][:]
+ >>> a == b
+ True
+
+ >>> f['chr1'][:10]
+ 'ACTGACTGAC'
+
+ # get the 1st basepair in every codon (it's python yo)
+ >>> f['chr1'][::3]
+ 'AGTCAGTCAGTCAGTCAGTCAGTCAGT'
+
+ # can query by a 'feature' dictionary (note this is one based coordinates)
+ >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9})
+ 'CTGACTGA'
+
+ # same as:
+ >>> f['chr1'][1:9]
+ 'CTGACTGA'
+
+ # use python, zero based coords
+ >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9}, one_based=False)
+ 'TGACTGA'
+
+ # with reverse complement (automatic for - strand)
+ >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9, 'strand': '-'})
+ 'TCAGTCAG'
+
+Key Function
+------------
+Sometimes your fasta will have a long header like: "AT1G51370.2 | Symbols: | F-box family protein | chr1:19045615-19046748 FORWARD" when you only want to key off: "AT1G51370.2". In this case, specify the key_fn argument to the constructor:
+
+::
+
+ >>> fkey = Fasta('tests/data/key.fasta', key_fn=lambda key: key.split()[0])
+ >>> sorted(fkey.keys())
+ ['a', 'b', 'c']
+
+Numpy
+=====
+
+The default is to use a memmaped numpy array as the backend. In which case it's possible to
+get back an array directly...
+::
+
+ >>> f['chr1'].as_string = False
+ >>> f['chr1'][:10] # doctest: +NORMALIZE_WHITESPACE
+ memmap(['A', 'C', 'T', 'G', 'A', 'C', 'T', 'G', 'A', 'C'], dtype='|S1')
+
+ >>> import numpy as np
+ >>> a = np.array(f['chr2'])
+ >>> a.shape[0] == len(f['chr2'])
+ True
+
+ >>> a[10:14] # doctest: +NORMALIZE_WHITESPACE
+ array(['A', 'A', 'A', 'A'], dtype='|S1')
+
+mask a sub-sequence
+::
+
+ >>> a[11:13] = np.array('N', dtype='S1')
+ >>> a[10:14].tostring()
+ 'ANNA'
+
+
+Backends (Record class)
+=======================
+It's also possible to specify another record class as the underlying work-horse
+for slicing and reading. Currently, there's just the default:
+
+ * NpyFastaRecord which uses numpy memmap
+ * FastaRecord, which uses using fseek/fread
+ * MemoryRecord which reads everything into memory and must reparse the original
+ fasta every time.
+ * TCRecord which is identical to NpyFastaRecord except that it saves the index
+ in a TokyoCabinet hash database, for cases when there are enough records that
+ loading the entire index from a pickle into memory is unwise. (NOTE: that the
+ sequence is not loaded into memory in either case).
+
+It's possible to specify the class used with the `record_class` kwarg to the `Fasta`
+constructor:
+::
+
+ >>> from pyfasta import FastaRecord # default is NpyFastaRecord
+ >>> f = Fasta('tests/data/three_chrs.fasta', record_class=FastaRecord)
+ >>> f['chr1']
+ FastaRecord('tests/data/three_chrs.fasta.flat', 0..80)
+
+other than the repr, it should behave exactly like the Npy record class backend
+
+it's possible to create your own using a sub-class of FastaRecord. see the source
+in pyfasta/records.py for details.
+
+Flattening
+==========
+In order to efficiently access the sequence content, pyfasta saves a separate, flattened file with all newlines and headers removed from the sequence. In the case of large fasta files, one may not wish to save 2 copies of a 5GG+ file. In that case, it's possible to flatten the file "inplace", keeping all the headers, and retaining the validity of the fasta file -- with the only change being that the new-lines are removed from each sequence. This can be specified via `flatten_inplace` = True
+::
+
+ >>> import os
+ >>> os.unlink('tests/data/three_chrs.fasta.gdx') # cleanup non-inplace idx
+ >>> f = Fasta('tests/data/three_chrs.fasta', flatten_inplace=True)
+ >>> f['chr1'] # note the difference in the output from above.
+ NpyFastaRecord(6..86)
+
+ # sequence from is same as when requested from non-flat file above.
+ >>> f['chr1'][1:9]
+ 'CTGACTGA'
+
+ # the flattened file is kept as a place holder without the sequence data.
+ >>> open('tests/data/three_chrs.fasta.flat').read()
+ '@flattened@'
+
+
+Command Line Interface
+======================
+there's also a command line interface to manipulate / view fasta files.
+the `pyfasta` executable is installed via setuptools, running it will show
+help text.
+
+split a fasta file into 6 new files of relatively even size:
+
+ $ pyfasta **split** -n 6 original.fasta
+
+split the fasta file into one new file per header with "%(seqid)s" being filled into each filename.:
+
+ $ pyfasta **split** --header "%(seqid)s.fasta" original.fasta
+
+create 1 new fasta file with the sequence split into 10K-mers:
+
+ $ pyfasta **split** -n 1 -k 10000 original.fasta
+
+2 new fasta files with the sequence split into 10K-mers with 2K overlap:
+
+ $ pyfasta **split** -n 2 -k 10000 -o 2000 original.fasta
+
+
+show some info about the file (and show gc content):
+
+ $ pyfasta **info** --gc test/data/three_chrs.fasta
+
+
+**extract** sequence from the file. use the header flag to make
+a new fasta file. the args are a list of sequences to extract.
+
+ $ pyfasta **extract** --header --fasta test/data/three_chrs.fasta seqa seqb seqc
+
+**extract** sequence from a file using a file containing the headers *not* wanted in the new file:
+
+ $ pyfasta extract --header --fasta input.fasta --exclude --file seqids_to_exclude.txt
+
+**extract** sequence from a fasta file with complex keys where we only want to lookup based on the part before the space.
+
+ $ pyfasta extract --header --fasta input.with.keys.fasta --space --file seqids.txt
+
+**flatten** a file inplace, for faster later use by pyfasta, and without creating another copy. (`Flattening`_)
+
+ $ pyfasta flatten input.fasta
+
+cleanup
+=======
+(though for real use these will remain for faster access)
+::
+
+ >>> os.unlink('tests/data/three_chrs.fasta.gdx')
+ >>> os.unlink('tests/data/three_chrs.fasta.flat')
+
+Testing
+=======
+there is currently > 99% test coverage for the 2 modules and all included
+record classes. to run the tests:
+::
+
+ $ python setup.py nosetests
diff --git a/debian/README.source b/debian/README.source
deleted file mode 100644
index 5026a7e..0000000
--- a/debian/README.source
+++ /dev/null
@@ -1,19 +0,0 @@
-As documented in the original ITP
-
- https://bugs.debian.org/801721
-
-this package showed problems in the unit tests. Due to the upstream author
-
- Date: Wed, 14 Oct 2015 15:04:31 -0600
- From: Brent Pedersen <bpederse at gmail.com>
- To: Andreas Tille <andreas at an3as.eu>
- Subject: Re: Test failures with pyfasta
-
- hmm, that's not good. but still, I recommend to use pyfaidx as I'm no
- longer working on pyfaidx.
-
-rather pyfaidx should be packaged. So there is no intent to upload this
-package for the moment but the packaging should work when switching of
-the test suite.
-
- -- Andreas Tille <tille at debian.org> Thu, 15 Oct 2015 08:24:03 +0200
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index f866d87..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,5 +0,0 @@
-python-pyfasta (0.5.2-1) NONE; urgency=medium
-
- * No intend to upload - see README.source
-
- -- Andreas Tille <tille at debian.org> Thu, 15 Oct 2015 08:24:03 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index d9ab5fd..0000000
--- a/debian/control
+++ /dev/null
@@ -1,48 +0,0 @@
-Source: python-pyfasta
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Andreas Tille <tille at debian.org>
-Section: python
-Priority: optional
-Build-Depends: debhelper (>= 9),
- dh-python,
- python-all (>= 2.7),
- python-coverage,
- python-setuptools,
- python-nose,
- python-numpy,
- python3-all,
- python3-setuptools,
- python3-nose,
- python3-numpy
-Standards-Version: 3.9.6
-Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/python-pyfasta/trunk/
-Vcs-Svn: svn://anonscm.debian.org/debian-med/trunk/packages/python-pyfasta/trunk/
-Homepage: https://pypi.python.org/pypi/pyfasta/
-
-Package: python-pyfasta
-Architecture: all
-Depends: ${misc:Depends},
- ${python:Depends}
-Description: fast, memory-efficient Python 2 access to fasta sequence files
- In bioinformatics, FASTA format is a text-based format for representing
- either nucleotide sequences or peptide sequences, in which nucleotides
- or amino acids are represented using single-letter codes. The format
- also allows for sequence names and comments to precede the sequences.
- The format originates from the FASTA software package, but has now
- become a standard in the field of bioinformatics.
- .
- This package provides the Python 2 modules to access fasta files.
-
-Package: python3-pyfasta
-Architecture: all
-Depends: ${misc:Depends},
- ${python3:Depends}
-Description: fast, memory-efficient Python 3 access to fasta sequence files
- In bioinformatics, FASTA format is a text-based format for representing
- either nucleotide sequences or peptide sequences, in which nucleotides
- or amino acids are represented using single-letter codes. The format
- also allows for sequence names and comments to precede the sequences.
- The format originates from the FASTA software package, but has now
- become a standard in the field of bioinformatics.
- .
- This package provides the Python 3 modules to access fasta files.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index d2d428e..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,31 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: pyfasta
-Upstream-Contact: Brent Pedersen <bpederse at gmail.com>
-Source: https://pypi.python.org/pypi/pyfasta
-
-Files: *
-Copyright: 2011-2015 Brent Pedersen <bpederse at gmail.com>
-License: MIT
-
-Files: debian/*
-Copyright: 2015 Andreas Tille <tille at debian.org>
-License: MIT
-
-License: MIT
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- .
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- .
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 6c7f5fb..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/make -f
-# -*- makefile -*-
-
-# Uncomment this to turn on verbose mode.
-#export DH_VERBOSE=1
-
-export PYBUILD_NAME=pyfasta
-export PYBUILD_TEST_ARGS={dir}/tests
-
-override_dh_auto_test:
- LC_ALL=C.UTF-8 dh_auto_test -- --test --system=custom \
- --test-args='set -e; \
- set -x; \
- cp -a tests {build_dir}; \
- {interpreter} run_tests.py --offline'
-
-run_tests:
- cp -a tests/data/key.fasta tests/data/key.fasta.orig
- cp -a tests/data/three_chrs.fasta tests/data/three_chrs.fasta.orig
- python -m nose tests
-
-
-%:
- dh $@ --with python2,python3 --buildsystem=pybuild
-
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index bda404f..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,2 +0,0 @@
-version=3
-https://pypi.python.org/pypi/pyfasta .*/pyfasta-([0-9.]+).tar.[xgb]z2?
diff --git a/pyfasta.egg-info/PKG-INFO b/pyfasta.egg-info/PKG-INFO
new file mode 100644
index 0000000..7b308bd
--- /dev/null
+++ b/pyfasta.egg-info/PKG-INFO
@@ -0,0 +1,302 @@
+Metadata-Version: 1.1
+Name: pyfasta
+Version: 0.5.2
+Summary: fast, memory-efficient, pythonic (and command-line) access to fasta sequence files
+Home-page: http://github.com/brentp/pyfasta/
+Author: brentp
+Author-email: bpederse at gmail.com
+License: MIT
+Description: ==================================================
+ pyfasta: pythonic access to fasta sequence files.
+ ==================================================
+
+
+ :Author: Brent Pedersen (brentp)
+ :Email: bpederse at gmail.com
+ :License: MIT
+
+ .. contents ::
+
+ Implementation
+ ==============
+
+ Requires Python >= 2.6. Stores a flattened version of the fasta file without
+ spaces or headers and uses either a mmap of numpy binary format or fseek/fread so the
+ *sequence data is never read into memory*. Saves a pickle (.gdx) of the start, stop
+ (for fseek/mmap) locations of each header in the fasta file for internal use.
+
+ Usage
+ =====
+ ::
+
+ >>> from pyfasta import Fasta
+
+ >>> f = Fasta('tests/data/three_chrs.fasta')
+ >>> sorted(f.keys())
+ ['chr1', 'chr2', 'chr3']
+
+ >>> f['chr1']
+ NpyFastaRecord(0..80)
+
+
+
+ Slicing
+ -------
+ ::
+
+ # get full the sequence:
+ >>> a = str(f['chr1'])
+ >>> b = f['chr1'][:]
+ >>> a == b
+ True
+
+ >>> f['chr1'][:10]
+ 'ACTGACTGAC'
+
+ # get the 1st basepair in every codon (it's python yo)
+ >>> f['chr1'][::3]
+ 'AGTCAGTCAGTCAGTCAGTCAGTCAGT'
+
+ # can query by a 'feature' dictionary (note this is one based coordinates)
+ >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9})
+ 'CTGACTGA'
+
+ # same as:
+ >>> f['chr1'][1:9]
+ 'CTGACTGA'
+
+ # use python, zero based coords
+ >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9}, one_based=False)
+ 'TGACTGA'
+
+ # with reverse complement (automatic for - strand)
+ >>> f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9, 'strand': '-'})
+ 'TCAGTCAG'
+
+ Key Function
+ ------------
+ Sometimes your fasta will have a long header like: "AT1G51370.2 | Symbols: | F-box family protein | chr1:19045615-19046748 FORWARD" when you only want to key off: "AT1G51370.2". In this case, specify the key_fn argument to the constructor:
+
+ ::
+
+ >>> fkey = Fasta('tests/data/key.fasta', key_fn=lambda key: key.split()[0])
+ >>> sorted(fkey.keys())
+ ['a', 'b', 'c']
+
+ Numpy
+ =====
+
+ The default is to use a memmaped numpy array as the backend. In which case it's possible to
+ get back an array directly...
+ ::
+
+ >>> f['chr1'].as_string = False
+ >>> f['chr1'][:10] # doctest: +NORMALIZE_WHITESPACE
+ memmap(['A', 'C', 'T', 'G', 'A', 'C', 'T', 'G', 'A', 'C'], dtype='|S1')
+
+ >>> import numpy as np
+ >>> a = np.array(f['chr2'])
+ >>> a.shape[0] == len(f['chr2'])
+ True
+
+ >>> a[10:14] # doctest: +NORMALIZE_WHITESPACE
+ array(['A', 'A', 'A', 'A'], dtype='|S1')
+
+ mask a sub-sequence
+ ::
+
+ >>> a[11:13] = np.array('N', dtype='S1')
+ >>> a[10:14].tostring()
+ 'ANNA'
+
+
+ Backends (Record class)
+ =======================
+ It's also possible to specify another record class as the underlying work-horse
+ for slicing and reading. Currently, there's just the default:
+
+ * NpyFastaRecord which uses numpy memmap
+ * FastaRecord, which uses using fseek/fread
+ * MemoryRecord which reads everything into memory and must reparse the original
+ fasta every time.
+ * TCRecord which is identical to NpyFastaRecord except that it saves the index
+ in a TokyoCabinet hash database, for cases when there are enough records that
+ loading the entire index from a pickle into memory is unwise. (NOTE: that the
+ sequence is not loaded into memory in either case).
+
+ It's possible to specify the class used with the `record_class` kwarg to the `Fasta`
+ constructor:
+ ::
+
+ >>> from pyfasta import FastaRecord # default is NpyFastaRecord
+ >>> f = Fasta('tests/data/three_chrs.fasta', record_class=FastaRecord)
+ >>> f['chr1']
+ FastaRecord('tests/data/three_chrs.fasta.flat', 0..80)
+
+ other than the repr, it should behave exactly like the Npy record class backend
+
+ it's possible to create your own using a sub-class of FastaRecord. see the source
+ in pyfasta/records.py for details.
+
+ Flattening
+ ==========
+ In order to efficiently access the sequence content, pyfasta saves a separate, flattened file with all newlines and headers removed from the sequence. In the case of large fasta files, one may not wish to save 2 copies of a 5GG+ file. In that case, it's possible to flatten the file "inplace", keeping all the headers, and retaining the validity of the fasta file -- with the only change being that the new-lines are removed from each sequence. This can be specified via `flatten_inpl [...]
+ ::
+
+ >>> import os
+ >>> os.unlink('tests/data/three_chrs.fasta.gdx') # cleanup non-inplace idx
+ >>> f = Fasta('tests/data/three_chrs.fasta', flatten_inplace=True)
+ >>> f['chr1'] # note the difference in the output from above.
+ NpyFastaRecord(6..86)
+
+ # sequence from is same as when requested from non-flat file above.
+ >>> f['chr1'][1:9]
+ 'CTGACTGA'
+
+ # the flattened file is kept as a place holder without the sequence data.
+ >>> open('tests/data/three_chrs.fasta.flat').read()
+ '@flattened@'
+
+
+ Command Line Interface
+ ======================
+ there's also a command line interface to manipulate / view fasta files.
+ the `pyfasta` executable is installed via setuptools, running it will show
+ help text.
+
+ split a fasta file into 6 new files of relatively even size:
+
+ $ pyfasta **split** -n 6 original.fasta
+
+ split the fasta file into one new file per header with "%(seqid)s" being filled into each filename.:
+
+ $ pyfasta **split** --header "%(seqid)s.fasta" original.fasta
+
+ create 1 new fasta file with the sequence split into 10K-mers:
+
+ $ pyfasta **split** -n 1 -k 10000 original.fasta
+
+ 2 new fasta files with the sequence split into 10K-mers with 2K overlap:
+
+ $ pyfasta **split** -n 2 -k 10000 -o 2000 original.fasta
+
+
+ show some info about the file (and show gc content):
+
+ $ pyfasta **info** --gc test/data/three_chrs.fasta
+
+
+ **extract** sequence from the file. use the header flag to make
+ a new fasta file. the args are a list of sequences to extract.
+
+ $ pyfasta **extract** --header --fasta test/data/three_chrs.fasta seqa seqb seqc
+
+ **extract** sequence from a file using a file containing the headers *not* wanted in the new file:
+
+ $ pyfasta extract --header --fasta input.fasta --exclude --file seqids_to_exclude.txt
+
+ **extract** sequence from a fasta file with complex keys where we only want to lookup based on the part before the space.
+
+ $ pyfasta extract --header --fasta input.with.keys.fasta --space --file seqids.txt
+
+ **flatten** a file inplace, for faster later use by pyfasta, and without creating another copy. (`Flattening`_)
+
+ $ pyfasta flatten input.fasta
+
+ cleanup
+ =======
+ (though for real use these will remain for faster access)
+ ::
+
+ >>> os.unlink('tests/data/three_chrs.fasta.gdx')
+ >>> os.unlink('tests/data/three_chrs.fasta.flat')
+
+ Testing
+ =======
+ there is currently > 99% test coverage for the 2 modules and all included
+ record classes. to run the tests:
+ ::
+
+ $ python setup.py nosetests
+
+ Changes
+ =======
+ 0.5.2
+ -----
+ fix complement (@mruffalo)
+
+ 0.5.0
+ -----
+ python 3 compatibility thanks to mruffalo
+
+ 0.4.5
+ -----
+ pyfasta split can handle > 52 files. (thanks Devtulya)
+
+ 0.4.4
+ -----
+ fix pyfasta extract
+
+ 0.4.3
+ -----
+ Add 0 or 1-based intervals in sequence() thanks @jamescasbon
+
+ 0.4.2
+ -----
+ update for latest numpy (can't close memmap)
+
+ 0.4.1
+ -----
+ check for duplicate headers.
+
+ 0.4.0
+ -----
+ * add key_fn kwarg to constuctor
+
+ 0.3.9
+ -----
+ * only require 'r' (not r+) for memory map.
+
+ 0.3.8
+ -----
+ * clean up logic for mixing inplace/non-inplace flattened files.
+ if the inplace is available, it is always used.
+
+ 0.3.6/7
+ -------
+ * dont re-flatten the file every time!
+ * allow spaces before and after the header in the orginal fasta.
+
+ 0.3.5
+ -----
+
+ * update docs in README.txt for new CLI stuff.
+ * allow flattening inplace.
+ * get rid of memmap (results in faster parsing).
+
+ 0.3.4
+ -----
+
+ * restore python2.5 compatiblity.
+ * CLI: add ability to exclude sequence from extract
+ * CLI: allow spliting based on header.
+
+ 0.3.3
+ -----
+
+ * include this file in the tar ball (thanks wen h.)
+
+ 0.3.2
+ -----
+
+ * separate out backends into records.py
+
+ * use nosetests (python setup.py nosetests)
+
+ * add a TCRecord backend for next-gen sequencing availabe if tc is (easy-)installed.
+
+ * improve test coverage.
+
+Keywords: bioinformatics blast fasta
+Platform: UNKNOWN
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
diff --git a/pyfasta.egg-info/SOURCES.txt b/pyfasta.egg-info/SOURCES.txt
new file mode 100644
index 0000000..f3fe98f
--- /dev/null
+++ b/pyfasta.egg-info/SOURCES.txt
@@ -0,0 +1,20 @@
+CHANGELOG.txt
+MANIFEST.in
+README.rst
+setup.cfg
+setup.py
+pyfasta/__init__.py
+pyfasta/fasta.py
+pyfasta/records.py
+pyfasta/split_fasta.py
+pyfasta.egg-info/PKG-INFO
+pyfasta.egg-info/SOURCES.txt
+pyfasta.egg-info/dependency_links.txt
+pyfasta.egg-info/entry_points.txt
+pyfasta.egg-info/not-zip-safe
+pyfasta.egg-info/top_level.txt
+tests/bench.py
+tests/test_all.py
+tests/data/dups.fasta
+tests/data/key.fasta
+tests/data/three_chrs.fasta
\ No newline at end of file
diff --git a/pyfasta.egg-info/dependency_links.txt b/pyfasta.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pyfasta.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/pyfasta.egg-info/entry_points.txt b/pyfasta.egg-info/entry_points.txt
new file mode 100644
index 0000000..245e264
--- /dev/null
+++ b/pyfasta.egg-info/entry_points.txt
@@ -0,0 +1,3 @@
+[console_scripts]
+pyfasta = pyfasta:main
+
diff --git a/pyfasta.egg-info/not-zip-safe b/pyfasta.egg-info/not-zip-safe
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/pyfasta.egg-info/not-zip-safe
@@ -0,0 +1 @@
+
diff --git a/pyfasta.egg-info/top_level.txt b/pyfasta.egg-info/top_level.txt
new file mode 100644
index 0000000..909ccbd
--- /dev/null
+++ b/pyfasta.egg-info/top_level.txt
@@ -0,0 +1 @@
+pyfasta
diff --git a/pyfasta/__init__.py b/pyfasta/__init__.py
new file mode 100644
index 0000000..f1cdf95
--- /dev/null
+++ b/pyfasta/__init__.py
@@ -0,0 +1,141 @@
+from __future__ import print_function
+import sys
+from fasta import Fasta, complement, DuplicateHeaderException
+from records import *
+from split_fasta import split
+import optparse
+
+def main():
+ help = """
+ available actions:
+ `extract`: extract sequences from a fasta file
+ `info`: show info about the fasta file and exit.
+ `split`: split a large fasta file into separate files
+ and/or into K-mers.
+ `flatten`: flatten a fasta file inplace so that later
+ command-line (and programmattic) access via
+ pyfasta will use the inplace flattened version
+ rather than creating another .flat copy of the
+ sequence.
+
+ to view the help for a particular action, use:
+ pyfasta [action] --help
+ e.g.:
+ pyfasta extract --help
+ """
+ if len(sys.argv) == 1:
+ print(help)
+ sys.exit()
+
+ action = sys.argv[1]
+
+ sglobals = globals()
+ if not action in sglobals:
+ print("%s not a valid action" % action)
+ print(help)
+ sys.exit()
+
+ globals()[action](sys.argv[2:])
+
+def info(args):
+ """
+ >>> info(['tests/data/three_chrs.fasta'])
+ <BLANKLINE>
+ tests/data/three_chrs.fasta
+ ===========================
+ >chr3 length:3600
+ >chr2 length:80
+ >chr1 length:80
+ <BLANKLINE>
+ 3760 basepairs in 3 sequences
+ """
+ parser = optparse.OptionParser("""\
+ print headers and lengths of the given fasta file in order of length. e.g.:
+ pyfasta info --gc some.fasta""")
+
+ parser.add_option("-n", "--n", type="int", dest="nseqs",
+ help="max number of records to print. use -1 for all",
+ default=20)
+ parser.add_option("--gc", dest="gc", help="show gc content",
+ action="store_true", default=False)
+ options, fastas = parser.parse_args(args)
+ if not (fastas):
+ sys.exit(parser.print_help())
+ import operator
+
+ for fasta in fastas:
+ f = Fasta(fasta)
+ info = [(k, len(seq)) for k, seq in f.iteritems()]
+
+ total_len = sum(l for k, l in info)
+ nseqs = len(f)
+ if options.nseqs > -1:
+ info = sorted(info, key=operator.itemgetter(1, 0), reverse=True)
+ info = info[:options.nseqs]
+ else:
+ info.sort()
+
+ print("\n" + fasta)
+ print("=" * len(fasta))
+ for k, l in info:
+ gc = ""
+ if options.gc:
+ seq = str(f[k]).upper()
+ g = seq.count('G')
+ c = seq.count('C')
+ gc = 100.0 * (g + c) / float(l)
+ gc = "gc:%.2f%%" % gc
+ print((">%s length:%i" % (k, l)) + gc)
+
+ if total_len > 1000000:
+ total_len = "%.3fM" % (total_len / 1000000.)
+ print()
+ print("%s basepairs in %i sequences" % (total_len, nseqs))
+
+def flatten(args):
+ """
+ >>> flatten(['tests/data/three_chrs.fasta'])
+ """
+ parser = optparse.OptionParser("""flatten a fasta file *inplace* so all later access by pyfasta will use that flattend (but still viable) fasta file""")
+ _, fasta = parser.parse_args(args)
+ for fa in fasta:
+ f = Fasta(fa, flatten_inplace=True)
+
+def extract(args):
+ """
+ >>> extract(['--fasta', 'tests/data/three_chrs.fasta', 'chr2'])
+ TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT
+ """
+
+ parser = optparse.OptionParser("""extract some sequences from a fasta file. e.g.:
+ pyfasta extract --fasta some.fasta --header at2g26540 at3g45640""")
+ parser.add_option("--fasta", dest="fasta", help="path to the fasta file")
+ parser.add_option("--header", dest="header", help="include headers", action="store_true", default=False)
+ parser.add_option("--exclude", dest="exclude", help="extract all sequences EXCEPT those listed", action="store_true", default=False)
+ parser.add_option("--file", dest="file", help=\
+ "if this flag is used, the sequences to extract" \
+ " are read from the file specified in args"
+ , action="store_true", default=False)
+ parser.add_option("--space", dest="space", action="store_true", help=\
+ "use the fasta identifier only up to the space as the key",
+ default=False)
+ options, seqs = parser.parse_args(args)
+ if not (options.fasta and len(seqs)):
+ sys.exit(parser.print_help())
+
+ key_fn = (lambda k: k.split()[0]) if options.space else None
+ f = Fasta(options.fasta, key_fn=key_fn)
+ if options.file:
+ seqs = (x.strip() for x in open(seqs[0]))
+ if options.exclude:
+ seqs = sorted(frozenset(f.iterkeys()).difference(seqs))
+
+ for seqname in seqs:
+ seq = f[seqname]
+ if options.header:
+ print(">%s" % seqname)
+ print(seq)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/pyfasta/fasta.py b/pyfasta/fasta.py
new file mode 100644
index 0000000..05d1fa1
--- /dev/null
+++ b/pyfasta/fasta.py
@@ -0,0 +1,233 @@
+from __future__ import print_function
+import string
+import os.path
+from collections import Mapping
+import sys
+import numpy as np
+
+from records import NpyFastaRecord
+
+# string.maketrans is bytes.maketrans in Python 3, but
+# we want to deal with strings instead of bytes
+try:
+ # 2.x
+ maketrans = string.maketrans
+except AttributeError:
+ # 3.x
+ maketrans = str.maketrans
+
+_complement = maketrans('ATCGatcgNnXx', 'TAGCtagcNnXx')
+# Python 2: string.maketrans returns a bytes object of length 256,
+# that is used as a lookup table to translate bytes to other bytes.
+# Python 3: str.maketrans returns a dict mapping Unicode code points
+# to other Unicode code points. Can't use a fully-allocated lookup
+# table since it would have to be of size `sys.maxunicode`, which
+# is equal to 1114111 on wide builds of <= 3.2 and all builds of
+# Python >= 3.3.
+# In Python 2, it's safe to use a unicode object as the translation
+# table; this causes str.translate to return a unicode object instead
+# of a str. This is safe as long as the string that you're translating
+# can be decoded as ASCII, and will fail with a UnicodeDecodeError
+# otherwise.
+if sys.version_info[0] < 3:
+ _complement = _complement.decode('latin-1')
+complement = lambda s: s.translate(_complement)
+
+class FastaNotFound(Exception): pass
+
+class DuplicateHeaderException(Exception):
+ def __init__(self, header):
+ Exception.__init__(self, 'headers must be unique: %s is duplicated' % header)
+
+class Fasta(Mapping):
+ def __init__(self, fasta_name, record_class=NpyFastaRecord,
+ flatten_inplace=False, key_fn=None):
+ """
+ >>> from pyfasta import Fasta, FastaRecord
+
+ >>> f = Fasta('tests/data/three_chrs.fasta',
+ ... record_class=FastaRecord)
+ >>> sorted(f.keys())
+ ['chr1', 'chr2', 'chr3']
+
+ slicing returns an object.
+ >>> f['chr1']
+ FastaRecord('tests/data/three_chrs.fasta.flat', 0..80)
+
+ extract sequence with normal python syntax
+ >>> print(f['chr1'][:10])
+ ACTGACTGAC
+
+ take the first basepair in each codon...
+ >>> print(f['chr1'][0:10:3])
+ AGTC
+
+ """
+ if not os.path.exists(fasta_name):
+ raise FastaNotFound('"' + fasta_name + '"')
+ self.fasta_name = fasta_name
+ self.record_class = record_class
+ self.key_fn = key_fn
+ self.index, self.prepared = self.record_class.prepare(self,
+ self.gen_seqs_with_headers(key_fn),
+ flatten_inplace)
+
+ self.chr = {}
+
+ @classmethod
+ def as_kmers(klass, seq, k, overlap=0):
+ kmax = len(seq)
+ assert overlap < k, ('overlap must be < kmer length')
+ i = 0
+ while i < kmax:
+ yield i, seq[i:i + k]
+ i += k - overlap
+
+ def gen_seqs_with_headers(self, key_fn=None):
+ """remove all newlines from the sequence in a fasta file
+ and generate starts, stops to be used by the record class"""
+ fh = open(self.fasta_name, 'r')
+ # do the flattening (remove newlines)
+ # check of unique-ness of headers.
+ seen_headers = set()
+ header = None
+ seqs = None
+ for line in fh:
+ line = line.rstrip()
+ if not line: continue
+ if line[0] == ">":
+ if seqs is not None:
+ if header in seen_headers:
+ raise DuplicateHeaderException(header)
+ seen_headers.add(header)
+ yield header, "".join(seqs)
+
+ header = line[1:].strip()
+ if key_fn is not None:
+ header = key_fn(header)
+ seqs = []
+ else:
+ seqs.append(line)
+
+ if seqs:
+ if header in seen_headers:
+ raise DuplicateHeaderException(header)
+ yield header, "".join(seqs)
+ fh.close()
+
+ def __len__(self):
+ # might not work for all backends?
+ return len(self.index)
+
+ def __iter__(self):
+ return iter(self.index)
+
+ def __getitem__(self, i):
+ if i in self.chr:
+ return self.chr[i]
+ c = self.index[i]
+ self.chr[i] = self.record_class(self.prepared, c[0], c[1])
+ return self.chr[i]
+
+ def sequence(self, f, asstring=True, auto_rc=True
+ , exon_keys=None, one_based=True):
+ """
+ take a feature and use the start/stop or exon_keys to return
+ the sequence from the assocatied fasta file:
+ f: a feature
+ asstring: if true, return the sequence as a string
+ : if false, return as a numpy array
+ auto_rc : if True and the strand of the feature == -1, return
+ the reverse complement of the sequence
+ one_based: if true, query is using 1 based closed intervals, if false
+ semi-open zero based intervals
+
+ >>> from pyfasta import Fasta
+ >>> f = Fasta('tests/data/three_chrs.fasta')
+ >>> print(f.sequence({'start':1, 'stop':2, 'strand':1, 'chr': 'chr1'}))
+ AC
+
+ >>> print(f.sequence({'start':1, 'stop':2, 'strand': -1, 'chr': 'chr1'}))
+ GT
+
+ >>> sorted(f.index.items())
+ [('chr1', (0, 80)), ('chr2', (80, 160)), ('chr3', (160, 3760))]
+
+ NOTE: these 2 are reverse-complement-ary because of strand
+ #>>> f.sequence({'start':10, 'stop':12, 'strand': -1, 'chr': 'chr1'})
+ 'CAG'
+ >>> print(f.sequence({'start':10, 'stop':12, 'strand': 1, 'chr': 'chr1'}))
+ CTG
+
+
+ >>> print(f.sequence({'start':10, 'stop':12, 'strand': -1, 'chr': 'chr3'}))
+ TGC
+ >>> print(f.sequence({'start':10, 'stop':12, 'strand': 1, 'chr': 'chr3'}))
+ GCA
+
+ >>> print(f['chr3'][:][-10:])
+ CGCACGCTAC
+
+
+ a feature can have exons:
+ >>> feat = dict(start=9, stop=19, strand=1, chr='chr1'
+ ... , exons=[(9,11), (13, 15), (17, 19)])
+
+ by default, it just returns the full sequence between start
+ and stop.
+ >>> print(f.sequence(feat))
+ ACTGACTGACT
+
+ but if exon_keys is set to an iterable, it will search for
+ those keys and will use the first to create a sequence and
+ return the concatenated result.
+ >>> print(f.sequence(feat, exon_keys=('rnas', 'exons')))
+ ACTACTACT
+
+ Note that sequence is 2 characters shorter than the entire
+ feature, to account for the introns at base-pairs 12 and 16.
+
+ Also note, it looks for an item with key of 'rnas', and didn't
+ fine one, so it continued on to 'exons'. If it doesn't find
+ any of the exon keys, it will fall back on the start, stop of
+ the feature:
+ >>> print(f.sequence(feat, exon_keys=('fake', 'also_fake')))
+ ACTGACTGACT
+ """
+ assert 'chr' in f and f['chr'] in self, (f, f['chr'], self.keys())
+ fasta = self[f['chr']]
+ sequence = None
+ if not exon_keys is None:
+ sequence = self._seq_from_keys(f, fasta, exon_keys, one_based=one_based)
+
+ if sequence is None:
+ start = f['start'] - int(one_based)
+ sequence = fasta[start: f['stop']]
+
+ if auto_rc and f.get('strand') in (-1, '-1', '-'):
+ sequence = complement(sequence)[::-1]
+
+ if asstring: return sequence
+ return np.array(sequence, dtype='c')
+
+ def _seq_from_keys(self, f, fasta, exon_keys, base='locations', one_based=True):
+ """Internal:
+ f: a feature dict
+ fasta: a Fasta object
+ exon_keys: an iterable of keys, to look for start/stop
+ arrays to get sequence.
+ base: if base ('locations') exists, look there fore the
+ exon_keys, not in the base of the object:
+ {'name': 'OS11G42690', 'stop': 25210251, 'locations':
+ {'CDS': [(25210018, 25210251)]}, 'start': 25210018, 'chr':
+ '11', 'strand': -1} set(['TRNA', 'CDS'])
+ """
+ fbase = f.get(base, f)
+ for ek in exon_keys:
+ if not ek in fbase: continue
+ locs = fbase[ek]
+ seq = ""
+ for start, stop in locs:
+ seq += fasta[start - int(one_based):stop]
+ return seq
+ return None
diff --git a/pyfasta/records.py b/pyfasta/records.py
new file mode 100644
index 0000000..677c17a
--- /dev/null
+++ b/pyfasta/records.py
@@ -0,0 +1,295 @@
+import cPickle
+import numpy as np
+import sys
+import os
+
+__all__ = ['FastaRecord', 'NpyFastaRecord', 'MemoryRecord']
+
+MAGIC = "@flattened@"
+
+def is_up_to_date(a, b):
+ return os.path.exists(a) and os.stat(a).st_mtime >= os.stat(b).st_mtime
+
+
+def ext_is_flat(ext):
+ with open(ext) as fh:
+ t = fh.read(len(MAGIC))
+ return MAGIC == t
+
+class FastaRecord(object):
+ __slots__ = ('fh', 'start', 'stop')
+ ext = ".flat"
+ idx = ".gdx"
+
+ @classmethod
+ def is_current(klass, fasta_name):
+ utd = is_up_to_date(fasta_name + klass.idx, fasta_name)
+ if not utd: return False
+ return is_up_to_date(fasta_name + klass.ext, fasta_name)
+
+ def __init__(self, fh, start, stop):
+
+ self.fh = fh
+ self.stop = stop
+ self.start = start
+
+ def __len__(self):
+ return self.stop - self.start
+
+ @classmethod
+ def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace):
+ """
+ returns the __getitem__'able index. and the thing to get the seqs from.
+ """
+ f = fasta_obj.fasta_name
+ if klass.is_current(f):
+ with open(f + klass.idx, 'rb') as fh:
+ idx = cPickle.load(fh)
+ if flatten_inplace or ext_is_flat(f + klass.ext): flat = klass.modify_flat(f)
+ else: flat = klass.modify_flat(f + klass.ext)
+ if flatten_inplace and not ext_is_flat(f + klass.ext):
+ del flat
+ else:
+ return idx, flat
+
+ idx = {}
+ with open(f + klass.ext, 'w') as flatfh:
+ for i, (seqid, seq) in enumerate(seqinfo_generator):
+ if flatten_inplace:
+ if i == 0:
+ flatfh.write('>%s\n' % seqid)
+ else:
+ flatfh.write('\n>%s\n' % seqid)
+ start = flatfh.tell()
+ flatfh.write(seq)
+ stop = flatfh.tell()
+ idx[seqid] = (start, stop)
+
+ if flatten_inplace:
+ klass.copy_inplace(flatfh.name, f)
+ with open(f + klass.idx, 'wb') as fh:
+ cPickle.dump(idx, fh, -1)
+ return idx, klass.modify_flat(f)
+
+ with open(f + klass.idx, 'wb') as fh:
+ cPickle.dump(idx, fh, -1)
+ return idx, klass.modify_flat(f + klass.ext)
+
+ @classmethod
+ def copy_inplace(klass, flat_name, fasta_name):
+ """
+ so they requested flatten_inplace, so we
+ overwrite the .fasta file with the .fasta.flat
+ file and save something in the .flat as a place-holder.
+ """
+ os.rename(flat_name, fasta_name)
+ # still need the flattend file to show
+ # it's current.
+ with open(fasta_name + klass.ext, 'w') as flatfh:
+ flatfh.write(MAGIC)
+
+
+
+ @classmethod
+ def modify_flat(klass, flat_file):
+ return open(flat_file, 'r')
+
+ def _adjust_slice(self, islice):
+ l = len(self)
+ if not islice.start is None and islice.start < 0:
+ istart = self.stop + islice.start
+ else:
+ if islice.start is None:
+ istart = self.start
+ else:
+ istart = self.start + islice.start
+
+
+ if not islice.stop is None and islice.stop < 0:
+ istop = self.stop + islice.stop
+ else:
+ istop = islice.stop is None and self.stop or (self.start + islice.stop)
+
+ # this will give empty string
+ if istart > self.stop: return self.stop, self.stop
+ if istart < self.start: istart = self.start
+
+ if istop < self.start: istop = self.start
+ elif istop > self.stop: istop = self.stop
+
+ return istart, istop
+
+ def __getitem__(self, islice):
+ fh = self.fh
+ fh.seek(self.start)
+
+
+ if isinstance(islice, (int, long)):
+ if islice < 0:
+ if -islice > self.stop - self.start:
+ raise IndexError
+ fh.seek(self.stop + islice)
+ else:
+ fh.seek(self.start + islice)
+ return fh.read(1)
+
+ # [:]
+ if islice.start in (0, None) and islice.stop in (None, sys.maxint):
+ if islice.step in (1, None):
+ return fh.read(self.stop - self.start)
+ return fh.read(self.stop - self.start)[::islice.step]
+
+ istart, istop = self._adjust_slice(islice)
+ if istart is None: return u""
+ l = istop - istart
+ if l == 0: return u""
+
+
+ fh.seek(istart)
+
+ if islice.step in (1, None):
+ return fh.read(l)
+
+ return fh.read(l)[::islice.step]
+
+
+ def __str__(self):
+ return self[:]
+
+ def __repr__(self):
+ return "%s('%s', %i..%i)" % (self.__class__.__name__, self.fh.name,
+ self.start, self.stop)
+
+ @property
+ def __array_interface__(self):
+ return {
+ 'shape': (len(self), ),
+ 'typestr': '|S1',
+ 'version': 3,
+ 'data': buffer(self)
+ }
+
+
+class NpyFastaRecord(FastaRecord):
+ __slots__ = ('start', 'stop', 'mm', 'as_string')
+
+ def __init__(self, mm, start, stop, as_string=True):
+ self.mm = mm
+ self.start = start
+ self.stop = stop
+ self.as_string = as_string
+
+ def __repr__(self):
+ return "%s(%i..%i)" % (self.__class__.__name__,
+ self.start, self.stop)
+
+ @classmethod
+ def modify_flat(klass, flat_file):
+ mm = np.memmap(flat_file, dtype="S1", mode="r")
+ return mm
+
+ def getdata(self, islice):
+ if isinstance(islice, (int, long)):
+ if islice >= 0:
+ islice += self.start
+ else:
+ islice += self.stop
+ if islice < 0: raise IndexError
+ return self.mm[islice]
+
+ start, stop = self._adjust_slice(islice)
+ return self.mm[start:stop:islice.step]
+
+ def __getitem__(self, islice):
+ d = self.getdata(islice)
+ return d.tostring().decode() if self.as_string else d
+
+ @property
+ def __array_interface__(self):
+ old_as_string = self.as_string
+ self.as_string = False
+ data = self[:]
+ self.as_string = old_as_string
+ return {
+ 'shape': (len(self), ),
+ 'typestr': '|S1',
+ 'version': 3,
+ 'data': data,
+ }
+
+
+class MemoryRecord(FastaRecord):
+ """
+ dont write anything to disk, just read the whole thing
+ into memory
+ """
+ @classmethod
+ def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace=False):
+ f = fasta_obj.fasta_name
+ seqs = {}
+ idx = {}
+ for seqid, seq in seqinfo_generator:
+ seqs[seqid] = (seq, None)
+
+ return seqs, seqs
+
+ def __init__(self, _, seq, _none):
+ self.seq = seq
+
+ def __getitem__(self, slice):
+ return self.seq.__getitem__(slice)
+
+ def __len__(self):
+ return len(self.seq)
+
+
+
+try:
+ import tc
+ class HDB(tc.HDB):
+ def __getitem__(self, k):
+ return cPickle.loads(tc.HDB.get(self, k))
+ def __setitem__(self, k, v):
+ tc.HDB.put(self, k, cPickle.dumps(v, -1))
+ def __del__(self):
+ tc.HDB.close(self)
+
+ class TCRecord(NpyFastaRecord):
+ idx = ".tct"
+
+ @classmethod
+ def prepare(klass, fasta_obj, seqinfo_generator, flatten_inplace):
+ f = fasta_obj.fasta_name
+ if klass.is_current(f):
+ idx = HDB()
+ idx.open(f + klass.idx, tc.HDBOREADER)
+ if flatten_inplace or ext_is_flat(f + klass.ext): flat = klass.modify_flat(f)
+ else: flat = klass.modify_flat(f + klass.ext)
+ return idx, flat
+
+
+ db = HDB(f + klass.idx, tc.HDBOWRITER | tc.HDBOCREAT)
+ flatfh = open(f + klass.ext, 'w')
+ for i, (seqid, seq) in enumerate(seqinfo_generator):
+ if flatten_inplace:
+ if i == 0:
+ flatfh.write('>%s\n' % seqid)
+ else:
+ flatfh.write('\n>%s\n' % seqid)
+ start = flatfh.tell()
+ flatfh.write(seq)
+ stop = flatfh.tell()
+ db[seqid] = (start, stop)
+
+ db.sync()
+ flatfh.close()
+
+ if flatten_inplace:
+ klass.copy_inplace(flatfh.name, f)
+ return db, klass.modify_flat(f)
+ return db, klass.modify_flat(f + klass.ext)
+
+
+ __all__.append('TCRecord')
+except ImportError:
+ pass
diff --git a/pyfasta/split_fasta.py b/pyfasta/split_fasta.py
new file mode 100644
index 0000000..c1dcc81
--- /dev/null
+++ b/pyfasta/split_fasta.py
@@ -0,0 +1,231 @@
+from __future__ import print_function
+from pyfasta import Fasta
+import operator
+import collections
+import string
+import sys
+import optparse
+from cStringIO import StringIO
+
+
+def newnames(oldname, n, kmers=None, overlap=None, header=None):
+ """
+ >>> newnames('some.fasta', 1)
+ ['some.split.fasta']
+
+ >>> newnames('some.fasta', 2)
+ ['some.0.fasta', 'some.1.fasta']
+
+ >>> newnames('some', 2)
+ ['some.0', 'some.1']
+
+ >>> newnames('some.fasta', 2, kmers=1000)
+ ['some.0.1Kmer.fasta', 'some.1.1Kmer.fasta']
+
+ >>> newnames('some.fasta', 2, kmers=10000, overlap=2000)
+ ['some.0.10Kmer.2Koverlap.fasta', 'some.1.10Kmer.2Koverlap.fasta']
+
+ >>> newnames('some.fasta', 1, kmers=100000, overlap=2000)
+ ['some.split.100Kmer.2Koverlap.fasta']
+
+ """
+ if kmers and kmers % 1000 == 0: kmers = "%iK" % (kmers/1000)
+ if overlap and overlap % 1000 == 0: overlap = "%iK" % (overlap/1000)
+
+ p = oldname.rfind("fa")
+ kstr = kmers is not None and ("%smer." % kmers) or ""
+ ostr = overlap is not None and ("%soverlap." % overlap) or ""
+ if p != -1:
+ pattern = oldname[:p] + "%s." + kstr + ostr + oldname[p:]
+ else:
+ pattern = oldname + kstr + ostr + ".%s"
+
+
+
+
+ if n == 1:
+ names = [pattern % "split"]
+ else:
+ width = len(str(n))
+ names = [pattern % str(i).rjust(width, '0') for i in range(n)]
+ print("creating new files:", file=sys.stderr)
+ print("\n".join(names), file=sys.stderr)
+ return names
+
+
+def print_to_fh(fh, fasta, lens, seqinfo):
+ key, seqlen = seqinfo
+ lens[fh.name] += seqlen
+ f = fasta
+ assert len(str(f[key])) == seqlen, (key, seqlen, len(str(f[key])))
+ print(">%s" % key, file=fh)
+ print(str(f[key]), file=fh)
+
+
+def format_kmer(seqid, start):
+ """
+ prints out a header with 1-based indexing.
+
+ >>> format_kmer('chr3', 1000)
+ 'chr3_1001'
+ """
+ return "%s_%i" % (seqid, start + 1)
+
+def split(args):
+ parser = optparse.OptionParser("""\
+ split a fasta file into separated files.
+ pyfasta split -n 6 [-k 5000 ] some.fasta
+ the output will be some.0.fasta, some.1.fasta ... some.6.fasta
+ the sizes will be as even as reasonable.
+ """)
+ parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
+ help="""this overrides all other options. if specified, it will
+ split the file into a separate file for each header. it
+ will be a template specifying the file name for each new file.
+ e.g.: "%(fasta)s.%(seqid)s.fasta"
+ where 'fasta' is the basename of the input fasta file and seqid
+ is the header of each entry in the fasta file.""" ,default=None)
+
+ parser.add_option("-n", "--n", type="int", dest="nsplits",
+ help="number of new files to create")
+ parser.add_option("-o", "--overlap", type="int", dest="overlap",
+ help="overlap in basepairs", default=0)
+ parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
+ help="""\
+ split big files into pieces of this size in basepairs. default
+ default of -1 means do not split the sequence up into k-mers, just
+ split based on the headers. a reasonable value would be 10Kbp""")
+ options, fasta = parser.parse_args(args)
+ if not (fasta and (options.nsplits or options.header)):
+ sys.exit(parser.print_help())
+
+ if isinstance(fasta, (tuple, list)):
+ assert len(fasta) == 1, fasta
+ fasta = fasta[0]
+
+ kmer = options.kmers if options.kmers != -1 else None
+ overlap = options.overlap if options.overlap != 0 else None
+ f = Fasta(fasta)
+ if options.header:
+ names = dict([(seqid, options.header % \
+ dict(fasta=f.fasta_name, seqid=seqid)) \
+ for seqid in f.iterkeys()])
+ """
+ if len(names) > 0:
+ assert names[0][1] != names[1][1], ("problem with header format", options.header)
+ fhs = dict([(seqid, open(fn, 'wb')) for seqid, fn in names[:200]])
+ fhs.extend([(seqid, StringIO(), fn) for seqid, fn in names[200:]])
+ """
+ return with_header_names(f, names)
+ else:
+ names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap,
+ header=options.header)
+
+ #fhs = [open(n, 'wb') for n in names]
+ if options.kmers == -1:
+ return without_kmers(f, names)
+ else:
+ return with_kmers(f, names, options.kmers, options.overlap)
+
+def with_header_names(f, names):
+ """
+ split the fasta into the files in fhs by headers.
+ """
+ for seqid, name in names.iteritems():
+ with open(name, 'w') as fh:
+ print(">%s" % seqid, file=fh)
+ print(str(f[seqid]), file=fh)
+
+def with_kmers(f, names, k, overlap):
+ """
+ split the sequences in Fasta object `f` into pieces of length `k`
+ with the given `overlap` the results are written to the array of files
+ `fhs`
+ """
+ fhs = [open(name, 'w') for name in names]
+ i = 0
+ for seqid in f.iterkeys():
+ seq = f[seqid]
+ for (start0, subseq) in Fasta.as_kmers(seq, k, overlap=overlap):
+
+ fh = fhs[i % len(fhs)]
+ print(">%s" % format_kmer(seqid, start0), file=fh)
+ print(subseq, file=fh)
+ i += 1
+ for fh in fhs:
+ fh.close()
+
+def without_kmers(f, names):
+ """
+ long crappy function that does not solve the bin-packing problem.
+ but attempts to distribute the sequences in Fasta object `f` evenly
+ among the file handles in fhs.
+ """
+ fhs = [open(name, 'w') for name in names]
+ name2fh = dict([(fh.name, fh) for fh in fhs])
+ items = sorted([(key, len(f[key])) for key in f.iterkeys()],
+ key=operator.itemgetter(1))
+
+ l1 = len(items) - 1
+ l0 = 0
+ lens = collections.defaultdict(int)
+
+ n_added = 0
+ while l0 < l1:
+ fh = fhs[n_added % len(fhs)]
+ added = False
+ if n_added >= len(fhs):
+
+ while l1 > l0:
+ lmin = min(lens.itervalues())
+ lmax = max(lens.itervalues())
+ if float(lmin) / lmax < 0.80:
+ # it's way off, so add a large (l1)
+ name = find_name_from_len(lmin, lens)
+ fh = name2fh[name]
+ print_to_fh(fh, f, lens, items[l1])
+ l1 -= 1
+ added = True
+ n_added += 1
+
+ elif float(lmin) / lmax < 0.94:
+ # it's just a little off so add a small (l0)
+ name = find_name_from_len(lmin, lens)
+ fh = name2fh[name]
+ print_to_fh(fh, f, lens, items[l0])
+ l0 += 1
+ added = True
+ n_added += 1
+ else:
+ break
+
+ if not added:
+ break
+ # TODO: test this on glycine
+ #added = False
+ if added:
+ continue
+
+ print_to_fh(fh, f, lens, items[l1])
+ l1 -= 1
+ n_added += 1
+
+ if l0 == l1:
+ fh = fhs[l0 % len(fhs)]
+ print_to_fh(fh, f, lens, items[l0])
+
+ for fh in fhs:
+ fh.close()
+
+
+def find_name_from_len(lmin, lens):
+ """
+ reverse lookup, get name from dict
+ >>> lens = {'chr1': 4, 'chr2': 5, 'chr3': 4}
+ >>> find_name_from_len(5, lens)
+ 'chr2'
+ """
+ for fname, l in lens.iteritems():
+ if l == lmin:
+ return fname
+ raise Exception('name not found')
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..1e9a0d8
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,13 @@
+[nosetests]
+with-doctest = 1
+doctest-extension = .txt
+nocapture = 1
+detailed-errors = 1
+with-coverage = 1
+cover-package = pyfasta
+
+[egg_info]
+tag_build =
+tag_date = 0
+tag_svn_revision = 0
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e165937
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,40 @@
+from setuptools import setup, find_packages
+
+
+version = '0.5.2'
+
+# Run 2to3 builder if we're on Python 3.x, from
+# http://wiki.python.org/moin/PortingPythonToPy3k
+try:
+ from distutils.command.build_py import build_py_2to3 as build_py
+except ImportError:
+ # 2.x
+ from distutils.command.build_py import build_py
+command_classes = {'build_py': build_py}
+
+setup(name='pyfasta',
+ version=version,
+ description=\
+ "fast, memory-efficient, pythonic (and command-line) access to fasta sequence files",
+ url="http://github.com/brentp/pyfasta/",
+ long_description=open('README.rst').read() + "\n" + open('CHANGELOG.txt').read(),
+ classifiers=["Topic :: Scientific/Engineering :: Bio-Informatics"],
+ keywords='bioinformatics blast fasta',
+ author='brentp',
+ author_email='bpederse at gmail.com',
+ license='MIT',
+ packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
+ package_data={'':['CHANGELOG.txt']},
+ include_package_data=True,
+ tests_require=['nose'],
+ test_suite='nose.collector',
+ zip_safe=False,
+ install_requires=[
+ # -*- Extra requirements: -*-
+ ],
+ scripts=[],
+ entry_points={
+ 'console_scripts': ['pyfasta = pyfasta:main']
+ },
+ cmdclass=command_classes,
+ )
diff --git a/tests/bench.py b/tests/bench.py
new file mode 100644
index 0000000..ab779ad
--- /dev/null
+++ b/tests/bench.py
@@ -0,0 +1,51 @@
+from __future__ import print_function
+from itertools import islice
+import sys
+import os
+sys.path.insert(0, os.path.abspath("."))
+import pyfasta
+from pyfasta import Fasta
+
+import time
+import random
+random.seed(1234)
+
+SEQLEN = 100000
+def make_long_fasta(filename="t.fasta", nrecs=2500, seqlen=SEQLEN):
+ fh = open(filename, 'w')
+ #0123456789"
+ s = "ACTGACTGAC"
+ for i in range(nrecs):
+ print(">header%i" % i, file=fh)
+ print(s * (seqlen/10), file=fh)
+
+ fh.close()
+ return filename
+
+def read(f, nreads=40000, seqlen=SEQLEN):
+
+ for k in islice(f.iterkeys(), 10):
+ for i in range(nreads):
+ start = random.randint(0, seqlen)
+ end = min(seqlen, start + random.randint(1000, 2000))
+ str(f[k][start:end])
+
+
+
+def main():
+ fa = make_long_fasta()
+
+ t = time.time()
+ f = Fasta(fa)
+ print("flatten:", time.time() - t)
+
+
+ t = time.time()
+ read(f)
+ print("read:", time.time() - t)
+
+
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tests/data/dups.fasta b/tests/data/dups.fasta
new file mode 100644
index 0000000..435e665
--- /dev/null
+++ b/tests/data/dups.fasta
@@ -0,0 +1,4 @@
+>a
+aaa
+>a
+bbb
diff --git a/tests/data/key.fasta b/tests/data/key.fasta
new file mode 100644
index 0000000..db8f320
--- /dev/null
+++ b/tests/data/key.fasta
@@ -0,0 +1,6 @@
+>a extra
+a
+>b extra
+b
+>c extra
+c
diff --git a/tests/data/three_chrs.fasta b/tests/data/three_chrs.fasta
new file mode 100644
index 0000000..d88292e
--- /dev/null
+++ b/tests/data/three_chrs.fasta
@@ -0,0 +1,30 @@
+
+>chr1
+ACTGACTGACTGACTGACTGACTGACTGACTGACTGA
+CTGACTGACTGACTGACTGACT
+GACTGACTGACTGACTGACTG
+
+>chr2
+TAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+AAAAAAAAAAAAAAAAT
+
+
+
+
+>chr3
+ACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATT
+ACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCACGCTACACGCATTACGCAC [...]
+
+
+
+
diff --git a/tests/test_all.py b/tests/test_all.py
new file mode 100644
index 0000000..6dcc6c0
--- /dev/null
+++ b/tests/test_all.py
@@ -0,0 +1,255 @@
+from pyfasta import Fasta
+from pyfasta.records import NpyFastaRecord, MemoryRecord, FastaRecord
+record_classes = [NpyFastaRecord, MemoryRecord, FastaRecord]
+from pyfasta import DuplicateHeaderException
+
+try:
+ from pyfasta.records import TCRecord
+ record_classes.append(TCRecord)
+except ImportError:
+ pass
+
+import os
+import shutil
+from nose.tools import assert_raises
+import numpy as np
+import glob
+
+def _cleanup():
+ for f in glob.glob("tests/data/three_chrs.fasta*") + glob.glob('tests/data/dups.fasta.*'):
+ if f.endswith(".orig"): continue
+ os.unlink(f)
+ shutil.copyfile('tests/data/three_chrs.fasta.orig', 'tests/data/three_chrs.fasta')
+
+
+def test_classes():
+
+ for inplace in (True, False):
+ for klass in record_classes:
+ f = Fasta('tests/data/three_chrs.fasta', record_class=klass, flatten_inplace=inplace)
+ yield check_keys, f
+ yield check_misc, f, klass
+ yield check_contains, f
+ yield check_shape, f
+ yield check_bounds, f
+ yield check_tostring, f
+ yield check_kmers, f
+ yield check_kmer_overlap, f
+ yield check_slice_size, f
+ yield check_slice, f
+ yield check_full_slice, f
+ yield check_array_copy, f
+ yield check_array, f
+ yield check_one_based, f
+
+ fasta_name = f.fasta_name
+
+ del f
+
+ yield check_keyfn, 'tests/data/key.fasta', klass, inplace
+
+ yield check_keyfn2, 'tests/data/key.fasta', klass, inplace
+
+ yield check_reload, klass, fasta_name
+
+ yield check_duplicates, klass, inplace
+
+ _cleanup()
+
+
+def check_duplicates(klass, inplace):
+ assert_raises(DuplicateHeaderException,
+ lambda: Fasta('tests/data/dups.fasta',
+ record_class=klass, flatten_inplace=inplace))
+
+
+def check_keys(f):
+ assert sorted(f.keys()) == ['chr1', 'chr2', 'chr3']
+ assert sorted(f.iterkeys()) == ['chr1', 'chr2', 'chr3']
+
+def fix(path):
+ import os.path as op
+
+ for ext in (".gdx", ".flat"):
+ if op.exists(path + ext):
+ os.unlink(path + ext)
+
+ shutil.copyfile(path + ".orig", path)
+
+def check_keyfn(path, klass, inplace):
+ f = Fasta(path, record_class=klass, flatten_inplace=inplace, key_fn=lambda key: key.split()[0])
+ assert sorted(f.keys()) == ['a', 'b', 'c'], f.keys()
+ fix(path)
+ ff = Fasta(path, record_class=klass, flatten_inplace=inplace)
+ assert sorted(ff.keys()) == ['a extra', 'b extra', 'c extra'], (ff.keys(), klass)
+ fix(path)
+
+def check_keyfn2(path, klass, inplace):
+ f = Fasta(path, record_class=klass, flatten_inplace=inplace, key_fn=lambda
+ key: "-".join(key.split()))
+
+ assert sorted(f.keys()) == ['a-extra', 'b-extra', 'c-extra'], f.keys()
+
+ assert f['a-extra']
+ fix(path)
+
+
+def check_reload(klass, fasta_name):
+
+ m = ""
+ if klass.__name__ in ('FastaRecord',): # 'NpyFastaRecord'):
+ m = os.stat(fasta_name + klass.ext).st_mtime
+
+ f = Fasta(fasta_name, record_class=klass)
+ assert f
+ del f
+ if klass.__name__ in ('FastaRecord',): # 'NpyFastaRecord'):
+ assert os.stat(fasta_name + klass.ext).st_mtime == m
+
+
+def check_full_slice(f):
+ for k in f.iterkeys():
+ assert str(f[k]) == f[k][:]
+ assert str(f[k]) == f[k][0:]
+
+
+ assert str(f[k])[::2] == f[k][0:][::2]
+ assert str(f[k])[::2] == f[k][:][::2]
+
+def check_contains(f):
+
+ assert not '_________' in f
+ assert 'chr2' in f
+
+def check_misc(f, klass):
+ seq = f['chr2']
+ assert seq.__class__ == klass
+
+ assert (seq[0] == 'T')
+ assert (seq[-1] == 'T')
+
+ assert (f['chr3'][0:5][::-1] == 'ACGCA')
+
+
+ for i in (1, len(seq) -2):
+ assert (seq[i] == 'A')
+
+ for i in (1, len(seq) -3):
+ assert (seq[i: i + 2] == 'AA')
+
+ assert (seq[1] == 'A')
+
+ assert (seq[-1] == 'T')
+ assert (seq[-2] == 'A')
+
+ assert (seq[0] == 'T')
+ assert (seq[6:9] == 'AAA')
+
+ seq = 'TACGCACGCTAC'
+ assert (seq == f['chr3'][-12:])
+ assert (seq[:4] == f['chr3'][-12:-8])
+
+def check_shape(f):
+ assert (len(f['chr2']) == 80)
+ assert (len(f['chr3']) == 3600)
+ assert (f['chr2'][:2] == 'TA')
+
+ assert (f['chr3'][:5] == 'ACGCA')
+
+def check_bounds(f):
+ c2 = f['chr2']
+ assert (len(str(c2[0:900])) == 80)
+
+ assert (c2[99:99] == "")
+ assert (c2[99:99] == "")
+ assert (c2[80:81] == "")
+ assert (c2[79:81] == "T")
+
+ assert (c2[800:810] == "")
+ assert_raises(IndexError, lambda: c2[-800])
+
+
+ assert (c2[-800:-810] == "")
+
+def check_array(f):
+
+ s = f.sequence({'chr': 'chr2', 'start': 1, 'stop':3}, asstring=False)
+ assert np.all(s == np.array(['T', 'A', 'A'], dtype="|S1"))
+
+
+def check_tostring(f):
+ s = 'TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT'
+ assert (str(f['chr2']) == s)
+
+def check_kmers(f):
+ seq = str(f['chr2'])
+
+ kmers = list(Fasta.as_kmers(f['chr2'], 10))
+ assert (len(kmers) == len(seq) / 10)
+ assert (kmers[0] == (0, seq[:10]))
+
+ seqs = [k[1] for k in kmers]
+ assert ("".join(seqs) == seq)
+ last_pair = kmers[-1]
+ assert (seqs[-1][-1] == 'T')
+
+ seq = str(f['chr3'])
+ kmers = list(Fasta.as_kmers(f['chr3'], 1))
+ assert (kmers[2][0] == 2)
+ seqs = [k[1] for k in kmers]
+ assert ("".join(seqs) == seq)
+
+
+def check_kmer_overlap(f):
+ chr2 = f['chr2']
+
+ kmers = Fasta.as_kmers(chr2, 10, overlap=2)
+ for i, k in enumerate(list(kmers)[:-1]):
+ assert (len(k[1]) == 10)
+ assert (k[0] == (i * (10 - 2)))
+
+ kmers = Fasta.as_kmers(chr2, 10, overlap=4)
+ seqs = [k[1] for k in kmers]
+ paired_seqs = zip(seqs[0:-1], seqs[1:])
+ for a, b in paired_seqs:
+ if len(a) < 4 or len(b) < 4: continue
+ assert (a[-4:] == b[:4])
+
+def check_slice_size(f):
+ assert (f['chr3'][:7] == 'ACGCATT')
+ # take the first basepair of each codon
+ assert (f['chr3'][0:7:3] == 'ACT')
+ # take the 2nd basepair of each codon
+ assert (f['chr3'][1:7:3] == 'CA')
+
+def check_slice(f):
+ assert (str(f['chr3'][:4]) == 'ACGC')
+
+
+ seq = 'TACGCACGCTAC'
+ assert (seq == f['chr3'][-12:])
+
+ assert (seq[:4] == f['chr3'][-12:-8])
+
+ assert (f['chr3'][0:5][::-1] == 'ACGCA')
+ assert (f['chr3'][0:5] == 'ACGCA')
+
+
+def check_array_copy(f):
+ # the array is definitely a copy...
+ a = np.array(f['chr3'], np.dtype('S1'))
+ old = f['chr3'][1:5]
+ assert (a.shape[0] == 3600)
+ a[1:5] = np.array('N', dtype='S1')
+ c = f['chr3'][1:5]
+ assert c == old
+
+ assert a[1:5].tostring().decode() == 'NNNN', a[1:5].tostring().decode()
+
+def check_one_based(f):
+ assert f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9}) == 'CTGACTGA'
+ assert f.sequence({'chr': 'chr1', 'start': 2, 'stop': 9}, one_based=False) == 'TGACTGA'
+
+if __name__ == "__main__":
+ import nose
+ nose.main()
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-pyfasta.git
More information about the debian-med-commit
mailing list