[med-svn] [python-screed] 01/03: Imported Upstream version 0.7.1
Michael Crusoe
misterc-guest at moszumanska.debian.org
Fri Jan 30 21:19:53 UTC 2015
This is an automated email from the git hooks/post-receive script.
misterc-guest pushed a commit to branch master
in repository python-screed.
commit b16c0d2ecfb8281916dd72dc85c90fbe36e60911
Author: Michael R. Crusoe <mcrusoe at msu.edu>
Date: Tue Jan 27 17:55:09 2015 -0500
Imported Upstream version 0.7.1
---
.gitignore | 10 +
.travis.yml | 6 +
README.md | 11 +
TODO | 14 +
benchmarks/faGen.py | 73 ++
benchmarks/fqGen.py | 81 +++
benchmarks/fqToFaConvert.py | 68 ++
benchmarks/mysql/create.py | 76 +++
benchmarks/mysql/mdbConstants.py | 5 +
benchmarks/mysql/mydb.py | 83 +++
benchmarks/mysql/mysqlCreateTimeit.py | 59 ++
benchmarks/mysql/mysqlTimeit.py | 23 +
benchmarks/mysql/mysql_login.txt | 2 +
benchmarks/pgres/create.py | 73 ++
benchmarks/pgres/drop.py | 6 +
benchmarks/pgres/pdbConstants.py | 5 +
benchmarks/pgres/pgdb.py | 84 +++
benchmarks/pgres/pgresCreateTimeit.py | 59 ++
benchmarks/pgres/pgresTimeit.py | 23 +
benchmarks/pgres/pgres_login.txt | 5 +
benchmarks/screedCreateTimeit.py | 59 ++
benchmarks/screedTimeit.py | 36 +
benchmarks/screedTimeit1M.py | 40 ++
bigtests/__init__.py | 1181 +++++++++++++++++++++++++++++++++
doc/COPYRIGHT.txt | 2 +
doc/LICENSE.txt | 10 +
doc/Makefile | 153 +++++
doc/RELEASE-0.5.txt | 18 +
doc/conf.py | 242 +++++++
doc/example.txt | 34 +
doc/index.txt | 28 +
doc/run-doctests.py | 9 +
doc/schema.txt | 5 +
doc/screed.html | 743 +++++++++++++++++++++
doc/screed.txt | 502 ++++++++++++++
screed/DBConstants.py | 26 +
screed/__init__.py | 33 +
screed/conversion.py | 73 ++
screed/createscreed.py | 80 +++
screed/dna.py | 48 ++
screed/dump_to_fasta.py | 23 +
screed/dump_to_fastq.py | 23 +
screed/fadbm.py | 20 +
screed/fasta.py | 50 ++
screed/fastq.py | 64 ++
screed/fqdbm.py | 20 +
screed/hava.py | 27 +
screed/openscreed.py | 281 ++++++++
screed/pygr_api.py | 163 +++++
screed/screedRecord.py | 195 ++++++
screed/seqparse.py | 59 ++
screed/tests/__init__.py | 1 +
screed/tests/__main__.py | 8 +
screed/tests/empty.fa | 0
screed/tests/havaGen.py | 93 +++
screed/tests/test-whitespace.fa | 12 +
screed/tests/test.fa | 204 ++++++
screed/tests/test.fa.bz2 | Bin 0 -> 3626 bytes
screed/tests/test.fa.gz | Bin 0 -> 4100 bytes
screed/tests/test.fastq | 500 ++++++++++++++
screed/tests/test.fastq.gz | Bin 0 -> 4370 bytes
screed/tests/test.hava | 750 +++++++++++++++++++++
screed/tests/test_convert.py | 30 +
screed/tests/test_dictionary.py | 94 +++
screed/tests/test_fasta.py | 135 ++++
screed/tests/test_fasta_recover.py | 18 +
screed/tests/test_fastq.py | 144 ++++
screed/tests/test_fastq_recover.py | 19 +
screed/tests/test_hava_methods.py | 61 ++
screed/tests/test_nodb.py | 31 +
screed/tests/test_open.py | 62 ++
screed/tests/test_pygr_api.py | 93 +++
screed/tests/test_shell.py | 39 ++
setup.py | 18 +
tox.ini | 12 +
75 files changed, 7337 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..25fb013
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+*.pyc
+*~
+*_screed
+*.fa
+*.fastq
+build
+screed.egg-info
+dist
+screed/tests/fa_to_fq
+screed/tests/fq_to_fa
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..2899e55
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,6 @@
+language: python
+python:
+ - "2.7"
+ - "2.6"
+install: python setup.py install
+script: python -m screed.tests.__main__
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..96d874f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,11 @@
+screed -- short read sequence utils in Python.
+
+The official central repository for screed is:
+
+ https://github.com/ged-lab/screed
+
+See http://readthedocs.org/docs/screed/en/latest/ for docs.
+
+Issues are tracked at https://github.com/ged-lab/khmer/issues.
+
+[![Build Status](http://162.209.84.54:8080/job/screed-master/badge/icon)](http://162.209.84.54:8080/job/screed-master/)
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..9df8096
--- /dev/null
+++ b/TODO
@@ -0,0 +1,14 @@
+Pragma statements in sqlite? http://www2.sqlite.org/pragma.html
+ - synchronous - enable for writing, maybe not important for reading
+ - locking_mode - can't do for reading, writing yes
+ - disable all locking for reading maybe
+ - cache_size
+
+fix the conversion so it can be achieved with text -> text instead of
+db -> text
+
+==== 3/14/2010
+
+PEP-8 noncompliance, e.g. screedDB should be ScreedDB, although
+I kind of agree that screedDB is prettier.
+
diff --git a/benchmarks/faGen.py b/benchmarks/faGen.py
new file mode 100755
index 0000000..982f3c2
--- /dev/null
+++ b/benchmarks/faGen.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+import sys, os
+import random
+
+seqLength = (8000, 12000)
+
+class collectionOFiles(object):
+ def __init__(self, baseName, divisions, totalSize):
+ self.baseName = baseName
+ self.divisions = divisions
+ self.totalSize = totalSize
+
+ self.fileHandles = {}
+ for i in range(0, divisions):
+ filename = self.baseName + "_%d" % i
+ fh = open(filename, "wb")
+ divisor = 2 ** i
+
+ self.fileHandles[filename]= (fh, self.totalSize/divisor, 0)
+
+ def writeRecord(self, name, description, sequence):
+ toRemove = []
+ for filename in self.fileHandles:
+ file, limit, count = self.fileHandles[filename]
+ file.write("%s %s\n%s\n" % (name, description, sequence))
+ count += 1
+ if count >= limit:
+ file.close()
+ toRemove.append(filename)
+ else:
+ self.fileHandles[filename] = (file, limit, count)
+
+ for fh in toRemove:
+ self.fileHandles.pop(fh)
+
+ def finished(self):
+ return len(self.fileHandles) == 0
+
+def genSeq(min, max):
+ """
+ Generates a sequence with min <= length < max
+ """
+ choices = ['A','T','C','G']
+ result = []
+ length = random.randrange(min, max)
+ for i in range(0, length):
+ result.append(random.choice(choices))
+ if i % 80 == 0:
+ result.append('\n')
+ return "".join(result)
+
+def createFastaFiles(filename, size, divisions):
+ cof = collectionOFiles(filename, divisions, size)
+ counter = 0
+ description="cdna:Genscan chromosome:PPYG2:6_qbl_hap2_random:95622:98297:1"
+ while(not cof.finished()):
+ name = ">GENSCAN00%d" % counter
+ sequence = genSeq(seqLength[0], seqLength[1])
+ cof.writeRecord(name, description, sequence)
+ counter += 1
+ return
+
+if __name__ == '__main__':
+ if len(sys.argv) != 4:
+ print "Usage: <filename> <size> <divisions>"
+ exit(1)
+
+ filename = sys.argv[1]
+ size = int(sys.argv[2])
+ divisions = int(sys.argv[3])
+
+ createFastaFiles(filename, size, divisions)
diff --git a/benchmarks/fqGen.py b/benchmarks/fqGen.py
new file mode 100755
index 0000000..b2c8463
--- /dev/null
+++ b/benchmarks/fqGen.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+import sys, os
+import random
+
+seqLength = 37
+
+class collectionOFiles(object):
+ def __init__(self, baseName, divisions, totalSize):
+ self.baseName = baseName
+ self.divisions = divisions
+ self.totalSize = totalSize
+
+ self.fileHandles = {}
+ for i in range(0, divisions):
+ filename = self.baseName + "_%d" % i
+ fh = open(filename, "wb")
+ divisor = 2 ** i
+
+ self.fileHandles[filename]= (fh, self.totalSize/divisor, 0)
+
+ def writeRecord(self, name, sequence, accuracy):
+ toRemove = []
+ for filename in self.fileHandles:
+ file, limit, count = self.fileHandles[filename]
+ file.write("%s\n%s\n+\n%s\n" % (name, sequence, accuracy))
+ count += 1
+ if count >= limit:
+ file.close()
+ toRemove.append(filename)
+ else:
+ self.fileHandles[filename] = (file, limit, count)
+
+ for fh in toRemove:
+ self.fileHandles.pop(fh)
+
+ def finished(self):
+ return len(self.fileHandles) == 0
+
+
+def genSeq(length):
+ """
+ Generates a sequence with length characters
+ """
+ choices = ['A','T','C','G']
+ result = []
+ for i in range(0, length):
+ result.append(random.choice(choices))
+ return "".join(result)
+
+def genAcc(length):
+ """
+ Generates an accuracy with length characters
+ """
+ choices = ['A','1','7','3','.',';','*','<']
+ result = []
+ for i in range(0, length):
+ result.append(random.choice(choices))
+ return "".join(result)
+
+def createFastqFiles(filename, size, divisions):
+ cof = collectionOFiles(filename, divisions, size)
+ counter = 0
+ while(not cof.finished()):
+ name = "@HWI-EAS_4_PE-F%d" % counter
+ sequence = genSeq(seqLength)
+ accuracy = genAcc(seqLength)
+ cof.writeRecord(name, sequence, accuracy)
+ counter += 1
+ return
+
+if __name__ == '__main__':
+ if len(sys.argv) != 4:
+ print "Usage: <filename> <size> <divisions>"
+ exit(1)
+
+ filename = sys.argv[1]
+ size = int(sys.argv[2])
+ divisions = int(sys.argv[3])
+
+ createFastqFiles(filename, size, divisions)
diff --git a/benchmarks/fqToFaConvert.py b/benchmarks/fqToFaConvert.py
new file mode 100755
index 0000000..a9c79d1
--- /dev/null
+++ b/benchmarks/fqToFaConvert.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+import sys
+import os
+
+class fastaModel(object):
+ """
+ Contains methods for writing data to a file in the fasta format
+ """
+ def __init__(self, fileHandle):
+ self.fileHandle = fileHandle
+ self.currSeq = ""
+
+ def writeName(self, name):
+ """
+ Writes the given name to the fileHandle in the fasta format
+ """
+ self.fileHandle.write(">%s " % name.strip())
+
+ def writeDescription(self, description):
+ """
+ Writes the given description and the stored sequence to the file
+ """
+ self.fileHandle.write("%s\n%s\n" % (description.strip(), self.currSeq))
+
+ def writeSequence(self, sequence):
+ """
+ Stores the given sequence until a call to writeDescription is made
+ so that the description and sequence will be stored in the correct
+ fasta order
+ """
+ self.currSeq = sequence.strip()
+
+def convertFastqToFasta(inputFilename, outputFilename):
+ """
+ Converts the given fastq file (inputFilename) to an equivalent fasta file
+ (outputFilename). The fastq's accuracy information is converted to a fasta's
+ 'description' field. Sequence and name fields are left alone
+ """
+
+ inputFile = open(inputFilename, "rb")
+ outputFile = open(outputFilename, "wb")
+
+ model = fastaModel(outputFile)
+
+ for line in inputFile:
+ if line.startswith("@"): # Line is a name
+ model.writeName(line[1:])
+ elif line.startswith('+'): # Next line is the accuracy
+ accuracy = inputFile.next()
+ model.writeDescription(accuracy)
+ else: # Line is the sequence
+ model.writeSequence(line)
+
+ outputFile.close()
+
+if __name__ == '__main__':
+ if len(sys.argv) != 3:
+ print "Usage: <input filename> <output filename>"
+ exit(1)
+
+ inputFilename = sys.argv[1]
+ outputFilename = sys.argv[2]
+
+ if not os.path.isfile(inputFilename):
+ print "Error: %s doesn't exist" % inputFilename
+ exit(2)
+
+ convertFastqToFasta(inputFilename, outputFilename)
diff --git a/benchmarks/mysql/create.py b/benchmarks/mysql/create.py
new file mode 100644
index 0000000..4fec3e9
--- /dev/null
+++ b/benchmarks/mysql/create.py
@@ -0,0 +1,76 @@
+import os
+import MySQLdb
+import mdbConstants
+
+def create_db(fields, rcrditer):
+ """
+ Populates the mysql database with records from the record iter
+ """
+ conn = MySQLdb.connect(db=mdbConstants._DBNAME, user=mdbConstants._USER)
+
+ cur = conn.cursor()
+
+ # Create the admin table
+ cur.execute('CREATE TABLE %s (ID int NOT NULL auto_increment, '\
+ 'FIELDNAME TEXT, PRIMARY KEY(ID))' % mdbConstants._SCREEDADMIN)
+
+ for attribute in fields:
+ cur.execute("INSERT INTO %s (FIELDNAME) VALUES ('%s')" % \
+ (mdbConstants._SCREEDADMIN, attribute))
+
+ # Setup the dictionary table creation field substring
+ otherFields = fields[1:]
+ createsub = ['%s TEXT' % field for field in otherFields]
+ createsub.insert(0, '%s VARCHAR(100)' % fields[0])
+ createsub = ','.join(createsub)
+
+ # Create the dictionary table
+ cur.execute('CREATE TABLE %s (%s int NOT NULL auto_increment, %s, PRIMARY KEY(%s))' %
+ (mdbConstants._DICT_TABLE, mdbConstants._PRIMARY_KEY,
+ createsub,
+ mdbConstants._PRIMARY_KEY))
+
+ # Attribute to index
+ queryby = fields[0]
+
+ # Make the index on the 'queryby' attribute
+ cur.execute('CREATE UNIQUE INDEX %sidx ON %s(%s)' %
+ (queryby, mdbConstants._DICT_TABLE, queryby))
+
+ # Setup the 'perc' mysql substring
+ perc = ', '.join(['%s' for i in range(len(fields))])
+
+ # Setup the sql substring for inserting data into db
+ fieldsub = ','.join(fields)
+
+ # Pull data from rcrditer and store in database
+ for record in rcrditer:
+ data = tuple([record[key] for key in fields])
+ cur.execute('INSERT INTO %s (%s) VALUES (%s)' %\
+ (mdbConstants._DICT_TABLE, fieldsub, perc),
+ data)
+
+ conn.commit()
+ cur.close()
+ conn.close()
+
+def droptables():
+ """
+ Drops tables in db
+ """
+ conn = MySQLdb.connect(db=mdbConstants._DBNAME, user=mdbConstants._USER)
+
+ cur = conn.cursor()
+
+ try:
+ cur.execute('DROP TABLE %s;' % mdbConstants._DICT_TABLE)
+ except:
+ pass
+ try:
+ cur.execute('DROP TABLE %s;' % mdbConstants._SCREEDADMIN)
+ except:
+ pass
+
+ conn.commit()
+ cur.close()
+ conn.close()
diff --git a/benchmarks/mysql/mdbConstants.py b/benchmarks/mysql/mdbConstants.py
new file mode 100644
index 0000000..60ded7d
--- /dev/null
+++ b/benchmarks/mysql/mdbConstants.py
@@ -0,0 +1,5 @@
+_SCREEDADMIN = 'SCREEDADMIN'
+_DICT_TABLE = 'DICTIONARY_TABLE'
+_PRIMARY_KEY = 'id'
+_DBNAME = 'sdb'
+_USER = 'alex'
diff --git a/benchmarks/mysql/mydb.py b/benchmarks/mysql/mydb.py
new file mode 100644
index 0000000..9b863ca
--- /dev/null
+++ b/benchmarks/mysql/mydb.py
@@ -0,0 +1,83 @@
+import mdbConstants
+import MySQLdb
+import UserDict
+import types
+
+class _mdb_record_dict(UserDict.DictMixin):
+ """
+ Simple dict-like record interface with bag behavior.
+ """
+ def __init__(self, *args, **kwargs):
+ self.d = dict(*args, **kwargs)
+
+ def __getitem__(self, name):
+ return self.d[name]
+
+ def __setitem__(self, name, value):
+ self.d[name] = value
+
+ def __getattr__(self, name):
+ try:
+ return self.d[name]
+ except KeyError:
+ raise AttributeError, name
+
+ def keys(self):
+ return self.d.keys()
+
+class mydb(object):
+ def __init__(self):
+ self._conn = MySQLdb.connect(db=mdbConstants._DBNAME, user=mdbConstants._USER)
+
+ cur = self._conn.cursor()
+ cur.execute('SELECT id, fieldname FROM %s' % mdbConstants._SCREEDADMIN)
+ self._adm = dict(cur.fetchall())
+ keys = self._adm.keys()
+ keys.sort()
+
+ self._fields = self._adm.values()
+ self._fields.insert(0, mdbConstants._PRIMARY_KEY.lower())
+ self._fieldStr = ",".join(self._fields)
+
+ self._queryBy = self._adm[keys[0]]
+
+ def close(self):
+ """
+ Closes the database handles
+ """
+ self._conn.close()
+
+ def loadRecordByIndex(self, idx):
+ """
+ Loads a record from the database by index
+ """
+
+ def loadRecordByName(self, key):
+ """
+ As above, by name
+ """
+ cursor = self._conn.cursor()
+ query = "SELECT %s FROM %s WHERE %s='%s'" % (self._queryBy,
+ mdbConstants._DICT_TABLE,
+ self._queryBy,
+ key)
+ cursor.execute(query)
+ if type(cursor.fetchone()) == types.NoneType:
+ raise KeyError("Key %s not found" % key)
+
+ query = "SELECT %s FROM %s WHERE %s='%s'" % (self._fieldStr,
+ mdbConstants._DICT_TABLE,
+ self._queryBy,
+ key)
+ cursor.execute(query)
+ return _mdb_record_dict(zip(self._fields, cursor.fetchone()))
+
+ def keys(self):
+ """
+ Returns a list of keys in database
+ """
+ cursor = self._conn.cursor()
+ query = "SELECT %s FROM %s" % (self._queryBy,
+ mdbConstants._DICT_TABLE)
+ cursor.execute(query)
+ return [elem for elem, in cursor]
diff --git a/benchmarks/mysql/mysqlCreateTimeit.py b/benchmarks/mysql/mysqlCreateTimeit.py
new file mode 100755
index 0000000..830d458
--- /dev/null
+++ b/benchmarks/mysql/mysqlCreateTimeit.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+import sys
+import timeit
+
+if __name__ == '__main__':
+ if len(sys.argv) != 3:
+ print "Usage: %s <filename> <fa/fq>" % sys.argv[0]
+ exit(1)
+
+ filename = sys.argv[1]
+ fafq = sys.argv[2]
+
+ fqrunStatement = """
+create.create_db(FASTQFIELDTYPES, iterfunc)
+theFile.close()
+"""
+
+ fqsetupStatement = """
+import os, sys
+import create
+thisdir = sys.path[0]
+libdir = os.path.abspath(os.path.join(thisdir, '..', '..', 'screed'))
+sys.path.insert(0, libdir)
+from fastq import fqiter
+create.droptables()
+FASTQFIELDTYPES = ('name', 'annotations', 'sequence', 'accuracy')
+theFile = open('%s', 'rb')
+iterfunc = fqiter(theFile)
+""" % filename
+
+ farunStatement = """
+create.create_db(FASTAFIELDTYPES, iterfunc)
+theFile.close()
+"""
+
+ fasetupStatement = """
+import os, sys
+import create
+thisdir = sys.path[0]
+libdir = os.path.abspath(os.path.join(thisdir, '..', '..', 'screed'))
+sys.path.insert(0, libdir)
+from fasta import faiter
+create.droptables()
+FASTAFIELDTYPES = ('name', 'description', 'sequence')
+theFile = open('%s', 'rb')
+iterfunc = faiter(theFile)
+""" % filename
+
+ t = None
+ if fafq == 'fasta':
+ t = timeit.Timer(farunStatement, fasetupStatement)
+ elif fafq == 'fastq':
+ t = timeit.Timer(fqrunStatement, fqsetupStatement)
+ else:
+ raise ValueError("Invalid db type specified: %s" % fafq)
+
+ print "[MYSQL CREATE]%s:" % filename
+ print t.repeat(2, 1)
diff --git a/benchmarks/mysql/mysqlTimeit.py b/benchmarks/mysql/mysqlTimeit.py
new file mode 100755
index 0000000..95e99d4
--- /dev/null
+++ b/benchmarks/mysql/mysqlTimeit.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+import timeit
+import sys
+
+if __name__ == '__main__':
+ runStatement = """
+for i in xrange(0, 100000):
+ entry = db.loadRecordByName(random.choice(keys))
+"""
+
+ setupStatement = """
+import os, sys
+import random
+import mydb
+db = mydb.mydb()
+keys = db.keys()
+"""
+
+ t = timeit.Timer(runStatement, setupStatement)
+
+ print "[MYSQL TIMEIT]"
+ print t.repeat(2, 1)
diff --git a/benchmarks/mysql/mysql_login.txt b/benchmarks/mysql/mysql_login.txt
new file mode 100644
index 0000000..17cb089
--- /dev/null
+++ b/benchmarks/mysql/mysql_login.txt
@@ -0,0 +1,2 @@
+dbname: sdb
+user: alex
diff --git a/benchmarks/pgres/create.py b/benchmarks/pgres/create.py
new file mode 100644
index 0000000..7eacd2e
--- /dev/null
+++ b/benchmarks/pgres/create.py
@@ -0,0 +1,73 @@
+import os
+import psycopg2
+import pdbConstants
+
+def create_db(fields, rcrditer):
+ """
+ Populates the pgres database with records from the record iter
+ """
+
+ conn = psycopg2.connect('dbname=%s user=%s' % (pdbConstants._DBNAME,
+ pdbConstants._USER))
+ cur = conn.cursor()
+
+ # Create the admin table
+ cur.execute('CREATE TABLE %s (ID serial PRIMARY KEY, '\
+ 'FIELDNAME TEXT)' % pdbConstants._SCREEDADMIN)
+
+ for attribute in fields:
+ cur.execute("INSERT INTO %s (FIELDNAME) VALUES ('%s')" % \
+ (pdbConstants._SCREEDADMIN, attribute))
+
+ # Setup the dictionary table creation field substring
+ createsub = ','.join(['%s TEXT' % field for field in fields])
+
+ # Create the dictionary table
+ cur.execute('CREATE TABLE %s (%s serial PRIMARY KEY, %s)' %
+ (pdbConstants._DICT_TABLE, pdbConstants._PRIMARY_KEY,
+ createsub))
+
+ # Attribute to index
+ queryby = fields[0]
+
+ # Make the index on the 'queryby' attribute
+ cur.execute('CREATE UNIQUE INDEX %sidx ON %s(%s)' %
+ (queryby, pdbConstants._DICT_TABLE, queryby))
+
+ # Setup the 'perc' pgres substring
+ perc = ', '.join(['%s' for i in range(len(fields))])
+
+ # Setup the sql substring for inserting data into db
+ fieldsub = ','.join(fields)
+
+ # Pull data from rcrditer and store in database
+ for record in rcrditer:
+ data = tuple([record[key] for key in fields])
+ cur.execute('INSERT INTO %s (%s) VALUES (%s)' %\
+ (pdbConstants._DICT_TABLE, fieldsub, perc),
+ data)
+
+ conn.commit()
+ cur.close()
+ conn.close()
+
+def droptables():
+ """
+ Drops tables in db
+ """
+ conn = psycopg2.connect('dbname=%s user=%s' % (pdbConstants._DBNAME,
+ pdbConstants._USER))
+ cur = conn.cursor()
+
+ try:
+ cur.execute('DROP TABLE %s;' % pdbConstants._DICT_TABLE)
+ except:
+ pass
+ try:
+ cur.execute('DROP TABLE %s;' % pdbConstants._SCREEDADMIN)
+ except:
+ pass
+
+ conn.commit()
+ cur.close()
+ conn.close()
diff --git a/benchmarks/pgres/drop.py b/benchmarks/pgres/drop.py
new file mode 100755
index 0000000..1a0d6f4
--- /dev/null
+++ b/benchmarks/pgres/drop.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+
+from create import droptables
+
+if __name__ == '__main__':
+ droptables()
diff --git a/benchmarks/pgres/pdbConstants.py b/benchmarks/pgres/pdbConstants.py
new file mode 100644
index 0000000..60ded7d
--- /dev/null
+++ b/benchmarks/pgres/pdbConstants.py
@@ -0,0 +1,5 @@
+_SCREEDADMIN = 'SCREEDADMIN'
+_DICT_TABLE = 'DICTIONARY_TABLE'
+_PRIMARY_KEY = 'id'
+_DBNAME = 'sdb'
+_USER = 'alex'
diff --git a/benchmarks/pgres/pgdb.py b/benchmarks/pgres/pgdb.py
new file mode 100644
index 0000000..0fd69e0
--- /dev/null
+++ b/benchmarks/pgres/pgdb.py
@@ -0,0 +1,84 @@
+import pdbConstants
+import psycopg2
+import UserDict
+import types
+
+class _pdb_record_dict(UserDict.DictMixin):
+ """
+ Simple dict-like record interface with bag behavior.
+ """
+ def __init__(self, *args, **kwargs):
+ self.d = dict(*args, **kwargs)
+
+ def __getitem__(self, name):
+ return self.d[name]
+
+ def __setitem__(self, name, value):
+ self.d[name] = value
+
+ def __getattr__(self, name):
+ try:
+ return self.d[name]
+ except KeyError:
+ raise AttributeError, name
+
+ def keys(self):
+ return self.d.keys()
+
+class pgdb(object):
+ def __init__(self):
+ self._conn = psycopg2.connect('dbname=%s user=%s' %
+ (pdbConstants._DBNAME,
+ pdbConstants._USER))
+ cur = self._conn.cursor()
+ cur.execute('SELECT id, fieldname FROM %s' % pdbConstants._SCREEDADMIN)
+ self._adm = dict(cur.fetchall())
+ keys = self._adm.keys()
+ keys.sort()
+
+ self._fields = self._adm.values()
+ self._fields.insert(0, pdbConstants._PRIMARY_KEY.lower())
+ self._fieldStr = ",".join(self._fields)
+
+ self._queryBy = self._adm[keys[0]]
+
+ def close(self):
+ """
+ Closes the database handles
+ """
+ self._conn.close()
+
+ def loadRecordByIndex(self, idx):
+ """
+ Loads a record from the database by index
+ """
+
+ def loadRecordByName(self, key):
+ """
+ As above, by name
+ """
+ cursor = self._conn.cursor()
+ query = "SELECT %s FROM %s WHERE %s='%s'" % (self._queryBy,
+ pdbConstants._DICT_TABLE,
+ self._queryBy,
+ key)
+ cursor.execute(query)
+ if type(cursor.fetchone()) == types.NoneType:
+ raise KeyError("Key %s not found" % key)
+
+ query = "SELECT %s FROM %s WHERE %s='%s'" % (self._fieldStr,
+ pdbConstants._DICT_TABLE,
+ self._queryBy,
+ key)
+ cursor.execute(query)
+ return _pdb_record_dict(zip(self._fields, cursor.fetchone()))
+
+ def keys(self):
+ """
+ Returns a list of keys in database
+ """
+ cursor = self._conn.cursor()
+ query = "SELECT %s FROM %s" % (self._queryBy,
+ pdbConstants._DICT_TABLE)
+ cursor.execute(query)
+ return [elem for elem, in cursor]
diff --git a/benchmarks/pgres/pgresCreateTimeit.py b/benchmarks/pgres/pgresCreateTimeit.py
new file mode 100755
index 0000000..37e1963
--- /dev/null
+++ b/benchmarks/pgres/pgresCreateTimeit.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+import sys
+import timeit
+
+if __name__ == '__main__':
+ if len(sys.argv) != 3:
+ print "Usage: %s <filename> <fa/fq>" % sys.argv[0]
+ exit(1)
+
+ filename = sys.argv[1]
+ fafq = sys.argv[2]
+
+ fqrunStatement = """
+create.create_db(FASTQFIELDTYPES, iterfunc)
+theFile.close()
+"""
+
+ fqsetupStatement = """
+import os, sys
+import create
+thisdir = sys.path[0]
+libdir = os.path.abspath(os.path.join(thisdir, '..', '..', 'screed'))
+sys.path.insert(0, libdir)
+from fastq import fqiter
+create.droptables()
+FASTQFIELDTYPES = ('name', 'annotations', 'sequence', 'accuracy')
+theFile = open('%s', 'rb')
+iterfunc = fqiter(theFile)
+""" % filename
+
+ farunStatement = """
+create.create_db(FASTAFIELDTYPES, iterfunc)
+theFile.close()
+"""
+
+ fasetupStatement = """
+import os, sys
+import create
+thisdir = sys.path[0]
+libdir = os.path.abspath(os.path.join(thisdir, '..', '..', 'screed'))
+sys.path.insert(0, libdir)
+from fasta import faiter
+create.droptables()
+FASTAFIELDTYPES = ('name', 'description', 'sequence')
+theFile = open('%s', 'rb')
+iterfunc = faiter(theFile)
+""" % filename
+
+ t = None
+ if fafq == 'fasta':
+ t = timeit.Timer(farunStatement, fasetupStatement)
+ elif fafq == 'fastq':
+ t = timeit.Timer(fqrunStatement, fqsetupStatement)
+ else:
+ raise ValueError("Invalid db type specified: %s" % fafq)
+
+ print "[PGRES CREATE]%s:" % filename
+ print t.repeat(2, 1)
diff --git a/benchmarks/pgres/pgresTimeit.py b/benchmarks/pgres/pgresTimeit.py
new file mode 100755
index 0000000..1d7609b
--- /dev/null
+++ b/benchmarks/pgres/pgresTimeit.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+import timeit
+import sys
+
+if __name__ == '__main__':
+ runStatement = """
+for i in xrange(0, 100000):
+ entry = db.loadRecordByName(random.choice(keys))
+"""
+
+ setupStatement = """
+import os, sys
+import random
+import pgdb
+db = pgdb.pgdb()
+keys = db.keys()
+"""
+
+ t = timeit.Timer(runStatement, setupStatement)
+
+ print "[PGRES RUN]"
+ print t.repeat(2, 1)
diff --git a/benchmarks/pgres/pgres_login.txt b/benchmarks/pgres/pgres_login.txt
new file mode 100644
index 0000000..48fe909
--- /dev/null
+++ b/benchmarks/pgres/pgres_login.txt
@@ -0,0 +1,5 @@
+dbname: sdb
+user: postgres
+pass: blah
+
+user: alex
diff --git a/benchmarks/screedCreateTimeit.py b/benchmarks/screedCreateTimeit.py
new file mode 100755
index 0000000..8329003
--- /dev/null
+++ b/benchmarks/screedCreateTimeit.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+import sys
+import timeit
+
+if __name__ == '__main__':
+ if len(sys.argv) != 3:
+ print "Usage: %s <filename> <fa/fq>" % sys.argv[0]
+ exit(1)
+
+ filename = sys.argv[1]
+ fafq = sys.argv[2]
+
+ fqrunStatement = """
+createscreed.create_db(filename, fastq.FieldTypes, iterfunc)
+theFile.close()
+"""
+
+ fqsetupStatement = """
+import os, sys
+thisdir = sys.path[0]
+libdir = os.path.abspath(os.path.join(thisdir, '..', 'screed'))
+sys.path.insert(0, libdir)
+import createscreed
+import fastq
+FASTQFIELDTYPES = ('name', 'annotations', 'sequence', 'accuracy')
+filename = '%s'
+theFile = open(filename, 'rb')
+iterfunc = fastq.fastq_iter(theFile)
+""" % filename
+
+ farunStatement = """
+createscreed.create_db(filename, fasta.FieldTypes, iterfunc)
+theFile.close()
+"""
+
+ fasetupStatement = """
+import os, sys
+thisdir = sys.path[0]
+libdir = os.path.abspath(os.path.join(thisdir, '..', 'screed'))
+sys.path.insert(0, libdir)
+import createscreed
+import fasta
+FASTAFIELDTYPES = ('name', 'description', 'sequence')
+filename = '%s'
+theFile = open(filename, 'rb')
+iterfunc = fasta.fasta_iter(theFile)
+""" % filename
+
+ t = None
+ if fafq == 'fasta':
+ t = timeit.Timer(farunStatement, fasetupStatement)
+ elif fafq == 'fastq':
+ t = timeit.Timer(fqrunStatement, fqsetupStatement)
+ else:
+ raise ValueError("Invalid db type specified: %s" % fafq)
+
+ print "[SCREED CREATE]%s:" % filename
+ print t.repeat(2, 1)
diff --git a/benchmarks/screedTimeit.py b/benchmarks/screedTimeit.py
new file mode 100755
index 0000000..aee3493
--- /dev/null
+++ b/benchmarks/screedTimeit.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+
+import timeit
+import sys
+import os
+
+if __name__ == '__main__':
+ if len(sys.argv) != 2:
+ print "Usage: %s <filename>" % sys.argv[0]
+ exit(1)
+
+ screedFile = sys.argv[1]
+ if not os.path.isfile(screedFile):
+ print "No such file: %s" % screedFile
+ exit(1)
+
+ runStatement = """
+for i in xrange(0, 100000):
+ entry = str(db[random.choice(keys)].sequence)
+"""
+
+ setupStatement = """
+import os, sys
+import random
+thisdir = sys.path[0]
+libdir = os.path.abspath(os.path.join(thisdir, '..'))
+sys.path.insert(0, libdir)
+import screed
+db = screed.openscreed.ScreedDB('%s')
+keys = db.keys()
+""" % screedFile
+
+ t = timeit.Timer(runStatement, setupStatement)
+
+ print "[SCREED RUN]%s:" % screedFile
+ print t.repeat(2, 1)
diff --git a/benchmarks/screedTimeit1M.py b/benchmarks/screedTimeit1M.py
new file mode 100755
index 0000000..f5f70e1
--- /dev/null
+++ b/benchmarks/screedTimeit1M.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+
+import timeit
+import sys
+import os
+
+if __name__ == '__main__':
+ if len(sys.argv) != 2:
+ print "Usage: %s <filename>" % sys.argv[0]
+ exit(1)
+
+ screedFile = sys.argv[1]
+ if not os.path.isfile(screedFile):
+ print "No such file: %s" % screedFile
+ exit(1)
+
+ runStatement = """
+for i in xrange(0, 100000):
+ entry = str(db[random.choice(keys)].sequence)
+"""
+
+ setupStatement = """
+import os, sys
+import random
+thisdir = sys.path[0]
+libdir = os.path.abspath(os.path.join(thisdir, '..'))
+sys.path.insert(0, libdir)
+import screed
+db = screed.openscreed.ScreedDB('%s')
+keys = []
+for i, k in enumerate(db.iterkeys()):
+ if i > 1000000:
+ break
+ keys.append(k)
+""" % screedFile
+
+ t = timeit.Timer(runStatement, setupStatement)
+
+ print "[SCREED RUN]%s:" % screedFile
+ print t.repeat(2, 1)
diff --git a/bigtests/__init__.py b/bigtests/__init__.py
new file mode 100755
index 0000000..08e1d22
--- /dev/null
+++ b/bigtests/__init__.py
@@ -0,0 +1,1181 @@
+import sys, os, gc
+import urllib, tarfile
+
+thisdir = os.path.dirname(__file__)
+libdir = os.path.abspath(os.path.join(thisdir, '..', 'screed'))
+sys.path.insert(0, libdir)
+
+from screed import read_fastq_sequences
+from screed import read_fasta_sequences
+from screed import ScreedDB
+
+tests22 = os.path.join(thisdir, 's_2_2_sequence.fastq')
+tests31 = os.path.join(thisdir, 's_3_1_sequence.fastq')
+tests42 = os.path.join(thisdir, 's_4_2_sequence.fastq')
+pongo = os.path.join(thisdir, 'Pongo_pygmaeus.PPYG2.50.cdna.abinitio.fa')
+tri = os.path.join(thisdir, 'triCas2.fa')
+mus = os.path.join(thisdir, 'Mus_musculus.NCBIM37.50.dna_rm.chromosome.9.fa')
+xeno = os.path.join(thisdir, 'Xenopus_tropicalis.JGI4.1.50.dna.toplevel.fa')
+sorex = os.path.join(thisdir, 'Sorex_araneus.COMMON_SHREW1.53.dna.toplevel.fa')
+
+def getfile(f):
+ """
+ Downloads and extracts the given file
+ """
+ filetype = f[1]
+ filename = "%s.tar.gz" % f[0]
+ urlname = os.path.split(filename)[1]
+ base_url = 'http://lyorn.idyll.org/~nolleyal/genomes/%s/%s' % \
+ (filetype, urlname)
+
+ fp = open(filename, 'wb')
+ try:
+ up = urllib.urlopen(base_url)
+ except IOError:
+ raise IOError, "Error downloading testfiles, are you connected to " +\
+ "the internet?"
+ fp.write(up.read())
+ fp.close()
+
+ tar = tarfile.open(filename)
+ tar.extractall(path=thisdir)
+ tar.close()
+ os.unlink(filename)
+ return
+
+def setup():
+ # Create databases
+ endings = ['_screed']
+ filenames = [(tests22, 'fastq'), (tests31, 'fastq'), (tests42, 'fastq'),
+ (pongo, 'fasta'), (tri, 'fasta'), (mus, 'fasta'), (xeno, 'fasta'),
+ (sorex, 'fasta')]
+ for f in filenames:
+ fname = f[0]
+ if not os.path.isfile(fname): # Download files if necessary
+ getfile(f)
+ parser = None
+ if f[1] == 'fasta':
+ parser = read_fasta_sequences
+ elif f[1] == 'fastq':
+ parser = read_fastq_sequences
+ created = True
+ for end in endings:
+ if not os.path.isfile(fname + end):
+ created = False
+ if created == False:
+ parser(fname)
+
+class Test_s22_fastq:
+ """
+ Test screed methods on the s22 fastq file
+ """
+ def setup(self):
+ self.db = ScreedDB(tests22 + '_screed')
+
+ def tearDown(self):
+ del self.db
+ gc.collect()
+
+ def test_iteration(self):
+ """
+ Runs through the database, accessing each element by index and then by
+ name
+ """
+ for idx in xrange(0, len(self.db)):
+ rcrd = self.db.loadRecordByIndex(idx)
+ nameRcrd = self.db[rcrd.name]
+ assert rcrd == nameRcrd
+
+ def test_dict_stuff(self):
+ """
+ Tests some dictionary methods on the database
+ """
+ keys = self.db.keys()
+ ikeys = list(self.db.iterkeys())
+ assert sorted(keys) == sorted(ikeys)
+ del keys
+ del ikeys
+ gc.collect()
+
+ def test_contains(self):
+ for k in self.db:
+ assert k in self.db
+
+ assert not 'FOO' in self.db
+
+ def test_get(self):
+ for k in self.db:
+ record = self.db.get(k)
+ assert record.name == k
+
+ record = self.db[k]
+ assert record.name == k
+
+ assert self.db.get('FOO') == None
+ try:
+ self.db['FOO']
+ assert False, "the previous line should raise a KeyError"
+ except KeyError:
+ pass
+
+ def test_missing(self):
+ """
+ Make sure that unsupported dict attributes are actually missing.
+ """
+ db = self.db
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.update({})
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.setdefault(None)
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.pop()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.popitem()
+ assert 0
+ except AttributeError:
+ pass
+
+ def test_certain_records(self):
+ """
+ Pulls first, last, middle and few other records out of database and
+ compares them to known quantities
+ """
+ testcases = {}
+ testcases['HWI-EAS_4_PE-FC20GCB:2:1:492:573/2'] = {
+ 'id': 0,
+ 'annotations': '',
+ 'accuracy': 'AA7AAA3+AAAAAA.AAA.;7;AA;;;;*;<1;<<<',
+ 'name' : 'HWI-EAS_4_PE-FC20GCB:2:1:492:573/2',
+ 'sequence': 'ACAGCAAAATTGTGATTGAGGATGAAGAACTGCTGT'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:2:162:131:826/2'] = {
+ 'id': 1895228,
+ 'annotations': '',
+ 'accuracy': 'AAAAAAAAAAAAAAAAAAAAAA+@6=7A<05<*15:',
+ 'name': 'HWI-EAS_4_PE-FC20GCB:2:162:131:826/2',
+ 'sequence': 'ATGAATACAAACAATGCGGCAGTCATAATGCCCCTC'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:2:330:88:628/2'] = {
+ 'id': 3790455,
+ 'annotations': '',
+ 'accuracy' : 'AA;AA??A5A;;+AA?AAAA;AA;9AA.AA?????9',
+ 'name': 'HWI-EAS_4_PE-FC20GCB:2:330:88:628/2',
+ 'sequence': 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAA'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:2:4:707:391/2'] = {
+ 'id': 29999,
+ 'annotations': '',
+ 'accuracy': 'AAAAAAAAAA@<)A*AAA6A::<@AA<>A>-8?>4<',
+ 'name': 'HWI-EAS_4_PE-FC20GCB:2:4:707:391/2',
+ 'sequence': 'ATTAATCTCCAGTTTCTGGCAAACATTCAGGCCATT'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:2:36:158:208/2'] = {
+ 'id': 342842,
+ 'annotations': '',
+ 'accuracy': 'AA5?AAAAA?AAAA5?AAA5A???5A>AAA4?;.;;',
+ 'name': 'HWI-EAS_4_PE-FC20GCB:2:36:158:208/2',
+ 'sequence': 'TTTCCCTACAGAAGTGTCTGTACCGGTAATAAAGAA'}
+
+ for case in testcases:
+ assert testcases[case] == self.db[case]
+
+class Test_s31_fastq:
+ """
+ Test screed methods on the s31 fastq file
+ """
+ def setup(self):
+ self.db = ScreedDB(tests31 + '_screed')
+
+ def tearDown(self):
+ del self.db
+ gc.collect()
+
+ def test_iteration(self):
+ """
+ Runs through the database, accessing each element by index and then by
+ name
+ """
+ for idx in xrange(0, len(self.db)):
+ rcrd = self.db.loadRecordByIndex(idx)
+ nameRcrd = self.db[rcrd.name]
+ assert rcrd == nameRcrd
+
+ def test_dict_stuff(self):
+ """
+ Tests some dictionary methods on the database
+ """
+ keys = self.db.keys()
+ ikeys = list(self.db.iterkeys())
+ assert sorted(keys) == sorted(ikeys)
+ del keys
+ del ikeys
+ gc.collect()
+
+ def test_contains(self):
+ for k in self.db:
+ assert k in self.db
+
+ assert not 'FOO' in self.db
+
+ def test_get(self):
+ for k in self.db:
+ record = self.db.get(k)
+ assert record.name == k
+
+ record = self.db[k]
+ assert record.name == k
+
+ assert self.db.get('FOO') == None
+ try:
+ self.db['FOO']
+ assert False, "the previous line should raise a KeyError"
+ except KeyError:
+ pass
+
+ def test_missing(self):
+ """
+ Make sure that unsupported dict attributes are actually missing.
+ """
+ db = self.db
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.update({})
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.setdefault(None)
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.pop()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.popitem()
+ assert 0
+ except AttributeError:
+ pass
+
+ def test_certain_records(self):
+ """
+ Pulls first, last, middle and few other records out of database and
+ compares them to known quantities
+ """
+ testcases = {}
+ testcases['HWI-EAS_4_PE-FC20GCB:3:1:71:840/1'] = {
+ 'id': 0,
+ 'annotations': '',
+ 'accuracy': 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC',
+ 'name' : 'HWI-EAS_4_PE-FC20GCB:3:1:71:840/1',
+ 'sequence': 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:3:330:957:433/1'] = {
+ 'id': 4439695,
+ 'annotations': '',
+ 'accuracy': 'AAAAAAAAAAA<A?<AA<AAAAA?AAA?<:*??&::',
+ 'name': 'HWI-EAS_4_PE-FC20GCB:3:330:957:433/1',
+ 'sequence': 'CTTTGTGGAGAAGAGGGCGTGGGCAAGGCACTGATA'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:3:166:443:410/1'] = {
+ 'id': 2219847,
+ 'annotations': '',
+ 'accuracy' : 'AAAAAAAAAAAAAAAAAAAAAAAA6<@AA959???%',
+ 'name': 'HWI-EAS_4_PE-FC20GCB:3:166:443:410/1',
+ 'sequence': 'TGGCATTCGCACACATCATGATGGTGCTGACCGTAA'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:3:1:803:878/1'] = {
+ 'id': 2999,
+ 'annotations': '',
+ 'accuracy': '?6AAA6A<A6AA<<AA?A&A066/6:/&?&1191+0',
+ 'name': 'HWI-EAS_4_PE-FC20GCB:3:1:803:878/1',
+ 'sequence': 'AAGATGCTGTAGTGGCCGCATGTGTAATAGGCTTTA'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:3:245:54:506/1'] = {
+ 'id': 3329772,
+ 'annotations': '',
+ 'accuracy': "AAAAAAAAAAAAAAAA>A+AAA+@AA+A>A%8*?'%",
+ 'name': 'HWI-EAS_4_PE-FC20GCB:3:245:54:506/1',
+ 'sequence': 'CTTCGTTGCTGTTTATCAGTAACTTTTTCTGGCTAG'}
+
+ for case in testcases:
+ assert testcases[case] == self.db[case]
+
+class Test_s42_fastq:
+ """
+ Test screed methods on the s42 fastq file
+ """
+ def setup(self):
+ self.db = ScreedDB(tests42 + '_screed')
+
+ def tearDown(self):
+ del self.db
+ gc.collect()
+
+ def test_iteration(self):
+ """
+ Runs through the database, accessing each element by index and then by
+ name
+ """
+ for idx in xrange(0, len(self.db)):
+ rcrd = self.db.loadRecordByIndex(idx)
+ nameRcrd = self.db[rcrd.name]
+ assert rcrd == nameRcrd
+
+
+ def test_dict_stuff(self):
+ """
+ Tests some dictionary methods on the database
+ """
+ keys = self.db.keys()
+ ikeys = list(self.db.iterkeys())
+ assert sorted(keys) == sorted(ikeys)
+ del keys
+ del ikeys
+ gc.collect()
+
+ def test_contains(self):
+ for k in self.db:
+ assert k in self.db
+
+ assert not 'FOO' in self.db
+
+ def test_get(self):
+ for k in self.db:
+ record = self.db.get(k)
+ assert record.name == k
+
+ record = self.db[k]
+ assert record.name == k
+
+ assert self.db.get('FOO') == None
+ try:
+ self.db['FOO']
+ assert False, "the previous line should raise a KeyError"
+ except KeyError:
+ pass
+
+ def test_missing(self):
+ """
+ Make sure that unsupported dict attributes are actually missing.
+ """
+ db = self.db
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.update({})
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.setdefault(None)
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.pop()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.popitem()
+ assert 0
+ except AttributeError:
+ pass
+
+ def test_certain_records(self):
+ """
+ Pulls first, last, middle and few other records out of database and
+ compares them to known quantities
+ """
+ testcases = {}
+ testcases['HWI-EAS_4_PE-FC20GCB:4:1:257:604/2'] = {
+ 'id': 0,
+ 'annotations': '',
+ 'accuracy': 'AAAAAAAA:4>>AAA:44>>->-&4;8+8826;66.',
+ 'name' : 'HWI-EAS_4_PE-FC20GCB:4:1:257:604/2',
+ 'sequence': 'TGTGGATAGTCGCCCGTGATGGCGTCGAAGTTCCGG'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:4:330:96:902/2'] = {
+ 'id': 4148632,
+ 'annotations': '',
+ 'accuracy': 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA??????',
+ 'name': 'HWI-EAS_4_PE-FC20GCB:4:330:96:902/2',
+ 'sequence': 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:4:166:158:532/2'] = {
+ 'id': 2074316,
+ 'annotations': '',
+ 'accuracy' : 'AAAAAAA?A?AAAAAAA?A>A?A?AAAAAA?.<?-?',
+ 'name': 'HWI-EAS_4_PE-FC20GCB:4:166:158:532/2',
+ 'sequence': 'ATCGCCAATGCCCAGGCCTGGTTCTCTTTAACCTAT'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:4:1:332:634/2'] = {
+ 'id': 3000,
+ 'annotations': '',
+ 'accuracy': '?A?AAAAAAAAA8>AAAAAA*AA?A?AA.?)<9)9?',
+ 'name': 'HWI-EAS_4_PE-FC20GCB:4:1:332:634/2',
+ 'sequence': 'ACCGTGCCAGATCAGAACCTAGTGGCGATTCCAATT'}
+
+ testcases['HWI-EAS_4_PE-FC20GCB:4:242:843:13/2'] = {
+ 'id': 3111474,
+ 'annotations': '',
+ 'accuracy': "ABAAACA?CAAA??%A;2A;/5/&:?-*1-'11%71",
+ 'name': 'HWI-EAS_4_PE-FC20GCB:4:242:843:13/2',
+ 'sequence': 'GTTTCTATATTCTGGCGTTAGTCGTCGCCGATAATT'}
+
+ for case in testcases:
+ assert testcases[case] == self.db[case]
+
+
+class Test_po_fasta:
+ """
+ Test screed methods on the pongo fasta file
+ """
+ def setup(self):
+ self.db = ScreedDB(pongo + '_screed')
+
+ def tearDown(self):
+ del self.db
+ gc.collect()
+
+ def test_iteration(self):
+ """
+ Runs through the database, accessing each element by index and then by
+ name
+ """
+ for idx in xrange(0, len(self.db)):
+ rcrd = self.db.loadRecordByIndex(idx)
+ nameRcrd = self.db[rcrd.name]
+ assert rcrd == nameRcrd
+
+ def test_dict_stuff(self):
+ """
+ Tests some dictionary methods on the database
+ """
+ keys = self.db.keys()
+ ikeys = list(self.db.iterkeys())
+ assert sorted(keys) == sorted(ikeys)
+ del keys
+ del ikeys
+ gc.collect()
+
+ def test_contains(self):
+ for k in self.db:
+ assert k in self.db
+
+ assert not 'FOO' in self.db
+
+ def test_get(self):
+ for k in self.db:
+ record = self.db.get(k)
+ assert record.name == k
+
+ record = self.db[k]
+ assert record.name == k
+
+ assert self.db.get('FOO') == None
+ try:
+ self.db['FOO']
+ assert False, "the previous line should raise a KeyError"
+ except KeyError:
+ pass
+
+ def test_missing(self):
+ """
+ Make sure that unsupported dict attributes are actually missing.
+ """
+ db = self.db
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.update({})
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.setdefault(None)
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.pop()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.popitem()
+ assert 0
+ except AttributeError:
+ pass
+
+ def test_certain_records(self):
+ """
+ Pulls first, last, middle and few other records out of database and
+ compares them to known quantities
+ """
+ testcases = {}
+ testcases['GENSCAN00000032971'] = {
+ 'id': 0,
+ 'description': 'cdna:Genscan chromosome:PPYG2:6_qbl_hap2_random' \
+ ':95622:98297:1',
+ 'name' : 'GENSCAN00000032971',
+ 'sequence': 'ATGGCGCCCCGAACCCTCCTCCTGCTGCTCTCGGCGGCCCTGGCCCCGAC' \
+ 'CGAGACCTGG'}
+ testcases['GENSCAN00000042282'] = {
+ 'id': 53997,
+ 'description': 'cdna:Genscan chromosome:PPYG2:1:229892060:22989' \
+ '2800:1',
+ 'name': 'GENSCAN00000042282',
+ 'sequence': 'ATGATGCCATTGCAAGGACCCTCTGCAGGGCCTCAGTCCCGAGGATGGCA' \
+ 'CACAGCCTTC'}
+ testcases['GENSCAN00000051311'] = {
+ 'id': 30780,
+ 'description' : 'cdna:Genscan chromosome:PPYG2:10:132962172:132' \
+ '962871:1',
+ 'name': 'GENSCAN00000051311',
+ 'sequence': 'ATGACCCAGCCACCTACCAGGCCGCTCTGCAGACCCCCCACGGGAGCAGC' \
+ 'CTCTGCCCCC'}
+ testcases['GENSCAN00000006030'] = {
+ 'id': 1469,
+ 'description': 'cdna:Genscan chromosome:PPYG2:14_random:1765749' \
+ ':1766075:-1',
+ 'name': 'GENSCAN00000006030',
+ 'sequence': 'ATGTGTGGCAACAAGGGCATTTCTGCCTTCCCTGAATCAGACCACCTTTT' \
+ 'CACATGGGTA'}
+ testcases['GENSCAN00000048263'] = {
+ 'id': 43029,
+ 'description': 'cdna:Genscan chromosome:PPYG2:6:100388173:10048' \
+ '5454:-1',
+ 'name': 'GENSCAN00000048263',
+ 'sequence': 'ATGTGTCCCTTTGAATATGCCGGAGAACAACAGTTGCCATGGATGTGTTC' \
+ 'TGGGGAGCCC'}
+
+ for case in testcases:
+ assert testcases[case]['name'] == self.db[case]['name']
+ assert testcases[case]['description'] == self.db[case]\
+ ['description']
+ assert str(self.db[case]['sequence']).startswith(testcases[case]\
+ ['sequence'])
+
+class Test_mus_fasta:
+ """
+ Test screed methods on the mus_musculus fasta file
+ """
+ def setup(self):
+ self.db = ScreedDB(mus + '_screed')
+
+ def tearDown(self):
+ del self.db
+ gc.collect()
+
+ def test_iteration(self):
+ """
+ Runs through the database, accessing each element by index and then by
+ name
+ """
+ for idx in xrange(0, len(self.db)):
+ rcrd = self.db.loadRecordByIndex(idx)
+ nameRcrd = self.db[rcrd.name]
+ assert rcrd == nameRcrd
+
+ def test_dict_stuff(self):
+ """
+ Tests some dictionary methods on the database
+ """
+ keys = self.db.keys()
+ ikeys = list(self.db.iterkeys())
+ assert sorted(keys) == sorted(ikeys)
+ del keys
+ del ikeys
+ gc.collect()
+
+ def test_contains(self):
+ for k in self.db:
+ assert k in self.db
+
+ assert not 'FOO' in self.db
+
+ def test_get(self):
+ for k in self.db:
+ record = self.db.get(k)
+ assert record.name == k
+
+ record = self.db[k]
+ assert record.name == k
+
+ assert self.db.get('FOO') == None
+ try:
+ self.db['FOO']
+ assert False, "the previous line should raise a KeyError"
+ except KeyError:
+ pass
+
+ def test_missing(self):
+ """
+ Make sure that unsupported dict attributes are actually missing.
+ """
+ db = self.db
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.update({})
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.setdefault(None)
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.pop()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.popitem()
+ assert 0
+ except AttributeError:
+ pass
+
+ def test_certain_records(self):
+ """
+ Pulls first, last, middle and few other records out of database and
+ compares them to known quantities
+ """
+ testcases = {}
+ testcases['9'] = {
+ 'id': 0,
+ 'description': 'dna_rm:chromosome chromosome:NCBIM37:9:1:124076' \
+ '172:1',
+ 'name' : '9',
+ 'sequence': 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN' \
+ 'NNNNNNNNNN'}
+
+ for case in testcases:
+ assert testcases[case]['name'] == self.db[case]['name']
+ assert testcases[case]['description'] == self.db[case]\
+ ['description']
+ assert str(self.db[case]['sequence']).startswith(testcases[case]\
+ ['sequence'])
+
+
+class Test_tri_fasta:
+ """
+ Test screed methods on the tri fasta file
+ """
+ def setup(self):
+ self.db = ScreedDB(tri + '_screed')
+
+ def tearDown(self):
+ del self.db
+ gc.collect()
+
+ def test_iteration(self):
+ """
+ Runs through the database, accessing each element by index and then by
+ name
+ """
+ for idx in xrange(0, len(self.db)):
+ rcrd = self.db.loadRecordByIndex(idx)
+ nameRcrd = self.db[rcrd.name]
+ assert rcrd == nameRcrd
+
+ def test_dict_stuff(self):
+ """
+ Tests some dictionary methods on the database
+ """
+ keys = self.db.keys()
+ ikeys = list(self.db.iterkeys())
+ assert sorted(keys) == sorted(ikeys)
+ del keys
+ del ikeys
+ gc.collect()
+
+ def test_contains(self):
+ for k in self.db:
+ assert k in self.db
+
+ assert not 'FOO' in self.db
+
+ def test_get(self):
+ for k in self.db:
+ record = self.db.get(k)
+ assert record.name == k
+
+ record = self.db[k]
+ assert record.name == k
+
+ assert self.db.get('FOO') == None
+ try:
+ self.db['FOO']
+ assert False, "the previous line should raise a KeyError"
+ except KeyError:
+ pass
+
+ def test_missing(self):
+ """
+ Make sure that unsupported dict attributes are actually missing.
+ """
+ db = self.db
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.update({})
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.setdefault(None)
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.pop()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.popitem()
+ assert 0
+ except AttributeError:
+ pass
+
+ def test_certain_records(self):
+ """
+ Pulls first, last, middle and few other records out of database and
+ compares them to known quantities
+ """
+ testcases = {}
+ testcases['singleUn_100'] = {
+ 'id': 0,
+ 'description': '',
+ 'name' : 'singleUn_100',
+ 'sequence': 'TTTAAACACGTGTCCGCGCCATTTTTTTATTTATTTACCGATCAAGTGCA'}
+ testcases['singleUn_9'] = {
+ 'id': 2210,
+ 'description': '',
+ 'name': 'singleUn_9',
+ 'sequence': 'TTTAATTTTTTTACAACTCAAAATTTTGAGTAGTGTTTTAAATAGTACAC'}
+ testcases['ChLG6'] = {
+ 'id': 2016,
+ 'description' : '',
+ 'name': 'ChLG6',
+ 'sequence': 'CAAAAAAATTCATAACTCAAAAACTAAAAGTCGTAGAGCAATGCGGTTTG'}
+ testcases['singleUn_286'] = {
+ 'id': 186,
+ 'description': '',
+ 'name': 'singleUn_286',
+ 'sequence': 'AAACTAAAACATCCTTTTCAGCATATTATTTGTTATATTTAAAAAAAAAC'}
+ testcases['ChLG9'] = {
+ 'id': 2019,
+ 'description': '',
+ 'name': 'ChLG9',
+ 'sequence': 'CTGCCGATAATATTTCCTACCAGAAATAACCAATTTATTTTACGTATTAC'}
+
+ for case in testcases:
+ assert testcases[case]['name'] == self.db[case]['name']
+ assert testcases[case]['description'] == self.db[case]\
+ ['description']
+ assert str(self.db[case]['sequence']).startswith(testcases[case]\
+ ['sequence'])
+
+
+class Test_xeno_fasta:
+ """
+ Test screed methods on the xeno fasta file
+ """
+ def setup(self):
+ self.db = ScreedDB(xeno + '_screed')
+
+ def tearDown(self):
+ del self.db
+ gc.collect()
+
+ def test_iteration(self):
+ """
+ Runs through the database, accessing each element by index and then by
+ name
+ """
+ for idx in xrange(0, len(self.db)):
+ rcrd = self.db.loadRecordByIndex(idx)
+ nameRcrd = self.db[rcrd.name]
+ assert rcrd == nameRcrd
+
+ def test_dict_stuff(self):
+ """
+ Tests some dictionary methods on the database
+ """
+ keys = self.db.keys()
+ ikeys = list(self.db.iterkeys())
+ assert sorted(keys) == sorted(ikeys)
+ del keys
+ del ikeys
+ gc.collect()
+
+ def test_contains(self):
+ for k in self.db:
+ assert k in self.db
+
+ assert not 'FOO' in self.db
+
+ def test_get(self):
+ for k in self.db:
+ record = self.db.get(k)
+ assert record.name == k
+
+ record = self.db[k]
+ assert record.name == k
+
+ assert self.db.get('FOO') == None
+ try:
+ self.db['FOO']
+ assert False, "the previous line should raise a KeyError"
+ except KeyError:
+ pass
+
+ def test_missing(self):
+ """
+ Make sure that unsupported dict attributes are actually missing.
+ """
+ db = self.db
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.update({})
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.setdefault(None)
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.pop()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.popitem()
+ assert 0
+ except AttributeError:
+ pass
+
+ def test_certain_records(self):
+ """
+ Pulls first, last, middle and few other records out of database and
+ compares them to known quantities
+ """
+ testcases = {}
+ testcases['scaffold_20095'] = {
+ 'id': 0,
+ 'description': 'dna:scaffold scaffold:JGI4.1:scaffold_20095:1:2'\
+ '001:1',
+ 'name' : 'scaffold_20095',
+ 'sequence': 'GATGAGATCACCTTTCATGCTTTTTGTATCCCTATTATCTAGAGACAACAA'\
+ 'ATCAGTTGC'}
+ testcases['scaffold_1'] = {
+ 'id': 19500,
+ 'description': 'dna:scaffold scaffold:JGI4.1:scaffold_1:1:781781'\
+ '4:1',
+ 'name': 'scaffold_1',
+ 'sequence': 'CCTCCCTTTTTGGCTGTCTTTTCACTGTATCATAGCCTGGCGTGAACCAAG'\
+ 'CCTCAAAAA'}
+ testcases['scaffold_271'] = {
+ 'id': 19230,
+ 'description' : 'dna:scaffold scaffold:JGI4.1:scaffold_271:1:156'\
+ '7461:1',
+ 'name': 'scaffold_271',
+ 'sequence': 'CGATTTTTGCGGAAAAACGCGAGTTTTTGGTAGCCATTCCGAAAGTTGCGA'\
+ 'TTTTTTGTA'}
+ testcases['scaffold_19901'] = {
+ 'id': 329,
+ 'description': 'dna:scaffold scaffold:JGI4.1:scaffold_19901:1:22'\
+ '56:1',
+ 'name': 'scaffold_19901',
+ 'sequence': 'ATACCGCAAAGGTTTCTTTCTTCTCAGTGCTCCATGCTGCCTCTCTTGTTT'\
+ 'TGCCTCCCT'}
+ testcases['scaffold_95'] = {
+ 'id': 19408,
+ 'description': 'dna:scaffold scaffold:JGI4.1:scaffold_95:1:28996'\
+ '70:1',
+ 'name': 'scaffold_95',
+ 'sequence': 'CCCTCCTGGTGATCCCACTTCAATCTCCCCATAGGCACACATCACTTCTAG'\
+ 'CAGTTCACA'}
+
+ for case in testcases:
+ assert testcases[case]['name'] == self.db[case]['name']
+ assert testcases[case]['description'] == self.db[case]\
+ ['description']
+ assert str(self.db[case]['sequence']).startswith(testcases[case]\
+ ['sequence'])
+
+class Test_sorex_fasta:
+ """
+ Test screed methods on the sorex fasta file
+ """
+ def setup(self):
+ self.db = ScreedDB(sorex + '_screed')
+
+ def tearDown(self):
+ del self.db
+ gc.collect()
+
+ def test_iteration(self):
+ """
+ Runs through the database, accessing each element by index and then by
+ name
+ """
+ for idx in xrange(0, len(self.db)):
+ rcrd = self.db.loadRecordByIndex(idx)
+ nameRcrd = self.db[rcrd.name]
+ assert rcrd == nameRcrd
+
+ def test_dict_stuff(self):
+ """
+ Tests some dictionary methods on the database
+ """
+ keys = self.db.keys()
+ ikeys = list(self.db.iterkeys())
+ assert sorted(keys) == sorted(ikeys)
+ del keys
+ del ikeys
+ gc.collect()
+
+ def test_contains(self):
+ for k in self.db:
+ assert k in self.db
+
+ assert not 'FOO' in self.db
+
+ def test_get(self):
+ for k in self.db:
+ record = self.db.get(k)
+ assert record.name == k
+
+ record = self.db[k]
+ assert record.name == k
+
+ assert self.db.get('FOO') == None
+ try:
+ self.db['FOO']
+ assert False, "the previous line should raise a KeyError"
+ except KeyError:
+ pass
+
+ def test_missing(self):
+ """
+ Make sure that unsupported dict attributes are actually missing.
+ """
+ db = self.db
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.update({})
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.clear()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.setdefault(None)
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.pop()
+ assert 0
+ except AttributeError:
+ pass
+
+ try:
+ db.popitem()
+ assert 0
+ except AttributeError:
+ pass
+
+ def test_certain_records(self):
+ """
+ Pulls first, last, middle and few other records out of database and
+ compares them to known quantities
+ """
+ testcases = {}
+ testcases['scaffold_93039'] = {
+ 'id': 0,
+ 'description': 'dna:scaffold scaffold:COMMON_SHREW1:scaffold_93'\
+ '039:1:203:1',
+ 'name': 'scaffold_93039',
+ 'sequence': 'GCTGAGCCTTGTAGTTCTGCTCCCTTTGACTGACGGCCCACTATGGACCG'\
+ 'GAAAAACTAC'}
+
+ testcases['scaffold_107701'] = {
+ 'id': 1,
+ 'description': 'dna:scaffold scaffold:COMMON_SHREW1:scaffold_10'\
+ '7701:1:203:1',
+ 'name' : 'scaffold_107701',
+ 'sequence': 'TAAACCCAAAATAAACATTCCCCAAATTATATTTCTTCCTTTCCTTCTGA'\
+ 'ATAAAAGAAA'}
+
+ testcases['GeneScaffold_6994'] = {
+ 'id': 243135,
+ 'description': 'dna:genescaffold genescaffold:COMMON_SHREW1:Gen'\
+ 'eScaffold_6994:1:2349312:1',
+ 'name': 'GeneScaffold_6994',
+ 'sequence': 'TATTGAGAGAAGTGGGAACTTCTCTAGTGGTGGGGTATGGTGATGGAATG'\
+ 'ATGTATGAAT'}
+
+ testcases['scaffold_118324'] = {
+ 'id': 13823,
+ 'description': 'dna:scaffold scaffold:COMMON_SHREW1:scaffold_11'\
+ '8324:1:884:1',
+ 'name': 'scaffold_118324',
+ 'sequence': 'CAGCCCCCTGCAACAAATTTTATACTCTAGAAACAGTTTAATGGCTGTTG'\
+ 'GAATATTTCC'}
+
+ testcases['scaffold_92895'] = {
+ 'id': 14573,
+ 'description': 'dna:scaffold scaffold:COMMON_SHREW1:scaffold_92'\
+ '895:1:890:1',
+ 'name': 'scaffold_92895',
+ 'sequence': 'GGGAAGCTTGCAAGGCTGTCCCATGTGGGCAGGAAGCTCTCAGTAGCTTG'\
+ 'CCAGTTTCTC'}
+
+ testcases['scaffold_62271'] = {
+ 'id': 37101,
+ 'description': 'dna:scaffold scaffold:COMMON_SHREW1:scaffold_62'\
+ '271:1:1064:1',
+ 'name': 'scaffold_62271',
+ 'sequence': 'AGAGTATCTCCCCCACATGGCAGAGCCTGGCAAGCTACCCATGGCGTATT'\
+ 'CAATATGCCA'}
+
+ for case in testcases:
+ assert testcases[case]['name'] == self.db[case]['name']
+ assert testcases[case]['description'] == \
+ self.db[case]['description']
+ assert str(self.db[case]['sequence']).startswith(testcases[case]\
+ ['sequence'])
diff --git a/doc/COPYRIGHT.txt b/doc/COPYRIGHT.txt
new file mode 100644
index 0000000..329971b
--- /dev/null
+++ b/doc/COPYRIGHT.txt
@@ -0,0 +1,2 @@
+screed is Copyright 2008-2010 Michigan State University
+screed is licensed under the BSD license
diff --git a/doc/LICENSE.txt b/doc/LICENSE.txt
new file mode 100644
index 0000000..9bc1947
--- /dev/null
+++ b/doc/LICENSE.txt
@@ -0,0 +1,10 @@
+Copyright (c) 2008-2010, Michigan State University
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+ * Neither the name of the author nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
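+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.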
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..fe3ec7b
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,153 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ -rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/screed.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/screed.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/screed"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/screed"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+info:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo "Running Texinfo files through makeinfo..."
+ make -C $(BUILDDIR)/texinfo info
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/doc/RELEASE-0.5.txt b/doc/RELEASE-0.5.txt
new file mode 100644
index 0000000..8d17e06
--- /dev/null
+++ b/doc/RELEASE-0.5.txt
@@ -0,0 +1,18 @@
+We are proud to announce the release of screed v0.5. screed is a database engine
+capable of storing and retrieving short-read sequence data. screed is designed
+to be fast and adaptable to different sequence file formats. This marks the
+first release of screed which we consider stable and complete.
+
+Features:
+ - Read sequence data from FASTA/FASTQ files into screed databases
+ - Save screed databases back to FASTA/FASTQ files
+ - Lookup sequence data by index (offset) or name
+ - Native support for sequence substring slicing
+ - Convert between FASTA <-> FASTQ file formats
+
+screed is written entirely in Python and uses the Sqlite database for backend
+storage. screed can be downloaded from the public git repository:
+http://github.com/acr/screed.git
+
+screed is licensed under the BSD license which can be viewed in the
+doc/LICENSE.txt file.
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..66a8290
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,242 @@
+# -*- coding: utf-8 -*-
+#
+# screed documentation build configuration file, created by
+# sphinx-quickstart on Wed Jun 6 16:32:37 2012.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration -----------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = []
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.txt'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'screed'
+copyright = u'2012, Alex Nolley and Titus Brown'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.7'
+# The full version, including alpha/beta/rc tags.
+release = '0.7.1'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'screeddoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+ ('index', 'screed.tex', u'screed Documentation',
+ u'Alex Nolley and Titus Brown', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ ('index', 'screed', u'screed Documentation',
+ [u'Alex Nolley and Titus Brown'], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ ('index', 'screed', u'screed Documentation',
+ u'Alex Nolley and Titus Brown', 'screed', 'One line description of project.',
+ 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
diff --git a/doc/example.txt b/doc/example.txt
new file mode 100644
index 0000000..50ddd20
--- /dev/null
+++ b/doc/example.txt
@@ -0,0 +1,34 @@
+===============
+screed examples
+===============
+
+.. contents::
+
+Basic Usage
+===========
+
+Load screed, index the database, and return a dictionary-like object:
+
+ >>> import screed
+ >>> db = screed.read_fasta_sequences('../screed/tests/test.fa')
+
+Get the list of sequence names, sort alphabetically, and look at the
+first one:
+
+ >>> names = db.keys()
+ >>> names.sort()
+ >>> names[0]
+ u'ENSMICT00000000730'
+
+Retrieve that record:
+
+ >>> r = db[names[0]]
+ >>> print r.keys()
+ [u'description', u'id', u'name', u'sequence']
+
+Print out the internal ID number and the name:
+
+ >>> print r.id
+ 13
+ >>> print r.name
+ ENSMICT00000000730
diff --git a/doc/index.txt b/doc/index.txt
new file mode 100644
index 0000000..bd3ad32
--- /dev/null
+++ b/doc/index.txt
@@ -0,0 +1,28 @@
+.. screed documentation master file, created by
+ sphinx-quickstart on Wed Jun 6 16:32:37 2012.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+screed - short read sequence utils
+==================================
+
+:Copyright: 2008-2012 Michigan State University
+:Authors: Alex Nolley, C. Titus Brown
+:Contact: ctb at msu.edu
+:License: BSD
+
+Contents:
+
+.. toctree::
+ :maxdepth: 2
+
+ screed
+ example
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/doc/run-doctests.py b/doc/run-doctests.py
new file mode 100755
index 0000000..015f6cf
--- /dev/null
+++ b/doc/run-doctests.py
@@ -0,0 +1,9 @@
+#! /usr/bin/env python
+import doctest
+import sys
+
+for filename in sys.argv[1:]:
+ print '... running doctests on', filename
+ doctest.testfile(filename)
+
+print '*** SUCCESS ***'
diff --git a/doc/schema.txt b/doc/schema.txt
new file mode 100644
index 0000000..e0529f0
--- /dev/null
+++ b/doc/schema.txt
@@ -0,0 +1,5 @@
+[ADMINISTRATION TABLE]
+FIELDNAME TEXT | FIELDTYPE TEXT
+
+[DICTIONARY TABLE]
+ID INTEGER PRIMARY KEY | KEY FOR NAME QUERY TEXT | OTHER FIELDS...
diff --git a/doc/screed.html b/doc/screed.html
new file mode 100644
index 0000000..caf69e0
--- /dev/null
+++ b/doc/screed.html
@@ -0,0 +1,743 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="Docutils 0.5: http://docutils.sourceforge.net/" />
+<title>screed - short read sequence database</title>
+<meta name="copyright" content="2008-2010 Michigan State University" />
+<meta name="authors" content="Alex Nolley C. Titus Brown" />
+<style type="text/css">
+
+/*
+:Author: David Goodger (goodger at python.org)
+:Id: $Id: html4css1.css 5196 2007-06-03 20:25:28Z wiemann $
+:Copyright: This stylesheet has been placed in the public domain.
+
+Default cascading style sheet for the HTML output of Docutils.
+
+See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to
+customize this style sheet.
+*/
+
+/* used to remove borders from tables and images */
+.borderless, table.borderless td, table.borderless th {
+ border: 0 }
+
+table.borderless td, table.borderless th {
+ /* Override padding for "table.docutils td" with "! important".
+ The right padding separates the table cells. */
+ padding: 0 0.5em 0 0 ! important }
+
+.first {
+ /* Override more specific margin styles with "! important". */
+ margin-top: 0 ! important }
+
+.last, .with-subtitle {
+ margin-bottom: 0 ! important }
+
+.hidden {
+ display: none }
+
+a.toc-backref {
+ text-decoration: none ;
+ color: black }
+
+blockquote.epigraph {
+ margin: 2em 5em ; }
+
+dl.docutils dd {
+ margin-bottom: 0.5em }
+
+/* Uncomment (and remove this text!) to get bold-faced definition list terms
+dl.docutils dt {
+ font-weight: bold }
+*/
+
+div.abstract {
+ margin: 2em 5em }
+
+div.abstract p.topic-title {
+ font-weight: bold ;
+ text-align: center }
+
+div.admonition, div.attention, div.caution, div.danger, div.error,
+div.hint, div.important, div.note, div.tip, div.warning {
+ margin: 2em ;
+ border: medium outset ;
+ padding: 1em }
+
+div.admonition p.admonition-title, div.hint p.admonition-title,
+div.important p.admonition-title, div.note p.admonition-title,
+div.tip p.admonition-title {
+ font-weight: bold ;
+ font-family: sans-serif }
+
+div.attention p.admonition-title, div.caution p.admonition-title,
+div.danger p.admonition-title, div.error p.admonition-title,
+div.warning p.admonition-title {
+ color: red ;
+ font-weight: bold ;
+ font-family: sans-serif }
+
+/* Uncomment (and remove this text!) to get reduced vertical space in
+ compound paragraphs.
+div.compound .compound-first, div.compound .compound-middle {
+ margin-bottom: 0.5em }
+
+div.compound .compound-last, div.compound .compound-middle {
+ margin-top: 0.5em }
+*/
+
+div.dedication {
+ margin: 2em 5em ;
+ text-align: center ;
+ font-style: italic }
+
+div.dedication p.topic-title {
+ font-weight: bold ;
+ font-style: normal }
+
+div.figure {
+ margin-left: 2em ;
+ margin-right: 2em }
+
+div.footer, div.header {
+ clear: both;
+ font-size: smaller }
+
+div.line-block {
+ display: block ;
+ margin-top: 1em ;
+ margin-bottom: 1em }
+
+div.line-block div.line-block {
+ margin-top: 0 ;
+ margin-bottom: 0 ;
+ margin-left: 1.5em }
+
+div.sidebar {
+ margin: 0 0 0.5em 1em ;
+ border: medium outset ;
+ padding: 1em ;
+ background-color: #ffffee ;
+ width: 40% ;
+ float: right ;
+ clear: right }
+
+div.sidebar p.rubric {
+ font-family: sans-serif ;
+ font-size: medium }
+
+div.system-messages {
+ margin: 5em }
+
+div.system-messages h1 {
+ color: red }
+
+div.system-message {
+ border: medium outset ;
+ padding: 1em }
+
+div.system-message p.system-message-title {
+ color: red ;
+ font-weight: bold }
+
+div.topic {
+ margin: 2em }
+
+h1.section-subtitle, h2.section-subtitle, h3.section-subtitle,
+h4.section-subtitle, h5.section-subtitle, h6.section-subtitle {
+ margin-top: 0.4em }
+
+h1.title {
+ text-align: center }
+
+h2.subtitle {
+ text-align: center }
+
+hr.docutils {
+ width: 75% }
+
+img.align-left {
+ clear: left }
+
+img.align-right {
+ clear: right }
+
+ol.simple, ul.simple {
+ margin-bottom: 1em }
+
+ol.arabic {
+ list-style: decimal }
+
+ol.loweralpha {
+ list-style: lower-alpha }
+
+ol.upperalpha {
+ list-style: upper-alpha }
+
+ol.lowerroman {
+ list-style: lower-roman }
+
+ol.upperroman {
+ list-style: upper-roman }
+
+p.attribution {
+ text-align: right ;
+ margin-left: 50% }
+
+p.caption {
+ font-style: italic }
+
+p.credits {
+ font-style: italic ;
+ font-size: smaller }
+
+p.label {
+ white-space: nowrap }
+
+p.rubric {
+ font-weight: bold ;
+ font-size: larger ;
+ color: maroon ;
+ text-align: center }
+
+p.sidebar-title {
+ font-family: sans-serif ;
+ font-weight: bold ;
+ font-size: larger }
+
+p.sidebar-subtitle {
+ font-family: sans-serif ;
+ font-weight: bold }
+
+p.topic-title {
+ font-weight: bold }
+
+pre.address {
+ margin-bottom: 0 ;
+ margin-top: 0 ;
+ font-family: serif ;
+ font-size: 100% }
+
+pre.literal-block, pre.doctest-block {
+ margin-left: 2em ;
+ margin-right: 2em }
+
+span.classifier {
+ font-family: sans-serif ;
+ font-style: oblique }
+
+span.classifier-delimiter {
+ font-family: sans-serif ;
+ font-weight: bold }
+
+span.interpreted {
+ font-family: sans-serif }
+
+span.option {
+ white-space: nowrap }
+
+span.pre {
+ white-space: pre }
+
+span.problematic {
+ color: red }
+
+span.section-subtitle {
+ /* font-size relative to parent (h1..h6 element) */
+ font-size: 80% }
+
+table.citation {
+ border-left: solid 1px gray;
+ margin-left: 1px }
+
+table.docinfo {
+ margin: 2em 4em }
+
+table.docutils {
+ margin-top: 0.5em ;
+ margin-bottom: 0.5em }
+
+table.footnote {
+ border-left: solid 1px black;
+ margin-left: 1px }
+
+table.docutils td, table.docutils th,
+table.docinfo td, table.docinfo th {
+ padding-left: 0.5em ;
+ padding-right: 0.5em ;
+ vertical-align: top }
+
+table.docutils th.field-name, table.docinfo th.docinfo-name {
+ font-weight: bold ;
+ text-align: left ;
+ white-space: nowrap ;
+ padding-left: 0 }
+
+h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
+h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
+ font-size: 100% }
+
+ul.auto-toc {
+ list-style-type: none }
+
+</style>
+</head>
+<body>
+<div class="document" id="screed-short-read-sequence-database">
+<h1 class="title">screed - short read sequence database</h1>
+<table class="docinfo" frame="void" rules="none">
+<col class="docinfo-name" />
+<col class="docinfo-content" />
+<tbody valign="top">
+<tr><th class="docinfo-name">Copyright:</th>
+<td>2008-2010 Michigan State University</td></tr>
+<tr><th class="docinfo-name">Authors:</th>
+<td>Alex Nolley
+<br />C. Titus Brown</td></tr>
+<tr><th class="docinfo-name">Contact:</th>
+<td><a class="first reference external" href="mailto:badmit@gmail.com">badmit@gmail.com</a>, <a class="last reference external" href="mailto:ctb@msu.edu">ctb@msu.edu</a></td></tr>
+<tr class="field"><th class="docinfo-name">License:</th><td class="field-body">BSD</td>
+</tr>
+</tbody>
+</table>
+<!-- contents: -->
+<div class="section" id="notes-on-this-document">
+<h1>Notes on this document</h1>
+<p>This is the default documentation for screed. Some doctests are included
+in the file 'example.txt'. The examples in this file are meant for humans
+only: they will not work in doctests.</p>
+</div>
+<div class="section" id="introduction">
+<h1>Introduction</h1>
+<p>screed is a database generation and querying package made to be used with gene
+sequences generated by Solexa machines, namely the FASTQ format, though FASTA
+is supported by default as well. Values
+such as sequence name, sequence description and the sequence itself can be
+retrieved from these databases.</p>
+</div>
+<div class="section" id="getting-going">
+<h1>Getting Going</h1>
+<p>The following software packages are required to run screed:</p>
+<ul class="simple">
+<li>Python 2.4 or newer</li>
+<li>nose (for testing)</li>
+</ul>
+<div class="section" id="downloading">
+<h2>Downloading</h2>
+<dl class="docutils">
+<dt>You will need git to download a copy from the public git repository:</dt>
+<dd>git clone git://github.com/acr/screed.git</dd>
+</dl>
+</div>
+<div class="section" id="installing">
+<h2>Installing</h2>
+<p>Assuming you have already downloaded the package, this is how to install:</p>
+<pre class="literal-block">
+$ python setup.py install
+</pre>
+<p>To run the optional tests type:</p>
+<pre class="literal-block">
+$ python -m screed.tests.__main__
+</pre>
+</div>
+</div>
+<div class="section" id="quick-start">
+<h1>Quick-Start</h1>
+<div class="section" id="creating-a-database-from-the-api">
+<h2>Creating a database from the API</h2>
+<p>From a Python prompt type:</p>
+<pre class="literal-block">
+>>> import screed
+>>> screed.read_fasta_sequences('screed/tests/test.fa')
+</pre>
+<p>That command just parsed the FASTA file 'screed/tests/test.fa' into a
+screed-database named 'screed/tests/test.fa_screed'. The screed database
+is independent from the text file it was derived from, so moving, renaming
+or deleting the 'screed/tests/test.fa' file will not affect
+screed's operation. To create a screed database from a FASTQ file the
+syntax is similar:</p>
+<pre class="literal-block">
+>>> screed.read_fastq_sequences('screed/tests/test.fastq')
+</pre>
+</div>
+<div class="section" id="creating-a-database-from-a-script">
+<h2>Creating a database from a script</h2>
+<p>To create a screed db from a FASTQ file at the shell:</p>
+<pre class="literal-block">
+$ ./fqdbm screed/tests/test.fastq
+</pre>
+<p>Similarly, to create a screed db from a fasta file:</p>
+<pre class="literal-block">
+$ ./fadbm screed/tests/test.fa
+</pre>
+<p>Alternately, if the screed module is on your Python path:</p>
+<pre class="literal-block">
+$ python -m screed.fadbm <fasta file>
+$ python -m screed.fqdbm <fastq file>
+</pre>
+<p>where <fast* file> is the path to a sequence file.</p>
+<p>screed natively supports FASTA and FASTQ database creation. If you have a new
+sequence you want screed to work with, see the section below on Writing
+Custom Sequence Parsers.</p>
+</div>
+</div>
+<div class="section" id="reading-databases">
+<h1>Reading databases</h1>
+<p>The class ScreedDB is used to read screed databases, regardless of what file
+format they were derived from (FASTA/FASTQ/hava/etc.). One reader to
+rule them all!</p>
+<div class="section" id="opening">
+<h2>Opening</h2>
+<p>In the Python environment, import the ScreedDB class and load
+some databases:</p>
+<pre class="literal-block">
+>>> from screed import ScreedDB
+>>> fadb = ScreedDB('screed/tests/test.fa')
+>>> fqdb = ScreedDB('screed/tests/test.fastq')
+</pre>
+<p>Notice how you didn't need to write the '_screed' at the end of the file names?
+screed automatically adds that to the file name if you didn't.</p>
+</div>
+<div class="section" id="dictionary-interface">
+<h2>Dictionary Interface</h2>
+<p>Since screed emulates a read-only dictionary interface, any methods that
+don't modify a dictionary are supported:</p>
+<pre class="literal-block">
+>>> fadb.keys()
+>>> fqdb.keys()
+</pre>
+<p>Each record in the database contains 'fields' such as name and sequence
+information. If the database was derived from a FASTQ file, accuracy and
+optional annotation strings are included. Conversely, FASTA-derived
+databases have a description field.</p>
+<p>To retrieve the names of records in the database:</p>
+<pre class="literal-block">
+>>> names = fadb.keys()
+</pre>
+<p>The lengths of the databases are easily found:</p>
+<pre class="literal-block">
+>>> print len(fadb)
+22
+>>> print len(fqdb)
+125
+</pre>
+</div>
+<div class="section" id="retrieving-records">
+<h2>Retrieving Records</h2>
+<p>A record is the standard container unit in screed. Each has 'fields' that
+vary slightly depending on what kind of file the database was derived from.
+For instance, a FASTQ-derived screed database has an id, a name,
+a quality score and a sequence. A FASTA-derived screed database has an
+id, name, description and a sequence.</p>
+<p>Retrieving whole records:</p>
+<pre class="literal-block">
+>>> records = []
+>>> for record in fadb.itervalues():
+...     records.append(record)
+</pre>
+<p>What is returned is a dictionary of fields. The names of fields
+are keys into this dictionary with the actual information as values.
+For example:</p>
+<pre class="literal-block">
+>>> record = fadb[fadb.keys()[0]]
+>>> index = record['id']
+>>> name = record['name']
+>>> description = record['description']
+>>> sequence = record['sequence']
+</pre>
+<p>What this does is retrieve the first record object in the screed database,
+then retrieve the index, name, description and sequence from the record
+object using standard dictionary key -> value pairs.</p>
+</div>
+<div class="section" id="retrieving-partial-sequences-slicing">
+<h2>Retrieving Partial Sequences (slicing)</h2>
+<p>screed supports the concept of retrieving a 'slice' or a subset of a
+sequence string. The motivation is speed: if you have a database
+entry with a very long sequence string but only want a small portion
+of the string, it is faster to retrieve only the portion than to
+retrieve the entire string and then perform standard Python string
+slicing.</p>
+<p>By default, screed's FASTA database creator sets up the 'sequence'
+column to support slicing. For example, if you have an entry with
+name 'someSeq' which has a 10K long sequence, and you want a
+slice of the sequence spanning positions 4000 to 4080:</p>
+<pre class="literal-block">
+>>> seq = db['someSeq'].sequence
+>>> slice = seq[4000:4080]
+</pre>
+<p>This is much faster than say:</p>
+<pre class="literal-block">
+>>> seq = str(db['someSeq'].sequence)
+>>> slice = seq[4000:4080]
+</pre>
+<p>Because deep down, less information is being read off the disk.
+The str() method above causes the entire sequence to be retrieved
+as a string. Then Python slicing is done on the string 'seq' and
+the subset stored in 'slice'.</p>
+</div>
+<div class="section" id="retrieving-via-index">
+<h2>Retrieving Via Index</h2>
+<p>Sometimes you don't care what the name of a sequence is; you're only
+interested in its position in the database. In these cases, retrieval via
+index is the method you'll want to use:</p>
+<pre class="literal-block">
+>>> record = fqdb.loadRecordByIndex(5)
+</pre>
+<p>An index is like an offset into the database. The order records were kept in
+the FASTA or FASTQ file determines the index in their resulting screed database.
+The first record in a sequence file will have an index of 0, the
+second, an index of 1 and so on.</p>
+</div>
+</div>
+<div class="section" id="writing-custom-sequence-parsers">
+<h1>Writing Custom Sequence Parsers</h1>
+<p>screed is built to be adaptable to new kinds of file sequence formats.
+Included with screed are parsers for handling FASTA and FASTQ sequence
+file types, though if you need screed to work with a new format,
+all you need to do is write a new parser.</p>
+<div class="section" id="field-roles">
+<h2>Field Roles</h2>
+<p>Each field in a screed database is assigned a role. These roles describe what
+kind of information is stored in their field. Right now there are only 4
+different roles in a screed database: the text role, the sliceable role,
+the indexed key role and the primary key role. All roles are defined in the
+file: screed/DBConstants.py</p>
+<p>The text role (DBConstants._STANDARD_TEXT) is the role most fields in a
+database will have. This role tells screed that the associated field is
+storing standard textual data. Nothing special.</p>
+<p>The sliceable role (DBConstants._SLICEABLE_TEXT) is a role that can be
+assigned to long sequence fields. screed's default FASTA parser defines
+the 'sequence' field with the sliceable role. When screed retrieves a field
+that has the sliceable role, it builds a special data structure that
+supports slicing into the text.</p>
+<p>The indexed key role (DBConstants._INDEXED_TEXT_KEY) is associated with
+exactly one of the fields in a screed database. In screed's FASTA and
+FASTQ parsers, this role is fulfilled by the 'name' field. This field
+is required because it is the field screed tells sqlite to index when
+creating the database and it is the field used for name look-ups when
+querying a screed database.</p>
+<p>The primary key role (DBConstants._PRIMARY_KEY_ROLE) is a role automatically
+associated with the 'id' field in each database. This field is always
+created with each screed database and always holds this role. You as a
+user of screed won't need to worry about this one.</p>
+</div>
+<div class="section" id="general-parsing-function-format">
+<h2>General Parsing Function Format</h2>
+<p>create_db is the function central to the creation of screed databases. This
+function accepts a file path, a tuple of field names and roles, and an
+iterator function. The file path describes where the screed database should
+go, the tuple contains the names of fields and their associated roles and
+the iterator function yields records in a dictionary format.</p>
+<p>This sub-section describes general steps for preparing and using screed with a
+custom sequence parser. Though they don't have to be, future sequence parsers
+should be located in the seqparse.py file for convenience.
+These steps will be described in the context of working from the Python shell.</p>
+<p>First import the create_db function:</p>
+<pre class="literal-block">
+>>> from screed import create_db
+</pre>
+<p>The create_db function handles the formatting of screed databases and
+provides a simple interface for storing sequence data.</p>
+<p>Next the database fields and roles must be specified. The fields tell
+screed the names and order of the data fields inside each record. For instance,
+let's say our new sequence has types 'name', 'bar', and 'baz', all text. The
+tuple will be:</p>
+<pre class="literal-block">
+>>> fields = (('name', DBConstants._INDEXED_TEXT_KEY),
+ ('bar', DBConstants._STANDARD_TEXT),
+ ('baz', DBConstants._STANDARD_TEXT))
+</pre>
+<p>Notice how 'name' is given the indexed key role and bar and baz are
+given text roles? If, for instance, you know 'baz' fields can be very long
+and you want to be able to retrieve slices of them, you could specify
+fields as:</p>
+<pre class="literal-block">
+>>> fields = (('name', DBConstants._INDEXED_TEXT_KEY),
+ ('bar', DBConstants._STANDARD_TEXT),
+ ('baz', DBConstants._SLICEABLE_TEXT))
+</pre>
+<p>All screed databases come with an 'id' field, which is a sequential
+numbering order starting at 0 for the first record, 1 for the second, and
+so on. The names and number of the other fields are arbitrary with one
+restriction: one and only one of the fields must fulfill the indexed key role.</p>
+<p>Next, you need to setup an iterator function that will return records in
+a dictionary format. Have a look at the 'fastq_iter', 'fasta_iter', or
+'hava_iter' functions in the screed/fastq.py, screed/fasta.py, and
+screed/hava.py files, respectively for examples on how to write one of these.
+If you don't know what an iterator function is, the documentation on the
+Python website gives a good description:
+<a class="reference external" href="http://docs.python.org/library/stdtypes.html#iterator-types">http://docs.python.org/library/stdtypes.html#iterator-types</a>.</p>
+<p>Once the iterator function is written, it needs to be instantiated. In the
+context of the built-in parsing functions, this means opening a file and
+passing the file handle to the iterator function:</p>
+<pre class="literal-block">
+>>> seqfile = open('path_to_seq_file', 'rb')
+>>> iter_instance = myiter(seqfile)
+</pre>
+<p>Assuming that your iterator function is called 'myiter', this sets up an
+instance of it ready to use with create_db.</p>
+<p>Now the screed database is created with one command:</p>
+<pre class="literal-block">
+>>> create_db('path_to_screed_db', fields, iter_instance)
+</pre>
+<p>Use this form if you want the screed database saved at 'path_to_screed_db'. If instead you
+want the screed database created in the same directory and with a
+similar file name as the sequence file, it's OK to do this:</p>
+<pre class="literal-block">
+>>> create_db('path_to_seq_file', fields, iter_instance)
+</pre>
+<p>create_db will just append '_screed' to the end of the file name and make
+a screed database at that file path so the original file won't be
+overwritten.</p>
+<dl class="docutils">
+<dt>When you're done, the sequence file should be closed:</dt>
+<dd><pre class="first last doctest-block">
+>>> seqfile.close()
+</pre>
+</dd>
+</dl>
+</div>
+<div class="section" id="using-the-built-in-sequence-iterator-functions">
+<h2>Using the Built-in Sequence Iterator Functions</h2>
+<p>This section shows how to use the 'fastq_iter' and 'fasta_iter' functions
+for returning records from a sequence file.</p>
+<p>These functions both take a file handle as the only argument and then return
+a dictionary for each record in the file containing names of fields and
+associated data. These functions are primarily used in conjunction with
+the create_db() function, but they can be useful by themselves.</p>
+<p>First, import the necessary module and open a text file containing sequences.
+For this example, the 'fastq_iter' function will be used:</p>
+<pre class="literal-block">
+>>> import screed.fastq
+>>> seqfile = open('path_to_seqfile', 'rb')
+</pre>
+<p>Now, the 'fastq_iter' can be instantiated and iterated over:</p>
+<pre class="literal-block">
+>>> fq_instance = screed.fastq.fastq_iter(seqfile)
+>>> for record in fq_instance:
+... print record.name
+</pre>
+<p>That will print the name of every sequence in the file. If instead you want
+to accumulate the sequences:</p>
+<pre class="literal-block">
+>>> sequences = []
+>>> for record in fq_instance:
+... sequences.append(record.sequence)
+</pre>
+<p>These iterators are the core of screed's sequence modularity. If there is
+a new sequence format you want screed to work with, all it needs is its
+own iterator.</p>
+</div>
+<div class="section" id="error-checking-in-parsing-methods">
+<h2>Error checking in parsing methods</h2>
+<p>The existing FASTA/FASTQ parsing functions contain some error
+checking, such as making sure the file can be opened and checking correct
+data is being read. Though screed doesn't enforce this, it is strongly
+recommended to include error checking code in your parser. To remain
+non-specific to one file sequence type or another, the underlying screed
+library can't contain error checking code of this kind. If errors are not
+detected by the parsing function, they will be silently included into the
+database being built and could cause problems much later when trying to
+read from the database.</p>
+</div>
+</div>
+<div class="section" id="file-formats-as-understood-by-screed">
+<h1>File formats as understood by screed</h1>
+<p>While the screed database remains non-specific to file formats, the included
+FASTA and FASTQ parsers expect specific formats. These parsers attempt to
+handle the most common attributes of sequence files, though they can not
+support all features.</p>
+<div class="section" id="fastq">
+<h2>FASTQ</h2>
+<p>The FASTQ parsing function is read_fastq_sequences() and is located in the
+screed module.</p>
+<p>The first line in a record must begin with '@' and is
+followed by a record identifier (a name). An optional annotations string
+may be included after a space on the same line.</p>
+<p>The second line begins the sequence line(s) which may be line wrapped.
+screed defines no limit on the length of sequence lines and no limit on
+how many sequence lines a record may contain.</p>
+<p>After the sequence line(s) comes a '+' character on a new line. Some
+FASTQ formats require the first line to be repeated after the '+'
+character, but since this adds no new information to the record,
+read_fastq_sequences() will ignore this if it is included.</p>
+<p>The accuracy line(s) is last. Like the sequence line(s) this may
+be line wrapped. read_fastq_sequences() will raise an exception if the
+accuracy and sequence strings are of unequal length. screed performs
+no checking for valid quality scores.</p>
+</div>
+<div class="section" id="fasta">
+<h2>FASTA</h2>
+<p>The FASTA parsing function is read_fasta_sequences() and is also located
+in the screed module.</p>
+<p>The first line in a record must begin with '>' and is followed with the
+sequence's name and an optional description. If the description is
+included, it is separated from the name with a space. Note that though
+the FASTA format doesn't require named records, screed does. Without a
+unique name, screed can't look up sequences by name.</p>
+<p>The second line begins the line(s) of sequence. Like the FASTQ parser,
+read_fasta_sequences() allows any number of lines of any length.</p>
+</div>
+</div>
+<div class="section" id="fasta-fastq-conversion">
+<h1>FASTA <-> FASTQ Conversion</h1>
+<p>As an extra nicety, screed can convert FASTA files to FASTQ and back again.</p>
+<div class="section" id="fasta-to-fastq">
+<h2>FASTA to FASTQ</h2>
+<p>The function used for this process is called 'ToFastq' and is located in the
+screed module. It takes the path to a screed database as the first argument
+and a path to the desired FASTQ file as the second argument. There is also
+a shell interface called ToFastq.py:</p>
+<pre class="literal-block">
+$ ./ToFastq.py <path to fasta db> <converted fastq file>
+</pre>
+<p>or:</p>
+<pre class="literal-block">
+$ python -m screed.ToFastq <path to fasta db> <converted fastq file>
+</pre>
+<p>if the screed module is in your PATH.</p>
+<p>The FASTA name attribute is directly dumped from the file. The sequence
+attribute is also dumped pretty much directly, but is line wrapped to 80
+characters if it is longer.</p>
+<p>Any description line in the FASTA database is stored as a FASTQ annotation
+string with no other interpretation done.</p>
+<p>Finally, as there is no accuracy or quality score in a FASTA file, a default
+one is generated. The generation of the accuracy follows the Sanger FASTQ
+conventions. The score is 1 (ASCII: '"') meaning a probability of about 75%
+that the read is incorrect (a 1 in 4 chance of being correct). This PHRED quality score is
+calculated from the Sanger format: Q = -10 * log10(p) where p is the probability
+of an incorrect read. Obviously this is a very rough way of providing a
+quality score and it is only intended to fill in the requirements of a FASTQ
+file. Any application needing a true measurement of the accuracy should
+not rely on this automatic conversion.</p>
+</div>
+<div class="section" id="fastq-to-fasta">
+<h2>FASTQ to FASTA</h2>
+<p>The function used for this process is called 'ToFasta' and is located in the
+screed module. It takes the path to a screed database as the first argument
+and a path to the desired FASTA file as the second argument. Like the ToFastq
+function before, there is a shell interface to ToFasta:</p>
+<pre class="literal-block">
+$ ./ToFasta.py <path to fastq db> <converted fasta file>
+</pre>
+<p>or:</p>
+<pre class="literal-block">
+$ python -m screed.ToFasta <path to fastq db> <converted fasta file>
+</pre>
+<p>if the screed module is in your PATH.</p>
+<p>As above, the name and sequence attributes are directly dumped from the FASTQ
+database to the FASTA file with the sequence line wrapping to 80 characters.</p>
+<p>If it exists, the FASTQ annotation tag is stored as the FASTA description tag.
+As there is no equivalent in FASTA, the FASTQ accuracy score is ignored.</p>
+<!-- Local Variables:
+mode: rst
+mode: outline-minor
+End: -->
+</div>
+</div>
+</div>
+</body>
+</html>
diff --git a/doc/screed.txt b/doc/screed.txt
new file mode 100755
index 0000000..223201d
--- /dev/null
+++ b/doc/screed.txt
@@ -0,0 +1,502 @@
+===================
+Basic Documentation
+===================
+
+.. contents::
+
+Notes on this document
+======================
+This is the default documentation for screed. Some doctests are included
+in the file 'example.txt'. The examples in this file are meant for humans
+only: they will not work in doctests.
+
+Introduction
+============
+
+screed parses FASTA and FASTQ files, generates databases, and lets you
+query these databases. Values such as sequence name, sequence
+description, sequence quality, and the sequence itself can be
+retrieved from these databases.
+
+Getting Going
+=============
+
+The following software packages are required to run screed:
+
+* Python 2.4 or newer
+* nose (for testing)
+
+Downloading
+-----------
+
+You will need git to download a copy from the public git repository::
+
+ git clone git://github.com/ged-lab/screed.git
+
+Installing
+----------
+
+Assuming you have already downloaded the package, this is how to install::
+
+ $ python setup.py install
+
+To run the optional tests type::
+
+ $ python -m screed.tests.__main__
+
+Quick-Start
+===========
+
+Reading FASTA/FASTQ files
+-------------------------
+
+At the Python prompt, type::
+
+ >>> import screed
+ >>> for read in screed.open(filename):
+ ... print read.name, read.sequence
+
+Here, 'filename' can be a FASTA or FASTQ file, and can be
+uncompressed, gzipped, or bz2-zipped.
+
+Creating a database from the API
+--------------------------------
+
+From a Python prompt type::
+
+ >>> import screed
+ >>> screed.read_fasta_sequences('screed/tests/test.fa')
+
+That command just parsed the FASTA file 'screed/tests/test.fa' into a
+screed-database named 'screed/tests/test.fa_screed'. The screed database
+is independent from the text file it was derived from, so moving, renaming
+or deleting the 'screed/tests/test.fa' file will not affect
+screed's operation. To create a screed database from a FASTQ file the
+syntax is similar::
+
+ >>> screed.read_fastq_sequences('screed/tests/test.fastq')
+
+Creating a database from a script
+---------------------------------
+
+To create a screed db from a FASTQ file at the shell::
+
+ $ ./fqdbm screed/tests/test.fastq
+
+Similarly, to create a screed db from a fasta file::
+
+ $ ./fadbm screed/tests/test.fa
+
+Alternately, if the screed module is in your PATH::
+
+ $ python -m screed.fadbm <fasta file>
+ $ python -m screed.fqdbm <fastq file>
+
+where <fast* file> is the path to a sequence file.
+
+screed natively supports FASTA and FASTQ database creation. If you
+have a new sequence format you want screed to work with, see the section
+below on Writing Custom Sequence Parsers.
+
+Reading databases
+=================
+
+The class ScreedDB is used to read screed databases, regardless of
+what file format they were derived from (FASTA/FASTQ/hava/etc.). One
+reader to rule them all!
+
+Opening
+-------
+
+In the Python environment, import the ScreedDB class and load some
+databases::
+
+ >>> from screed import ScreedDB
+ >>> fadb = ScreedDB('screed/tests/test.fa')
+ >>> fqdb = ScreedDB('screed/tests/test.fastq')
+
+Notice how you didn't need to write the '_screed' at the end of the
+file names? screed automatically adds that to the file name if you
+didn't.
+
+Dictionary Interface
+--------------------
+
+Since screed emulates a read-only dictionary interface, any methods
+that don't modify a dictionary are supported::
+
+ >>> fadb.keys()
+ >>> fqdb.keys()
+
+Each record in the database contains 'fields' such as name and
+sequence information. If the database was derived from a FASTQ file,
+accuracy and optional annotation strings are included. Conversely,
+FASTA-derived databases have a description field.
+
+To retrieve the names of records in the database::
+
+ >>> names = fadb.keys()
+
+The lengths of the databases are easily found::
+
+ >>> print len(fadb)
+ 22
+ >>> print len(fqdb)
+ 125
+
+Retrieving Records
+------------------
+
+A record is the standard container unit in screed. Each has 'fields'
+that vary slightly depending on what kind of file the database was
+derived from. For instance, a FASTQ-derived screed database has an
+id, a name, a quality score and a sequence. A FASTA-derived screed
+database has an id, name, description and a sequence.
+
+Retrieving whole records::
+
+ >>> records = []
+ >>> for record in fadb.itervalues():
+ ... records.append(record)
+
+What is returned is a dictionary of fields. The names of fields
+are keys into this dictionary with the actual information as values.
+For example::
+
+ >>> record = fadb[fadb.keys()[0]]
+ >>> index = record['id']
+ >>> name = record['name']
+ >>> description = record['description']
+ >>> sequence = record['sequence']
+
+What this does is retrieve the first record object in the screed database,
+then retrieve the index, name, description and sequence from the record
+object using standard dictionary key -> value pairs.
+
+Retrieving Partial Sequences (slicing)
+--------------------------------------
+
+screed supports the concept of retrieving a 'slice' or a subset of a
+sequence string. The motivation is speed: if you have a database entry
+with a very long sequence string but only want a small portion of the
+string, it is faster to retrieve only the portion than to retrieve the
+entire string and then perform standard Python string slicing.
+
+By default, screed's FASTA database creator sets up the 'sequence'
+column to support slicing. For example, if you have an entry with name
+'someSeq' which has a 10K long sequence, and you want a slice of the
+sequence spanning positions 4000 to 4080::
+
+ >>> seq = db['someSeq'].sequence
+ >>> slice = seq[4000:4080]
+
+This is much faster than say::
+
+ >>> seq = str(db['someSeq'].sequence)
+ >>> slice = seq[4000:4080]
+
+Because deep down, less information is being read off the disk. The
+str() method above causes the entire sequence to be retrieved as a
+string. Then Python slicing is done on the string 'seq' and the subset
+stored in 'slice'.
+
+Retrieving Via Index
+--------------------
+
+Sometimes you don't care what the name of a sequence is; you're only
+interested in its position in the database. In these cases, retrieval
+via index is the method you'll want to use::
+
+ >>> record = fqdb.loadRecordByIndex(5)
+
+An index is like an offset into the database. The order records were
+kept in the FASTA or FASTQ file determines the index in their
+resulting screed database. The first record in a sequence file will
+have an index of 0, the second, an index of 1 and so on.
+
+Writing Custom Sequence Parsers
+===============================
+
+screed is built to be adaptable to new kinds of file sequence formats.
+Included with screed are parsers for handling FASTA and FASTQ sequence
+file types, though if you need screed to work with a new format, all
+you need to do is write a new parser.
+
+Field Roles
+-----------
+
+Each field in a screed database is assigned a role. These roles
+describe what kind of information is stored in their field. Right now
+there are only 4 different roles in a screed database: the text role,
+the sliceable role, the indexed key role and the primary key role. All
+roles are defined in the file: screed/DBConstants.py
+
+The text role (DBConstants._STANDARD_TEXT) is the role most fields in
+a database will have. This role tells screed that the associated field
+is storing standard textual data. Nothing special.
+
+The sliceable role (DBConstants._SLICEABLE_TEXT) is a role that can be
+assigned to long sequence fields. screed's default FASTA parser
+defines the 'sequence' field with the sliceable role. When screed
+retrieves a field that has the sliceable role, it builds a special
+data structure that supports slicing into the text.
+
+The indexed key role (DBConstants._INDEXED_TEXT_KEY) is associated
+with exactly one of the fields in a screed database. In screed's FASTA
+and FASTQ parsers, this role is fulfilled by the 'name' field. This
+field is required because it is the field screed tells sqlite to index
+when creating the database and it is the field used for name look-ups
+when querying a screed database.
+
+The primary key role (DBConstants._PRIMARY_KEY_ROLE) is a role
+automatically associated with the 'id' field in each database. This
+field is always created with each screed database and always holds
+this role. You as a user of screed won't need to worry about this one.
+
+General Parsing Function Format
+-------------------------------
+
+create_db is the function central to the creation of screed
+databases. This function accepts a file path, a tuple of field names
+and roles, and an iterator function. The file path describes where the
+screed database should go, the tuple contains the names of fields and
+their associated roles and the iterator function yields records in a
+dictionary format.
+
+This sub-section describes general steps for preparing and using
+screed with a custom sequence parser. Though they don't have to be,
+future sequence parsers should be located in the seqparse.py file for
+convenience. These steps will be described in the context of working
+from the Python shell.
+
+First import the create_db function::
+
+ >>> from screed import create_db
+
+The create_db function handles the formatting of screed databases and
+provides a simple interface for storing sequence data.
+
+Next the database fields and roles must be specified. The fields tell
+screed the names and order of the data fields inside each record. For instance,
+let's say our new sequence has types 'name', 'bar', and 'baz', all text. The
+tuple will be::
+
+ >>> fields = (('name', DBConstants._INDEXED_TEXT_KEY),
+ ('bar', DBConstants._STANDARD_TEXT),
+ ('baz', DBConstants._STANDARD_TEXT))
+
+Notice how 'name' is given the indexed key role and bar and baz are
+given text roles? If, for instance, you know 'baz' fields can be very long
+and you want to be able to retrieve slices of them, you could specify
+fields as::
+
+ >>> fields = (('name', DBConstants._INDEXED_TEXT_KEY),
+ ('bar', DBConstants._STANDARD_TEXT),
+ ('baz', DBConstants._SLICEABLE_TEXT))
+
+All screed databases come with an 'id' field, which is a sequential
+numbering order starting at 0 for the first record, 1 for the second, and
+so on. The names and number of the other fields are arbitrary with one
+restriction: one and only one of the fields must fulfill the indexed key role.
+
+Next, you need to setup an iterator function that will return records in
+a dictionary format. Have a look at the 'fastq_iter', 'fasta_iter', or
+'hava_iter' functions in the screed/fastq.py, screed/fasta.py, and
+screed/hava.py files, respectively for examples on how to write one of these.
+If you don't know what an iterator function is, the documentation on the
+Python website gives a good description:
+http://docs.python.org/library/stdtypes.html#iterator-types.
+
+Once the iterator function is written, it needs to be instantiated. In the
+context of the built-in parsing functions, this means opening a file and
+passing the file handle to the iterator function::
+
+ >>> seqfile = open('path_to_seq_file', 'rb')
+ >>> iter_instance = myiter(seqfile)
+
+Assuming that your iterator function is called 'myiter', this sets up an
+instance of it ready to use with create_db.
+
+Now the screed database is created with one command::
+
+ >>> create_db('path_to_screed_db', fields, iter_instance)
+
+This saves the screed database at 'path_to_screed_db'. If instead you
+want the screed database created in the same directory and with a
+similar file name as the sequence file, it's OK to do this::
+
+ >>> create_db('path_to_seq_file', fields, iter_instance)
+
+create_db will just append '_screed' to the end of the file name and make
+a screed database at that file path so the original file won't be
+overwritten.
+
+When you're done the sequence file should be closed::
+
+ >>> seqfile.close()
+
+Using the Built-in Sequence Iterator Functions
+----------------------------------------------
+
+This section shows how to use the 'fastq_iter' and 'fasta_iter' functions
+for returning records from a sequence file.
+
+These functions both take a file handle as the only argument and then return
+a dictionary for each record in the file containing names of fields and
+associated data. These functions are primarily used in conjunction with
+the create_db() function, but they can be useful by themselves.
+
+First, import the necessary module and open a text file containing sequences.
+For this example, the 'fastq_iter' function will be used::
+
+ >>> import screed.fastq
+ >>> seqfile = open('path_to_seqfile', 'rb')
+
+Now, the 'fastq_iter' can be instantiated and iterated over::
+
+ >>> fq_instance = screed.fastq.fastq_iter(seqfile)
+ >>> for record in fq_instance:
+ ... print record.name
+
+That will print the name of every sequence in the file. If instead you want
+to accumulate the sequences::
+
+ >>> sequences = []
+ >>> for record in fq_instance:
+ ... sequences.append(record.sequence)
+
+These iterators are the core of screed's sequence modularity. If there is
+a new sequence format you want screed to work with, all it needs is its
+own iterator.
+
+Error checking in parsing methods
+---------------------------------
+
+The existing FASTA/FASTQ parsing functions contain some error
+checking, such as making sure the file can be opened and checking
+correct data is being read. Though screed doesn't enforce this, it is
+strongly recommended to include error checking code in your parser. To
+remain non-specific to one file sequence type or another, the
+underlying screed library can't contain error checking code of this
+kind. If errors are not detected by the parsing function, they will be
+silently included into the database being built and could cause
+problems much later when trying to read from the database.
+
+File formats as understood by screed
+====================================
+
+While the screed database remains non-specific to file formats, the
+included FASTA and FASTQ parsers expect specific formats. These
+parsers attempt to handle the most common attributes of sequence
+files, though they can not support all features.
+
+FASTQ
+-----
+
+The FASTQ parsing function is read_fastq_sequences() and is located in
+the screed module.
+
+The first line in a record must begin with '@' and is followed by a
+record identifier (a name). An optional annotations string may be
+included after a space on the same line.
+
+The second line begins the sequence line(s) which may be line wrapped.
+screed defines no limit on the length of sequence lines and no limit
+on how many sequence lines a record may contain.
+
+After the sequence line(s) comes a '+' character on a new line. Some
+FASTQ formats require the first line to be repeated after the '+'
+character, but since this adds no new information to the record,
+read_fastq_sequences() will ignore this if it is included.
+
+The accuracy line(s) is last. Like the sequence line(s) this may
+be line wrapped. read_fastq_sequences() will raise an exception if the
+accuracy and sequence strings are of unequal length. screed performs
+no checking for valid quality scores.
+
+FASTA
+-----
+
+The FASTA parsing function is read_fasta_sequences() and is also
+located in the screed module.
+
+The first line in a record must begin with '>' and is followed with
+the sequence's name and an optional description. If the description is
+included, it is separated from the name with a space. Note that though
+the FASTA format doesn't require named records, screed does. Without a
+unique name, screed can't look up sequences by name.
+
+The second line begins the line(s) of sequence. Like the FASTQ parser,
+read_fasta_sequences() allows any number of lines of any length.
+
+FASTA <-> FASTQ Conversion
+==========================
+
+@CTB this doesn't work?
+
+As an extra nicety, screed can convert FASTA files to FASTQ and back again.
+
+FASTA to FASTQ
+--------------
+
+The function used for this process is called 'ToFastq' and is located
+in the screed module. It takes the path to a screed database as the
+first argument and a path to the desired FASTQ file as the second
+argument. There is also a shell interface called ToFastq.py::
+
+ $ ./ToFastq.py <path to fasta db> <converted fastq file>
+
+or::
+
+ $ python -m screed.ToFastq <path to fasta db> <converted fastq file>
+
+if the screed module is in your PATH.
+
+The FASTA name attribute is directly dumped from the file. The
+sequence attribute is also dumped pretty much directly, but is line
+wrapped to 80 characters if it is longer.
+
+Any description line in the FASTA database is stored as a FASTQ annotation
+string with no other interpretation done.
+
+Finally, as there is no accuracy or quality score in a FASTA file, a
+default one is generated. The generation of the accuracy follows the
+Sanger FASTQ conventions. The score is 1 (ASCII: '"') meaning a
+probability of about 75% that the read is incorrect (a 1 in 4
+chance of being correct). This PHRED quality score is calculated from the Sanger
+format: Q = -10 * log10(p) where p is the probability of an incorrect
+read. Obviously this is a very rough way of providing a quality score
+and it is only intended to fill in the requirements of a FASTQ
+file. Any application needing a true measurement of the accuracy
+should not rely on this automatic conversion.
+
+FASTQ to FASTA
+--------------
+
+The function used for this process is called 'ToFasta' and is located
+in the screed module. It takes the path to a screed database as the
+first argument and a path to the desired FASTA file as the second
+argument. Like the ToFastq function before, there is a shell interface
+to ToFasta::
+
+ $ ./ToFasta.py <path to fastq db> <converted fasta file>
+
+or::
+
+ $ python -m screed.ToFasta <path to fastq db> <converted fasta file>
+
+if the screed module is in your PATH.
+
+As above, the name and sequence attributes are directly dumped from
+the FASTQ database to the FASTA file with the sequence line wrapping
+to 80 characters.
+
+If it exists, the FASTQ annotation tag is stored as the FASTA description tag.
+As there is no equivalent in FASTA, the FASTQ accuracy score is ignored.
+
+..
+ Local Variables:
+ mode: rst
+ mode: outline-minor
+ End:
+
diff --git a/screed/DBConstants.py b/screed/DBConstants.py
new file mode 100644
index 0000000..8de0a1f
--- /dev/null
+++ b/screed/DBConstants.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2008-2010, Michigan State University
+
+"""
+Defines some constant string identifiers used in multiple
+files throughout screed
+"""
+
+# Name of table holding information about rest of db
+_SCREEDADMIN = 'SCREEDADMIN'
+
+# Names of _SCREEDADMIN columns
+_FIELDNAME = 'FIELDNAME'
+_ROLENAME = 'ROLE'
+_PRIMARY_KEY = 'id'
+
+# Names of roles
+_STANDARD_TEXT = 'STANDARDATTR'
+_SLICEABLE_TEXT = 'SLICEABLEATTR'
+_INDEXED_TEXT_KEY = 'TEXTKEYATTR'
+_PRIMARY_KEY_ROLE = 'INTKEYATTR'
+
+# Name of table holding sequence information
+_DICT_TABLE = 'DICTIONARY_TABLE'
+
+# The file extension given to all screed databases
+fileExtension = '_screed'
diff --git a/screed/__init__.py b/screed/__init__.py
new file mode 100755
index 0000000..2c4c151
--- /dev/null
+++ b/screed/__init__.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2008-2010, Michigan State University
+
+"""
+screed is a database tool useful for retrieving arbitrary kinds of sequence
+data through an on-disk database that emulates a read-only Python dictionary.
+
+For opening a screed database, the 'ScreedDB' class is used. This class
+accepts a string file path to a pre-created screed database. Read-only
+dictionary methods are implemented here.
+
+For creating a screed database, the 'create_db' function is used. This
+function accepts an iterator as an argument which will yield records
+from its respective sequence file. create_db will sequentially pull
+records from the iterator, writing them to disk in a screed database
+until the iterator is done.
+
+Automatic ways for parsing FASTA and FASTQ files are accessed through
+the read_fast*_sequences functions. These parse the given sequence
+file into a screed database.
+
+Conversion between sequence file types is provided in the ToFastq and
+ToFasta functions
+"""
+from openscreed import ScreedDB, open_writer
+from openscreed import open_reader as open
+from conversion import ToFastq
+from conversion import ToFasta
+from createscreed import create_db
+from seqparse import read_fastq_sequences
+from seqparse import read_fasta_sequences
+from dna import rc
+
+__version__ = '0.7'
diff --git a/screed/conversion.py b/screed/conversion.py
new file mode 100644
index 0000000..9e497ff
--- /dev/null
+++ b/screed/conversion.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2008-2010, Michigan State University
+
+from openscreed import ScreedDB
+
+_MAXLINELEN = 80
+_null_accuracy = '\"' # ASCII 34, e.g 75% chance of incorrect read
+
+def GetComments(value):
+ """
+ Returns description or annotations attributes from given
+ dictionary object
+ """
+ if 'description' in value:
+ return value['description']
+ elif 'annotations' in value:
+ return value['annotations']
+ else:
+ return ''
+
+def linewrap(longString):
+ """
+ Given a long string of characters, inserts newline characters
+ every _MAXLINELEN characters
+ """
+ res = []
+ begin = 0
+ while begin < len(longString):
+ res.append(longString[begin:begin+_MAXLINELEN])
+ begin += _MAXLINELEN
+
+ return '\n'.join(res)
+
+def GenerateAccuracy(value):
+ """
+ Returns accuracy from value if it exists. Otherwise, makes
+ a null accuracy. Accuracy is line wrapped to _MAXLINELEN
+ either way
+ """
+ if 'accuracy' in value:
+ return linewrap(value['accuracy'])
+
+ return linewrap(_null_accuracy * len(str(value['sequence'])))
+
+def ToFastq(dbFile, outputFile):
+ """
+ Opens the screed database file and attempts to dump it
+ to a FASTQ-formatted text file
+ """
+ outFile = open(outputFile, 'wb')
+ db = ScreedDB(dbFile)
+
+ for value in db.itervalues():
+ outFile.write('@%s %s\n%s\n+\n%s\n' % (value['name'],
+ GetComments(value),
+ linewrap(str(value['sequence'])),
+ GenerateAccuracy(value)))
+ db.close()
+ outFile.close()
+
+def ToFasta(dbFile, outputFile):
+ """
+ Opens the screed database file and attempts to dump it
+ to a FASTA-formatted text file
+ """
+ outFile = open(outputFile, 'wb')
+ db = ScreedDB(dbFile)
+
+ for value in db.itervalues():
+ outFile.write('>%s %s\n%s\n' % (value['name'], GetComments(value),
+ linewrap(str(value['sequence']))))
+
+ db.close()
+ outFile.close()
diff --git a/screed/createscreed.py b/screed/createscreed.py
new file mode 100644
index 0000000..a15245c
--- /dev/null
+++ b/screed/createscreed.py
@@ -0,0 +1,80 @@
+import DBConstants
+import os
+import sqlite3
+import itertools
+
+def create_db(filepath, fields, rcrditer):
+ """
+ Creates a screed database in the given filepath. Fields is a tuple
+ specifying the names and relative order of attributes in a
+ record. rcrditer is an iterator returning records over a
+ sequence dataset. Records yielded are in dictionary form
+ """
+ if not filepath.endswith(DBConstants.fileExtension):
+ filepath += DBConstants.fileExtension
+
+ if os.path.exists(filepath): # Remove existing files
+ os.unlink(filepath)
+
+ con = sqlite3.connect(filepath)
+ cur = con.cursor()
+
+ # Sqlite PRAGMA settings for speed
+ cur.execute("PRAGMA synchronous='OFF'")
+ cur.execute("PRAGMA locking_mode=EXCLUSIVE")
+
+ # Create the admin table
+ cur.execute('CREATE TABLE %s (%s INTEGER PRIMARY KEY, '\
+ '%s TEXT, %s TEXT)' % (DBConstants._SCREEDADMIN,
+ DBConstants._PRIMARY_KEY,
+ DBConstants._FIELDNAME,
+ DBConstants._ROLENAME))
+ query = 'INSERT INTO %s (%s, %s) VALUES (?, ?)' % \
+ (DBConstants._SCREEDADMIN, DBConstants._FIELDNAME,
+ DBConstants._ROLENAME)
+
+ # Put the primary key in as an attribute
+ cur.execute(query, (DBConstants._PRIMARY_KEY,
+ DBConstants._PRIMARY_KEY_ROLE))
+ for attribute, role in fields:
+ cur.execute(query, (attribute, role))
+
+ # Setup the dictionary table creation field substring
+ fieldsub = ','.join(['%s TEXT' % field for field, role in fields])
+
+ # Create the dictionary table
+ cur.execute('CREATE TABLE %s (%s INTEGER PRIMARY KEY, %s)' %
+ (DBConstants._DICT_TABLE, DBConstants._PRIMARY_KEY,
+ fieldsub))
+
+ # Setup the 'qmarks' sqlite substring
+ qmarks = ','.join(['?' for i in range(len(fields))])
+
+ # Setup the sql substring for inserting fields into database
+ fieldsub = ','.join([fieldname for fieldname, role in fields])
+
+ query = 'INSERT INTO %s (%s) VALUES (%s)' %\
+ (DBConstants._DICT_TABLE, fieldsub, qmarks)
+ # Pull data from the iterator and store in database
+ # Committing in batches seems faster than a single call to executemany
+ data = (tuple(record[fieldname] for fieldname, role in fields) \
+ for record in rcrditer)
+ while True:
+ batch = list(itertools.islice(data, 10000))
+ if not batch: break
+ cur.executemany(query, batch)
+ con.commit()
+
+ # Attribute to index
+ queryby = fields[0][0] # Defaults to the first field
+ for fieldname, role in fields:
+ if role == DBConstants._INDEXED_TEXT_KEY:
+ queryby = fieldname
+ break
+
+ # Make the index on the 'queryby' attribute
+ cur.execute('CREATE UNIQUE INDEX %sidx ON %s(%s)' %
+ (queryby, DBConstants._DICT_TABLE, queryby))
+
+ con.commit()
+ con.close()
diff --git a/screed/dna.py b/screed/dna.py
new file mode 100644
index 0000000..229cc89
--- /dev/null
+++ b/screed/dna.py
@@ -0,0 +1,48 @@
+import string, array
+
+legal_dna = "ACGTN"
+
+def is_DNA(seq):
+    """
+    Check whether 'seq' consists solely of legal DNA characters.
+
+    Returns 1 when every element of 'seq' is found in 'legal_dna' (so an
+    empty sequence yields 1) and 0 as soon as one element is not.
+
+    c.f. http://www.ncbi.nlm.nih.gov/BLAST/fasta.html
+    """
+    if all(base in legal_dna for base in seq):
+        return 1
+    return 0
+
+def reverse_complement(s):
+    """
+    Return the reverse complement of the DNA string 's'.
+
+    's' is upper-cased first; an AssertionError is raised when the
+    upper-cased string is rejected by is_DNA().
+    """
+    s = string.upper(s)
+    assert is_DNA(s), "Your sequence must be DNA!"
+
+    # Reverse first, then complement the reversed string.
+    return complement(reverse(s))
+
+rc = reverse_complement  # short alias for 'reverse_complement'
+
+# Translation table swapping A<->T and C<->G (upper case only).
+__complementTranslation = string.maketrans('ACTG', 'TGAC')
+
+def complement(s):
+    """
+    Return 's' with each of A, C, T, G replaced by its complementary base.
+
+    Characters outside 'ACTG' are passed through unchanged.
+    """
+    return string.translate(s, __complementTranslation)
+
+def reverse(s):
+    """
+    Return the characters of 's' in reverse order, as a string.
+    """
+    chars = array.array('c', s)
+    chars.reverse()
+    return ''.join(chars)
diff --git a/screed/dump_to_fasta.py b/screed/dump_to_fasta.py
new file mode 100644
index 0000000..130399d
--- /dev/null
+++ b/screed/dump_to_fasta.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2008-2010, Michigan State University
+
+from screed import ToFasta
+import sys, os
+
+# Shell interface to the ToFasta screed conversion function
+if __name__ == '__main__':
+    # Exactly two arguments are required: the screed database to read and
+    # the file to write.
+    if len(sys.argv) != 3:
+        sys.stderr.write("Usage: %s <dbfilename> <outputfilename>\n"
+                         % sys.argv[0])
+        # sys.exit() rather than the site-provided exit(), which is meant
+        # for interactive sessions and is absent under 'python -S'.
+        sys.exit(1)
+
+    dbFile = sys.argv[1]
+    outputFile = sys.argv[2]
+
+    if not os.path.isfile(dbFile):
+        sys.stderr.write("No such file: %s\n" % dbFile)
+        sys.exit(1)
+    # Remove a pre-existing output file before the conversion runs.
+    if os.path.isfile(outputFile):
+        os.unlink(outputFile)
+
+    ToFasta(dbFile, outputFile)
diff --git a/screed/dump_to_fastq.py b/screed/dump_to_fastq.py
new file mode 100644
index 0000000..be86015
--- /dev/null
+++ b/screed/dump_to_fastq.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2008-2010, Michigan State University
+
+from screed import ToFastq
+import sys, os
+
+# Shell interface to the ToFastq screed conversion function
+if __name__ == '__main__':
+    # Exactly two arguments are required: the screed database to read and
+    # the file to write.
+    if len(sys.argv) != 3:
+        sys.stderr.write("Usage: %s <dbfilename> <outputfilename>\n"
+                         % sys.argv[0])
+        # sys.exit() rather than the site-provided exit(), which is meant
+        # for interactive sessions and is absent under 'python -S'.
+        sys.exit(1)
+
+    dbFile = sys.argv[1]
+    outputFile = sys.argv[2]
+
+    if not os.path.isfile(dbFile):
+        sys.stderr.write("No such file: %s\n" % dbFile)
+        sys.exit(1)
+    # Remove a pre-existing output file before the conversion runs.
+    if os.path.isfile(outputFile):
+        os.unlink(outputFile)
+
+    ToFastq(dbFile, outputFile)
diff --git a/screed/fadbm.py b/screed/fadbm.py
new file mode 100755
index 0000000..81a5c1b
--- /dev/null
+++ b/screed/fadbm.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2008-2010, Michigan State University
+
+import sys
+from __init__ import read_fasta_sequences
+import DBConstants
+
+# A shell interface to the screed FADBM database writing function
+if __name__ == "__main__":
+    # Make sure the user entered the command line arguments correctly
+    if len(sys.argv) != 2:
+        sys.stderr.write("ERROR: USAGE IS: %s <dbfilename>\n" % sys.argv[0])
+        # sys.exit() rather than the site-provided exit(), which is meant
+        # for interactive sessions and is absent under 'python -S'.
+        sys.exit(1)
+
+    filename = sys.argv[1]
+    read_fasta_sequences(filename)
+
+    print "Database saved in %s%s" % (filename, DBConstants.fileExtension)
+
diff --git a/screed/fasta.py b/screed/fasta.py
new file mode 100644
index 0000000..fa44502
--- /dev/null
+++ b/screed/fasta.py
@@ -0,0 +1,50 @@
+import DBConstants
+from screedRecord import _screed_record_dict, _Writer
+
+FieldTypes = (('name', DBConstants._INDEXED_TEXT_KEY),
+              ('description', DBConstants._STANDARD_TEXT),
+              ('sequence', DBConstants._SLICEABLE_TEXT))
+
+def fasta_iter(handle, parse_description=True, line=None):
+    """
+    Generate one record per FASTA entry read from 'handle', a file handle
+    opened for reading.
+
+    Each record carries 'name', 'description' and 'sequence'. With
+    parse_description true the header is split at its first space into
+    name and description; otherwise the whole header becomes the name.
+    'line' may hold a header line that was already read from 'handle'.
+
+    Raises IOError when a record does not begin with '>'.
+    """
+    if line is None:
+        line = handle.readline()
+
+    while line:
+        header = line.strip()
+        if not header.startswith('>'):
+            raise IOError("Bad FASTA format: no '>' at beginning of line")
+
+        record = _screed_record_dict()
+        if parse_description:
+            # Everything after the first space is the optional description
+            name, _, description = header[1:].partition(' ')
+        else:
+            name, description = header[1:], ''
+        record['name'] = name.strip()
+        record['description'] = description.strip()
+
+        # Gather sequence lines up to the next header or end of file
+        chunks = []
+        line = handle.readline()
+        while line and not line.startswith('>'):
+            chunks.append(line.strip())
+            line = handle.readline()
+
+        record['sequence'] = ''.join(chunks)
+        yield record
+
+class FASTA_Writer(_Writer):
+    """Writer that emits records to self.fp in FASTA format."""
+    def write(self, record):
+        """Write 'record' as a '>name description' line plus a sequence line."""
+        header = ">%s %s\n" % (record.name, record.description)
+        body = "%s\n" % (record.sequence,)
+        self.fp.write(header + body)
diff --git a/screed/fastq.py b/screed/fastq.py
new file mode 100644
index 0000000..7d37bbf
--- /dev/null
+++ b/screed/fastq.py
@@ -0,0 +1,64 @@
+import DBConstants
+from screedRecord import _screed_record_dict, _Writer
+FieldTypes = (('name', DBConstants._INDEXED_TEXT_KEY),
+              ('annotations', DBConstants._STANDARD_TEXT),
+              ('sequence', DBConstants._STANDARD_TEXT),
+              ('accuracy', DBConstants._STANDARD_TEXT))
+
+def fastq_iter(handle, line=None, parse_description=True):
+    """
+    Iterator over the given FASTQ file handle returning records. handle
+    is a handle to a file opened for reading
+
+    Each record carries 'name', 'annotations', 'sequence' and 'accuracy'.
+    With parse_description true the header is split at its first space
+    into name and annotations; otherwise the whole header is the name.
+    'line' may hold a header line that was already read from 'handle'.
+
+    Raises IOError when a record does not start with '@', when the file
+    ends before the separator line, or when the sequence and accuracy
+    strings differ in length.
+    """
+    if line is None:
+        line = handle.readline()
+    line = line.strip()
+    while line:
+        data = _screed_record_dict()
+
+        if not line.startswith('@'):
+            raise IOError("Bad FASTQ format: no '@' at beginning of line")
+
+        # Try to grab the name and (optional) annotations
+        if parse_description:
+            try:
+                data['name'], data['annotations'] = line[1:].split(' ', 1)
+            except ValueError:  # No optional annotations
+                data['name'] = line[1:]
+                data['annotations'] = ''
+        else:
+            data['name'] = line[1:]
+            data['annotations'] = ''
+
+        # Extract the sequence lines, up to the '+' (or '#') separator.
+        # readline() returns '' forever at end of file, so a truncated
+        # record must be detected explicitly or this loop never ends.
+        sequence = []
+        line = handle.readline()
+        while line:
+            line = line.strip()
+            if line.startswith('+') or line.startswith('#'):
+                break
+            sequence.append(line)
+            line = handle.readline()
+        else:
+            raise IOError("Bad FASTQ format: file ended before the '+' "
+                          "separator line")
+
+        data['sequence'] = ''.join(sequence)
+
+        # Extract the accuracy lines; stop once as many characters as the
+        # sequence has been read (or a blank line / end of file is hit)
+        accuracy = []
+        line = handle.readline().strip()
+        seqlen = len(data['sequence'])
+        aclen = 0
+        while not line == '' and aclen < seqlen:
+            accuracy.append(line)
+            aclen += len(line)
+            line = handle.readline().strip()
+
+        data['accuracy'] = ''.join(accuracy)
+        if len(data['sequence']) != len(data['accuracy']):
+            raise IOError('sequence and accuracy strings must be '
+                          'of equal length')
+
+        yield data
+
+class FASTQ_Writer(_Writer):
+    """Writer that emits records to self.fp in FASTQ format."""
+    def write(self, record):
+        """
+        Write 'record' as four lines: '@name annotations', the sequence,
+        a '+' separator and the accuracy string.
+        """
+        # Records produced by fastq_iter store the header remainder under
+        # 'annotations', not 'description'; accept either so FASTQ records
+        # read by this package can be written back out.
+        try:
+            annotations = record.description
+        except AttributeError:
+            annotations = record.annotations
+
+        s = "@%s %s\n%s\n+\n%s\n" % (record.name, annotations,
+                                     record.sequence, record.accuracy)
+        self.fp.write(s)
diff --git a/screed/fqdbm.py b/screed/fqdbm.py
new file mode 100755
index 0000000..4559622
--- /dev/null
+++ b/screed/fqdbm.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2008-2010, Michigan State University
+
+import sys
+from __init__ import read_fastq_sequences
+import DBConstants
+
+# A shell interface to the screed FQDBM database writing function
+if __name__ == "__main__":
+    # Make sure the user entered the command line arguments correctly
+    if len(sys.argv) != 2:
+        sys.stderr.write("ERROR: USAGE IS: %s <dbfilename>\n" % sys.argv[0])
+        # sys.exit() rather than the site-provided exit(), which is meant
+        # for interactive sessions and is absent under 'python -S'.
+        sys.exit(1)
+
+    filename = sys.argv[1]
+    read_fastq_sequences(filename)
+
+    print "Database saved in %s%s" % (filename, DBConstants.fileExtension)
+    sys.exit(0)
diff --git a/screed/hava.py b/screed/hava.py
new file mode 100644
index 0000000..fa6332c
--- /dev/null
+++ b/screed/hava.py
@@ -0,0 +1,27 @@
+import DBConstants
+
+# (field name, role) pairs describing one 'hava' record: 'hava' is the
+# indexed text key, every other field is stored as standard text.
+FieldTypes = (
+    ('hava', DBConstants._INDEXED_TEXT_KEY),
+    ('quarzk', DBConstants._STANDARD_TEXT),
+    ('muchalo', DBConstants._STANDARD_TEXT),
+    ('fakours', DBConstants._STANDARD_TEXT),
+    ('selimizicka', DBConstants._STANDARD_TEXT),
+    ('marshoon', DBConstants._STANDARD_TEXT),
+)
+
+
+def hava_iter(handle):
+    """
+    Iterator over a 'hava' sequence file, returning records. handle
+    is a handle to a file opened for reading
+
+    A record is six consecutive lines, stored (stripped) under the keys
+    'hava', 'quarzk', 'muchalo', 'fakours', 'selimizicka' and 'marshoon'.
+    Iteration stops at the first empty 'hava' line (including end of file).
+    """
+    line = handle.readline().strip()
+    while line:
+        # Build a fresh dict for every record: reusing one dict would
+        # overwrite records the caller is still holding on to.
+        data = {'hava': line}
+        for fieldname in ('quarzk', 'muchalo', 'fakours',
+                          'selimizicka', 'marshoon'):
+            data[fieldname] = handle.readline().strip()
+
+        # Read the next record's first line before handing this one out
+        line = handle.readline().strip()
+        yield data
diff --git a/screed/openscreed.py b/screed/openscreed.py
new file mode 100644
index 0000000..25581b6
--- /dev/null
+++ b/screed/openscreed.py
@@ -0,0 +1,281 @@
+import os
+import types
+import UserDict
+import types
+import sqlite3
+import gzip
+import bz2
+
+import DBConstants
+import screedRecord
+from fastq import fastq_iter, FASTQ_Writer
+from fasta import fasta_iter, FASTA_Writer
+
+def get_writer_class(read_iter):
+    """
+    Return the writer class matching the reader 'read_iter', chosen by
+    the reader's __name__; None when the name is not recognised.
+    """
+    writers = {'fasta_iter': FASTA_Writer,
+               'fastq_iter': FASTQ_Writer}
+    return writers.get(read_iter.__name__)
+
+def open_writer(inp_filename, outp_filename):
+    """
+    Open 'outp_filename' with the writer class that matches the format
+    detected for 'inp_filename'.
+    """
+    writer_class = get_writer_class(open_reader(inp_filename))
+    return writer_class(outp_filename)
+
+def open_reader(filename, *args, **kwargs):
+    """
+    Make a best-effort guess as to how to open/parse the given sequence file.
+
+    Deals with .gz and .bz2 compression (chosen by file name suffix) and
+    with FASTA and FASTQ records (chosen by the first character of the
+    file). Extra arguments are passed on to the record iterator.
+
+    Returns an empty list for an empty file, otherwise the record
+    iterator. Raises Exception when the first line starts with neither
+    '>' nor '@'.
+    """
+    if filename.endswith('.gz'):
+        fp = gzip.open(filename)
+    elif filename.endswith('.bz2'):
+        fp = bz2.BZ2File(filename)
+    else:
+        # 'open' is rebound to open_reader at module level below, so the
+        # builtin file type is used here.
+        fp = file(filename)
+
+    line = fp.readline()
+
+    if not line:
+        fp.close()  # nothing will ever read from the handle
+        return []
+
+    if line.startswith('>'):
+        iter_fn = fasta_iter
+    elif line.startswith('@'):
+        iter_fn = fastq_iter
+    else:
+        fp.close()  # do not leak the handle on the error path
+        raise Exception("unknown file format for '%s'" % filename)
+
+    # Rewind so the iterator sees the first line again
+    fp.seek(0)
+    return iter_fn(fp, *args, **kwargs)
+
+open = open_reader
+
+class ScreedDB(object, UserDict.DictMixin):
+    """
+    Core on-disk dictionary interface for reading screed databases. Accepts a
+    path string to a screed database
+    """
+    def __init__(self, filepath):
+        """
+        Open the screed database at 'filepath'; the screed file extension
+        is appended when 'filepath' does not already end with it.
+
+        Raises ValueError when the file does not exist and TypeError when
+        it does not hold exactly the screed dictionary and admin tables.
+        """
+        self._filepath = filepath
+        self._db = None
+        if not self._filepath.endswith(DBConstants.fileExtension):
+            self._filepath += DBConstants.fileExtension
+
+        if not os.path.exists(self._filepath):
+            raise ValueError('No such file: %s' % self._filepath)
+
+        self._db = sqlite3.connect(self._filepath)
+        cursor = self._db.cursor()
+
+        # Make sure the database is a prepared screed database
+        query = "SELECT name FROM sqlite_master WHERE type='table' "\
+                "ORDER BY name"
+        res = cursor.execute(query)
+        try:
+            # Unpacking None (no more rows) raises TypeError as well
+            dictionary_table, = res.fetchone()
+            admin_table, = res.fetchone()
+
+            if dictionary_table != DBConstants._DICT_TABLE:
+                raise TypeError
+            if admin_table != DBConstants._SCREEDADMIN:
+                raise TypeError
+
+        except TypeError:
+            self.close()
+            raise TypeError("Database %s is not a proper screed database"
+                            % self._filepath)
+
+        if res.fetchone() is not None:
+            self.close()
+            raise TypeError("Database %s has too many tables."
+                            % self._filepath)
+
+        # Store the fields of the admin table in a tuple
+        query = "SELECT %s, %s FROM %s" % \
+                (DBConstants._FIELDNAME,
+                 DBConstants._ROLENAME,
+                 DBConstants._SCREEDADMIN)
+        res = cursor.execute(query)
+        self.fields = tuple([(str(field), role) for field, role in res])
+
+        # Indexed text column for querying, search fields to find.
+        # Defaults to the second admin entry; the last field carrying the
+        # indexed-text role wins.
+        self._queryBy = self.fields[1][0]
+        for fieldname, role in self.fields:
+            if role == DBConstants._INDEXED_TEXT_KEY:
+                self._queryBy = fieldname
+
+        # Sqlite PRAGMA settings for speed
+        cursor.execute("PRAGMA cache_size=2000")
+
+        # Retrieve the length of the database. MAX() yields NULL for an
+        # empty table; store 0 then so that len() keeps working.
+        query = 'SELECT MAX(%s) FROM %s' % (DBConstants._PRIMARY_KEY,
+                                            DBConstants._DICT_TABLE)
+        max_id, = cursor.execute(query).fetchone()
+        self._len = max_id or 0
+
+    def __del__(self):
+        """
+        Alias for close()
+        """
+        self.close()
+
+    def close(self):
+        """
+        Closes the sqlite database handle
+        """
+        if self._db is not None:
+            self._db.close()
+            self._db = None
+
+    def __getitem__(self, key):
+        """
+        Retrieves from database the record with the key 'key'.
+        Raises KeyError when no row matches.
+        """
+        cursor = self._db.cursor()
+        key = str(key)  # So lazy retrieval objects are evaluated
+        query = 'SELECT %s FROM %s WHERE %s=?' % (self._queryBy,
+                                                  DBConstants._DICT_TABLE,
+                                                  self._queryBy)
+        res = cursor.execute(query, (key,))
+        if res.fetchone() is None:
+            raise KeyError("Key %s not found" % key)
+        return screedRecord._buildRecord(self.fields, self._db,
+                                         key,
+                                         self._queryBy)
+
+    def values(self):
+        """
+        Retrieves all records from the database and returns them as a list
+        """
+        return list(self.itervalues())
+
+    def items(self):
+        """
+        Retrieves all records from the database and returns them as a list
+        of the pairs produced by iteritems()
+        """
+        return list(self.iteritems())
+
+    def loadRecordByIndex(self, index):
+        """
+        Retrieves record from database at the given (0-based) index.
+        Raises KeyError when no row has that index.
+        """
+        cursor = self._db.cursor()
+        rowid = int(index) + 1  # Hack to make indexing start at 0
+        query = 'SELECT %s FROM %s WHERE %s=?' % (DBConstants._PRIMARY_KEY,
+                                                  DBConstants._DICT_TABLE,
+                                                  DBConstants._PRIMARY_KEY)
+        res = cursor.execute(query, (rowid,))
+        if res.fetchone() is None:
+            # Report the index the caller asked for, not the shifted row id
+            raise KeyError("Index %d not found" % (rowid - 1))
+        return screedRecord._buildRecord(self.fields, self._db,
+                                         rowid,
+                                         DBConstants._PRIMARY_KEY)
+
+    def __len__(self):
+        """
+        Returns the number of records in the database (taken from the
+        largest primary key when the database was opened)
+        """
+        return self._len
+
+    def keys(self):
+        """
+        Returns a list of keys in the database
+        """
+        return list(self.iterkeys())
+
+    def __repr__(self):
+        """
+        Returns a string with some general information about the database
+        """
+        return "<%s, '%s'>" % (self.__class__.__name__,
+                               self._filepath)
+
+    def itervalues(self):
+        """
+        Iterator over records in the database
+        """
+        for index in xrange(1, self.__len__()+1):
+            yield screedRecord._buildRecord(self.fields, self._db,
+                                            index,
+                                            DBConstants._PRIMARY_KEY)
+
+    def iterkeys(self):
+        """
+        Iterator over keys in the database
+        """
+        cursor = self._db.cursor()
+        # NOTE(review): 'id' is hard-coded here; presumably it equals
+        # DBConstants._PRIMARY_KEY -- confirm against DBConstants.
+        query = 'SELECT %s FROM %s ORDER BY id' % (self._queryBy,
+                                                   DBConstants._DICT_TABLE)
+        for key, in cursor.execute(query):
+            yield key
+
+    def iteritems(self):
+        """
+        Iterator returning (index, record) pairs, where index is the
+        record's primary-key field
+        """
+        for v in self.itervalues():
+            yield v[DBConstants._PRIMARY_KEY], v
+
+    def has_key(self, key):
+        """
+        Returns true if given key exists in database, false otherwise
+        """
+        return key in self
+
+    def copy(self):
+        """
+        Returns this same object (no new database handle is opened)
+        """
+        return self
+
+    def __contains__(self, key):
+        """
+        Returns true if given key exists in database, false otherwise
+        """
+        cursor = self._db.cursor()
+        query = 'SELECT %s FROM %s WHERE %s = ?' % \
+                (self._queryBy, DBConstants._DICT_TABLE, self._queryBy)
+        return cursor.execute(query, (key,)).fetchone() is not None
+
+    # Here follow the methods that are not implemented. They accept the
+    # usual dict call shapes so that callers get the intended
+    # AttributeError rather than a TypeError about argument counts.
+
+    def __setitem__(self, key, value):
+        """
+        Not implemented (Read-only database)
+        """
+        raise AttributeError
+
+    def clear(self):
+        """
+        Not implemented (Read-only database)
+        """
+        raise AttributeError
+
+    def update(self, *args, **kwargs):
+        """
+        Not implemented (Read-only database)
+        """
+        raise AttributeError
+
+    def setdefault(self, *args, **kwargs):
+        """
+        Not implemented (Read-only database)
+        """
+        raise AttributeError
+
+    def pop(self, *args, **kwargs):
+        """
+        Not implemented (Read-only database)
+        """
+        raise AttributeError
+
+    def popitem(self):
+        """
+        Not implemented (Read-only database)
+        """
+        raise AttributeError
diff --git a/screed/pygr_api.py b/screed/pygr_api.py
new file mode 100644
index 0000000..81f3545
--- /dev/null
+++ b/screed/pygr_api.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2008-2010, Michigan State University
+
+"""
+A simple wrapper implementing a pygr-compatible SequenceDB based on screed.
+
+There are two implementations:
+ - ScreedSequenceDB
+ - ScreedSequenceDB_ByIndex
+
+ScreedSequenceDB uses the sequence name as the sequence ID, which
+mimics the behavior of pygr's SequenceFileDB and is good for
+small-to-medium sized collections of sequences.
+ScreedSequenceDB_ByIndex uses the sequence's index (0...size of
+database) as a sequence ID, rather than the sequence name; this is
+much faster for databases with many, many sequences.
+
+Unlike the normal seqdb, screed will load the entire sequence record
+into memory on request, so it's not good for large sequences.
+
+All screed records are guaranteed to have an 'index', a 'name', and a
+'sequence' attribute; anything else is specific to the database writer
+you use. The raw screed record (which contains any other information)
+is available under seqObj.record.
+
+Note: the underlying screed database must already have been built with
+fadbm or fqdbm.
+
+CTB 3/20/09
+"""
+
+import UserDict
+
+from screed import ScreedDB
+
+from pygr.sequence import SequenceBase
+from pygr.seqdb import SequenceDB
+from pygr.sequtil import DNA_SEQTYPE
+
+###
+
+class ScreedSequence(SequenceBase):
+    """Sequence implementation backed by a screed record.
+
+    Attributes:
+      - 'id' is the identifier the sequence was requested under
+        (standard pygr-ish attribute).
+      - 'record' is the screed 'record' object, containing name, etc.
+      - 'name' is the record name, which can be the same as 'id' but
+        can also be different (see ScreedSequenceDB_ByIndex).
+      - 'seq' is the sequence.
+
+    """
+    def __init__(self, db, id):
+        self.id = id
+        SequenceBase.__init__(self)
+
+        # Look the record up once through the database's seqInfoDict
+        record = db.seqInfoDict[id].record
+        self.record = record
+        self.name = record.name
+        self.seq = record.sequence
+
+class ScreedSequenceDB(SequenceDB):
+    """SequenceDB implementation based on screed; retrieve seqs by name."""
+    # Class used for the sequence objects of this database
+    itemClass = ScreedSequence
+
+    def __init__(self, filepath):
+        """Open the screed database at 'filepath', keyed by sequence name."""
+        self.filepath = filepath
+        # seqInfoDict is assigned before the base-class initializer runs
+        self.seqInfoDict = _ScreedSeqInfoDict_ByName(filepath)
+        SequenceDB.__init__(self)
+
+    def _set_seqtype(self):
+        """Mark the sequence type as DNA without inspecting any sequence."""
+        self._seqtype = DNA_SEQTYPE
+
+    def __repr__(self):
+        """Return '<ClassName 'filepath'>'."""
+        return "<%s '%s'>" % (self.__class__.__name__, self.filepath)
+
+    # override inherited __reduce__/__getstate__/__setstate__ from SequenceDB.
+    def __reduce__(self):
+        """Pickle as a call to ScreedSequenceDB(filepath)."""
+        return (ScreedSequenceDB, (self.filepath,))
+
+class ScreedSequenceDB_ByIndex(SequenceDB):
+    """SequenceDB implementation based on screed; retrieve seqs by index."""
+    # Class used for the sequence objects of this database
+    itemClass = ScreedSequence
+
+    def __init__(self, filepath):
+        """Open the screed database at 'filepath', keyed by record index."""
+        self.filepath = filepath
+        # seqInfoDict is assigned before the base-class initializer runs
+        self.seqInfoDict = _ScreedSeqInfoDict_ByIndex(filepath)
+        SequenceDB.__init__(self)
+
+    def _set_seqtype(self):
+        """Mark the sequence type as DNA without inspecting any sequence."""
+        self._seqtype = DNA_SEQTYPE
+
+    def __repr__(self):
+        """Return '<ClassName 'filepath'>'."""
+        return "<%s '%s'>" % (self.__class__.__name__, self.filepath)
+
+    # override inherited __reduce__/__getstate__/__setstate__ from SequenceDB.
+    def __reduce__(self):
+        """Pickle as a call to ScreedSequenceDB_ByIndex(filepath)."""
+        return (ScreedSequenceDB_ByIndex, (self.filepath,))
+
+class _ScreedSequenceInfo(object):
+    """Value type stored in seqInfoDict: an id, its screed record and the
+    length of the record's sequence."""
+    def __init__(self, id, record):
+        self.id, self.record = id, record
+        self.length = len(record.sequence)
+
+class _ScreedSeqInfoDict_ByName(object, UserDict.DictMixin):
+    """seqInfoDict implementation whose keys are record names."""
+    def __init__(self, filepath):
+        self.sdb = ScreedDB(filepath)
+
+    def __getitem__(self, k):
+        return _ScreedSequenceInfo(k, self.sdb[k])
+
+    def keys(self):
+        return self.sdb.keys()
+
+    def itervalues(self):
+        # The record count is read once, before iteration starts
+        for index in xrange(len(self.sdb)):
+            record = self.sdb.loadRecordByIndex(index)
+            yield _ScreedSequenceInfo(record.name, record)
+
+    def iteritems(self):
+        for info in self.itervalues():
+            yield info.record.name, info
+
+
+class _ScreedSeqInfoDict_ByIndex(object, UserDict.DictMixin):
+    """seqInfoDict implementation whose keys are record indices."""
+    def __init__(self, filepath):
+        self.sdb = ScreedDB(filepath)
+
+    def __getitem__(self, k):
+        # The key is converted with int() for the lookup, but the info
+        # object keeps the key exactly as it was given.
+        record = self.sdb.loadRecordByIndex(int(k))
+        return _ScreedSequenceInfo(k, record)
+
+    def keys(self):
+        return xrange(0, len(self.sdb))
+
+    def iterkeys(self):
+        # The record count is read once, before iteration starts
+        for index in xrange(len(self.sdb)):
+            yield index
+
+###
+
+if __name__ == '__main__':
+    import sys
+    filename = sys.argv[1]
+
+    # Dump every entry, first keyed by name and then keyed by index
+    for db_class in (ScreedSequenceDB, ScreedSequenceDB_ByIndex):
+        db = db_class(filename)
+        for k in db:
+            print k, repr(db[k]), db[k].name
diff --git a/screed/screedRecord.py b/screed/screedRecord.py
new file mode 100644
index 0000000..e1d5434
--- /dev/null
+++ b/screed/screedRecord.py
@@ -0,0 +1,195 @@
+import UserDict
+import types
+import DBConstants
+import gzip
+import bz2
+
+class _screed_record_dict(UserDict.DictMixin):
+    """
+    Simple dict-like record interface with bag behavior.
+
+    Items live in the plain dict 'self.d' and can be read either as
+    record['name'] or as record.name.
+    """
+    def __init__(self, *args, **kwargs):
+        self.d = dict(*args, **kwargs)
+
+    def __getitem__(self, name):
+        return self.d[name]
+
+    def __setitem__(self, name, value):
+        self.d[name] = value
+
+    def __getattr__(self, name):
+        # Only called when normal attribute lookup fails. If 'd' itself is
+        # missing (instance created without running __init__), reading
+        # self.d below would re-enter this method without end.
+        if name == 'd':
+            raise AttributeError(name)
+        try:
+            return self.d[name]
+        except KeyError:
+            raise AttributeError(name)
+
+    def keys(self):
+        return self.d.keys()
+
+class _screed_attr(object):
+    """
+    Sliceable database object that supports lazy retrieval
+    """
+    def __init__(self, dbObj, attrName, rowName, queryBy):
+        """
+        Initializes database object with specific record retrieval
+        information
+        dbObj = database handle
+        attrName = name of attr in db
+        rowName = index/name of row
+        queryBy = by name or index
+        """
+        self._dbObj = dbObj
+        self._attrName = attrName
+        self._rowName = rowName
+        self._queryBy = queryBy
+
+    def __getitem__(self, sliceObj):
+        """
+        Slicing interface. Returns the slice range given.
+        *.start + 1 to be compatible with sqlite's 1 not 0 scheme
+
+        An omitted start means 0 and an omitted stop means the full
+        length; the slice step is ignored. Raises TypeError for a
+        non-slice argument, ValueError when start is greater than stop
+        and KeyError when the row cannot be found.
+        """
+        if not isinstance(sliceObj, slice):
+            raise TypeError('__getitem__ argument must be of slice type')
+
+        start = sliceObj.start
+        if start is None:
+            start = 0
+        stop = sliceObj.stop
+        if stop is None:
+            stop = len(self)
+
+        if not start <= stop:  # String reverse in future?
+            raise ValueError('start must be less than stop in slice object')
+        length = stop - start
+
+        query = 'SELECT substr(%s, %d, %d) FROM %s WHERE %s = ?' \
+                % (self._attrName, start + 1, length,
+                   DBConstants._DICT_TABLE,
+                   self._queryBy)
+        cur = self._dbObj.cursor()
+        result = cur.execute(query, (str(self._rowName),))
+        try:
+            subStr, = result.fetchone()
+        except TypeError:  # fetchone() returned None: no such row
+            raise KeyError("Key %s not found" % self._rowName)
+        return str(subStr)
+
+    def __len__(self):
+        """
+        Returns the length of the string
+        """
+        return len(self.__str__())
+
+    def __repr__(self):
+        """
+        Prints out the name of the class and the name of the sliceable attr
+        """
+        return "<%s '%s'>" % (self.__class__.__name__, self._attrName)
+
+    def __cmp__(self, given):
+        """
+        Handles comparisons other than == and !=
+
+        'given' must be a str or another _screed_attr; anything else
+        raises TypeError.
+        """
+        ownString = self.__str__()
+        if isinstance(given, _screed_attr):
+            given = str(given)
+        elif not isinstance(given, str):
+            raise TypeError("Cannot compare to given type: %s" % type(given))
+
+        if ownString < given:
+            return -1
+        elif ownString > given:
+            return 1
+        else:
+            return 0
+
+    def __eq__(self, given):
+        """
+        Compares attribute to given object in string form
+        """
+        if type(given) == types.StringType:
+            return given == self.__str__()
+
+        try:
+            return str(given) == self.__str__()
+        except AttributeError:
+            raise TypeError("Cannot compare to given type: %s" % type(given))
+
+    def __ne__(self, given):
+        """
+        Compares attribute to given object in string form; always the
+        negation of __eq__
+        """
+        # Must compare the stored string (not the repr) to agree with ==
+        return not self.__eq__(given)
+
+    def __str__(self):
+        """
+        Returns the full attribute as a string.
+        Raises KeyError when the row cannot be found.
+        """
+        query = 'SELECT %s FROM %s WHERE %s = ?' \
+                % (self._attrName, DBConstants._DICT_TABLE, self._queryBy)
+        cur = self._dbObj.cursor()
+        result = cur.execute(query, (str(self._rowName),))
+        try:
+            record, = result.fetchone()
+        except TypeError:  # fetchone() returned None: no such row
+            raise KeyError("Key %s not found" % self._rowName)
+        return str(record)
+
+def _buildRecord(fieldTuple, dbObj, rowName, queryBy):
+    """
+    Build the dict-like record for the row of 'dbObj' selected by
+    queryBy = rowName.
+
+    Fields with the sliceable-text role become lazy _screed_attr objects;
+    all other fields are fetched right away and stored as strings. The
+    primary-key value is shifted down by one so indexing starts at 0.
+    """
+    sliceable = DBConstants._SLICEABLE_TEXT
+
+    # Lazy fields: wrapped, nothing is read from the database yet
+    lazyPairs = [(fieldname,
+                  _screed_attr(dbObj, fieldname, rowName, queryBy))
+                 for fieldname, role in fieldTuple if role == sliceable]
+
+    # Eager fields: fetched with a single SELECT
+    eagerNames = [fieldname
+                  for fieldname, role in fieldTuple if role != sliceable]
+    query = 'SELECT %s FROM %s WHERE %s=?' % \
+            (','.join(eagerNames), DBConstants._DICT_TABLE, queryBy)
+    row = dbObj.cursor().execute(query, (rowName,)).fetchone()
+    eagerPairs = list(zip(eagerNames, [str(value) for value in row]))
+
+    def zeroBased(key, value):
+        # Hack to make indexing start at 0
+        if key == DBConstants._PRIMARY_KEY:
+            return key, int(value) - 1
+        return key, value
+
+    return _screed_record_dict([zeroBased(key, value)
+                                for key, value in lazyPairs + eagerPairs])
+
+
+class _Writer(object):
+    """
+    Base class for record writers: holds the output file object in
+    'self.fp'. Subclasses supply write(record).
+    """
+    def __init__(self, filename, fp=None):
+        """
+        Use 'fp' as the output when given; otherwise open 'filename' for
+        writing, with gzip or bz2 when the name ends in '.gz' or '.bz2'.
+        """
+        self.filename = filename
+        if fp is not None:
+            self.fp = fp
+        elif filename.endswith('.gz'):
+            self.fp = gzip.open(filename, 'w')
+        elif filename.endswith('.bz2'):
+            self.fp = bz2.BZ2File(filename, 'w')
+        else:
+            self.fp = file(filename, 'wb')
+
+    def consume(self, read_iter):
+        """Write every record produced by 'read_iter'."""
+        for record in read_iter:
+            self.write(record)
+
+    def close(self):
+        """Close the underlying file object."""
+        self.fp.close()
diff --git a/screed/seqparse.py b/screed/seqparse.py
new file mode 100644
index 0000000..e07ffa7
--- /dev/null
+++ b/screed/seqparse.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2008-2010, Michigan State University
+
+"""
+seqparse contains custom sequence parsers for extending screed's
+functionality to arbitrary sequence formats. An example 'hava'
+parser is included for API reference
+"""
+
+import os
+from createscreed import create_db
+from openscreed import ScreedDB
+import fastq
+import fasta
+import hava
+
+def _read_sequences(filename, fieldTypes):
+    """
+    Shared body of the FASTQ and FASTA loaders: parse 'filename' into a
+    screed database laid out as 'fieldTypes' and return it opened.
+    """
+    import openscreed
+
+    # Will raise an exception if the file doesn't exist
+    iterfunc = openscreed.open(filename)
+
+    # Create the screed db
+    create_db(filename, fieldTypes, iterfunc)
+
+    return ScreedDB(filename)
+
+def read_fastq_sequences(filename):
+    """
+    Function to parse text from the given FASTQ file into a screed database
+    """
+    return _read_sequences(filename, fastq.FieldTypes)
+
+def read_fasta_sequences(filename):
+    """
+    Function to parse text from the given FASTA file into a screed database
+    """
+    return _read_sequences(filename, fasta.FieldTypes)
+
+def read_hava_sequences(filename):
+    """
+    Function to parse text from the given HAVA file into a screed database
+    """
+    # Will raise an exception if the file doesn't exist
+    theFile = open(filename, "rb")
+    try:
+        # Setup the iterator function
+        iterfunc = hava.hava_iter(theFile)
+
+        # Create the screed db
+        create_db(filename, hava.FieldTypes, iterfunc)
+    finally:
+        # Close the input even when database creation fails
+        theFile.close()
+
+    return ScreedDB(filename)
diff --git a/screed/tests/__init__.py b/screed/tests/__init__.py
new file mode 100644
index 0000000..792d600
--- /dev/null
+++ b/screed/tests/__init__.py
@@ -0,0 +1 @@
+#
diff --git a/screed/tests/__main__.py b/screed/tests/__main__.py
new file mode 100644
index 0000000..2f8ee30
--- /dev/null
+++ b/screed/tests/__main__.py
@@ -0,0 +1,8 @@
+import os
+import sys
+
+if __name__ == '__main__':
+    # With no command line arguments, pass this file's directory along
+    if not sys.argv[1:]:
+        sys.argv.append(os.path.dirname(__file__))
+    import nose
+    nose.main()
diff --git a/screed/tests/empty.fa b/screed/tests/empty.fa
new file mode 100644
index 0000000..e69de29
diff --git a/screed/tests/havaGen.py b/screed/tests/havaGen.py
new file mode 100755
index 0000000..2dda9cd
--- /dev/null
+++ b/screed/tests/havaGen.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+
+"""
+havaGen is for generating sequence files of the imaginary type 'hava'.
+These files consist of attributes in the following newline separated order
+hava
+quarzk
+muchalo
+fakours
+selimizicka
+marshoon
+
+Since this 'sequence' has absolutely no utility outside of screed, its only
+purpose is to make sure screed can work with arbitrary fields when running
+the nosetests.
+
+This is a work of fiction. Names are the product of the author's imagination
+and any resemblance to real life is entirely coincidental.
+"""
+
+import sys, os
+import random
+
+class collectionOFiles(object):
+    """
+    A set of output files '<baseName>_<i>' that all receive the same
+    records; file i stops after totalSize / max(2*i, 1) records.
+    """
+    def __init__(self, baseName, divisions, totalSize):
+        self.baseName = baseName
+        self.divisions = divisions
+        self.totalSize = totalSize
+
+        # filename -> (open handle, record limit, records written so far)
+        self.fileHandles = {}
+        for i in range(0, divisions):
+            filename = self.baseName + ("_%d" % i)
+            fh = open(filename, "wb")
+            divisor = (i * 2) or 1
+            self.fileHandles[filename] = (fh, self.totalSize/divisor, 0)
+
+    def writeRecord(self, hava, quarzk, muchalo, fakours, selimizicka, marshoon):
+        """Append one record to every open file, closing those now full."""
+        text = "%s\n%s\n%s\n%s\n%s\n%s\n" % (hava, quarzk, muchalo, fakours, selimizicka, marshoon)
+        finishedNames = []
+        for filename, (fh, limit, count) in list(self.fileHandles.items()):
+            fh.write(text)
+            count += 1
+            if count >= limit:
+                fh.close()
+                finishedNames.append(filename)
+            else:
+                self.fileHandles[filename] = (fh, limit, count)
+
+        for filename in finishedNames:
+            self.fileHandles.pop(filename)
+
+    def finished(self):
+        """Return True once every file has reached its limit."""
+        return len(self.fileHandles) == 0
+
+def genString(length, allowedChars):
+    """
+    Return a string of 'length' items, each picked at random (via
+    random.randint) from 'allowedChars'.
+    """
+    lastIndex = len(allowedChars) - 1
+    return "".join([allowedChars[random.randint(0, lastIndex)]
+                    for _ in range(0, length)])
+
+def createHavaFiles(filename, size, divisions):
+    """
+    Write random 'hava' records to a collectionOFiles built from
+    (filename, divisions, size) until every file is full.
+    """
+    cof = collectionOFiles(filename, divisions, size)
+    lenString = 80
+    # One alphabet per generated field, in record order:
+    # quarzk, muchalo, fakours, selimizicka, marshoon
+    alphabets = (
+        ['A', 'T', 'C', 'G'],
+        "A B C D E F G H I J K L M N O P".split(' '),
+        "1 2 3 4 5 6 7 8 9".split(' '),
+        ["b"],
+        "A 1 B 2 C 3 D 4 E 5 G 6 F 7".split(' '),
+    )
+    counter = 0
+    while not cof.finished():
+        hava = "test_00%d" % counter
+        fields = [genString(lenString, alphabet) for alphabet in alphabets]
+        cof.writeRecord(hava, *fields)
+        counter += 1
+
+if __name__ == '__main__':
+    # Expect exactly three arguments: base file name, size and divisions
+    if len(sys.argv) != 4:
+        print "Usage: <filename> <size> <divisions>"
+        # sys.exit() rather than the site-provided exit(), which is meant
+        # for interactive sessions and is absent under 'python -S'.
+        sys.exit(1)
+
+    filename = sys.argv[1]
+    size = int(sys.argv[2])
+    divisions = int(sys.argv[3])
+
+    createHavaFiles(filename, size, divisions)
diff --git a/screed/tests/test-whitespace.fa b/screed/tests/test-whitespace.fa
new file mode 100644
index 0000000..2fe4b7d
--- /dev/null
+++ b/screed/tests/test-whitespace.fa
@@ -0,0 +1,12 @@
+>ENSMICT00000012722 cdna:pseudogene scaffold:micMur1:scaffold_185008:9:424:1 gene:ENSMICG00000012730
+TGCAGAAAATATCAAGAGTCAGCAGAAAAACTATACAAGGGCTGGTATTTTGATTATTCT
+ATAAAAATTCACTTTTTGCTCAGTGTCTTTCATCTGGGCCTGGCCTCCTCTCTTGCAAGC
+CCTGGATTCATAACATCTATAATAATTTTTATATGTGGTAGAGTAATATTAGCTGATTCC
+TTTGCCTCCTGTTCCTTCCCCTCATTCAGGCAGCTGGCCAGGTTTGTGCTCCTTATCTCG
+CAGAAGAGATGTGATAGCAGGCAGAGAATTAAAGTCTTCCTGGCTTTTGGTTTCAGAAGC
+TGCCTTGGGAAGGAAGCAAACAAACATGCCACAGATAAAATATTTGAAAGAAAAGATAAT
+GAAAGTAGAAAAGGGTTCCCTGTTCTTGTGGGGAGGAAGTGA
+
+>ENSMICT00000012401 cdna:novel scaffold:micMur1:scaffold_184912:461:550:-1 gene:ENSMICG00000012409
+GAACAGTCTCCTTTGGTTTGTGAAAAGAAACAAAAGAGTGTGGGGGTCGGGGAGCTCATC
+CAGCACTTCGTCGATTTCATGACCAACCAG
diff --git a/screed/tests/test.fa b/screed/tests/test.fa
new file mode 100644
index 0000000..7324b2c
--- /dev/null
+++ b/screed/tests/test.fa
@@ -0,0 +1,204 @@
+>ENSMICT00000012722 cdna:pseudogene scaffold:micMur1:scaffold_185008:9:424:1 gene:ENSMICG00000012730
+TGCAGAAAATATCAAGAGTCAGCAGAAAAACTATACAAGGGCTGGTATTTTGATTATTCT
+ATAAAAATTCACTTTTTGCTCAGTGTCTTTCATCTGGGCCTGGCCTCCTCTCTTGCAAGC
+CCTGGATTCATAACATCTATAATAATTTTTATATGTGGTAGAGTAATATTAGCTGATTCC
+TTTGCCTCCTGTTCCTTCCCCTCATTCAGGCAGCTGGCCAGGTTTGTGCTCCTTATCTCG
+CAGAAGAGATGTGATAGCAGGCAGAGAATTAAAGTCTTCCTGGCTTTTGGTTTCAGAAGC
+TGCCTTGGGAAGGAAGCAAACAAACATGCCACAGATAAAATATTTGAAAGAAAAGATAAT
+GAAAGTAGAAAAGGGTTCCCTGTTCTTGTGGGGAGGAAGTGA
+>ENSMICT00000012401 cdna:novel scaffold:micMur1:scaffold_184912:461:550:-1 gene:ENSMICG00000012409
+GAACAGTCTCCTTTGGTTTGTGAAAAGAAACAAAAGAGTGTGGGGGTCGGGGAGCTCATC
+CAGCACTTCGTCGATTTCATGACCAACCAG
+>ENSMICT00000004627 cdna:novel scaffold:micMur1:scaffold_184569:3:516:-1 gene:ENSMICG00000004628
+GCGAGCTCAGGCCGCCCTGGTTCACTCGTGTACCTCATGACCGCCCTCAAGGAAGACAAC
+CTGCGCCGCGCCTCGCCTGACTCGCGGTGGCACGTCTTAGAGAAATACGGGCGCATCGGG
+GACGAGTACAGCCCGCGGGTCTACTCCAAGGCGATCGGGTTCGCCTTCCTCCGCTTCCCC
+CACAGACAGGACGCCCAGGCCCGCCAGTACGCCCTGGGCGGGGGCCCTCCAGCAGCCCGC
+GACCTGCGCGTGCACAGGGCGCGCAGCCGCCCTCGGGTCTCCCAGCGCGGCCTCCCCCAC
+AGGTCCCGGTGCAGCTACACACGCTGCAGCCACTGGTCCTGCACTGGCTCTCCAGGCCCC
+AAATCCACAAGGGCGCGAAAGTGCAAGTCCCCATCAGGGTCCCGATGCCTCTCCAGGGCC
+AGGGCCACGTCCCTGTCCAGG
+>ENSMICT00000007212 cdna:novel scaffold:micMur1:scaffold_184313:3:308:1 gene:ENSMICG00000007217
+GTGAAGAAACCTCATCGCTACAGGCCTGGTACAGTGGCACTCCGTGAAATTAGACGTTAT
+CAGAAGTCCACTGAACTTCTGATTCGCAAACTTCCTTTCCAGCGTCTGGTGCGAGAAATT
+GCTCAGGACTTCAAAACAGATCTGCGCTTCCAGAGCGCAGCTATTGGTGCTTTGCAGGAG
+GCAAGTGAGGCCTATCTGGTTGGCCTTTTTGAAGACACCAATCTGTGTGCTATCCATGCC
+AAACGTGTAACAATTATGCCAAAAGACATCCAGCTAGCACGCCGCATACGTGGAGAACGT
+GCTTAA
+>ENSMICT00000008902 cdna:novel scaffold:micMur1:scaffold_183834:3:632:1 gene:ENSMICG00000008909
+TTCTTTGGGGGAGTTGAGATCATCCTGCTCGTGGTGATGGCCTATGACCGCTATGTGGCC
+ATCTGCAAACCCCTACACTACATGATCACAATGAACAGACAGGTGTGCAGCCTCCTGGTG
+GCTATGGCCTGGGCTGGCGGTTTTCTTCATGCTCTGATTCAAATTCTCTGTATGGTCTGG
+TTGCCTTTCTGTGGCCCCAATTTCATTGACCATTTCATCTGTGACCTTTTCCCTCTGCTA
+AAACTCTCCTGCACTGACACTCATATCTTTGGACTCTTTGTTGCCGCCAACAGTGGGCTG
+ATGTGTATGCTCATTTTTTCTATTCTTATCACCTCCTATGTCCTCATCCTTTGCTCACAG
+CGGAAGGCTCTCTCCACCTGTGCCTCCCATATCACTGTAGTCGTCCTATTCTTTGTACCC
+TGTATATTCGTGTACCTTCGGCCCATGATCACCTTCCCCGTTGATAAAGCTGTGGCCGTA
+TTTTATACGATGGTGACACCTATGCTAAACCCTTTAATCTATACCCTCAGAAACACAGAG
+GTGAAAAATGCCATGAGGAAGCTCTGGTGCCAAAGTGTAATCTTGGGTAATAATTTGTGT
+GCATAG
+>ENSMICT00000007189 cdna:pseudogene scaffold:micMur1:scaffold_183765:48:677:-1 gene:ENSMICG00000007188
+CGTGTCTGCTGCTCTGCTGGTCCCTGTCCGCCCCTGGCGCATGGGTGGTGCGAGGACCCA
+GAAGGTTCTGGGGCACAGAGGGCCCAGGCAGGCGGGACCGGGGATGTAAAGTGCCCCAGG
+GGTAGCCCAGGTCCCCCAGAGGCTGCCAAGAGAGCCGGGAGCGCTGGTGCCAGGGCCAGG
+AGAAAGGCTGGTATTTATTCACCGTCTCTGATGCAGGAGCCCCCACGGGCCAAGGTGAGG
+AAGGGTCTGGGGCAGAGATTCCGCCCACCGTCTTCCACCCAGGTGGCTTCTCCATCCATC
+CCTCTCAGGACAGAAACTGTCCCACAGGTCCAGAGCAGAGGACCCCTCTGGCTCCAGCTT
+CCTAAGCAAATACTGGATTCCAAGCTGAGGGGCTTTGGCCATCTAAGCCGATCCTGCATG
+GTGGAGGAATCAAAGGCCCAGAGAGGGGAAGCAATGCCTCCAGTGTCACACAGCACATCA
+GGGGCAGAGCAGGGACCAGTGAGACATGAGGAGAGCGGTAAAGGCCCTGCAGCCAAGCCC
+CACATCTACAAACGCCAACTTCACCCTCGCAACCTG
+>ENSMICT00000005512 cdna:pseudogene scaffold:micMur1:scaffold_183691:5:716:1 gene:ENSMICG00000005502
+GCACCCCCCGACGGCCTGCTCGCCTCCCCAGACCTGGGACTGCTCAAGCTCGCCTCCCCG
+GAGCTCGAGCGCCTCATCATCCAGTCCAACGGGCTGGTCACCACCACGCCGACCAGCACG
+CAGTTCCTCTACCCCAAGGTGGCGGCCAGCGAGGAGCAGGAGTTCGCCGAGGGCTTCGTC
+AAGGCCCTGGAGGATTTGCACAAGCAGAACCAGCTCGGCGCGGGCGCGGCCGCCGCTGCC
+GCCGCCGCCAGGGGCCCCTCGGGCACGGCCGCGGGCTCCGCGCCCTCCGGCGAGCTGGCC
+GCCGCGCCCGAGGCGCCCGTCTACGTGAACCTGAGCAGCTACGCGGGCGGCGACGTCGCT
+TTCGCCGCCGAGCCCATGCCCTTCCCGCCTCCGCTGCCACCCCCGGGCGCGCTGGGCCCG
+CCCCTGGCCGCGCTCAAGGATGAACCACAAACGGGGCCCGACGTGCCGAGCTTTGGCGAG
+AGCCCGCAGCTGTCGCCAATCGAAATGAAAACGCAGAAACGCATCAAAGGAGAGCGAAAG
+CGGTTGCACAATCGAATCGCCGCCTCAAAGTGCCGCAAGAGCAGGATGGAACATGGCTTC
+TTCCAAGAGAAAGTGAAAACTCTCAAGAGCAAGAACAAGAAGATGGAGGTCACGGCGAGT
+CTGCTGGCTCAGCAGTGGGGTCCGCAGCCT
+>ENSMICT00000008431 cdna:novel scaffold:micMur1:scaffold_182654:4:748:1 gene:ENSMICG00000008436
+CTGGTCATCGTGCTCGTGTGCAAGAAGCTGAGGAGCATCACGGACGTGTACCTCTTGAAC
+CTGGCCCTGTCCGACCTGCTCTTTGTCTTCTCCTTCCCCTTTCAGACCCACTATCAGCTG
+GACCAGTGGGTGTTTGGGACCGTGATGTGCAAAGTGGTCTCTGGATTTTATTACATTGGC
+TTCTTCAGCAGCATGTTCTTCATCACCCTCATGAGCGTGGACAGGTACCTGGCTATCGTC
+CACGCCGTGTACGCCATAAAAGTGAGGACGGCCAGAATGGGCACAGCCCTGAGCCTGGTA
+GTGTGGCTGACAGCCATCATGGCCACCAGCCCACTGCTAGTATTTTACCAAGTGGCCTCT
+GAAGACGGCGTCCTGCAGTGTTACTTGTCTTACAACCAGCAGACTTTGAAGTGGAAGATC
+TTCACCCACTTCGAAATGAACATCTTGGGCCTGTTGATCCCGTTCACCGTTCTTCTGTTC
+TGCTACCTTAGCATCCTGCACCAGCTGAGGAGGTGCCAGAACCACAACAAGACCAAGGCC
+ATCAAGCTGGTGCTCATCGTGGTCGTTGCATCTTTACTCTTCTGGGTCCCATTCAACGTG
+GTCCTCTTCCTCACGTCCCTGCATAACATGCACGTCTTGGATGGGTGTGCCCTGAGCCAG
+CAGCTGATTTATGCCACCCATGTCACAGAAACCATTTCGTTCACTCACTGCTGCGTGAAC
+CCAATTATCTATGCTTTCATG
+>ENSMICT00000004285 cdna:pseudogene scaffold:micMur1:scaffold_182595:5:483:-1 gene:ENSMICG00000004276
+ATGGTGCTACTGTTGCTAGTGGCCATCCCGCTGCTGGTGCACAGCTCCCGCGGGCCCGCG
+CACTACGAGATGCTGGGTCGTTGCCGTATGGTGTGCGACCCGCACGGGGCCCGAGGCCCG
+GGGCCCGACAGCGCGCCAGCTTCCGTGCCCCCCTTCCCGCCGGGCACCAAGGGAGAGGTG
+GGCCGGCGGGGAAAGGCAGGCCTGCGGGGGGCCCCCGGACCGCCAGGGCCCCCAGAGGGG
+CCCCCAGGAGAGCCGGGCAGGCCAGGCCCCCTGGGCCCTCCCGGCCCAGGTCCCGGGAAG
+GGGGGGCTTGCTGGTGGGTCCGGGCCTCGAATTGCTTTCTACCCGGCCTTGGGGGGGCCC
+CATGAAGGTAACAAGGGGCTGCGCTTCACCAACGGGGGGACTAAAGTGGGCAACGCGTAT
+GAAGCAGCCAGCGGCAAGTTTCTTTGCCCCATG
+>ENSMICT00000002278 cdna:pseudogene scaffold:micMur1:scaffold_181077:4:616:1 gene:ENSMICG00000002278
+ATGAGAATGATTGAACTGCACAATGGAGAATACAGCCAAGGGAAACAAGGCTTCACCATG
+GCAATGAACACCTTTGGTGACATGACCAATGAAGAATTCAGGCAGGTGATGAATGGCTTT
+CGAAACCAGAATCACGGGAAGGGGGAAGTTTTCCAAGAGCCTCTGCATGTCCCCAAATCT
+GTAGACTGGAGAGAGAAAGGCTATGTGACCCCTGTGAAGAATCAGGGTCGGTGTGGTTCT
+TGTTGGGCTTTTGGTTCAACTGGTGCTCTTGAAGGAAAGATCTTCTGGAAAACTGTCAAA
+GCTGTCAGTGAGCAGAACCTGATGGACTGCTCTTGGCCTCAAGGCAATCATGGCGGAGAT
+GGTGGCCATATGAACTATGCCTTCCAGTATATTAAGGAGACCGGAGGCCTGGAGTCTGAA
+AAATCCTATCCATACGTGGCAAGGGGTGAAATCTGTAAATACAAGCCTAAGAATTCTGTT
+GCTAATGACACTGGTTTCGTGGACGTCCCTGTTTGGGAGGAGGCTCTGATGAAGGCAGCG
+GCTATTGTGGGGCCCATTTCTGTTGCTACTGATGCAGGCCATGTCTCCTTCCAGTTCTAT
+ACATCA
+>ENSMICT00000012078 cdna:pseudogene scaffold:micMur1:scaffold_180699:3:774:-1 gene:ENSMICG00000012085
+GCGCACTCCCAGTGGCTACCCACGGCAGGAGGCGGCGGCAGTGACTGGGCCGGCGGCCCG
+CACTTGGAACACGGCAAGGCGGGCGGCGGCGGCACCGGCCGAGCCGACGACGGCGGAGGA
+GGAGGTTTCCACGCGCGCCTGGTGCACCAGGGGGCGGCCCACGCGGGCGCGGCATGGGCG
+CAGGGAGGCACAGCGCACCACTTGGGCCCAGCGATGTCGCCGTCGCCCGGAGCTGGCGGG
+GGCCATCAGCACCAGCCGCTCGGGCTGTACGCGCAGGCGGCCTACCCGGGGGGCGGCGGC
+GGCGGCCTGGCCGGGATGCTGGCGGCGGGTGGCGGCGGCGCGGGGCCGGGCCTGCACCAC
+GCGCTGCACGAGGACGGCCACGAGGCGCAGCTGGAGCCGTCGCCGCCGCCGCATCTGGGC
+GCCCACGGACACGCACACGGACACGCACACGCGGGCGGCCTGCACGCGGCGGCGGCGCAC
+CTGCACCCGGGCGCGGGCGGCGGCGGCTCGTCGGTGGGCGAGCACTCGGACGAGGATGCG
+CCCAGCTCAGACGACCTGGAGCAGTTCGCCAAGCAGTTCAAGCAGCGGCGCATCAAGCTG
+GGCTTCACGCAGGCCGACGTGGGGCTGGCCCTGGGCACACTCTACGGTAACGTGTTCTCG
+CAGACCACCATCTGCCGATTCGAGGCCCTGCAGCTGAGCTTCAAGAACATGTGCAAGCTG
+AAGCCGCTGCTCAACAAGTGGCTGGAGGAGACCGACTCGTCTAGCGGCAGC
+>ENSMICT00000017121 cdna:pseudogene scaffold:micMur1:scaffold_180116:146:531:-1 gene:ENSMICG00000017124
+ATGGCTGGCGGTAAGGCTGGGAAGGACTGCGGAAAGGCCAAGACAAAGGCGTCCCGCTGG
+CAGGGAGCCGGCTTGCAGTTCCCGGTGGGCCGCATTCATCCACACCTGAGATCCGGGACG
+ACCAGTCATGGCCGAGAGGGCGCGGCCGCCGCCCACAGCGCAGCCATCCTGCAGTGCCTC
+GCCGCCGCAGAGGTACTGGAATTGGCAGGAAATGCATCAAAAGACTTAAAGGTAAAGCGT
+ATTACCCCTCGCCATTTGCAACTTGCTGTTGGTGGAGACGAAGAATTGTCTCTCTTCAAG
+GCTACAATTGCTGGTGGTGGTGTCATCCCACACATCCACAAGTGTCTGATTGGGAAGAAA
+GGACAACAGAAGACTGTCTAA
+>ENSMICT00000017514 cdna:novel scaffold:micMur1:scaffold_179616:4:531:-1 gene:ENSMICG00000017514
+ATGGAGGCTGACAGTTGCTCCGTGGCCGCGGTGGGGAACAGGGACGTGGGGTCCCAGCGG
+GACATGAGCCTGGTGCCTGAGCGGCTTCAGAGACGGGAGCAAGATCGGCAACTAGAGGTG
+GAAAAGCGGACGCAAAAGCGACAGAACCAGGAGGTGGAGGAGGAGAAGAGCGACTTCTTC
+ACTGCCGCCTTCGCTCGGGAGCGAGCGGCGGTGGAAGAGCTTCTGGAGGGTGGGGAGTCA
+GTCCCTCGGCTGGAGGAGGCAGCCTCTCGGCTCCAGGGGCTGCAGAAACTTCTCAACGAC
+TCGGTTTTGTTCCTGGCCGCCTACGACGTGCGGCAGGGACAAGAGACACTGGCGCGGCTG
+CAGGCGGCCCTAGCTGAGCGGCGCCAGGAACTGCAGCCCAAAAAGCGTTTCGCTTTCAAG
+AACCGGAGAAAGGATAGTGCTTCGTCCACCAAAGTGGACGCGGCTCCTGCCGCCCCGGCT
+GAAAGCATCCTGGCCTCGCTGCCTAAGGAGGAGGGAAGCTTT
+>ENSMICT00000000730 cdna:pseudogene scaffold:micMur1:scaffold_178215:392:780:-1 gene:ENSMICG00000000731
+CCAACTATTGTGCAAGGGGTGGATACTCACATGCATCAGACCCTCGGTGAAGCTCGTGGC
+TGTCCTGGAAGAGATTGGGGGAGGGTCACCAAAGATCCCATGCACCTGGCACTCCAGGGT
+GAGGATAAATTGGGCCCAATCCCCAAACCTACTTTTCCTTCTCCCTCTCTCCTCACTCGG
+TTTGTAACTAGCCCTCAGATGAAAGATCATGGGTTTCCAGCAGGGAGAAACGGATTGACT
+CAGACAAGCTTCACATGCCAGGTGCCCTCAGGCTGGGGCAGTCCCTGGGGCTTCTTCCTG
+CCTCACCAGGTGCAATCCGTTCGAAGACCCTCGCTGCCACCCTGTCCTATTTCCCCCTGC
+>ENSMICT00000015022 cdna:pseudogene scaffold:micMur1:scaffold_177925:4:797:1 gene:ENSMICG00000015010
+CCGGAGCAGTACGGCGCGGGCATGCGCTCCTACGCGCCCTACCAACCACACCAGCCCGCG
+GCGCCCAAGGACCTGGTGAAGCCGCCCTACAGCTACATCGCGCTCATCACCATGGCCATC
+CAGAACGCGCCCGAGAAAGAAAGAATCACCCTGAACGGCATCTACCAGTTCATCATGGAC
+CGTTCCCCCTTCTACCGGGAGAACAAACAGGGCTGGCAGAACAGCATTCGCCACAACCTG
+TCGCTCAACGAGTGCTTCGTCAAGGTGCCCCGCGACGGCAAGAAGCCCGGCAAGGGCAGC
+TACTGGACGCTGGACCCGGACTCCTACAACATGTTCGAGAACGGCAGCTTCCTGGGGCGC
+CGGCGGCGCTTCAAAAAAGAAAGGAACGTGCCCAAGGAGAAGGAGGAGCGGCCCCACCTC
+AAGGAGCCCCCCCCCGCGGCGTCCAAAGGCGCCCCGGCCGGCCCCCCGCTGGCGGACGCC
+CCCAAGGAGGCCGAGAAGAAAGTGGTCATCAAGAGCGAGGCGGCGTCGCCGGCGCTGCCG
+GTCATCACCAAGGTGGAGACGCTGAGCCCCGAGAGCGCGCTGCAGGGCAGCCCGCGCAGC
+GCGGCCTCCACGCCCGCCGCGTCCCCCGACGGCTCGCTGCCCGAGCACCACGCCGCCGCG
+CCCAACGGGCTGCCCGGCTTCAGCGTGGAGAACATCATGACTCTGCGAACCTCGCCGCCG
+GGCGGCGAGCTGAGCCCGGCGGCAGGGCGCGCGGGCCTGGTG
+>ENSMICT00000010035 cdna:pseudogene scaffold:micMur1:scaffold_177605:351:798:1 gene:ENSMICG00000010041
+ATGGCCTCCATGGGGCTGCAGGTGACCGGCATCGCGCTGGCCGTGCTGGGCTGGCTGGGC
+GCCATGCTGAGCTGCGCGCTGCCCATGTGGCGCGTGACGGCCTTCATCGGCAGCAACATC
+GTCACGTCGCAGACCATCTGGGAGGGCCTGTGGATGAACTGCGTGGTGCAGAGCACCGGC
+CAGATGCAGTGCAAGGTGTACGACTCGCTGCTGGCGCTGCCGCAGGACCTGCAGGCAGCC
+CGCGCCCTCATCGTCATCTGCATCATCGTGGCCGCGCTGGGCGTGCTGCTGTCCGTGGTC
+GGGGGCAAGTGCACCAACTGCGTGGAGGACGAGAGCGCCAAGGCCAAGACCATGATTGTG
+GCCGGCGTGGTGTTCCTGTTGGCTGGCCTGCTGGTGATGGTGCCCGTGTCCTGGACGGCC
+AACAACATCATCCGCGACTTCTACAAC
+>ENSMICT00000014660 cdna:pseudogene scaffold:micMur1:scaffold_177020:625:796:-1 gene:ENSMICG00000014670
+CAAAAAATATTTGATTCTGGGGATTACAACATGGCTAAAGCAAAAATGAAGAACAAGCAA
+CTTCCTGCTGCAGCCCCGGATAAGACGGAGGTCACTGGTGACCACATTCCCACTCCACAA
+GACCTTCCTCAGCGGAAACCATCCCTTGTTGCTAGCAAGCTGGCTGGCTGA
+>ENSMICT00000007564 cdna:novel scaffold:micMur1:scaffold_176373:5:474:-1 gene:ENSMICG00000007566
+ATGGTGGATCGCTTGGCAAACAGTGAAGCAAATACTAGACGTATAAGTATAGTGGAAAAC
+TGCTTTGGAGCAGCTGGTCAACCCTTAACTATACCGGGACGGGTTCTTATTGGAGAAGGA
+GTATTGACTAAGTTGTGCAGAAAAAAGCCCAAAGCAAGGCAGTTTTTCTTATTTAATGAT
+ATTCTTGTATATGGTAATATTGTCATCCAGAAGAAAAAATATAACAAACAACATATTATT
+CCCCTGGAAAATGTCACTATTGATTCCATTAAAGATGAGGGAGACTTAAGGAATGGATGG
+CTTATCAAGACACCAACTAAATCATTTGCAGTTTATGCTGCCACTGCTACTGAGAAATCA
+GAATGGATGAATCACATAAATAAATGTGTTACTGATTTACTCTCCAAAAGTGGGAAGACA
+CCCAGTAATGAACACGCTGCTGTCTGGGTTCCTTCTGAGGCAACTGTA
+>ENSMICT00000001324 cdna:pseudogene scaffold:micMur1:scaffold_175958:285:807:1 gene:ENSMICG00000001326
+CTTCTGGCTGGAACCATGGATGGTGTCGAAGAGAAGAAAAAGAAGGCTCCTGCTGTGCCA
+GAGACCCTTAAGAAAAAGCAAAGGAATTTCGCGGAGCTGAAGATCAAACTCCTGAGAAAG
+TTTGCCCAAAAGATGCTTCAAAAGGCAAGGAGGAAGCTTATCTATGAAAAAGCTAAACAT
+TATCACAAGAAATATAGGCAGATGTACAGAACTGAGACTGGAATGGCTAGGATGGCGAGA
+AAAGCAGGCAACTTTTATGTACCTGCAGAACCCAAATTGGCATTTGTCATCAGGATCCGA
+GGTATCGATGGTGTGAGCCCAAAGGCCCAAAAGGTACTGCAGCTTCTTCGCCTTCACCAG
+ATCTTCAGTGGAACCTTTGTTAAGCTCAACAAGGCTTCAGTTAACATGCTGAGGATTGTA
+GAACCATATATTCCACGGGGGAACCCAAACCTAAAGTCAGTAAATGAACTAATCTACAAG
+TGTGGCTATGGCAAAATCAATAAGAAAGGAATTGCGTTG
+>ENSMICT00000005306 cdna:novel scaffold:micMur1:scaffold_175695:466:797:-1 gene:ENSMICG00000005309
+TCTGCCTTCTTTCTGTTTTGTTCTATACATTGTCCAAAGATCAAAAGTGAACCCCCGGAC
+CTACCTATTGGGGATACTGCAAAAAAAAAACTGGGCGAGATGTGCCCTGAACAGTCAGCC
+AAAGATAAACAACCCTATGAACAGAAAGTAGCTAAGCCAAAGGAGAAATCTGAAAAGGAT
+ATTGTTGCATTCCGTGCCAAGGGCAAAAGGGAAGCAGGAAAGAAAGGCCCCGGCAGGCCA
+ATAGGCTCAGAGAAGAAGCATGAATCAGAAGATGAGGAGCGGGAGGAGGAAGATGATGAA
+GATGACGAGGAACAGGAAGATGAAGAATAA
+>ENSMICT00000007845 cdna:novel scaffold:micMur1:scaffold_175779:3:296:1 gene:ENSMICG00000007850
+TATGCTCACTTCCCCATCAACGTTGTTATCCAGGAGAATGGGTCTCTTGTTGAAATACGA
+AATTTCTTAGGTGAAAAATACATCTGCAGGGTTCGGATGAGGCCAGGTGTTGCTTGTTCA
+GTATCTCAAGCACAGAAAGATGAGTTGATCCTCGAAGGAAATGACATTGAGCTTGTTTCA
+AATTCGGCTGCATTGATTCAGCAGGCCACAACCGTTAAAAACAAGGATATCAGAAAATTT
+GGGGATGGTATCTATGTCTCTGAAAAACGAACAGCTCAGCAGGCTGATGAATAA
+>ENSMICT00000003880 cdna:novel scaffold:micMur1:scaffold_175819:130:631:1 gene:ENSMICG00000003884
+ATGCTGCCTAAGTTTGACCCCAACGCGATCAAAGTCATGTACCTGAGGTGCACGGGTGGC
+GAAGTCAGTGCCGTGTCTGCCCTGGCTCCCAAGATCAGCCCCCTGGGTCTGTTGATTGAA
+GTGGTACCTTCTGCCTCTGCCCTGATCATCAAAGCTCTCAAGGAACCACTAGGAGACAGA
+AAGAAACAGAAAAACATTAAACACAACAGAAACATCACTTTTCATAAGATTGTCAACACT
+GCCAGACAGATGTGGCATCCATCTTTAGCCAGAGAACTCTCCAAAACCATTAAATATATC
+CTGGGGACTGCCTCTGTGGGATGCAACATTGATGGCCACCATCTTCATGCCATTATAGAT
+GACATCAACAGTGGTGCAGTGGCATGCCCAGCTAGTTAA
diff --git a/screed/tests/test.fa.bz2 b/screed/tests/test.fa.bz2
new file mode 100644
index 0000000..0e663d3
Binary files /dev/null and b/screed/tests/test.fa.bz2 differ
diff --git a/screed/tests/test.fa.gz b/screed/tests/test.fa.gz
new file mode 100644
index 0000000..8a43d89
Binary files /dev/null and b/screed/tests/test.fa.gz differ
diff --git a/screed/tests/test.fastq b/screed/tests/test.fastq
new file mode 100644
index 0000000..def4ca6
--- /dev/null
+++ b/screed/tests/test.fastq
@@ -0,0 +1,500 @@
+ at HWI-EAS_4_PE-FC20GCB:2:1:492:573/2
+ACAGCAAAATTGTGATTGAGGATGAAGAACTGCTGT
++
+AA7AAA3+AAAAAA.AAA.;7;AA;;;;*;<1;<<<
+ at HWI-EAS_4_PE-FC20GCB:2:1:353:708/2
+AATACTGATTGTTAGTTAATTTATATTAAGTAGCGC
++
+AAAAAAAAAAAA3AAAAAAAAAAAAAAAA2>?(<6:
+ at HWI-EAS_4_PE-FC20GCB:2:1:188:749/2
+ATTTATCGTATAGCCCCATTATTTATTCTCCAGCCA
++
+AAAAAAAAAAAAAAAAA?AA>AAA>AA???>//<9+
+ at HWI-EAS_4_PE-FC20GCB:2:1:289:559/2
+TGTTAACAAACAAGGAGAAAACAGTATGAAACACGG
++
+AAAAAAAAAAAAAAA?AAAAAAAAAAAAAA<><><<
+ at HWI-EAS_4_PE-FC20GCB:2:1:292:704/2
+TATCCACACCTGTGCCAGGAACTTCCTGAGCAACAG
++
+AAAAAAAAAAAA?AAAAAAAAAAAAAAAA=?>6>?0
+ at HWI-EAS_4_PE-FC20GCB:2:1:387:804/2
+GATGAGGATGGGAGTTGTTTTATTGCTTTAGTTTTA
++
+AAAAAAA*AAAA.AAAAAAAAAAA7AAAA;.<6&<:
+ at HWI-EAS_4_PE-FC20GCB:2:1:386:461/2
+CGTAACTTCCTGGACCGACGTACTGCCAGAATTCGG
++
+AAAAAAAAAAAAAAAA?AAAAAAAAAA<AA9>/>66
+ at HWI-EAS_4_PE-FC20GCB:2:1:143:673/2
+CATGTCACATTTTTTCACGCTATTTGTTGGAGAACA
++
+AAAAAA5AAAAA?5;A?A5AA;AAA.AA3+;.-;?;
+ at HWI-EAS_4_PE-FC20GCB:2:1:199:702/2
+GTGGGCAGCAACTTCAGCGTGAAATCGCCGAACGTG
++
+A<AAAAAAAAAAAAAA<AA<AA<9A>AAAA><?98<
+ at HWI-EAS_4_PE-FC20GCB:2:1:416:494/2
+AAATTCAAAAAGAAACTTTTGATAAAGTAATGACAT
++
+AAAAAAAAA<AAAAAAAA%AA<AAA>5><>?<??>9
+ at HWI-EAS_4_PE-FC20GCB:2:1:186:430/2
+TTATCTTTATGTGAATCTCATTTTTTATAATGATGT
++
+A5AA?AAA?A;A?AA?5A??AAAAA>8AA+?.??.;
+ at HWI-EAS_4_PE-FC20GCB:2:1:259:697/2
+TTATTAACTTTATCCGCCAGCAAGCGGCAAATCTCT
++
+AAAAAAAAAAAAAAAAAAAAAAAAAA=AAA0>?>??
+ at HWI-EAS_4_PE-FC20GCB:2:1:304:517/2
+TAGATTAAATACGATAAGTTAAGCATCATCTCTCAT
++
+AAAAAA9.AA9A=3A=93=A3=3A=AA363<<;<;2
+ at HWI-EAS_4_PE-FC20GCB:2:1:257:905/2
+TAAATGATCGCTCTGTTTGCAATATTGTTTTTAATG
++
+AAAAAAAAAAAAAAAAAAAAAAAAAA=AAA??%0?+
+ at HWI-EAS_4_PE-FC20GCB:2:1:244:675/2
+TTTTTCAAACTGGGTTTTGATCTCCAGTTTATTACG
++
+AAAAAAAAAAAAAAAAAA;AAAAA9=7AAA:??660
+ at HWI-EAS_4_PE-FC20GCB:2:1:378:409/2
+TTTTTAGAGTTTCTCGATATCAATTGCCTGAATAGC
++
+AAAAAAAAAAAAAAA;AAAAAAAAA?<AA6<6699/
+ at HWI-EAS_4_PE-FC20GCB:2:1:288:762/2
+AAATCATTTAACGCATCTTCCAGACAGTATGATTCA
++
+AAAAAAAAAA*AAA7AAA;AA777A6;A*A6)6<<.
+ at HWI-EAS_4_PE-FC20GCB:2:1:258:492/2
+CGTAAAATATTGCGCCAGGGCTGCAATGACAAATAT
++
+AAAAAAAAAAAAAAAAAAA?AAAAA*A??<9<>?<?
+ at HWI-EAS_4_PE-FC20GCB:2:1:541:553/2
+TCCCTGGGTCACCTTGATGACTTCCACGTTGTTTTT
++
+7+.*3337.+7;.+3733A+&*&*-66;&A1<1.<.
+ at HWI-EAS_4_PE-FC20GCB:2:1:301:548/2
+CAGGTGAGAGATATTTTCTTATTTGTGTGGTGAATG
++
+=A=A.A9=3=9=A9==A=AA9AAA,;.;.&<.6-<.
+ at HWI-EAS_4_PE-FC20GCB:2:1:397:790/2
+ATCCATCGTTTTAACAATGGGCTGCATACTCAGAAC
++
+AAAAAAAAAAAA++AAAA??>AA>A<AA*?<<%<<6
+ at HWI-EAS_4_PE-FC20GCB:2:1:529:669/2
+GACGCCACCACACTCCACAGCGCGTTGATCTGTTCC
++
+93339A9.9AA9.+9=3=3&99A93A336A62%2.;
+ at HWI-EAS_4_PE-FC20GCB:2:1:287:685/2
+TGATCGGTCAACTCCAGAATCTCATTGTCAGGCATA
++
+AAAAAAAAAAAAAA?AA?AAAAA?AAAA7=666>>:
+ at HWI-EAS_4_PE-FC20GCB:2:1:351:597/2
+TGATGACAAAAGTTACACCGTCAGACGTGCTACGCC
++
+AAAAAAAAAAAA?3AAAAAAAAAAAAAA&0>>>6<:
+ at HWI-EAS_4_PE-FC20GCB:2:1:385:762/2
+CTGGATCATAAATTAGCGGCACAACTCGGTATCGGG
++
+AAAAAAAAA<A<A<*AAA<AAAAAAAA<>>9??599
+ at HWI-EAS_4_PE-FC20GCB:2:1:208:643/2
+CTTTACGCATTTTATCGACGTAATAGGTTGAAGGAA
++
+AAAAAAAAAAAAAAAA?AAAAAAA?AAAAA<?26>6
+ at HWI-EAS_4_PE-FC20GCB:2:1:184:411/2
+CGTTGCAGCTCAGCTCTACTGGCAGGGTGAAGTCAT
++
+AAAAAA.AAAAAAAAAA=AA=3AA=A;+6.62*2-<
+ at HWI-EAS_4_PE-FC20GCB:2:1:403:699/2
+CCTAAGCCAACGACACCGACATGAAACTCAGATCCC
++
+AAAAAAAAAAAAA1?AAA+AAAAAA?A?A<9<4?>?
+ at HWI-EAS_4_PE-FC20GCB:2:1:340:602/2
+TAGTCGACAATAAAGGTCAGCGTTATACCGCCGATG
++
+AAAAAAAAAA<AAAAAAAAAAAAAAAAA5<?99?99
+ at HWI-EAS_4_PE-FC20GCB:2:1:330:641/2
+TGAAAAAGGCATTGGGATTTATGCCGTATTCCTGTA
++
+AAAAAAAAAA<AAAAA<AAAAAAAA<AAAA9>?>%<
+ at HWI-EAS_4_PE-FC20GCB:2:1:221:646/2
+TTTTTCGTGAGTATTCTTAGCGCCAGTTTGGTTTGG
++
+AAAAAA9A.A3A9AA=AA339.9=3.AAAA&<.<+*
+ at HWI-EAS_4_PE-FC20GCB:2:1:151:345/2
+CTATTTATTGGCCGCAGTTTTTCCCGTAGCGAGGCG
++
+AAAAAAAAAAAAAA?A?AAAAA1A+AA?+<//69/<
+ at HWI-EAS_4_PE-FC20GCB:2:1:166:739/2
+TATATCATCGCACGACCGCAGTCGGGTTATTACGTT
++
+AAAAAAAAAAAAAAAAAAA:AAAA?A?A=A?:>6?<
+ at HWI-EAS_4_PE-FC20GCB:2:1:398:570/2
+AGACTGTTGATAAAACGTAAAAAGGAATGCTCTGTT
++
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAA??????<
+ at HWI-EAS_4_PE-FC20GCB:2:1:298:584/2
+TAATAGCACAATTTTCATACTATTTTTTGGCATTCT
++
+AAAAAAAAAAAAAAAAAAAAA?AAAAAA=&>:??:?
+ at HWI-EAS_4_PE-FC20GCB:2:1:310:418/2
+AAAGATTAATGTCTCTAATTAATAATGCAAACCTTT
++
+AA<AAAAAAAAAAAAAAAAAAAAAAA5A>A><>???
+ at HWI-EAS_4_PE-FC20GCB:2:1:154:875/2
+CTGAGAAAGACATGAAACAACCATCATTCAGAAAAC
++
+AAAAAAAAAAAAAAAAAAAAAAAAAAAA<A<>%<?<
+ at HWI-EAS_4_PE-FC20GCB:2:1:339:736/2
+AGGTCAGCACGCTGTTTCTGGGCTTTTGCTACGCGC
++
+AAAAAAAAAAAAAAAAAAA???AAAAA=AA><(:::
+ at HWI-EAS_4_PE-FC20GCB:2:1:254:757/2
+CTATCGACTGGACCAAAGTGATCTTTAAAGGCTTTT
++
+AAAAAAAAAAAAAA?>?AAAAAAAAAA?*<6>>?%<
+ at HWI-EAS_4_PE-FC20GCB:2:1:426:791/2
+TTCGCTTTCAGTGGCGAACTTATGATTGATATAAAT
++
+AAAAAAAAA<AA9<AA<A&AA<A<9AA//A9?5559
+ at HWI-EAS_4_PE-FC20GCB:2:1:305:667/2
+TATAAATATTTTCACGTTGTCAGTTTGTTCGTTCTT
++
+AAAAAAAAAAAA?A+AAA;AAA;A&A8A*2:??7(?
+ at HWI-EAS_4_PE-FC20GCB:2:1:315:601/2
+AATATACCAAAGAGTGAAGTTGAAAGCCGATAGCAG
++
+AAAAAAAAAAAAAAAAAAAAAAAAA?AAA<></9<9
+ at HWI-EAS_4_PE-FC20GCB:2:1:275:625/2
+TAATTTTAGTTATGATTGCCTTGTCGGGTTTTGGAT
++
+AAAAAAAAAAAAA?AAA;AAAA3AA=77AA?&+%:?
+ at HWI-EAS_4_PE-FC20GCB:2:1:521:849/2
+TCTCATACCATGAGATAGTTATCTGAACCACCCTCA
++
+AAAAAAAAAAAAAA?AAAAA>AAA9A6<??>>9996
+ at HWI-EAS_4_PE-FC20GCB:2:1:299:561/2
+TAAAGACGTCATTGCAATTGATGGAAAAACGCTCCG
++
+AAAAAAAAAAAAAAAAAAA?AAA?AAAAAA::?<><
+ at HWI-EAS_4_PE-FC20GCB:2:1:109:205/2
+ACAATCCCCATGTCAGTACTTGCAAGATAGTCAATC
++
+AAAAAAAAAA5AAAA?AAAAA;AAA8AAA8??;3?;
+ at HWI-EAS_4_PE-FC20GCB:2:1:145:390/2
+AGATGGGTAAGGAAGAATTTTGCTCACGGAGTTTGC
++
+AAAAAAAAAAAA?AAAAAAAA9>9><A6A?6<>?/-
+ at HWI-EAS_4_PE-FC20GCB:2:1:294:549/2
+TTTCGTGACCAACGTAGATAAAGCTGCCGAAGCGGT
++
+AAAAAAAAAAAAAAA;A;AA9A>AAA?<6/9696</
+ at HWI-EAS_4_PE-FC20GCB:2:1:430:414/2
+ACATCCCGATGCGGCGTGTAAACCCATAATTTGTTT
++
+AAA3A*;3A;A3A.A;A;*..A7A3-166+;6)&;<
+ at HWI-EAS_4_PE-FC20GCB:2:1:242:635/2
+CTTCTCCTGCTAATTTCTCTGCCGCTTGTAGGGTAA
++
+AAAAAAAAAAAAAAAAAA;A>AAAAAA</A66+?6/
+ at HWI-EAS_4_PE-FC20GCB:2:1:492:569/2
+CGGGGCGCTGGTTTATCTGGTTGGGCTATGGATCTC
++
+AAAAAAAAAAAAAA1AAAA;AAA3?AAAAA::<<?6
+ at HWI-EAS_4_PE-FC20GCB:2:1:510:651/2
+TGAAAATATGGCTTTGAATAAGTAAAAGATGAAAGT
++
+AAAAAAA<AAAAAAA<AAAA<9AAAAA/5<<9>>/5
+ at HWI-EAS_4_PE-FC20GCB:2:1:294:569/2
+ATGCGACAATCACTTCAGGCTGCTTTACCTGAAATT
++
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA:>0?><
+ at HWI-EAS_4_PE-FC20GCB:2:1:287:885/2
+TCTCTGACTGTAAGCCAACAAAGTATGCCGACGATT
++
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA<<%<><
+ at HWI-EAS_4_PE-FC20GCB:2:1:268:712/2
+CCACACGGGCACCGACTGAAGTTCTGGTGTTATCAT
++
+AAAAAA<AAAAAAAAAAAAAAAAAAA5AA>?</5<<
+ at HWI-EAS_4_PE-FC20GCB:2:1:196:443/2
+GGCAGATGGTCATTGACCTCGGCATAGGTCAGATAG
++
+AAAAAAAAAAAAAAAAAAAAAAA>AAAAAA<:?>?<
+ at HWI-EAS_4_PE-FC20GCB:2:1:580:641/2
+CTGTGCAGCTTATTCTCCACGTGCCAGTGGTTTCGG
++
+AAA=AA=.AA=A9A=AA=A==93AA-;;6;<<<26+
+ at HWI-EAS_4_PE-FC20GCB:2:1:383:591/2
+CATCAGCCGTGAAGATCTGCACGCCGTGCTGACCGG
++
+AAAAAAAAAAAAA;?AAAAAAAAAA<<?AA9?>%66
+ at HWI-EAS_4_PE-FC20GCB:2:1:230:738/2
+AGATATAGTTACTTTTCCTTTGCCAATAATTGAAGA
++
+AAAAAAAAAAAAAAAAAAAAA;AAA=A==A?0+<06
+ at HWI-EAS_4_PE-FC20GCB:2:1:476:683/2
+TTGCGGTTCGTCACTGTGGATTTGGCTGGCAGCGGT
++
+AAAAAAAAAAAA3AA?A;A?AAA;AAA=*76+>*60
+ at HWI-EAS_4_PE-FC20GCB:2:1:394:610/2
+GAAAGCAACGTTACCGTGACCATCAAATACCAGGTT
++
+AAAAAAAAAAAAAAA?AAAAAAAAAAAA-A<>69<<
+ at HWI-EAS_4_PE-FC20GCB:2:1:268:893/2
+CTAACGGCAAAAACCAAAGGGAATCCTGACAGCTGG
++
+9-.99.AAAA39+3=A3+9933..=;3.366.;6;<
+ at HWI-EAS_4_PE-FC20GCB:2:1:245:483/2
+TGTCGAGCAAAGCAAAACAGGCGTAAAAATTGCCAT
++
+AAAAAAAAAAAAAAAAAAAAA>AAAAAAAA?9>6><
+ at HWI-EAS_4_PE-FC20GCB:2:1:184:640/2
+AAACTTAATGACGCTCACATTATCGACTTGACTTTT
++
+...&.+..+*..&*..*...*,..*...(....().
+ at HWI-EAS_4_PE-FC20GCB:2:1:169:676/2
+CCTGTTGTTTATGCACAGATTGCGTAACCAGAATAC
++
+AAAAAAAAAA<A<AAA<97AA9A1A>>AA599(><<
+ at HWI-EAS_4_PE-FC20GCB:2:1:537:477/2
+TGATGATTATTAATAAATGTCAAATATGAAAGAAAC
++
+AAAAAAAAAAAA?AA?AA;AAAAAAAA===<2?:6:
+ at HWI-EAS_4_PE-FC20GCB:2:1:383:774/2
+CATACTCAAGATCTTCATCTGTTGAAAAGGATTCAT
++
+AAAAAAAAAAAAAAAAAAAA>AAAAAA<<6>?6><?
+ at HWI-EAS_4_PE-FC20GCB:2:1:161:706/2
+CTGTACAACATCGTTCCTTACGGTATCGATGCTACC
++
+AAAAAA5?AAAAAAAAAAA;A;0??AA48A.<8<<8
+ at HWI-EAS_4_PE-FC20GCB:2:1:353:459/2
+AATCCCTTGTATATCAGTATTTATTTGTATCCAGAG
++
+AAAAAAAAAAAAAAAA?AAAAAAAAA2AAA<<>2?+
+ at HWI-EAS_4_PE-FC20GCB:2:1:383:678/2
+TTCAGCGCGCCTCCTTGATTATTATCATGGGTTGCG
++
+=AA.====39A9&A+&939A+A..=6;A%..2.22.
+ at HWI-EAS_4_PE-FC20GCB:2:1:390:661/2
+ATTTTCTAAACGAATTTTAAACGGCGTCATTTATAA
++
+AAAA=AA.AAAA..AAAA3AAA93A;AA*A<<&<<;
+ at HWI-EAS_4_PE-FC20GCB:2:1:406:796/2
+ACTGCCTTGTAAACTTACATCGCGTACACCAAGATG
++
+AAAAAAA5;A55+A?A?A&AA5A0?8A>*A88*<<.
+ at HWI-EAS_4_PE-FC20GCB:2:1:282:597/2
+CATCCTGAATAGCCAAACAATGGCAATGTTCCACTT
++
+AAAAAAAAAAAAAAA?AAAAAA;AAAAAAA>?6<>?
+ at HWI-EAS_4_PE-FC20GCB:2:1:204:474/2
+GGTGATGAACTGGAACTTGTAGCCCATATCCGACAG
++
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA?6?9>/
+ at HWI-EAS_4_PE-FC20GCB:2:1:151:368/2
+TGGTTATCTATTGCTTTAAAATTACCACTGGGCGTA
++
+..*&....................*.....&....,
+ at HWI-EAS_4_PE-FC20GCB:2:1:518:680/2
+AATGTGGGTAGCCAATGCGGCAACGATCGCGCCCTC
++
+A.=A3==AAAA=A+3==99AA9==.36A*6-;;%-2
+ at HWI-EAS_4_PE-FC20GCB:2:1:267:385/2
+TCTCCTGATGATGAATTGTTTTATTTAGTCCACTGT
++
+AAAAAAAAAAAAA+AAA?AAAAAAAAA7A=:6:?<+
+ at HWI-EAS_4_PE-FC20GCB:2:1:251:692/2
+GTTCATAAAACGCAAAACGATTGATGCGGATGATGT
++
+AAAAAAAAAAAAAAAAAAAAAAA>AAA<<?<99>>9
+ at HWI-EAS_4_PE-FC20GCB:2:1:84:839/2
+AATTATCAGTCAGGGATACAGGTATTGATACCTACG
++
+A5AAAAA5AAAAA??AAAAA??AAAA8888;;8<8.
+ at HWI-EAS_4_PE-FC20GCB:2:1:415:656/2
+CGACAATTCTGACGTCGGTAAAACCAGAGCAATATC
++
+AAAAAAAAAAAAAAAAAAAA>A>AAA6A6A<9<?><
+ at HWI-EAS_4_PE-FC20GCB:2:1:268:818/2
+AGTTTTCGGGCCTCATCCCGTTTTGCACGAGCTTCG
++
+A;A3AAA*;;AAAA3AAAA;7A;.7A1A.-1<<;<.
+ at HWI-EAS_4_PE-FC20GCB:2:1:188:845/2
+GGACTCAATGTCCCCATTTGTAAATGGCTATGCTTA
++
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAA=<0>>?:
+ at HWI-EAS_4_PE-FC20GCB:2:1:152:689/2
+GTCGTGCCTGGTCTACTAGTCGTAAAAATTGACCGC
++
+AAAAAAAAAAAAAA&AA:8AA9AA1???<A664</<
+ at HWI-EAS_4_PE-FC20GCB:2:1:290:530/2
+CATCTCTACATTGATTATGAGTATTCAGAAATTCCT
++
+AAAAAAA<AAAAA7AA<AAA9AAAAA<5<5<?>>>?
+ at HWI-EAS_4_PE-FC20GCB:2:1:198:672/2
+ATGGAATGGCAACTTTCATTCCTTTTTTTAATGTTT
++
+5AAAAAA??AA;AAAAA5AAAAAAAAAAA8-?+<??
+ at HWI-EAS_4_PE-FC20GCB:2:1:378:827/2
+CGTGACCGATATGACAGAGATCGTAAACGGTGATTC
++
+A<AAAAAAAAAAAAAAA<AAAAAAAA>AAA<9?>??
+ at HWI-EAS_4_PE-FC20GCB:2:1:330:467/2
+CCCAGGAAGTTGCAGAACAAGCAGAACCGGGGGTTG
++
+AAAAAAAAAAAAAAAA?>AAA>>A?AAA?</<?69/
+ at HWI-EAS_4_PE-FC20GCB:2:1:601:561/2
+AGATTAAAATATAAGGTTCTTTAGCAGTTCTTTAGG
++
+AAAAAA9.AA.A39=9AA3AAA=.936AAA<<(2+.
+ at HWI-EAS_4_PE-FC20GCB:2:1:426:665/2
+AAGCCATTAAGGCAAAACTGGCGGAATTACAGCCAT
++
+AAAAAAAAAAAAA??AAAAAAA?AAAAA%A>6<>::
+ at HWI-EAS_4_PE-FC20GCB:2:1:173:732/2
+TGTTTTTGCCATGATCGTGCCTTGAGTGAATAATTG
++
+AAAAAAAAAAAAA?AA?AAAAAA9<AA?6:?96<?6
+ at HWI-EAS_4_PE-FC20GCB:2:1:284:789/2
+GTGCATCAGATTCTGTACGGAGAAAATATCGTCAAG
++
+5;AAAAA????5AA?A1A;1+5;A?8A8AA4/<;;+
+ at HWI-EAS_4_PE-FC20GCB:2:1:91:525/2
+TTGGAGAACGTCAGGAGAGGCATTTTGGCAGATCAC
++
+AAAAAAAAAAAAAAAA?1+AA8AAAA66A669<<9<
+ at HWI-EAS_4_PE-FC20GCB:2:1:553:711/2
+CATTACTCTTTATTAAGGAATTTTACTGCCTTATCA
++
+AAAAAAAAAAAAAAAA;;AAAAAAAAA8==????<+
+ at HWI-EAS_4_PE-FC20GCB:2:1:86:239/2
+AAGATGTGGAAAATGCACGTCATTCATTTCGTCATT
++
+AAAAAAAAAAA<AAAAAAAAAAAAA<AAA>5>85??
+ at HWI-EAS_4_PE-FC20GCB:2:1:343:767/2
+ACCAATAAGTTGATTGAACTCGCCTTTGGTCGCGTG
++
+AAAAAAA.AA9AAA3AAAA=A=AAA;A+;;<6<;62
+ at HWI-EAS_4_PE-FC20GCB:2:1:387:326/2
+GGCAAAGTTAAAGCCATGCATTATATGAGTGATAAC
++
+AAAAAAAAAAAAAAAAAAAAAAAAAA=A=A<00:<6
+ at HWI-EAS_4_PE-FC20GCB:2:1:185:656/2
+TTTCTTTCGGCAGCCATGGCATGGGCCGATACTGAA
++
+AAAAAAAA<<AAAAAAA<<A<A1<9></<>04>/--
+ at HWI-EAS_4_PE-FC20GCB:2:1:192:634/2
+TTCCGTGGCGCTCATCAACGTCGGTTTATCCCCCAT
++
+AAAAAAAAAAAAAAAAAAA?AAAAAAA+=7<:<6*<
+ at HWI-EAS_4_PE-FC20GCB:2:1:210:740/2
+CCGACCGGGATTATCTCCGCATTCGCCTTTACCGGT
++
+AAAAAAAAAAAAAAAAAAAA8AAAAABA5=<:?:0?
+ at HWI-EAS_4_PE-FC20GCB:2:1:210:630/2
+TAATGAAAAAGATATAAATTATGGGGTGGAAATAGA
++
+AAAAAAAAAAAAAAAAAAAAAA?A?7AA2A<<??:<
+ at HWI-EAS_4_PE-FC20GCB:2:1:411:681/2
+AGAGCTTCGCCACTCAACTTAAGCAGAATGCGTTTA
++
+AA<AAAAAAAAAA9AA<A9A<<AAAA5>>A?99?<<
+ at HWI-EAS_4_PE-FC20GCB:2:1:585:657/2
+TCAAATTTTAACCGAATTCAACTGTTTTATCATTGA
++
+AAAAAAAAAAAAA?AAAAAAAAAAAAAAAA><??96
+ at HWI-EAS_4_PE-FC20GCB:2:1:63:801/2
+TATAGTTACTTTTCCTTTGCCAATAATTGAAGAATA
++
+AAAAAAAAAAAAAAAAAA;AA9?AAAAA7A:6*<?<
+ at HWI-EAS_4_PE-FC20GCB:2:1:185:700/2
+CAAATCCAGATTGTTGTGTTGTTGCCATGGTATTCT
++
+AA<AAAAAAAAAAAA<AAAA<AA1AA5A/+?5<>><
+ at HWI-EAS_4_PE-FC20GCB:2:1:292:584/2
+TTGCCAAAACGTCTGACTGAGGCAGGCATCAGGAAT
++
+AAAAAAAAAAAAAAAAAAAAAAA?AAA7A=::>><:
+ at HWI-EAS_4_PE-FC20GCB:2:1:391:779/2
+TATGACGTTGTTGATGATATAAGGAAAGATTCTAAA
++
+.......&.....*..............,.......
+ at HWI-EAS_4_PE-FC20GCB:2:1:346:503/2
+GCAGTCACGTAATCTTCGTCATAGAACATGGCTTCT
++
+AA7AAA*AAA3AAAAAA33A7AA37AA;116;<;6%
+ at HWI-EAS_4_PE-FC20GCB:2:1:253:837/2
+CAGCGCCCTTCTTTCCACGCATACTGGGCGCTGTTG
++
+.&..................................
+ at HWI-EAS_4_PE-FC20GCB:2:1:214:402/2
+GGTTAAACAGCTGACGATCAGGGCGATTTACATCGC
++
+AAAAAA.A77*A;&A;AA;A;7AAAAAA%A<;<86<
+ at HWI-EAS_4_PE-FC20GCB:2:1:548:624/2
+GGTCATGGTGTTTCCTTCTTATGATATGCAGTTGAT
++
+AAAAAAAA<AAAAAAAAAAAAA9<A>A<><5&(+9<
+ at HWI-EAS_4_PE-FC20GCB:2:1:73:597/2
+CATCAGGATTGCCATCAGGATCATTTGATTGCGTAA
++
+AAAAAAAAAAAAAAAAAA?AAAAAAAAAAA/>/<>6
+ at HWI-EAS_4_PE-FC20GCB:2:1:543:793/2
+TAGCTCCCGTAGTAATCGACAATAAGAGAAACAATG
++
+AAAAAAAAAA5?AA?AA?5A;0A5;A>84888;8?<
+ at HWI-EAS_4_PE-FC20GCB:2:1:240:771/2
+AGAAATGACATTATCTTTGAGGAATATGCCCTTATT
++
+AAAAAAAAAAAAAAAAAA?391A>A?A?AA<>>/?<
+ at HWI-EAS_4_PE-FC20GCB:2:1:118:528/2
+TTATGGTTACTTGCAGATCACCACCATTGGCCGCAG
++
+AAAAAAA?5AAA?A?1?AA;?A6?A8AA..;4.4(4
+ at HWI-EAS_4_PE-FC20GCB:2:1:368:774/2
+CTGTTGCATTATTCAAAAAGCAACCCAATTAGTGCA
++
+AAAAAAAAAAAAAA+AAA3AAAAAAAA=AA::6:?<
+ at HWI-EAS_4_PE-FC20GCB:2:1:290:601/2
+CCACAATTTGAAGGCGGCTCATGCGCATCATCTTTT
++
+AAAAAAAAAAAAAAAA?AAAAA>>>AAA??>>9?<?
+ at HWI-EAS_4_PE-FC20GCB:2:1:128:645/2
+GTTTAGCTACGCAGGATTATGCAAGCTGGCCCACGC
++
+AAAAAAAAAAAAAA?AAAAA9AAA>AA<<??<?<6<
+ at HWI-EAS_4_PE-FC20GCB:2:1:540:680/2
+AAACACTCCCCAGCAACTGACAGACAAAGATTTATT
++
+AAAAAAAAAAAAAA;AAAAAAAAAAA2AAA>>><?<
+ at HWI-EAS_4_PE-FC20GCB:2:1:213:489/2
+AGTTGCTGGGCCTGCCGTACCGTAAAATCATCCTTT
++
+AAAAAAAAA<AAAAAAAAA<A<AA9<<A<>?/4???
+ at HWI-EAS_4_PE-FC20GCB:2:1:490:531/2
+TATTTTTTTATTCACGACACTGTTTCTGCTTTCTTC
++
+A;3AA;*AA;A7AAA77A.A73;A;;;.-A<<-<),
+ at HWI-EAS_4_PE-FC20GCB:2:1:419:475/2
+CGGTCGATTCAGCAAAAACGGTTGGCTTAGTGAGTT
++
+AAAAAAAAAAAAAAAAAAAAAAAAAAA=A2<2?20?
+ at HWI-EAS_4_PE-FC20GCB:2:1:326:304/2
+CTAAAGATGAAAATTATTAGAGTCGCATTAAAAATG
++
+AAAAAA5AAA;A;AAAAAA;?;AA;AAA>88;4<?4
+ at HWI-EAS_4_PE-FC20GCB:2:1:201:683/2
+GAAGAACAACAGCGGGGGATTGAGCAGGTTGCACAG
++
+AAAA<AAAAAAAAAAAAA<AAAAAA<<>4>9<-945
+ at HWI-EAS_4_PE-FC20GCB:2:1:126:737/2
+ACATTCTTCGTCACATCCCAGGTCGCGCTCAGGCCA
++
+AAAAAAAAA<AAAAAAAAA<<<@AAA>A<A>/5?>?
+ at HWI-EAS_4_PE-FC20GCB:2:1:350:588/2
+GGTACAAAATAGATGCTGGACTCTCCGAATCCTATA
++
+;?5AAAAAAAAAA?A??;?AA;AAA>AAAA?4?844
diff --git a/screed/tests/test.fastq.gz b/screed/tests/test.fastq.gz
new file mode 100644
index 0000000..391cffb
Binary files /dev/null and b/screed/tests/test.fastq.gz differ
diff --git a/screed/tests/test.hava b/screed/tests/test.hava
new file mode 100644
index 0000000..e0cf223
--- /dev/null
+++ b/screed/tests/test.hava
@@ -0,0 +1,750 @@
+test_000
+ACGGTGACGGTCACCGTCGACGGCCCAAGCCCATCGAACGTACCACCCCCACCTATCGTCACGCTGGTGGAGAGCCAATG
+AFPPCLHBCCILGMMOCHKNNDBKCCPNHAMKJOCCDJAOEPNMHFHCBAJOKEMMMBHCPHIOAEPFFCAOJPGIMKGK
+21858316587186112771945148345529452186568176931571171542294878855181415261425688
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+C7AF246AC7AAEABE5A557FCBC6FD5F5263BCDE4E745BEF1GG7DD1AB511GBC63A4GF1F4E1A154B35D
+test_001
+TGCAATCATCAACCCACGGGAACATCATAAACTGGACTCGAAATGGCGAAGTCGAGTAATGAGGATCCGAGAGCAAAGAC
+AIFBGLGJOMIMLDNMLMLJOJNLKLCHHFHILKAICJBHAKJLOBCOOBNAKLLACNBDEBHEADDEGICJGCGOOMHN
+62593992994659868751136757367227476216781254362816675423144855254753648748538152
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+1C7EG7FC272A6751C2CGDF22G764752GD7C17DB6C1G421CC5BG522EB6E147C5C7G4DEE4633B4E72D
+test_002
+GCGTTTATAGCACGGATAGTTGCCAGGACAGGGAACCTCTAAGTGTGCGTAGCCTCTATTCACACAGAGCCTGAAAGTCC
+EFHKLKMHJFJPPGDKKDOCHDBHGDECJOFPKBPJDFOENOALDDPEEANDDLGDNMMAKDEMLFIDENDIGPPDJJAC
+78536693762311884563354155522557169162129126576387473269132422223638975685199863
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+EB42AB57DD75FD337CGC7EBE63DG6B4EB5411DFGA246311AF37A752713771ECE3GDGGB4662E275GG
+test_003
+TGTGGATAGCCCGATGGCCCGCCCAGTACCTGACTGGAGCATCGAGGTTCGAAGTCACTCCACGCCTATCCGTCGTTAAA
+BMCBLNAFDLPMIOLHNBFCCFNGBJHKPKOALBCELLMFAPPODAOFFPPKPALCAKJJAMMEKPEBGCPOPPIJNBNA
+27747287426456163221589261815523389637172687554546925639456622561674574729521655
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+6E34AGE6BBD41EAE277A6CDDEE616EA2DB51FG6CEFCB5D7163E5B12FCC6A2BD3BGE45F765DD1E4E5
+test_004
+GACCCCTGTAATCGGCTCCGCCAAGTTTTCTGACCCCGGCCAGTCCCTTCAAGTCGCTAAGAAACTCACTGTGTAGGGCC
+JKHDPCBAGHPOGDILFLFNHGIAHPNFLFCAPGPMMEGCJLKIDMGHAOEOBBKBEIMOIIOLGLDCIEFFPNCCINED
+47899223133653587612618526586758365222316812711834657739527821686155729181446174
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+5A7ADE3F5BG47CA7AC2F2BG7ACGBDBC353AG1A43C5F53D4C76471GEC243525CCB715F71GG234C4A6
+test_005
+AGTGGCGACAACTACGCATTGAAATGCGCCATTCTAACACCAGTCTAGTTGTTAGCGTTTTCAGGGCAGACGTTATTGGG
+DIAHEKLMDCBDKACONPNJKAINIFIBBCMHOHACJHCJCILEFJPCHCMLLAPPBALKAKFENIHMJDNGKGPPOMOP
+72764396492884284826783192178238146279499677723626742429852829489972142139447344
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+5EDEA5FDB6EBAE11D71666BD7F73D2G314ABE6CBF33BCG3321365E1BF4BF5D43C3CCG7F7CGE25CB1
+test_006
+CGCTCTCTCTGCCGACAATTGGATCAGTAGACGCAAAACCATGTAAGACAAGCGCGAGGCCTGTAGACCAGTTGCATTAC
+DNJBNMOPPNGHGNCEKJAGBOJLCMICHBBGMEFNIMJJIMCIOLFOOJFJJOJEIOPBMENPCEJIJELPDJLOBFLC
+48614739589429772668713147881918851939242957652966437767833692555575532884282395
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+BG5B345D5EABGD2G6BCC4D622B3CCGEFBE156C3DG1C3G3G1CGD2B17B7D2GB6A7A3E2122GEC651462
+test_007
+GCCTTACATAACACCAACATGAAACTTCGAGCATCATCCCCGACGGGTGATTCTGTAGGAAGACGAGCCTTCCGGATTGC
+LILGCMAKGOCINFBOJNGNLCGEDLPCHHMJDCLFKKLKJFLPGLCIAMLIOBNKDIABFBAONFDNNMGOOFAFKFIC
+83937878511549537465756235171841337767453419226357338351967573989215271567439417
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+D261EC2B7C2EGAD17EDB7A43AGG5GDD1CDBCE6F2C5B7B4EG6B55FE33136DC2C3F567CB262B7A54E2
+test_008
+GGGAAGGGTTGCAGGCCCCAACGTGTGCGAGAACCTAGTGGCGGTCATCGTTGGATACGCCAGCAGAATAGTTGGGTTCG
+CHEPNOCLGDBNEKAFDDJNNCPJOMDMAAFFJMKOGAAFAKLAKEDMLBKHLAKKIFBKPPCLKFMMIKKNEBNPBLEJ
+16581617162925937328255777588766672576532521144788836865515162434989262796581269
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+3B3BE7BAF4A47EEGE341FCE1E4G4GBC2EFE324FD51B1156C337B6E2E51G3FB54E626G4735AD53C44
+test_009
+CATTGTTGCAAGTCGTCCGCACGAATCTCGGAACAATGCTGGCAATTAAGAATAGAGACGTCGTCTGCAATGTGTTTACC
+FDGAFMFDPLJODJFONOOEKKJFELJBCIKKMGCDNOPHOKHKAHAMPOFILOHMNCDLJGANLAHHLOPJMNGPLMMH
+76877638698925285668832341928686627748137552513843788716472721819959444759116864
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+3DEBG2E67C1E6CF6AF424EG6FG213C6D24B652AA6AEG5652ABAE2CD53C4E371D226A2BADB4EAFBDG
+test_0010
+TACCGCCTGGTCTAACGGACCGCTTGGTGGGTGGAGCACATGTTCATTGAGGGCCCAGATCTACATGGCACCCAATTGTT
+BGNOLIDHEBJIBHILFOEFFHGFNBHOIHFLMDHKILMHBHKNKCKHMOHEJLCDOFKCJMHAMKCNMNCJIBLODGIB
+27289157482581319615229981144615855345155321154321958171658558797295463871724431
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+13BEEFDDGFF54212FEA6B5216CGDAABF56E716GDEG325411EE4A37CG3DE715EDAG42GF1C5B3E245E
+test_0011
+TGCATACATACTATTTTAGGGTCATCCGTATTTGATGGCATGTGCCAGTTACTGACCCGATCTTTCGTTTTTATTTCACA
+KIECNGFDFGMCPJHPHIFBFPNHBJKDLJPIJCCOPBMLEKBAIOFDGMPKJKBKDECBFEPMIAPCHDIGHCHBODCC
+85787429396344346154491182333118939165983879521663561731619119299656328158516155
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+76671EAFC2E75EFDGDABE4G2D43G6331EFCEAB761E612C631G6FEG312667CGEF27DDA4C5E3G5AB61
+test_0012
+AAACCATGCATCAGACCTTACTTCACTTAGTAAGCGTCCACTCTAGCATCCTATTCCAATCTCGAGTATATGACCTAGAA
+FKLDPMAHPPFBOBFONNPNLBDDDLIABLEPHDLFNOPONFDMNMMFPABMIIGAAHGNMHCGOCFPKFLOEHEMLJPM
+34122721333565925688921366787627251144263161318554726691873328826772839179649921
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+FACDC22AG75C4B3GBCD613ACCA4FED7FCDF663CE453B676DCBE274D5GEDGBCAAF4ABA4B31F4GF7CF
+test_0013
+CCTAACGCCCCTCTGAACACTATGCAAGGAACTCATCGAATGGTGCTGCTCGAGGATATCCCGAGAAACTGAAGAGCACG
+FEGJMAOCGHCIOEAKKCFHIPDCHMJGKJHNMNNBKGOCBHPGDBPDDKGKGHMCJHBLDKHLHLNHKFHBPAFKEHID
+23822271978484735714599954743153561784818377116727225797677236482641992327682113
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+2C23DA1B565GD37GDD3DA2F62E6FCCBC37EB11A4D74123GFGD6D3BF712313G33E5B42G6GF41E2BBD
+test_0014
+ATTTGTAGTACCAGCCTTGTTTTTCGCCTATGAACTATGTAAATGTGAGGAACGATTTGAGAGCAATCTCAGCATCAACC
+FMJFFJCECHPGLJHENOBBLNEDDGDIMGKNALAFFAJCMAPOEBNOLFNDNBHGNLFPCNEECKBJONAENKFPCEOP
+32288617281452393752642221211634688646655994862652317737143372322743117552953651
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+66ABGA42B67F42C2D2C7CFG3A656C57EA7G2BABA71161EGCD51FF47A7422AC6575G73CGB2E6AFCFG
+test_0015
+ACCTAGTCCTCGAAGATAAACTATACGACCTTGGAAGGTACCATCCACCACGGGATTGCACAACGCTGTTCCAACCCAGC
+HLFIFJAAJKMJIFGGOECBFHHADPCDOMDDFFPCLBMKLEOJGHDJFBPHAOBOOMEJCEFFHADKEAHCHHCMMIDK
+37639256515322739973883259849237624596782164637164159724219975311754952669735787
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+D341F145CE5F4635B16B3D6C51E4C1D25GD2G325D76AFEB161212CFDA5G46AG31AC6C63F5EC773A4
+test_0016
+ACCGAGGATGAGGAGTTAATCGACTGCAATAGGCTGCCTCGGACATGCTTAGGTTTTGGAAAGGTGTGCACCGCATCGTG
+EMKDJOJDFIOHBPIMBNDIOIABDJIMPJFIOHBIOBGDIAKHCGGIMBENAOPKKGJBMIPEBFHMOOJACOCNOBFD
+85237914323454647779549733863226494397497132236269123594244887548952758966556511
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+GEDAE1773BG35E16DAD7F576F2A3564G2A5B54A7A175176GFE6AD5E14A4B7BF6C5361DBDDE347C33
+test_0017
+AAAAGCTCTAGAAGGCCACACCGAGGGAAGAACCGCGCTAGGGAACGCACCCTAGGCGATACGAGTACGCTTATGTTTTT
+OIOAGBKAIJMOPHJALEFOALJOHGAIHLLLADMEJAHMHAGKEJDMNAOIEGCIAJBPNNMHPDOKMFBIJELIHNIJ
+61354898641868226222688267619817289951826967532632829541876197319862898629571495
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+1FC37F131AG7GGEB355654BE2B353B4B1137A4F476B54DBA2622EF7CA233AC43AFA5G1GGDB655DBF
+test_0018
+CCATAAACCGATGCACGTAGGGACTTAGTCGACCACATGCAATACGGTAGATAAAGCACACTGGGGGGGGCTAGCGCTAA
+DGDCGDPKCMKKIKHPBLNDLFCPCJPAIFPAPNEKLHCAHFCHLCFPNFBBDACCDIIFCJIHILEHPBEICGIBOKDD
+53321998717928489391795416497895519425331636399158717861536713693577777114952943
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+AB4GE1233AG4GED44CDCA4CEDC63317AB41DF65FED5AFG72F2FF1371A22A12E416E116EB25BFAG65
+test_0019
+TTCCGTTAATGCCTCAACGCTGAGAAACCCGTATGCCCATGAAGCCTAGAGTTTGGAAATTAAAACCGCTATAGCGGGGC
+GBBKCAELNMFGDKLHAGEJEMPJFLBJLGKDLKGHEHBFNBJBGDPHOIJDOOFPAMJIFKNEKHMDNOHHOLBHAPKE
+16331486163555666958653865864275744382283557838745642357574379915261645247887524
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+B3GBB6BA652F46AGF575D76434CAAGE127ECB5A4BAE47FFB1CACC5746GF2543A1EBADA1C52DC7B7F
+test_0020
+CAAGACTTCTTTCCGTATCAAAGATATTTCCTTCCTTGCTGAGGACCTGTAGTATAGTAAACAAATCCTGTCTTAGACAG
+NGKJIEDBJHPNOHJFDOCLJPKEMFJPKIDECEBMHBNLIFPIHCBODBHIJJGFJBLCNMFCBDNIHDNFJHMIICIP
+18745766261937393334539765745293683661936753716537718453417899384732855773425762
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+271FG5DDAGA44C3D3DAA3BGB611E1A7AE6FE3254B611C4C3E7G7CEG7A5F5AD3FGGE1EG2752A3AG7F
+test_0021
+CACTCTGTAGGAATCGAAGTACAGCACTCCAATAAATCGGCGCTGTTCTGACTAAGTCTGTATAGATGAGCGGGTTAGAC
+EKJABKBKACIJGAGCBPGEBPGHJNKOLPFKEKFFCOPFKPJNIMGAPCENEDPNGLDDPKOMPNEOCPAPMNBGEGIE
+67488949197613686914793385932899147623414948899821553553458493531115391622234728
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+EFF25D4D7F32GF4GCE24CF3BF6GG522A376BB7D425CE43G11ABD5C66314GF1EFCE2GDE66CC75DDG5
+test_0022
+CTATGACGTTATATTTCACTCCGGGGAAACCAACTGTGCAGAATGCACCATATACAATCAAATACCGCAGCACATTGAAC
+PNAJDBCEIDLFNACGEBJNMNIFLJHGOIHEPJEJPANNGJHDCACPGFJGFPKAKKNMAGJIBENCJBEDHGDLBBNK
+61759511214558276982238241868528567864315888249911595825185575972891711396578942
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+A75EABE4CCG5F3F35F3D3342BB2C347C4GDE7AD24C276FFBG6D3G27C5AEADC1561E36156BBB5E13C
+test_0023
+TAACGATGTCCGTGTCCAGCCTGGATACCCCTTTCAGTAGATGTGCTGGAACTCAAGTCCCCGAAAGGATGCAACACACG
+KNFNOGPHANLKADLKHAMGKEHHGEEKCGCNNINGBNMFFGHEICAKGEONMEFNGMICHGKEKAOIGOPOKHLPAHEG
+64151343891282151692438352851231731854136521165841183138725361697774118388255332
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+C664FCB5F1EFF7AD6CC2E2G464A7E37GBCC1E57EFGC15761FEDCDFB25ABAE4FG27DBE6G6ED2GBG76
+test_0024
+CCTGCGGACAGATCATATGGCCAAAGAGCCCGATTGACAAGCCCGTCCTTGAAACCGCGGTCTACATTCGAGTCTAGAAC
+KFPGIBJBNILLGELCENENIMHKGEPGLMMLGLOFPNLFELMKFHKJJMHAEIFHIBGEKMCAKAOAPDHHGJBHKPPB
+13822366782755311181656515142651958893448916949642268757375174738395275726621233
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+B72E2647A6E7C54FB1BACC6A1CFDB1CE1G5E7EEF27364D1676F7GG6FBEG52F1G1F14G2AA4DA5G3F3
+test_0025
+GCATGGATTTCGAGGGGGCAAAGCCAACATCTAGATACGGTAGGCCCTTCCGAGCCCACGGGATGTCGTGCCAGGCGCTT
+ALJAJPCPOIOGPBPCLABEDBGGCKFOPAFOLDJCBHCNEIMCJAMKAIFMBICOGHJNIGCINLFPDFBBBEKPGJPJ
+55292985112444651275639326168418442721964645814594129497322377745383811281575727
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+4CDCCGE5B37B32A6D21EA1FG76C6BCB134G44EAD5CEE2C1FC6E2A662B4BFG7EB6A56FCF57BAA42D2
+test_0026
+ACCTCAACTTAGAGCTTCCTTCGAGTCCAATCCGCTGGATATTACGGAGTAGTGCTCCAGACCTTGCTGCGACGCGTTTA
+AFGAGGKCMCJCELPNNBEEOKOCIAGPFJLCGPAPNAJNOAPAMHBILKILECCJMNMGOGBLKFBDMIJJHKCBHPCA
+57425361478768577341187297499323582897711148513187636582362669751181364573246552
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+3BAEAEG7E732B21C2EECDE2B13AAE5D3A63BG7B4GD52E6E33A5A327F7B2DDAAF2215A3C6C4A33D62
+test_0027
+AGATCTGCGTCGGTCCGCTGTAGATTGGAGCGCCCGCGCCGTCTTAGACACAATCGAATGTAGGCTTTACGTCTTCATGC
+MDKMKCJCBIPPAJACMKCGEPJHKIJHONHDGLIPBGOHLGFAHPKMBFHPJJFMKNILHOCDMCFCKIBGKDNLNGIB
+61745865399942998192598571865336588886495886718715269292152161181432491252652999
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+4D65A2G7DC367332F6GD7C7EEFE4DBA52F362GD5654C245225ACBDCE54A37GCA6GAB6EE43C71EDEC
+test_0028
+GCTTACGCATGCTGTTCCGGCGGGGGGCAAAGATGAAGCTTATTCCTTTCCTCGAACAGTCGTGGGATCTATCGCTTTAA
+CNPMMPJKMAIAPPEMKHKPIGDJEJBBCPHLEJHNJGAEOPALHGOMMGECOFDIOHHKAOAILFDAKBCCEGKAIBKK
+89196764718574547883574751897567417716131768225388229355324125158915115658435267
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+G7F6GBD6CDCB611F6DBCAA1673BCD2711F2352121BC34BDCFG65BAA7F6BG1A2B4FD45E5EGA6A53E3
+test_0029
+ACGTCCGATTCGCTGGCGGAGCTTTTACTCTATAGCGGAGCCTGTAACATGGGACTGCCAATAATTTTAGTCTTTTAGAA
+NLOCOKIOBIJHPACOELIMCGFENHFKDFHAJNINHPJNPKPPFDIAPHAEOOKOLLMCACPBHICFLOAOLAEJDOPE
+31531422789383296287554232362878385313771814285266746141465892381318336365781391
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+5642GD7FE16E7ADG3E6C4DCE74661ED27GC362273C1EE1C15AF1CB3563BGFDB64EFBA25B12AG25GD
+test_0030
+CCTAGAATCAGCACGGCACATGAACTATAGTTTCGAATGGGTTTATTCATGGACTAAAGTCAAGTCCTCCGCCAACGTGT
+AEDMNIJCKEEHPEMOFBBODMIGNCFGOFGDEMCNOOKCFDJIOBPJAEMDNDPCJCNFLHEDJJGLFBJICKPNNEIC
+15976694644226677746963588363831157334413356619878719918563296537646868249291841
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+A2F33DA326BF5BDEE5GGFF5BAC27AA3B75E5B26256A3134C5G44AACDFD2FB2E7477221GGC764DF5G
+test_0031
+CTGTAAACGTCGTGACTACAACTAGTTAAAGCCAACATGCTTAAGGGTGGTCACTTGCTCAAGATTTCAGAAGACTCTAC
+HKCIGDDOFMEKBOGAOJEDFLCDKNCDDIENDJOANHNJNILOAMDAIJPMEJPIHAIEIAEJADHBABPCAHKJIGPA
+15796916659264129452575465735359945814578657984211239267358275479492929528858976
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+5AGG67575AB553DCE3EABCF4AA6BC7F15C3C7A372G44CD1FCEF47GG1FF2ADGGB5A3BC41EDGE77G2A
+test_0032
+ACACGGACCCTGTGGGTCCAGGACCGGTTAATACCACAATATGTCGCCTACTAGTGCATTCTAAGACACATCTCGACTTA
+FHGNGACODCNGGJADFLGKJAALFLJHDMFMKNAGOBLHPGMHOCLDGNFHLHMPLPPINOCGPKNCCEDCNOHODPIP
+74641567358433645612443376461431884815772566269375786996541589831118578392635538
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+BABBC22EEBG7G3E1B133F5CEG5GB2523C6AE74256DC2B6B7EB44C6C72E52FBD5CC27EDE23563A4B5
+test_0033
+CTACTAACGCTAAACTCTCATACTTGTTCATGCCTTCGCTGTACCTAAGTGACCTCATATGTAGAGATTTCTCGCCTTCC
+MKAGNMFBCEPFOMMAJAOPJKIONBNDKPFEDNOKBDHHNAIDNKHBPGKCEJAONLPGBHLLDHPLNANKMNFHOKIB
+31817146725175882271712831816659655292221225781415696947615226375273977831433529
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+G47BCD3FG5321FDFEAA74D65DF4C37AB14GD2A42E3EDEB3F66E2GG6GF4CB734DB4544G423B7DC7E4
+test_0034
+AGGGCTTATGTAATTGCTTGACATGACGAAAGGAAGAAGTACTGCCCTGCAATCAGGCAGATATCCACAAACAGTGATTG
+NPEBDBALJIHCNOOCDKIJKLHNDDJOBMJNOGEKOMLDOKKNEFLEIOHBGDNOGELHAPOIMJALLJJHIMBPGKAN
+48823822832964288949441793418933652755835489946253252354392629565758284228582314
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+CB2D7344B75FEG5AGFE3GB1B124DEE4163C7CBA7B3341D5C7F2266DA43GB5E5C3G1DACFD1A36D4FG
+test_0035
+TCGGCCGTGTCTCAGCCGTTCTGGTAAAAGTGGGAGTGGACCCACCCTGTGCACCGCCCAACTCGACGCAATTTTCTTAT
+NLLFLIAFOGPDOACDMKKHDDBLEKADKDDAHHBHBOJJDFBFELHNDHNIIJANJCOMIOCIKJBIEFOIOIHHKNMO
+78749535912427313775732484513612464912171915772896569925223628988382923887657787
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+CAFB6DEEC2GA477CA667CFD7GBGA3F1F15724766G3124723E3C2D4A4B6C21D2576A155G5FDD26A74
+test_0036
+TTTTCAATGATTTTCTTAGGATGCGGCGGGCGTCGATTTTCAACCTTCTCTCGACCCGACTATTGAATGTTATAGTAATA
+ANNGCJCLOABCEEDMBBIMFFAGNBMOLFHMPCJHGLHOJEMFOJINPLHACHFBPHKPMDJJOKHBMHHBDBKICAPH
+58793457738784331979465787553584794462954694763498672168764729574763455886754245
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+31AG6F7F2BGGB46C7447D55332F327AB3GC7C7F3B276C41362E427G4A2FE1131C513EDAB5F6G43FE
+test_0037
+AGACGTTGGAGGAGAGGCCAAGGCATTTTTTAACCAATCTGACTGAGGTCTTGCGGTGGGCAACCCAAAGTGCATATAAC
+CBNFEPMKIPBKFEGCFDJAOPKCOCHPNDGPIPCNNGHBGMMBIKHJGLPEHCKDPCPOLAJLEJGPLLMCGLNIMIKP
+46577783628848459432142378393595764713979442474995179524951817688946944926386819
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+1DC3A7C3733BGG1BFBFD77D3DBA11B41DGF1FA7B72C56G41ACBC4C174CB161EE415GA35EF45DEAGA
+test_0038
+ACTACCCTTGGGGCTTAGCTAATGCTGACCTGAGCCATTCCGTGGGCCAGTGAGTCTGCCAGTCCACAGGGCGTCTTCTG
+JPOICDICEHKIJLNCPFHGDOJFEHCDNHNHADGOIAPIKIPEIEOFLAEOOKCMMOFNABJNMMDBIJALFJDLKNKA
+14328727263371317819795777118588957756615153415685131828117523975928678228812243
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+2D7B3BF46B4C7352BC4FBE6G4FC14B4AF34F21B22G535FA4D24C1FD7DD1AD5ADD6B7347FA1B6E7C6
+test_0039
+TAGCCGCGAGATATCGTTGAAACCGCGGGCGTAGATTATGACTAGAACCCCAACACAGAAGTGCGGGCAGGTTCCGTGTT
+EAHIKBABOKIOGHIOFBKEHEHMLANNEPHKEBMHAOPLMFDHGDMABCPCKOELDBKAHHGMEOOPNFLMAJHNHOPA
+26838958393387759657774578743333817466239941171666947299277816716516988269955845
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+1BD7AC4CGDEGG2CE4C71GE1F17GDE5G5FBA5EF276D3F447AA76D2123D5BC11FA413E2D5FB4EEE427
+test_0040
+ACGCGACACGTAGCTTACACTGTCTATGGGAGTATAACCAAGATCCATCGCCCTGTGTGCGGCCTCCATTGATAGGTCGG
+JAMEHGHJLAMDNOMBLPBOEHGCMNANIFELICGCLGIBFFDNPDODFNGGFNOOCPCDIJPFMPKBNIEICBAHELHJ
+51324521899342848751327751916446778116141674732664578612852781478435918593594499
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+3ABFBA2BF6FAGG6DDF3GBF4217C52FFD5C34E35A7EBC3E3GDB2AF43ABEDFFC3C6E4EGDA51BBA25EA
+test_0041
+ACTTGTCGTGCCTTTAAGGCCTTCTTGACCGCCGGATCGCTAAAACAATAAAGCACACTGGGTATAAAAAGTCTGGACAT
+CFOKOFKMEDGIFFIELAOGDLHCIHEKODCBGDNHNOGDPCKPMJAKNNBNNCEENAHAJMACBOEOAPPMCPBNICDB
+54265668828165995119221724234391523323549413916816781599967165612747426146845199
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+FCG566G3DD42C5A5C4CA2547G26DE4C2BCCA244343A4F25B5A547413577AA541ABGE75G5G42DD6B3
+test_0042
+AGTAATACTATGATCACGGATCCCAGGTCTTGGTAGGGGGGCGGGGACGCCGATCCCGGGCCGCCTTGGAAATCCAGGTA
+EAGIBBPGNFOPFHKOPLBBJINFPEPGHCHHHANDMHNLLPDHFNMDGNHFGKBMPJGPPBEBLPEJAPLAHAKEADDK
+85859264887636295451522616378591939569731574972127187719192298732152397157275573
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+F7535463EBCA44F317F61D5265AC714A57C6AC71AAB32712BEDG75A442EAGEF337E7D4B5E1F333C3
+test_0043
+ACAGTTATAGTTTGGCGGCATTAACTGGTATGAAAGATCCGACGTCAATACATTAAGCATCGGAAGATCGAATGGGCGCG
+BBCFFODHLDOPDFGOMKIBNHGGIHECIEDAMLKLLOKJKEJINPIEDIDKCCFCJGKFAHJCHPPLALLBPKDGLPKK
+66526135825544781594597499314545278672889966345457966487496864168844757995373584
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+6A1A3B27DB1DFG164CB3B45ACC613EDFGFCB5AA2476437DBB4GBBFG6E3GFD3EF4CCGD46G44D756GB
+test_0044
+CGAGAGAAGCAGCTTTGTTGTGCCTCCATATCCAACCGTTGGTATGAGCCGAATTTTCATGTGCACTTTACACGTCCGTT
+INIDILDOGDLKLLAICDALDPOMIFEPKAILMAEFGBCHKHLBEIMFDLPINFPCNIEFGHJCPDCAHKFICEEHCHEM
+11769537989293157162788426818387796988177171231148532368544491873976618623334586
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+BG5363EFF522F1GFGEDD7E24EF7B246DB517BE3F734175G7B164E3F21F1BFAFGD27ECD2674A24GB7
+test_0045
+AACGCTCAGGTCATCATCGCTTCGTAGCTCAGTTCGTCCCGAGATGCAGGCAGCGCGACATAACCGAAACTGTGGGAACG
+KFPKAGOKAPPKGDNPMOFGHICLFKIBONFKFCPKBKLJOJMHJJKCCCHCFFBOHFOMHLEBBALFLLNLMOPKAAFM
+57648693638732753451547772117256888168533356546121445362112338973978649944921327
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+C6C3G5FE1E4271BA211DEFBA172D441AG325276EAG635E337C6C516FB7G4E4A1G5D7E2633EG5D1AE
+test_0046
+GCTGGACGCGTGTAGGCAGCCCTAAGCTTTTTAGTTTGAGGTGCCGGGTTGGACCGAGATGTGACCGCCAGGGGAAAATA
+BGLFPKCCBJOBKOJEMLINDAOMBDJJIMFBPPCNNMHHCMGIIGMBDNDBNPOPOEIEFLKJMFIHIEHDIOKDKEBD
+49699354618469458613815567625129954919729363577585673851836327373711884921743111
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+GEE27E51BC1A53DDGEFG4FDEGD63AB7E532DFG2F4AB17A27DDAC1D2F1E17252571142D1464BGB1B7
+test_0047
+ATGTCCATTGCAATAACGGGTATACTAAGCAACGCCCGAGCGTGTGACCATCGACACGAGGGCCACTTTCTAATTGGTGC
+BDIJLOIEDNLFNGBJFOHHMDKFPANIBGLPGLCCKHAPMHKOMBJAKEGNMGBHDFLIHMNLCMDGEEGLFINIKJIH
+46426351246839414758388877732622845723263939919666558991693272656516159451479335
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+CEA3A2A27ACB2EB5FG6A36232BBEE7D45G346ECBDG575GC1A16E5A2E773GEA2655E1AGF7F5D3516B
+test_0048
+TCAGATTAGGGGAGGCGAGGATAGTTGTGGAGTAGTCTGACGATCGGTTACTGCCTGTGGCACAGCATATATGATCCGCT
+IPCLKLFBJOKHLHAMFBOCOCFPHHOGCMHHIANFKEHAFHJLELIOGEHOKHPPMNEKFMKHAJOPECCIIGFPFGBI
+92369858156838978246559661221383354556336859941279429957698123623588418895751641
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+612A356A456G7CDDC2G6E47CCF3FG16ADGDEF1F62BFC221G23C5AEF13677C3E66A5AFCEGA1D317DB
+test_0049
+CTAGCTCGGACCTATAGTTATCTGTGGGGGCCTGAGTATCGCTGGCTGTTCGCTTTGGGTACGTGTAGAACCGATGATCG
+JNBJIJFFEFKANBFLBCGNEFHPICEIKNJFPCGBBMGDCONJKIJBLGIMGEGHHKOHBPMLLDNPHJONKGDPAJPO
+18891344975869153835751187646712583323159232531765339759568981896197266858914778
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+AA2DE22424GA33C7D465B213GDBBGE4E57CEFAD75E6B1ABF454B2AFGDCCECC2E145F73EAG71616FE
+test_0050
+CACCAGATGATGCTTGGATCTGCACTCACTAGAAGACGTATAACCGGACGTGTTCATGGCTCAAAAGCGTGTTCAGCCCT
+PJFIIIBOEPFPJKAAFEAEBJFEOOLLIPPMJPIICAMJJALDNOFPALBLPKKDCEMJNILJINPDDKPLLLKMIBMA
+24637899819275384449868675343836512877229483948929298886337819745975355716625686
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+EEA1GD14EGGDC2B3E55577CBD4D2EBAF41C4F263534C5D43EG6G231343F1EAC5F513B47CGCG42136
+test_0051
+TATAATTGGTCGATCTCTGCTTCGGCGGTCCTGAATCAAGTGAGATCCATACAGTCGACCAGGATAGTCAACAACAGCAA
+BKADKPFPHINENDCBCMAMGNLKIGDPMANCAKEAMHHDPBOFGDLKJBONCKDNJAAMJMIACEKDBNCCGMDABJMH
+98832295355837395842726872839756197548372293999874225869139137363164186276498382
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+DBCA372C3A13D6E4413D24D3671AGF2D7336AG61BEE53D5AB752B7415521A73AFEECB7CE4AFC2331
+test_0052
+TCAGTTACAACACAGTTGTCTCTAAAGTATAAATATGGGACCCTAAGGGCGCGCTAAAACTCCACTTATCACGCACAACA
+NNFKNAALLKMAJLKPPLECNOFCMPJGNPGEEIJLCADPHPLLCPCHPPLAPKDOELPOEILHDBBLILGCHNFKMJCP
+77498179376186159435988871238669299775938394674467253335317997653633564986952353
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+EA11ADCBEF6CA5G2C6B513GCFBDG54323557DF45FB33ABC3BGB7675F22E426FF711E53624F5A7C4F
+test_0053
+GCTACGAGGCGTTCTTACTAGACATGAATACGCACGTTCTACGAATCTTGCGCACGAGGGTGAAAGTGTCCTTGAAAGCA
+DHAAEGBFMLBBFKIDEIKHGBJJKONIJPCMDGLKIFEOFNNJILKANLEONDFJFNMIDHHNCBGMPNGEGPIIMJIH
+83763741376489879151271784677263782479966789327275486762829452824589963483537247
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+14FA4FB6DA76CGAB6GF2714B4A637ABDCF43AE7DC45E442DEA3G6E7E5533C43C621E2DAD2CAD7AGG
+test_0054
+GCGAAGTTATAAACTCAAAGCCCTCTTGAAAGTGGGACAACAGAGGTGTCTTGATTTGCGTCATTTTTTGGCACGCCGAT
+ANIFHGJOKIMIICPCJNHAPGNFBGBENCDBODKPGLBLEIHLLKCLINPBLDLHJJOBMFAGLOIJIFDBINKEMMKO
+54459318526238397679888836489163322182134144486658974123591297133544747539532555
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+F15DA251273E6G6A4163A27D17112G2DB217E46246EA72E424ABB76EFDCGAB576E5146E64GFGAGFG
+test_0055
+ATGTGATCTGTGGCTTGGCTCAACAAATCGTACCCCCTACCTGAAGTTTCCACCGTCATGAATAGTGTGGACGTGCCTTC
+IHKJBNCMJLLGJHOPOPCJLPNOJCDCCPBBLOCMBCGJFNLHPKHGEBJNDFJNBGFJINIJIBOHHLMJLFLCIPCA
+75891658442653235646529695675646434244554723621911966394248387552111652585473861
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+F41AG1CACG154C2BB176F2C772GCFEFEDCE2B5AAA5CBAE7BD5256D3DDF7B644EF51735BCB4CDGE5G
+test_0056
+ACGGGTTGACTTTGGTCAATAGACCACCTGGCCGCGAAGGATCCGCACCGAGCGTATAGGAAGCCCGCAATCTCATAAGT
+JNMCLPMMGPKBMAIAFJADLOIONDNCJEMFKPLKJBNOFJHKMLKODGILCOAAKFLNMMNLHKCAJCHIOBKMDLNI
+47629341952632658365834271531423886357297514562739744873547514866551128578424768
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+F2EC21CC6GC5D1AB7EC53CCFCGE1FF6666C1DA1C533CA4DGFA2A3617DDG71E2BFAE3ACGDBG4GD3B7
+test_0057
+ACAGGTATTGTATAGTCGGTGAATGTAGCGCTCCATTGTGAATTATATGCATGGTATAGCGCTGTACGACAGAGCACAGC
+MJAEBJMALOJBONADCHIKCAGDKLGDEGAFGBNDNMNJDGKNIOMHKNKDKMBIGLKFLPLGHLGIGOFPAGHFMPLM
+98997584114838595748398576615319423521528286654624831218392281868515338819364462
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+DAF5ACDGA72C455EF135FE2A6A16C3F41D42AF5E66231A53C5CCGB653175EG6AB6B21DBE65A5CDEG
+test_0058
+GGTATTAGGCTGGAGCTAAGAGGATCATGGCAGACGATAAATTGAAAATATAGCCTAGAGACGTGGTACGTTTCCAGCAC
+MJIGCJBMJCBFOCNFELNJBAPCANPNMENHPNGFANGNPMFMEKKLGGLJJHOOHDOHOJAEIJGMBGKLLOJOGJHC
+11826586317344771372257817274581774989566915168126446461779759357769485867737635
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+BAFF65CCD62AF3CF5GAADE6G3F2BE1B24E36E6B21GCGD7G27F3CGBE6CA3BAD47C7A7E41G143A163C
+test_0059
+TGCGCAGTTTGAAGGGTGCATTTGCTTTGCAGGGGCTACGCTAAGACGCAGTAGCTAAAATTTGTCTCAGTCGGGTGCGG
+ALMEBOHEPPKDDOIHJHKALKBKDEENICIHGMLHFEDCFOPBKLNCJHCNBMFDPMJJNIAHMNPOGFKAFABEDKAL
+29379272523872474241322447424195522227637456932962349429421445444258377399794989
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+E61G71737F21156F2G5G4CGE342C61F33F511637ADD5C12C2CABCDDA21EF3D5CG53CA365GAA15F7A
+test_0060
+AACACAACATGCATCAAGATTAACGGTAAAAGGACCGTTATATATCACTTCTCGCTTAACTGAGTTTATATTCTAATGCC
+OJIALLAAHMFGNOKFLPGCFDBNEFKEPNDNCECNIOJDNFGMDKEHNNJJIOOCGAFIMMNIKOELBFJCPKLCPOBM
+25997835957959776556748591648738195855152742919367818499136278126175197622554972
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+F71C43247A4F677D5DCC4GBGE342475C44CFA2GBDF4EFE345FCAAAB7662D661B5EE4CABF7F2GGE34
+test_0061
+ATACACGCCAATGGCTGTCTACAAAAGTGTGATCAGGCGGAGTCCCGGATTAAAGGTAGATCTCAACTTGCGTTGTCTAA
+EAGAKMDAHAGDAMOKKFMILAPKKFAMEIAGGJNBJJEHKGEDAMBPHGBADHLMMAGLKBIMGBFAPIAGMLGHPNAH
+93848262541443863327529588489499411646451658554797515584794392663242554415578243
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+6161B1GB4DF7162F6E2AC413CGD63156565C7G71A14B7GGGDD57A16G25G357A2BD714FFFBDA2F3EC
+test_0062
+ACAGTGACCAGGTTAAATGGGGGACTCGACTGACGGCCAAGTGCCGGGGGCGGCAGCTTATTGGTATCGAGCTCGTCGCA
+PGJKIOCDNCBJJIIPKOHDCHIONJCFNKFIEDIJEBLGEBPGILGCPLCJNHJADFOGBDBHIHKHMFCGBAMNLDOJ
+37833692786371957424376655627422521743468873593115897125254627675852918313198452
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+BB3B23EGC6G21D32A2ECCBDGEB1ECB7G2B5BB7273BA5253BFG5FEC35A2G75D73BF72AE115E7B26B6
+test_0063
+CAACACGATCAAGTTTGGTAAGAATTCCGCCTTAAGCTTTCTAGAACGATAGTTGCCCCCAATCTGGTTCGAAATCTCTT
+GMDAPLMOOFANDHHMLBPIKGHIAFFFOABFMNNJNIJILEEFEPOCAJLNDLIFBPMGKOFJIEFAHNJPIOFAJMLM
+39236397139389852275613887648533427438439122136418369146118333919885587613673488
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+FC25E2CFC2BAFA7A2AA4757F3GFFFEE37G7752FCDBAEADBA1AC7374FB5C15552E6E2GG6GFF62C6GE
+test_0064
+AACATCATGACTACGACTTCTTCGCTCAATACAATCAAATCAAATCCTCGATCACACGGTTGGACATGCCTCCTACGGCG
+BOGEBCAMHHKIHIINJHPNAAJGKJNMOKOKDNPPHBCODOKAGPNHPBFOLPDPICIMDHKIHNGLGMFNLHHEJNFD
+98964211663843276997765364227649534777147961694715979581978472772981699818568954
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+614627C3CC27D34G7G24A31A565FFA6D73FG71A5C74FB41151254DGDCG5FBD27BF3A6B3E6657CGAA
+test_0065
+GGATGAGCTCGCAGTCCATACTACCCTGCGGTAGTCTCATGTCCCGCGACGGTAGAACTGATCCAGCCTTACAAATCGAT
+PDPAPHIFOBJHAABEJALKHIPOAAJIFKOMJPKDGKEMPKPELHFNOAMKEPFBFOHJHIBICNJMGLBFOAJBNEOI
+32293565669732847319647622244918182175843916859925725296541538694379748544835772
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+5BG2463A66BACC74F21DF55CBBFFFF17E46341AF74GDE77EDC2B2GAC4FBE4GA6FB3A2D336F5E1FAG
+test_0066
+TCCCCTAAGGACCGGTTAACCTCTGGTCGTCGCAAATTACTAAAGTAATTGTAGAAATCCGGGTTGAGTGAGCTCGCGTA
+HPOKCLCIEPNJGHKHAOKMHEFIDJALPPELKJJIOOFBNCPHLMONMBGFECDINILOMMAECKFGGGIKEBIPLABP
+82274319816334366889251179512164974439576611119573618152842734696516297251676453
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+FC27B22BDA6C5754C14A51A7B6B345E7CBCEFFEFBD32131EEEF755BE6CA27157CFF7E44A771562C6
+test_0067
+GGAAATCCGGTGAAACACGTTAAGCACATGAATCGTCGAGCCTGAGCGTGCCCCGCGTCCGGGACCTCCTTGGTAATTTC
+LGOJDHJBIBMHBFNFDGLCKAMJDJLEKAMAAAKBICCLFFDPPKIIKHCAGALPLMBENNGOCNHFPNGCFHNCKBPF
+19328885289929179417771744633752518336284137986374281873554685283318926194275191
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+6734F5FGEG213542CFC147BCA6F6515735G6B52F1AF2D77G4F1F6AF1BDF46FF613FF6FAG2DF5G52D
+test_0068
+TTTTAGTGGTAGCTTATGCCTTAGTACGTCGTCTCTTTCCAAAAATCAGAGGTTTACCAAATCCTGCACAACGATGTCAG
+GMFEPDCMPACBMPJICBEANGBOOHHBCDDLAGLLKMLPDMPOGNJKJJMADIPCIHMMNIJPOKKCMMMMAILKFNDD
+91216199316744587834722645312365138672671252639568148467823416432342254555226888
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+4DA2B47164C11AG27D11ECED3D717GE7E46471ECDGB4E1FEFFG3C7B2D7G5ED46EFCBBAG532D64BBG
+test_0069
+TAATTTAACCCGGTGACACTAATATCGTAGTCGGTCCTTCTCTCGCCTCTCCTTGTAGTGCTATCACACCTGCCCATGGG
+KPNCMJKBBOMBDGMOJCPDNIDBFKMEOHHPKJCOMDDMHJJHELPKCBMFGDKEFHPAIECAIOFACAOGLCNNGHKE
+96248237757437342537618964763316678142621938665329872646135299144579478337819139
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+BFBD543F26E7DC533BG7FC213BDD54D4F3D33F7D27FF5C6CCF44C114A3CE7AGBFF27G47A1F5B67B3
+test_0070
+TGGCCCCCTTTATCAACCGCCCCGAGAAAGCCTCAACTGGGTTGTAATGAACTCGTAGCAGAGGTACTTCCAGAAGCGTC
+KKCBEGMPADIIKJDHJHNKCMGLGDDLOGLEHEHEDJONDEJDNLNEKCHGPKJBBCNKMDGFOMBDKMHDNOBDNKFC
+81737193131162146564763762816897997916431556487993514372423673884652432378325225
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+374DDCEF42E4FABDBDB156B2DB2422B2411FD1G43F3F3A71BB7AA2EGAGBBB21GCB7EGDDG4D1C6FAB
+test_0071
+CGCGCAAAAAAAGTCAGAAATCCCACGTTTATGATCATTTTTGCCCTGCTCTTTGAGCATTATTGCCATTACTGAGGACG
+FFADJGOFJEAGGOIMPEEAONIFBKNEEPPANJHIDEBMGILKGJKGCGAFICMJONGGCDDIHOAOKOFLGHCDKNFO
+78342164789836185868452246758598144586831548433429467655974484438161488419988249
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+D5BBB471155EECE6F4D4336B7D17CF17567G5751C5G724267D17F143GF21CEG63BD7F6F7GDCB5573
+test_0072
+AGGTGGGGCCCATGGAAGGGTAGCCTCCTTACGTCCCGTTCATTGACTCCTTCAGGGGAACTGCCCGTCATTGTAGGGGA
+CKNGDOHOFHKGPIABIPFFBNDNGDGIGADLGHMHDPCJDMKFAGCJLKOKFPIBPAKKGPGGMJNMNIIGFMLDIAAN
+12536748298235394771247452247933884865493798268485673894826524739726323677118234
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+AA3E1F5B1334A3C63E6237AC1AE736CC72A525A1BDEABCC6E3FB242B654FC5C7C31E3EDF45EA2EGG
+test_0073
+CAGCGAGCTAGCACGCTAGAGTTCGGTATACCTGGATTAATGGAAGGAAATTGAGTCGTCACGGACAAGAATTCACCCAC
+BOOIENEHEPLOLGFNFPCALDMLPPJJGHAFDAGKDLAJIKGGINEKGHJKFJJCLPPLBFBHEAAENHFDKBPKPJNK
+29435262964991921667329475234372165539446561449268722119726725864257769253334242
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+DDD6ADDA7BDBA2366AGC14AB4CCD42D5B556DB36C6E3D4CAE47B1GAC152GCE1F5EFCE3C55215C1E3
+test_0074
+TTCCGGATGGGAAATGCACAAGGGTCATATGGCTCTTGCTGAACTGTTGCGGGGGGATACGAATGAGGTTGCTAGTAACG
+EPPKAMBAHMHOBAAOHCFBPCDKHIMNJHJKHJJOEBIFFAGAOKCFDKGLLBBELNJJFFNCJPCHCFLPMLAHBGHI
+71968287933439484343643299776422756888819386982944337243161114813682353766413899
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+A62CA1GG6CGCF1F6G1D62GBFGC43CC257DAA762CCG6C36136C7C226B2FG545FAF6GC7A6AG63BEGDD
+test_0075
+ACAAATCTGTTACATCAAGAATTTTTTGACCGAGAGCATCACCAATCAAGATTTGCTTACTGTCAGAAATTATCCCATAA
+GLAINOFIKIILOHNBGJFNLDDCLCLAABFDBCLDPKBKJFKDKFOJABOICGCAFIDCHJMCCNCJPPEPHMOBPALB
+97967789832274736714532469181491957623687738724994843591392974322182277239741732
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+1BA3CDEEFB3B6ADGGG67BB1EA5BE6AB6EE742FAAD7B7F47245316EC6411GA51BB347FA371E4543F7
+test_0076
+GCAAGAATCGGGAAGACCTCTAGTTCACGGTCTTGAGGATCTTGACAATATAACACAGGACATTAACGGTCCGGAGGGAT
+LAEIHNGBDFNAAMBJMEAJHDMEACCKFOKCAKHNPFLFACOFGJBDFFIJPELHMCBNAJDLGPEFNLLHLANCOHON
+65271154584825496355911162142377717911924961158769149618816998356611927687999857
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+1E321AF23G7DBD5E1GE4F1EEC1DC67FGD7AC23CF3EG5AD57BFF5EA2G44F2717C6G4B23D443CC5DBE
+test_0077
+CCGTGTGAAATTTCATCTGTGGAGTCTGGAGATCGCCCTGTAGATTTGGGCAAGCTCAACGAGTTGTCACGCAGTATTCT
+FAJJNGOIFIMNMIJMCEOKKEOENOGABIIBCINFJBEHNOBOAPKBJAJGPFAHFNCELHKJHHEJLDGGCLFMOCBD
+56584149128126188227531673939898978893417971498714862962638949624544188239989999
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+G4EDEG52G57G31FBAA5EA43C5GE4F3327GFD5A721DE2G22C25F616C6EECF5G1DA75FG6A4B4AE5D1E
+test_0078
+CCTTGTCGCTTATGTTTTTAGAACTCCTCGGATTCATTCGTGAAGACGGTTAAGCCGAATATAGGCCTAATTCTGTACAT
+BBMMJHDBEMNPDFGGEFLJOKIIBMAMIJAOBICJIJPDHAABCHONBPJFBDDDHJPAAPCNJEDHAAACFGPANPEC
+74196963435161516162746594374778165291976531825159448239888376342296365129373779
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+76264D42CBFAEAFEA4GECB3GF772G727G4A7DG36EGB6AG7G7AFD4F2DBAB7C2F3FB5GGDCB25A4C2D4
+test_0079
+ACTCTCTGTACTGTATGCATGTGACGCGTGCCTCTTCATGATGATATATGCTAATGCCCTCCCTCGCGAACGAGGTAAAG
+KHHCENIAFLBKAMNMLMHHOKPOKMDMDAIJPHKDDANLINPIIOOLMMCNJMOJLKGPIGHOLCCIHBFHJCDKDEFD
+66721468525996753294262534653316155946482112586928788287297711753121556891947366
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+E6GGB5CG37AEAFED7DCD217CD2734DE3GA46EF73F74BBGED3CAB2G5CE5G7BFDFEED4E3BB62EDGCFG
+test_0080
+GTGCGACCTGAGGAGCACATTCTGTCTCAGCTCTGACACTGATTCGGTATCGATGACTGCTTAAAGCCCCTGGTAGGGGG
+OCIPKJJOBFDOBPOPBAFAHPPJMPKCDJHJKMLJEBPNKFGGAEFNIFKMJBKAGIKGCHHHAKKIPIDELCJFHGMI
+26931675842573137163356643568889768718819113441593174814441794718383666654345882
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+55FFC555A371DEC2EA561B14DBD56C6DB5D3BAG11FC34DDG3EBG2GDEAD47F34G6E14A5CA14644CD7
+test_0081
+GCGACCTGCACGCGGGCATTCCGCGCCTACATTGCCTGACTCCATTCGGTTGGGTCTCTTCGCTTAACTCCCGGATATCG
+OHBMCIDACFNPKLIGFOMFHAOBGLFHDLCJEEJPBOLALIENHNPDOBGOKFCEMBNKHJLIOPKIBHFMDBFOOKDE
+66426984337126556283846865189154441212956284923373756171758955458739155376823126
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+3D3B2DG66DG3AAEAG1D4655C3D4B31255BG766A424711DG3CE5BG34B36BA6D7FFFF4AD4EC1413B3B
+test_0082
+CGGGTACTATTAACTATTCCCCATAAGAGGCGGATAAGTTATCGCTTAAATCATCATACGTAGCAAGTGTGGAGTGCAGG
+OHODENIDMFOKCLMLFIEIDBIAOKLPBLNCPFNDGMAHOEGNMJBDPMJNCPPEOAGADOBEDOENILFAPNEAFHBH
+32964254591129317923297296247476656339547869419343621245637627972994845892377978
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+G5DF5C1A5GBC55ECC6EBGD367F54G43F7B37CD5BE12147EAD16GC6BE1B217CFC1131CDA1D577G725
+test_0083
+CCGCACTTAACGCAGATCTTCCGTTATTTGTGCATTCCGATATTCGCAACTTTTTTTTCAGATTTTATACCTCGTTGCAT
+DJJGNMDDBDBLACMGLIGICPHGCCDHIOALPEGEKIAHHBHJPJKFEJPLIGAFEFNGJDJILHGICKCBGCJBOLLN
+98764535613734346959263389452559737738629593175364469243978252625549443139751136
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+21723F51A7CFA35251GEBC2F433CE55EG6521G67BC576A54CE34D1517EFGFB113G61CF1GC24CD3EA
+test_0084
+GTGTGCATGTGACGTGGCTAGTGCCCAATAGGTTAGCTACGACCGATCAGCCAAATAATCATGTAAGGCCCGAACGTCTT
+AHIEFHPKNECAOILDDKPJIBPDKBADBHDNOFMFHFODPFEKAOGPHGPAFDDLIOENBOAJBAEDKFFOMEGCAIFM
+38379527934118554882261846312757259511858687833625847534311133261341334445929423
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+B5ABAG7A5EEE73GG3DGA4G17366AFB2GCB22A6CE4E3E6GCEGA1A524A541DCD6FB6CGG6D133GC4E64
+test_0085
+CCATCTCAGTCGAAGCCCCTCCGAAGTATTGTACTCCCGCCAGATTTCGTTAAGGCCAAAGCTCGCGGGTTTAAGAACCA
+AGBONPOPGHDCCNHKCIPDCMEPEGJCDEMBOBBABAELBCDHFLGMKKCMGDEPKGJGFACEMHHDEIODHMAEBMGD
+12525383175696732883283913912464263163583539713545534289832769382648239875755153
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+31321G5FDFDGB3G5664F4BAC13GCGA665BGFD511D1BDD6DBAB2A6171EBE5EFF77G7GAB26ECE61477
+test_0086
+CATCGCATATGACCCGGTGTTGCTAGCTGGCAACTCGTGGATGTTCTCGGTAACAGCTGACAGCAATTGCCAAACCTTCC
+PGLKFDEAPJBGJKGAFCDGPCLIGFOHBBPHDKLNNFPHBCHACFFFCDPIBDOKBDKBCKJICMENOOKGJNDAMLDD
+37967979764312898491537149461243146631379849664688115787754987132258661294145687
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+D4D24AD51C4G65D37CA556AFCGDB5AGF165F5FEDC76BC2B5F5F7DB74F254EC412GFC7F1GB2472E6C
+test_0087
+TTATCTGGAGCGGGCGTCCACAAGACGTCACCACACTTTTGGTCGGGCCAAGGGGTGAATCTGAAATTCGTTCTCTACGC
+LHIDLJPNHEJECLBANFAPJMDMIHJLELDBKDDDKNIGEEMIKHIJOLODNOHNIHHBPHMMPLHENJAMEMNFJBNH
+77475681986816262542298656745133117613335515872177861897347112933866136429279861
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+D1F5E55BD26D6AA4BE1A4GC3BBC76A2426F5BA7C6153GDBB5G65G7E7765BG33FD5G21C5AA54B4F53
+test_0088
+CTGGGGATTTTCTTATGTGTCATCGCGCGGTATAGTCGCTGAACGACCCCACAGAATGAGACGCCCAGACCACTCTTAAC
+HNFLPANPEOBBBKDOLHHEENMHMHPPHMKMDBIHEHAEICOIJIHNKLHDGAKNFBJHCCGNGPHLBGLCJLPPDBLE
+14225529626337171434333958548486769918991946119874968797421341261435798624553465
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+BCAG3BG2GA2B36DD6D2446E331CDFB2CD4FA7237GBBEFADG4666CBG4B25D6CFC6G3A74DAGGGGB7C5
+test_0089
+GGATCTCGGACTTCCTGACGGTTTTGAGGTCCCCAAGACGATTGTTGCCAAACTGCCGAACCGATGGTTCGTATCGGTAC
+GIPEDFCKJPLBDOIHDPDOAAMIEEFNELOCFCCLPKJNFHAHGCLCOLFBHPPLKGBKHCOGJAOIOJDHGDBFNMLC
+83133831622285678945361998581969128839223581221327262274543289581728653253526549
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+71A66642ACGC46D56D4F1B4E4G6B51A56BGBDD5DBBGC4B4CCBFG531E66E4GAE75BFB3AG143GA717G
+test_0090
+CCATACAGTAGCTACGACCTATTGTGCATCGGTGCCCAGCCATCCTTAGTTACCTATGAACGGGAAGAGAGATATCATCG
+JCOEAEGPIEKJNCFJPLBPFMEAIFECFGOBBMKNKOJNGNPPKONAGEPBDBDKAKEOELKFKALDHDFIEKPHPCCI
+46958599273582837375931937984834961651119158955679693157478928511434879874214967
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+EBDGG364375E5FB7GDF26G5D476C32C1C2DAEG15D44D23CE54DD5FE6EB23DEAC7EC5ECF5CFFBBB73
+test_0091
+CCTCGCTTAAGGAAATTTTTCCTGCAACTGTGAGCTGCTTCCAGGTCATGGTAAACAATTGGGCCGAGCGTTCACCCAGG
+GIPLIAJEAHJOAIAJADDKCDLKGFPLCIAFJIKIODHGLFJAHKOGIKJOGHAMHJHJHLPMDEPJFLCKLOCOGKME
+26323133548227376937298653417277475885833852671689875575837378853559222858418454
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+1DD36D4ABA62D6E2D7ADAC1GF5172273C4GBG4F2B61C4BD4F3GFCFFBFG7D3313D3A636CFA64C7561
+test_0092
+AATGTACCACGTGATGGTTTAACCAAATCGGCGCTATCGTCCACACAGCTGTGTAAGACAGCAGCCCAGCACTGCAAGTG
+FBNCJDONLNOPOBFFHBNJGFLGGNPFJGKFBMDCDBENPOPCONCMMEPIFMHFNEKPLJIKEPMMMFGBFOENHEPK
+53435725977455648367592665733564195582567486296515385319542559555695788279288217
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+D6FC2467AEG114A1AGEF3GF71ABABED4C7CBFGC3FGF1EFD336DG35D7E7G2473265GC5AF51G5322BF
+test_0093
+CTTTGAAAAAGCAAGATTCCTAGACCTAGGGCATCAGGATTGGTGGCTTCGATAAAAAGTTCAGACCAAGGACACACGTG
+ABNCHBIDLCDALHHDILNOCMBCLAPCNLGIAJCJELDMJBOMAIMFCHBEAOKEKGBDIIDGEEJFPBCKFNBGCDCG
+42947835634728414495152376813136349822141257432941996981939967975856696158797628
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+GB535551G2D6FEA37F1D2F6AG4FB17FF4765F6A44D6B37DDC16C322263C25FD3416E7EE37AF1BD43
+test_0094
+CATCGGCGGTAATGATACGAAGCCAATCTCACAGATAAAATCACTCTAAAGTGAAAGCCGGTCTCACGAGGCTCCGATCC
+JENLBKKKMKDGCGPMCMBGMHJFAHPADCJECKDCHHFEDFPGNMIMMJFHPAACHBGCEMEKMDDOKENDCFMIPBCG
+33528257829251979463555562173919446317772486916962384934381786257544294521189979
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+76743D75DGE624GDG6G6C4C7ED36FCGD635266133A26C2B3A6F46CDCB4275E4731GCGCF655A3FB3B
+test_0095
+TAGGTCACTAGTTTCGTAGTCTTCTCATCTTAGGGAGATGACCCTTACTAGTTGGCCCATGGCGAGCTGAGCTCTATAAC
+BHCOHMMONJPLKBCFLICCPGOBDNPCIKANPOIIDKOJHKNKFBFLHPNDCCMFCOENPFEMEHIPLMBFNBKCOCPM
+48121285311424618933763746456193745537827186159781273249828559276673149965272975
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+76CDBEAE61E46D6EG6G4D7G45C1B6A66EG37FAE33C2244B5EAA55CED4D4AE6AE27E66A4C6BBFE5G4
+test_0096
+GTCTCCCAGACTCCTGTGGTCATAGATCAGAGGCTACCCATTGTATCGGTTTACACGTTGGCCAATTTGGACGTTAGGAA
+DAKJHDBDLNDIPMPANCOFPPKPAHCAEBAGHOCEDCCBJBIMCCDCEIINOAOKPBCPJOFJKCNDGCCDGIHDJNCE
+18315257456291675547759415197561695929649861955672294788882796867386442836474391
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+A1C1221DA652G2D62B7FBA5E4G4C113C7DE65ED4C33D1F711C71G33GCCEFFCF35AC1542AC7B373D5
+test_0097
+CGAGCTCATTTCGTAAATACTATGCGACACTTGATCAAGACCGTGGTTGTGGAATCTCAGTTGACACTTCTCGTACGTCT
+PNNFAKACOHIGJGFGPBMBMHEGMCLCBGHPKOPMNOELDOFJOGGKGLLMGIEHDECLHMDJLFNEPAMDKCNPNHDB
+42252256498637775327722366624776923485159165434492522728115141774759973537249782
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+1BFB161C645F2DD14F7DF3A6EFCFA15FD257CG7ED3BCD4EG1F15EGEE7F5FBD267214ED25G71C41FF
+test_0098
+GTTTTACATATGAGAAACATCGGCGGAAGTCACTGAACGGCTCTCGCGCCGATGATGATATGGCCTTCGGGGAACACAAT
+JNLINOEAJPEHOHNGHKKPLIDGLBOCCBHAMKKHPOGLCHDPPHONNJACNEDAPMJFJDMODGBENBFENCDJIMIF
+19513636857648871273453445987328351358999563897729453656215295288291231692338471
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+FC4777BD2DA1F52D3DF41CBG34A33DBG42ED11GBEA3CCD16156D3AACE6DA3F7B264DE4B671B2A5G6
+test_0099
+CTCCAATCGGTAAGGAGAATGGTGGAATACTGATGTCTTACAGGCGGATGTTAGTTTAGGACCGCGACACCCTAGATGCC
+LCGIKHPOAEMGIPOABPAOIJCAEMPHEIEGEBKMPJIJGHHHMFDJEHMHJNFFLJLHGIOLFOGEMKJKGOFDJOHF
+21915681739679662218656931629651221761774847541982595883644234925144584555224138
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+51F51FA53D4B4CBC2634DAF2B5E6D4C5C1B714EGGDAD7B1B52DEDFFBA4G515EGGF25E7FF6BDBBDCG
+test_00100
+CGAAATCTATCTGGGCCACAAGGGTTCATGCGGACTACTGTCTACAGCGTGCGTATTGCTCCCATTTCGGTATGTCCATG
+HOMOJJGKOJJJPOMKMJJBLIJEAHMPIOOGFIICLDCPCFNMNDLPIKCCAEJFIHHMPLNCDAGJFGMEOOEBIEBI
+99138613786395816959951122862554598413673384144476311148998598456952885347683413
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+35B63ECEB246E16BAG65GA7167D65ECCGECG325B3C6E7764E1A16411FA2C2EED4EF7BCDD2CC4G514
+test_00101
+TCAAAAACATTGTCGTGCGATTGTAGCTGTGAACCCTTCTTTGTGTTGTAACAACAGATTGTAGACTTGTCAAATCGCTG
+MKFOLBKLAFDCEIPNDNBOINBANHNDCNDFNCEJCMJHOAPCOMJDHJJFGJOFOEBPKBABEFEKEGFIGHMFAOLJ
+44829679227937138975769832729322181351152329246986442755222844553186554396558834
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+C4CE3533BC7FE7A6FA22CG2B2714F11AG54325A22456DAGE2GEE165ACB277BB5FEC454GG64AFA2CA
+test_00102
+ACGACTATGGGCGGGCGAATAAGGCCTTGAATCCGTCTGTGTCGCACCTGTGTAGAATCCTTGGTTAAGCCGACTGTTAC
+GOMILBNFKGEBMEHKIKJJNGAJFBAGNGFNEKJPBJONCGGGOABDMBHMAJIIAPFHKEBNCDNPFJBPNLPJIGLH
+77439853543133575536169935794488211651764216722884267652879697331256461265815944
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+3BGDCA7BB4C33DC67E7GFCAAG6164BD3373E5C3571BA2EE7A4D4G67A75F43B13GEG5EE66EAA6AG63
+test_00103
+CAAGGGACACCGTATGGGAGATAGTTCAGGATACCCGTGCTTACATGAACAGTGGTTCCTTACAAACGTTTAGACGTAGT
+OLOBOLCBFMIPBPDDLMMDLICNLFOHCADMGPPLAHMMIIOPGOJLKOMFLHIKCINDKFENMPAJEFICPFGNFMOI
+34688856121184852395318911711715768326198151362642655592883961379298469654851999
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+23154DG477FE23A1GB1A1253B16F317B2AEE1GDADBBG774AG45BG7156514E4AGE2F7G11277F1CB26
+test_00104
+TATGGGAAAGACGATTCAAGTTTTAATACTGCCCCAGGTGACGGTCCACTTGTCTACTACGATTGTGTTACGGTCGGGCG
+EEBOGOKFGHMGMPLAHBKAHMOLCAOIGPOMMHOKHFHAMAKMDOGICGICLPDHAFGOAAIPINDOGLLEGGBAFGHH
+12551971121825231779135538433267465783251791324322717276649349716283453437496616
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+4ABDC53G2C536D7E6A1F262G12C4C3A5CCC57GG5EDB56EADBACB673161EFDACEE23CF36B5CG72227
+test_00105
+CTGGCGTATAAGTTAGTTCCGGCATAAAAATTCTCGCGGCCGGCACCTTTTCCATGATTAGTAATCGCCGATACCACGCT
+AIOHJNIJCEPINHBOCIBPPLBABKDCPIBAECNALPJDAAFOOLMHHNGMCAPMFDGMGDDBEFMAABEOHMGEODHK
+63891822757621662962959652476149417187495923188743499965471182225614693114979632
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+C25D73B5E6BFAFB721F7FGFG5F3ED2731GBG66CDCC3ADB45E53F2D4A4GF22FC67F33B244D7GA4AE3
+test_00106
+ATTGTCCCAAATTTTTCGGCATAGAGCAGGACGGATAGGTAGCTACTTGATTTTTTGACTACCACCACAAGGTTGAGGAG
+CNOJKOFMBFGENAECMGKDDLFPADPDBKINPAOAGGLFIOBMOCJHEBCAKFDMNIPMLMPNKINNOLAPCCMDMJGO
+38653618523186736876228486811965858289277577224489683733623539942417372926686695
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+7GACE111EF36E3GAAA546B3E252AB1AAGB55A3AA2E4BG5A3A6AEDD75CE1B7334FAFB5EAG3B631EG1
+test_00107
+CTCACACATGATGAATAATTCGCGACCGCCTTAAAGCGTGAATAGTTTCATGCGTAAAAATAAAGGTGCCGCTTTAGTCA
+FAEMKLLKAPEENEHKPHBDGLMKJBKIEICDMOELNELBMBINDIPCGHEMDIJEGEJGJDOKENCLLBGAEANCDGFN
+31874496315782724986241815697898633356598659143428859372118463893463347812214644
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+DFF667DE344A5741F6DG6512GF54EEAAG7ADD5B47C2ECE3EA65BCGF2DAF3A1522DB36E4BED1FBGC5
+test_00108
+GTTATGCGGCAGACGTTGGACCATTCTCATGGGTTACGACCAGAGTCTTCCGAGAGGAGGACAATAATGCACCGCGAGGT
+MPNIKBMNMLAACINENPCGIGLOJIIGEFDNGKNMEAPDFHDDLCLFCLHNCGJJOOKOBIDNMGFKKLJHIPNKNKGJ
+54575836942247288217273875533685445585252997462285628714665729112645617272744942
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+GAE72E67DF1A7BDEEDA11GDAF7EC61DG737FFG1A6321DGFB6A7CEG4A4ECDDE42C32242C162E364D1
+test_00109
+ACAGGAACTCGCTGATTTATGATCATTACCCCGGAGCACTTTTCGGCCGCTCCCCAGGTGTATGGAGGTAGGAGCCGGAC
+DFJPMENKKGDAOHPKJAKHMIIJNFIFMFOEPGKMFDBNKLEBBMJJBEJKEFDPNOOFGGJMONOFDICHBODEOPEA
+41779545525999827428236638335737542422624918185987996289826469933235235612895362
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+E437B1D635E5G5G55BB1A6C3B24CEBCAF7122CDAEA4GBAF5E17CC1B4BEDAGF4G3G52FB73EG6CEF2A
+test_00110
+TTCACTACCCAGACTGTAATGCGATGATGCTTAGAACAGTGGCACCTTCACGTAGATTCCCTGCTTCCGCGCTTCTAGCC
+NOPDBLPABIJDMGJNHCOALBFECMDPCIFDIJIPCJAHEKIDMLLGODDJNFJGNBOFMDGHEGANBGFIECEIGBMI
+17445965818571397921244929722717422719346885111583136284997576618385219495268322
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+EGEGB2CA71BC6DA2FC22A6A1G73EG2E1AE1D56D2DGG3A37BBB26GD5D2GEFF532BBE1D6AFGG237DCE
+test_00111
+CCTCGGAGGGTCAGGTACTGTTCTAACACTCGCTCATAAACCTGCATCAGCATTGAGTGGCTACGCGGACGTGGCTACCG
+BNNAGNBCHACHJNMFHIHIPOHGFCHDMKOJDGNJDKOPOJNHALAOABDODEGKBHHLEDOMHCDGBGIDJBAPAEPH
+87145178987444389941447159286473817268261727352842126113659246844471996615439196
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+D6B747431G427CB1554D2GE4BBDDF4G1E3D213EG742E2D3412C3E5G7644FAAAD2B42BE52E6E44G3G
+test_00112
+ACGTTCACCCAGTGCGTCATTGCCCTCGAGGTATTTATTTCAGCCTCCTTGAACCGCCGAACCAGTTGATGCGGTCACAC
+DHDLIKIEOHELGKMJGKMLPBLEHAINNIFOFAEGHONHGOIEMMFNOKLKGGJHMJJOPOJPMPDHKMIPEJDPGDFO
+88682915563723595283523445798879583633795821342567584335265978446421546188488139
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+333B53BCCF22GD35411733G2DF2A7BC36DGAAFF5GCA74AF526BD2C3FF1D547731532555C1F31F52A
+test_00113
+TGACATACATACCGACGTGGAGCCGTGTCAATGAGAATCCTCGCCACGGAGCGGCGTGGTCTGGCACGTGTGGATCGGTA
+HHOOCJAHBKFAPELCOBLNIIOAAPAAICGCOCDINFEJECNIOPNKGKEFEIMBNGICPFAABCEFBPPBJEDACMJJ
+18278776969332419593486897595886212745235629666296732518477745757781947943747278
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+3C24A13G44F31BC4264471GFD27AC5G4B75575AAF57A2142DF26GDD7GBBFF376A34724AC6ACBA14D
+test_00114
+AGTCACATCACCTGCTCTACCCAGCTCGATACAAAGCAGTCGTGCTAGCGATCGGAGAAAGGTTAGCTCATACTTCAAAA
+PBLECENKBIIHIDCDLEIGPPCPABLHPOPDNADEOHCKMJDHHCADINKLHNGOEEBNCFFDBGJOBEGADIKAPPEH
+95586744866999559677597115944317563832195845249565676155497921917841365337457686
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+FGF7BGFE614C41ADG5CC5FDFE52EB2EA2514GCGG54A7F4FC234FDD7ECFE2A6GCGFED756CCG5E23G7
+test_00115
+GTAGTTAACCATCCTTGCAAGACAATAACGTGTTCAAGCGACGCAAGTGCACTTAGCGGTTATCGGGCGGCGACAGATAT
+HMDEOFMNEGOBDFPJCKPAFLCNCNIAGAFLNNLMBCONMNMGIGGDDCNLGEENJNJHBAPPEDANNALJNMEEAGPP
+69547681964541923767585198417764595417258765983415245913599294184848656745271975
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+2EB25B1FCG6F7DAB7A16BG54A4526C3F6FE3AAD6FCA3GCDA21FE5D1BB172A7B7B556B2G561C4BFE7
+test_00116
+TCGAGGGATATAAATATGACTGCACAGACGCGAGTACACTAGGCTAGAGGTACGGAACGCACAGGGTACAACTCTCGTAG
+HFNNPPHGJPEBBLKHBFMNHECOHJICLFIANAAPLEIOICOKMHCMMANNACFAFLAAGAGCGGENGMFOEOJLHOAO
+48618892983461767314357437886227114947336285874746635924249319395699863852111881
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+B615G345771GCBED631A57BCBBCF2DDE13717D11215B6E3F71136ABBE7B4774FEC541571FBG5E264
+test_00117
+AGTTGACAACGGGCCTTCCAGCTGTGTACGGCCACTCCGCCTTAAGCGCAGTTCGGCCCTCCGGAGCCTGGAGAGTAAGT
+HINJDEBLOPPEPBCOMPIFMCMJBNGHNJEBBECHDJAEFMKGFIMEDJNPKKBDKMKBAGDOFAACIOPKKFBFABFD
+75429927197311913242114351362783627425356812221437725936799588343323863969595491
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+6D7CFD2353ACC2ADFBGA7FC42EBEB5CA54617EGD327BE4GG5DGB22573DDE16C4E1EAAEEEDC6G7A23
+test_00118
+CAGGGAAGAAAGAAATAAAAGCTAATATACCTAGAGGTGCCAGTGTTTTATCAATAGGGGAGGTACAACTAAATTTAGGC
+OEIKBFLDGAMNNHOCLKDOKPEDFBJHFAGJJFGKOOHNDEGPGMACNDAOOMAAHCDBOPDLHLAMAPBOMDBLBIIB
+94157999869677317961318993889657485584264469629924377677141683172617475842955935
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+1DFDD3BEFEGGF26CA55736FF742CD5A675CB66EG43EDC46GB62A7EBFBAG625E34A31CBG4742ED22C
+test_00119
+AGCCCTCCTTCCCCGGGGTCGCTTCGGAACGGAGAAGGCTCTGCCAGACATCTACGGACAAACACTATTTTTGCATTCGA
+EDMILDEMNPNHGAMHLOJBPIBAEFMPFBJAIMMKCNFEGOGNOBBDMGAFAMOHAHNMAHIDFEACDNNOHEGENIJE
+17451614631236327593188879594634677926397576643134973149112737934489139742693851
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+A4D1CGE3A1GA23GA21G72131F52DACCG17DEFCAG26EF3CD213CG5437CB52D43754GBAF7A72FB2G75
+test_00120
+GTGACGGCCGTAAGGGTTTAAGTTCTCGGATGGCTGAGAGAGTCGGTTAGCAATGGTCATGCAAAGTTTCTCCGCCAATA
+OLIBBPBGNLKCGPOKHHAEHGOPDGBBHFOGOCLNKKCIGOGFMHHIGNBGAGBEOGAFBPKILHBHIEOGDODDODLI
+45171153287639976523954555525264286626678814646812578584779547613519286254436936
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+F6D47B4E1DFG4CGE1D27A5F4F714ADD4FD3E2A5F6GEE6214F2C3DCE7D516F51223DC31F5C213AB12
+test_00121
+GTGATCCTTGAATACCCCCAATGCGGACGAATCCTAGGTTGAGTCCGTATGCTGTATGAACAGTGATTGGGATGTTCCCA
+HMDPMBHAJPJEFOECAONGBBHBCLMHGMIBHHKILPFJKDNEMNGKFBBEACDLEOPINLEONBJNGJNHFIGCHHNM
+13154134637727129225632289979522888647127782482749387728919919731937173467428589
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+4CF6D65D21B5BCG2DGDAC4G16426CF3FB5EBD7EEB1323A241716F3G1C3CGC235GF27GA11446FCDB2
+test_00122
+AGGCCCAGTCAGTCCCACCCATAAAGCCTATGCCCCGACGGACATACTCGCGACACCGGTTTCCAATCACATGTAACCGC
+FJKCPDCGEMJHKPEFHMPPCJEGHJOMCIKEJIMAFIIGFCBAPMFJCILIHLEBBOCJAIGMMDBJBEDIHLAFIBCJ
+16644147722341725851621985327563671168669441299633979576997538884581357423542143
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+53EAD41G23DDGGBA346FF47F7FCG47CE4327D4AD6CGG6BFCG15DBD6FA67A1AEB1CA5643G27GBGCB7
+test_00123
+GCCCGCTCCAGGCTTCGCCGAGTCGTGCCGATTGAGTAAATTGCACACGGTGATCAGGCGATCGGATACTCCTGCTAGGG
+IAFGEBANBJFJCKJPCEPJNJCNLFJKMFCBBMDIFDHBGGPEMDBNEEDNLJNPCJCDLCJLIJLMJFNEILJCMCHL
+49322749656982931631484325253847267164786468395113671924895969865688824123179991
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+AF1A4BD5335E3E6B7DB3DF4AEE1F515AD6C7A2A4C4C2D57DAFA2D34GB73111F5G5C5FCGE1A33C3FC
+test_00124
+ATCGCAACCGTTTCCCCTATCTGGCAATTGAATCCGCGTCCTAAAACGAAAGCTTATCCCTGGCGAGGCACGCTAGGCCT
+CIHNCECANFNLKGCHNOEHJDHADHPAEMMNKGMMMPDOBMOCKNBCMCPHEBEOINHMBMMGCHEMOIOAPEFPDDJP
+32736451148353713169532559587626971677814946924334424648676283848861393812686731
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+4FE5FDD76CC5DE4DC2F25AA2GFBD7BEG326C6D7AB5B71GA67BAFD63AE1A562CDC1C2D157G6EF17CD
diff --git a/screed/tests/test_convert.py b/screed/tests/test_convert.py
new file mode 100644
index 0000000..1e1e6d9
--- /dev/null
+++ b/screed/tests/test_convert.py
@@ -0,0 +1,30 @@
+import test_fasta
+import os
+import screed
+from screed.DBConstants import fileExtension
+
+class Test_fasta_to_fastq(test_fasta.Test_fasta):
+    """
+    Round-trip conversion check.
+
+    Starting from the fasta db, write a fastq file, parse that into a
+    fastq db, write it back out as a fasta file, parse that into a fasta
+    db, and finally run the inherited fasta suite against the result.
+    """
+    def setup(self):
+        here = os.path.dirname(__file__)
+        self._testfa = os.path.join(here, 'test.fa')
+        self._fqName = os.path.join(here, 'fa_to_fq')
+        self._faName = os.path.join(here, 'fq_to_fa')
+
+        # Each step consumes what the previous one produced, so the order
+        # of these calls matters.
+        screed.read_fasta_sequences(self._testfa)   # Fasta file -> fasta db
+        screed.ToFastq(self._testfa, self._fqName)  # Fasta db -> fastq text
+        screed.read_fastq_sequences(self._fqName)   # Fastq file -> fastq db
+        screed.ToFasta(self._fqName, self._faName)  # Fastq db -> fasta text
+        screed.read_fasta_sequences(self._faName)   # Fasta file -> fasta db
+        self.db = screed.ScreedDB(self._faName)
+
+    def teardown(self):
+        # Remove both intermediate text files plus every file that setup
+        # left behind under a name ending in fileExtension.
+        leftovers = (
+            self._fqName,
+            self._fqName + fileExtension,
+            self._faName,
+            self._faName + fileExtension,
+            self._testfa + fileExtension,
+        )
+        for path in leftovers:
+            os.unlink(path)
diff --git a/screed/tests/test_dictionary.py b/screed/tests/test_dictionary.py
new file mode 100644
index 0000000..9164423
--- /dev/null
+++ b/screed/tests/test_dictionary.py
@@ -0,0 +1,94 @@
+import os
+import screed
+from screed.DBConstants import fileExtension
+
+class Test_dict_methods(object):
+    """
+    Make sure that screed returns sensible results for standard dictionary
+    queries.
+    """
+    def setup(self):
+        self._testfa = os.path.join(os.path.dirname(__file__), 'test.fa')
+        screed.read_fasta_sequences(self._testfa)
+        self.db = screed.ScreedDB(self._testfa)
+
+    def teardown(self):
+        os.unlink(self._testfa + fileExtension)
+
+    def test_iter_stuff(self):
+        """keys/values/items agree with their iter* counterparts."""
+        db = self.db
+        assert sorted(db.keys()) == sorted(db.iterkeys())
+        assert sorted(db.values()) == sorted(db.itervalues())
+        assert sorted(db.items()) == sorted(db.iteritems())
+
+    def test_has_key(self):
+        """has_key() and get() agree on present and absent keys."""
+        # Deliberately not named test_contains: two methods with the same
+        # name in one class body means only the last definition survives,
+        # and the has_key/get checks would never be collected.
+        for k in self.db:
+            assert self.db.has_key(k)
+
+        assert self.db.get('FOO') is None
+
+        assert not self.db.has_key('FOO')
+
+    def test_contains(self):
+        """The ``in`` operator agrees with the set of stored keys."""
+        for k in self.db:
+            assert k in self.db
+
+        assert 'FOO' not in self.db
+
+    def test_get(self):
+        """get() and [] return the record whose name is the key."""
+        for k in self.db:
+            record = self.db.get(k)
+            assert record.name == k
+
+            record = self.db[k]
+            assert record.name == k
+
+        try:
+            self.db['FOO']
+        except KeyError:
+            pass
+        else:
+            assert False, "the previous line should raise a KeyError"
+
+    def test_missing(self):
+        """
+        Make sure that unsupported dict attributes are actually missing.
+        """
+        db = self.db
+
+        # (method name, positional arguments used for the probe call)
+        unsupported = (
+            ('clear', ()),
+            ('update', ({},)),
+            ('setdefault', (None,)),
+            ('pop', ()),
+            ('popitem', ()),
+        )
+        for name, args in unsupported:
+            try:
+                getattr(db, name)(*args)
+            except AttributeError:
+                continue
+            assert False, "db.%s should raise an AttributeError" % name
diff --git a/screed/tests/test_fasta.py b/screed/tests/test_fasta.py
new file mode 100644
index 0000000..8de9e1f
--- /dev/null
+++ b/screed/tests/test_fasta.py
@@ -0,0 +1,135 @@
+import screed
+from screed.DBConstants import fileExtension
+import os
+from cStringIO import StringIO
+
+def test_new_record():
+    # Regression test: the record dict was once not reset between
+    # sequence loads, so anyone keeping a handle on the returned
+    # dictionaries ended up with records that were all identical.
+
+    handle = StringIO(">1\nACTG\n>2\nACGG\n")
+
+    parsed = [rec for rec in screed.fasta.fasta_iter(handle)]
+    assert parsed[0]['name'] == '1'
+    assert parsed[1]['name'] == '2'
+
+class Test_fasta(object):
+    """
+    Build a screed db from test.fa and check its size, key lookup, index
+    lookup and iteration helpers against known records.
+
+    Other test classes subclass this one and only replace setup/teardown,
+    so every check here must rely on nothing but ``self.db``.
+    """
+    def setup(self):
+        self._testfa = os.path.join(os.path.dirname(__file__), 'test.fa')
+        screed.read_fasta_sequences(self._testfa)
+        self.db = screed.ScreedDB(self._testfa)
+
+    def teardown(self):
+        os.unlink(self._testfa + fileExtension)
+
+    def test_length(self):
+        assert len(self.db) == 22
+
+    def test_keys(self):
+        for key in self.db:
+            assert key == self.db[key].name
+
+    def test_id_retrieval(self):
+        # Looking a record up by name and by its numeric id must agree.
+        for key in self.db:
+            record = self.db[key]
+            intRcrd = self.db.loadRecordByIndex(record.id)
+            assert record == intRcrd
+
+    def test_contains_front(self):
+        first = self.db[self.db.keys()[0]]
+        assert first.id == 0
+        assert first.name == 'ENSMICT00000012722'
+        assert first.description == (
+            'cdna:pseudogene scaffold:micMur1:'
+            'scaffold_185008:9:424:1 gene:ENSMICG00000012730')
+        assert str(first.sequence).startswith(
+            'TGCAGAAAATATCAAGAGTCAGC'
+            'AGAAAAACTATACAAGGGCTGGT'
+            'ATTTTGATTATTCT')
+
+    def test_contains_middle(self):
+        middle = self.db[self.db.keys()[10]]
+        assert middle.id == 10
+        assert middle.name == 'ENSMICT00000012078'
+        assert middle.description == (
+            'cdna:pseudogene scaffold:micMur1'
+            ':scaffold_180699:3:774:-1 gene:ENSMICG00000012085')
+        assert str(middle.sequence).startswith(
+            'GCGCACTCCCAGTGGCTACCCA'
+            'CGGCAGGAGGCGGCGGCAGTGA'
+            'CTGGGCCGGCGGCCCG')
+
+    def test_contains_end(self):
+        end = self.db[self.db.keys()[21]]
+        assert end.id == 21
+        assert end.name == 'ENSMICT00000003880'
+        assert end.description == (
+            'cdna:novel scaffold:micMur1:scaffol'
+            'd_175819:130:631:1 gene:ENSMICG00000003884')
+        assert str(end.sequence).startswith(
+            'ATGCTGCCTAAGTTTGACCCCAACG'
+            'CGATCAAAGTCATGTACCTGAGGTG'
+            'CACGGGTGGC')
+
+    def test_contains(self):
+        for k in self.db:
+            assert self.db.has_key(k)
+
+        # Identity comparison: a missing key must yield the None object
+        # itself, not merely something that compares equal to it.
+        assert self.db.get('FOO') is None
+
+        assert 'FOO' not in self.db
+
+    def test_iterv(self):
+        entries = [self.db[key] for key in self.db]
+
+        ivalues = list(self.db.itervalues())
+        assert sorted(entries) == sorted(ivalues)
+
+    def test_iteri(self):
+        # ``item_id`` rather than ``id`` so the builtin is not shadowed.
+        for item_id, entry in self.db.iteritems():
+            assert item_id == self.db[entry.name].id
+            assert entry == self.db[entry.name]
+
+class Test_fasta_whitespace(object):
+    """
+    Load test-whitespace.fa and check that a known record is present in
+    the resulting db.
+    """
+    def setup(self):
+        here = os.path.dirname(__file__)
+        self._testfa = os.path.join(here, 'test-whitespace.fa')
+        screed.read_fasta_sequences(self._testfa)
+        self.db = screed.ScreedDB(self._testfa)
+
+    def teardown(self):
+        db_path = self._testfa + fileExtension
+        os.unlink(db_path)
+
+    def test_for_omitted_record(self):
+        wanted = 'ENSMICT00000012401'
+        assert wanted in self.db
+
+def test_writer():
+    # FASTA_Writer.write() on a single record should produce a
+    # '>name description' header line followed by the sequence line.
+    class FakeRecord(object):
+        pass
+
+    record = FakeRecord()
+    record.name = 'foo'
+    record.description = 'bar'
+    record.sequence = 'ATCG'
+
+    sink = StringIO()
+    writer = screed.fasta.FASTA_Writer("", sink)
+    writer.write(record)
+
+    expected = '>foo bar\nATCG\n'
+    assert sink.getvalue() == expected
+
+def test_writer_2():
+    # Same expectation as the single-record write test, but the record is
+    # handed over inside an iterable via FASTA_Writer.consume().
+    class FakeRecord(object):
+        pass
+
+    record = FakeRecord()
+    record.name = 'foo'
+    record.description = 'bar'
+    record.sequence = 'ATCG'
+
+    sink = StringIO()
+    writer = screed.fasta.FASTA_Writer("", sink)
+    writer.consume([record])
+
+    expected = '>foo bar\nATCG\n'
+    assert sink.getvalue() == expected
diff --git a/screed/tests/test_fasta_recover.py b/screed/tests/test_fasta_recover.py
new file mode 100644
index 0000000..27a1f37
--- /dev/null
+++ b/screed/tests/test_fasta_recover.py
@@ -0,0 +1,18 @@
+import test_fasta
+import os
+import screed
+from screed.DBConstants import fileExtension
+
+class test_fa_recover(test_fasta.Test_fasta):
+    """
+    Run the inherited fasta suite against a db rebuilt from a fasta file
+    that screed itself wrote out with ToFasta.
+    """
+    def setup(self):
+        here = os.path.dirname(__file__)
+        self._fileName = os.path.join(here, 'fastaRecovery')
+        self._testfa = os.path.join(here, 'test.fa')
+
+        screed.read_fasta_sequences(self._testfa)
+        screed.ToFasta(self._testfa, self._fileName)
+        screed.read_fasta_sequences(self._fileName)
+        self.db = screed.ScreedDB(self._fileName)
+
+    def teardown(self):
+        # The recovered text file, then the two files that setup created
+        # under names ending in fileExtension.
+        for path in (self._fileName,
+                     self._fileName + fileExtension,
+                     self._testfa + fileExtension):
+            os.unlink(path)
diff --git a/screed/tests/test_fastq.py b/screed/tests/test_fastq.py
new file mode 100644
index 0000000..4298a1b
--- /dev/null
+++ b/screed/tests/test_fastq.py
@@ -0,0 +1,144 @@
+import screed
+from screed.DBConstants import fileExtension
+import os
+from cStringIO import StringIO
+
+def test_new_record():
+    # test for a bug where the record dict was not reset after each
+    # sequence load, leading to all records being identical if you
+    # kept a handle on the returned dictionary.
+
+    # Both records must start with '@'; the second header is '@2'.
+    s = StringIO("@1\nACTG\n+\nAAAA\n@2\nACGG\n+\nAAAA\n")
+
+    records = list(screed.fastq.fastq_iter(s))
+    assert records[0]['name'] == '1'
+    assert records[1]['name'] == '2'
+
+def test_parse_description_true():
+    # With parse_description=True the header '@1 FOO' must yield the
+    # name '1' only, i.e. the text after the first space is not part of
+    # the record name.
+
+    s = StringIO("@1 FOO\nACTG\n+\nAAAA\n@2\nACGG\n+\nAAAA\n")
+
+    records = list(screed.fastq.fastq_iter(s, parse_description=True))
+    assert records[0]['name'] == '1'
+    assert records[1]['name'] == '2'
+
+    # also is default behavior
+    s = StringIO("@1 FOO\nACTG\n+\nAAAA\n@2\nACGG\n+\nAAAA\n")
+
+    records = list(screed.fastq.fastq_iter(s))
+    assert records[0]['name'] == '1'
+    assert records[1]['name'] == '2'
+
+def test_parse_description_false():
+    # With parse_description=False the whole header line after '@' is
+    # kept as the record name, so '@1 FOO' yields the name '1 FOO'.
+
+    s = StringIO("@1 FOO\nACTG\n+\nAAAA\n@2\nACGG\n+\nAAAA\n")
+
+    records = list(screed.fastq.fastq_iter(s, parse_description=False))
+    assert records[0]['name'] == '1 FOO'
+    assert records[1]['name'] == '2'
+
+class Test_fastq(object):
+    """
+    Build a screed db from test.fastq and check its size, key lookup,
+    index lookup and iteration helpers against known records.
+
+    Other test classes subclass this one and only replace setup/teardown,
+    so every check here must rely on nothing but ``self.db``.
+    """
+    def setup(self):
+        self._testfq = os.path.join(os.path.dirname(__file__), 'test.fastq')
+        screed.read_fastq_sequences(self._testfq)
+        self.db = screed.ScreedDB(self._testfq)
+
+    def teardown(self):
+        os.unlink(self._testfq + fileExtension)
+
+    def test_length(self):
+        assert len(self.db) == 125
+
+    def test_keys(self):
+        for key in self.db:
+            assert key == self.db[key].name
+
+    def test_id_retrieval(self):
+        # Looking a record up by name and by its numeric id must agree.
+        for key in self.db:
+            record = self.db[key]
+            intRcrd = self.db.loadRecordByIndex(record.id)
+            assert record == intRcrd
+
+    def test_contains_front(self):
+        first = self.db[self.db.keys()[0]]
+        assert first.id == 0
+        assert first.name == 'HWI-EAS_4_PE-FC20GCB:2:1:492:573/2'
+        assert first.sequence == 'ACAGCAAAATTGTGATTGAGGATGAAGAACTGCTGT'
+        assert first.accuracy == 'AA7AAA3+AAAAAA.AAA.;7;AA;;;;*;<1;<<<'
+
+    def test_contains_middle(self):
+        middle = self.db[self.db.keys()[62]]
+        assert middle.id == 62
+        assert middle.name == 'HWI-EAS_4_PE-FC20GCB:2:1:245:483/2'
+        assert middle.sequence == 'TGTCGAGCAAAGCAAAACAGGCGTAAAAATTGCCAT'
+        assert middle.accuracy == 'AAAAAAAAAAAAAAAAAAAAA>AAAAAAAA?9>6><'
+
+    def test_contains_end(self):
+        end = self.db[self.db.keys()[124]]
+        assert end.id == 124
+        assert end.name == 'HWI-EAS_4_PE-FC20GCB:2:1:350:588/2'
+        assert end.sequence == 'GGTACAAAATAGATGCTGGACTCTCCGAATCCTATA'
+        assert end.accuracy == ';?5AAAAAAAAAA?A??;?AA;AAA>AAAA?4?844'
+
+    def test_contains(self):
+        for k in self.db:
+            assert self.db.has_key(k)
+
+        # Identity comparison: a missing key must yield the None object
+        # itself, not merely something that compares equal to it.
+        assert self.db.get('FOO') is None
+
+        assert 'FOO' not in self.db
+
+    def test_iterv(self):
+        entries = [self.db[key] for key in self.db]
+
+        ivalues = list(self.db.itervalues())
+        assert sorted(entries) == sorted(ivalues)
+
+    def test_iteri(self):
+        # ``item_id`` rather than ``id`` so the builtin is not shadowed.
+        for item_id, entry in self.db.iteritems():
+            assert item_id == self.db[entry.name].id
+            assert entry == self.db[entry.name]
+
+def test_writer():
+    # FASTQ_Writer.write() on a single record should produce the four
+    # lines '@name description', sequence, '+', accuracy.
+    class FakeRecord(object):
+        pass
+
+    record = FakeRecord()
+    record.name = 'foo'
+    record.description = 'bar'
+    record.sequence = 'ATCG'
+    record.accuracy = '####'
+
+    sink = StringIO()
+    writer = screed.fastq.FASTQ_Writer("", sink)
+    writer.write(record)
+
+    expected = '@foo bar\nATCG\n+\n####\n'
+    assert sink.getvalue() == expected
+
+def test_writer_2():
+    # Same expectation as the single-record write test, but the record is
+    # handed over inside an iterable via FASTQ_Writer.consume().
+    class FakeRecord(object):
+        pass
+
+    record = FakeRecord()
+    record.name = 'foo'
+    record.description = 'bar'
+    record.sequence = 'ATCG'
+    record.accuracy = '####'
+
+    sink = StringIO()
+    writer = screed.fastq.FASTQ_Writer("", sink)
+    writer.consume([record])
+
+    expected = '@foo bar\nATCG\n+\n####\n'
+    assert sink.getvalue() == expected
diff --git a/screed/tests/test_fastq_recover.py b/screed/tests/test_fastq_recover.py
new file mode 100644
index 0000000..a3a7b3c
--- /dev/null
+++ b/screed/tests/test_fastq_recover.py
@@ -0,0 +1,19 @@
+import test_fastq
+import os
+import screed
+from screed.DBConstants import fileExtension
+
+class test_fq_recover(test_fastq.Test_fastq):
+ def setup(self):
+ thisdir = os.path.dirname(__file__)
+ self._fileName = os.path.join(thisdir, 'fastqRecovery')
+ self._testfq = os.path.join(thisdir, 'test.fastq')
+ screed.read_fastq_sequences(self._testfq)
+ screed.ToFastq(self._testfq, self._fileName)
+ screed.read_fastq_sequences(self._fileName)
+ self.db = screed.ScreedDB(self._fileName)
+
+ def teardown(self):
+ os.unlink(self._fileName)
+ os.unlink(self._fileName + fileExtension)
+ os.unlink(self._testfq + fileExtension)
diff --git a/screed/tests/test_hava_methods.py b/screed/tests/test_hava_methods.py
new file mode 100644
index 0000000..81bccdf
--- /dev/null
+++ b/screed/tests/test_hava_methods.py
@@ -0,0 +1,61 @@
+import screed
+import screed.seqparse
+from screed.DBConstants import fileExtension
+import os
+
+testha = os.path.join(os.path.dirname(__file__), 'test.hava')
+
+class test_hava(object):
+ def setup(self):
+ screed.seqparse.read_hava_sequences(testha)
+ self._db = screed.ScreedDB(testha)
+
+ def teardown(self):
+ b = 7
+ #os.unlink(testha + fileExtension)
+
+ def test_contains(self):
+ assert 'test_006' in self._db
+
+ def test_beginning_key_retrieval(self):
+ result = self._db['test_000']
+ assert result.hava == 'test_000'
+ assert result.quarzk == 'ACGGTGACGGTCACCGTCGACGGCCCAAGCCCATCGAACG'\
+ 'TACCACCCCCACCTATCGTCACGCTGGTGGAGAGCCAATG'
+ assert result.muchalo == 'AFPPCLHBCCILGMMOCHKNNDBKCCPNHAMKJOCCDJA'\
+ 'OEPNMHFHCBAJOKEMMMBHCPHIOAEPFFCAOJPGIMKGK'
+ assert result.fakours == '218583165871861127719451483455294521865'\
+ '68176931571171542294878855181415261425688'
+ assert result.selimizicka == 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'\
+ 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'
+ assert result.marshoon == 'C7AF246AC7AAEABE5A557FCBC6FD5F5263BCDE'\
+ '4E745BEF1GG7DD1AB511GBC63A4GF1F4E1A154B35D'
+
+ def test_middle_key_retrieval(self):
+ result = self._db['test_0063']
+ assert result.hava == 'test_0063'
+ assert result.quarzk == 'CAACACGATCAAGTTTGGTAAGAATTCCGCCTTAAGCTTT'\
+ 'CTAGAACGATAGTTGCCCCCAATCTGGTTCGAAATCTCTT'
+ assert result.muchalo == 'GMDAPLMOOFANDHHMLBPIKGHIAFFFOABFMNNJNIJ'\
+ 'ILEEFEPOCAJLNDLIFBPMGKOFJIEFAHNJPIOFAJMLM'
+ assert result.fakours == '392363971393898522756138876485334274384'\
+ '39122136418369146118333919885587613673488'
+ assert result.selimizicka == 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'\
+ 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'
+ assert result.marshoon == 'FC25E2CFC2BAFA7A2AA4757F3GFFFEE37G7752'\
+ 'FCDBAEADBA1AC7374FB5C15552E6E2GG6GFF62C6GE'
+
+ def test_end_key_retrieval(self):
+ result = self._db['test_00124']
+ assert result.hava == 'test_00124'
+ assert result.quarzk == 'ATCGCAACCGTTTCCCCTATCTGGCAATTGAATCCGCGTC'\
+ 'CTAAAACGAAAGCTTATCCCTGGCGAGGCACGCTAGGCCT'
+ assert result.muchalo == 'CIHNCECANFNLKGCHNOEHJDHADHPAEMMNKGMMMPD'\
+ 'OBMOCKNBCMCPHEBEOINHMBMMGCHEMOIOAPEFPDDJP'
+ assert result.fakours == '327364511483537131695325595876269716778'\
+ '14946924334424648676283848861393812686731'
+ assert result.selimizicka == 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'\
+ 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'
+ assert result.marshoon == '4FE5FDD76CC5DE4DC2F25AA2GFBD7BEG326C6D'\
+ '7AB5B71GA67BAFD63AE1A562CDC1C2D157G6EF17CD'
+
diff --git a/screed/tests/test_nodb.py b/screed/tests/test_nodb.py
new file mode 100644
index 0000000..198697e
--- /dev/null
+++ b/screed/tests/test_nodb.py
@@ -0,0 +1,31 @@
+import screed
+import os
+from screed.DBConstants import fileExtension
+
+def test_nodb():
+ """
+ Tests if screed throws an appropriate exception if it is
+ asked to open a non-existent screed database
+ """
+ try:
+ db = screed.ScreedDB('foo')
+ assert 1 == 0 # Previous line should throw an error
+ except ValueError:
+ pass
+
+def test_wrongdb():
+ """
+ Tests if screed throws an appropriate exception if it is
+ asked to open a file that isn't a screed database
+ """
+ try:
+ blah = 'blah_screed'
+ blah_file = open(blah, 'wb')
+ blah_file.close()
+
+ db = screed.ScreedDB(blah)
+ os.unlink(blah)
+ assert 1 == 0
+ except TypeError:
+ os.unlink(blah)
+ pass
diff --git a/screed/tests/test_open.py b/screed/tests/test_open.py
new file mode 100644
index 0000000..d2c5de9
--- /dev/null
+++ b/screed/tests/test_open.py
@@ -0,0 +1,62 @@
+import os.path
+
+import screed, screed.openscreed
+
+def test_empty_open():
+ filename = os.path.join(os.path.dirname(__file__), 'empty.fa')
+
+ assert len(list(iter(screed.open(filename)))) == 0
+
+def test_simple_open():
+ filename = os.path.join(os.path.dirname(__file__), 'test.fa')
+
+ n = -1
+ for n, record in enumerate(screed.open(filename)):
+ assert record.name == 'ENSMICT00000012722'
+ break
+ assert n == 0, n
+
+def test_simple_open_fq():
+ filename = os.path.join(os.path.dirname(__file__), 'test.fastq')
+
+ n = -1
+ for n, record in enumerate(screed.open(filename)):
+ assert record.name == 'HWI-EAS_4_PE-FC20GCB:2:1:492:573/2'
+ break
+ assert n == 0
+
+def test_gz_open():
+ filename1 = os.path.join(os.path.dirname(__file__), 'test.fa')
+ filename2 = os.path.join(os.path.dirname(__file__), 'test.fa.gz')
+ for n, (r1, r2) in enumerate(zip(screed.open(filename1),
+ screed.open(filename2))):
+ assert r1.name == r2.name
+
+ assert n > 0
+
+def test_bz2_open():
+ filename1 = os.path.join(os.path.dirname(__file__), 'test.fa')
+ filename2 = os.path.join(os.path.dirname(__file__), 'test.fa.bz2')
+ for n, (r1, r2) in enumerate(zip(screed.open(filename1),
+ screed.open(filename2))):
+ assert r1.name == r2.name
+
+ assert n > 0
+
+def test_gz_open_fastq():
+ filename1 = os.path.join(os.path.dirname(__file__), 'test.fastq')
+ filename2 = os.path.join(os.path.dirname(__file__), 'test.fastq.gz')
+ for n, (r1, r2) in enumerate(zip(screed.open(filename1),
+ screed.open(filename2))):
+ assert r1.name == r2.name
+
+ assert n > 0
+
+def test_get_writer_class_fasta():
+ import screed.fasta
+
+ filename = os.path.join(os.path.dirname(__file__), 'test.fa')
+
+ read_iter = screed.open(filename)
+ x = screed.openscreed.get_writer_class(read_iter)
+ assert x is screed.fasta.FASTA_Writer, x
diff --git a/screed/tests/test_pygr_api.py b/screed/tests/test_pygr_api.py
new file mode 100644
index 0000000..ba1b1a7
--- /dev/null
+++ b/screed/tests/test_pygr_api.py
@@ -0,0 +1,93 @@
+"""
+Test the pygr API.
+"""
+
+try:
+ import pygr
+except ImportError:
+ import nose
+ raise nose.SkipTest, "pygr is required for these tests"
+
+import screed
+from screed.DBConstants import fileExtension
+from screed.pygr_api import ScreedSequenceDB, ScreedSequenceDB_ByIndex
+from pickle import dump, load
+from cStringIO import StringIO
+import os
+
+testfa = os.path.join(os.path.dirname(__file__), 'test.fa')
+
+def setup():
+ screed.read_fasta_sequences(testfa)
+
+def teardown():
+ os.unlink(testfa + fileExtension)
+
+def test_name_iterator_methods():
+ db = ScreedSequenceDB(testfa)
+
+ # test the various iterator methods for equal results from db
+ a = sorted([ (x, db[x]) for x in db ])
+ b = sorted([ i for i in db.iteritems() ])
+ c = sorted([ (v.name, v) for v in db.itervalues() ])
+
+ assert a == b
+ assert a == c
+
+def test_index_iterator_methods():
+ db = ScreedSequenceDB_ByIndex(testfa)
+
+ # test the various iterator methods for equal results from db
+ m = sorted([ (x, db[x]) for x in db ])
+ n = sorted([ i for i in db.iteritems() ])
+ o = sorted([ (v.record.id, v) for v in db.itervalues() ])
+
+ assert m == n
+ assert m == o, (m, o)
+
+def test_name_index_equality():
+ db1 = ScreedSequenceDB(testfa)
+ db2 = ScreedSequenceDB_ByIndex(testfa)
+
+ # must use something other than the obj itself for comparison...
+ v1 = sorted([ (v.name, v.seq) for v in db1.itervalues() ])
+ v2 = sorted([ (v.name, v.seq) for v in db2.itervalues() ])
+ assert v1 == v2, (v1, v2)
+
+def test_seqinfodict_by_name():
+ db1 = ScreedSequenceDB(testfa)
+ sd = db1.seqInfoDict
+
+ m = sorted([ y.id for (x, y) in sd.iteritems() ])
+ n = sorted([ x.id for x in sd.itervalues() ])
+
+ assert m == n, (m, n)
+
+def test_seqinfodict_by_index():
+ db1 = ScreedSequenceDB_ByIndex(testfa)
+ sd = db1.seqInfoDict
+
+ m = sorted([ x for (x, y) in sd.iteritems() ])
+ n = sorted([ x for x in sd.iterkeys() ])
+
+ assert m == n, (m, n)
+
+def test_pickle_ByName():
+ db = ScreedSequenceDB(testfa)
+ ofp = StringIO()
+
+ dump(db, ofp)
+
+ ifp = StringIO(ofp.getvalue())
+ db2 = load(ifp)
+ assert db.filepath == db2.filepath
+
+def test_pickle_ByIndex():
+ db = ScreedSequenceDB_ByIndex(testfa)
+ ofp = StringIO()
+
+ dump(db, ofp)
+
+ ifp = StringIO(ofp.getvalue())
+ db2 = load(ifp)
+ assert db.filepath == db2.filepath
diff --git a/screed/tests/test_shell.py b/screed/tests/test_shell.py
new file mode 100644
index 0000000..a653a73
--- /dev/null
+++ b/screed/tests/test_shell.py
@@ -0,0 +1,39 @@
+import test_fasta
+import test_fastq
+import os
+import subprocess
+import screed
+from screed.DBConstants import fileExtension
+
+class Test_fa_shell(test_fasta.Test_fasta):
+ """
+ Tests the functionality of the script 'fadbm' in creating a
+ screed database correctly from the shell
+ """
+ def setup(self):
+ thisdir = os.path.dirname(__file__)
+ self._testfa = os.path.join(thisdir, 'test.fa')
+ fadbm = os.path.join(thisdir, '..', 'fadbm.py')
+ subprocess.check_call(['python', fadbm, self._testfa],
+ stdout=subprocess.PIPE)
+ self.db = screed.ScreedDB(self._testfa)
+
+ def teardown(self):
+ os.unlink(self._testfa + fileExtension)
+
+class Test_fq_shell(test_fastq.Test_fastq):
+ """
+ Tests the functionality of the script 'fqdbm' in creating a
+ screed database correctly from the shell
+ """
+ def setup(self):
+ thisdir = os.path.dirname(__file__)
+ self._testfq = os.path.join(thisdir, 'test.fastq')
+
+ fqdbm = os.path.join(thisdir, '..', 'fqdbm.py')
+ subprocess.check_call(['python', fqdbm, self._testfq],
+ stdout=subprocess.PIPE)
+ self.db = screed.ScreedDB(self._testfq)
+
+ def teardown(self):
+ os.unlink(self._testfq + fileExtension)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..5cef938
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python
+try:
+ from setuptools import setup
+except ImportError:
+ print '(WARNING: importing distutils, not setuptools!)'
+ from distutils.core import setup
+
+setup(name='screed',
+ version='0.7.1',
+ description='A short read database',
+ author='Alex Nolley, C. Titus Brown',
+ author_email='ctb at msu.edu',
+ url='http://github.com/ged-lab/screed/',
+ packages=['screed', 'screed.tests'],
+ package_data={'screed.tests': ['test.*', 'test-whitespace.fa']},
+ license='BSD',
+ test_suite = 'nose.collector'
+ )
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..ca5a7d4
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,12 @@
+[tox]
+envlist = py27, py33
+
+[testenv]
+commands = nosetests --with-xcoverage --with-xunit --cover-package=screed --cover-erase
+deps =
+ nosexcover
+ pygr
+
+[testenv:py33]
+deps =
+ nosexcover
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/python-screed.git
More information about the debian-med-commit
mailing list