[med-svn] [obitools] 07/09: New upstream version 1.2.11
Andreas Tille
tille at debian.org
Wed Dec 13 18:18:17 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository obitools.
commit 207ebbe264e5bb517f8361959a3afffa9da4db0e
Author: Andreas Tille <tille at debian.org>
Date: Wed Dec 13 19:17:12 2017 +0100
New upstream version 1.2.11
---
MANIFEST.in | 10 +
PKG-INFO | 19 +
README.txt | 0
debian/changelog | 9 -
debian/compat | 1 -
debian/control | 22 -
debian/copyright | 561 -------
debian/docs | 3 -
debian/obitools.doc-base | 9 -
debian/obitools.install | 1 -
debian/obitools.links | 3 -
debian/obitools.manpages | 1 -
debian/patches/fix_path_interpreter | 315 ----
debian/patches/series | 2 -
debian/patches/use_debian_libs | 82 -
debian/rules | 35 -
debian/source/format | 1 -
debian/watch | 3 -
distutils.ext/obidistutils/__init__.py | 0
distutils.ext/obidistutils/command/__init__.py | 7 +
distutils.ext/obidistutils/command/build.py | 47 +
distutils.ext/obidistutils/command/build_cexe.py | 72 +
distutils.ext/obidistutils/command/build_ctools.py | 60 +
distutils.ext/obidistutils/command/build_exe.py | 213 +++
distutils.ext/obidistutils/command/build_ext.py | 114 ++
distutils.ext/obidistutils/command/build_files.py | 58 +
.../obidistutils/command/build_filters.py | 10 +
.../obidistutils/command/build_scripts.py | 102 ++
distutils.ext/obidistutils/command/build_sphinx.py | 38 +
distutils.ext/obidistutils/command/install.py | 25 +
.../obidistutils/command/install_scripts.py | 79 +
.../obidistutils/command/install_sphinx.py | 46 +
distutils.ext/obidistutils/command/littlebigman.py | 62 +
distutils.ext/obidistutils/command/pidname.py | 53 +
distutils.ext/obidistutils/command/sdist.py | 46 +
distutils.ext/obidistutils/core.py | 205 +++
distutils.ext/obidistutils/dist.py | 48 +
distutils.ext/obidistutils/serenity/__init__.py | 117 ++
.../obidistutils/serenity/checkpackage.py | 184 +++
distutils.ext/obidistutils/serenity/checkpip.py | 82 +
distutils.ext/obidistutils/serenity/checkpython.py | 170 +++
distutils.ext/obidistutils/serenity/checksystem.py | 19 +
distutils.ext/obidistutils/serenity/getcython.py | 72 +
distutils.ext/obidistutils/serenity/globals.py | 15 +
distutils.ext/obidistutils/serenity/rerun.py | 60 +
distutils.ext/obidistutils/serenity/snake.py | 35 +
distutils.ext/obidistutils/serenity/util.py | 27 +
distutils.ext/obidistutils/serenity/virtual.py | 133 ++
distutils.ext/src/littlebigman.c | 24 +
distutils.ext/src/pidname.c | 24 +
doc/sphinx/Makefile | 100 ++
doc/sphinx/make.bat | 113 ++
doc/sphinx/source/annotations.rst | 11 +
doc/sphinx/source/attributes.rst | 128 ++
doc/sphinx/source/attributes/ali_dir.rst | 9 +
doc/sphinx/source/attributes/ali_length.rst | 9 +
doc/sphinx/source/attributes/avg_quality.rst | 18 +
doc/sphinx/source/attributes/best_identity.rst | 11 +
doc/sphinx/source/attributes/best_match.rst | 11 +
doc/sphinx/source/attributes/class.rst | 11 +
doc/sphinx/source/attributes/cluster.rst | 8 +
doc/sphinx/source/attributes/complemented.rst | 9 +
doc/sphinx/source/attributes/count.rst | 16 +
doc/sphinx/source/attributes/cut.rst | 11 +
doc/sphinx/source/attributes/direction.rst | 8 +
doc/sphinx/source/attributes/distance.rst | 11 +
doc/sphinx/source/attributes/error.rst | 11 +
doc/sphinx/source/attributes/experiment.rst | 9 +
doc/sphinx/source/attributes/family.rst | 30 +
doc/sphinx/source/attributes/family_name.rst | 29 +
doc/sphinx/source/attributes/forward_error.rst | 13 +
doc/sphinx/source/attributes/forward_match.rst | 15 +
doc/sphinx/source/attributes/forward_primer.rst | 12 +
doc/sphinx/source/attributes/forward_score.rst | 11 +
doc/sphinx/source/attributes/forward_tag.rst | 14 +
doc/sphinx/source/attributes/forward_tm.rst | 13 +
doc/sphinx/source/attributes/genus.rst | 33 +
doc/sphinx/source/attributes/genus_name.rst | 29 +
doc/sphinx/source/attributes/head_quality.rst | 18 +
doc/sphinx/source/attributes/id_status.rst | 8 +
doc/sphinx/source/attributes/merged.rst | 9 +
doc/sphinx/source/attributes/merged_star.rst | 14 +
doc/sphinx/source/attributes/mid_quality.rst | 19 +
doc/sphinx/source/attributes/mode.rst | 11 +
doc/sphinx/source/attributes/obiclean_cluster.rst | 11 +
doc/sphinx/source/attributes/obiclean_count.rst | 13 +
doc/sphinx/source/attributes/obiclean_head.rst | 12 +
.../source/attributes/obiclean_headcount.rst | 14 +
.../source/attributes/obiclean_internalcount.rst | 14 +
.../source/attributes/obiclean_samplecount.rst | 13 +
.../source/attributes/obiclean_singletoncount.rst | 14 +
doc/sphinx/source/attributes/obiclean_status.rst | 14 +
doc/sphinx/source/attributes/occurrence.rst | 9 +
doc/sphinx/source/attributes/order.rst | 30 +
doc/sphinx/source/attributes/order_name.rst | 29 +
doc/sphinx/source/attributes/pairend_limit.rst | 13 +
doc/sphinx/source/attributes/partial.rst | 11 +
doc/sphinx/source/attributes/rank.rst | 15 +
doc/sphinx/source/attributes/reverse_error.rst | 13 +
doc/sphinx/source/attributes/reverse_match.rst | 15 +
doc/sphinx/source/attributes/reverse_primer.rst | 13 +
doc/sphinx/source/attributes/reverse_score.rst | 11 +
doc/sphinx/source/attributes/reverse_tag.rst | 13 +
doc/sphinx/source/attributes/reverse_tm.rst | 13 +
doc/sphinx/source/attributes/sample.rst | 10 +
doc/sphinx/source/attributes/scientific_name.rst | 28 +
doc/sphinx/source/attributes/score.rst | 8 +
doc/sphinx/source/attributes/score_norm.rst | 8 +
doc/sphinx/source/attributes/select.rst | 11 +
doc/sphinx/source/attributes/seq_a_deletion.rst | 8 +
doc/sphinx/source/attributes/seq_a_insertion.rst | 8 +
doc/sphinx/source/attributes/seq_a_mismatch.rst | 8 +
doc/sphinx/source/attributes/seq_a_single.rst | 9 +
doc/sphinx/source/attributes/seq_ab_match.rst | 7 +
doc/sphinx/source/attributes/seq_b_deletion.rst | 8 +
doc/sphinx/source/attributes/seq_b_insertion.rst | 8 +
doc/sphinx/source/attributes/seq_b_mismatch.rst | 8 +
doc/sphinx/source/attributes/seq_b_single.rst | 9 +
doc/sphinx/source/attributes/seq_length.rst | 10 +
doc/sphinx/source/attributes/seq_length_ori.rst | 9 +
doc/sphinx/source/attributes/seq_rank.rst | 9 +
doc/sphinx/source/attributes/sminL.rst | 8 +
doc/sphinx/source/attributes/sminR.rst | 8 +
doc/sphinx/source/attributes/species.rst | 32 +
doc/sphinx/source/attributes/species_list.rst | 12 +
doc/sphinx/source/attributes/species_name.rst | 31 +
doc/sphinx/source/attributes/status.rst | 10 +
doc/sphinx/source/attributes/strand.rst | 10 +
doc/sphinx/source/attributes/tail_quality.rst | 18 +
doc/sphinx/source/attributes/taxid.rst | 30 +
doc/sphinx/source/barcodes.rst | 11 +
doc/sphinx/source/conf.py | 262 ++++
doc/sphinx/source/conversions.rst | 11 +
doc/sphinx/source/embl.rst | 2 +
doc/sphinx/source/fasta.rst | 47 +
doc/sphinx/source/fastq.rst | 163 ++
doc/sphinx/source/filtering.rst | 14 +
doc/sphinx/source/formats.rst | 59 +
doc/sphinx/source/genbank.rst | 2 +
doc/sphinx/source/index.rst | 23 +
doc/sphinx/source/introduction.rst | 8 +
doc/sphinx/source/iupac.rst | 63 +
doc/sphinx/source/manipulations.rst | 15 +
doc/sphinx/source/obitaxonomy.rst | 26 +
doc/sphinx/source/optionsSet/defaultoptions.txt | 13 +
doc/sphinx/source/optionsSet/inputformat.txt | 79 +
doc/sphinx/source/optionsSet/outputformat.txt | 30 +
doc/sphinx/source/optionsSet/sequenceEdit.txt | 83 +
doc/sphinx/source/optionsSet/sequenceFilter.txt | 174 +++
doc/sphinx/source/optionsSet/taxonomyDB.txt | 13 +
doc/sphinx/source/optionsSet/taxonomyFilter.txt | 14 +
doc/sphinx/source/scripts.rst | 13 +
doc/sphinx/source/scripts/ecoPCR.rst | 193 +++
doc/sphinx/source/scripts/ecoPrimers.rst | 253 ++++
doc/sphinx/source/scripts/ecodbtaxstat.rst | 49 +
doc/sphinx/source/scripts/ecofind.rst | 85 ++
doc/sphinx/source/scripts/ecotag.rst | 91 ++
doc/sphinx/source/scripts/ecotaxspecificity.rst | 31 +
doc/sphinx/source/scripts/ecotaxstat.rst | 21 +
doc/sphinx/source/scripts/illuminapairedend.rst | 61 +
doc/sphinx/source/scripts/ngsfilter.rst | 54 +
doc/sphinx/source/scripts/obiaddtaxids.rst | 57 +
doc/sphinx/source/scripts/obiannotate.rst | 36 +
doc/sphinx/source/scripts/obiclean.rst | 63 +
doc/sphinx/source/scripts/obicomplement.rst | 7 +
doc/sphinx/source/scripts/obiconvert.rst | 21 +
doc/sphinx/source/scripts/obicount.rst | 42 +
doc/sphinx/source/scripts/obicut.rst | 21 +
doc/sphinx/source/scripts/obidistribute.rst | 20 +
doc/sphinx/source/scripts/obiextract.rst | 61 +
doc/sphinx/source/scripts/obigrep.rst | 11 +
doc/sphinx/source/scripts/obihead.rst | 14 +
doc/sphinx/source/scripts/obijoinpairedend.rst | 15 +
doc/sphinx/source/scripts/obipr2.rst | 22 +
doc/sphinx/source/scripts/obisample.rst | 67 +
doc/sphinx/source/scripts/obiselect.rst | 126 ++
doc/sphinx/source/scripts/obisilva.rst | 71 +
doc/sphinx/source/scripts/obisort.rst | 35 +
doc/sphinx/source/scripts/obisplit.rst | 21 +
doc/sphinx/source/scripts/obistat.rst | 95 ++
doc/sphinx/source/scripts/obisubset.rst | 81 +
doc/sphinx/source/scripts/obitab.rst | 48 +
doc/sphinx/source/scripts/obitail.rst | 14 +
doc/sphinx/source/scripts/obitaxonomy.rst | 160 ++
doc/sphinx/source/scripts/obiuniq.rst | 79 +
doc/sphinx/source/scripts/oligotag.rst | 134 ++
doc/sphinx/source/statistics.rst | 9 +
doc/sphinx/source/taxdump.rst | 2 +
doc/sphinx/source/tutorials.rst | 12 +
doc/sphinx/source/utilities.rst | 13 +
doc/sphinx/source/welcome.rst | 405 +++++
doc/sphinx/source/wolves.rst | 648 ++++++++
doc/sphinx/sphinxext/apigen.py | 427 ++++++
doc/sphinx/sphinxext/docscrape.py | 497 ++++++
doc/sphinx/sphinxext/docscrape_sphinx.py | 136 ++
.../sphinxext/ipython_console_highlighting.py | 114 ++
doc/sphinx/sphinxext/numpydoc.py | 116 ++
requirements.txt | 6 +
setup.cfg | 5 +
setup.py | 70 +
src/OBITools.egg-info/PKG-INFO | 19 +
src/OBITools.egg-info/SOURCES.txt | 591 ++++++++
src/OBITools.egg-info/dependency_links.txt | 1 +
src/OBITools.egg-info/not-zip-safe | 1 +
src/OBITools.egg-info/requires.txt | 5 +
src/OBITools.egg-info/top_level.txt | 1 +
src/ali2consensus.py | 111 ++
src/ecodbtaxstat.py | 76 +
src/ecotag.py | 460 ++++++
src/ecotaxspecificity.py | 239 +++
src/ecotaxstat.py | 109 ++
src/extractreads.py | 243 +++
src/extractreads2.py | 119 ++
src/illuminapairedend.py | 280 ++++
src/ngsfilter.py | 458 ++++++
src/obiaddtaxids.py | 424 ++++++
src/obiannotate.py | 85 ++
src/obiclean.py | 416 +++++
src/obicomplement.py | 62 +
src/obiconvert.py | 54 +
src/obicount.py | 59 +
src/obicut.py | 53 +
src/obidistribute.py | 140 ++
src/obiextract.py | 81 +
src/obigrep.py | 45 +
src/obihead.py | 57 +
src/obijoinpairedend.py | 134 ++
src/obipr2.py | 302 ++++
src/obisample.py | 119 ++
src/obiselect.py | 281 ++++
src/obisilva.py | 355 +++++
src/obisort.py | 61 +
src/obisplit.py | 135 ++
src/obistat.py | 221 +++
src/obisubset.py | 116 ++
src/obitab.py | 178 +++
src/obitail.py | 54 +
src/obitaxonomy.py | 350 +++++
src/obitools/SVGdraw.py | 1054 +++++++++++++
src/obitools/__init__.py | 57 +
src/obitools/_obitools.h | 25 +
src/obitools/_obitools.pxd | 109 ++
src/obitools/_obitools.pyx | 800 ++++++++++
src/obitools/align/__init__.py | 15 +
src/obitools/align/_assemble.pxd | 10 +
src/obitools/align/_assemble.pyx | 169 +++
src/obitools/align/_codonnws.pxd | 15 +
src/obitools/align/_codonnws.pyx | 1589 ++++++++++++++++++++
src/obitools/align/_dynamic.pxd | 90 ++
src/obitools/align/_dynamic.pyx | 365 +++++
src/obitools/align/_freeendgap.pxd | 9 +
src/obitools/align/_freeendgap.pyx | 161 ++
src/obitools/align/_freeendgapfm.pxd | 5 +
src/obitools/align/_freeendgapfm.pyx | 19 +
src/obitools/align/_gprofilenws.pxd | 8 +
src/obitools/align/_gprofilenws.pyx | 167 ++
src/obitools/align/_lcs.cfiles | 1 +
src/obitools/align/_lcs.ext.1.c | 168 +++
src/obitools/align/_lcs.ext.2.c | 34 +
src/obitools/align/_lcs.ext.3.c | 34 +
src/obitools/align/_lcs.ext.4.c | 225 +++
src/obitools/align/_lcs.h | 29 +
src/obitools/align/_lcs.pxd | 9 +
src/obitools/align/_lcs.pyx | 206 +++
src/obitools/align/_lcs_fast.h | 597 ++++++++
src/obitools/align/_nws.pxd | 10 +
src/obitools/align/_nws.pyx | 162 ++
src/obitools/align/_nwsdnabyprot.pxd | 36 +
src/obitools/align/_nwsdnabyprot.pyx | 516 +++++++
src/obitools/align/_profilenws.pxd | 23 +
src/obitools/align/_profilenws.pyx | 211 +++
src/obitools/align/_qsassemble.pyx | 89 ++
src/obitools/align/_qsrassemble.pyx | 88 ++
src/obitools/align/_rassemble.pxd | 10 +
src/obitools/align/_rassemble.pyx | 157 ++
src/obitools/align/_sse.h | 929 ++++++++++++
src/obitools/align/_upperbond.cfiles | 1 +
src/obitools/align/_upperbond.ext.1.c | 225 +++
src/obitools/align/_upperbond.h | 7 +
src/obitools/align/_upperbond.pxd | 16 +
src/obitools/align/_upperbond.pyx | 90 ++
src/obitools/align/homopolymere.py | 56 +
src/obitools/align/ssearch.py | 46 +
src/obitools/alignment/__init__.py | 175 +++
src/obitools/alignment/ace.py | 47 +
src/obitools/barcodecoverage/__init__.py | 7 +
src/obitools/barcodecoverage/calcBc.py | 62 +
src/obitools/barcodecoverage/drawBcTree.py | 108 ++
src/obitools/barcodecoverage/findErrors.py | 56 +
src/obitools/barcodecoverage/readFiles.py | 69 +
src/obitools/barcodecoverage/writeBcTree.py | 42 +
src/obitools/blast/__init__.py | 207 +++
src/obitools/carto/__init__.py | 376 +++++
src/obitools/collections.py | 190 +++
src/obitools/decorator.py | 0
src/obitools/distances/__init__.py | 29 +
src/obitools/distances/observed.py | 77 +
src/obitools/distances/phylip.py | 35 +
src/obitools/distances/r.py | 25 +
src/obitools/dnahash/__init__.py | 100 ++
src/obitools/ecobarcode/__init__.py | 0
src/obitools/ecobarcode/databases.py | 32 +
src/obitools/ecobarcode/ecotag.py | 50 +
src/obitools/ecobarcode/options.py | 64 +
src/obitools/ecobarcode/rawdata.py | 38 +
src/obitools/ecobarcode/taxonomy.py | 120 ++
src/obitools/ecopcr/__init__.py | 69 +
src/obitools/ecopcr/annotation.py | 104 ++
src/obitools/ecopcr/options.py | 140 ++
src/obitools/ecopcr/sequence.py | 183 +++
src/obitools/ecopcr/taxonomy.py | 704 +++++++++
src/obitools/ecotag/__init__.py | 2 +
src/obitools/ecotag/parser.py | 150 ++
src/obitools/eutils/__init__.py | 54 +
src/obitools/fast.py | 56 +
src/obitools/fasta/__init__.py | 13 +
src/obitools/fasta/_fasta.pxd | 13 +
src/obitools/fasta/_fasta.pyx | 515 +++++++
src/obitools/fastq/__init__.py | 19 +
src/obitools/fastq/_fastq.pyx | 530 +++++++
src/obitools/fnaqual/__init__.py | 2 +
src/obitools/fnaqual/fasta.py | 8 +
src/obitools/fnaqual/quality.py | 137 ++
src/obitools/format/__init__.py | 28 +
src/obitools/format/_format.pyx | 19 +
src/obitools/format/genericparser/__init__.py | 219 +++
.../format/genericparser/_genericparser.pyx | 232 +++
src/obitools/format/ontology/__init__.py | 0
src/obitools/format/ontology/go_obo.py | 274 ++++
src/obitools/format/options.py | 375 +++++
src/obitools/format/sequence/__init__.py | 69 +
src/obitools/format/sequence/embl.py | 2 +
src/obitools/format/sequence/fasta.py | 4 +
src/obitools/format/sequence/fastq.py | 16 +
src/obitools/format/sequence/fnaqual.py | 8 +
src/obitools/format/sequence/genbank.py | 4 +
src/obitools/format/sequence/tagmatcher.py | 5 +
src/obitools/goa/__init__.py | 0
src/obitools/goa/parser.py | 33 +
src/obitools/graph/__init__.py | 1016 +++++++++++++
src/obitools/graph/algorithms/__init__.py | 0
src/obitools/graph/algorithms/clique.py | 134 ++
src/obitools/graph/algorithms/compact.py | 8 +
src/obitools/graph/algorithms/component.py | 82 +
src/obitools/graph/dag.py | 99 ++
src/obitools/graph/layout/__init__.py | 0
src/obitools/graph/layout/radialtree.py | 0
src/obitools/graph/rootedtree.py | 115 ++
src/obitools/graph/tree.py | 37 +
src/obitools/gzip.py | 504 +++++++
src/obitools/interactive/__init__.py | 30 +
src/obitools/location/__init__.py | 547 +++++++
src/obitools/location/feature.py | 177 +++
src/obitools/metabarcoding/__init__.py | 301 ++++
src/obitools/metabarcoding/options.py | 34 +
src/obitools/obischemas/__init__.py | 28 +
src/obitools/obischemas/kb/__init__.py | 55 +
src/obitools/obischemas/kb/extern.py | 78 +
src/obitools/obischemas/options.py | 31 +
src/obitools/obo/__init__.py | 0
src/obitools/obo/go/__init__.py | 0
src/obitools/obo/go/parser.py | 53 +
src/obitools/obo/parser.py | 707 +++++++++
src/obitools/options/__init__.py | 101 ++
src/obitools/options/_bioseqfilter.pyx | 82 +
src/obitools/options/_options.pyx | 124 ++
src/obitools/options/bioseqcutter.py | 87 ++
src/obitools/options/bioseqedittag.py | 317 ++++
src/obitools/options/bioseqfilter.py | 121 ++
src/obitools/options/taxonomyfilter.py | 6 +
src/obitools/parallel/__init__.py | 99 ++
src/obitools/parallel/jobqueue.py | 183 +++
src/obitools/phylogeny/__init__.py | 119 ++
src/obitools/phylogeny/newick.py | 123 ++
src/obitools/profile/__init__.py | 0
src/obitools/profile/_profile.pxd | 72 +
src/obitools/profile/_profile.pyx | 289 ++++
src/obitools/sample.py | 87 ++
src/obitools/seqdb/__init__.py | 88 ++
src/obitools/seqdb/blastdb/__init__.py | 0
src/obitools/seqdb/dnaparser.py | 16 +
src/obitools/seqdb/embl/__init__.py | 13 +
src/obitools/seqdb/embl/parser.py | 52 +
src/obitools/seqdb/genbank/__init__.py | 84 ++
src/obitools/seqdb/genbank/ncbi.py | 79 +
src/obitools/seqdb/genbank/parser.py | 53 +
src/obitools/sequenceencoder/__init__.py | 73 +
src/obitools/solexa/__init__.py | 45 +
src/obitools/solexaPairEnd.py | 103 ++
src/obitools/statistics/__init__.py | 0
src/obitools/statistics/hypergeometric.py | 166 ++
src/obitools/statistics/noncentralhypergeo.py | 208 +++
src/obitools/svg.py | 120 ++
src/obitools/table/__init__.py | 633 ++++++++
src/obitools/table/csv.py | 52 +
src/obitools/tagmatcher/__init__.py | 35 +
src/obitools/tagmatcher/options.py | 14 +
src/obitools/tagmatcher/parser.py | 89 ++
src/obitools/thermo/__init__.py | 597 ++++++++
src/obitools/tools/__init__.py | 0
src/obitools/tools/_solexapairend.pyx | 187 +++
src/obitools/tools/solexapairend.py | 51 +
src/obitools/tree/__init__.py | 116 ++
src/obitools/tree/dot.py | 18 +
src/obitools/tree/layout.py | 103 ++
src/obitools/tree/newick.py | 117 ++
src/obitools/tree/svg.py | 70 +
src/obitools/tree/unrooted.py | 33 +
src/obitools/unit/__init__.py | 8 +
src/obitools/unit/obitools/__init__.py | 91 ++
src/obitools/utils/__init__.py | 319 ++++
src/obitools/utils/_utils.pxd | 42 +
src/obitools/utils/_utils.pyx | 160 ++
src/obitools/utils/bioseq.py | 234 +++
src/obitools/utils/crc64.py | 53 +
src/obitools/utils/iterator.py | 8 +
src/obitools/version.py | 5 +
src/obitools/word/__init__.py | 135 ++
src/obitools/word/_binary.pyx | 269 ++++
src/obitools/word/_readindex.cfiles | 1 +
src/obitools/word/_readindex.ext.1.c | 202 +++
src/obitools/word/_readindex.h | 94 ++
src/obitools/word/_readindex.pyx | 805 ++++++++++
src/obitools/word/options.py | 117 ++
src/obitools/word/predicate.py | 41 +
src/obitools/zipfile.py | 1282 ++++++++++++++++
src/obiuniq.py | 107 ++
src/oligotag.py | 106 ++
428 files changed, 43590 insertions(+), 1048 deletions(-)
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..95c028b
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,10 @@
+include setup.py
+recursive-include distutils.ext *.py *.c *.pem
+recursive-include src *.pyx *.pxd *.c *.h *.cfiles
+recursive-include doc/sphinx/source *.txt *.rst *.py
+recursive-include doc/sphinx/sphinxext *.py
+include doc/sphinx/make.bat
+include doc/sphinx/Makefile
+include README.txt
+include requirements.txt
+
diff --git a/PKG-INFO b/PKG-INFO
new file mode 100644
index 0000000..91758e5
--- /dev/null
+++ b/PKG-INFO
@@ -0,0 +1,19 @@
+Metadata-Version: 1.1
+Name: OBITools
+Version: 1.2.11
+Summary: Scripts and library for sequence analysis
+Home-page: http://metabarcoding.org/obitools
+Author: Eric Coissac
+Author-email: eric at coissac.eu
+License: CeCILL-V2
+Description: UNKNOWN
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: Other/Proprietary License
+Classifier: Operating System :: Unix
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: Topic :: Utilities
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..e69de29
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index b302af6..0000000
--- a/debian/changelog
+++ /dev/null
@@ -1,9 +0,0 @@
-obitools (1.1.16+dfsg-1) UNRELEASED; urgency=low
-
- * Initial release (Closes: #783833)
- [ According to Debian Python Policy, only Python3 compatible packages will be
- accepted in NEW. This package is not python 3 compatible.
- I tried to update it with 2to3, but several warnings are raised and C
- extensions fail at link time.]
-
- -- Olivier Sallou <osallou at debian.org> Thu, 30 Apr 2015 15:24:37 +0200
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index ec63514..0000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-9
diff --git a/debian/control b/debian/control
deleted file mode 100644
index af466a3..0000000
--- a/debian/control
+++ /dev/null
@@ -1,22 +0,0 @@
-Source: obitools
-Section: python
-Priority: optional
-Maintainer: Debian Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
-Uploaders: Olivier Sallou <osallou at debian.org>
-Build-Depends: debhelper (>= 9), dh-python,
- python-sphinx, cython, ipython, python-wheel
-Standards-Version: 3.9.6
-Homepage: https://pypi.python.org/pypi/OBITools
-Vcs-Svn: svn://anonsvn.debian.org/debian-med/trunk/packages/obitools/trunk/
-Vcs-Browser: http://anonscm.debian.org/viewvc/debian-med/trunk/packages/obitools/trunk/
-#X-Python-Version: >= 2.7
-
-Package: obitools
-Architecture: amd64
-Depends: ${misc:Depends}, ${python:Depends}, ${shlibs:Depends},
- ${sphinxdoc:Depends}, libjs-sphinxdoc
-Description: programs to analyze NGS data in a DNA metabarcoding context
- The OBITools programs aims to help you to manipulate various data and
- sequence files in a convenient way using the Unix command line interface.
- They follow the standard Unix interface for command line program, allowing to
- chain a set of commands using the pipe mecanism.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 5639983..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1,561 +0,0 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: obitools
-Source: http://metabarcoding.org//obitools
-Files-Excluded:
- distutils.ext/obidistutils/serenity/pip
- build
-
-
-Files: src/obitools/SVGdraw.py
-Copyright: 2002, Fedor Baart & Hans de Wit (Stichting Farmaceutische Kengetallen)
-License: BSD
- All rights reserved.
- .
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
- .
- Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
- .
- Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation and/or
- other materials provided with the distribution.
- .
- Neither the name of the Stichting Farmaceutische Kengetallen nor the names of
- its contributors may be used to endorse or promote products derived from this
- software without specific prior written permission.
- .
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Files: *
-Copyright: 2014 Eric Coissac <eric at coissac.eu>
- 2014 Boyer F
- 2014 Mercier C
- 2014 Bonin A
- 2014 Taberlet P
-License: CeCILL-2.1
- CeCILL FREE SOFTWARE LICENSE AGREEMENT
- .
- Notice
- .
- This Agreement is a Free Software license agreement that is the result
- of discussions between its authors in order to ensure compliance with
- the two main principles guiding its drafting:
- .
- * firstly, compliance with the principles governing the distribution
- of Free Software: access to source code, broad rights granted to
- users,
- * secondly, the election of a governing law, French law, with which
- it is conformant, both as regards the law of torts and
- intellectual property law, and the protection that it offers to
- both authors and holders of the economic rights over software.
- .
- The authors of the CeCILL (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre])
- license are:
- .
- Commissariat à l'Energie Atomique - CEA, a public scientific, technical
- and industrial research establishment, having its principal place of
- business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France.
- .
- Centre National de la Recherche Scientifique - CNRS, a public scientific
- and technological establishment, having its principal place of business
- at 3 rue Michel-Ange, 75794 Paris cedex 16, France.
- .
- Institut National de Recherche en Informatique et en Automatique -
- INRIA, a public scientific and technological establishment, having its
- principal place of business at Domaine de Voluceau, Rocquencourt, BP
- 105, 78153 Le Chesnay cedex, France.
- .
- Preamble
- .
- The purpose of this Free Software license agreement is to grant users
- the right to modify and redistribute the software governed by this
- license within the framework of an open source distribution model.
- .
- The exercising of these rights is conditional upon certain obligations
- for users so as to preserve this status for all subsequent redistributions.
- .
- In consideration of access to the source code and the rights to copy,
- modify and redistribute granted by the license, users are provided only
- with a limited warranty and the software's author, the holder of the
- economic rights, and the successive licensors only have limited liability.
- .
- In this respect, the risks associated with loading, using, modifying
- and/or developing or reproducing the software by the user are brought to
- the user's attention, given its Free Software status, which may make it
- complicated to use, with the result that its use is reserved for
- developers and experienced professionals having in-depth computer
- knowledge. Users are therefore encouraged to load and test the
- suitability of the software as regards their requirements in conditions
- enabling the security of their systems and/or data to be ensured and,
- more generally, to use and operate it in the same conditions of
- security. This Agreement may be freely reproduced and published,
- provided it is not altered, and that no provisions are either added or
- removed herefrom.
- .
- This Agreement may apply to any or all software for which the holder of
- the economic rights decides to submit the use thereof to its provisions.
- .
- Article 1 - DEFINITIONS
- .
- For the purpose of this Agreement, when the following expressions
- commence with a capital letter, they shall have the following meaning:
- .
- Agreement: means this license agreement, and its possible subsequent
- versions and annexes.
- .
- Software: means the software in its Object Code and/or Source Code form
- and, where applicable, its documentation, "as is" when the Licensee
- accepts the Agreement.
- .
- Initial Software: means the Software in its Source Code and possibly its
- Object Code form and, where applicable, its documentation, "as is" when
- it is first distributed under the terms and conditions of the Agreement.
- .
- Modified Software: means the Software modified by at least one
- Contribution.
- .
- Source Code: means all the Software's instructions and program lines to
- which access is required so as to modify the Software.
- .
- Object Code: means the binary files originating from the compilation of
- the Source Code.
- .
- Holder: means the holder(s) of the economic rights over the Initial
- Software.
- .
- Licensee: means the Software user(s) having accepted the Agreement.
- .
- Contributor: means a Licensee having made at least one Contribution.
- .
- Licensor: means the Holder, or any other individual or legal entity, who
- distributes the Software under the Agreement.
- .
- Contribution: means any or all modifications, corrections, translations,
- adaptations and/or new functions integrated into the Software by any or
- all Contributors, as well as any or all Internal Modules.
- .
- Module: means a set of sources files including their documentation that
- enables supplementary functions or services in addition to those offered
- by the Software.
- .
- External Module: means any or all Modules, not derived from the
- Software, so that this Module and the Software run in separate address
- spaces, with one calling the other when they are run.
- .
- Internal Module: means any or all Module, connected to the Software so
- that they both execute in the same address space.
- .
- GNU GPL: means the GNU General Public License version 2 or any
- subsequent version, as published by the Free Software Foundation Inc.
- .
- Parties: mean both the Licensee and the Licensor.
- .
- These expressions may be used both in singular and plural form.
- .
- Article 2 - PURPOSE
- .
- The purpose of the Agreement is the grant by the Licensor to the
- Licensee of a non-exclusive, transferable and worldwide license for the
- Software as set forth in Article 5 hereinafter for the whole term of the
- protection granted by the rights over said Software.
- .
- Article 3 - ACCEPTANCE
- .
- 3.1 The Licensee shall be deemed as having accepted the terms and
- conditions of this Agreement upon the occurrence of the first of the
- following events:
- .
- * (i) loading the Software by any or all means, notably, by
- downloading from a remote server, or by loading from a physical
- medium;
- * (ii) the first time the Licensee exercises any of the rights
- granted hereunder.
- .
- 3.2 One copy of the Agreement, containing a notice relating to the
- characteristics of the Software, to the limited warranty, and to the
- fact that its use is restricted to experienced users has been provided
- to the Licensee prior to its acceptance as set forth in Article 3.1
- hereinabove, and the Licensee hereby acknowledges that it has read and
- understood it.
- .
- Article 4 - EFFECTIVE DATE AND TERM
- .
- 4.1 EFFECTIVE DATE
- .
- The Agreement shall become effective on the date when it is accepted by
- the Licensee as set forth in Article 3.1.
- .
- 4.2 TERM
- .
- The Agreement shall remain in force for the entire legal term of
- protection of the economic rights over the Software.
- .
- Article 5 - SCOPE OF RIGHTS GRANTED
- .
- The Licensor hereby grants to the Licensee, who accepts, the following
- rights over the Software for any or all use, and for the term of the
- Agreement, on the basis of the terms and conditions set forth hereinafter.
- .
- Besides, if the Licensor owns or comes to own one or more patents
- protecting all or part of the functions of the Software or of its
- components, the Licensor undertakes not to enforce the rights granted by
- these patents against successive Licensees using, exploiting or
- modifying the Software. If these patents are transferred, the Licensor
- undertakes to have the transferees subscribe to the obligations set
- forth in this paragraph.
- .
- 5.1 RIGHT OF USE
- .
- The Licensee is authorized to use the Software, without any limitation
- as to its fields of application, with it being hereinafter specified
- that this comprises:
- .
- 1. permanent or temporary reproduction of all or part of the Software
- by any or all means and in any or all form.
- .
- 2. loading, displaying, running, or storing the Software on any or
- all medium.
- .
- 3. entitlement to observe, study or test its operation so as to
- determine the ideas and principles behind any or all constituent
- elements of said Software. This shall apply when the Licensee
- carries out any or all loading, displaying, running, transmission
- or storage operation as regards the Software, that it is entitled
- to carry out hereunder.
- .
- 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS
- .
- The right to make Contributions includes the right to translate, adapt,
- arrange, or make any or all modifications to the Software, and the right
- to reproduce the resulting software.
- .
- The Licensee is authorized to make any or all Contributions to the
- Software provided that it includes an explicit notice that it is the
- author of said Contribution and indicates the date of the creation thereof.
- .
- 5.3 RIGHT OF DISTRIBUTION
- .
- In particular, the right of distribution includes the right to publish,
- transmit and communicate the Software to the general public on any or
- all medium, and by any or all means, and the right to market, either in
- consideration of a fee, or free of charge, one or more copies of the
- Software by any means.
- .
- The Licensee is further authorized to distribute copies of the modified
- or unmodified Software to third parties according to the terms and
- conditions set forth hereinafter.
- .
- 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION
- .
- The Licensee is authorized to distribute true copies of the Software in
- Source Code or Object Code form, provided that said distribution
- complies with all the provisions of the Agreement and is accompanied by:
- .
- 1. a copy of the Agreement,
- .
- 2. a notice relating to the limitation of both the Licensor's
- warranty and liability as set forth in Articles 8 and 9,
- .
- and that, in the event that only the Object Code of the Software is
- redistributed, the Licensee allows future Licensees unhindered access to
- the full Source Code of the Software by indicating how to access it, it
- being understood that the additional cost of acquiring the Source Code
- shall not exceed the cost of transferring the data.
- .
- 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE
- .
- When the Licensee makes a Contribution to the Software, the terms and
- conditions for the distribution of the resulting Modified Software
- become subject to all the provisions of this Agreement.
- .
- The Licensee is authorized to distribute the Modified Software, in
- source code or object code form, provided that said distribution
- complies with all the provisions of the Agreement and is accompanied by:
- .
- 1. a copy of the Agreement,
- .
- 2. a notice relating to the limitation of both the Licensor's
- warranty and liability as set forth in Articles 8 and 9,
- .
- and that, in the event that only the object code of the Modified
- Software is redistributed, the Licensee allows future Licensees
- unhindered access to the full source code of the Modified Software by
- indicating how to access it, it being understood that the additional
- cost of acquiring the source code shall not exceed the cost of
- transferring the data.
- .
- 5.3.3 DISTRIBUTION OF EXTERNAL MODULES
- .
- When the Licensee has developed an External Module, the terms and
- conditions of this Agreement do not apply to said External Module, that
- may be distributed under a separate license agreement.
- .
- 5.3.4 COMPATIBILITY WITH THE GNU GPL
- .
- The Licensee can include a code that is subject to the provisions of one
- of the versions of the GNU GPL in the Modified or unmodified Software,
- and distribute that entire code under the terms of the same version of
- the GNU GPL.
- .
- The Licensee can include the Modified or unmodified Software in a code
- that is subject to the provisions of one of the versions of the GNU GPL,
- and distribute that entire code under the terms of the same version of
- the GNU GPL.
- .
- Article 6 - INTELLECTUAL PROPERTY
- .
- 6.1 OVER THE INITIAL SOFTWARE
- .
- The Holder owns the economic rights over the Initial Software. Any or
- all use of the Initial Software is subject to compliance with the terms
- and conditions under which the Holder has elected to distribute its work
- and no one shall be entitled to modify the terms and conditions for the
- distribution of said Initial Software.
- .
- The Holder undertakes that the Initial Software will remain ruled at
- least by this Agreement, for the duration set forth in Article 4.2.
- .
- 6.2 OVER THE CONTRIBUTIONS
- .
- The Licensee who develops a Contribution is the owner of the
- intellectual property rights over this Contribution as defined by
- applicable law.
- .
- 6.3 OVER THE EXTERNAL MODULES
- .
- The Licensee who develops an External Module is the owner of the
- intellectual property rights over this External Module as defined by
- applicable law and is free to choose the type of agreement that shall
- govern its distribution.
- .
- 6.4 JOINT PROVISIONS
- .
- The Licensee expressly undertakes:
- .
- 1. not to remove, or modify, in any manner, the intellectual property
- notices attached to the Software;
- .
- 2. to reproduce said notices, in an identical manner, in the copies
- of the Software modified or not.
- .
- The Licensee undertakes not to directly or indirectly infringe the
- intellectual property rights of the Holder and/or Contributors on the
- Software and to take, where applicable, vis-à-vis its staff, any and all
- measures required to ensure respect of said intellectual property rights
- of the Holder and/or Contributors.
- .
- Article 7 - RELATED SERVICES
- .
- 7.1 Under no circumstances shall the Agreement oblige the Licensor to
- provide technical assistance or maintenance services for the Software.
- .
- However, the Licensor is entitled to offer this type of services. The
- terms and conditions of such technical assistance, and/or such
- maintenance, shall be set forth in a separate instrument. Only the
- Licensor offering said maintenance and/or technical assistance services
- shall incur liability therefor.
- .
- 7.2 Similarly, any Licensor is entitled to offer to its licensees, under
- its sole responsibility, a warranty, that shall only be binding upon
- itself, for the redistribution of the Software and/or the Modified
- Software, under terms and conditions that it is free to decide. Said
- warranty, and the financial terms and conditions of its application,
- shall be subject of a separate instrument executed between the Licensor
- and the Licensee.
- .
- Article 8 - LIABILITY
- .
- 8.1 Subject to the provisions of Article 8.2, the Licensee shall be
- entitled to claim compensation for any direct loss it may have suffered
- from the Software as a result of a fault on the part of the relevant
- Licensor, subject to providing evidence thereof.
- .
- 8.2 The Licensor's liability is limited to the commitments made under
- this Agreement and shall not be incurred as a result of in particular:
- (i) loss due the Licensee's total or partial failure to fulfill its
- obligations, (ii) direct or consequential loss that is suffered by the
- Licensee due to the use or performance of the Software, and (iii) more
- generally, any consequential loss. In particular the Parties expressly
- agree that any or all pecuniary or business loss (i.e. loss of data,
- loss of profits, operating loss, loss of customers or orders,
- opportunity cost, any disturbance to business activities) or any or all
- legal proceedings instituted against the Licensee by a third party,
- shall constitute consequential loss and shall not provide entitlement to
- any or all compensation from the Licensor.
- .
- Article 9 - WARRANTY
- .
- 9.1 The Licensee acknowledges that the scientific and technical
- state-of-the-art when the Software was distributed did not enable all
- possible uses to be tested and verified, nor for the presence of
- possible defects to be detected. In this respect, the Licensee's
- attention has been drawn to the risks associated with loading, using,
- modifying and/or developing and reproducing the Software which are
- reserved for experienced users.
- .
- The Licensee shall be responsible for verifying, by any or all means,
- the suitability of the product for its requirements, its good working
- order, and for ensuring that it shall not cause damage to either persons
- or properties.
- .
- 9.2 The Licensor hereby represents, in good faith, that it is entitled
- to grant all the rights over the Software (including in particular the
- rights set forth in Article 5).
- .
- 9.3 The Licensee acknowledges that the Software is supplied "as is" by
- the Licensor without any other express or tacit warranty, other than
- that provided for in Article 9.2 and, in particular, without any warranty
- as to its commercial value, its secured, safe, innovative or relevant
- nature.
- .
- Specifically, the Licensor does not warrant that the Software is free
- from any error, that it will operate without interruption, that it will
- be compatible with the Licensee's own equipment and software
- configuration, nor that it will meet the Licensee's requirements.
- .
- 9.4 The Licensor does not either expressly or tacitly warrant that the
- Software does not infringe any third party intellectual property right
- relating to a patent, software or any other property right. Therefore,
- the Licensor disclaims any and all liability towards the Licensee
- arising out of any or all proceedings for infringement that may be
- instituted in respect of the use, modification and redistribution of the
- Software. Nevertheless, should such proceedings be instituted against
- the Licensee, the Licensor shall provide it with technical and legal
- assistance for its defense. Such technical and legal assistance shall be
- decided on a case-by-case basis between the relevant Licensor and the
- Licensee pursuant to a memorandum of understanding. The Licensor
- disclaims any and all liability as regards the Licensee's use of the
- name of the Software. No warranty is given as regards the existence of
- prior rights over the name of the Software or as regards the existence
- of a trademark.
- .
- Article 10 - TERMINATION
- .
- 10.1 In the event of a breach by the Licensee of its obligations
- hereunder, the Licensor may automatically terminate this Agreement
- thirty (30) days after notice has been sent to the Licensee and has
- remained ineffective.
- .
- 10.2 A Licensee whose Agreement is terminated shall no longer be
- authorized to use, modify or distribute the Software. However, any
- licenses that it may have granted prior to termination of the Agreement
- shall remain valid subject to their having been granted in compliance
- with the terms and conditions hereof.
- .
- Article 11 - MISCELLANEOUS
- .
- 11.1 EXCUSABLE EVENTS
- .
- Neither Party shall be liable for any or all delay, or failure to
- perform the Agreement, that may be attributable to an event of force
- majeure, an act of God or an outside cause, such as defective
- functioning or interruptions of the electricity or telecommunications
- networks, network paralysis following a virus attack, intervention by
- government authorities, natural disasters, water damage, earthquakes,
- fire, explosions, strikes and labor unrest, war, etc.
- .
- 11.2 Any failure by either Party, on one or more occasions, to invoke
- one or more of the provisions hereof, shall under no circumstances be
- interpreted as being a waiver by the interested Party of its right to
- invoke said provision(s) subsequently.
- .
- 11.3 The Agreement cancels and replaces any or all previous agreements,
- whether written or oral, between the Parties and having the same
- purpose, and constitutes the entirety of the agreement between said
- Parties concerning said purpose. No supplement or modification to the
- terms and conditions hereof shall be effective as between the Parties
- unless it is made in writing and signed by their duly authorized
- representatives.
- .
- 11.4 In the event that one or more of the provisions hereof were to
- conflict with a current or future applicable act or legislative text,
- said act or legislative text shall prevail, and the Parties shall make
- the necessary amendments so as to comply with said act or legislative
- text. All other provisions shall remain effective. Similarly, invalidity
- of a provision of the Agreement, for any reason whatsoever, shall not
- cause the Agreement as a whole to be invalid.
- .
- 11.5 LANGUAGE
- .
- The Agreement is drafted in both French and English and both versions
- are deemed authentic.
- .
- Article 12 - NEW VERSIONS OF THE AGREEMENT
- .
- 12.1 Any person is authorized to duplicate and distribute copies of this
- Agreement.
- .
- 12.2 So as to ensure coherence, the wording of this Agreement is
- protected and may only be modified by the authors of the License, who
- reserve the right to periodically publish updates or new versions of the
- Agreement, each with a separate number. These subsequent versions may
- address new issues encountered by Free Software.
- .
- 12.3 Any Software distributed under a given version of the Agreement may
- only be subsequently distributed under the same version of the Agreement
- or a subsequent version, subject to the provisions of Article 5.3.4.
- .
- Article 13 - GOVERNING LAW AND JURISDICTION
- .
- 13.1 The Agreement is governed by French law. The Parties agree to
- endeavor to seek an amicable solution to any disagreements or disputes
- that may arise during the performance of the Agreement.
- .
- 13.2 Failing an amicable solution within two (2) months as from their
- occurrence, and unless emergency proceedings are necessary, the
- disagreements or disputes shall be referred to the Paris Courts having
- jurisdiction, by the more diligent Party.
- .
- Version 2.0 dated 2006-09-05.
-
- You may Distribute or Publicly Perform the Work only under the terms of this License. You must include a copy of, or the Uniform Resource Identifier (URI) for, this License with every copy of the Work You Distribute or Publicly Perform. You may not offer or impose any terms on the Work that restrict the terms of this License or the ability of the recipient of the Work to exercise the rights granted to that recipient under the terms of the License. You may not sublicense the Work. You mu [...]
- If You Distribute, or Publicly Perform the Work or any Adaptations or Collections, You must, unless a request has been made pursuant to Section 4(a), keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) the name of the Original Author (or pseudonym, if applicable) if supplied, and/or if the Original Author and/or Licensor designate another party or parties (e.g., a sponsor institute, publishing entity, journal) for attribut [...]
- Except as otherwise agreed in writing by the Licensor or as may be otherwise permitted by applicable law, if You Reproduce, Distribute or Publicly Perform the Work either by itself or as part of any Adaptations or Collections, You must not distort, mutilate, modify or take other derogatory action in relation to the Work which would be prejudicial to the Original Author's honor or reputation. Licensor agrees that in those jurisdictions (e.g. Japan), in which any exercise of the right gra [...]
- 5. Representations, Warranties and Disclaimer
- .
- UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXC [...]
- .
- 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
- .
- 7. Termination
- .
- This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Adaptations or Collections from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.
- Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License [...]
- 8. Miscellaneous
- .
- Each time You Distribute or Publicly Perform the Work or a Collection, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.
- Each time You Distribute or Publicly Perform an Adaptation, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.
- If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.
- No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.
- This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.
- The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). These rights and subject matter take effect in the relevant jurisdiction in whi [...]
-
-
-Files: debian/*
-Copyright: 2015 Olivier Sallou <osallou at debian.org>
-License: GPL-2+
- This package is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- .
- This package is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- .
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>
- .
- On Debian systems, the complete text of the GNU General
- Public License version 2 can be found in "/usr/share/common-licenses/GPL-2".
diff --git a/debian/docs b/debian/docs
deleted file mode 100644
index f1db997..0000000
--- a/debian/docs
+++ /dev/null
@@ -1,3 +0,0 @@
-README.txt
-README.txt
-requirements.txt
diff --git a/debian/obitools.doc-base b/debian/obitools.doc-base
deleted file mode 100644
index 08c21f7..0000000
--- a/debian/obitools.doc-base
+++ /dev/null
@@ -1,9 +0,0 @@
-Document: obitools
-Title: OBITools manual
-Author: Eric Coissac
-Abstract: This manual describes OBITools and its tools
-Section: Science/Biology
-
-Format: HTML
-Index: /usr/share/doc/obitools/html/index.html
-Files: /usr/share/doc/obitools/html/*.html
diff --git a/debian/obitools.install b/debian/obitools.install
deleted file mode 100644
index 167aea6..0000000
--- a/debian/obitools.install
+++ /dev/null
@@ -1 +0,0 @@
-build/sphinx/html usr/share/doc/obitools
diff --git a/debian/obitools.links b/debian/obitools.links
deleted file mode 100644
index a3a22ea..0000000
--- a/debian/obitools.links
+++ /dev/null
@@ -1,3 +0,0 @@
-usr/share/doc/obitools/html/_sources usr/share/doc/obitools/rst
-#usr/share/doc/obitools/html/_static/jquery.js usr/share/javascript/jquery/jquery.js
-#usr/share/doc/obitools/html/_static/underscore.js usr/share/javascript/underscore/underscore.js
diff --git a/debian/obitools.manpages b/debian/obitools.manpages
deleted file mode 100644
index 90766e9..0000000
--- a/debian/obitools.manpages
+++ /dev/null
@@ -1 +0,0 @@
-build/sphinx/man/*.1
diff --git a/debian/patches/fix_path_interpreter b/debian/patches/fix_path_interpreter
deleted file mode 100644
index 5a93ac9..0000000
--- a/debian/patches/fix_path_interpreter
+++ /dev/null
@@ -1,315 +0,0 @@
-Subject: wrong interpreter path
-Description: use Debian Python
-Author: Olivier Sallou <osallou at debian.org>
-Last-Updated: 2015-04-30
-Forwarded: no
---- a/src/extractreads.py
-+++ b/src/extractreads.py
-@@ -1,3 +1,4 @@
-+#!/usr/bin/python
- '''
- Created on 9 juin 2012
-
---- a/src/extractreads2.py
-+++ b/src/extractreads2.py
-@@ -1,3 +1,4 @@
-+#!/usr/bin/python
- '''
- Created on 9 juin 2012
-
---- a/src/ali2consensus.py
-+++ b/src/ali2consensus.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
-
- '''
- Created on 30 sept. 2011
---- a/src/ecodbtaxstat.py
-+++ b/src/ecodbtaxstat.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`ecodbtaxstat`: gives taxonomic rank frequency of a given ``ecopcr`` database
- =====================================================================================
---- a/src/ecotag.py
-+++ b/src/ecotag.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`ecotag`: assigns sequences to taxa
- ===========================================
---- a/src/ecotaxspecificity.py
-+++ b/src/ecotaxspecificity.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`ecotaxspecificity`: Evaluates barcode resolution
- =========================================================
---- a/src/ecotaxstat.py
-+++ b/src/ecotaxstat.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`ecotaxstat` : getting the coverage of an ecoPCR output compared to the original ecoPCR database
- ========================================================================================================
---- a/src/illuminapairedend.py
-+++ b/src/illuminapairedend.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`illuminapairedend`: aligns paired-end Illumina reads
- =============================================================
---- a/src/ngsfilter.py
-+++ b/src/ngsfilter.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`ngsfilter` : Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers
- ===================================================================================================================
---- a/src/obiaddtaxids.py
-+++ b/src/obiaddtaxids.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obiaddtaxids`: adds *taxids* to sequence records using an ecopcr database
- ==================================================================================
---- a/src/obiannotate.py
-+++ b/src/obiannotate.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
-
- '''
- :py:mod:`obiannotate`: adds/edits sequence record annotations
---- a/src/obiclean.py
-+++ b/src/obiclean.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
-
- '''
- :py:mod:`obiclean`: tags a set of sequences for PCR/sequencing errors identification
---- a/src/obicomplement.py
-+++ b/src/obicomplement.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- """
- :py:mod:`obicomplement`: reverse-complements sequences
- ======================================================
---- a/src/obiconvert.py
-+++ b/src/obiconvert.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obiconvert`: converts sequence files to different output formats
- =========================================================================
---- a/src/obicount.py
-+++ b/src/obicount.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obicount`: counts the number of sequence records
- =========================================================
---- a/src/obicut.py
-+++ b/src/obicut.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obicut`: trims sequences
- =================================
---- a/src/obidistribute.py
-+++ b/src/obidistribute.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obidistribute`: Distributes sequence records over several sequence records files
- =========================================================================================
---- a/src/obiextract.py
-+++ b/src/obiextract.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obiextract`: extract samples from a dataset
- ====================================================
---- a/src/obigrep.py
-+++ b/src/obigrep.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obigrep`: filters sequence file
- ========================================
---- a/src/obihead.py
-+++ b/src/obihead.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obihead`: extracts the first sequence records
- ======================================================
---- a/src/obijoinpairedend.py
-+++ b/src/obijoinpairedend.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obijoinpairedend`: Joins paired-end reads
- ==================================================
---- a/src/obipr2.py
-+++ b/src/obipr2.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obipr2`: converts silva database into an ecoPCR database
- =================================================================
---- a/src/obisample.py
-+++ b/src/obisample.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obisample`: randomly resamples sequence records
- ========================================================
---- a/src/obiselect.py
-+++ b/src/obiselect.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- """
- :py:mod:`obiselect` : selects representative sequence records
- =============================================================
---- a/src/obisilva.py
-+++ b/src/obisilva.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obisilva`: converts silva database into an ecoPCR database
- ===================================================================
---- a/src/obisort.py
-+++ b/src/obisort.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obisort`: Sorts sequence records according to the value of a given attribute
- =====================================================================================
---- a/src/obisplit.py
-+++ b/src/obisplit.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obisplit`: Splits a sequence file in a set of subfiles
- ===============================================================
---- a/src/obistat.py
-+++ b/src/obistat.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obistat`: computes basic statistics for attribute values
- =================================================================
---- a/src/obitab.py
-+++ b/src/obitab.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obitab`: converts a sequence file to a tabular file
- ============================================================
---- a/src/obitail.py
-+++ b/src/obitail.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obitail`: extracts the last sequence records
- =====================================================
---- a/src/obitaxonomy.py
-+++ b/src/obitaxonomy.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obitaxonomy`: manages taxonomic databases
- ==================================================
---- a/src/obiuniq.py
-+++ b/src/obiuniq.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`obiuniq`: groups and dereplicates sequences
- ====================================================
---- a/src/oligotag.py
-+++ b/src/oligotag.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- :py:mod:`oligotag`: Designs a set of oligonucleotides with specified properties
- ===============================================================================
---- a/src/obitools/barcodecoverage/calcBc.py
-+++ b/src/obitools/barcodecoverage/calcBc.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- Created on 24 nov. 2011
-
---- a/src/obitools/barcodecoverage/drawBcTree.py
-+++ b/src/obitools/barcodecoverage/drawBcTree.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- Created on 25 nov. 2011
-
---- a/src/obitools/barcodecoverage/findErrors.py
-+++ b/src/obitools/barcodecoverage/findErrors.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- Created on 24 nov. 2011
-
---- a/src/obitools/barcodecoverage/readFiles.py
-+++ b/src/obitools/barcodecoverage/readFiles.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- Created on 23 nov. 2011
-
---- a/src/obitools/barcodecoverage/writeBcTree.py
-+++ b/src/obitools/barcodecoverage/writeBcTree.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- Created on 25 nov. 2011
-
---- a/src/obitools/solexaPairEnd.py
-+++ b/src/obitools/solexaPairEnd.py
-@@ -1,4 +1,4 @@
--#!/usr/local/bin/python
-+#!/usr/bin/python
- '''
- Created on 30 dec. 2009
-
diff --git a/debian/patches/series b/debian/patches/series
deleted file mode 100644
index 1a4ce68..0000000
--- a/debian/patches/series
+++ /dev/null
@@ -1,2 +0,0 @@
-use_debian_libs
-fix_path_interpreter
diff --git a/debian/patches/use_debian_libs b/debian/patches/use_debian_libs
deleted file mode 100644
index 3777672..0000000
--- a/debian/patches/use_debian_libs
+++ /dev/null
@@ -1,82 +0,0 @@
-Subject: skip virtualenv installation
-Description: installer uses pip and virtualenv for install,
- this patch removes this specific install to install in Debian dirs
- and use Debian libs
-Author: Olivier Sallou <osallou at debian.org>
-Last-Updated: 2015-04-30
-Forwarded: no
---- a/distutils.ext/obidistutils/serenity/__init__.py
-+++ b/distutils.ext/obidistutils/serenity/__init__.py
-@@ -63,7 +63,7 @@
-
- log.info("%s will be installed with python : %s" % (package,virtualpython))
-
-- if install_requirements():
-+ if install_requirements(False):
- log.info("Restarting installation with all dependencies ok")
- rerun_with_anothe_python(virtualpython)
-
---- a/distutils.ext/obidistutils/serenity/checkpackage.py
-+++ b/distutils.ext/obidistutils/serenity/checkpackage.py
-@@ -15,48 +15,7 @@
- from obidistutils.serenity.checkpip import get_a_pip_module
-
- def is_installed(requirement,pip=None):
--
-- if pip is None:
-- pip = get_a_pip_module()
--
-- get_installed_distributions=pip.util.get_installed_distributions
--
-- requirement_project,requirement_relation,requirement_version = parse_package_requirement(requirement)
--
-- package = [x for x in get_installed_distributions() if x.project_name==requirement_project]
--
-- if len(package)==1:
-- if requirement_version is not None and requirement_relation is not None:
-- rep = (len(package)==1) and eval("StrictVersion('%s') %s StrictVersion('%s')" % (package[0].version,
-- requirement_relation,
-- requirement_version)
-- )
-- else:
-- rep=True
-- else:
-- rep=False
--
-- if rep:
-- if requirement_version is not None and requirement_relation is not None:
-- log.info("Look for package %s (%s%s) : ok version %s installed" % (requirement_project,
-- requirement_relation,
-- requirement_version,
-- package[0].version))
-- else:
-- log.info("Look for package %s : ok version %s installed" % (requirement_project,
-- package[0].version))
-- else:
-- if len(package)!=1:
-- log.info("Look for package %s (%s%s) : not installed" % (requirement_project,
-- requirement_relation,
-- requirement_version))
-- else:
-- log.info("Look for package %s (%s%s) : failed only version %s installed" % (requirement_project,
-- requirement_relation,
-- requirement_version,
-- package[0].version))
--
-- return rep
-+ return True
-
-
- def get_requirements(pip=None):
---- a/setup.py
-+++ b/setup.py
-@@ -66,7 +66,7 @@
- license=LICENSE,
- url=URL,
- python_src=SRC,
-- sse='sse2',
-- serenity=serenity)
-+ sse='sse2')
-+ #serenity=serenity)
-
-
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 2262ab4..0000000
--- a/debian/rules
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/make -f
-# See debhelper(7) (uncomment to enable)
-# output every command that modifies files on the build system.
-#DH_VERBOSE = 1
-
-# see EXAMPLES in dpkg-buildflags(1) and read /usr/share/dpkg/*
-DPKG_EXPORT_BUILDFLAGS = 1
-include /usr/share/dpkg/default.mk
-
-# see FEATURE AREAS in dpkg-buildflags(1)
-export DEB_BUILD_MAINT_OPTIONS = hardening=+all
-
-# see ENVIRONMENT in dpkg-buildflags(1)
-# package maintainers to append CFLAGS
-#export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic
-# package maintainers to append LDFLAGS
-#export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
-
-export PYBUILD_NAME=obitools
-
-
-# main packaging script based on dh7 syntax
-%:
- dh $@ --with python2,sphinxdoc
-
-
-override_dh_install:
- dh_install
-
-
-override_dh_clean:
- dh_clean
- rm -rf build
- rm -f doc/sphinx/build_dir.txt
-
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index 163aaf8..0000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 3f37a05..0000000
--- a/debian/watch
+++ /dev/null
@@ -1,3 +0,0 @@
-version=3
-opts=dversionmangle=s/\+dfsg//,uversionmangle=s/(rc|a|b|c)/~$1/ \
-http://pypi.debian.net/OBITools/OBITools-(.+)\.(?:zip|tgz|tbz|txz|(?:tar\.(?:gz|bz2|xz)))
diff --git a/distutils.ext/obidistutils/__init__.py b/distutils.ext/obidistutils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/distutils.ext/obidistutils/command/__init__.py b/distutils.ext/obidistutils/command/__init__.py
new file mode 100644
index 0000000..92561cf
--- /dev/null
+++ b/distutils.ext/obidistutils/command/__init__.py
@@ -0,0 +1,7 @@
+
+def build_ext(*args,**kargs):
+ '''
+ Wrapper over the build_ext class to postpone the import of cython
+ '''
+ from build_ext import build_ext as _build_ext
+ return _build_ext(*args,**kargs)
diff --git a/distutils.ext/obidistutils/command/build.py b/distutils.ext/obidistutils/command/build.py
new file mode 100644
index 0000000..b95280e
--- /dev/null
+++ b/distutils.ext/obidistutils/command/build.py
@@ -0,0 +1,47 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+
+from distutils.command.build import build as ori_build
+from obidistutils.serenity.checksystem import is_mac_system
+
+from distutils import log
+
+class build(ori_build):
+
+ def has_ctools(self):
+ return self.distribution.has_ctools()
+
+ def has_files(self):
+ return self.distribution.has_files()
+
+ def has_executables(self):
+ return self.distribution.has_executables()
+
+ def has_ext_modules(self):
+ return self.distribution.has_ext_modules()
+
+ def has_littlebigman(self):
+ return True
+
+ def has_pidname(self):
+ return is_mac_system()
+
+ def has_doc(self):
+ return True
+
+
+ sub_commands = [('littlebigman', has_littlebigman),
+ ('pidname',has_pidname),
+ ('build_ctools', has_ctools),
+ ('build_files', has_files),
+ ('build_cexe', has_executables)] \
+ + ori_build.sub_commands + \
+ [('build_sphinx',has_doc)]
+
+ def run(self):
+ log.info('\n\nRunning obidistutils build process\n\n')
+ ori_build.run(self)
+
diff --git a/distutils.ext/obidistutils/command/build_cexe.py b/distutils.ext/obidistutils/command/build_cexe.py
new file mode 100644
index 0000000..6fd164b
--- /dev/null
+++ b/distutils.ext/obidistutils/command/build_cexe.py
@@ -0,0 +1,72 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+
+from obidistutils.command.build_ctools import build_ctools
+from distutils.errors import DistutilsSetupError
+
+
+class build_cexe(build_ctools):
+
+ description = "build C/C++ executable distributed with Python extensions"
+
+
+ def initialize_options(self):
+ build_ctools.initialize_options(self)
+ self.built_files = None
+
+
+ def finalize_options(self):
+ # This might be confusing: both build-cexe and build-temp default
+ # to build-temp as defined by the "build" command. This is because
+ # I think that C libraries are really just temporary build
+ # by-products, at least from the point of view of building Python
+ # extensions -- but I want to keep my options open.
+
+ build_cexe_dir = self.build_cexe
+ build_ctools.finalize_options(self)
+
+ if build_cexe_dir is None:
+ self.build_cexe=None
+
+ self.set_undefined_options('build',
+ ('build_scripts', 'build_cexe'))
+
+ self.set_undefined_options('build_files',
+ ('files', 'built_files'))
+
+ self.executables = self.distribution.executables
+
+ if self.executables:
+ self.check_executable_list(self.executables)
+
+
+ # XXX same as for build_ext -- what about 'self.define' and
+ # 'self.undef' ?
+
+ def substitute_sources(self,exe_name,sources):
+ """
+ Substitutes source file name starting by an @ by the actual
+ name of the built file (see --> build_files)
+ """
+ sources = list(sources)
+ for i in xrange(len(sources)):
+ print exe_name,sources[i],
+ if sources[i][0]=='@':
+ try:
+ filename = self.built_files[sources[i][1:]]
+ except KeyError:
+ raise DistutilsSetupError, \
+ ('The %s filename declared in the source '
+ 'files of the program %s have not been '
+ 'built by the installation process') % (sources[i],
+ exe_name)
+ sources[i]=filename
+ print "changed to ",filename
+ else:
+ print " ok"
+
+ return sources
+
diff --git a/distutils.ext/obidistutils/command/build_ctools.py b/distutils.ext/obidistutils/command/build_ctools.py
new file mode 100644
index 0000000..d3a3c1a
--- /dev/null
+++ b/distutils.ext/obidistutils/command/build_ctools.py
@@ -0,0 +1,60 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+
+import os
+
+
+from obidistutils.command.build_exe import build_exe
+from distutils.sysconfig import customize_compiler
+from distutils.errors import DistutilsSetupError
+from distutils import log
+
+class build_ctools(build_exe):
+ description = "build C/C++ executable not distributed with Python extensions"
+
+ def initialize_options(self):
+ build_exe.initialize_options(self)
+
+ # List of built tools
+ self.ctools = None
+ self.littlebigman = None
+
+
+ def finalize_options(self):
+ # This might be confusing: both build-cexe and build-temp default
+ # to build-temp as defined by the "build" command. This is because
+ # I think that C libraries are really just temporary build
+ # by-products, at least from the point of view of building Python
+ # extensions -- but I want to keep my options open.
+
+ build_exe.finalize_options(self)
+
+
+ self.set_undefined_options('build',
+ ('build_temp', 'build_cexe'))
+
+ self.set_undefined_options('littlebigman',
+ ('littlebigman', 'littlebigman'))
+
+ self.executables = self.distribution.ctools
+ self.check_executable_list(self.executables)
+
+ if self.littlebigman =='-DLITTLE_END':
+ if self.define is None:
+ self.define=[('LITTLE_END',None)]
+ else:
+ self.define.append('LITTLE_END',None)
+
+ self.ctools = set()
+
+ def run(self):
+
+ build_exe.run(self)
+ for e,p in self.executables:
+ self.ctools.add(e)
+
+
+
diff --git a/distutils.ext/obidistutils/command/build_exe.py b/distutils.ext/obidistutils/command/build_exe.py
new file mode 100644
index 0000000..e39cecd
--- /dev/null
+++ b/distutils.ext/obidistutils/command/build_exe.py
@@ -0,0 +1,213 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+
+import os
+
+from distutils.core import Command
+from distutils.sysconfig import customize_compiler
+from distutils.errors import DistutilsSetupError
+from distutils import log
+from distutils.ccompiler import show_compilers
+
+import subprocess
+
+class build_exe(Command):
+
+ description = "build an executable -- Abstract command "
+
+ user_options = [
+ ('build-cexe', 'x',
+ "directory to build C/C++ libraries to"),
+ ('build-temp', 't',
+ "directory to put temporary build by-products"),
+ ('debug', 'g',
+ "compile with debugging information"),
+ ('force', 'f',
+ "forcibly build everything (ignore file timestamps)"),
+ ('compiler=', 'c',
+ "specify the compiler type"),
+ ]
+
+ boolean_options = ['debug', 'force']
+
+ help_options = [
+ ('help-compiler', None,
+ "list available compilers", show_compilers),
+ ]
+
+ def initialize_options(self):
+ self.build_cexe = None
+ self.build_temp = None
+
+ # List of executables to build
+ self.executables = None
+
+ # Compilation options for all libraries
+ self.include_dirs = None
+ self.define = None
+ self.undef = None
+ self.extra_compile_args = None
+ self.debug = None
+ self.force = 0
+ self.compiler = None
+ self.sse = None
+ self.built_files=None
+
+ def finalize_options(self):
+ # This might be confusing: both build-cexe and build-temp default
+ # to build-temp as defined by the "build" command. This is because
+ # I think that C libraries are really just temporary build
+ # by-products, at least from the point of view of building Python
+ # extensions -- but I want to keep my options open.
+ self.set_undefined_options('build',
+ ('build_temp', 'build_temp'),
+ ('compiler', 'compiler'),
+ ('debug', 'debug'),
+ ('force', 'force'))
+
+ if self.include_dirs is None:
+ self.include_dirs = self.distribution.include_dirs or []
+
+ if isinstance(self.include_dirs, str):
+ self.include_dirs = self.include_dirs.split(os.pathsep)
+
+ self.sse = self.distribution.sse
+
+ if self.sse is not None:
+ if self.extra_compile_args is None:
+ self.extra_compile_args=['-m%s' % self.sse]
+ else:
+ self.extra_compile_args.append('-m%s' % self.sse)
+
+ # XXX same as for build_ext -- what about 'self.define' and
+ # 'self.undef' ?
+
+ def run(self):
+
+ if not self.executables:
+ return
+
+ self.mkpath(self.build_cexe)
+
+ # Yech -- this is cut 'n pasted from build_ext.py!
+ from distutils.ccompiler import new_compiler
+ self.compiler = new_compiler(compiler=self.compiler,
+ dry_run=self.dry_run,
+ force=self.force)
+ customize_compiler(self.compiler)
+
+ if self.include_dirs is not None:
+ self.compiler.set_include_dirs(self.include_dirs)
+ if self.define is not None:
+ # 'define' option is a list of (name,value) tuples
+ for (name,value) in self.define:
+ self.compiler.define_macro(name, value)
+
+ if self.undef is not None:
+ for macro in self.undef:
+ self.compiler.undefine_macro(macro)
+
+ self.build_executables(self.executables)
+
+
+ def check_executable_list(self, executables):
+ """Ensure that the list of executables is valid.
+
+ `executable` is presumably provided as a command option 'executables'.
+ This method checks that it is a list of 2-tuples, where the tuples
+ are (executable_name, build_info_dict).
+
+ Raise DistutilsSetupError if the structure is invalid anywhere;
+ just returns otherwise.
+ """
+ if not isinstance(executables, list):
+ raise DistutilsSetupError, \
+ "'executables' option must be a list of tuples"
+
+ for exe in executables:
+ if not isinstance(exe, tuple) and len(exe) != 2:
+ raise DistutilsSetupError, \
+ "each element of 'executables' must a 2-tuple"
+
+ name, build_info = exe
+
+ if not isinstance(name, str):
+ raise DistutilsSetupError, \
+ "first element of each tuple in 'executables' " + \
+ "must be a string (the executables name)"
+ if '/' in name or (os.sep != '/' and os.sep in name):
+ raise DistutilsSetupError, \
+ ("bad executable name '%s': " +
+ "may not contain directory separators") % \
+ exe[0]
+
+ if not isinstance(build_info, dict):
+ raise DistutilsSetupError, \
+ "second element of each tuple in 'executables' " + \
+ "must be a dictionary (build info)"
+
+ def get_executable_names(self):
+ # Assume the executables list is valid -- 'check_executable_list()' is
+ # called from 'finalize_options()', so it should be!
+ if not self.executables:
+ return None
+
+ exe_names = []
+ for (exe_name, build_info) in self.executables:
+ exe_names.append(exe_name)
+ return exe_names
+
+
+ def get_source_files(self):
+ self.check_executable_list(self.executables)
+ filenames = []
+ for (exe_name, build_info) in self.executables:
+ sources = build_info.get('sources')
+ if sources is None or not isinstance(sources, (list, tuple)):
+ raise DistutilsSetupError, \
+ ("in 'executables' option (library '%s'), "
+ "'sources' must be present and must be "
+ "a list of source filenames") % exe_name
+
+ filenames.extend(sources)
+ return filenames
+
+ def substitute_sources(self,exe_name,sources):
+ return list(sources)
+
+ def build_executables(self, executables):
+ for (exe_name, build_info) in executables:
+ sources = build_info.get('sources')
+ if sources is None or not isinstance(sources, (list, tuple)):
+ raise DistutilsSetupError, \
+ ("in 'executables' option (library '%s'), " +
+ "'sources' must be present and must be " +
+ "a list of source filenames") % exe_name
+ sources = self.substitute_sources(exe_name,sources)
+
+ log.info("building '%s' program", exe_name)
+
+ # First, compile the source code to object files in the library
+ # directory. (This should probably change to putting object
+ # files in a temporary build directory.)
+ macros = build_info.get('macros')
+ include_dirs = build_info.get('include_dirs')
+ extra_args = self.extra_compile_args or []
+
+ objects = self.compiler.compile(sources,
+ output_dir=self.build_temp,
+ macros=macros,
+ include_dirs=include_dirs,
+ extra_postargs=extra_args,
+ debug=self.debug)
+
+ # Now "link" the object files together into a static library.
+ # (On Unix at least, this isn't really linking -- it just
+ # builds an archive. Whatever.)
+ self.compiler.link_executable(objects, exe_name,
+ output_dir=self.build_cexe,
+ debug=self.debug)
+
diff --git a/distutils.ext/obidistutils/command/build_ext.py b/distutils.ext/obidistutils/command/build_ext.py
new file mode 100644
index 0000000..494953f
--- /dev/null
+++ b/distutils.ext/obidistutils/command/build_ext.py
@@ -0,0 +1,114 @@
+'''
+Created on 13 fevr. 2014
+
+@author: coissac
+'''
+
+from distutils import log
+import sys
+from obidistutils.serenity import is_serenity
+
+import os
+from obidistutils.serenity.checkpackage import install_requirements
+from obidistutils.serenity.rerun import rerun_with_anothe_python
+
+try:
+ from Cython.Distutils import build_ext as ori_build_ext # @UnresolvedImport
+except ImportError:
+ if not is_serenity() and install_requirements():
+ log.info("Restarting installation with all dependencies ok")
+ rerun_with_anothe_python(os.path.realpath(sys.executable))
+
+from distutils.errors import DistutilsSetupError
+
+class build_ext(ori_build_ext):
+ def modifyDocScripts(self):
+ print >>open("doc/sphinx/build_dir.txt","w"),self.build_lib
+
+ def initialize_options(self):
+ ori_build_ext.initialize_options(self) # @UndefinedVariable
+ self.littlebigman = None
+ self.built_files = None
+
+
+ def finalize_options(self):
+ ori_build_ext.finalize_options(self) # @UndefinedVariable
+
+ self.set_undefined_options('littlebigman',
+ ('littlebigman', 'littlebigman'))
+
+ self.set_undefined_options('build_files',
+ ('files', 'built_files'))
+
+ self.cython_c_in_temp = 1
+
+ if self.littlebigman =='-DLITTLE_END':
+ if self.define is None:
+ self.define=[('LITTLE_END',None)]
+ else:
+ self.define.append('LITTLE_END',None)
+
+ def substitute_sources(self,exe_name,sources):
+ """
+ Substitutes source file name starting by an @ by the actual
+ name of the built file (see --> build_files)
+ """
+ sources = list(sources)
+ for i in xrange(len(sources)):
+ print exe_name,sources[i],
+ if sources[i][0]=='@':
+ try:
+ filename = self.built_files[sources[i][1:]]
+ except KeyError:
+ tmpfilename = os.path.join(self.build_temp,sources[i][1:])
+ if os.path.isfile (tmpfilename):
+ filename = tmpfilename
+ else:
+ raise DistutilsSetupError, \
+ ('The %s filename declared in the source '
+ 'files of the program %s have not been '
+ 'built by the installation process') % (sources[i],
+ exe_name)
+ sources[i]=filename
+ print "changed to ",filename
+ else:
+ print " ok"
+
+ return sources
+
+ def build_extensions(self):
+ # First, sanity-check the 'extensions' list
+
+ for ext in self.extensions:
+ ext.sources = self.substitute_sources(ext.name,ext.sources)
+
+ self.check_extensions_list(self.extensions)
+
+ for ext in self.extensions:
+ print "#####>",ext.sources
+ ext.sources = self.cython_sources(ext.sources, ext)
+ self.build_extension(ext)
+
+
+ def run(self):
+ self.modifyDocScripts()
+ ori_build_ext.run(self) # @UndefinedVariable
+
+
+
+# from obidistutils.serenity.getcython import get_a_cython_module
+# import imp
+# import os.path
+#
+# log.info("No cython installed, try to install a temporary cython")
+# cython = get_a_cython_module()
+# sys.path.insert(0,os.path.dirname(os.path.dirname(cython.__file__)))
+# f, filename, description=imp.find_module('Distutils',[os.path.dirname(cython.__file__)])
+# submodule = imp.load_module('Cython.Distutils', f, filename, description)
+# ori_build_ext = submodule.build_ext
+#
+
+
+
+
+
\ No newline at end of file
diff --git a/distutils.ext/obidistutils/command/build_files.py b/distutils.ext/obidistutils/command/build_files.py
new file mode 100644
index 0000000..6183f07
--- /dev/null
+++ b/distutils.ext/obidistutils/command/build_files.py
@@ -0,0 +1,58 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+
+import os.path
+
+from distutils.core import Command
+from distutils.util import convert_path
+from distutils import log, sysconfig
+from distutils.dep_util import newer
+from distutils import log
+
+
+
+class build_files(Command):
+
+ def initialize_options(self):
+ self.files=None
+ self.ctools=None
+ self.build_temp=None
+ self.build_cexe=None
+
+ def finalize_options(self):
+
+ self.set_undefined_options('build_ctools',
+ ('ctools', 'ctools'),
+ ('build_temp','build_temp'),
+ ('build_cexe','build_cexe'),
+ )
+
+ self.files = {}
+
+ def run(self):
+
+ for dest,prog,command in self.distribution.files:
+ destfile = os.path.join(self.build_temp,dest)
+ if prog in self.ctools:
+ progfile = os.path.join(self.build_cexe,prog)
+ else:
+ progfile = prog
+
+ log.info("Building file : %s" % dest)
+
+ commandline = command % {'prog' : progfile,
+ 'dest' : destfile}
+
+ log.info(" --> %s" % commandline)
+
+ os.system(commandline)
+ self.files[dest]=destfile
+
+ log.info("Done.\n")
+
+
+
+
diff --git a/distutils.ext/obidistutils/command/build_filters.py b/distutils.ext/obidistutils/command/build_filters.py
new file mode 100644
index 0000000..c6819c9
--- /dev/null
+++ b/distutils.ext/obidistutils/command/build_filters.py
@@ -0,0 +1,10 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+from obidistutils.command.build_scripts import build_scripts
+
+class build_filters(build_scripts):
+ pass
+
diff --git a/distutils.ext/obidistutils/command/build_scripts.py b/distutils.ext/obidistutils/command/build_scripts.py
new file mode 100644
index 0000000..8a8701c
--- /dev/null
+++ b/distutils.ext/obidistutils/command/build_scripts.py
@@ -0,0 +1,102 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+
+import os.path
+
+from distutils.command.build_scripts import build_scripts as ori_build_scripts,\
+ first_line_re
+from distutils.util import convert_path
+from distutils import log, sysconfig
+from distutils.dep_util import newer
+from stat import ST_MODE
+
+
+
+class build_scripts(ori_build_scripts):
+
+ def copy_scripts (self):
+ """Copy each script listed in 'self.scripts'; if it's marked as a
+ Python script in the Unix way (first line matches 'first_line_re',
+ ie. starts with "\#!" and contains "python"), then adjust the first
+ line to refer to the current Python interpreter as we copy.
+ """
+ self.mkpath(self.build_dir)
+ rawbuild_dir = os.path.join(os.path.dirname(self.build_dir),'raw_scripts')
+ self.mkpath(rawbuild_dir)
+
+ outfiles = []
+ for script in self.scripts:
+ adjust = 0
+ script = convert_path(script)
+ outfile = os.path.join(self.build_dir, os.path.splitext(os.path.basename(script))[0])
+ rawoutfile = os.path.join(rawbuild_dir, os.path.basename(script))
+ outfiles.append(outfile)
+
+ if not self.force and not newer(script, outfile):
+ log.debug("not copying %s (up-to-date)", script)
+ continue
+
+ # Always open the file but ignore failures in dry-run mode --
+ # that way, we'll get accurate feedback if we can read the
+ # script.
+ try:
+ f = open(script, "r")
+ except IOError:
+ if not self.dry_run:
+ raise
+ f = None
+ else:
+ first_line = f.readline()
+ if not first_line:
+ self.warn("%s is an empty file (skipping)" % script)
+ continue
+
+ match = first_line_re.match(first_line)
+ if match:
+ adjust = 1
+ post_interp = match.group(1) or ''
+
+ log.info("Store the raw script %s -> %s", script,rawoutfile)
+ self.copy_file(script, rawoutfile)
+
+
+ if adjust:
+ log.info("copying and adjusting %s -> %s", script,
+ self.build_dir)
+ if not self.dry_run:
+ outf = open(outfile, "w")
+ if not sysconfig.python_build:
+ outf.write("#!%s%s\n" %
+ (self.executable,
+ post_interp))
+ else:
+ outf.write("#!%s%s\n" %
+ (os.path.join(
+ sysconfig.get_config_var("BINDIR"),
+ "python%s%s" % (sysconfig.get_config_var("VERSION"),
+ sysconfig.get_config_var("EXE"))),
+ post_interp))
+ outf.writelines(f.readlines())
+ outf.close()
+ if f:
+ f.close()
+ else:
+ if f:
+ f.close()
+ self.copy_file(script, outfile)
+
+ if os.name == 'posix':
+ for F in outfiles:
+ if self.dry_run:
+ log.info("changing mode of %s", F)
+ else:
+ oldmode = os.stat(F)[ST_MODE] & 07777
+ newmode = (oldmode | 0555) & 07777
+ if newmode != oldmode:
+ log.info("changing mode of %s from %o to %o",
+ F, oldmode, newmode)
+ os.chmod(F, newmode)
+
diff --git a/distutils.ext/obidistutils/command/build_sphinx.py b/distutils.ext/obidistutils/command/build_sphinx.py
new file mode 100644
index 0000000..8e23aaf
--- /dev/null
+++ b/distutils.ext/obidistutils/command/build_sphinx.py
@@ -0,0 +1,38 @@
+'''
+Created on 10 mars 2015
+
+@author: coissac
+'''
+
+from distutils import log
+from obidistutils.serenity.checkpackage import install_requirements
+from obidistutils.serenity.rerun import rerun_with_anothe_python
+from obidistutils.serenity import is_serenity
+import os
+import sys
+
+try:
+ from sphinx.setup_command import BuildDoc as ori_build_sphinx
+except ImportError:
+ if not is_serenity() and install_requirements():
+ log.info("Restarting installation with all dependencies ok")
+ rerun_with_anothe_python(os.path.realpath(sys.executable))
+
+class build_sphinx(ori_build_sphinx):
+ '''
+ Build Sphinx documentation in html, epub and man formats
+ '''
+
+ description = __doc__
+
+ def run(self):
+ self.builder='html'
+ self.finalize_options()
+ ori_build_sphinx.run(self)
+ self.builder='epub'
+ self.finalize_options()
+ ori_build_sphinx.run(self)
+ self.builder='man'
+ self.finalize_options()
+ ori_build_sphinx.run(self)
+
\ No newline at end of file
diff --git a/distutils.ext/obidistutils/command/install.py b/distutils.ext/obidistutils/command/install.py
new file mode 100644
index 0000000..3d33dde
--- /dev/null
+++ b/distutils.ext/obidistutils/command/install.py
@@ -0,0 +1,25 @@
+'''
+Created on 6 oct. 2014
+
+@author: coissac
+'''
+
+try:
+ from setuptools.command.install import install as install_ori
+ has_setuptools=True
+except ImportError:
+ from distutils.command.install import install as install_ori
+ has_setuptools=False
+
+from distutils import log
+
+class install(install_ori):
+
+ def __init__(self,dist):
+ install_ori.__init__(self, dist)
+ self.sub_commands.insert(0, ('build',lambda self: True))
+ self.sub_commands.append(('install_sphinx',lambda self: self.distribution.serenity))
+
+ def run(self):
+ log.info('\n\nRunning obidistutils install process\n\n')
+ install_ori.run(self)
diff --git a/distutils.ext/obidistutils/command/install_scripts.py b/distutils.ext/obidistutils/command/install_scripts.py
new file mode 100644
index 0000000..fb1409f
--- /dev/null
+++ b/distutils.ext/obidistutils/command/install_scripts.py
@@ -0,0 +1,79 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+try:
+ from setuptools.command.install_scripts import install_scripts as ori_install_scripts
+ has_setuptools = True
+except ImportError:
+ from distutils.command.install_scripts import install_scripts as ori_install_scripts
+ has_setuptools = False
+
+import os.path
+from distutils import log
+
+class install_scripts(ori_install_scripts):
+
+ def initialize_options(self):
+ ori_install_scripts.initialize_options(self)
+ self.deprecated_scripts = None
+ self.public_dir = None
+
+ def revove_dot_py(self):
+ for filename in self.get_outputs():
+ pyfile = "%s.py" % filename
+ if os.path.exists(pyfile):
+ command = os.path.split(pyfile)[-1]
+ log.info('Removing deprecated .py form of the unix command : %s (file %s)' % (command,pyfile))
+ if not self.dry_run:
+ os.unlink(pyfile)
+ try:
+ if not self.dry_run:
+ os.unlink(os.path.join(self.build_dir,command))
+ except:
+ log.info('Unix command %s is not present in build dir' % command)
+
+ def install_public_link(self):
+ self.mkpath(self.public_dir)
+ for file in self.get_outputs():
+ if self.dry_run:
+ log.info("changing mode of %s", file)
+ else:
+ log.info("exporting file %s -> %s", file,os.path.join(self.public_dir,
+ os.path.split(file)[1]
+ ))
+ dest = os.path.join(self.public_dir,
+ os.path.split(file)[1]
+ )
+ if os.path.exists(dest):
+ os.unlink(dest)
+ os.symlink(file,dest)
+
+ def remove_deprecated_script(self):
+ if self.deprecated_scripts is not None:
+ for f in self.deprecated_scripts:
+ try:
+ ff = os.path.join(self.install_dir,f)
+ if not self.dry_run:
+ os.unlink(ff)
+ log.info('Removing deprecated unix command : %s (file : %s)' % (f,ff))
+ ff = os.path.join(self.build_dir,f)
+ if not self.dry_run:
+ os.unlink(ff)
+ except:
+ log.info('Unix command %s is not present' % f)
+
+
+
+ def run(self):
+
+ self.remove_deprecated_script()
+ ori_install_scripts.run(self)
+ if self.distribution.serenity:
+ self.public_dir=os.path.join(self.install_dir,"../export/bin")
+ self.public_dir=os.path.abspath(self.public_dir)
+ self.install_public_link()
+ self.revove_dot_py()
+
+
diff --git a/distutils.ext/obidistutils/command/install_sphinx.py b/distutils.ext/obidistutils/command/install_sphinx.py
new file mode 100644
index 0000000..5b8e2a3
--- /dev/null
+++ b/distutils.ext/obidistutils/command/install_sphinx.py
@@ -0,0 +1,46 @@
+'''
+Created on 10 mars 2015
+
+@author: coissac
+'''
+from distutils.core import Command
+import os.path
+import glob
+
+class install_sphinx(Command):
+ '''
+ Install the sphinx documentation
+ '''
+
+ description = "Install the sphinx documentation in serenity mode"
+
+ # NOTE(review): no 'user_options' list is declared next to these boolean
+ # options -- confirm the command is only ever run as a sub-command.
+ boolean_options = ['force', 'skip-build']
+
+
+ def initialize_options (self):
+ self.install_doc = None
+ self.build_dir = None
+
+ def finalize_options (self):
+ # 'build_dir' comes from the 'build_sphinx' command; 'install_doc'
+ # starts as the 'install_scripts' directory of the 'install' command.
+ self.set_undefined_options('build_sphinx', ('build_dir', 'build_dir'))
+ self.set_undefined_options('install',
+ ('install_scripts', 'install_doc'))
+
+ def run (self):
+ # NOTE(review): the indentation of this listing is collapsed, so the
+ # extent of the 'if' body below cannot be read from it -- presumably
+ # every following statement belongs to it; confirm against the source.
+ if self.distribution.serenity:
+ # Re-root the documentation at "../export/share" relative to the
+ # directory taken from 'install_scripts'.
+ self.install_doc = os.path.join(self.install_doc,"../export/share")
+ self.install_doc=os.path.abspath(self.install_doc)
+ self.mkpath(self.install_doc)
+ # Copy the 'html' tree of the sphinx build directory.
+ self.mkpath(os.path.join(self.install_doc,'html'))
+ outfiles = self.copy_tree(os.path.join(self.build_dir,'html'),
+ os.path.join(self.install_doc,'html'))
+
+ # Copy the 'man' tree of the build directory into 'man/man1'.
+ self.mkpath(os.path.join(self.install_doc,'man','man1'))
+ outfiles = self.copy_tree(os.path.join(self.build_dir,'man'),
+ os.path.join(self.install_doc,'man','man1'))
+
+ # Copy every epub file directly into the documentation directory.
+ for epub in glob.glob(os.path.join(self.build_dir,'epub/*.epub')):
+ self.copy_file(os.path.join(epub),
+ os.path.join(self.install_doc,os.path.split(epub)[1]))
+
+
\ No newline at end of file
diff --git a/distutils.ext/obidistutils/command/littlebigman.py b/distutils.ext/obidistutils/command/littlebigman.py
new file mode 100644
index 0000000..3d07d59
--- /dev/null
+++ b/distutils.ext/obidistutils/command/littlebigman.py
@@ -0,0 +1,62 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+
+import os
+
+from distutils.core import Command
+from obidistutils.command.build_exe import build_exe
+from distutils.sysconfig import customize_compiler
+from distutils.errors import DistutilsSetupError
+from distutils import log
+
+import subprocess
+
+class littlebigman(build_exe):
+
+ description = "build the littlebigman executable testing endianness of the CPU"
+
+
+ def initialize_options(self):
+ build_exe.initialize_options(self)
+
+ self.littlebigman = None
+
+
+ def finalize_options(self):
+ # This might be confusing: both build-cexe and build-temp default
+ # to build-temp as defined by the "build" command. This is because
+ # I think that C libraries are really just temporary build
+ # by-products, at least from the point of view of building Python
+ # extensions -- but I want to keep my options open.
+
+ build_exe.finalize_options(self)
+
+ self.set_undefined_options('build',
+ ('build_temp', 'build_cexe'))
+
+ # self.ctools = self.distribution.ctools
+
+ if os.path.isfile("distutils.ext/src/littlebigman.c"):
+ self.executables = [('littlebigman',{"sources":["distutils.ext/src/littlebigman.c"]})]
+ self.check_executable_list(self.executables)
+ else:
+ self.executables = []
+
+
+ def run_littlebigman(self):
+ p = subprocess.Popen("'%s'" % os.path.join(self.build_temp,
+ 'littlebigman'),
+ shell=True,
+ stdout=subprocess.PIPE)
+ little = p.communicate()[0]
+ return little
+
+ def run(self):
+ build_exe.run(self)
+ self.littlebigman=self.run_littlebigman()
+ log.info("Your CPU is in mode : %s" % self.littlebigman)
+
+
\ No newline at end of file
diff --git a/distutils.ext/obidistutils/command/pidname.py b/distutils.ext/obidistutils/command/pidname.py
new file mode 100644
index 0000000..d50ddbb
--- /dev/null
+++ b/distutils.ext/obidistutils/command/pidname.py
@@ -0,0 +1,53 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+
+import os
+
+from obidistutils.command.build_exe import build_exe
+from obidistutils.serenity.checksystem import is_mac_system
+from distutils import log
+
+class pidname(build_exe):
+
+ description = "build the pidname executable returning the executable path from a PID on a mac"
+
+
+ def initialize_options(self):
+ build_exe.initialize_options(self)
+
+ self.pidname = False
+
+
+ def finalize_options(self):
+ # This might be confusing: both build-cexe and build-temp default
+ # to build-temp as defined by the "build" command. This is because
+ # I think that C libraries are really just temporary build
+ # by-products, at least from the point of view of building Python
+ # extensions -- but I want to keep my options open.
+
+ build_exe.finalize_options(self)
+
+ self.set_undefined_options('build',
+ ('build_scripts', 'build_cexe'))
+
+ # self.ctools = self.distribution.ctools
+
+ if os.path.isfile("distutils.ext/src/pidname.c"):
+ self.executables = [('pidname',{"sources":["distutils.ext/src/pidname.c"]})]
+ self.check_executable_list(self.executables)
+ else:
+ self.executables = []
+
+
+ def run(self):
+ if is_mac_system():
+ log.info("Building pidname...")
+ build_exe.run(self)
+ log.info("Done")
+ self.pidname=True
+ else:
+ self.pidname=False
+
\ No newline at end of file
diff --git a/distutils.ext/obidistutils/command/sdist.py b/distutils.ext/obidistutils/command/sdist.py
new file mode 100644
index 0000000..6dd3714
--- /dev/null
+++ b/distutils.ext/obidistutils/command/sdist.py
@@ -0,0 +1,46 @@
+'''
+Created on 10 mars 2015
+
+@author: coissac
+'''
+
+import os.path
+
+try:
+ from setuptools.command.sdist import sdist as orig_sdist
+except ImportError:
+ from distutils.command.sdist import sdist as orig_sdist
+
+from distutils import dir_util
+
+class sdist(orig_sdist):
+
+ def make_distribution(self):
+ """Create the source distribution(s). First, we create the release
+ tree with 'make_release_tree()'; then, we create all required
+ archive files (according to 'self.formats') from the release tree.
+ Finally, we clean up by blowing away the release tree (unless
+ 'self.keep_temp' is true). The list of archive files created is
+ stored so it can be retrieved later by 'get_archive_files()'.
+ """
+ # Don't warn about missing meta-data here -- should be (and is!)
+ # done elsewhere.
+ base_dir = self.distribution.get_fullname()
+ base_name = os.path.join(self.dist_dir,base_dir)
+
+ self.make_release_tree(os.path.join('tmp',base_dir), self.filelist.files)
+ archive_files = [] # remember names of files we create
+ # tar archive must be created last to avoid overwrite and remove
+ if 'tar' in self.formats:
+ self.formats.append(self.formats.pop(self.formats.index('tar')))
+
+ for fmt in self.formats:
+ file = self.make_archive(base_name, fmt, root_dir='tmp',base_dir=base_dir,
+ owner=self.owner, group=self.group)
+ archive_files.append(file)
+ self.distribution.dist_files.append(('sdist', '', file))
+
+ self.archive_files = archive_files
+
+ if not self.keep_temp:
+ dir_util.remove_tree(os.path.join('tmp',base_dir), dry_run=self.dry_run)
diff --git a/distutils.ext/obidistutils/core.py b/distutils.ext/obidistutils/core.py
new file mode 100644
index 0000000..ce0ce12
--- /dev/null
+++ b/distutils.ext/obidistutils/core.py
@@ -0,0 +1,205 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+
+from os import path
+import os.path
+import glob
+import sys
+from obidistutils.command.sdist import sdist
+
+
+try:
+ from setuptools import setup as ori_setup
+ from setuptools.command.egg_info import egg_info
+ has_setuptools = True
+except ImportError:
+ from distutils.core import setup as ori_setup
+ has_setuptools = False
+
+from distutils.extension import Extension
+
+from obidistutils.command.build import build
+from obidistutils.command.littlebigman import littlebigman
+from obidistutils.command.build_cexe import build_cexe
+from obidistutils.command.build_sphinx import build_sphinx
+from obidistutils.command import build_ext
+from obidistutils.command.build_ctools import build_ctools
+from obidistutils.command.build_files import build_files
+from obidistutils.command.build_scripts import build_scripts
+from obidistutils.command.install_scripts import install_scripts
+from obidistutils.command.install_sphinx import install_sphinx
+from obidistutils.command.install import install
+from obidistutils.command.pidname import pidname
+
+from obidistutils.dist import Distribution
+
+
+def findPackage(root,base=None):
+ modules=[]
+ if base is None:
+ base=[]
+ for module in (path.basename(path.dirname(x))
+ for x in glob.glob(path.join(root,'*','__init__.py'))):
+ modules.append('.'.join(base+[module]))
+ modules.extend(findPackage(path.join(root,module),base+[module]))
+ return modules
+
+def findCython(root,base=None,pyrexs=None):
+ setupdir = os.path.dirname(sys.argv[0])
+ pyrexs=[]
+
+ if base is None:
+ base=[]
+ for module in (path.basename(path.dirname(x))
+ for x in glob.glob(path.join(root,'*','__init__.py'))):
+
+
+ for pyrex in glob.glob(path.join(root,module,'*.pyx')):
+ pyrexs.append(Extension('.'.join(base+[module,path.splitext(path.basename(pyrex))[0]]),[pyrex]))
+ try:
+ cfiles = os.path.splitext(pyrex)[0]+".cfiles"
+ cfilesdir = os.path.dirname(cfiles)
+ cfiles = open(cfiles)
+ cfiles = [os.path.relpath(os.path.join(cfilesdir,y),setupdir).strip()
+ if y[0] !='@' else y.strip()
+ for y in cfiles]
+
+ print "@@@@>",cfiles
+ incdir = set(os.path.dirname(x) for x in cfiles if x[-2:]==".h")
+ cfiles = [x for x in cfiles if x[-2:]==".c"]
+ pyrexs[-1].sources.extend(cfiles)
+ pyrexs[-1].include_dirs.extend(incdir)
+ pyrexs[-1].extra_compile_args.extend(['-msse2'])
+
+ except IOError:
+ pass
+ pyrexs[-1].sources.extend(glob.glob(os.path.splitext(pyrex)[0]+'.ext.*.c'))
+ print pyrexs[-1].sources
+ # Main.compile([pyrex],timestamps=True)
+
+ pyrexs.extend(findCython(path.join(root,module),base+[module]))
+ return pyrexs
+
+def findC(root,base=None,pyrexs=None):
+ setupdir = os.path.dirname(sys.argv[0])
+ pyrexs=[]
+ if base is None:
+ base=[]
+ for module in (path.basename(path.dirname(x))
+ for x in glob.glob(path.join(root,'*','__init__.py'))):
+
+ for pyrex in glob.glob(path.join(root,module,'*.c')):
+ if '.ext.' not in pyrex:
+ pyrexs.append(Extension('.'.join(base+[module,path.splitext(path.basename(pyrex))[0]]),[pyrex]))
+ try:
+ cfiles = os.path.splitext(pyrex)[0]+".cfiles"
+ cfilesdir = os.path.dirname(cfiles)
+ cfiles = open(cfiles)
+ cfiles = [os.path.relpath(os.path.join(cfilesdir,y),setupdir).strip()
+ if y[0] !='@' else y.strip()
+ for y in cfiles]
+ incdir = set(os.path.dirname(x) for x in cfiles if x[-2:]==".h")
+ cfiles = [x for x in cfiles if x[-2:]==".c"]
+ pyrexs[-1].sources.extend(cfiles)
+ pyrexs[-1].include_dirs.extend(incdir)
+ pyrexs[-1].extra_compile_args.extend(['-msse2'])
+ except IOError:
+ pass
+ pyrexs[-1].sources.extend(glob.glob(os.path.splitext(pyrex)[0]+'.ext.*.c'))
+ print pyrexs[-1].sources
+
+ pyrexs.extend(findC(path.join(root,module),base+[module]))
+ return pyrexs
+
+def rootname(x):
+ return os.path.splitext(x.sources[0])[0]
+
+COMMANDS = {'build':build,
+ 'littlebigman':littlebigman,
+ 'pidname':pidname,
+ 'build_ctools':build_ctools,
+ 'build_files':build_files,
+ 'build_cexe':build_cexe,
+ 'build_ext': build_ext,
+ 'build_scripts':build_scripts,
+ 'build_sphinx':build_sphinx,
+ 'install_scripts':install_scripts,
+ 'install_sphinx':install_sphinx,
+ 'install':install,
+ 'sdist':sdist}
+
+if has_setuptools:
+ COMMANDS['egg_info']=egg_info
+
+
+CTOOLS =[]
+CEXES =[]
+FILES =[]
+
+def setup(**attrs):
+
+ if has_setuptools:
+ try:
+
+ requirements = open('requirements.txt').readlines()
+ requirements = [x.strip() for x in requirements]
+ requirements = [x for x in requirements if x[0]!='-']
+
+ if 'install_requires' not in attrs:
+ attrs['install_requires']=requirements
+ else:
+ attrs['install_requires'].extend(requirements)
+ except IOError:
+ pass
+
+
+ if 'distclass' not in attrs:
+ attrs['distclass']=Distribution
+
+ if 'python_src' not in attrs:
+ SRC = 'src'
+ else:
+ SRC = attrs['python_src']
+ del(attrs['python_src'])
+
+ if 'scripts' not in attrs:
+ attrs['scripts'] = glob.glob('%s/*.py' % SRC)
+
+ if 'package_dir' not in attrs:
+ attrs['package_dir'] = {'': SRC}
+
+ if 'packages' not in attrs:
+ attrs['packages'] = findPackage(SRC)
+
+ if 'cmdclass' not in attrs:
+ attrs['cmdclass'] = COMMANDS
+
+ if 'ctools' not in attrs:
+ attrs['ctools'] = CTOOLS
+
+ if 'executables' not in attrs:
+ attrs['executables'] = CEXES
+
+ if 'files' not in attrs:
+ attrs['files'] = FILES
+
+ if 'sse' not in attrs:
+ attrs['sse']=None
+
+ if 'serenity' not in attrs:
+ attrs['serenity']=False
+
+
+ EXTENTION=findCython(SRC)
+ CEXTENTION=findC(SRC)
+ cython_ext = set(rootname(x) for x in EXTENTION)
+ EXTENTION.extend(x for x in CEXTENTION
+ if rootname(x) not in cython_ext)
+
+ if 'ext_modules' not in attrs:
+ attrs['ext_modules'] = EXTENTION
+
+ ori_setup(**attrs)
diff --git a/distutils.ext/obidistutils/dist.py b/distutils.ext/obidistutils/dist.py
new file mode 100644
index 0000000..3feeae4
--- /dev/null
+++ b/distutils.ext/obidistutils/dist.py
@@ -0,0 +1,48 @@
+'''
+Created on 20 oct. 2012
+
+@author: coissac
+'''
+
+try:
+ from setuptools.dist import Distribution as ori_Distribution
+except ImportError:
+ from distutils.dist import Distribution as ori_Distribution
+
+class Distribution(ori_Distribution):
+
+ def __init__(self,attrs=None):
+ self.executables = None
+ self.ctools = None
+ self.files = None
+ self.build_cexe = None
+ self.deprecated_scripts = None
+ self.zip_safe=False
+ self.sse = None
+ self.serenity=attrs['serenity']
+
+ ori_Distribution.__init__(self, attrs)
+
+ self.global_options.insert(0,('serenity', None, "install or build the package in a python virtualenv "
+ "without polluting the installed python and with many "
+ "checks during the installation process"
+ ))
+ self.global_options.insert(0,('virtualenv', None, "if the installation is done using the serenity mode "
+ "this option allows for specifying the virtualenv name. "
+ "By default the name is PACKAGE-VERSION"
+ ))
+
+
+ def has_executables(self):
+ return self.executables is not None and self.executables
+
+ def has_ctools(self):
+ return self.ctools is not None and self.ctools
+
+ def has_files(self):
+ return self.files is not None and self.files
+
+ def has_deprecated_scripts(self):
+ return self.deprecated_scripts is not None and self.deprecated_scripts
+
+
\ No newline at end of file
diff --git a/distutils.ext/obidistutils/serenity/__init__.py b/distutils.ext/obidistutils/serenity/__init__.py
new file mode 100644
index 0000000..a86b79b
--- /dev/null
+++ b/distutils.ext/obidistutils/serenity/__init__.py
@@ -0,0 +1,117 @@
+import sys
+
+from distutils import util
+from distutils import sysconfig
+from distutils import log
+from distutils.version import LooseVersion, StrictVersion
+import glob
+import os
+import subprocess
+import re
+from distutils.errors import DistutilsError
+import urllib2
+import tempfile
+
+import importlib
+import imp
+import zipimport
+
+import argparse
+
+import base64
+
+from checkpython import is_mac_system_python, \
+ is_python27, \
+ is_a_virtualenv_python, \
+ which_virtualenv, \
+ is_good_python27
+
+
+from obidistutils.serenity.rerun import enforce_good_python
+from obidistutils.serenity.rerun import rerun_with_anothe_python
+
+from obidistutils.serenity.virtual import serenity_virtualenv
+
+from obidistutils.serenity.checksystem import is_mac_system, \
+ is_windows_system
+
+from obidistutils.serenity.checkpackage import install_requirements
+from obidistutils.serenity.checkpackage import check_requirements
+
+from obidistutils.serenity.util import save_argv
+
+from obidistutils.serenity.snake import snake
+
+from obidistutils.serenity.globals import PIP_MINVERSION, local_serenity
+
+
+def serenity_snake(envname,package,version,minversion=PIP_MINVERSION):
+ old = log.set_threshold(log.INFO)
+
+ log.info("Installing %s (%s) in serenity mode" % (package,version))
+
+ print >>sys.stderr,snake
+ sys.stderr.flush()
+
+ enforce_good_python()
+
+ virtualpython=serenity_virtualenv(envname,package,version,minversion=minversion)
+
+ if virtualpython!=os.path.realpath(sys.executable):
+ log.info("Restarting installation within the %s virtualenv" % (envname))
+ rerun_with_anothe_python(virtualpython)
+
+ log.info("%s will be installed with python : %s" % (package,virtualpython))
+
+ if install_requirements():
+ log.info("Restarting installation with all dependencies ok")
+ rerun_with_anothe_python(virtualpython)
+
+ log.set_threshold(old)
+
+def serenity_assert(version,minversion=PIP_MINVERSION):
+ check_requirements()
+
+
+def is_serenity():
+ from obidistutils.serenity.globals import local_serenity
+ return local_serenity and local_serenity[0]
+
+def serenity_mode(package,version):
+
+ save_argv()
+
+
+ from obidistutils.serenity.globals import saved_args
+
+
+ old = log.set_threshold(log.INFO)
+
+ argparser = argparse.ArgumentParser(add_help=False)
+ argparser.add_argument('--serenity',
+ dest='serenity',
+ action='store_true',
+ default=False,
+ help='Switch the installer in serenity mode. Everythings are installed in a virtualenv')
+
+ argparser.add_argument('--virtualenv',
+ dest='virtual',
+ type=str,
+ action='store',
+ default="%s-%s" % (package,version),
+ help='Specify the name of the virtualenv used by the serenity mode [default: %s-%s]' % (package,version))
+
+ args, unknown = argparser.parse_known_args()
+ sys.argv = [sys.argv[0]] + unknown
+
+ if args.serenity:
+ local_serenity.append(True)
+ serenity_snake(args.virtual,package,version)
+ else:
+ local_serenity.append(False)
+
+ log.set_threshold(old)
+
+ return args.serenity
+
+
diff --git a/distutils.ext/obidistutils/serenity/checkpackage.py b/distutils.ext/obidistutils/serenity/checkpackage.py
new file mode 100644
index 0000000..7bf9c1e
--- /dev/null
+++ b/distutils.ext/obidistutils/serenity/checkpackage.py
@@ -0,0 +1,184 @@
+'''
+Created on 2 oct. 2014
+
+@author: coissac
+'''
+
+import re
+import sys
+import os
+
+from distutils.version import StrictVersion # @UnusedImport
+from distutils.errors import DistutilsError
+from distutils import log
+
+from obidistutils.serenity.checkpip import get_a_pip_module
+
+def is_installed(requirement,pip=None):
+
+ if pip is None:
+ pip = get_a_pip_module()
+
+ get_installed_distributions=pip.util.get_installed_distributions
+
+ requirement_project,requirement_relation,requirement_version = parse_package_requirement(requirement)
+
+ package = [x for x in get_installed_distributions() if x.project_name==requirement_project]
+
+ if len(package)==1:
+ if requirement_version is not None and requirement_relation is not None:
+ rep = (len(package)==1) and eval("StrictVersion('%s') %s StrictVersion('%s')" % (package[0].version,
+ requirement_relation,
+ requirement_version)
+ )
+ else:
+ rep=True
+ else:
+ rep=False
+
+ if rep:
+ if requirement_version is not None and requirement_relation is not None:
+ log.info("Look for package %s (%s%s) : ok version %s installed" % (requirement_project,
+ requirement_relation,
+ requirement_version,
+ package[0].version))
+ else:
+ log.info("Look for package %s : ok version %s installed" % (requirement_project,
+ package[0].version))
+ else:
+ if len(package)!=1:
+ log.info("Look for package %s (%s%s) : not installed" % (requirement_project,
+ requirement_relation,
+ requirement_version))
+ else:
+ log.info("Look for package %s (%s%s) : failed only version %s installed" % (requirement_project,
+ requirement_relation,
+ requirement_version,
+ package[0].version))
+
+ return rep
+
+
+def get_requirements(pip=None):
+
+ if pip is None:
+ pip = get_a_pip_module()
+
+ try:
+ requirements = open('requirements.txt').readlines()
+ requirements = [x.strip() for x in requirements]
+ requirements = [x for x in requirements if x[0]!='-']
+
+ except IOError:
+ requirements = []
+
+ return requirements
+
+
+def install_requirements(skip_virtualenv=True,pip=None):
+
+ if pip is None:
+ pip = get_a_pip_module()
+
+ install_something=False
+ try:
+ requirements = open('requirements.txt').readlines()
+ requirements = [x.strip() for x in requirements]
+ requirements = [x for x in requirements if x[0]!='-']
+
+ log.info("Required packages for the installation :")
+ for x in requirements:
+ if not skip_virtualenv or x[0:10]!='virtualenv':
+ ok = is_installed(x,pip)
+ if not ok:
+ log.info(" Installing requirement : %s" % x)
+ pip_install_package(x,pip=pip)
+ install_something=True
+
+ except IOError:
+ pass
+
+ return install_something
+
+
+def check_requirements(skip_virtualenv=True,pip=None):
+
+ if pip is None:
+ pip = get_a_pip_module()
+
+
+ try:
+ requirements = open('requirements.txt').readlines()
+ requirements = [x.strip() for x in requirements]
+ requirements = [x for x in requirements if x[0]!='-']
+
+ log.info("Required packages for the installation :")
+ for x in requirements:
+ if not skip_virtualenv or x[0:10]!='virtualenv':
+ ok = is_installed(x,pip)
+ if not ok:
+ log.error(" Missing requirement : %s -- Package installation stopped" % x)
+ sys.exit(0)
+
+ except IOError:
+ pass
+
+
+
+def parse_package_requirement(requirement):
+
+ version_pattern = re.compile('[=><]+(.*)$')
+ project_pattern = re.compile('[^=><]+')
+ relationship_pattern = re.compile('[=><]+')
+
+ try:
+ requirement_project = project_pattern.search(requirement).group(0)
+ requirement_version = version_pattern.search(requirement)
+ if requirement_version is not None:
+ requirement_version=requirement_version.group(1)
+ requirement_relation= relationship_pattern.search(requirement)
+ if requirement_relation is not None:
+ requirement_relation=requirement_relation.group(0)
+ except:
+ raise DistutilsError,"Requirement : %s not correctly formated" % requirement
+
+ return requirement_project,requirement_relation,requirement_version
+
+
+def get_package_requirement(package,pip=None):
+ if pip is None:
+ pip = get_a_pip_module()
+
+ requirements = get_requirements(pip)
+ req = [x for x in requirements
+ if x[0:len(package)]==package
+ ]
+
+ if len(req)==1:
+ return req[0]
+ else:
+ return None
+
+
+def pip_install_package(package,directory=None,pip=None):
+
+ log.info('installing %s in directory %s' % (package,str(directory)))
+
+ if 'http_proxy' in os.environ and 'https_proxy' not in os.environ:
+ os.environ['https_proxy']=os.environ['http_proxy']
+
+ if pip is None:
+ pip = get_a_pip_module()
+
+ args = ['install']
+
+ if 'http_proxy' in os.environ:
+ args.append('--proxy=%s' % os.environ['http_proxy'])
+
+ if directory is not None:
+ args.append('--target=%s' % directory)
+
+ args.append(package)
+
+ return pip.main(args)
+
diff --git a/distutils.ext/obidistutils/serenity/checkpip.py b/distutils.ext/obidistutils/serenity/checkpip.py
new file mode 100644
index 0000000..b5f5834
--- /dev/null
+++ b/distutils.ext/obidistutils/serenity/checkpip.py
@@ -0,0 +1,82 @@
+'''
+Created on 2 oct. 2014
+
+@author: coissac
+'''
+
+#import urllib2
+import os
+#import imp
+#import base64
+#import zipimport
+import importlib
+
+from distutils.version import StrictVersion
+#from distutils.errors import DistutilsError
+from distutils import log
+
+
+from obidistutils.serenity.globals import PIP_MINVERSION, \
+ local_pip # @UnusedImport
+
+
+from obidistutils.serenity.util import get_serenity_dir
+import sys
+import pkgutil
+
+
+def is_pip_installed(minversion=PIP_MINVERSION):
+ try:
+ log.info("Try to load pip module...")
+ pipmodule = importlib.import_module('pip')
+ if hasattr(pipmodule,'__version__'):
+ ok = StrictVersion(pipmodule.__version__) >= StrictVersion(minversion)
+ log.info("Pip installed version %s" % pipmodule.__version__)
+ else:
+ ok = False
+ log.info("A too old version of pip is installed on your system")
+
+ # We clean up the imported pip module for test purpose
+ for m in [x for x in sys.modules if x.startswith('pip.')]:
+ del sys.modules[m]
+
+ del sys.modules['pip']
+
+
+ except:
+ ok = False
+ log.info("No pip installed on your system")
+
+ return ok
+
+def get_a_pip_module(minversion=PIP_MINVERSION):
+ # Return the pip module cached in 'local_pip', importing it on the
+ # first call, and write the certificate file pip is pointed at.
+ #
+ # minversion: lowest accepted pip version (checked by the final assert).
+
+ global local_pip
+
+ tmpdir = get_serenity_dir()
+
+ if not local_pip:
+ # First call: the directory of the 'obidistutils.serenity' package is
+ # put in front of sys.path before importing pip -- presumably so that
+ # a pip shipped in that directory is the one found; TODO confirm.
+ serenity = importlib.import_module('obidistutils.serenity')
+ sys.path.insert(0, os.path.dirname(serenity.__file__))
+ pip = importlib.import_module('pip')
+
+ local_pip.append(pip)
+ else:
+ pip = local_pip[-1]
+
+ # NOTE(review): the indentation of this listing is collapsed -- the
+ # statements below presumably run on every call, not only in the
+ # 'else' branch; confirm against the source.
+ # Prepare the CERT certificat for https download
+
+ cert_path = os.path.join(tmpdir, "cacert.pem")
+
+ certificate = pkgutil.get_data("pip._vendor.requests", "cacert.pem")
+
+ with open(cert_path, "wb") as cert:
+ cert.write(certificate)
+
+ # setdefault: a PIP_CERT value already present in the environment is kept.
+ os.environ.setdefault("PIP_CERT", cert_path)
+
+ # NOTE(review): this version check is an assert, so it is not run
+ # when python is started with -O.
+ assert hasattr(pip,'__version__') and StrictVersion(pip.__version__) >= StrictVersion(minversion), \
+ "Unable to find suitable version of pip"
+
+ return local_pip[0]
+
diff --git a/distutils.ext/obidistutils/serenity/checkpython.py b/distutils.ext/obidistutils/serenity/checkpython.py
new file mode 100644
index 0000000..9503b00
--- /dev/null
+++ b/distutils.ext/obidistutils/serenity/checkpython.py
@@ -0,0 +1,170 @@
+'''
+Created on 2 oct. 2014
+
+ at author: coissac
+'''
+
+import subprocess
+import sys
+import os
+import glob
+
+from distutils.version import StrictVersion
+from distutils import sysconfig
+
+from obidistutils.serenity.checksystem import is_mac_system, \
+ is_windows_system
+
+
+
+def is_python27(path=None):
+ '''
+ Checks that the python is a python2.7
+
+ @param path: if None consider the running python
+ otherwise the python pointed by the path
+
+ @return: True if the python is a 2.7
+ @rtype: bool
+ '''
+ if path is None:
+ pythonversion = StrictVersion(sysconfig.get_python_version())
+ else:
+ command = """'%s' -c 'from distutils import sysconfig; """ \
+ """print sysconfig.get_python_version()'""" % path
+
+ p = subprocess.Popen(command,
+ shell=True,
+ stdout=subprocess.PIPE)
+ pythonversion = StrictVersion(p.communicate()[0])
+
+ return pythonversion >=StrictVersion("2.7") \
+ and pythonversion < StrictVersion("2.8")
+
+
+
+def is_mac_system_python(path=None):
+ '''
+ Checks on a mac platform if the python is the original
+ python provided with the systems
+ .
+
+ @param path: if None consider the running python
+ otherwise the python pointed by the path
+
+ @return: True if the python is the system one
+ @rtype: bool
+ '''
+ if path is None:
+ path = sys.executable
+
+ p1 = '/System/Library/Frameworks/Python.framework'
+ p2 = '/usr/bin'
+
+ return path[0:len(p1)]==p1 or \
+ path[0:len(p2)]==p2
+
+
+def is_a_virtualenv_python(path=None):
+ '''
+ Check if the python is belonging a virtualenv
+
+ @param path: the path pointing to the python executable.
+ if path is None then the running python is
+ considered.
+ @param path: str or None
+
+ @return: True if the python belongs a virtualenv
+ False otherwise
+ @rtype: bool
+
+ '''
+ if path is None:
+ rep = hasattr(sys, 'real_prefix')
+ else:
+ command = """'%s' -c 'import sys; print hasattr(sys,"real_prefix")'""" % path
+ p = subprocess.Popen(command,
+ shell=True,
+ stdout=subprocess.PIPE)
+ rep = eval(p.communicate()[0])
+
+ return rep
+
+
+def which_virtualenv(path=None,full=False):
+ '''
+ Returns the name of the virtualenv.
+ @param path: the path to a python binary or None
+ if you want to consider the running python
+ @type path: str or None
+
+ @param full: if set to True, returns the absolute path,
+ otherwise only return a simple directory name
+ @type full: bool
+
+ @return: the virtual environment name or None if the
+ path does not belong a virtualenv
+ @rtype: str or None
+ '''
+ if path is None:
+ path = sys.executable
+
+ if is_a_virtualenv_python(path):
+ parts = path.split(os.sep)
+ try:
+ if full:
+ rep = os.sep.join(parts[0:parts.index('bin')])
+ rep = os.path.realpath(rep)
+ else:
+ rep = parts[parts.index('bin')-1]
+ except ValueError:
+ rep = None
+ else:
+ rep=None
+
+ return rep
+
+def is_good_python27(path = None):
+ '''
+ Checks if the python is usable for the package install.
+
+ Actually the python must be a 2.7 version and not being the
+ default python included with the system on a mac.
+
+ @param path: the path to a python binary or None
+ if you want to consider the running python
+ @type path: str or None
+
+ @return: True if the python is ok
+ False otherwise
+ @rtype: bool
+
+ '''
+ rep = is_python27(path) and \
+ (not is_mac_system() or \
+ not is_mac_system_python(path) \
+ )
+
+ return rep
+
+def lookfor_good_python27():
+ exe = []
+ if not is_windows_system():
+ paths = os.environ['PATH'].split(os.pathsep)
+ for p in paths:
+ candidates = glob.glob(os.path.join(p,'python2.7')) + \
+ glob.glob(os.path.join(p,'python2')) + \
+ glob.glob(os.path.join(p,'python'))
+ pexe = []
+ for e in candidates:
+ if os.path.islink(e):
+ e = os.path.realpath(e)
+ if os.path.isfile(e) and \
+ os.access(e, os.X_OK) and \
+ is_good_python27(e) and \
+ not is_a_virtualenv_python(e):
+ pexe.append(e)
+ exe.extend(set(pexe))
+
+ return exe
+
diff --git a/distutils.ext/obidistutils/serenity/checksystem.py b/distutils.ext/obidistutils/serenity/checksystem.py
new file mode 100644
index 0000000..a60a4e1
--- /dev/null
+++ b/distutils.ext/obidistutils/serenity/checksystem.py
@@ -0,0 +1,19 @@
+'''
+Created on 2 oct. 2014
+
+ at author: coissac
+'''
+
+from distutils import util
+from distutils import log
+
+def is_mac_system():
+ platform = util.get_platform().split('-')[0]
+ if platform=='macosx':
+ log.info('You are running on a Mac platform')
+ return platform=='macosx'
+
+def is_windows_system():
+ platform = util.get_platform().split('-')[0]
+
+ return platform=='Windows'
diff --git a/distutils.ext/obidistutils/serenity/getcython.py b/distutils.ext/obidistutils/serenity/getcython.py
new file mode 100644
index 0000000..6453ddf
--- /dev/null
+++ b/distutils.ext/obidistutils/serenity/getcython.py
@@ -0,0 +1,72 @@
+'''
+Created on 2 oct. 2014
+
+ at author: coissac
+'''
+
+import imp
+import importlib
+
+from distutils.errors import DistutilsError
+from distutils.version import StrictVersion
+from distutils import log
+
+from obidistutils.serenity.globals import local_cython # @UnusedImport
+
+from obidistutils.serenity.checkpip import get_a_pip_module
+
+from obidistutils.serenity.checkpackage import get_package_requirement
+from obidistutils.serenity.checkpackage import parse_package_requirement
+from obidistutils.serenity.checkpackage import is_installed
+from obidistutils.serenity.checkpackage import pip_install_package
+
+from obidistutils.serenity.util import get_serenity_dir
+
+
+def get_a_cython_module(pip=None):
+
+ global local_cython
+
+ if not local_cython:
+ if pip is None:
+ pip = get_a_pip_module()
+
+
+ cython_req = get_package_requirement('Cython',pip)
+ if cython_req is None:
+ cython_req='Cython'
+
+ requirement_project,requirement_relation,minversion = parse_package_requirement(cython_req) # @UnusedVariable
+
+
+
+ if cython_req is None or not is_installed(cython_req, pip):
+ tmpdir = get_serenity_dir()
+
+ ok = pip_install_package(cython_req,directory=tmpdir,pip=pip)
+
+ log.debug('temp install dir : %s' % tmpdir)
+
+ if ok!=0:
+ raise DistutilsError, "I cannot install a cython package"
+
+ f, filename, description = imp.find_module('Cython', [tmpdir])
+
+ cythonmodule = imp.load_module('Cython', f, filename, description)
+
+ if minversion is not None:
+ assert StrictVersion(cythonmodule.__version__) >= minversion, \
+ "Unable to find suitable version of cython get %s instead of %s" % (cythonmodule.__version__,
+ minversion)
+
+ else:
+ cythonmodule = importlib.import_module('Cython')
+
+ local_cython.append(cythonmodule)
+
+ return local_cython[0]
+
+
+
+
+
diff --git a/distutils.ext/obidistutils/serenity/globals.py b/distutils.ext/obidistutils/serenity/globals.py
new file mode 100644
index 0000000..39ad3cd
--- /dev/null
+++ b/distutils.ext/obidistutils/serenity/globals.py
@@ -0,0 +1,15 @@
+'''
+Created on 2 oct. 2014
+
+ at author: coissac
+'''
+
+# Default minimum pip version used by the pip checking functions
+PIP_MINVERSION = '1.5'
+
+
+# Lists shared between the serenity modules. The modules importing them
+# fill them in place (append / extend), so every module sees the same
+# content.
+
+# copy of sys.argv stored by util.save_argv
+saved_args=[]
+# holds the temporary directory created by util.get_serenity_dir
+tmpdir=[]
+# holds the pip module loaded by checkpip.get_a_pip_module
+local_pip=[]
+# holds the virtualenv module loaded by virtual.get_a_virtualenv_module
+local_virtualenv=[]
+# holds the Cython module loaded by getcython.get_a_cython_module
+local_cython=[]
+# NOTE(review): not used by the serenity modules visible here - confirm usage
+local_serenity=[]
diff --git a/distutils.ext/obidistutils/serenity/rerun.py b/distutils.ext/obidistutils/serenity/rerun.py
new file mode 100644
index 0000000..d33a7de
--- /dev/null
+++ b/distutils.ext/obidistutils/serenity/rerun.py
@@ -0,0 +1,60 @@
+'''
+Created on 2 oct. 2014
+
+ at author: coissac
+'''
+
+import sys
+import os
+
+from distutils import log
+from distutils.errors import DistutilsError
+
+
+from obidistutils.serenity.globals import saved_args
+from obidistutils.serenity.checkpython import is_good_python27, \
+ lookfor_good_python27
+
+
+def rerun_with_anothe_python(path, fork=False):
+
+ if saved_args:
+ args = saved_args
+ else:
+ args = list(sys.argv)
+
+
+ assert is_good_python27(path), \
+ 'the selected python is not adapted to the installation of this package'
+
+ args.insert(0, path)
+
+ sys.stderr.flush()
+ sys.stdout.flush()
+
+ if fork:
+ log.info('Forking a new install process')
+ os.system(' '.join(list(args)))
+ log.info('External process ended')
+ sys.exit(0)
+ else:
+ os.execv(path,list(args))
+
+def enforce_good_python():
+ if is_good_python27():
+ return True
+
+ goodpython = lookfor_good_python27()
+
+ if not goodpython:
+ raise DistutilsError,'No good python identified on your system'
+
+ goodpython=goodpython[0]
+
+ log.warn("========================================")
+ log.warn("")
+ log.warn(" Switching to python : %s" % goodpython)
+ log.warn("")
+ log.warn("========================================")
+
+ rerun_with_anothe_python(goodpython)
diff --git a/distutils.ext/obidistutils/serenity/snake.py b/distutils.ext/obidistutils/serenity/snake.py
new file mode 100644
index 0000000..79c1f39
--- /dev/null
+++ b/distutils.ext/obidistutils/serenity/snake.py
@@ -0,0 +1,35 @@
+'''
+Created on 2 oct. 2014
+
+ at author: coissac
+'''
+
+snake ="""
+
+ ___
+ ,'._,`.
+ (-.___.-)
+ (-.___.-)
+ `-.___.-'
+ (( @ @| . __
+ \ ` | ,\ |`. @| | | _.-._
+ __`.`=-=mm===mm:: | | |`. | | | ,'=` '=`.
+ ( `-'|:/ /:/ `/ @| | | |, @| @| /---)W(---\
+ \ \ / / / / @| | ' (----| |----) ,~
+ |\ \ / /| / / @| \---| |---/ |
+ | \ V /||/ / `.-| |-,' |
+ | `-' |V / \| |/ @'
+ | , |-' __| |__
+ | .;: _,-. ,--""..| |..""--.
+ ;;:::' " ) (`--::__|_|__::--')
+ ,-" _, / \`--...___...--'/
+ ( -:--'/ / /`--...___...--'\
+ "-._ `"'._/ /`---...___...---'\
+ "-._ "---. (`---....___....---')
+ .' ",._ ,' ) |`---....___....---'|
+ /`._| `| | (`---....___....---')
+ ( \ | / \`---...___...---'/
+ `. `, ^"" `:--...___...--;'
+ `.,' hh `-._______.-'
+
+"""
\ No newline at end of file
diff --git a/distutils.ext/obidistutils/serenity/util.py b/distutils.ext/obidistutils/serenity/util.py
new file mode 100644
index 0000000..14c0283
--- /dev/null
+++ b/distutils.ext/obidistutils/serenity/util.py
@@ -0,0 +1,27 @@
+'''
+Created on 2 oct. 2014
+
+ at author: coissac
+'''
+
+import sys
+import tempfile
+
+
+from obidistutils.serenity.globals import tmpdir # @UnusedImport
+from obidistutils.serenity.globals import saved_args # @UnusedImport
+
+def get_serenity_dir():
+ global tmpdir
+
+ if not tmpdir:
+ tmpdir.append(tempfile.mkdtemp())
+ return tmpdir[0]
+
+def save_argv():
+ global saved_args
+
+ del saved_args[:]
+ saved_args.extend(list(sys.argv))
+
+
diff --git a/distutils.ext/obidistutils/serenity/virtual.py b/distutils.ext/obidistutils/serenity/virtual.py
new file mode 100644
index 0000000..e8e95fc
--- /dev/null
+++ b/distutils.ext/obidistutils/serenity/virtual.py
@@ -0,0 +1,133 @@
+'''
+Created on 2 oct. 2014
+
+ at author: coissac
+'''
+
+import imp
+import importlib
+import os
+import sys
+
+from distutils.errors import DistutilsError
+from distutils.version import StrictVersion
+from distutils import log
+
+from obidistutils.serenity.globals import PIP_MINVERSION, \
+ local_virtualenv # @UnusedImport
+
+from obidistutils.serenity.checkpip import get_a_pip_module
+
+from obidistutils.serenity.checkpackage import get_package_requirement,\
+ install_requirements
+from obidistutils.serenity.checkpackage import parse_package_requirement
+from obidistutils.serenity.checkpackage import is_installed
+from obidistutils.serenity.checkpackage import pip_install_package
+
+from obidistutils.serenity.checkpython import is_a_virtualenv_python
+from obidistutils.serenity.checkpython import which_virtualenv
+from obidistutils.serenity.checkpython import is_good_python27
+
+from obidistutils.serenity.util import get_serenity_dir
+
+
+def get_a_virtualenv_module(pip=None):
+
+ global local_virtualenv
+
+ if not local_virtualenv:
+ if pip is None:
+ pip = get_a_pip_module()
+
+
+ virtualenv_req = get_package_requirement('virtualenv',pip)
+ if virtualenv_req is None:
+ virtualenv_req='virtualenv'
+
+ requirement_project,requirement_relation,minversion = parse_package_requirement(virtualenv_req) # @UnusedVariable
+
+ if virtualenv_req is None or not is_installed(virtualenv_req, pip):
+ tmpdir = get_serenity_dir()
+
+ ok = pip_install_package(virtualenv_req,directory=tmpdir,pip=pip)
+
+ log.debug('temp install dir : %s' % tmpdir)
+
+ if ok!=0:
+ raise DistutilsError, "I cannot install a virtualenv package"
+
+ f, filename, description = imp.find_module('virtualenv', [tmpdir])
+
+ vitualenvmodule = imp.load_module('virtualenv', f, filename, description)
+
+ if minversion is not None:
+ assert StrictVersion(vitualenvmodule.__version__) >= minversion, \
+ "Unable to find suitable version of virtualenv get %s instead of %s" % (vitualenvmodule.__version__,
+ minversion)
+
+ else:
+ vitualenvmodule = importlib.import_module('virtualenv')
+
+ local_virtualenv.append(vitualenvmodule)
+
+ return local_virtualenv[0]
+
+
+
+
+
+def serenity_virtualenv(envname,package,version,minversion=PIP_MINVERSION,pip=None):
+
+
+ #
+ # Checks if we are already running under the good virtualenv
+ #
+ if is_a_virtualenv_python():
+ ve = which_virtualenv(full=True)
+ if ve == os.path.realpath(envname) and is_good_python27():
+ return sys.executable
+
+ #
+ # We are not in the good virtualenv
+ #
+
+ if pip is None:
+ pip = get_a_pip_module(minversion)
+
+ #
+ # Check if the virtualenv exist
+ #
+
+ python = None
+
+ if os.path.isdir(envname):
+ python = os.path.join(envname,'bin','python')
+ ok = (is_good_python27(python) and
+ is_a_virtualenv_python(python))
+
+ #
+ # The virtualenv already exist but it is not ok
+ #
+ if not ok:
+ raise DistutilsError, "A virtualenv %s already exists but not with the required python"
+
+ else:
+ ok = False
+
+
+ #
+ # Creates a new virtualenv
+ #
+ if not ok:
+ virtualenv = get_a_virtualenv_module(pip)
+
+ if virtualenv is not None:
+ virtualenv.create_environment(envname)
+
+ # check the newly created virtualenv
+ return serenity_virtualenv(envname,package,version,minversion,pip)
+
+ return os.path.realpath(python)
+
+
+
\ No newline at end of file
diff --git a/distutils.ext/src/littlebigman.c b/distutils.ext/src/littlebigman.c
new file mode 100644
index 0000000..5e2ea2a
--- /dev/null
+++ b/distutils.ext/src/littlebigman.c
@@ -0,0 +1,24 @@
+/*
+ * littlebigman.c
+ *
+ * Created on: 11 juil. 2012
+ * Author: coissac
+ */
+
+#include<stdio.h>
+
+int main(int argc, char *argv[])
+{
+    /* The same storage seen as an int and as its bytes, so the order
+     * of the bytes in memory can be inspected.
+     * NOTE(review): assumes a 4 byte int - confirm for the targets. */
+    union {
+        int  entier;
+        char caractere[4];
+    } probe;
+    const char *flag;
+
+    probe.entier = 0x01020304;
+
+    /* Most significant byte found in the last position: little endian.
+     * The result is printed as a compiler flag, without newline. */
+    flag = (probe.caractere[3] == 1) ? "-DLITTLE_END" : "-DBIG_END";
+    fputs(flag, stdout);
+
+    return 0;
+}
diff --git a/distutils.ext/src/pidname.c b/distutils.ext/src/pidname.c
new file mode 100644
index 0000000..ff61eb4
--- /dev/null
+++ b/distutils.ext/src/pidname.c
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <libproc.h>
+
+/*
+ * Prints the path of the executable of the process whose pid is given
+ * as first argument. On failure the error is reported on stderr.
+ * Nothing is done without argument. The exit status is always 0.
+ */
+int main (int argc, char* argv[])
+{
+    pid_t pid; int ret;
+    int saved_errno;
+    char pathbuf[PROC_PIDPATHINFO_MAXSIZE];
+
+    if ( argc > 1 ) {
+        pid = (pid_t) atoi(argv[1]);
+        ret = proc_pidpath (pid, pathbuf, sizeof(pathbuf));
+        if ( ret <= 0 ) {
+            /* Keep the error code now: the first fprintf below is
+             * allowed to change errno before it is reported. */
+            saved_errno = errno;
+            fprintf(stderr, "PID %d: proc_pidpath ();\n", pid);
+            fprintf(stderr, " %s\n", strerror(saved_errno));
+        } else {
+            printf("proc %d: %s\n", pid, pathbuf);
+        }
+    }
+
+    return 0;
+}
diff --git a/doc/sphinx/Makefile b/doc/sphinx/Makefile
new file mode 100644
index 0000000..1464560
--- /dev/null
+++ b/doc/sphinx/Makefile
@@ -0,0 +1,100 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+#SPHINXBUILD = /Library/Frameworks/Python.framework/Versions/2.7/bin/sphinx-build
+PAPER =
+BUILDDIR = build
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+# epub and man are real targets of this file: they are declared phony
+# like the others so that a file or directory with the same name
+# cannot prevent them from running.
+.PHONY: help clean html epub dirhtml pickle json man htmlhelp qthelp latex changes linkcheck doctest
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo " html to make standalone HTML files"
+	@echo " epub to make an e-Pub file"
+	@echo " dirhtml to make HTML files named index.html in directories"
+	@echo " pickle to make pickle files"
+	@echo " json to make JSON files"
+	@echo " man to make manual pages"
+	@echo " htmlhelp to make HTML files and a HTML help project"
+	@echo " qthelp to make HTML files and a qthelp project"
+	@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo " changes to make an overview of all changed/added/deprecated items"
+	@echo " linkcheck to check all external links for integrity"
+	@echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ -rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The e-Pub pages are in $(BUILDDIR)/epub."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/OBITools.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/OBITools.qhc"
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
+ "run these through (pdf)latex."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
diff --git a/doc/sphinx/make.bat b/doc/sphinx/make.bat
new file mode 100644
index 0000000..55bbd89
--- /dev/null
+++ b/doc/sphinx/make.bat
@@ -0,0 +1,113 @@
+ at ECHO OFF
+
+REM Command file for Sphinx documentation
+
+set SPHINXBUILD=sphinx-build
+set BUILDDIR=build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source
+if NOT "%PAPER%" == "" (
+ set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+ :help
+ echo.Please use `make ^<target^>` where ^<target^> is one of
+ echo. html to make standalone HTML files
+ echo. dirhtml to make HTML files named index.html in directories
+ echo. pickle to make pickle files
+ echo. json to make JSON files
+ echo. htmlhelp to make HTML files and a HTML help project
+ echo. qthelp to make HTML files and a qthelp project
+ echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+ echo. changes to make an overview over all changed/added/deprecated items
+ echo. linkcheck to check all external links for integrity
+ echo. doctest to run all doctests embedded in the documentation if enabled
+ goto end
+)
+
+if "%1" == "clean" (
+ for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+ del /q /s %BUILDDIR%\*
+ goto end
+)
+
+if "%1" == "html" (
+ %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+ goto end
+)
+
+if "%1" == "dirhtml" (
+ %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+ echo.
+ echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+ goto end
+)
+
+if "%1" == "pickle" (
+ %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+ echo.
+ echo.Build finished; now you can process the pickle files.
+ goto end
+)
+
+if "%1" == "json" (
+ %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+ echo.
+ echo.Build finished; now you can process the JSON files.
+ goto end
+)
+
+if "%1" == "htmlhelp" (
+ %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+ echo.
+ echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+ goto end
+)
+
+REM Builds the qthelp project and tells how to use the result.
+REM The collection file shown to the user has the .qhc extension.
+if "%1" == "qthelp" (
+	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+	echo.
+	echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\OBITools.qhcp
+	echo.To view the help file:
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\OBITools.qhc
+	goto end
+)
+
+if "%1" == "latex" (
+ %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+ echo.
+ echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+ goto end
+)
+
+if "%1" == "changes" (
+ %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+ echo.
+ echo.The overview file is in %BUILDDIR%/changes.
+ goto end
+)
+
+if "%1" == "linkcheck" (
+ %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+ echo.
+ echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+ goto end
+)
+
+if "%1" == "doctest" (
+ %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+ echo.
+ echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+ goto end
+)
+
+:end
diff --git a/doc/sphinx/source/annotations.rst b/doc/sphinx/source/annotations.rst
new file mode 100644
index 0000000..b07bf10
--- /dev/null
+++ b/doc/sphinx/source/annotations.rst
@@ -0,0 +1,11 @@
+Sequence annotations
+====================
+
+
+.. toctree::
+ :maxdepth: 2
+
+ scripts/ecotag
+ scripts/obiannotate
+ scripts/obiaddtaxids
+
diff --git a/doc/sphinx/source/attributes.rst b/doc/sphinx/source/attributes.rst
new file mode 100644
index 0000000..595224f
--- /dev/null
+++ b/doc/sphinx/source/attributes.rst
@@ -0,0 +1,128 @@
+The extended OBITools fasta and fastq format
+--------------------------------------------
+.. _obitools-fasta:
+
+The *extended OBITools Fasta format* is a strict :doc:`fasta format file <fasta>`.
+A file in *extended OBITools Fasta format* can be read by all programs
+reading fasta files.
+
+The only difference between the standard and the extended fasta formats is the structure of the title
+line. For OBITools, the title line is divided into three parts:
+
+ - Seqid : the sequence identifier
+ - key=value; : a set of key/value pairs
+ - the sequence definition
+
+
+::
+
+ >my_sequence taxid=3456; direct=True; sample=A354; this is my pretty sequence
+ ACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGT
+ GTGCTGACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTGTTT
+ AACGACGTTGCAGTACGTTGCAGT
+
+Following these rules, the title line can be parsed :
+
+ - The sequence identifier of this sequence is : *my_sequence*
+ - Three keys are assigned to this sequence :
+ - Key *taxid* with value *3456*
+ - Key *direct* with value *True*
+ - Key *sample* with value *A354*
+ - The definition of this sequence is *this is my pretty sequence*
+
+Values can be any valid python expression. If a key value cannot be evaluated as
+a python expression, it is then assumed to be a simple string. Following this rule,
+the taxid value is considered as an integer value, the direct value as a boolean, and the sample
+value, which is not a valid python expression, is considered as a string value.
+
+
+Names reserved for attributes
+.............................
+
+The following attribute names are created by some obitools programs and used by others.
+They have a special meaning, so we recommend not using them with a different meaning.
+
+Contents:
+
+.. toctree::
+ :maxdepth: 2
+
+
+ attributes/ali_dir
+ attributes/ali_length
+ attributes/avg_quality
+ attributes/best_match
+ attributes/best_identity
+ attributes/class
+ attributes/cluster
+ attributes/complemented
+ attributes/count
+ attributes/cut
+ attributes/direction
+ attributes/distance
+ attributes/error
+ attributes/experiment
+ attributes/family
+ attributes/family_name
+ attributes/forward_error
+ attributes/forward_match
+ attributes/forward_primer
+ attributes/forward_score
+ attributes/forward_tag
+ attributes/forward_tm
+ attributes/genus
+ attributes/genus_name
+ attributes/head_quality
+ attributes/id_status
+ attributes/merged_star
+ attributes/merged
+ attributes/mid_quality
+ attributes/mode
+ attributes/obiclean_cluster
+ attributes/obiclean_count
+ attributes/obiclean_head
+ attributes/obiclean_headcount
+ attributes/obiclean_internalcount
+ attributes/obiclean_samplecount
+ attributes/obiclean_singletoncount
+ attributes/obiclean_status
+
+ attributes/occurrence
+ attributes/order
+ attributes/order_name
+ attributes/pairend_limit
+ attributes/partial
+ attributes/rank
+ attributes/reverse_error
+ attributes/reverse_match
+ attributes/reverse_primer
+ attributes/reverse_score
+ attributes/reverse_tag
+ attributes/reverse_tm
+ attributes/sample
+ attributes/scientific_name
+ attributes/score
+ attributes/score_norm
+ attributes/select
+ attributes/seq_ab_match
+ attributes/seq_a_single
+ attributes/seq_a_mismatch
+ attributes/seq_a_deletion
+ attributes/seq_a_insertion
+ attributes/seq_b_single
+ attributes/seq_b_mismatch
+ attributes/seq_b_deletion
+ attributes/seq_b_insertion
+ attributes/seq_length
+ attributes/seq_length_ori
+ attributes/seq_rank
+ attributes/sminL
+ attributes/sminR
+ attributes/species
+ attributes/species_list
+ attributes/species_name
+ attributes/status
+ attributes/strand
+ attributes/tail_quality
+ attributes/taxid
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/ali_dir.rst b/doc/sphinx/source/attributes/ali_dir.rst
new file mode 100644
index 0000000..99c8fde
--- /dev/null
+++ b/doc/sphinx/source/attributes/ali_dir.rst
@@ -0,0 +1,9 @@
+ali_dir
+=======
+
+ Either 'left' or 'right'. Indicates the way the alignment has been done, and especially where
+ the overlapping part is located on the forward read (either its 'right' part, or its
+ 'left' part).
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/ali_length.rst b/doc/sphinx/source/attributes/ali_length.rst
new file mode 100644
index 0000000..1d98f28
--- /dev/null
+++ b/doc/sphinx/source/attributes/ali_length.rst
@@ -0,0 +1,9 @@
+ali_length
+==========
+
+ An integer value indicating the length of the alignment between the two
+ paired-end reads.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
+
diff --git a/doc/sphinx/source/attributes/avg_quality.rst b/doc/sphinx/source/attributes/avg_quality.rst
new file mode 100644
index 0000000..60ec6eb
--- /dev/null
+++ b/doc/sphinx/source/attributes/avg_quality.rst
@@ -0,0 +1,18 @@
+avg_quality
+===========
+
+ A float value indicating the average quality of the raw sequence.
+
+ .. note::
+
+ This tag can be used to investigate why sequences have not been assigned to any sample by
+ :doc:`ngsfilter <../scripts/ngsfilter>`
+
+ .. seealso::
+
+ - :doc:`head_quality <./head_quality>`
+ - :doc:`mid_quality <./mid_quality>`
+ - :doc:`tail_quality <./tail_quality>`
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
diff --git a/doc/sphinx/source/attributes/best_identity.rst b/doc/sphinx/source/attributes/best_identity.rst
new file mode 100644
index 0000000..0bfb9dd
--- /dev/null
+++ b/doc/sphinx/source/attributes/best_identity.rst
@@ -0,0 +1,11 @@
+best_identity
+=============
+
+ A float value indicating the alignment score of the best match in the reference database.
+
+ .. seealso::
+
+ - :doc:`best_match <./best_match>`
+
+ Attribute added by the program:
+ - :doc:`ecotag <../scripts/ecotag>`
diff --git a/doc/sphinx/source/attributes/best_match.rst b/doc/sphinx/source/attributes/best_match.rst
new file mode 100644
index 0000000..cc847a0
--- /dev/null
+++ b/doc/sphinx/source/attributes/best_match.rst
@@ -0,0 +1,11 @@
+best_match
+==========
+
+ The sequence *id* of the best match in the reference database.
+
+ .. seealso::
+
+ - :doc:`best_identity <./best_identity>`
+
+ Attribute added by the program:
+ - :doc:`ecotag <../scripts/ecotag>`
diff --git a/doc/sphinx/source/attributes/class.rst b/doc/sphinx/source/attributes/class.rst
new file mode 100644
index 0000000..ef718c4
--- /dev/null
+++ b/doc/sphinx/source/attributes/class.rst
@@ -0,0 +1,11 @@
+class
+=====
+
+ A string value indicating the group (more exactly :doc:`sample <./sample>` or
+ :doc:`taxid <./taxid>`) in which the :doc:`obiselect <../scripts/obiselect>`
+ program will select sequences.
+
+ Attribute added by the programs:
+ - :doc:`obiselect <../scripts/obiselect>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/cluster.rst b/doc/sphinx/source/attributes/cluster.rst
new file mode 100644
index 0000000..55e196f
--- /dev/null
+++ b/doc/sphinx/source/attributes/cluster.rst
@@ -0,0 +1,8 @@
+cluster
+=======
+
+ An integer value indicating the cluster this sequence belongs to.
+
+
+ Attribute added by the programs:
+ - :doc:`obiannotate <../scripts/obiannotate>`
diff --git a/doc/sphinx/source/attributes/complemented.rst b/doc/sphinx/source/attributes/complemented.rst
new file mode 100644
index 0000000..1788000
--- /dev/null
+++ b/doc/sphinx/source/attributes/complemented.rst
@@ -0,0 +1,9 @@
+complemented
+============
+
+ A boolean value indicating whether the sequence has been complemented before
+ tag and primer identification.
+
+ Attribute added by the program:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
diff --git a/doc/sphinx/source/attributes/count.rst b/doc/sphinx/source/attributes/count.rst
new file mode 100644
index 0000000..1ed31a2
--- /dev/null
+++ b/doc/sphinx/source/attributes/count.rst
@@ -0,0 +1,16 @@
+count
+=====
+
+ An integer value indicating how many times this sequence occurs in the dataset.
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obisample <../scripts/obisample>`
+
+ Attribute used by the programs:
+ - :doc:`ecotag <../scripts/ecotag>`
+ - :doc:`ecotaxspecificity <../scripts/ecotaxspecificity>`
+ - :doc:`obiclean <../scripts/obiclean>`
+ - :doc:`obicount <../scripts/obicount>`
+ - :doc:`obisample <../scripts/obisample>`
+ - :doc:`obistat <../scripts/obistat>`
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/cut.rst b/doc/sphinx/source/attributes/cut.rst
new file mode 100644
index 0000000..33287de
--- /dev/null
+++ b/doc/sphinx/source/attributes/cut.rst
@@ -0,0 +1,11 @@
+cut
+===
+
+ A list with two integers indicating the beginning and end of the barcode
+ sequence itself within the raw sequence.
+
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
+
diff --git a/doc/sphinx/source/attributes/direction.rst b/doc/sphinx/source/attributes/direction.rst
new file mode 100644
index 0000000..2093750
--- /dev/null
+++ b/doc/sphinx/source/attributes/direction.rst
@@ -0,0 +1,8 @@
+direction
+=========
+
+ Either 'forward' or 'reverse'. Indicates if the primers have been identified on the 'forward' or
+ 'reverse' strand.
+
+ Attribute added by the program:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
diff --git a/doc/sphinx/source/attributes/distance.rst b/doc/sphinx/source/attributes/distance.rst
new file mode 100644
index 0000000..d20161a
--- /dev/null
+++ b/doc/sphinx/source/attributes/distance.rst
@@ -0,0 +1,11 @@
+distance
+========
+
+ The distance between the optimal value and the value computed for this sequence record.
+
+ .. seealso::
+
+ - :doc:`select <./select>`
+
+ Attribute added by the programs:
+ - :doc:`obiselect <../scripts/obiselect>`
diff --git a/doc/sphinx/source/attributes/error.rst b/doc/sphinx/source/attributes/error.rst
new file mode 100644
index 0000000..da8d6a6
--- /dev/null
+++ b/doc/sphinx/source/attributes/error.rst
@@ -0,0 +1,11 @@
+error
+=====
+
+ An integer value corresponding to the number of mismatches between each
+ primer and its match in the sequence.
+
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
+
diff --git a/doc/sphinx/source/attributes/experiment.rst b/doc/sphinx/source/attributes/experiment.rst
new file mode 100644
index 0000000..df49872
--- /dev/null
+++ b/doc/sphinx/source/attributes/experiment.rst
@@ -0,0 +1,9 @@
+experiment
+==========
+
+ A string value indicating the name of the experiment the sequence and sample
+ belong to. This name is mentioned in the first column of the :doc:`ngsfilter <../scripts/ngsfilter>` samples description file.
+
+ Attribute added by the program:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
diff --git a/doc/sphinx/source/attributes/family.rst b/doc/sphinx/source/attributes/family.rst
new file mode 100644
index 0000000..c02ddf1
--- /dev/null
+++ b/doc/sphinx/source/attributes/family.rst
@@ -0,0 +1,30 @@
+family
+======
+
+ An integer value corresponding to the family of the :doc:`taxid <./taxid>` stored into the
+ :doc:`taxid <taxid>` attribute. If the family is not defined for this :doc:`taxid <./taxid>`,
+ this value is *None*.
+
+ .. warning:: This taxonomic information is just added to the sequence for the end-user
+ convenience and not used by other ``OBITools`` programs as taxonomic information.
+ Only the taxonomic information included in the :doc:`taxid <taxid>`
+ attribute is used as taxonomic annotation.
+
+ .. seealso::
+
+ - :doc:`taxid <./taxid>`
+ - :doc:`scientific_name <./scientific_name>`
+ - :doc:`family_name <./family_name>`
+ - :doc:`genus <./genus>`
+ - :doc:`genus_name <./genus_name>`
+ - :doc:`order <./order>`
+ - :doc:`order_name <./order_name>`
+ - :doc:`species <./species>`
+ - :doc:`species_name <./species_name>`
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/family_name.rst b/doc/sphinx/source/attributes/family_name.rst
new file mode 100644
index 0000000..27bcd57
--- /dev/null
+++ b/doc/sphinx/source/attributes/family_name.rst
@@ -0,0 +1,29 @@
+family_name
+===========
+
+ A string value indicating the family name of the :doc:`taxid <./taxid>` stored into the
+ :doc:`taxid <taxid>`. If the family is not defined for this :doc:`taxid <./taxid>`,
+ this value is *None*.
+
+ .. warning:: This taxonomic information is just added to the sequence for the end-user
+ convenience and not used by other ``OBITools`` programs as taxonomic information.
+ Only the taxonomic information included in the :doc:`taxid <taxid>`
+ attribute is used as taxonomic annotation.
+
+ .. seealso::
+
+ - :doc:`taxid <./taxid>`
+ - :doc:`scientific_name <./scientific_name>`
+ - :doc:`family <./family>`
+ - :doc:`genus <./genus>`
+ - :doc:`genus_name <./genus_name>`
+ - :doc:`order <./order>`
+ - :doc:`order_name <./order_name>`
+ - :doc:`species <./species>`
+ - :doc:`species_name <./species_name>`
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/forward_error.rst b/doc/sphinx/source/attributes/forward_error.rst
new file mode 100644
index 0000000..28e0c3b
--- /dev/null
+++ b/doc/sphinx/source/attributes/forward_error.rst
@@ -0,0 +1,13 @@
+forward_error
+=============
+
+ An integer value indicating the number of mismatches between the forward
+ primer and its match on the sequence under consideration.
+
+ .. seealso::
+
+ - :doc:`reverse_error <./reverse_error>`
+
+ Attribute added by the program:
+ - :doc:`obiconvert <../scripts/obiconvert>`
+
diff --git a/doc/sphinx/source/attributes/forward_match.rst b/doc/sphinx/source/attributes/forward_match.rst
new file mode 100644
index 0000000..1c046e0
--- /dev/null
+++ b/doc/sphinx/source/attributes/forward_match.rst
@@ -0,0 +1,15 @@
+forward_match
+=============
+
+ A string value corresponding to the forward primer match used to identify
+ the sequence.
+
+ .. seealso::
+
+ - :doc:`reverse_match <./reverse_match>`
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+ - :doc:`obiconvert <../scripts/obiconvert>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/forward_primer.rst b/doc/sphinx/source/attributes/forward_primer.rst
new file mode 100644
index 0000000..4855d35
--- /dev/null
+++ b/doc/sphinx/source/attributes/forward_primer.rst
@@ -0,0 +1,12 @@
+forward_primer
+==============
+
+ A string value indicating the forward primer used to obtain the sequence.
+
+ .. seealso::
+
+ - :doc:`reverse_primer <./reverse_primer>`
+
+ Attribute added by the program:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
diff --git a/doc/sphinx/source/attributes/forward_score.rst b/doc/sphinx/source/attributes/forward_score.rst
new file mode 100644
index 0000000..0c334a6
--- /dev/null
+++ b/doc/sphinx/source/attributes/forward_score.rst
@@ -0,0 +1,11 @@
+forward_score
+=============
+
+ A real value indicating the score of the alignment of the 5' primer against the sequence.
+
+ .. seealso::
+
+ - :doc:`reverse_score <./reverse_score>`
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
diff --git a/doc/sphinx/source/attributes/forward_tag.rst b/doc/sphinx/source/attributes/forward_tag.rst
new file mode 100644
index 0000000..fe28513
--- /dev/null
+++ b/doc/sphinx/source/attributes/forward_tag.rst
@@ -0,0 +1,14 @@
+forward_tag
+===========
+
+ A string value corresponding to the individual tag attached in 5' of the
+ forward primer and used to assign the sequence to a sample.
+
+ .. seealso::
+
+ - :doc:`reverse_tag <./reverse_tag>`
+
+ Attribute added by the program:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/forward_tm.rst b/doc/sphinx/source/attributes/forward_tm.rst
new file mode 100644
index 0000000..5ac68fe
--- /dev/null
+++ b/doc/sphinx/source/attributes/forward_tm.rst
@@ -0,0 +1,13 @@
+forward_tm
+==========
+
+ A float value indicating the *Tm* of the forward primer match on the
+ sequence under consideration.
+
+ .. seealso::
+
+ - :doc:`reverse_tm <./reverse_tm>`
+
+ Attribute added by the program:
+ - :doc:`obiconvert <../scripts/obiconvert>`
+
diff --git a/doc/sphinx/source/attributes/genus.rst b/doc/sphinx/source/attributes/genus.rst
new file mode 100644
index 0000000..2c7929b
--- /dev/null
+++ b/doc/sphinx/source/attributes/genus.rst
@@ -0,0 +1,33 @@
+genus
+=====
+
+ An integer value corresponding to the genus of the :doc:`taxid <./taxid>` stored into the
+ :doc:`taxid <taxid>` attribute. If the genus is not defined for this :doc:`taxid <./taxid>`,
+ this value is *None*.
+
+
+
+ .. warning:: This taxonomic information is just added to the sequence for the end-user
+ convenience and not used by other ``OBITools`` programs as taxonomic information.
+ Only the taxonomic information included in the :doc:`taxid <taxid>`
+ attribute is used as taxonomic annotation.
+
+ .. seealso::
+
+ - :doc:`taxid <./taxid>`
+ - :doc:`scientific_name <./scientific_name>`
+ - :doc:`family <./family>`
+ - :doc:`family_name <./family_name>`
+ - :doc:`genus_name <./genus_name>`
+ - :doc:`order <./order>`
+ - :doc:`order_name <./order_name>`
+ - :doc:`species <./species>`
+ - :doc:`species_name <./species_name>`
+
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/genus_name.rst b/doc/sphinx/source/attributes/genus_name.rst
new file mode 100644
index 0000000..f9eb238
--- /dev/null
+++ b/doc/sphinx/source/attributes/genus_name.rst
@@ -0,0 +1,29 @@
+genus_name
+==========
+
+ A string value indicating the genus name of the :doc:`taxid <./taxid>` stored into the
+ :doc:`taxid <taxid>`. If the genus is not defined for this :doc:`taxid <./taxid>`,
+ this value is *None*.
+
+ .. warning:: This taxonomic information is just added to the sequence for the end-user
+ convenience and not used by other ``OBITools`` programs as taxonomic information.
+ Only the taxonomic information included in the :doc:`taxid <taxid>`
+ attribute is used as taxonomic annotation.
+
+ .. seealso::
+
+ - :doc:`taxid <./taxid>`
+ - :doc:`scientific_name <./scientific_name>`
+ - :doc:`family <./family>`
+ - :doc:`family_name <./family_name>`
+ - :doc:`genus <./genus>`
+ - :doc:`order <./order>`
+ - :doc:`order_name <./order_name>`
+ - :doc:`species <./species>`
+ - :doc:`species_name <./species_name>`
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
+
diff --git a/doc/sphinx/source/attributes/head_quality.rst b/doc/sphinx/source/attributes/head_quality.rst
new file mode 100644
index 0000000..0004c63
--- /dev/null
+++ b/doc/sphinx/source/attributes/head_quality.rst
@@ -0,0 +1,18 @@
+head_quality
+============
+
+ A float value indicating the average quality of the first 10 nucleotides of the raw sequence.
+
+ .. note::
+
+ This tag can be used to investigate why sequences have not been assigned to any sample by
+ :doc:`ngsfilter <../scripts/ngsfilter>`
+
+ .. seealso::
+
+ - :doc:`avg_quality <./avg_quality>`
+ - :doc:`mid_quality <./mid_quality>`
+ - :doc:`tail_quality <./tail_quality>`
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
diff --git a/doc/sphinx/source/attributes/id_status.rst b/doc/sphinx/source/attributes/id_status.rst
new file mode 100644
index 0000000..4d189a4
--- /dev/null
+++ b/doc/sphinx/source/attributes/id_status.rst
@@ -0,0 +1,8 @@
+id_status
+===========
+
+ A boolean indicating whether a sequence match above the minimum threshold score has been found in
+ the reference database.
+
+ Attribute added by the program:
+ - :doc:`ecotag <../scripts/ecotag>`
diff --git a/doc/sphinx/source/attributes/merged.rst b/doc/sphinx/source/attributes/merged.rst
new file mode 100644
index 0000000..8e4be0a
--- /dev/null
+++ b/doc/sphinx/source/attributes/merged.rst
@@ -0,0 +1,9 @@
+merged
+======
+
+ The `merged` key contains all *ids* of a group of sequences.
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiselect <../scripts/obiselect>`
+
diff --git a/doc/sphinx/source/attributes/merged_star.rst b/doc/sphinx/source/attributes/merged_star.rst
new file mode 100644
index 0000000..eeafe40
--- /dev/null
+++ b/doc/sphinx/source/attributes/merged_star.rst
@@ -0,0 +1,14 @@
+merged_*
+========
+
+ The `merged_*` attribute is built based on another attribute `*` (for example,
+ `sample`) by the :doc:`obiuniq <../scripts/obiuniq>` program. The value associated to the `merged_*`
+ attribute is a contingency table summarizing modality frequencies associated to the `*` attribute.
+ For instance, `merged_sample={'X1': 12, 'X2': 10}` means that among the 22 identical sequences merged
+ by :doc:`obiuniq <../scripts/obiuniq>`, the `sample` attribute was set 12 and 10 times to the modality 'X1'
+ and 'X2', respectively.
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiselect <../scripts/obiselect>`
+
diff --git a/doc/sphinx/source/attributes/mid_quality.rst b/doc/sphinx/source/attributes/mid_quality.rst
new file mode 100644
index 0000000..6d58fb1
--- /dev/null
+++ b/doc/sphinx/source/attributes/mid_quality.rst
@@ -0,0 +1,19 @@
+mid_quality
+===========
+
+ A float value indicating the average quality of the raw sequence except its first and
+ last 10 nucleotides.
+
+ .. note::
+
+ This tag can be used to investigate why sequences have not been assigned to any sample by
+ :doc:`ngsfilter <../scripts/ngsfilter>`
+
+ .. seealso::
+
+ - :doc:`avg_quality <./avg_quality>`
+ - :doc:`head_quality <./head_quality>`
+ - :doc:`tail_quality <./tail_quality>`
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
diff --git a/doc/sphinx/source/attributes/mode.rst b/doc/sphinx/source/attributes/mode.rst
new file mode 100644
index 0000000..b767e56
--- /dev/null
+++ b/doc/sphinx/source/attributes/mode.rst
@@ -0,0 +1,11 @@
+mode
+====
+
+ Either 'alignment' or 'joined'. Indicates whether the reported sequence is the consensus of the
+ aligned reads ('alignment') or just the concatenation of the two reads ('joined').
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
+
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/obiclean_cluster.rst b/doc/sphinx/source/attributes/obiclean_cluster.rst
new file mode 100644
index 0000000..a614a4a
--- /dev/null
+++ b/doc/sphinx/source/attributes/obiclean_cluster.rst
@@ -0,0 +1,11 @@
+obiclean_cluster
+================
+
+ An associative array indicating to which cluster each sequence is associated in each sample.
+
+ .. seealso::
+
+ :doc:`obiclean_count <./obiclean_count>`
+
+ Attribute added by the program:
+ - :doc:`obiclean <../scripts/obiclean>`
diff --git a/doc/sphinx/source/attributes/obiclean_count.rst b/doc/sphinx/source/attributes/obiclean_count.rst
new file mode 100644
index 0000000..3f9cabd
--- /dev/null
+++ b/doc/sphinx/source/attributes/obiclean_count.rst
@@ -0,0 +1,13 @@
+obiclean_count
+==============
+
+ This attribute is added by :doc:`obiclean <../scripts/obiclean>` only to the sequences corresponding to
+ a cluster centre. It is an associative array indicating for each sample the abundance of the cluster.
+
+ .. seealso::
+
+ :doc:`obiclean_cluster <./obiclean_cluster>`
+ :doc:`obiclean_head <./obiclean_head>`
+
+ Attribute added by the program:
+ - :doc:`obiclean <../scripts/obiclean>`
diff --git a/doc/sphinx/source/attributes/obiclean_head.rst b/doc/sphinx/source/attributes/obiclean_head.rst
new file mode 100644
index 0000000..b42b1b9
--- /dev/null
+++ b/doc/sphinx/source/attributes/obiclean_head.rst
@@ -0,0 +1,12 @@
+obiclean_head
+=============
+
+ A boolean value set to True if the sequence has the `head` status in at least one sample.
+
+ .. seealso::
+
+ :doc:`obiclean_cluster <./obiclean_cluster>`
+ :doc:`obiclean_count <./obiclean_count>`
+
+ Attribute added by the program:
+ - :doc:`obiclean <../scripts/obiclean>`
diff --git a/doc/sphinx/source/attributes/obiclean_headcount.rst b/doc/sphinx/source/attributes/obiclean_headcount.rst
new file mode 100644
index 0000000..cc1ab9f
--- /dev/null
+++ b/doc/sphinx/source/attributes/obiclean_headcount.rst
@@ -0,0 +1,14 @@
+obiclean_headcount
+==================
+
+ An integer value indicating the number of samples in which the sequence has the `head` status.
+
+ .. seealso::
+
+ :doc:`obiclean_singletoncount <./obiclean_singletoncount>`
+ :doc:`obiclean_internalcount <./obiclean_internalcount>`
+ :doc:`obiclean_samplecount <./obiclean_samplecount>`
+ :doc:`obiclean_status <./obiclean_status>`
+
+ Attribute added by the program:
+ - :doc:`obiclean <../scripts/obiclean>`
diff --git a/doc/sphinx/source/attributes/obiclean_internalcount.rst b/doc/sphinx/source/attributes/obiclean_internalcount.rst
new file mode 100644
index 0000000..04e6c12
--- /dev/null
+++ b/doc/sphinx/source/attributes/obiclean_internalcount.rst
@@ -0,0 +1,14 @@
+obiclean_internalcount
+=======================
+
+ An integer value indicating the number of samples in which the sequence has the `internal` status.
+
+ .. seealso::
+
+ :doc:`obiclean_headcount <./obiclean_headcount>`
+ :doc:`obiclean_singletoncount <./obiclean_singletoncount>`
+ :doc:`obiclean_samplecount <./obiclean_samplecount>`
+ :doc:`obiclean_status <./obiclean_status>`
+
+ Attribute added by the program:
+ - :doc:`obiclean <../scripts/obiclean>`
diff --git a/doc/sphinx/source/attributes/obiclean_samplecount.rst b/doc/sphinx/source/attributes/obiclean_samplecount.rst
new file mode 100644
index 0000000..6979b95
--- /dev/null
+++ b/doc/sphinx/source/attributes/obiclean_samplecount.rst
@@ -0,0 +1,13 @@
+obiclean_samplecount
+====================
+
+ An integer value indicating the number of samples in which the sequence is observed.
+
+ .. seealso::
+
+ :doc:`obiclean_headcount <./obiclean_headcount>`
+ :doc:`obiclean_internalcount <./obiclean_internalcount>`
+ :doc:`obiclean_singletoncount <./obiclean_singletoncount>`
+
+ Attribute added by the program:
+ - :doc:`obiclean <../scripts/obiclean>`
diff --git a/doc/sphinx/source/attributes/obiclean_singletoncount.rst b/doc/sphinx/source/attributes/obiclean_singletoncount.rst
new file mode 100644
index 0000000..f979887
--- /dev/null
+++ b/doc/sphinx/source/attributes/obiclean_singletoncount.rst
@@ -0,0 +1,14 @@
+obiclean_singletoncount
+=======================
+
+ An integer value indicating the number of samples in which the sequence has the `singleton` status.
+
+ .. seealso::
+
+ :doc:`obiclean_headcount <./obiclean_headcount>`
+ :doc:`obiclean_internalcount <./obiclean_internalcount>`
+ :doc:`obiclean_samplecount <./obiclean_samplecount>`
+ :doc:`obiclean_status <./obiclean_status>`
+
+ Attribute added by the program:
+ - :doc:`obiclean <../scripts/obiclean>`
diff --git a/doc/sphinx/source/attributes/obiclean_status.rst b/doc/sphinx/source/attributes/obiclean_status.rst
new file mode 100644
index 0000000..e66bd42
--- /dev/null
+++ b/doc/sphinx/source/attributes/obiclean_status.rst
@@ -0,0 +1,14 @@
+obiclean_status
+===============
+
+ An associative array storing the status of the sequence `h` (head), `i` (internal) or
+ `s` (singleton) in each sample.
+
+ .. seealso::
+
+ :doc:`obiclean_headcount <./obiclean_headcount>`
+ :doc:`obiclean_singletoncount <./obiclean_singletoncount>`
+ :doc:`obiclean_internalcount <./obiclean_internalcount>`
+
+ Attribute added by the program:
+ - :doc:`obiclean <../scripts/obiclean>`
diff --git a/doc/sphinx/source/attributes/occurrence.rst b/doc/sphinx/source/attributes/occurrence.rst
new file mode 100644
index 0000000..14cd1fe
--- /dev/null
+++ b/doc/sphinx/source/attributes/occurrence.rst
@@ -0,0 +1,9 @@
+occurrence
+==========
+
+ An integer value indicating the number of samples in which the sequence has
+ been observed at least once.
+
+ Attribute added by the program:
+ - :doc:`obiclean <../scripts/obiclean>`
+
diff --git a/doc/sphinx/source/attributes/order.rst b/doc/sphinx/source/attributes/order.rst
new file mode 100644
index 0000000..f5bb882
--- /dev/null
+++ b/doc/sphinx/source/attributes/order.rst
@@ -0,0 +1,30 @@
+order
+=====
+
+ An integer value corresponding to the order of the :doc:`taxid <./taxid>` stored into the
+ :doc:`taxid <taxid>` attribute. If the order is not defined for this :doc:`taxid <./taxid>`,
+ this value is *None*.
+
+ .. warning:: This taxonomic information is just added to the sequence for the end-user
+ convenience and not used by other ``OBITools`` programs as taxonomic information.
+ Only the taxonomic information included in the :doc:`taxid <taxid>`
+ attribute is used as taxonomic annotation.
+
+ .. seealso::
+
+ - :doc:`taxid <./taxid>`
+ - :doc:`scientific_name <./scientific_name>`
+ - :doc:`family <./family>`
+ - :doc:`family_name <./family_name>`
+ - :doc:`genus <./genus>`
+ - :doc:`genus_name <./genus_name>`
+ - :doc:`order_name <./order_name>`
+ - :doc:`species <./species>`
+ - :doc:`species_name <./species_name>`
+
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
+
diff --git a/doc/sphinx/source/attributes/order_name.rst b/doc/sphinx/source/attributes/order_name.rst
new file mode 100644
index 0000000..85493bc
--- /dev/null
+++ b/doc/sphinx/source/attributes/order_name.rst
@@ -0,0 +1,29 @@
+order_name
+==========
+
+ A string value indicating the order name of the :doc:`taxid <./taxid>` stored into the
+ :doc:`taxid <taxid>`. If the order is not defined for this :doc:`taxid <./taxid>`,
+ this value is *None*.
+
+ .. warning:: This taxonomic information is just added to the sequence for the end-user
+ convenience and not used by other ``OBITools`` programs as taxonomic information.
+ Only the taxonomic information included in the :doc:`taxid <taxid>`
+ attribute is used as taxonomic annotation.
+
+ .. seealso::
+
+ - :doc:`taxid <./taxid>`
+ - :doc:`scientific_name <./scientific_name>`
+ - :doc:`family <./family>`
+ - :doc:`family_name <./family_name>`
+ - :doc:`genus <./genus>`
+ - :doc:`genus_name <./genus_name>`
+ - :doc:`order <./order>`
+ - :doc:`species <./species>`
+ - :doc:`species_name <./species_name>`
+
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
diff --git a/doc/sphinx/source/attributes/pairend_limit.rst b/doc/sphinx/source/attributes/pairend_limit.rst
new file mode 100644
index 0000000..7829224
--- /dev/null
+++ b/doc/sphinx/source/attributes/pairend_limit.rst
@@ -0,0 +1,13 @@
+pairend_limit
+=============
+
+ Indicates, when the reported sequence is the concatenation of the two reads (mode='joined'),
+ the position in the reported sequence where the second read starts.
+
+ .. seealso::
+
+ - :doc:`mode <./mode>`
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
+
diff --git a/doc/sphinx/source/attributes/partial.rst b/doc/sphinx/source/attributes/partial.rst
new file mode 100644
index 0000000..069ceeb
--- /dev/null
+++ b/doc/sphinx/source/attributes/partial.rst
@@ -0,0 +1,11 @@
+partial
+=======
+
+ A boolean value indicating whether both sample tags and both primers have
+ been identified on both extremities of the sequence, more exactly whether we expect
+ the sequence to be partially sequenced (**partial=True;**) or completely sequenced
+ (**partial=False;**).
+
+ Attribute added by the program:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
diff --git a/doc/sphinx/source/attributes/rank.rst b/doc/sphinx/source/attributes/rank.rst
new file mode 100644
index 0000000..3ee8221
--- /dev/null
+++ b/doc/sphinx/source/attributes/rank.rst
@@ -0,0 +1,15 @@
+rank
+====
+
+ A string value corresponding to the lowest taxonomic rank of :doc:`taxid <./taxid>` stored
+ into the :doc:`taxid <taxid>` attribute.
+
+ .. warning:: This taxonomic information is just added to the sequence for the end-user
+ convenience and not used by other ``obitools`` programs as taxonomic information.
+ Only the taxonomic information included in the :doc:`taxid <../attributes/taxid>`
+ attribute is used as taxonomic annotation.
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
diff --git a/doc/sphinx/source/attributes/reverse_error.rst b/doc/sphinx/source/attributes/reverse_error.rst
new file mode 100644
index 0000000..7efc2e5
--- /dev/null
+++ b/doc/sphinx/source/attributes/reverse_error.rst
@@ -0,0 +1,13 @@
+reverse_error
+=============
+
+ An integer value indicating the number of mismatches between the reverse
+ primer and its match on the sequence under consideration.
+
+ .. seealso::
+
+ - :doc:`forward_error <./forward_error>`
+
+ Attribute added by the programs:
+ - :doc:`obiconvert <../scripts/obiconvert>`
+
diff --git a/doc/sphinx/source/attributes/reverse_match.rst b/doc/sphinx/source/attributes/reverse_match.rst
new file mode 100644
index 0000000..10154d4
--- /dev/null
+++ b/doc/sphinx/source/attributes/reverse_match.rst
@@ -0,0 +1,15 @@
+reverse_match
+=============
+
+ A string value corresponding to the reverse primer match used to identify
+ the sequence.
+
+ .. seealso::
+
+ - :doc:`forward_match <./forward_match>`
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+ - :doc:`obiconvert <../scripts/obiconvert>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/reverse_primer.rst b/doc/sphinx/source/attributes/reverse_primer.rst
new file mode 100644
index 0000000..962eae2
--- /dev/null
+++ b/doc/sphinx/source/attributes/reverse_primer.rst
@@ -0,0 +1,13 @@
+reverse_primer
+==============
+
+ A string value indicating the reverse primer used to obtain the sequence.
+
+ .. seealso::
+
+ - :doc:`forward_primer <./forward_primer>`
+
+ Attribute added by the program:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/reverse_score.rst b/doc/sphinx/source/attributes/reverse_score.rst
new file mode 100644
index 0000000..6dd9b62
--- /dev/null
+++ b/doc/sphinx/source/attributes/reverse_score.rst
@@ -0,0 +1,11 @@
+reverse_score
+=============
+
+ A real value indicating the score of the alignment of the 3' primer against the sequence.
+
+ .. seealso::
+
+ - :doc:`forward_score <./forward_score>`
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
diff --git a/doc/sphinx/source/attributes/reverse_tag.rst b/doc/sphinx/source/attributes/reverse_tag.rst
new file mode 100644
index 0000000..aec33b6
--- /dev/null
+++ b/doc/sphinx/source/attributes/reverse_tag.rst
@@ -0,0 +1,13 @@
+reverse_tag
+===========
+
+ A string value corresponding to the individual tag attached in 5' of the
+ reverse primer and used to assign the sequence to a sample.
+
+ .. seealso::
+
+ - :doc:`forward_tag <./forward_tag>`
+
+ Attribute added by the program:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
diff --git a/doc/sphinx/source/attributes/reverse_tm.rst b/doc/sphinx/source/attributes/reverse_tm.rst
new file mode 100644
index 0000000..3292690
--- /dev/null
+++ b/doc/sphinx/source/attributes/reverse_tm.rst
@@ -0,0 +1,13 @@
+reverse_tm
+==========
+
+ A float value indicating the *Tm* of the reverse primer match on the
+ sequence under consideration.
+
+ .. seealso::
+
+ - :doc:`forward_tm <./forward_tm>`
+
+ Attribute added by the program:
+ - :doc:`obiconvert <../scripts/obiconvert>`
+
diff --git a/doc/sphinx/source/attributes/sample.rst b/doc/sphinx/source/attributes/sample.rst
new file mode 100644
index 0000000..11f50c8
--- /dev/null
+++ b/doc/sphinx/source/attributes/sample.rst
@@ -0,0 +1,10 @@
+sample
+======
+
+ A string value indicating the name of the sample the sequence belongs to.
+ This name is mentioned in the second column of the :doc:`ngsfilter <../scripts/ngsfilter>`
+ samples description file.
+
+ Attribute added by the program:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
diff --git a/doc/sphinx/source/attributes/scientific_name.rst b/doc/sphinx/source/attributes/scientific_name.rst
new file mode 100644
index 0000000..53fd77a
--- /dev/null
+++ b/doc/sphinx/source/attributes/scientific_name.rst
@@ -0,0 +1,28 @@
+scientific_name
+===============
+
+ A string value indicating the scientific name corresponding to the :doc:`taxid <./taxid>` stored
+ into the :doc:`taxid <taxid>` attribute.
+
+ .. warning:: This taxonomic information is just added to the sequence for the end-user
+ convenience and not used by other ``OBITools`` programs as taxonomic information.
+ Only the taxonomic information included in the :doc:`taxid <../attributes/taxid>`
+ attribute is used as taxonomic annotation.
+
+ .. seealso::
+
+ - :doc:`taxid <./taxid>`
+ - :doc:`family <./family>`
+ - :doc:`family_name <./family_name>`
+ - :doc:`genus <./genus>`
+ - :doc:`genus_name <./genus_name>`
+ - :doc:`order <./order>`
+ - :doc:`order_name <./order_name>`
+ - :doc:`species <./species>`
+ - :doc:`species_name <./species_name>`
+
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
diff --git a/doc/sphinx/source/attributes/score.rst b/doc/sphinx/source/attributes/score.rst
new file mode 100644
index 0000000..ffdd488
--- /dev/null
+++ b/doc/sphinx/source/attributes/score.rst
@@ -0,0 +1,8 @@
+score
+=====
+
+ A real value computed based on the alignment of two paired-end reads.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
+
diff --git a/doc/sphinx/source/attributes/score_norm.rst b/doc/sphinx/source/attributes/score_norm.rst
new file mode 100644
index 0000000..3be2b9c
--- /dev/null
+++ b/doc/sphinx/source/attributes/score_norm.rst
@@ -0,0 +1,8 @@
+score_norm
+==========
+
+ A real value computed based on the alignment score divided by the alignment length.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
+
diff --git a/doc/sphinx/source/attributes/select.rst b/doc/sphinx/source/attributes/select.rst
new file mode 100644
index 0000000..f18fac3
--- /dev/null
+++ b/doc/sphinx/source/attributes/select.rst
@@ -0,0 +1,11 @@
+select
+=======
+
+ The value evaluated for this sequence record.
+
+ .. seealso::
+
+ - :doc:`distance <./distance>`
+
+ Attribute added by the programs:
+ - :doc:`obiselect <../scripts/obiselect>`
diff --git a/doc/sphinx/source/attributes/seq_a_deletion.rst b/doc/sphinx/source/attributes/seq_a_deletion.rst
new file mode 100644
index 0000000..b8e334e
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_a_deletion.rst
@@ -0,0 +1,8 @@
+seq_a_deletion
+==============
+
+ Integer value indicating the number of deletions between the first
+ read and the consensus sequence in the aligned part.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/seq_a_insertion.rst b/doc/sphinx/source/attributes/seq_a_insertion.rst
new file mode 100644
index 0000000..9864fe7
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_a_insertion.rst
@@ -0,0 +1,8 @@
+seq_a_insertion
+===============
+
+ Integer value indicating the number of insertions between the first
+ read and the consensus sequence in the aligned part.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/seq_a_mismatch.rst b/doc/sphinx/source/attributes/seq_a_mismatch.rst
new file mode 100644
index 0000000..e230750
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_a_mismatch.rst
@@ -0,0 +1,8 @@
+seq_a_mismatch
+==============
+
+ Integer value indicating the number of mismatches between the first
+ read and the consensus sequence in the aligned part.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/seq_a_single.rst b/doc/sphinx/source/attributes/seq_a_single.rst
new file mode 100644
index 0000000..add9b42
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_a_single.rst
@@ -0,0 +1,9 @@
+seq_a_single
+============
+
+ Integer value indicating the number of nucleotides of the first read
+ that belong to the consensus sequence and were not aligned with the
+ second read.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/seq_ab_match.rst b/doc/sphinx/source/attributes/seq_ab_match.rst
new file mode 100644
index 0000000..63db8b9
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_ab_match.rst
@@ -0,0 +1,7 @@
+seq_ab_match
+============
+
+ Integer value indicating the number of matches in the aligned part.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/seq_b_deletion.rst b/doc/sphinx/source/attributes/seq_b_deletion.rst
new file mode 100644
index 0000000..ae48c14
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_b_deletion.rst
@@ -0,0 +1,8 @@
+seq_b_deletion
+==============
+
+ Integer value indicating the number of deletions between the second
+ read and the consensus sequence in the aligned part.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/seq_b_insertion.rst b/doc/sphinx/source/attributes/seq_b_insertion.rst
new file mode 100644
index 0000000..1db9bec
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_b_insertion.rst
@@ -0,0 +1,8 @@
+seq_b_insertion
+===============
+
+ Integer value indicating the number of insertions between the second
+ read and the consensus sequence in the aligned part.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/seq_b_mismatch.rst b/doc/sphinx/source/attributes/seq_b_mismatch.rst
new file mode 100644
index 0000000..8c3f8ff
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_b_mismatch.rst
@@ -0,0 +1,8 @@
+seq_b_mismatch
+==============
+
+ Integer value indicating the number of mismatches between the second
+ read and the consensus sequence in the aligned part.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/seq_b_single.rst b/doc/sphinx/source/attributes/seq_b_single.rst
new file mode 100644
index 0000000..4a4d5f9
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_b_single.rst
@@ -0,0 +1,9 @@
+seq_b_single
+============
+
+ Integer value indicating the number of nucleotides of the second read
+ that belong to the consensus sequence and were not aligned with the
+ first read.
+
+ Attribute added by the program:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/seq_length.rst b/doc/sphinx/source/attributes/seq_length.rst
new file mode 100644
index 0000000..76c3ffa
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_length.rst
@@ -0,0 +1,10 @@
+seq_length
+==========
+
+ An integer value indicating the length of the sequence.
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/seq_length_ori.rst b/doc/sphinx/source/attributes/seq_length_ori.rst
new file mode 100644
index 0000000..57fdbec
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_length_ori.rst
@@ -0,0 +1,9 @@
+seq_length_ori
+==============
+
+ An integer value indicating the length of the sequence before tag and primer
+ removal.
+
+ Attribute added by the program:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
diff --git a/doc/sphinx/source/attributes/seq_rank.rst b/doc/sphinx/source/attributes/seq_rank.rst
new file mode 100644
index 0000000..c35933c
--- /dev/null
+++ b/doc/sphinx/source/attributes/seq_rank.rst
@@ -0,0 +1,9 @@
+seq_rank
+========
+
+ An integer value indicating the rank of the sequence in the file.
+
+ Attribute added by the programs:
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/sminL.rst b/doc/sphinx/source/attributes/sminL.rst
new file mode 100644
index 0000000..b7e6616
--- /dev/null
+++ b/doc/sphinx/source/attributes/sminL.rst
@@ -0,0 +1,8 @@
+sminL
+=====
+
+ A real value corresponding to the minimum score (specified with the ``--score-min`` option) above
+ which ``left`` alignments are discarded.
+
+ Attribute added by the programs:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/sminR.rst b/doc/sphinx/source/attributes/sminR.rst
new file mode 100644
index 0000000..775b45c
--- /dev/null
+++ b/doc/sphinx/source/attributes/sminR.rst
@@ -0,0 +1,8 @@
+sminR
+=====
+
+ A real value corresponding to the minimum score (specified with the ``--score-min`` option) above
+ which ``right`` alignments are discarded.
+
+ Attribute added by the programs:
+ - :doc:`illuminapairedend <../scripts/illuminapairedend>`
diff --git a/doc/sphinx/source/attributes/species.rst b/doc/sphinx/source/attributes/species.rst
new file mode 100644
index 0000000..3cea8bd
--- /dev/null
+++ b/doc/sphinx/source/attributes/species.rst
@@ -0,0 +1,32 @@
+species
+=======
+
+ An integer value corresponding to the species of the :doc:`taxid <./taxid>` stored into the
+ :doc:`taxid <taxid>` attribute. If the species is not defined for this :doc:`taxid <./taxid>`,
+ this value is *None*.
+
+ .. warning:: This taxonomic information is just added to the sequence for the end-user
+ convenience and not used by other ``OBITools`` programs as taxonomic information.
+ Only the taxonomic information included in the :doc:`taxid <taxid>`
+ attribute is used as taxonomic annotation.
+
+ .. seealso::
+
+ - :doc:`taxid <./taxid>`
+ - :doc:`scientific_name <./scientific_name>`
+ - :doc:`family <./family>`
+ - :doc:`family_name <./family_name>`
+ - :doc:`genus <./genus>`
+ - :doc:`genus_name <./genus_name>`
+ - :doc:`order <./order>`
+ - :doc:`order_name <./order_name>`
+ - :doc:`species_name <./species_name>`
+
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
+
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/species_list.rst b/doc/sphinx/source/attributes/species_list.rst
new file mode 100644
index 0000000..35f22f3
--- /dev/null
+++ b/doc/sphinx/source/attributes/species_list.rst
@@ -0,0 +1,12 @@
+species_list
+============
+
+ A list of strings corresponding to the species scientific names which are under the
+ assigned :doc:`taxid <./taxid>` (when the list becomes too long, the list is empty).
+
+ .. warning::
+
+ This list should not be used for assignment purposes.
+
+ Attribute added by the program:
+ - :doc:`ecotag <../scripts/ecotag>`
diff --git a/doc/sphinx/source/attributes/species_name.rst b/doc/sphinx/source/attributes/species_name.rst
new file mode 100644
index 0000000..3e442d8
--- /dev/null
+++ b/doc/sphinx/source/attributes/species_name.rst
@@ -0,0 +1,31 @@
+species_name
+============
+
+ A string value indicating the species scientific name of the :doc:`taxid <./taxid>` stored
+ into the :doc:`taxid <taxid>` attribute. If the species is not defined for this :doc:`taxid <./taxid>`,
+ this value is *None*.
+
+ .. warning:: This taxonomic information is just added to the sequence for the end-user
+ convenience and not used by other ``OBITools`` programs as taxonomic information.
+ Only the taxonomic information included in the :doc:`taxid <taxid>`
+ attribute is used as taxonomic annotation.
+
+ .. seealso::
+
+ - :doc:`taxid <./taxid>`
+ - :doc:`scientific_name <./scientific_name>`
+ - :doc:`family <./family>`
+ - :doc:`family_name <./family_name>`
+ - :doc:`genus <./genus>`
+ - :doc:`genus_name <./genus_name>`
+ - :doc:`order <./order>`
+ - :doc:`order_name <./order_name>`
+ - :doc:`species <./species>`
+
+
+ Attribute added by the programs:
+ - :doc:`obiuniq <../scripts/obiuniq>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/status.rst b/doc/sphinx/source/attributes/status.rst
new file mode 100644
index 0000000..add86ac
--- /dev/null
+++ b/doc/sphinx/source/attributes/status.rst
@@ -0,0 +1,10 @@
+status
+======
+
+ Either *full* if the amplicon has been sequenced entirely, or *partial* if not.
+
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
+
+
diff --git a/doc/sphinx/source/attributes/strand.rst b/doc/sphinx/source/attributes/strand.rst
new file mode 100644
index 0000000..8831fd4
--- /dev/null
+++ b/doc/sphinx/source/attributes/strand.rst
@@ -0,0 +1,10 @@
+strand
+======
+
+ A string value indicating whether the sequence was amplified directly from
+ the reference (strand=D) or from its reverse complement (strand=R).
+
+ Attribute added by the programs:
+ - :doc:`obiconvert <../scripts/obiconvert>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/attributes/tail_quality.rst b/doc/sphinx/source/attributes/tail_quality.rst
new file mode 100644
index 0000000..4cc944c
--- /dev/null
+++ b/doc/sphinx/source/attributes/tail_quality.rst
@@ -0,0 +1,18 @@
+tail_quality
+============
+
+ A float value indicating the average quality of the 10 last nucleotides of the barcode.
+
+ .. note::
+
+ This tag can be used to investigate why sequences have not been assigned to any sample by
+ :doc:`ngsfilter <../scripts/ngsfilter>`
+
+ .. seealso::
+
+ - :doc:`avg_quality <./avg_quality>`
+ - :doc:`head_quality <./head_quality>`
+ - :doc:`mid_quality <./mid_quality>`
+
+ Attribute added by the programs:
+ - :doc:`ngsfilter <../scripts/ngsfilter>`
diff --git a/doc/sphinx/source/attributes/taxid.rst b/doc/sphinx/source/attributes/taxid.rst
new file mode 100644
index 0000000..067d907
--- /dev/null
+++ b/doc/sphinx/source/attributes/taxid.rst
@@ -0,0 +1,30 @@
+taxid
+=====
+
+ An integer referring unambiguously to one taxon in the associated taxonomic database.
+
+ Attribute added by the programs:
+ - :doc:`ecotag <../scripts/ecotag>`
+ - :doc:`ecopcr <../scripts/ecoPCR>`
+ - :doc:`obiaddtaxids <../scripts/obiaddtaxids>`
+
+ .. seealso::
+
+ - :doc:`scientific_name <./scientific_name>`
+ - :doc:`family <./family>`
+ - :doc:`family_name <./family_name>`
+ - :doc:`genus <./genus>`
+ - :doc:`genus_name <./genus_name>`
+ - :doc:`order <./order>`
+ - :doc:`order_name <./order_name>`
+ - :doc:`species <./species>`
+ - :doc:`species_name <./species_name>`
+
+
+ Attribute used by the programs:
+ - :doc:`obiselect <../scripts/obiselect>`
+ - :doc:`obiannotate <../scripts/obiannotate>`
+ - :doc:`ecodbtaxstat <../scripts/ecodbtaxstat>`
+ - :doc:`ecotaxspecificity <../scripts/ecotaxspecificity>`
+ - :doc:`obiuniq <../scripts/obiuniq>`
+
\ No newline at end of file
diff --git a/doc/sphinx/source/barcodes.rst b/doc/sphinx/source/barcodes.rst
new file mode 100644
index 0000000..eacd41b
--- /dev/null
+++ b/doc/sphinx/source/barcodes.rst
@@ -0,0 +1,11 @@
+Metabarcode design and quality assessment
+=========================================
+
+.. toctree::
+ :maxdepth: 2
+
+ scripts/ecoPCR
+ scripts/ecoPrimers
+ scripts/ecotaxstat
+ scripts/ecotaxspecificity
+
diff --git a/doc/sphinx/source/conf.py b/doc/sphinx/source/conf.py
new file mode 100644
index 0000000..5afacf8
--- /dev/null
+++ b/doc/sphinx/source/conf.py
@@ -0,0 +1,262 @@
+# -*- coding: utf-8 -*-
+#
+# OBITools documentation build configuration file, created by
+# sphinx-quickstart on Tue Dec 8 21:30:02 2009.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+import glob
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.append(os.path.abspath('.'))
+
+build_dir=open("../build_dir.txt").readlines()[0].strip()  # first line of ../build_dir.txt, whitespace-stripped; NOTE(review): the file object is never explicitly closed
+sys.path.insert(0, os.path.abspath('../../../%s' % build_dir))  # build_dir is resolved three levels above the current directory
+sys.path.insert(0, os.path.abspath('../../../build/raw_scripts'))
+sys.path.insert(0, os.path.abspath('../sphinxext'))  # inserted last at index 0, so this entry ends up first on sys.path
+
+import obitools
+import obitools.version
+
+# Add any Sphinx extension module names here, as strings. They can
+# be extensions coming with Sphinx (named 'sphinx.ext.*') or your
+# custom ones.
+
+# -- General configuration -----------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo',
+ 'sphinx.ext.coverage', 'sphinx.ext.viewcode',
+ 'sphinx.ext.graphviz', 'sphinx.ext.inheritance_diagram',
+ 'sphinx.ext.pngmath',
+# 'matplotlib.sphinxext.mathmpl',
+# 'matplotlib.sphinxext.only_directives',
+# 'matplotlib.sphinxext.plot_directive',
+# 'matplotlib.sphinxext.ipython_directive',
+ 'sphinx.ext.doctest',
+ 'ipython_console_highlighting',
+ 'numpydoc']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8'
+source_encoding = 'latin1'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'OBITools'
+copyright = u'2009 - 2015, OBITool Development Team'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = obitools.version.version
+# The full version, including alpha/beta/rc tags.
+release = obitools.version.version
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of documents that shouldn't be included in the build.
+#unused_docs = []
+
+# List of directories, relative to source directory, that shouldn't be searched
+# for source files.
+exclude_trees = []
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = ['obitools.']
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'nature'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_use_modindex = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = ''
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'OBIToolsdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {'papersize' : 'a4paper',
+ 'pointsize' : '11pts'}  # NOTE(review): Sphinx documents 'pointsize' values as '10pt'/'11pt'/'12pt'; '11pts' may be a typo — confirm
+
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+ ('index', 'OBITools.tex', u'OBITools Documentation',
+ u'OBITools Development Team', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+latex_logo = 'OBITools.png'
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+latex_use_parts = True
+
+# Additional stuff for the LaTeX preamble.
+#latex_preamble = ''
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_use_modindex = True
+
+autoclass_content="both"
+
+inheritance_graph_attrs = dict(rankdir="TB", size='"5.0, 6.0"',
+ fontsize=12, ratio='compress')
+
+inheritance_node_attrs = dict(fontsize=12, height=0.5,
+ color='dodgerblue1', style='filled')
+
+
+#
+# Subclass the latex formatter to reduce font size of code examples
+#
+
+from sphinx.highlighting import PygmentsBridge
+from pygments.formatters.latex import LatexFormatter
+
+class CustomLatexFormatter(LatexFormatter):  # LatexFormatter subclass that sets its own verboptions on every instance
+ def __init__(self, **options):  # keyword options are forwarded unchanged to the base initializer
+ super(CustomLatexFormatter, self).__init__(**options)
+ self.verboptions = r"formatcom=\footnotesize"  # assigned after the base initializer, so this is the value left on the instance
+
+PygmentsBridge.latex_formatter = CustomLatexFormatter  # rebinds the latex_formatter attribute on PygmentsBridge to the subclass above
+
+
+#--option for epub format ----------------------------------
+
+
+epub_publisher=u"metabarcoding.org"
+epub_author=u"OBITools Development Team"
+
+#--options for the man format -------------------------------
+
+
+man_pages = []  # filled by the loop below with one 5-tuple per matched file
+
+for f in glob.glob('scripts/*.rst'):  # relative pattern, so it is resolved against the current working directory
+ man_pages.append((
+ f[:-4], # source file (no extension): drops the last 4 characters, i.e. '.rst'
+ os.path.split(f[:-4])[1], # output file (under output dir): base name without its directory part
+ 'description of %s' % os.path.split(f[:-4])[1], # description
+ 'The OBITools Development Team - LECA', # author
+ 1, # section
+ ))
+
+
diff --git a/doc/sphinx/source/conversions.rst b/doc/sphinx/source/conversions.rst
new file mode 100644
index 0000000..14141a1
--- /dev/null
+++ b/doc/sphinx/source/conversions.rst
@@ -0,0 +1,11 @@
+File format conversions
+=======================
+
+.. toctree::
+ :maxdepth: 2
+
+ scripts/obiconvert
+ scripts/obipr2
+ scripts/obisilva
+ scripts/obitaxonomy
+ scripts/obitab
\ No newline at end of file
diff --git a/doc/sphinx/source/embl.rst b/doc/sphinx/source/embl.rst
new file mode 100644
index 0000000..baf3910
--- /dev/null
+++ b/doc/sphinx/source/embl.rst
@@ -0,0 +1,2 @@
+The EMBL sequence format
+========================
\ No newline at end of file
diff --git a/doc/sphinx/source/fasta.rst b/doc/sphinx/source/fasta.rst
new file mode 100644
index 0000000..f17de3b
--- /dev/null
+++ b/doc/sphinx/source/fasta.rst
@@ -0,0 +1,47 @@
+The *fasta* format
+==================
+
+.. _classical-fasta:
+
+
+The *fasta* format is certainly the most widely used sequence file format.
+This is certainly due to its great simplicity. It was originally created
+for the Lipman and Pearson `FASTA program`_. In addition to
+the classical :ref:`fasta <classical-fasta>` format, OBITools use an
+:ref:`extended version <obitools-fasta>` of this format where structured
+data are included in the title line.
+
+In *fasta* format a sequence is represented by a title line beginning with a **>** character and
+the sequence itself following the :doc:`iupac <iupac>` code. The sequence is usually split over
+several lines of the same length (except for the last one) ::
+
+
+ >my_sequence this is my pretty sequence
+ ACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGT
+ GTGCTGACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTGTTT
+ AACGACGTTGCAGTACGTTGCAGT
+
+
+There is no special format for the title line, except that this line should be unique.
+Usually the first word following the **>** character is considered as the sequence identifier.
+The rest of the title line corresponds to a description of the sequence.
+
+Several sequences can be concatenated in the same file. The description of the next sequence
+is just pasted at the end of the record of the previous one ::
+
+
+ >sequence_A this is my first pretty sequence
+ ACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGT
+ GTGCTGACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTGTTT
+ AACGACGTTGCAGTACGTTGCAGT
+ >sequence_B this is my second pretty sequence
+ ACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGT
+ GTGCTGACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTGTTT
+ AACGACGTTGCAGTACGTTGCAGT
+ >sequence_C this is my third pretty sequence
+ ACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGT
+ GTGCTGACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTACGTTGCAGTGTTT
+ AACGACGTTGCAGTACGTTGCAGT
+
+
+.. _`FASTA program`: http://www.ncbi.nlm.nih.gov/pubmed/3162770?dopt=Citation
\ No newline at end of file
diff --git a/doc/sphinx/source/fastq.rst b/doc/sphinx/source/fastq.rst
new file mode 100644
index 0000000..a4f974a
--- /dev/null
+++ b/doc/sphinx/source/fastq.rst
@@ -0,0 +1,163 @@
+The *fastq* sequence format
+===========================
+
+.. _classical-fastq:
+
+.. note::
+
+ This article uses material from the Wikipedia article
+ `FASTQ format <http://en.wikipedia.org/wiki/FASTQ_format>`_
+ which is released under the
+ `Creative Commons Attribution-Share-Alike License 3.0 <http://creativecommons.org/licenses/by-sa/3.0/>`_
+
+**fastq format** is a text-based format for storing both a biological sequence
+(usually nucleotide sequence) and its corresponding quality scores.
+Both the sequence letter and quality score are encoded with a single
+ASCII character for brevity. It was originally developed at the `Wellcome Trust Sanger
+Institute` to bundle a
+:ref:`fasta <classical-fasta>` sequence and its quality data, but has recently
+become the *de facto* standard for storing the output of high throughput
+sequencing instruments such as the Illumina Genome
+Analyzer. [1]_
+
+Format
+------
+
+A fastq file normally uses four lines per sequence.
+
+- Line 1 begins with a '@' character and is followed by a sequence
+ identifier and an *optional* description (like a
+ :ref:`fasta <classical-fasta>` title line).
+- Line 2 is the raw sequence letters.
+- Line 3 begins with a '+' character and is *optionally* followed by
+ the same sequence identifier (and any description) again.
+- Line 4 encodes the quality values for the sequence in Line 2, and
+ must contain the same number of symbols as letters in the sequence.
+
+A fastq file containing a single sequence might look like this:
+
+::
+
+ @SEQ_ID
+ GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+ +
+ !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
+
+The character '!' represents the lowest quality while '~' is the
+highest. Here are the quality value characters in left-to-right
+increasing order of quality (`ASCII`):
+
+::
+
+ !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+
+The original Sanger FASTQ files also allowed the sequence and quality
+strings to be wrapped (split over multiple lines), but this is generally
+discouraged as it can make parsing complicated due to the unfortunate
+choice of "@" and "+" as markers (these characters can also occur in the
+quality string).
+
+
+Variations
+----------
+
+Quality
+~~~~~~~
+
+A quality value *Q* is an integer mapping of *p* (i.e., the probability
+that the corresponding base call is incorrect). Two different equations
+have been in use. The first is the standard Sanger variant to assess
+reliability of a base call, otherwise known as Phred quality
+score:
+
+:math:`Q_\text{sanger} = -10 \, \log_{10} p`
+
+The Solexa pipeline (i.e., the software delivered with the Illumina
+Genome Analyzer) earlier used a different mapping, encoding the
+odds *p*/(1-*p*) instead of the probability *p*:
+
+:math:`Q_\text{solexa-prior to v.1.3} = -10 \, \log_{10} \frac{p}{1-p}`
+
+Although both mappings are asymptotically identical at higher quality
+values, they differ at lower quality levels (i.e., approximately *p* >
+0.05, or equivalently, *Q* < 13).
+
+|Relationship between *Q* and *p* using the Sanger (red) and Solexa
+(black) equations (described above). The vertical dotted line indicates
+*p* = 0.05, or equivalently, *Q* � 13.|
+
+
+Encoding
+~~~~~~~~
+
+- Sanger format can encode a Phred quality
+ score from 0 to 93 using ASCII 33 to 126
+ (although in raw read data the Phred quality score rarely exceeds 60,
+ higher scores are possible in assemblies or read maps).
+- Solexa/Illumina 1.0 format can encode a Solexa/Illumina quality score
+ from -5 to 62 using ASCII 59 to 126 (although in raw read
+ data Solexa scores from -5 to 40 only are expected)
+- Starting with Illumina 1.3 and before Illumina 1.8, the format
+ encoded a Phred quality score from 0 to 62
+ using ASCII 64 to 126 (although in raw read data Phred
+ scores from 0 to 40 only are expected).
+- Starting in Illumina 1.5 and before Illumina 1.8, the Phred scores 0
+ to 2 have a slightly different meaning. The values 0 and 1 are no
+ longer used, and the value 2, encoded by ASCII 66 "B", has the special meaning described below.
+
+Sequencing Control Software, Version 2.6, Catalog # SY-960-2601, Part #
+15009921 Rev. A, November 2009
+(http://watson.nci.nih.gov/solexa/Using_SCSv2.6_15009921_A.pdf,
+page 30) states the following: *If a read ends with a segment of mostly
+low quality (Q15 or below), then all of the quality values in the
+segment are replaced with a value of 2 (encoded as the letter B in
+Illumina's text-based encoding of quality scores)... This Q2 indicator
+does not predict a specific error rate, but rather indicates that a
+specific final portion of the read should not be used in further
+analyses.* Also, the quality score encoded as "B" letter may occur
+internally within reads at least as late as pipeline version 1.6, as
+shown in the following example:
+
+::
+
+ @HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1
+ TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCTTGAGATTTGTTGGGGGAGACATTTTTGTGATTGCCTTGAT
+ +HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1
+ efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\]][]dddd`ddd^dddadd^BBBBBBBBBBBBBBBBBBBBBBBB
+
+An alternative interpretation of this ASCII encoding has been
+proposed. Also, in Illumina runs using PhiX controls, the character
+'B' was observed to represent an "unknown quality score". The error rate
+of 'B' reads was roughly 3 phred scores lower than the mean observed score of
+a given run.
+
+- Starting in Illumina 1.8, the quality scores have basically returned
+ to the use of the Sanger format (Phred+33).
+
+File extension
+--------------
+
+There is no standard file extension for a FASTQ
+file, but .fq and .fastq are commonly used.
+
+See also
+--------
+
+- :ref:`fasta <classical-fasta>`
+
+References
+----------
+
+.. [1]
+ Cock et al (2009) The Sanger FASTQ file format for sequences with
+ quality scores, and the Solexa/Illumina FASTQ variants. Nucleic Acids
+ Research,
+
+.. [2]
+ Illumina Quality Scores, Tobias Mann, Bioinformatics, San Diego,
+ Illumina `1 <http://seqanswers.com/forums/showthread.php?t=4721>`__
+
+.. |Relationship between *Q* and *p* using the Sanger (red) and Solexa (black) equations (described above). The vertical dotted line indicates *p* = 0.05, or equivalently, *Q* � 13.| image:: Probability metrics.png
+
+See http://en.wikipedia.org/wiki/FASTQ_format
+
diff --git a/doc/sphinx/source/filtering.rst b/doc/sphinx/source/filtering.rst
new file mode 100644
index 0000000..6d9e0a4
--- /dev/null
+++ b/doc/sphinx/source/filtering.rst
@@ -0,0 +1,14 @@
+Sequence sampling and filtering
+===============================
+
+.. toctree::
+ :maxdepth: 2
+
+ scripts/obiextract
+ scripts/obigrep
+ scripts/obihead
+ scripts/obisample
+ scripts/obiselect
+ scripts/obisplit
+ scripts/obisubset
+ scripts/obitail
diff --git a/doc/sphinx/source/formats.rst b/doc/sphinx/source/formats.rst
new file mode 100644
index 0000000..f734d1c
--- /dev/null
+++ b/doc/sphinx/source/formats.rst
@@ -0,0 +1,59 @@
+File formats usable with OBITools
+=================================
+
+.. _the-sequence-files:
+
+The sequence files
+------------------
+
+Sequences can be stored in various formats. OBITools knows
+some of them. The central format for sequence files manipulated by OBITools scripts
+is the :doc:`fasta format <fasta>`. OBITools extends the fasta format by specifying
+a syntax to include in the definition line data qualifying the sequence.
+All file formats use the :doc:`IUPAC <iupac>` code for encoding nucleotides and
+amino-acids.
+
+.. toctree::
+ :maxdepth: 2
+
+ iupac
+ fasta
+ fastq
+ attributes
+
+..
+ genbank
+ embl
+
+
+The taxonomy files
+------------------
+
+Many OBITools are able to take into account taxonomic data. This is done in general by specifying
+either a directory containing all :doc:`NCBI taxonomy dump files <./taxdump>` or an
+:doc:`obitaxonomy <./obitaxonomy>` formatted database.
+
+.. toctree::
+ :maxdepth: 2
+
+ taxdump
+ obitaxonomy
+
+..
+ The ecoPCR files
+ ----------------
+
+ ecoPCR_ simulates a PCR experiment by selecting in a sequence database, sequences matching
+ simultaneously two primers sequences in a way allowing a PCR amplification of a DNA region.
+
+ The ecoPrimers files
+ --------------------
+
+
+ The OBITools files
+ ------------------
+
+
+.. _ecoPCR: http://www.grenoble.prabi.fr/trac/ecoPCR
+.. _LECA: http://www-leca.ujf-grenoble.fr
+.. _`NCBI taxonomy`: http://www.ncbi.nlm.nih.gov/taxonomy
\ No newline at end of file
diff --git a/doc/sphinx/source/genbank.rst b/doc/sphinx/source/genbank.rst
new file mode 100644
index 0000000..fe64560
--- /dev/null
+++ b/doc/sphinx/source/genbank.rst
@@ -0,0 +1,2 @@
+The genbank sequence format
+===========================
\ No newline at end of file
diff --git a/doc/sphinx/source/index.rst b/doc/sphinx/source/index.rst
new file mode 100644
index 0000000..c8c2ded
--- /dev/null
+++ b/doc/sphinx/source/index.rst
@@ -0,0 +1,23 @@
+.. OBITools documentation master file, created by
+ sphinx-quickstart on Tue Dec 8 21:30:02 2009.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. role:: latex(raw)
+ :format: latex
+
+
+.. toctree::
+ :maxdepth: 2
+
+ Welcome to OBITools's <introduction>
+ The OBITools scripts <scripts>
+ Sample tutorials <tutorials>
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
+
diff --git a/doc/sphinx/source/introduction.rst b/doc/sphinx/source/introduction.rst
new file mode 100644
index 0000000..648d615
--- /dev/null
+++ b/doc/sphinx/source/introduction.rst
@@ -0,0 +1,8 @@
+Introduction
+============
+
+.. toctree::
+ :maxdepth: 2
+
+ Welcome to OBITools's <welcome>
+ The file formats <formats>
diff --git a/doc/sphinx/source/iupac.rst b/doc/sphinx/source/iupac.rst
new file mode 100644
index 0000000..c3e00a0
--- /dev/null
+++ b/doc/sphinx/source/iupac.rst
@@ -0,0 +1,63 @@
+The IUPAC code
+==============
+
+The International Union of Pure and Applied Chemistry (IUPAC_) defined
+the standard code for representing protein or DNA sequences.
+
+Nucleic IUPAC Code
+------------------
+
+======== =================================
+**Code** **Nucleotide**
+======== =================================
+ A Adenine
+ C Cytosine
+ G Guanine
+ T Thymine
+ U Uracil
+ R Purine (A or G)
+ Y Pyrimidine (C, T, or U)
+ M C or A
+ K T, U, or G
+ W T, U, or A
+ S C or G
+ B C, T, U, or G (not A)
+ D A, T, U, or G (not C)
+ H A, T, U, or C (not G)
+ V A, C, or G (not T, not U)
+ N Any base (A, C, G, T, or U)
+======== =================================
+
+
+Peptidic one and three letters IUPAC code
+-----------------------------------------
+
+============ ============= =======================================
+**1-letter** **3-letters** **Amino acid**
+============ ============= =======================================
+ A Ala Alanine
+ R Arg Arginine
+ N Asn Asparagine
+ D Asp Aspartic acid
+ C Cys Cysteine
+ Q Gln Glutamine
+ E Glu Glutamic acid
+ G Gly Glycine
+ H His Histidine
+ I Ile Isoleucine
+ L Leu Leucine
+ K Lys Lysine
+ M Met Methionine
+ F Phe Phenylalanine
+ P Pro Proline
+ S Ser Serine
+ T Thr Threonine
+ W Trp Tryptophan
+ Y Tyr Tyrosine
+ V Val Valine
+ B Asx Aspartic acid or Asparagine
+ Z Glx Glutamine or Glutamic acid
+ X Xaa Any amino acid
+============ ============= =======================================
+
+.. _IUPAC: http://www.iupac.org/
\ No newline at end of file
diff --git a/doc/sphinx/source/manipulations.rst b/doc/sphinx/source/manipulations.rst
new file mode 100644
index 0000000..04cc398
--- /dev/null
+++ b/doc/sphinx/source/manipulations.rst
@@ -0,0 +1,15 @@
+Computations on sequences
+=========================
+
+
+.. toctree::
+ :maxdepth: 2
+
+ scripts/illuminapairedend
+ scripts/ngsfilter
+ scripts/obicomplement
+ scripts/obiclean
+ scripts/obicut
+ scripts/obijoinpairedend
+ scripts/obiuniq
+
\ No newline at end of file
diff --git a/doc/sphinx/source/obitaxonomy.rst b/doc/sphinx/source/obitaxonomy.rst
new file mode 100644
index 0000000..7dee5a0
--- /dev/null
+++ b/doc/sphinx/source/obitaxonomy.rst
@@ -0,0 +1,26 @@
+The OBITools formatted taxonomy
+===============================
+
+Management of the taxonomy
+--------------------------
+
+Filtering and annotation steps in the processing of DNA metabarcoding sequence data are greatly
+eased by the explicit association of taxonomic information to sequences together with an easy
+access to the taxonomy. Taxonomic information, including a taxonomic identifier, can thus be
+stored in the set of attributes of each sequence record. Specifically, the `taxid` attribute
+is used by the OBITools when querying taxonomic information of a sequence record, nevertheless
+several OBITools commands can annotate sequence records with taxonomy-related attributes for
+the user's convenience. The value of the `taxid` attribute must be a unique integer referring
+unambiguously to one taxon in the associated taxonomic database (note that a taxon can be any node
+in the taxonomic tree). Although this is not mandatory, the NCBI taxonomy is a preferred source of
+taxonomic information as the OBITools provide commands to easily extract the full taxonomic
+information from it. The command `obitaxonomy` is useful to build a taxonomic database in the
+OBITools format from a dump of the NCBI taxonomic database (downloadable at the following
+URL: ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz). Moreover, the `obitaxonomy` command can
+enrich an existing taxonomy with private taxa, therefore enabling to associate sequence records to
+taxa not initially present in the reference taxonomic database. As the OBITools have access to the
+full taxonomic tree topology, they are able to inform higher taxonomic levels from a taxon identifier
+(e.g. the family, order, class, phylum, etc. corresponding to a genus) leading to efficient and
+simple annotation and querying of taxonomic information.
+
+
diff --git a/doc/sphinx/source/optionsSet/defaultoptions.txt b/doc/sphinx/source/optionsSet/defaultoptions.txt
new file mode 100644
index 0000000..20b6f3a
--- /dev/null
+++ b/doc/sphinx/source/optionsSet/defaultoptions.txt
@@ -0,0 +1,13 @@
+Common options
+--------------
+
+.. program:: obitools
+
+.. cmdoption:: -h, --help
+
+ Shows this help message and exits.
+
+.. cmdoption:: --DEBUG
+
+ Sets logging in debug mode.
+
diff --git a/doc/sphinx/source/optionsSet/inputformat.txt b/doc/sphinx/source/optionsSet/inputformat.txt
new file mode 100644
index 0000000..8a8d181
--- /dev/null
+++ b/doc/sphinx/source/optionsSet/inputformat.txt
@@ -0,0 +1,79 @@
+Options to specify input format
+-------------------------------
+
+.. program:: obitools
+
+
+Restrict the analysis to a sub-part of the input file
+.....................................................
+
+.. cmdoption:: --skip <N>
+
+ The N first sequence records of the file are discarded from the analysis and
+ not reported to the output file
+
+
+.. cmdoption:: --only <N>
+
+ Only the N next sequence records of the file are analyzed. The following sequences
+ in the file are neither analyzed nor reported to the output file.
+ This option can be used conjointly with the `--skip` option.
+
+
+
+Sequence annotated format
+.........................
+
+.. cmdoption:: --genbank
+
+ Input file is in :doc:`genbank <../genbank>` format.
+
+.. cmdoption:: --embl
+
+ Input file is in :doc:`embl <../embl>` format.
+
+:doc:`fasta <../fasta>` related format
+.......................................
+
+
+.. cmdoption:: --fasta
+
+ Input file is in :doc:`fasta <../fasta>` format (including
+ OBITools :doc:`fasta <../fasta>` extensions).
+
+:doc:`fastq <../fastq>` related format
+.......................................
+
+.. cmdoption:: --sanger
+
+ Input file is in Sanger :doc:`fastq <../fastq>` format (standard
+ :doc:`fastq <../fastq>` used by HiSeq/MiSeq sequencers).
+
+.. cmdoption:: --solexa
+
+ Input file is in :doc:`fastq <../fastq>` format produced by
+ Solexa (Ga IIx) sequencers.
+
+ecoPCR related format
+.....................
+
+.. cmdoption:: --ecopcr
+
+ Input file is in :doc:`ecoPCR <../formats>` format.
+
+.. cmdoption:: --ecopcrdb
+
+ Input is an :doc:`ecoPCR <../formats>` database.
+
+Specifying the sequence type
+............................
+
+.. cmdoption:: --nuc
+
+ Input file contains nucleic sequences.
+
+.. cmdoption:: --prot
+
+ Input file contains protein sequences.
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/optionsSet/outputformat.txt b/doc/sphinx/source/optionsSet/outputformat.txt
new file mode 100644
index 0000000..4e58cc8
--- /dev/null
+++ b/doc/sphinx/source/optionsSet/outputformat.txt
@@ -0,0 +1,30 @@
+Options to specify output format
+--------------------------------
+
+.. program:: obitools
+
+Standard output format
+......................
+
+.. cmdoption:: --fasta-output
+
+ Output sequences in ``OBITools`` :doc:`fasta <../fasta>` format
+
+.. cmdoption:: --fastq-output
+
+ Output sequences in Sanger :doc:`fastq <../fastq>` format
+
+Generating an ecoPCR database
+.............................
+
+.. cmdoption:: --ecopcrdb-output=<PREFIX_FILENAME>
+
+ Creates an ecoPCR database from sequence records results
+
+Miscellaneous option
+....................
+
+.. cmdoption:: --uppercase
+
+ Print sequences in upper case (default is lower case)
+
diff --git a/doc/sphinx/source/optionsSet/sequenceEdit.txt b/doc/sphinx/source/optionsSet/sequenceEdit.txt
new file mode 100644
index 0000000..1952f6d
--- /dev/null
+++ b/doc/sphinx/source/optionsSet/sequenceEdit.txt
@@ -0,0 +1,83 @@
+Sequence record editing options
+-------------------------------
+
+.. cmdoption:: --seq-rank
+
+ Adds a new attribute named ``seq_rank`` to the sequence record indicating
+ its entry number in the sequence file.
+
+
+.. cmdoption:: -R <OLD_NAME>:<NEW_NAME>, --rename-tag=<OLD_NAME>:<NEW_NAME>
+
+ Changes attribute name <OLD_NAME> to <NEW_NAME>. When attribute
+ named <OLD_NAME> is missing, the sequence record is
+ skipped and the next one is examined.
+
+.. cmdoption:: --delete-tag=<KEY>
+
+ Deletes attribute named <KEY>. When this attribute
+ is missing, the sequence record is skipped and the
+ next one is examined.
+
+.. cmdoption:: -S <KEY>:<PYTHON_EXPRESSION>, --set-tag=<KEY>:<PYTHON_EXPRESSION>
+
+ Creates a new attribute named with a key <KEY> and a
+ value computed from <PYTHON_EXPRESSION>.
+
+.. cmdoption:: --tag-list=<FILENAME>
+
+ <FILENAME> points to a file containing attribute
+ names and values to modify for specified sequence records.
+
+.. cmdoption:: --set-identifier=<PYTHON_EXPRESSION>
+
+ Sets sequence record identifier with a value computed
+ from <PYTHON_EXPRESSION>.
+
+.. cmdoption:: --run=<PYTHON_EXPRESSION>
+
+ Runs a python expression on each selected sequence.
+
+.. cmdoption:: --set-sequence=<PYTHON_EXPRESSION>
+
+ Changes the sequence itself with a value computed from
+ <PYTHON_EXPRESSION>.
+
+
+.. cmdoption:: -T, --set-definition=<PYTHON_EXPRESSION>
+
+ Sets sequence definition with a value computed from
+ <PYTHON_EXPRESSION>.
+
+
+.. cmdoption:: -O, --only-valid-python
+
+ Allows only valid python expressions.
+
+.. cmdoption:: -C, --clear
+
+ Clears all attributes associated to the sequence records.
+
+.. cmdoption:: -k <KEY>, --keep=<KEY>
+
+ Keeps only attribute with key <KEY>. Several ``-k``
+ options can be combined.
+
+.. cmdoption:: --length
+
+ Adds attribute with ``seq_length`` as a key and sequence length as a value.
+
+.. cmdoption:: --with-taxon-at-rank=<RANK_NAME>
+
+ Adds taxonomic annotation at taxonomic rank
+ <RANK_NAME>.
+
+.. cmdoption:: -m <MCLFILE>, --mcl=<MCLFILE>
+
+ Creates a new attribute containing the number of the
+ cluster the sequence record was assigned to, as
+ indicated in file <MCLFILE>.
+
+.. cmdoption:: --uniq-id
+
+ Forces sequence record ids to be unique.
diff --git a/doc/sphinx/source/optionsSet/sequenceFilter.txt b/doc/sphinx/source/optionsSet/sequenceFilter.txt
new file mode 100644
index 0000000..c56e29d
--- /dev/null
+++ b/doc/sphinx/source/optionsSet/sequenceFilter.txt
@@ -0,0 +1,174 @@
+Sequence record selection options
+---------------------------------
+
+.. cmdoption:: -s <REGULAR_PATTERN>, --sequence=<REGULAR_PATTERN>
+
+ Regular expression pattern to be tested against the
+ sequence itself. The pattern is case insensitive.
+
+ *Examples:*
+
+ .. code-block:: bash
+
+ > obigrep -s 'GAATTC' seq1.fasta > seq2.fasta
+
+ Selects only the sequence records that contain an *EcoRI* restriction site.
+
+ .. code-block:: bash
+
+ > obigrep -s 'A{10,}' seq1.fasta > seq2.fasta
+
+ Selects only the sequence records that contain a stretch of at least 10 ``A``.
+
+ .. code-block:: bash
+
+ > obigrep -s '^[ACGT]+$' seq1.fasta > seq2.fasta
+
+ Selects only the sequence records that do not contain ambiguous nucleotides.
+
+
+.. cmdoption:: -D <REGULAR_PATTERN>, --definition=<REGULAR_PATTERN>
+
+ Regular expression pattern to be tested against the
+ definition of the sequence record. The pattern is case
+ sensitive.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obigrep -D '[Cc]hloroplast' seq1.fasta > seq2.fasta
+
+ Selects only the sequence records whose definition contains ``chloroplast`` or
+ ``Chloroplast``.
+
+
+.. cmdoption:: -I <REGULAR_PATTERN>, --identifier=<REGULAR_PATTERN>
+
+ Regular expression pattern to be tested against the
+ identifier of the sequence record. The pattern is case
+ sensitive.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obigrep -I '^GH' seq1.fasta > seq2.fasta
+
+ Selects only the sequence records whose identifier begins with ``GH``.
+
+
+.. cmdoption:: --id-list=<FILENAME>
+
+ ``<FILENAME>`` points to a text file containing the list of sequence
+ record identifiers to be selected.
+ The file format consists of a single identifier per line.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obigrep --id-list=my_id_list.txt seq1.fasta > seq2.fasta
+
+ Selects only the sequence records whose identifier is present in the
+ ``my_id_list.txt`` file.
+
+.. cmdoption:: -a <KEY>:<REGULAR_PATTERN>,
+.. cmdoption:: --attribute=<KEY>:<REGULAR_PATTERN>
+
+ Regular expression pattern matched against the
+ :doc:`attributes of the sequence record <../fasta>`. The value of this option
+ is of the form ``key:regular_pattern``. The
+ pattern is case sensitive. Several ``-a`` options can be
+ used on the same command line and in this last case,
+ the selected sequence records will match all constraints.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obigrep -a 'family_name:Asteraceae' seq1.fasta > seq2.fasta
+
+ Selects the sequence records containing an attribute whose key is ``family_name`` and value
+ is ``Asteraceae``.
+
+
+.. cmdoption:: -A <KEY>, --has-attribute=<KEY>
+
+ Selects sequence records having an attribute whose key = <KEY>.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obigrep -A taxid seq1.fasta > seq2.fasta
+
+ Selects only the sequence records having a *taxid* attribute defined.
+
+
+.. cmdoption:: -p <PYTHON_EXPRESSION>, --predicat=<PYTHON_EXPRESSION>
+
+ Python boolean expression to be evaluated for each
+ sequence record. The attribute keys defined for each sequence record
+ can be used in the expression as variable names.
+ An extra variable named 'sequence' refers to the
+ sequence record itself.
+ Several -p options can be used on the same command
+ line and in this last case,
+ the selected sequence records will match all constraints.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obigrep -p '(forward_error<2) and (reverse_error<2)' \
+ seq1.fasta > seq2.fasta
+
+ Selects only the sequence records whose ``forward_error`` and ``reverse_error``
+ attributes have a value smaller than two.
+
+
+.. cmdoption:: -L <##>, --lmax=<##>
+
+ Keeps sequence records whose sequence length is
+ equal or shorter than ``lmax``.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obigrep -L 100 seq1.fasta > seq2.fasta
+
+ Selects only the sequence records that have a sequence
+ length equal or shorter than 100bp.
+
+
+.. cmdoption:: -l <##>, --lmin=<##>
+
+ Selects sequence records whose sequence length is
+ equal or longer than ``lmin``.
+
+
+ *Examples:*
+
+ .. code-block:: bash
+
+ > obigrep -l 100 seq1.fasta > seq2.fasta
+
+ Selects only the sequence records that have a sequence length
+ equal or longer than 100bp.
+
+
+.. cmdoption:: -v, --inverse-match
+
+ Inverts the sequence record selection.
+
+ *Examples:*
+
+ .. code-block:: bash
+
+ > obigrep -v -l 100 seq1.fasta > seq2.fasta
+
+ Selects only the sequence records that have a sequence length shorter than 100bp.
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/optionsSet/taxonomyDB.txt b/doc/sphinx/source/optionsSet/taxonomyDB.txt
new file mode 100644
index 0000000..5e7a417
--- /dev/null
+++ b/doc/sphinx/source/optionsSet/taxonomyDB.txt
@@ -0,0 +1,13 @@
+Taxonomy related options
+------------------------
+
+.. program:: taxonomy
+
+.. cmdoption:: -d <FILENAME>, --database=<FILENAME>
+
+ ecoPCR taxonomy Database name
+
+.. cmdoption:: -t <FILENAME>, --taxonomy-dump=<FILENAME>
+
+ NCBI Taxonomy dump repository name
+
diff --git a/doc/sphinx/source/optionsSet/taxonomyFilter.txt b/doc/sphinx/source/optionsSet/taxonomyFilter.txt
new file mode 100644
index 0000000..568be9b
--- /dev/null
+++ b/doc/sphinx/source/optionsSet/taxonomyFilter.txt
@@ -0,0 +1,14 @@
+.. include:: ../optionsSet/taxonomyDB.txt
+
+.. cmdoption:: --require-rank=<RANK_NAME>
+
+ select sequence with taxid tag containing a parent of
+ rank <RANK_NAME>
+
+.. cmdoption:: -r <TAXID>, --required=<TAXID>
+
+ required taxid
+
+.. cmdoption:: -i <TAXID>, --ignore=<TAXID>
+
+ ignored taxid
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts.rst b/doc/sphinx/source/scripts.rst
new file mode 100644
index 0000000..3e85117
--- /dev/null
+++ b/doc/sphinx/source/scripts.rst
@@ -0,0 +1,13 @@
+OBITools scripts
+================
+
+.. toctree::
+ :maxdepth: 2
+
+ barcodes
+ conversions
+ annotations
+ manipulations
+ filtering
+ statistics
+ utilities
diff --git a/doc/sphinx/source/scripts/ecoPCR.rst b/doc/sphinx/source/scripts/ecoPCR.rst
new file mode 100644
index 0000000..720cf6d
--- /dev/null
+++ b/doc/sphinx/source/scripts/ecoPCR.rst
@@ -0,0 +1,193 @@
+:py:mod:`ecoPCR`: *in silico* PCR
+=================================
+
+:py:mod:`ecoPCR` *in silico* PCR preserves the taxonomic information
+of the selected sequences, and allows various specified conditions for the
+*in silico* amplification.
+
+In addition to the different options, the command requires two arguments corresponding
+to the two primers.
+
+References
+----------
+
+ Bellemain E, Carlsen T, Brochmann C, Coissac E, Taberlet P, Kauserud H (2010) ITS as an environmental DNA barcode for fungi: an *in silico* approach reveals potential PCR biases. BMC Microbiology, 10, 189.
+
+ Ficetola GF, Coissac E, Zundel S, Riaz T, Shehzad W, Bessiere J, Taberlet P, Pompanon F (2010) An *in silico* approach for the evaluation of DNA barcodes. BMC Genomics, 11, 434.
+
+
+:py:mod:`ecoPCR` specific options
+---------------------------------
+
+ .. cmdoption:: -d <filename>
+
+ Filename containing the database used for the *in silico* PCR. The database
+ must be in the ``ecoPCR format`` (see :doc:`obiconvert <./obiconvert>`).
+
+ .. WARNING:: This option is compulsory.
+
+
+ .. cmdoption:: -e <INTEGER>
+
+ Maximum number of errors (mismatches) allowed per primer (default: 0).
+ See example 2 for avoiding errors on the 3' end of the primers.
+
+
+ .. cmdoption:: -l <INTEGER>
+
+ Minimum length of the *in silico* amplified DNA fragment, excluding primers.
+
+
+ .. cmdoption:: -L <INTEGER>
+
+ Maximum length of the *in silico* amplified DNA fragment, excluding primers.
+
+
+ .. cmdoption:: -r <TAXID>
+
+ Only the sequence records corresponding to the taxonomic group identified by its
+ ``TAXID`` are considered for the *in silico* PCR. The ``TAXID`` is an integer that
+ can be found either in the NCBI taxonomic database, or using the :doc:`ecofind <./ecofind>` program.
+
+ .. cmdoption:: -i <TAXID>
+
+ The sequences of the taxonomic group identified by its ``TAXID`` are not considered for
+ the *in silico* PCR.
+
+
+ .. cmdoption:: -c
+
+ Considers that the sequences of the database are circular (e.g. mitochondrial
+ or chloroplast DNA).
+
+
+ .. cmdoption:: -D <INTEGER>
+
+ Keeps the specified number of nucleotides on each side of the *in silico*
+ amplified sequences, (including the amplified DNA fragment plus the two target
+ sequences of the primers).
+
+
+ .. cmdoption:: -k
+
+ Print in the programme output the kingdom of the *in silico* amplified
+ sequences (default: print the superkingdom).
+
+
+ .. cmdoption:: -m <1|2>
+
+ Defines the method used for estimating the Tm (melting temperature) between
+ the primers and their corresponding target sequences (default: 1).
+
+ 1 SantaLucia method (SantaLucia J (1998) A unified view of polymer, dumbbell, and oligonucleotide DNA nearest-neighbor thermodynamics. PNAS, 95, 1460-1465).
+
+ 2 Owczarzy method (Owczarzy R, Vallone PM, Gallo FJ *et al.* (1997) Predicting sequence-dependent melting stability of short duplex DNA oligomers. Biopolymers, 44, 217-239).
+
+
+ .. cmdoption:: -a <FLOAT>
+
+ Salt concentration used for estimating the *Tm* (default: 0.05).
+
+
+ .. cmdoption:: -h
+
+ Print help.
+
+
+
+Output file
+-----------
+
+ The output file contains several columns, with '|' as separator, and describes
+ the properties of the *in silico* amplified sequences.
+
+ column 1: sequence identification in the reference database (= accession number when using EMBL or GenBank for building the reference database)
+
+ column 2: length of the original sequence
+
+ column 3: scientific name as indicated in the reference database
+
+ column 4: taxonomic rank as indicated in the reference database
+
+ column 5: *taxid* of the species
+
+ column 6: scientific name of the species
+
+ column 7: *taxid* of the genus
+
+ column 8: genus name
+
+ column 9: *taxid* of the family
+
+ column 10: family name
+
+ column 11: *taxid* of the super kingdom (or of the kingdom if the ``-k`` option is set)
+
+ column 12: super kingdom name (or kingdom name if the ``-k`` option is set)
+
+ column 13: strand (D or R, corresponding to direct or reverse, respectively)
+
+ column 14: target sequence of the first primer
+
+ column 15: number of mismatches for the first primer
+
+ column 16: target sequence of the second primer
+
+ column 17: number of mismatches for the second primer
+
+ column 18: length of the amplified fragment (excluding primers)
+
+ column 19: sequence
+
+ column 20: definition
+
+
+
+Examples
+--------
+
+ *Example 1:*
+
+ .. code-block:: bash
+
+ > ecoPCR -d mydatabase -e 3 -l 50 -L 500 \
+ TCACAGACCTGTTATTGC TYTGTCTGSTTRATTSCG > mysequences.ecopcr
+
+ Launches an *in silico* PCR on mydatabase (see :doc:`obiconvert <./obiconvert>` for a description
+ of the database format), with a maximum of three mismatches for each primer. The minimum and
+ maximum amplified sequence lengths (excluding primers) are 50 bp and 500 bp, respectively. The
+ primers used are TCACAGACCTGTTATTGC and TYTGTCTGSTTRATTSCG (possibility to use
+ :doc:`IUPAC codes <../iupac>`). They amplify a short portion of the nuclear 18S gene. The
+ results are saved in the *mysequences.ecopcr* file.
+
+
+
+ *Example 2:*
+
+ .. code-block:: bash
+
+ > ecoPCR -d mydatabase -e 2 -l 80 -L 120 -D 50 -r 7742 \
+ TTAGATACCCCACTATG#C# TAGAACAGGCTCCTCTA#G# > mysequences.ecopcr
+
+ Launches an *in silico* PCR on mydatabase (see :doc:`obiconvert <./obiconvert>` for a description
+ of the database format), with a maximum of two mismatches for each primer, but with a perfect match
+ on the last two nucleotides of the 3' end of each primer (a perfect match can be enforced by adding
+ a '#' after the considered nucleotide). The minimum and maximum amplified sequence lengths (excluding
+ primers) are 80 bp and 120 bp, respectively. The ``-D`` option keeps 50 nucleotides on each side of
+ the *in silico* amplified sequences, (including the amplified DNA fragment plus the two target
+ sequences of the primers). The primers used are TTAGATACCCCACTATGC and TAGAACAGGCTCCTCTAG. They
+ amplify a short portion of the mitochondrial 12S gene. The ``-r`` option restricts the search to
+ vertebrates (7742 is the :doc:`taxid <../attributes/taxid>` of vertebrates). The results are saved
+ in the ``mysequences.ecopcr`` file.
+
+
+:py:mod:`ecoPCR` used sequence attributes
+-----------------------------------------
+
+ - :doc:`taxid <../attributes/taxid>`
+
+
+
+
+
+
diff --git a/doc/sphinx/source/scripts/ecoPrimers.rst b/doc/sphinx/source/scripts/ecoPrimers.rst
new file mode 100644
index 0000000..313c084
--- /dev/null
+++ b/doc/sphinx/source/scripts/ecoPrimers.rst
@@ -0,0 +1,253 @@
+:py:mod:`ecoPrimers`: new barcode markers and primers
+=====================================================
+
+Authors: Eric Coissac <eric.coissac at metabarcoding.org> and Tiayyba Riaz <tiayyba.riaz at metabarcoding.org>
+
+:py:mod:`ecoPrimers` designs the most efficient barcode markers and primers, based
+on a set of reference sequence records, and according to specified parameters.
+
+Reference
+---------
+
+ Riaz T, Shehzad W, Viari A, Pompanon F, Taberlet P, Coissac E (2011) ecoPrimers: inference of new DNA
+ barcode markers from whole genome sequence analysis. Nucleic Acids Research, 39, e145.
+
+
+
+:py:mod:`ecoPrimers` specific options
+-------------------------------------
+
+ .. cmdoption:: -d <filename>
+
+ Filename containing the reference sequence records used for designing the barcode
+ markers and primers (see :doc:`obiconvert <./obiconvert>` for a description
+ of the database format).
+
+ .. WARNING:: This option is compulsory.
+
+
+ .. cmdoption:: -e <INTEGER>
+
+ Maximum number of errors (mismatches) allowed per primer (default: 0).
+
+
+ .. cmdoption:: -l <INTEGER>
+
+ Minimum length of the barcode, excluding primers.
+
+
+ .. cmdoption:: -L <INTEGER>
+
+ Maximum length of the barcode, excluding primers.
+
+
+ .. cmdoption:: -r <TAXID>
+
+ Defines the example sequence records (example dataset). Only the sequences of the corresponding
+ taxonomic group identified by its ``TAXID`` are taken into account for designing the barcodes and
+ the primers. The ``TAXID`` is an integer that can be found either in the NCBI taxonomic database,
+ or using the :doc:`ecofind <ecofind>` program.
+
+ .. cmdoption:: -i <TAXID>
+
+ Defines the counterexample sequence records (counterexample dataset). The barcodes and primers
+ will be selected in order to avoid the counterexample taxonomic group identified by its ``TAXID``.
+
+
+ .. cmdoption:: -E <TAXID>
+
+ Defines a counterexample taxonomic group (identified by its ``TAXID``) within the example
+ dataset.
+
+
+ .. cmdoption:: -c
+
+ Considers that the sequences of the database are circular (e.g. mitochondrial
+ or chloroplast DNA).
+
+
+ .. cmdoption:: -3 <INTEGER>
+
+ Defines the number of nucleotides on the 3' end of the primers that must have a strict match
+ with their target sequences.
+
+
+ .. cmdoption:: -q <FLOAT>
+
+ Defines the strict matching quorum, i.e. the proportion of the sequence records in which a
+ strict match between the primers and their targets occurs (default: 0.7)
+
+
+ .. cmdoption:: -s <FLOAT>
+
+ Defines the sensitivity quorum, i.e. the proportion of the example sequence records that
+ must fulfill the specified parameters for designing the barcodes and the primers.
+
+
+ .. cmdoption:: -x <FLOAT>
+
+ Defines the false positive quorum, i.e. the maximum proportion of the counterexample
+ sequence records that fulfill the specified parameters for designing the barcodes and
+ the primers.
+
+
+ .. cmdoption:: -t <TAXONOMIC_LEVEL>
+
+ Defines the taxonomic level that is considered for evaluating the barcodes and primers in
+ the output of :py:mod:`ecoPrimers`. The default taxonomic level is the species level. When
+ using a taxonomic database built from :doc:`NCBI taxonomy dump files <../taxdump>`, the
+ other possible taxonomic levels are genus, family, order, class, phylum, kingdom, and
+ superkingdom.
+
+
+ .. cmdoption:: -D
+
+ Sets the double strand mode.
+
+
+ .. cmdoption:: -S
+
+ Sets the single strand mode.
+
+
+ .. cmdoption:: -O <INTEGER>
+
+ Sets the primer length (default: 18).
+
+
+ .. cmdoption:: -m <1|2>
+
+ Defines the method used for estimating the *Tm* (melting temperature) between
+ the primers and their corresponding target sequences (default: 1).
+
+ 1 SantaLucia method (SantaLucia J (1998) A unified view of polymer, dumbbell, and oligonucleotide DNA nearest-neighbor thermodynamics. PNAS, 95, 1460-1465).
+
+ 2 Owczarzy method (Owczarzy R, Vallone PM, Gallo FJ *et al.* (1997) Predicting sequence-dependent melting stability of short duplex DNA oligomers. Biopolymers, 44, 217-239).
+
+
+ .. cmdoption:: -a <FLOAT>
+
+ Salt concentration used for estimating the *Tm* (default: 0.05).
+
+
+ .. cmdoption:: -U
+
+ No multi match of a primer on the same sequence record.
+
+
+ .. cmdoption:: -R <TEXT>
+
+ Defines the reference sequence by indicating its identifier in the database.
+
+
+ .. cmdoption:: -A
+
+ Prints the list of all identifiers of sequence records in the database.
+
+
+ .. cmdoption:: -f
+
+ Remove data mining step during strict primer identification.
+
+
+ .. cmdoption:: -v
+
+ Stores statistic file about memory usage during strict primer identification.
+
+
+ .. cmdoption:: -h
+
+ Print help.
+
+
+
+Output file
+-----------
+
+ The output file contains several columns, with '|' as separator, and describes
+ the characteristics of each barcode and its associated primers.
+
+ column 1: serial number
+
+ column 2: sequence of primer 1
+
+ column 3: sequence of primer 2
+
+ column 4: *Tm* (melting temperature) of primer 1, without mismatch
+
+ column 5: lowest *Tm* of primer 1 against example sequence records
+
+ column 6: *Tm* of primer 2, without mismatch
+
+ column 7: lowest *Tm* of primer 2 against example sequence records
+
+ column 8: number of C or G in primer 1
+
+ column 9: number of C or G in primer 2
+
+ column 10: GG (*Good-Good*) means that both primers are specific to the example dataset,
+ GB or BG (*Good-Bad* or *Bad-Good*) means that only one of the two primers
+ is specific to the example dataset
+
+ column 11: number of sequence records of the example dataset that are properly amplified according to the specified parameters
+
+ column 12: proportion of sequence records of the example dataset that are properly amplified according to the specified parameters
+
+ column 13: yule-like output
+
+ column 14: number of taxa of the example dataset that are properly amplified according to the specified parameters
+
+ column 15: number of taxa of the counterexample dataset that are properly amplified according to the specified parameters
+
+ column 16: proportion of taxa of the example dataset that are properly amplified according to the specified parameters (*Bc* index)
+
+ column 17: number of taxa of the example dataset that are properly identified
+
+ column 18: proportion of taxa of the example dataset that are properly identified (*Bs* index)
+
+ column 19: minimum length of the barcode in base pairs for the example sequence records (excluding primers)
+
+ column 20: maximum length of the barcode in base pairs for the example sequence records (excluding primers)
+
+ column 21: average length of the barcode in base pairs for the example sequence records (excluding primers)
+
+
+
+Examples
+--------
+
+ *Example 1:*
+
+ .. code-block:: bash
+
+ > ecoPrimers -d mydatabase -e 3 -l 50 \
+ -L 800 -r 2759 -3 2 > mybarcodes.ecoprimers
+
+ Launches a search for barcodes and corresponding primers on mydatabase (see
+ :doc:`obiconvert <./obiconvert>` for a description of the database format), with a maximum
+ of three mismatches for each primer. The minimum and maximum barcode lengths (excluding
+ primers) are 50 bp and 800 bp, respectively. The search is restricted to the taxonomic
+ group identified by its *taxid* (2759 corresponds to the Diatoma). The two last
+ nucleotides on the 3' end of the primers must have a perfect match with their target sequences.
+ The results are saved in the mybarcodes.ecoprimers file.
+
+
+
+ *Example 2:*
+
+ .. code-block:: bash
+
+ > ecoPrimers -d mydatabase -e 2 -l 30 -L 120 \
+ -r 7742 -i 2 -E 9604 -3 2 > mybarcodes.ecoprimers
+
+ Launches a search for barcodes and corresponding primers on mydatabase (see :doc:`obiconvert <./obiconvert>`
+ for a description of the database format), with a maximum of two mismatches for each primer. The minimum and
+ maximum barcode lengths (excluding primers) are 30 bp and 120 bp, respectively. The search is
+ restricted to the Vertebrates, excluding Bacteria and Hominidae (7742, 2, and 9604 correspond to
+ the `TAXID` of Vertebrates, Bacteria, and Hominidae, respectively). The two last nucleotides on
+ the 3' end of the primers must have a perfect match with their target sequences. The results
+ are saved in the mybarcodes.ecoprimers file.
+
+
+
+
diff --git a/doc/sphinx/source/scripts/ecodbtaxstat.rst b/doc/sphinx/source/scripts/ecodbtaxstat.rst
new file mode 100644
index 0000000..4b89815
--- /dev/null
+++ b/doc/sphinx/source/scripts/ecodbtaxstat.rst
@@ -0,0 +1,49 @@
+.. automodule:: ecodbtaxstat
+
+ :py:mod:`ecodbtaxstat` specific option
+ --------------------------------------
+
+ .. cmdoption:: --rank=<TAXONOMIC_RANK>
+
+ The taxonomic rank at which frequencies have to be computed.
+ Possible values are:
+
+ - class
+ - family
+ - forma
+ - genus
+ - infraclass
+ - infraorder
+ - kingdom
+ - order
+ - parvorder
+ - phylum
+ - species (default)
+ - species group
+ - species subgroup
+ - subclass
+ - subfamily
+ - subgenus
+ - subkingdom
+ - suborder
+ - subphylum
+ - subspecies
+ - subtribe
+ - superclass
+ - superfamily
+ - superkingdom
+ - superorder
+ - superphylum
+ - tribe
+ - varietas
+
+ .. include:: ../optionsSet/taxonomyFilter.txt
+
+ :py:mod:`ecodbtaxstat` used sequence attributes
+ -----------------------------------------------
+
+ - :doc:`taxid <../attributes/taxid>`
+
+
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/ecofind.rst b/doc/sphinx/source/scripts/ecofind.rst
new file mode 100644
index 0000000..9fcc032
--- /dev/null
+++ b/doc/sphinx/source/scripts/ecofind.rst
@@ -0,0 +1,85 @@
+:py:mod:`ecofind`: querying a taxonomic database
+================================================
+
+:py:mod:`ecofind` retrieves taxonomic information from a taxonomic database
+given either a *taxid* or a regular expression pattern.
+
+:py:mod:`ecofind` specific options
+----------------------------------
+
+ .. cmdoption:: -d <filename>
+
+ Filename containing the database used for the *in silico* PCR. The database
+ must be in the ``ecoPCR format`` (see :doc:`obiconvert <./obiconvert>`).
+
+ .. WARNING:: This option is compulsory.
+
+ .. cmdoption:: -a
+
+ Enable the search on all alternative names and not only scientific names.
+
+ .. cmdoption:: -L
+
+ List all taxonomic ranks available for the -r option and exit.
+
+ .. cmdoption:: -r
+
+ Restrict to given taxonomic rank.
+
+ .. cmdoption:: -s
+
+ Displays all subtree's information for the given taxid.
+
+ .. cmdoption:: -p
+
+ Displays all parental tree's information for the given taxid.
+
+ .. cmdoption:: -P
+
+ Displays the taxonomic path as a supplementary column in the output.
+
+ .. cmdoption:: -h
+
+ Print help.
+
+
+
+Output file
+-----------
+
+ The output file contains several columns, with '|' as separator, and describes
+ the properties of the retrieved *taxids*.
+
+ column 1: the *taxid*
+
+ column 2: the taxonomic rank
+
+ column 3: the name (not only scientific)
+
+ column 4: class name
+
+ column 5: the scientific name
+
+ column 6 (optional): the full taxonomic path of the *taxid*
+
+
+
+Examples
+--------
+
+ *Example 1:*
+
+ .. code-block:: bash
+
+ > ecofind -d mydatabase 'homo ' > homo_.tax
+
+ Retrieves all *taxids* whose associated names contain 'homo '.
+
+ *Example 2:*
+
+ .. code-block:: bash
+
+ > ecofind -d mydatabase -p 9606 -P > sapiens.info.tax
+
+ Retrieves all parent taxa of the 9606 *taxid*. The -P option adds a supplementary column
+ with the full path for each *taxid*.
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/ecotag.rst b/doc/sphinx/source/scripts/ecotag.rst
new file mode 100644
index 0000000..88f8a86
--- /dev/null
+++ b/doc/sphinx/source/scripts/ecotag.rst
@@ -0,0 +1,91 @@
+.. automodule:: ecotag
+
+ :py:mod:`ecotag` specific options
+ ---------------------------------
+
+ .. cmdoption:: -R <FILENAME>, --ref-database=<FILENAME>
+
+ <FILENAME> is the fasta file containing the reference sequences
+
+ .. cmdoption:: -m FLOAT, --minimum-identity=FLOAT
+
+ When the best match with the reference database presents an identity
+ level below FLOAT, the taxonomic assignment for the sequence record
+ is not computed. The sequence record is nevertheless included in the
+ output file. FLOAT is included in a [0,1] interval.
+
+ .. cmdoption:: --minimum-circle=FLOAT
+
+ minimum identity considered for the assignment circle.
+ FLOAT is included in a [0,1] interval.
+
+ .. cmdoption:: -x RANK, --explain=RANK
+
+ .. cmdoption:: -u, --uniq
+
+ When this option is specified, the program first dereplicates the sequence
+ records to work on unique sequences only. This option greatly improves
+ the program's speed, especially for highly redundant datasets.
+
+ .. cmdoption:: --sort=<KEY>
+
+ The output is sorted based on the values of the relevant attribute.
+
+ .. cmdoption:: -r, --reverse
+
+ The output is sorted in reverse order (should be used with the --sort option).
+ (Works even if the --sort option is not set, but could not find on what
+ the output is sorted).
+
+ .. cmdoption:: -E FLOAT, --errors=FLOAT
+
+ FLOAT is the fraction of reference sequences that will
+ be ignored when looking for the lowest common ancestor. This
+ option is useful when a non-negligible proportion of reference sequences
+ is expected to be assigned to the wrong taxon, for example because of
+ taxonomic misidentification. FLOAT is included in a [0,1] interval.
+
+
+ .. cmdoption:: -M INTEGER, --min-matches=INTEGER
+
+ Define the minimum congruent assignation. If this minimum is reached and
+ the -E option is activated, the lowest common ancestor algorithm tolerates
+ that some sequences do not provide the same taxonomic annotation (see the
+ -E option).
+
+
+ .. cmdoption:: --cache-size=INTEGER
+
+ A cache for computed similarities is maintained by `ecotag`. The default
+ size for this cache is 1,000,000 scores. This option allows changing
+ the cache size.
+
+ .. include:: ../optionsSet/taxonomyDB.txt
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+ :py:mod:`ecotag` added sequence attributes
+ ------------------------------------------
+
+ .. hlist::
+ :columns: 3
+
+ - :doc:`best_identity <../attributes/best_identity>`
+ - :doc:`best_match <../attributes/best_match>`
+ - :doc:`family <../attributes/family>`
+ - :doc:`family_name <../attributes/family_name>`
+ - :doc:`genus <../attributes/genus>`
+ - :doc:`genus_name <../attributes/genus_name>`
+ - :doc:`id_status <../attributes/id_status>`
+ - :doc:`order <../attributes/order>`
+ - :doc:`order_name <../attributes/order_name>`
+ - :doc:`rank <../attributes/rank>`
+ - :doc:`scientific_name <../attributes/scientific_name>`
+ - :doc:`species <../attributes/species>`
+ - :doc:`species_list <../attributes/species_list>`
+ - :doc:`species_name <../attributes/species_name>`
+ - :doc:`taxid <../attributes/taxid>`
diff --git a/doc/sphinx/source/scripts/ecotaxspecificity.rst b/doc/sphinx/source/scripts/ecotaxspecificity.rst
new file mode 100644
index 0000000..47a6e8d
--- /dev/null
+++ b/doc/sphinx/source/scripts/ecotaxspecificity.rst
@@ -0,0 +1,31 @@
+.. automodule:: ecotaxspecificity
+
+ :py:mod:`ecotaxspecificity` specific options
+ --------------------------------------------
+
+ .. cmdoption:: -e INT, --errors=<INT>
+
+ Two sequences are considered as different if they have INT or more
+ differences (default: 1).
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > ecotaxspecificity -d my_ecopcr_database -e 5 seq.fasta
+
+ This command considers that two sequences with less than 5 differences
+ correspond to the same barcode.
+
+ .. include:: ../optionsSet/taxonomyDB.txt
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+ :py:mod:`ecotaxspecificity` used sequence attribute
+ ---------------------------------------------------
+
+
+ - :doc:`taxid <../attributes/taxid>`
+
diff --git a/doc/sphinx/source/scripts/ecotaxstat.rst b/doc/sphinx/source/scripts/ecotaxstat.rst
new file mode 100644
index 0000000..c2c2497
--- /dev/null
+++ b/doc/sphinx/source/scripts/ecotaxstat.rst
@@ -0,0 +1,21 @@
+.. automodule:: ecotaxstat
+
+ :py:mod:`ecotaxstat` specific options
+ --------------------------------------------
+
+ .. cmdoption:: -r TAXID, --required=<TAXID>
+
+ Taxids can be specified to focus the coverage on a smaller part of the taxonomy.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > ecotaxstat -d my_ecopcr_database seq.ecopcr
+
+ This command will print taxonomy coverage for the considered primer pair
+
+ .. include:: ../optionsSet/taxonomyDB.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
diff --git a/doc/sphinx/source/scripts/illuminapairedend.rst b/doc/sphinx/source/scripts/illuminapairedend.rst
new file mode 100644
index 0000000..bdf3b28
--- /dev/null
+++ b/doc/sphinx/source/scripts/illuminapairedend.rst
@@ -0,0 +1,61 @@
+.. automodule:: illuminapairedend
+
+ :py:mod:`illuminapairedend` specific options
+ --------------------------------------------
+
+ .. cmdoption:: -r <FILENAME>, --reverse-reads=<FILENAME>
+
+ Filename points to the file containing the reverse reads.
+
+ .. cmdoption:: --index-file=<FILENAME>
+ Filename points to the file containing the illumina index reads
+
+ .. cmdoption:: --score-min=<FLOAT>
+
+ minimum score for keeping alignment. If the alignment score is
+ below this threshold both the sequences are just concatenated.
+ The ``mode`` attribute is set to the value ``joined``.
+
+ Options to specify input format
+ -------------------------------
+
+ .. program:: obitools
+
+ Fastq related format
+ ....................
+
+ .. cmdoption:: --sanger
+
+ Input file is in :doc:`Sanger fastq nucleic format <../fastq>` (standard
+ fastq used by HiSeq/MiSeq sequencers).
+
+ .. cmdoption:: --solexa
+
+ Input file is in :doc:`fastq nucleic format <../fastq>` produced by
+ Solexa (Ga IIx) sequencers.
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+
+ :py:mod:`illuminapairedend` added sequence attributes
+ -----------------------------------------------------
+
+ - :doc:`ali_dir <../attributes/ali_dir>`
+ - :doc:`ali_length <../attributes/ali_length>`
+ - :doc:`score <../attributes/score>`
+ - :doc:`score_norm <../attributes/score_norm>`
+ - :doc:`mode <../attributes/mode>`
+ - :doc:`pairend_limit <../attributes/pairend_limit>`
+ - :doc:`sminL <../attributes/sminL>`
+ - :doc:`sminR <../attributes/sminR>`
+ - :doc:`seq_ab_match <../attributes/seq_ab_match>`
+ - :doc:`seq_a_single <../attributes/seq_a_single>`
+ - :doc:`seq_b_single <../attributes/seq_b_single>`
+ - :doc:`seq_a_mismatch <../attributes/seq_a_mismatch>`
+ - :doc:`seq_b_mismatch <../attributes/seq_b_mismatch>`
+ - :doc:`seq_a_deletion <../attributes/seq_a_deletion>`
+ - :doc:`seq_b_deletion <../attributes/seq_b_deletion>`
+ - :doc:`seq_b_insertion <../attributes/seq_b_insertion>`
+ - :doc:`seq_a_insertion <../attributes/seq_a_insertion>`
diff --git a/doc/sphinx/source/scripts/ngsfilter.rst b/doc/sphinx/source/scripts/ngsfilter.rst
new file mode 100644
index 0000000..74abfda
--- /dev/null
+++ b/doc/sphinx/source/scripts/ngsfilter.rst
@@ -0,0 +1,54 @@
+.. automodule:: ngsfilter
+
+ :py:mod:`ngsfilter` specific options
+ ------------------------------------
+
+ .. cmdoption:: -t, --tag-list
+
+ Used to specify the file containing the samples description (with tags, primers, sample names,...)
+
+ .. cmdoption:: -u, --unidentified
+
+ Filename used to store the sequences unassigned to any sample
+
+ .. cmdoption:: -e, --error
+
+ Used to specify the number of errors allowed for matching primers [default = 2]
+
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+
+ :py:mod:`ngsfilter` added sequence attributes
+ ---------------------------------------------
+
+ .. hlist::
+ :columns: 3
+
+ - :doc:`avg_quality <../attributes/avg_quality>`
+ - :doc:`complemented <../attributes/complemented>`
+ - :doc:`cut <../attributes/cut>`
+ - :doc:`direction <../attributes/direction>`
+ - :doc:`error <../attributes/error>`
+ - :doc:`experiment <../attributes/experiment>`
+ - :doc:`forward_match <../attributes/forward_match>`
+ - :doc:`forward_primer <../attributes/forward_primer>`
+ - :doc:`forward_score <../attributes/forward_score>`
+ - :doc:`forward_tag <../attributes/forward_tag>`
+ - :doc:`head_quality <../attributes/head_quality>`
+ - :doc:`mid_quality <../attributes/mid_quality>`
+ - :doc:`partial <../attributes/partial>`
+ - :doc:`reverse_match <../attributes/reverse_match>`
+ - :doc:`reverse_primer <../attributes/reverse_primer>`
+ - :doc:`reverse_score <../attributes/reverse_score>`
+ - :doc:`reverse_tag <../attributes/reverse_tag>`
+ - :doc:`sample <../attributes/sample>`
+ - :doc:`seq_length <../attributes/seq_length>`
+ - :doc:`seq_length_ori <../attributes/seq_length_ori>`
+ - :doc:`status <../attributes/status>`
+ - :doc:`tail_quality <../attributes/tail_quality>`
+
diff --git a/doc/sphinx/source/scripts/obiaddtaxids.rst b/doc/sphinx/source/scripts/obiaddtaxids.rst
new file mode 100644
index 0000000..bce8402
--- /dev/null
+++ b/doc/sphinx/source/scripts/obiaddtaxids.rst
@@ -0,0 +1,57 @@
+.. automodule:: obiaddtaxids
+
+ :py:mod:`obiaddtaxids` specific options
+ ---------------------------------------
+
+ .. cmdoption:: -f <FORMAT>, --format=<FORMAT>
+
+ Format of the sequence file. Possible formats are:
+
+ - ``raw``: for regular ``OBITools`` extended :doc:`fasta <../fasta>` files (default value).
+
+ - ``UNITE``: for :doc:`fasta <../fasta>` files downloaded from the `UNITE web site <http://unite.ut.ee/>`_.
+
+ - ``SILVA``: for :doc:`fasta <../fasta>` files downloaded from the `SILVA web site <http://www.arb-silva.de/>`_.
+
+ .. cmdoption:: -k <KEY>, --key-name=<KEY>
+
+ Key of the attribute containing the taxon name in sequence files in the ``OBITools`` extended
+ :doc:`fasta <../fasta>` format.
+
+
+ .. cmdoption:: -a <ANCESTOR>, --restricting_ancestor=<ANCESTOR>
+
+ Allows restricting the search of *taxids* to those under a specified ancestor.
+
+ ``<ANCESTOR>`` can be a *taxid* (integer) or a key (string).
+
+ - If it is a *taxid*, this *taxid* is used to restrict the search for all the sequence
+ records.
+
+ - If it is a key, :py:mod:`obiaddtaxids` looks for the ancestor *taxid* in the
+ corresponding attribute. This allows having a different ancestor restriction
+ for each sequence record.
+
+
+
+ .. cmdoption:: -g <FILENAME>, --genus_found=<FILENAME>
+
+ File used to store sequences with a match found for the genus.
+
+ .. CAUTION:: this option is not valid with the UNITE format.
+
+
+ .. cmdoption:: -u <FILENAME>, --unidentified=<FILENAME>
+
+ File used to store sequences with no taxonomic match found.
+
+ .. include:: ../optionsSet/taxonomyDB.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+
+ :py:mod:`obiaddtaxids` added sequence attribute
+ -----------------------------------------------
+
+ - :doc:`taxid <../attributes/taxid>`
+
diff --git a/doc/sphinx/source/scripts/obiannotate.rst b/doc/sphinx/source/scripts/obiannotate.rst
new file mode 100644
index 0000000..8c81373
--- /dev/null
+++ b/doc/sphinx/source/scripts/obiannotate.rst
@@ -0,0 +1,36 @@
+.. automodule:: obiannotate
+
+ .. include:: ../optionsSet/sequenceEdit.txt
+
+ .. include:: ../optionsSet/sequenceFilter.txt
+
+ .. include:: ../optionsSet/taxonomyFilter.txt
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+ :py:mod:`obiannotate` added sequence attributes
+ -----------------------------------------------
+
+ .. hlist::
+ :columns: 3
+
+ - :doc:`seq_length <../attributes/seq_length>`
+ - :doc:`seq_rank <../attributes/seq_rank>`
+ - :doc:`cluster <../attributes/cluster>`
+ - :doc:`scientific_name <../attributes/scientific_name>`
+ - :doc:`taxid <../attributes/taxid>`
+ - :doc:`rank <../attributes/rank>`
+ - :doc:`family <../attributes/family>`
+ - :doc:`family_name <../attributes/family_name>`
+ - :doc:`genus <../attributes/genus>`
+ - :doc:`genus_name <../attributes/genus_name>`
+ - :doc:`order <../attributes/order>`
+ - :doc:`order_name <../attributes/order_name>`
+ - :doc:`species <../attributes/species>`
+ - :doc:`species_name <../attributes/species_name>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obiclean.rst b/doc/sphinx/source/scripts/obiclean.rst
new file mode 100644
index 0000000..79a2f89
--- /dev/null
+++ b/doc/sphinx/source/scripts/obiclean.rst
@@ -0,0 +1,63 @@
+.. automodule:: obiclean
+
+ :py:mod:`obiclean` specific options
+ -----------------------------------
+
+ .. cmdoption:: -d <INTEGER>, --distance=<INTEGER>
+
+ Maximum number of differences between two variant sequences (default: 1).
+
+ .. cmdoption:: -s <KEY>, --sample=<KEY>
+
+ Attribute containing sample descriptions.
+
+ .. cmdoption:: -r <FLOAT>, --ratio=<FLOAT>
+
+ Threshold ratio between counts (rare/abundant counts) of two sequence records
+ so that the less abundant one is a variant of the more abundant
+ (default: 1, i.e. all less abundant sequences are variants).
+
+ .. cmdoption:: -C, --cluster
+
+ Switch :py:mod:`obiclean` into its clustering mode. This adds information
+ to each sequence about the true.
+
+ .. cmdoption:: -H, --head
+
+ Select only sequences with the head status in at least one sample.
+
+
+ .. cmdoption:: -g, --graph
+
+ Creates a file containing the set of DAGs used by the obiclean clustering algorithm.
+ The graph file follows the `dot` format.
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+ :py:mod:`obiclean` used sequence attributes
+ -----------------------------------------------
+
+ .. hlist::
+ :columns: 3
+
+ - :doc:`count <../attributes/count>`
+
+ :py:mod:`obiclean` added sequence attributes
+ -----------------------------------------------
+
+ .. hlist::
+ :columns: 3
+
+ - :doc:`obiclean_cluster <../attributes/obiclean_cluster>`
+ - :doc:`obiclean_count <../attributes/obiclean_count>`
+ - :doc:`obiclean_head <../attributes/obiclean_head>`
+ - :doc:`obiclean_headcount <../attributes/obiclean_headcount>`
+ - :doc:`obiclean_internalcount <../attributes/obiclean_internalcount>`
+ - :doc:`obiclean_samplecount <../attributes/obiclean_samplecount>`
+ - :doc:`obiclean_singletoncount <../attributes/obiclean_singletoncount>`
+ - :doc:`obiclean_status <../attributes/obiclean_status>`
+
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obicomplement.rst b/doc/sphinx/source/scripts/obicomplement.rst
new file mode 100644
index 0000000..851f7e7
--- /dev/null
+++ b/doc/sphinx/source/scripts/obicomplement.rst
@@ -0,0 +1,7 @@
+.. automodule:: obicomplement
+
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
diff --git a/doc/sphinx/source/scripts/obiconvert.rst b/doc/sphinx/source/scripts/obiconvert.rst
new file mode 100644
index 0000000..b9d88cd
--- /dev/null
+++ b/doc/sphinx/source/scripts/obiconvert.rst
@@ -0,0 +1,21 @@
+.. automodule:: obiconvert
+
+ *Examples:*
+
+ .. code-block:: bash
+
+ > obiconvert --ecopcrdb --fasta-output \
+ 'my_ecopcr_database' > sequences.fasta
+
+ Converts an ecoPCR database into a sequence file in *extended OBITools fasta* format.
+
+ .. include:: ../optionsSet/taxonomyDB.txt
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obicount.rst b/doc/sphinx/source/scripts/obicount.rst
new file mode 100644
index 0000000..077f190
--- /dev/null
+++ b/doc/sphinx/source/scripts/obicount.rst
@@ -0,0 +1,42 @@
+.. automodule:: obicount
+
+
+ :py:mod:`obicount` specific options
+ -----------------------------------
+
+ .. cmdoption:: -a, --all
+
+ Prints only the sum of ``count`` attributes.
+ If a sequence has no `count` attribute, its default count is 1.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obicount -a seq.fasta
+
+ For all sequence records contained in the ``seq.fasta`` file, prints only
+ the sum of ``count`` attributes.
+
+
+ .. cmdoption:: -s, --sequence
+
+ Prints only the number of sequence records.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obicount -s seq.fasta
+
+ Prints only the number of sequence records contained in the ``seq.fasta`` file.
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+ :py:mod:`obicount` added sequence attribute
+ -------------------------------------------
+
+ - :doc:`count <../attributes/count>`
+
diff --git a/doc/sphinx/source/scripts/obicut.rst b/doc/sphinx/source/scripts/obicut.rst
new file mode 100644
index 0000000..72d3420
--- /dev/null
+++ b/doc/sphinx/source/scripts/obicut.rst
@@ -0,0 +1,21 @@
+.. automodule:: obicut
+
+
+ :py:mod:`obicut` specific options
+ ---------------------------------
+
+ .. cmdoption:: -b <INTEGER>, --begin=<INTEGER>
+
+ Integer value (possibly calculated using a python expression)
+ indicating the first position of the sequence to be kept.
+
+ .. cmdoption:: -e <INTEGER>, --end=<INTEGER>
+
+ Integer value (possibly calculated using a python expression)
+ indicating the last position of the sequence to be kept.
+
+ .. include:: ../optionsSet/sequenceFilter.txt
+
+ .. include:: ../optionsSet/taxonomyFilter.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obidistribute.rst b/doc/sphinx/source/scripts/obidistribute.rst
new file mode 100644
index 0000000..e418f17
--- /dev/null
+++ b/doc/sphinx/source/scripts/obidistribute.rst
@@ -0,0 +1,20 @@
+.. automodule:: obidistribute
+
+
+ :py:mod:`obidistribute` specific options
+ ----------------------------------------
+
+ .. cmdoption:: -n <INT>, --number=<INT>
+
+ Number of files to distribute over.
+
+ .. cmdoption:: -p <STRING>, --prefix=<STRING>
+
+ Prefix added at each file name.
+
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
diff --git a/doc/sphinx/source/scripts/obiextract.rst b/doc/sphinx/source/scripts/obiextract.rst
new file mode 100644
index 0000000..9d45f51
--- /dev/null
+++ b/doc/sphinx/source/scripts/obiextract.rst
@@ -0,0 +1,61 @@
+.. automodule:: obiextract
+
+ :py:mod:`obiextract` specific options
+ -------------------------------------
+
+ .. cmdoption:: -s <KEY>, --sample=<KEY>
+
+ Attribute containing sample descriptions. By default the attribute
+ name used for describing samples is set to ``merged_sample``.
+
+
+ .. cmdoption:: -e <SAMPLE_NAME>, --extract=<KEY>
+
+ Attribute indicating which <SAMPLE_NAME> have to be extracted.
+ Several ``-e`` options can be added for specifying several samples.
+ If you want to extract a large number of samples, please refer to the ``-E``
+ option described below
+
+ .. TIP:: The ``<KEY>`` can be simply the key of an attribute, or a *Python* expression
+ similarly to the ``-p`` option of :py:mod:`obigrep`.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obiextract -e sampleA -e sampleB allseqs.fasta > samplesAB.fasta
+
+ This command extracts from the ``allseqs.fasta`` file data related to samples ``A`` and ``B``.
+
+
+ .. cmdoption:: -E <FILENAME>, --extract-list=<FILENAME>
+
+ Allows for indicating a file name where a list of samples is stored. The file must be a simple
+ text file with a sample name per line.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obiextract -E subset.txt allseqs.fasta > subset_samples.fasta
+
+ This command extracts from the ``allseqs.fasta`` file data related to samples listed in the ``subset.txt`` file.
+
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+ :py:mod:`obiextract` modified sequence attributes
+ -------------------------------------------------
+
+ - :doc:`count <../attributes/count>`
+
+ :py:mod:`obiextract` used sequence attribute
+ --------------------------------------------
+
+ - :doc:`count <../attributes/count>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obigrep.rst b/doc/sphinx/source/scripts/obigrep.rst
new file mode 100644
index 0000000..a16fdf3
--- /dev/null
+++ b/doc/sphinx/source/scripts/obigrep.rst
@@ -0,0 +1,11 @@
+.. automodule:: obigrep
+
+ .. include:: ../optionsSet/sequenceFilter.txt
+
+ .. include:: ../optionsSet/taxonomyFilter.txt
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obihead.rst b/doc/sphinx/source/scripts/obihead.rst
new file mode 100644
index 0000000..f9c6d64
--- /dev/null
+++ b/doc/sphinx/source/scripts/obihead.rst
@@ -0,0 +1,14 @@
+.. automodule:: obihead
+
+ :py:mod:`obihead` specific options
+ ----------------------------------
+
+ .. cmdoption:: -n <INTEGER>, --sequence-count=<INTEGER>
+
+ Number of sequence records to be selected (default value : 10).
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obijoinpairedend.rst b/doc/sphinx/source/scripts/obijoinpairedend.rst
new file mode 100644
index 0000000..b521023
--- /dev/null
+++ b/doc/sphinx/source/scripts/obijoinpairedend.rst
@@ -0,0 +1,15 @@
+.. automodule:: obijoinpairedend
+
+ :py:mod:`obijoinpairedend` specific options
+ -------------------------------------------
+
+ .. cmdoption:: -r <FILENAME>, --reverse-reads=<FILENAME>
+
+ Filename points to the file containing the reverse reads.
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
diff --git a/doc/sphinx/source/scripts/obipr2.rst b/doc/sphinx/source/scripts/obipr2.rst
new file mode 100644
index 0000000..3c0bfb6
--- /dev/null
+++ b/doc/sphinx/source/scripts/obipr2.rst
@@ -0,0 +1,22 @@
+.. automodule:: obipr2
+
+ :py:mod:`obipr2` specific options
+ -------------------------------------
+
+ .. cmdoption:: --local=<DIRNAME>
+
+ Specify you have already downloaded a copy of the PR2 database located at the following URL
+ `<http://5.196.17.195/pr2/download/entire_database>`_
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obipr2 --local=PR2Dir
+
+ This formats the **PR2** database pre-downloaded in the `PR2Dir` directory.
+
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obisample.rst b/doc/sphinx/source/scripts/obisample.rst
new file mode 100644
index 0000000..af4b062
--- /dev/null
+++ b/doc/sphinx/source/scripts/obisample.rst
@@ -0,0 +1,67 @@
+.. automodule:: obisample
+
+ :py:mod:`obisample` specific options
+ ------------------------------------
+
+ .. cmdoption:: -s ###, --sample-size ###
+
+ Specifies the size of the generated sample.
+
+ - without the ``-a`` option, sample size is expressed as the exact number of sequence
+ records to be sampled (default: number of sequence records in the input file).
+
+ - with the ``-a`` option, sample size is expressed as a fraction of the
+ sequence record numbers in the input file
+ (expressed as a number between 0 and 1).
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisample -s 1000 seq1.fasta > seq2.fasta
+
+ Samples randomly 1000 sequence records from the ``seq1.fasta`` file, with replacement,
+ and saves them in the ``seq2.fasta`` file.
+
+ .. cmdoption:: -a, --approx-sampling
+
+ Switches the resampling algorithm to an approximate one,
+ useful for large files.
+
+ The default algorithm selects exactly the number of sequence records
+ specified with the ``-s`` option. When the ``-a`` option is set,
+ each sequence record has a probability to be selected related to the
+ ``count`` attribute of the sequence record and the ``-s`` fraction.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisample -s 0.5 -a seq1.fastq > seq2.fastq
+
+ Samples randomly half of the sequence records of the ``seq1.fastq`` file,
+ without replacement,
+ and saves them in the ``seq2.fastq`` file.
+
+ .. cmdoption:: -w, --without-replacement
+
+ Asks for sampling without replacement.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisample -s 1000 -w seq1.fasta > seq2.fasta
+
+ Samples randomly 1000 sequence records from the ``seq1.fasta`` file, without replacement
+ (the input file must contain at least 1000 sequences), and saves them in the ``seq2.fasta`` file.
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+ :py:mod:`obisample` used sequence attribute
+ -------------------------------------------
+
+ - :doc:`count <../attributes/count>`
+
diff --git a/doc/sphinx/source/scripts/obiselect.rst b/doc/sphinx/source/scripts/obiselect.rst
new file mode 100644
index 0000000..06c214d
--- /dev/null
+++ b/doc/sphinx/source/scripts/obiselect.rst
@@ -0,0 +1,126 @@
+.. automodule:: obiselect
+
+ In each group as defined by a set of `-c` options, sequence records are ordered according
+ to a score function. The `N` first sequences (`N` is selected using the `-n` option) are kept
+ in the result subset of sequence records.
+
+ By default the score function is a random function and one sequence record is retrieved per
+ group. This leads to select randomly one sequence per group.
+
+
+ :py:mod:`obiselect` specific options
+ ------------------------------------
+
+ .. cmdoption:: -c <KEY>, --category-attribute=<KEY>
+
+ Attribute used to categorize the sequence records. Several ``-c`` options can be combined.
+
+ .. TIP:: The ``<KEY>`` can be simply the key of an attribute, or a *Python* expression
+ similarly to the ``-p`` option of :py:mod:`obigrep`.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obiselect -c sample -c seq_length seq.fasta
+
+ This command randomly selects one sequence record per sample and sequence length from
+ the sequence records included in the `seq.fasta` file.
+ The selected sequence records are printed on the screen.
+
+ .. cmdoption:: -n <INTEGER>, --number=<INTEGER>
+
+ Indicates how many sequence records per group have to be retrieved.
+ If the size of the group is lesser than this `NUMBER`, the whole group
+ is retrieved.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obiselect -n 2 -c sample -c seq_length seq.fasta
+
+ This command has the same effect as the previous example except that two
+ sequences are retrieved by class of sample/length.
+
+ .. cmdoption:: --merge=<KEY>
+
+ Attribute to merge.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obiselect -c seq_length -n 2 --merge=sample seq1.fasta > seq2.fasta
+
+ This command keeps two sequences per sequence length, and records how
+ many times they were observed for each sample in the new attribute
+ ``merged_sample``.
+
+ .. cmdoption:: --merge-ids
+
+ Adds a ``merged`` attribute containing the list of sequence record ids merged
+ within this group.
+
+
+ .. cmdoption:: -m, --min
+
+ Sets the function used for scoring sequence records into a group to the minimum function.
+ The minimum function is applied to the values used to define categories (see option `-c`).
+ Sequences will be ordered according to the distance of their values to the minimum value.
+
+ .. cmdoption:: -M, --max
+
+ Sets the function used for scoring sequence records into a group to the maximum function.
+ The maximum function is applied to the values used to define categories (see option `-c`).
+ Sequences will be ordered according to the distance of their values to the maximum value.
+
+ .. cmdoption:: -a, --mean
+
+ Sets the function used for scoring sequence records into a group to the mean function.
+ The mean function is applied to the values used to define categories (see option `-c`).
+ Sequences will be ordered according to the distance of their values to the mean value.
+
+ .. cmdoption:: --median
+
+ Sets the function used for scoring sequence records into a group to the median function.
+ The median function is applied to the values used to define categories (see option `-c`).
+ Sequences will be ordered according to the distance of their values to the median value.
+
+
+ .. cmdoption:: -f FUNCTION, --function=FUNCTION
+
+ Sets the function used for scoring sequence records into a group to a user-defined function.
+ The user-defined function is declared using `Python` syntax. Attribute keys can be used as variables.
+ An extra `sequence` variable representing the full sequence record is available. If option for
+ loading a taxonomy database is provided, a `taxonomy` variable is also available.
+ The function is estimated for each sequence record and the minimum value of this function in
+ each group.
+ Sequences will be ordered in each group according to the distance of their function estimation
+ to the minimum value of their group.
+
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/taxonomyDB.txt
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+ :py:mod:`obiselect` added sequence attributes
+ ---------------------------------------------
+
+ - :doc:`class <../attributes/class>`
+ - :doc:`distance <../attributes/distance>`
+ - :doc:`merged <../attributes/merged>`
+ - :doc:`class <../attributes/class>`
+ - :doc:`merged_* <../attributes/merged_star>`
+ - :doc:`select <../attributes/select>`
+
+ :py:mod:`obiselect` used sequence attribute
+ -------------------------------------------
+
+ - :doc:`taxid <../attributes/taxid>`
+
+
diff --git a/doc/sphinx/source/scripts/obisilva.rst b/doc/sphinx/source/scripts/obisilva.rst
new file mode 100644
index 0000000..615dfe0
--- /dev/null
+++ b/doc/sphinx/source/scripts/obisilva.rst
@@ -0,0 +1,71 @@
+.. automodule:: obisilva
+
+ :py:mod:`obisilva` specific options
+ -------------------------------------
+
+ .. cmdoption:: -s , --ssu
+
+ Specify that you are interested in the **SSU** database.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisilva --ssu --parc
+
+ This downloads and formats into an ecoPCR database the latest version of the **SSUParc** database of **Silva**.
+
+ .. cmdoption:: -l, --lsu
+
+ Specify that you are interested in the **LSU** database.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisilva --lsu --parc
+
+ This downloads and formats into an ecoPCR database the latest version of the **LSUParc** database of **Silva**.
+
+
+ .. cmdoption:: -p , --parc
+
+ Specify that you are interested in the **Parc** (complete) version of the **Silva** database.
+
+
+ .. cmdoption:: -r , --ref
+
+ Specify that you are interested in the **Reference** (cleaned to keep only high quality sequences)
+ version of the **Silva** database.
+
+ .. cmdoption:: -n , --nr
+
+ Specify that you are interested in the **Non redundant** version of the **Silva** database.
+ Only one representative of each set of too closely related sequences is kept in this version of the database.
+
+ .. warning::
+ Non redundant version of **Silva** exists only for the SSU sequences
+ in its **Reference** and **Truncated** versions
+
+ .. cmdoption:: -t , --trunc
+
+ Specify that you are interested in the **Truncated** (limited to the rDNA element without flanked regions)
+ version of the **Silva** database.
+
+ .. cmdoption:: --local=<DIRNAME>
+
+ Specify that you have already downloaded a copy of the **Silva** database located at the following URL
+ `<http://www.arb-silva.de/no_cache/download/archive/current/Exports/>`_
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisilva --ssu --parc --local=SilvaDir
+
+ This formats the **SSUParc** version of the **Silva** database pre-downloaded in the ``SilvaDir`` directory.
+
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obisort.rst b/doc/sphinx/source/scripts/obisort.rst
new file mode 100644
index 0000000..c742c93
--- /dev/null
+++ b/doc/sphinx/source/scripts/obisort.rst
@@ -0,0 +1,35 @@
+.. automodule:: obisort
+
+ :py:mod:`obisort` specific options
+ ----------------------------------
+
+ .. cmdoption:: -k <KEY>, --key=<KEY>
+
+ Attribute used to sort the sequence records.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisort -k count seq1.fasta > seq2.fasta
+
+ Sorts the sequence records of file ``seq1.fasta`` according to their `count`
+ (numeric order) and prints the results in the ``seq2.fasta`` file.
+
+ .. cmdoption:: -r, --reverse
+
+ Sorts in reverse order.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisort -r -k count seq1.fastq > seq2.fastq
+
+ Sorts the sequence records of file ``seq1.fastq`` according to their `count`
+ (reverse numeric order) and prints the results in the ``seq2.fastq`` file.
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
diff --git a/doc/sphinx/source/scripts/obisplit.rst b/doc/sphinx/source/scripts/obisplit.rst
new file mode 100644
index 0000000..6f0dfa1
--- /dev/null
+++ b/doc/sphinx/source/scripts/obisplit.rst
@@ -0,0 +1,21 @@
+.. automodule:: obisplit
+
+
+ :py:mod:`obisplit` specific options
+ -----------------------------------
+
+ .. cmdoption:: -p <PREFIX FILENAME>, --prefix=<PREFIX FILENAME>
+
+ Prefix added to each subfile name.
+
+ .. cmdoption:: -t <KEY>, --tag-name=<KEY>
+
+ Attribute key used to split the sequence file.
+
+ .. cmdoption:: -u <FILENAME>, --undefined=<FILENAME>
+
+ Name of the file where sequence records without attribute ``<KEY>`` are stored.
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
diff --git a/doc/sphinx/source/scripts/obistat.rst b/doc/sphinx/source/scripts/obistat.rst
new file mode 100644
index 0000000..46670ea
--- /dev/null
+++ b/doc/sphinx/source/scripts/obistat.rst
@@ -0,0 +1,95 @@
+.. automodule:: obistat
+
+ :py:mod:`obistat` specific options
+ ----------------------------------
+
+ .. cmdoption:: -c <KEY>, --category-attribute=<KEY>
+
+ Attribute used to categorize the sequence records. Several ``-c`` options can be combined.
+
+ .. TIP:: The ``<KEY>`` can be simply the key of an attribute, or a *Python* expression
+ similarly to the ``-p`` option of :py:mod:`obigrep`.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obistat -c sample -c seq_length seq.fasta
+
+ This command prints the number of sequence records and total count for each combination of
+ sample and sequence length.
+
+
+ .. cmdoption:: -m <KEY>, --min=<KEY>
+
+ Computes the minimum value of attribute <KEY> for each category.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obistat -c sample -m seq_length seq.fastq
+
+ This command computes the minimum sequence length observed for each sample.
+
+ .. cmdoption:: -M <KEY>, --max=<KEY>
+
+ Computes the maximum value of attribute <KEY> for each category.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obistat -c sample -M seq_length seq.fastq
+
+ This command computes the maximum sequence length observed for each sample.
+
+ .. cmdoption:: -a <KEY>, --mean=<KEY>
+
+ Computes the mean value of attribute <KEY> for each category.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obistat -c sample -a seq_length seq.fastq
+
+ This command computes the mean sequence length observed for each sample.
+
+ .. cmdoption:: -v <KEY>, --variance=<KEY>
+
+ Computes the variance of attribute <KEY> for each category.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obistat -c genus_name -v reverse_error seq.fastq
+
+ This command computes the variance of the number of errors observed in the reverse primer for each genus.
+
+ .. cmdoption:: -s <KEY>, --std-dev=<KEY>
+
+ Computes the standard deviation of attribute <KEY> for each category.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obistat -c genus_name -s reverse_error seq.fastq
+
+ This command computes the standard deviation of the number of errors observed in the reverse primer for each genus.
+
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/taxonomyDB.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+ :py:mod:`obistat` used sequence attribute
+ -----------------------------------------
+
+ - :doc:`count <../attributes/count>`
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obisubset.rst b/doc/sphinx/source/scripts/obisubset.rst
new file mode 100644
index 0000000..7218a89
--- /dev/null
+++ b/doc/sphinx/source/scripts/obisubset.rst
@@ -0,0 +1,81 @@
+.. automodule:: obisubset
+
+
+ :py:mod:`obisubset` specific options
+ ------------------------------------
+
+ .. cmdoption:: -s <TAGNAME>, --sample=<TAGNAME>
+
+ The option ``-s`` specifies the tag containing the sample descriptions;
+ the default value is *merged_sample*.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obiuniq -m sample seq1.fasta > seq2.fasta
+ > obisubset -s merged_sample -n sample1 seq2.fasta > seq3.fasta
+
+ After the dereplication of the sequences using the :py:mod:`obiuniq` command,
+ the value distribution of the ``sample`` attribute is kept
+ in the new attribute ``merged_sample``.
+
+ .. cmdoption:: -o <TAGNAME>, --other-tag=<TAGNAME>
+
+ Another tag to clean according to the sample subset
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisubset -s merged_sample -o -n sample1 seq2.fasta > seq3.fasta
+
+ .. cmdoption:: -l <FILENAME>, --sample-list=<FILENAME>
+
+ File containing the samples names (one sample id per line).
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisubset -s merged_sample -o -l ids.txt seq2.fasta > seq3.fasta
+
+ .. cmdoption:: -p <REGEX>, --sample-pattern=<REGEX>
+
+ A regular expression pattern matching the sample ids to extract.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisubset -s merged_sample -o -p "negative_.*" seq2.fasta > seq3.fasta
+
+ .. cmdoption:: -n <SAMPLEIDS>, --sample-name=<SAMPLEIDS>
+
+ A sample id to extract
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obisubset -s merged_sample -o -n sample1 seq2.fasta > seq3.fasta
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/outputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+ :py:mod:`obisubset` modifies sequence attributes
+ ------------------------------------------------
+
+ .. hlist::
+ :columns: 3
+
+ - :doc:`count <../attributes/count>`
+ - :doc:`merged_* <../attributes/merged_star>`
+
+ :py:mod:`obisubset` used sequence attribute
+ -------------------------------------------
+
+ - :doc:`count <../attributes/count>`
+ - :doc:`merged_* <../attributes/merged_star>`
diff --git a/doc/sphinx/source/scripts/obitab.rst b/doc/sphinx/source/scripts/obitab.rst
new file mode 100644
index 0000000..4011e24
--- /dev/null
+++ b/doc/sphinx/source/scripts/obitab.rst
@@ -0,0 +1,48 @@
+.. automodule:: obitab
+
+
+ :py:mod:`obitab` specific options
+ ---------------------------------
+
+ .. cmdoption:: -n <NOT AVAILABLE STRING>, --na-string=<NOT AVAILABLE STRING>
+
+ String written in the table for the not available values
+ (default value ``NA``).
+
+ .. cmdoption:: --output-field-separator=<STRING>
+
+ Field separator for the tabular file
+ (default value ``TAB``).
+
+ .. cmdoption:: -o, --output-seq
+
+ Adds an extra column at the end of the table for
+ the sequence itself.
+
+ .. cmdoption:: -d, --no-definition
+
+ Removes column containing the sequence definition in
+ the output tab file.
+
+ .. cmdoption:: -a <KEY>, --omit-attribute=<KEY>
+
+ Attributes whose key is in this list will not be printed in
+ the output tab file.
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+
+
+ Example
+ -------
+
+ .. code-block:: bash
+
+ > obitab -d -o seq1.fasta > seq1.txt
+
+ Reformats all sequence records present in the ``seq1.fasta`` file
+ into a tabular file without outputting the sequence definition but
+ with an extra column containing the sequence itself. The result is
+ stored in the ``seq1.txt`` file.
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obitail.rst b/doc/sphinx/source/scripts/obitail.rst
new file mode 100644
index 0000000..0ca2392
--- /dev/null
+++ b/doc/sphinx/source/scripts/obitail.rst
@@ -0,0 +1,14 @@
+.. automodule:: obitail
+
+ :py:mod:`obitail` specific options
+ ----------------------------------
+
+ .. cmdoption:: -n <INTEGER>, --sequence-count <INTEGER>
+
+ Number of sequence records to be selected (default value : 10).
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/obitaxonomy.rst b/doc/sphinx/source/scripts/obitaxonomy.rst
new file mode 100644
index 0000000..b621110
--- /dev/null
+++ b/doc/sphinx/source/scripts/obitaxonomy.rst
@@ -0,0 +1,160 @@
+.. automodule:: obitaxonomy
+
+ :py:mod:`obitaxonomy` specific options
+ ---------------------------------------
+
+
+ .. cmdoption:: -a <TAXON_INFOS>, --add-taxon=<TAXON_INFOS>
+
+ Adds a new taxon to the taxonomy. The new taxon
+ is described by three values separated by colons:
+ its scientific name, its taxonomic rank, and the
+ taxid of its first ancestor.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obitaxonomy -d my_ecopcr_database \
+ -a 'Gentiana alpina':'species':49934
+
+ Adds a taxon with the scientific name *Gentiana alpina* and the rank *species* under
+ the taxon whose taxid is 49934.
+
+
+ .. cmdoption:: -m <####>, --min-taxid=<####>
+
+ Minimum *taxid* for the newly added *taxid(s)*.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obitaxonomy -d my_ecopcr_database -m 1000000000 \
+ -a 'Gentiana alpina':'species':49934
+
+ Adds a taxon with the scientific name *Gentiana alpina* and the rank *species* under
+ the taxon whose *taxid* is 49934, with a *taxid* greater than or equal to 1000000000.
+
+
+ .. cmdoption:: -D <TAXID>, --delete-local-taxon=<TAXID>
+
+ Deletes the local taxon with the *taxid* <TAXID> from the
+ taxonomic database.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obitaxonomy -d my_ecopcr_database -D 10000832
+
+ Deletes the local taxon with the taxid 10000832 from the taxonomic database.
+
+
+ .. cmdoption:: -s <SPECIES_NAME>, --add-species=<SPECIES_NAME>
+
+ Adds a new species to the taxonomy. The new species
+ is described by its scientific name. The genus of the
+ species must already exist in the database.
+ The species will be added under its genus.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obitaxonomy -d my_ecopcr_database -s 'Gentiana alpina'
+
+ Adds the species with the scientific name *Gentiana alpina* under the genus *Gentiana*.
+
+
+ .. cmdoption:: -f <TAXON_NAME>:<TAXID>, --add-favorite-name=<TAXON_NAME>:<TAXID>
+
+ Adds a new favorite scientific name to the taxonomy.
+ The new name is described by two values separated by
+ a colon: the new favorite name and the *taxid* of the taxon.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obitaxonomy -d my_ecopcr_database \
+ -f 'Gentiana algida':50748
+
+ Adds the favorite scientific name *Gentiana algida* for the *taxid* 50748 in the taxonomic database.
+
+
+ .. cmdoption:: -F <FILE_NAME>, --file-name=<FILE_NAME>
+
+ Adds all the taxa from a sequence file in ``OBITools`` extended
+ :doc:`fasta <../fasta>` format, and possibly their ancestors, to the database
+ (see documentation). Each sequence record must contain the
+ attribute specified by the ``-k`` option.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obitaxonomy -d my_ecopcr_database \
+ -k my_taxon_name_key -F my_sequences.fasta
+
+ Adds the taxon of each sequence record from the file ``my_sequences.fasta`` in the taxonomic
+ database, based on the scientific name contained in the ``my_taxon_name_key`` attribute.
+
+
+ .. cmdoption:: -k <KEY_NAME>, --key-name=<KEY_NAME>
+
+ Works with the ``-F`` option. Defines the key of the
+ attribute that contains the scientific name of
+ the taxon to be added. See example above.
+
+
+ .. cmdoption:: -A <ANCESTOR>, --restricting_ancestor=<ANCESTOR>
+
+ Works with the ``-F`` option. Can be a *taxid* (integer) or
+ a key (string). If it is a *taxid*, this *taxid* is the
+ default *taxid* under which the new taxon is added if
+ none of its ancestors are specified or can be found.
+ If it is a key, :py:mod:`obitaxonomy` looks for the
+ ancestor *taxid* in the corresponding attribute, and the
+ new taxon is *systematically* added under this ancestor.
+ By default, the restricting ancestor is the root of the
+ taxonomic tree for all the new taxa.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obitaxonomy -d my_ecopcr_database -A 33090 \
+ -k my_taxon_name_key -F my_sequences.fasta
+
+ Adds the taxon of each sequence record from the file ``my_sequences.fasta`` in the taxonomic
+ database, based on the scientific name contained in the ``my_taxon_name_key`` attribute. If
+ the genus of the new taxon cannot be found, the new taxon is added under the taxon whose
+ *taxid* is 33090.
+
+
+ .. cmdoption:: -p <PATH>, --path=<PATH>
+
+ Works with the ``-F`` option. Key of the attribute containing
+ the taxonomic paths of the taxa if they are in the headers of
+ the sequence records. The value contained in this attribute
+ must be of the form 'Fungi, Agaricomycetes, Thelephorales,
+ Thelephoraceae' with the highest ancestors first and commas
+ between ancestors.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obitaxonomy -d my_ecopcr_database -p my_taxonomic_path_key \
+ -k my_taxon_name_key -F my_sequences.fasta
+
+ Adds the taxon of each sequence record from the file ``my_sequences.fasta`` in the taxonomic
+ database, based on the scientific name contained in the ``my_taxon_name_key`` attribute.
+ Each ancestor contained in the ``my_taxonomic_path_key`` attribute is added if it does not
+ already exist, and the new taxon is added under the latest ancestor of the path.
+
+
+ .. include:: ../optionsSet/taxonomyDB.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
diff --git a/doc/sphinx/source/scripts/obiuniq.rst b/doc/sphinx/source/scripts/obiuniq.rst
new file mode 100644
index 0000000..37d594a
--- /dev/null
+++ b/doc/sphinx/source/scripts/obiuniq.rst
@@ -0,0 +1,79 @@
+.. automodule:: obiuniq
+
+
+
+ :py:mod:`obiuniq` specific options
+ ----------------------------------
+
+ .. cmdoption:: -m <KEY>, --merge=<KEY>
+
+ Attribute to merge.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obiuniq -m sample seq1.fasta > seq2.fasta
+
+ Dereplicates sequences and keeps the value distribution of the ``sample`` attribute
+ in the new attribute ``merged_sample``.
+
+ .. cmdoption:: -i , --merge-ids
+
+ Adds a ``merged`` attribute containing the list of sequence record ids merged
+ within this group.
+
+ .. cmdoption:: -c <KEY>, --category-attribute=<KEY>
+
+ Adds one attribute to the list of attributes used to define sequence groups
+ (this option can be used several times).
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obiuniq -c sample seq1.fasta > seq2.fasta
+
+ Dereplicates sequences within each sample.
+
+ .. cmdoption:: -p, --prefix
+
+ Dereplication is done based on prefix matching:
+
+ 1. The shortest sequence of each group is a prefix of any sequence of its group
+
+ 2. The shortest sequence of a group is the prefix of only the sequences belonging
+ to its group
+
+
+ .. include:: ../optionsSet/taxonomyDB.txt
+
+ .. include:: ../optionsSet/inputformat.txt
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+ :py:mod:`obiuniq` added sequence attributes
+ -------------------------------------------
+
+ .. hlist::
+ :columns: 3
+
+ - :doc:`count <../attributes/count>`
+ - :doc:`merged_* <../attributes/merged_star>`
+ - :doc:`merged <../attributes/merged>`
+ - :doc:`scientific_name <../attributes/scientific_name>`
+ - :doc:`rank <../attributes/rank>`
+ - :doc:`family <../attributes/family>`
+ - :doc:`family_name <../attributes/family_name>`
+ - :doc:`genus <../attributes/genus>`
+ - :doc:`genus_name <../attributes/genus_name>`
+ - :doc:`order <../attributes/order>`
+ - :doc:`order_name <../attributes/order_name>`
+ - :doc:`species <../attributes/species>`
+ - :doc:`species_name <../attributes/species_name>`
+
+ :py:mod:`obiuniq` used sequence attribute
+ -----------------------------------------
+
+ - :doc:`taxid <../attributes/taxid>`
+
\ No newline at end of file
diff --git a/doc/sphinx/source/scripts/oligotag.rst b/doc/sphinx/source/scripts/oligotag.rst
new file mode 100644
index 0000000..a3c4d01
--- /dev/null
+++ b/doc/sphinx/source/scripts/oligotag.rst
@@ -0,0 +1,134 @@
+.. automodule:: oligotag
+
+ :py:mod:`oligotag` specific options
+ -----------------------------------
+
+ .. cmdoption:: -L <filename>, --oligo-list=<filename>
+
+ Filename containing a list of oligonucleotides. `oligotag` selects within this list
+ the oligonucleotides that match the specified options.
+
+ .. CAUTION:: Cannot be used with the ``-s`` option.
+
+
+ .. cmdoption:: -s ###, --oligo-size=###
+
+ Size of oligonucleotides to be generated.
+
+ .. CAUTION:: Cannot be used with the ``-L`` option.
+
+ .. WARNING:: A size equal to or greater than eight often leads to a very long
+ computing time and requires a large amount of memory.
+
+
+ .. cmdoption:: -f ###, --family-size=###
+
+ Minimal size of the oligonucleotide family to be generated.
+
+
+ .. cmdoption:: -d ###, --distance=###
+
+ Minimal Hamming distance (number of differences)
+ between two oligonucleotides.
+
+
+ .. cmdoption:: -g ###, --gc-max=###
+
+ Maximum number of G or C in the oligonucleotides.
+
+
+ .. cmdoption:: -a <IUPAC_PATTERN>, --accepted=<IUPAC_PATTERN>
+
+ Selected oligonucleotides are constrained by the given pattern
+ (only :doc:`IUPAC <../iupac>` symbols are allowed).
+
+ .. CAUTION:: The pattern must have the same length as the oligonucleotides.
+
+ .. cmdoption:: -r <IUPAC_PATTERN>, --rejected=<IUPAC_PATTERN>
+
+ Selected oligonucleotides do not match the given pattern
+ (only :doc:`IUPAC <../iupac>` symbols are allowed).
+
+ .. CAUTION:: The pattern must have the same length as the oligonucleotides.
+
+ .. cmdoption:: -p ###, --homopolymer=###
+
+ Selected oligonucleotides do not contain any homopolymer
+ longer than the specified length.
+
+
+ .. cmdoption:: -P ###, --homopolymer-min=###
+
+ Selected oligonucleotides contain at least one homopolymer longer
+ than or equal to the specified length.
+
+
+ .. cmdoption:: -T <seconds>, --timeout=<seconds>
+
+ Timeout to identify a set of oligonucleotides of required size,
+ as defined by the ``-f`` option.
+
+
+ .. include:: ../optionsSet/defaultoptions.txt
+
+
+ Examples
+ --------
+
+ *Example 1:*
+
+ .. code-block:: bash
+
+ > oligotag -s 5 -f 24 -d 3 -g 3 -p 2 > mytags.txt
+
+ Searches for a family of at least 24 oligonucleotides of a length of 5 nucleotides,
+ with at least 3 differences among them, with a maximum of 3 C/G, and without
+ homopolymers longer than 2. The resulting list of oligonucleotides is saved in
+ the ``mytags.txt`` file.
+
+
+ *Example 2:*
+
+ .. code-block:: bash
+
+ > oligotag -d 5 -L my_oligos.txt -f 10 -p 1
+
+ Searches for a subset of at least 10 oligonucleotides listed in the ``my_oligos.txt`` file, with
+ at least 5 differences among them, and without homopolymers. The ``my_oligos.txt`` file must
+ contain a set of oligonucleotides of the same length, with only one oligonucleotide per line.
+ The resulting list of oligonucleotides is printed on the terminal window.
+
+
+
+ *Example 3:*
+
+ .. code-block:: bash
+
+ > oligotag -s 7 -f 96 -d 3 -p 1 -r cnnnnnn > mytags.txt
+
+ Searches for a family of at least 96 oligonucleotides of a length of 7 nucleotides,
+ with at least 3 differences among them, without homopolymers, and without a ``C`` in
+ the first position. The resulting list is saved in the ``mytags.txt`` file.
+
+
+ *Example 4:*
+
+ .. code-block:: bash
+
+ > oligotag -s 9 -f 24 -d 3 -a yryryryry > mytags.txt
+
+ Searches for a family of at least 24 oligonucleotides of a length of 9 nucleotides,
+ with at least 3 differences among them, and an alternation of pyrimidines and purines.
+ The resulting list is saved in the ``mytags.txt`` file. Because of the
+ constraints imposed by the ``-a`` option, it is possible to compute longer oligonucleotides
+ in a reasonable time.
+
+
+ Reference
+ ---------
+
+ E. Coissac. Oligotag: a program for designing sets of tags for next-generation sequencing of multiplexed samples. Methods Mol Biol, 888:13-31, 2012.
+
+
+
+
\ No newline at end of file
diff --git a/doc/sphinx/source/statistics.rst b/doc/sphinx/source/statistics.rst
new file mode 100644
index 0000000..b29337a
--- /dev/null
+++ b/doc/sphinx/source/statistics.rst
@@ -0,0 +1,9 @@
+Statistics over sequence file
+=============================
+
+.. toctree::
+ :maxdepth: 2
+
+ scripts/ecodbtaxstat
+ scripts/obicount
+ scripts/obistat
diff --git a/doc/sphinx/source/taxdump.rst b/doc/sphinx/source/taxdump.rst
new file mode 100644
index 0000000..bd951e7
--- /dev/null
+++ b/doc/sphinx/source/taxdump.rst
@@ -0,0 +1,2 @@
+The NCBI taxonomy dump files
+============================
diff --git a/doc/sphinx/source/tutorials.rst b/doc/sphinx/source/tutorials.rst
new file mode 100644
index 0000000..0bf7269
--- /dev/null
+++ b/doc/sphinx/source/tutorials.rst
@@ -0,0 +1,12 @@
+OBITools tutorials
+==================
+
+Some basic tutorials.
+
+
+Contents:
+
+.. toctree::
+ :maxdepth: 2
+
+ wolves
diff --git a/doc/sphinx/source/utilities.rst b/doc/sphinx/source/utilities.rst
new file mode 100644
index 0000000..c559716
--- /dev/null
+++ b/doc/sphinx/source/utilities.rst
@@ -0,0 +1,13 @@
+Utilities
+=========
+
+
+.. toctree::
+ :maxdepth: 2
+
+ scripts/oligotag
+ scripts/obidistribute
+ scripts/obisort
+ scripts/obitaxonomy
+ scripts/ecofind
+
\ No newline at end of file
diff --git a/doc/sphinx/source/welcome.rst b/doc/sphinx/source/welcome.rst
new file mode 100644
index 0000000..eeb051e
--- /dev/null
+++ b/doc/sphinx/source/welcome.rst
@@ -0,0 +1,405 @@
+.. role:: latex(raw)
+ :format: latex
+
+
+Welcome to the ``OBITools``
+===========================
+
+:latex:`~`
+:latex:`\vspace{\fill}`
+
+The ``OBITools`` package is a set of programs specifically designed for analyzing
+NGS data in a DNA metabarcoding context, taking into account taxonomic
+information. It is distributed as an open source software available on the
+following website: http://metabarcoding.org/obitools.
+
+Citation: Boyer F., Mercier C., Bonin A., Taberlet P., Coissac E. (2014)
+OBITools: a Unix-inspired software package for DNA metabarcoding. *Molecular
+Ecology Resources*, submitted.
+
+:latex:`\vspace{\fill}`
+
+Installing the ``OBITools``
+===========================
+
+Availability of the ``OBITools``
+................................
+
+The ``OBITools`` are open source and protected by the CeCILL 2.1 license
+(`http://www.cecill.info/licences/Licence_CeCILL_V2.1-en.html <http://www.cecill.info/licences/Licence_CeCILL_V2.1-en.html>`_).
+
+The ``OBITools`` are deposited on the Python Package Index (PyPI : `https://pypi.python.org/pypi/obitools <https://pypi.python.org/pypi/obitools>`_)
+and all the sources can be downloaded from our subversion server
+(`http://www.grenoble.prabi.fr/public-svn/OBISofts/OBITools <http://www.grenoble.prabi.fr/public-svn/OBISofts/OBITools>`_).
+
+Prerequisites
+.............
+
+To install the ``OBITools``, the following software must be installed on your
+system:
+
+* Python 2.7 (installed by default on most ``Unix`` systems, available from
+ `the Python website <http://www.python.org/>`_)
+* ``gcc`` (installed by default on most ``Unix`` systems, available from the
+ GNU sites dedicated to `GCC <https://www.gnu.org/software/gcc/>`_ and
+ `GMake <https://www.gnu.org/software/make/>`_)
+
+On a linux system
+^^^^^^^^^^^^^^^^^
+
+You have to take care that the Python-dev packages are installed.
+
+On MacOSX
+^^^^^^^^^
+
+The C compiler and all the other compilation tools are included in the `XCode <https://itunes.apple.com/fr/app/xcode/id497799835?mt=12>`_
+application not installed by default. The Python included in the system is not
+suitable for running the ``OBITools``. You have to install a complete distribution
+of Python that you can download as a `MacOSX package from the Python website <https://www.python.org/downloads/>`_.
+
+Downloading and installing the ``OBITools``
+...........................................
+
+The ``OBITools`` are downloaded and installed using the :download:`get-obitools.py <../../../get_obitools/get-obitools.py>` script.
+This is a user level installation that does not need administrator privilege.
+
+Once downloaded, move the file :download:`get-obitools.py <../../../get_obitools/get-obitools.py>` in the directory where you want to install
+the ``OBITools``. From a Unix terminal you must now run the command :
+
+ .. code-block:: bash
+
+ > python get-obitools.py
+
+The script will create a new directory at the place you are running it in which all the
+``OBITools`` will be installed. No system privileges are required, and your system will not
+be altered in any way by the obitools installation.
+
+The newly created directory is named OBITools-VERSION where version is substituted by the
+latest version number available.
+
+Inside the newly created directory all the ``OBITools`` are installed. Close to this directory
+there is a shell script named ``obitools``. Running this script activates the ``OBITools``
+by reconfiguring your Unix environment.
+
+ .. code-block:: bash
+
+ > ./obitools
+
+Once activated you can deactivate the ``OBITools`` by typing the command ``exit``.
+
+ .. code-block:: bash
+
+ > exit
+
+ OBITools are no more activated, Bye...
+ ======================================
+
+
+System level installation
+.........................
+
+To install the ``OBITools`` at the system level you can follow two options :
+
+ - copy the ``obitools`` script in a usual directory for installing program like ``/usr/local/bin``
+ but never move the ``OBITools`` directory itself after the installation by the
+ :download:`get-obitools.py <../../../get_obitools/get-obitools.py>`.
+
+ - The other solution is to add the ``export/bin`` directory located in the ``OBITools`` directory
+ to the ``PATH`` environment variable.
+
+Retrieving the sources of the OBITools
+......................................
+
+If you want to compile by yourself the ``OBITools``, you will need to install the same
+prerequisite:
+
+ .. code-block:: bash
+
+ > pip install -U virtualenv
+
+ > pip install -U sphinx
+
+ > pip install -U cython
+
+moreover you need to install any subversion client (a list of clients is available from `Wikipedia <http://en.wikipedia.org/wiki/Comparison_of_Subversion_clients>`_)
+
+Then you can download the sources:
+
+ .. code-block:: bash
+
+ > svn co http://www.grenoble.prabi.fr/public-svn/OBISofts/OBITools/branches/OBITools-1.00/ OBITools
+
+This command will create a new directory called ``OBITools``.
+
+Compiling and installing the ``OBITools``
+.........................................
+
+From the directory where you retrieved the sources, execute the following commands:
+
+ .. code-block:: bash
+
+ > cd OBITools
+
+ > python setup.py --serenity install
+
+Once installed, you can test your installation by running the commands of the
+:doc:`tutorials <./tutorials>`.
+
+
+Introduction
+============
+
+DNA metabarcoding is an emerging approach for biodiversity studies (Taberlet et
+al. 2012). Originally mainly developed by microbiologists (e.g. Sogin et al.
+2006), it is now widely used for plants (e.g. Sonstebo et al. 2010, Parducci et
+al. 2012, Yoccoz et al. 2012) and animals from meiofauna (e.g. Chariton et al.
+2010, Baldwin et al. 2013) to larger organisms (e.g. Andersen et al. 2012,
+Thomsen et al. 2012). Interestingly, this method is not limited to *sensu
+stricto* biodiversity surveys, but it can also be implemented in other
+ecological contexts such as for herbivore (e.g. Valentini et al. 2009, Kowalczyk
+et al. 2011) or carnivore (e.g. Deagle et al. 2009, Shehzad et al. 2012) diet
+analyses.
+Whatever the biological question under consideration, the DNA metabarcoding
+methodology relies heavily on next-generation sequencing (NGS), and generates
+considerable numbers of DNA sequence reads (typically millions of reads).
+Manipulation of such large datasets requires dedicated programs usually running
+on a Unix system. Unix is an operating system, whose first version was created
+during the sixties. Since its early stages, it is dedicated to scientific
+computing and includes a large set of simple tools to efficiently process text
+files. Most of those programs can be viewed as filters extracting information
+from a text file to create a new text file. These programs process text files as
+streams, line per line, therefore allowing computation on a huge dataset without
+requiring a large memory. Unix programs usually print their results to their
+standard output (*stdout*), which by default is the terminal, so the results can
+be examined on screen. The main philosophy of the Unix environment is to allow
+easy redirection of the *stdout* either to a file, for saving the results, or to
+the standard input (*stdin*) of a second program thus allowing to easily create
+complex processing from simple base commands. Access to Unix computers is
+increasingly easier for scientists nowadays. Indeed, the Linux operating system,
+an open source version of Unix, can be freely installed on every PC machine and
+the MacOSX operating system, running on Apple computers, is also a Unix system.
+The ``OBITools`` programs imitate Unix standard programs because they usually act as
+filters, reading their data from text files or the *stdin* and writing their
+results to the *stdout*. The main difference with classical Unix programs is that
+text files are not analyzed line per line but sequence record per sequence
+record (see below for a detailed description of a sequence record).
+Compared to packages for similar purposes like mothur (Schloss et al. 2009) or
+QIIME (Caporaso et al. 2010), the ``OBITools`` mainly rely on filtering and sorting
+algorithms. This allows users to set up versatile data analysis pipelines
+(Figure 1), adjustable to the broad range of DNA metabarcoding applications.
+The innovation of the ``OBITools`` is their ability to take into account the
+taxonomic annotations, ultimately allowing sorting and filtering of sequence
+records based on the taxonomy.
+
+|Pipeline example for a standard biodiversity survey|
+
+
+References
+..........
+
+Andersen K, Bird KL, Rasmussen M, Haile J, Breuning-Madsen H, Kjær KH, Orlando
+L, Gilbert MTP, Willerslev E (2012) Meta-barcoding of "dirt" DNA from soil
+reflects vertebrate biodiversity. Molecular Ecology, 21, 1966-1979.
+
+Baldwin DS, Colloff MJ, Rees GN, Chariton AA, Watson GO, Court LN, Hartley DM,
+Morgan MJ, King AJ, Wilson JS, Hodda M, Hardy CM (2013) Impacts of inundation
+and drought on eukaryote biodiversity in semi-arid floodplain soils. Molecular
+Ecology, 22, 1746-1758.
+
+Caporaso JG, Kuczynski J, Stombaugh J, Bittinger K, Bushman FD, Costello EK,
+Fierer N, Pena AG, Goodrich JK, Gordon JI, Huttley GA, Kelley ST, Knights D,
+Koenig JE, Ley RE, Lozupone CA, McDonald D, Muegge BD, Pirrung M, Reeder J,
+Sevinsky JR, Turnbaugh PJ, Walters WA, Widmann J, Yatsunenko T, Zaneveld J,
+Knight R (2010) QIIME allows analysis of high-throughput community sequencing
+data. Nature Methods, 7, 335-336.
+
+Chariton AA, Court LN, Hartley DM, Colloff MJ, Hardy CM (2010) Ecological
+assessment of estuarine sediments by pyrosequencing eukaryotic ribosomal DNA.
+Frontiers in Ecology and the Environment, 8, 233-238.
+
+Deagle BE, Kirkwood R, Jarman SN (2009) Analysis of Australian fur seal diet by
+pyrosequencing prey DNA in faeces. Molecular Ecology, 18, 2022-2038.
+
+Kowalczyk R, Taberlet P, Coissac E, Valentini A, Miquel C, Kaminski T, Wójcik JM
+(2011) Influence of management practices on large herbivore diet - case of
+European bison in Bialowieza Primeval Forest (Poland). Forest Ecology and
+Management, 261, 821-828.
+
+Parducci L, Jorgensen T, Tollefsrud MM, Elverland E, Alm T, Fontana SL, Bennett
+KD, Haile J, Matetovici I, Suyama Y, Edwards ME, Andersen K, Rasmussen M,
+Boessenkool S, Coissac E, Brochmann C, Taberlet P, Houmark-Nielsen M, Larsen NK,
+Orlando L, Gilbert MTP, Kjaer KH, Alsos IG, Willerslev E (2012) Glacial Survival
+of Boreal Trees in Northern Scandinavia. Science, 335, 1083-1086.
+
+Schloss PD, Westcott SL, Ryabin T, Hall JR, Hartmann M, Hollister EB, Lesniewski
+RA, Oakley BB, Parks DH, Robinson CJ, Sahl JW, Stres B, Thallinger GG, Van Horn
+DJ, Weber CF (2009) Introducing mothur: open-source, platform-independent,
+community-supported software for describing and comparing microbial communities.
+Applied and Environmental Microbiology, 75, 7537-7541.
+
+Shehzad W, Riaz T, Nawaz MA, Miquel C, Poillot C, Shah SA, Pompanon F, Coissac
+E, Taberlet P (2012) Carnivore diet analysis based on next generation
+sequencing: application to the leopard cat (*Prionailurus bengalensis*) in
+Pakistan. Molecular Ecology, 21, 1951-1965.
+
+Sogin ML, Morrison HG, Huber JA, Welch DM, Huse SM, Neal PR, Arrieta JM, Herndl
+GJ (2006) Microbial diversity in the deep sea and the underexplored "rare
+biosphere". Proceedings of the National Academy of Sciences of the United States
+of America, 103, 12115-12120.
+
+Sønstebø JH, Gielly L, Brysting A, Reidar E, Edwards M, Haile J, Willerslev E,
+Coissac E, Rioux D, Sannier J, Taberlet P, Brochmann C (2010) Using
+next-generation sequencing for molecular reconstruction of past Arctic
+vegetation and climate. Molecular Ecology Resources, 10, 1009-1018.
+
+Taberlet P, Coissac E, Hajibabaei M, Rieseberg LH (2012) Environmental DNA.
+Molecular Ecology, 21, 1789-1793.
+
+Thomsen PF, Kielgast J, Iversen LL, Wiuf C, Rasmussen M, Gilbert MTP, Orlando L,
+Willerslev E (2012) Monitoring endangered freshwater biodiversity using
+environmental DNA. Molecular Ecology, 21, 2565-2573.
+
+Valentini A, Miquel C, Nawaz MA, Bellemain E, Coissac E, Pompanon F, Gielly L,
+Cruaud C, Nascetti G, Wincker P, Swenson JE, Taberlet P (2009) New perspectives
+in diet analysis based on DNA barcoding and parallel pyrosequencing: the trnL
+approach. Molecular Ecology Resources, 9, 51-60.
+
+Yoccoz NG, Bråthen KA, Gielly L, Haile J, Edwards ME, Goslar T, von Stedingk H,
+Brysting AK, Coissac E, Pompanon F, Sønstebø JH, Miquel C, Valentini A, de Bello
+F, Chave J, Thuiller W, Wincker P, Cruaud C, Gavory F, Rasmussen M, Gilbert MTP,
+Orlando L, Brochmann C, Willerslev E, Taberlet P (2012) DNA from soil mirrors
+plant taxonomic and growth form diversity. Molecular Ecology, 21, 3647-3655.
+
+
+Basic concepts of the ``OBITools``
+==================================
+
+Once installed, the ``OBITools`` enrich the Unix command line interface with a set
+of new commands dedicated to NGS data processing. Most of them have a name
+starting with the `obi` prefix. They automatically recognize the input file
+format amongst most of the standard sequence file formats (i.e. :doc:`fasta <fasta>`, :doc:`fastq <fastq>`,
+:doc:`EMBL <embl>`, and :doc:`GenBank <genbank>` formats). Nevertheless, options are available to enforce some
+format specificity such as the encoding system used in :doc:`fastq <fastq>` files for quality
+codes. Most of the basic Unix commands have their ``OBITools`` equivalent (e.g.
+`obihead` *vs* `head`, `obitail` *vs* `tail`, `obigrep` *vs* `grep`), which is
+convenient for scientists familiar with Unix. The main difference between any
+standard Unix command and its ``OBITools`` counterpart is that the treatment unit is
+no longer the text line but the sequence record. As a sequence record is more
+complex than a single text line, the ``OBITools`` programs have many supplementary
+options compared to their Unix equivalents.
+
+The structure of a sequence record
+..................................
+
+The ``OBITools`` commands consider a sequence record as an entity composed of five
+distinct elements. Two of them are mandatory, the identifier (*id*) and the DNA or
+protein sequence itself. The *id* is a single word composed of characters, digits,
+and other symbols like dots or underscores excluding spaces. Formally, the *ids*
+should be unique within a dataset and should identify each sequence record
+unambiguously, but only a few ``OBITools`` actually rely on this property. The
+sequence is an ordered set of characters corresponding to nucleotides or
+amino-acids according to the International Union of Pure and Applied Chemistry
+(IUPAC) nomenclature (Cornish-Bowden 1985). The three other elements composing a
+sequence record are optional. They consist in a sequence definition, a quality
+vector, and a set of attributes. The sequence definition is a free text
+describing the sequence briefly. The quality vector associates a quality score
+to each nucleotide or amino-acid. Usually this quality score is the result of
+the base-calling process by the sequencer. The last element is a set of
+attributes qualifying the sequence, each attribute being described by a
+`key=value` pair. The set of attributes is the central concept of the ``OBITools``
+system. When an ``OBITools`` command is run on the sequence records included in a
+dataset, the result of the computation often consists in the addition of new
+attributes completing the annotation of each sequence record. This strategy of
+sequence annotation allows the ``OBITools`` to return their results as a new
+sequence record file that can be used as the input of another ``OBITools`` program,
+ultimately creating complex pipelines.
+
+Managed sequence file formats
+.............................
+
+Most of the ``OBITools`` commands read sequence records from a file or from the
+*stdin*, make some computations on the sequence records and output annotated
+sequence records. As inputs, the ``OBITools`` are able to automatically recognize
+the most common sequence file formats (i.e. :doc:`fasta <fasta>`, :doc:`fastq <fastq>`, :doc:`EMBL <embl>`, and :doc:`GenBank <genbank>`).
+They are also able to read `ecoPCR` (Ficetola et al. 2010) result files and
+`ecoPCR`/`ecoPrimers` formatted sequence databases (Riaz et al. 2011) as
+ordinary sequence files. File format outputs are more limited. By default,
+sequences without and with quality information are written in :doc:`fasta <fasta>` and Sanger
+:doc:`fastq <fastq>` formats, respectively. However, dedicated options allow enforcing the
+output format, and the ``OBITools`` are also able to write sequences in the
+`ecoPCR`/`ecoPrimers` database format, to produce reference databases for these
+programs. In the :doc:`fasta <fasta>` or :doc:`fastq <fastq>` format, the attributes are written in the header
+line just after the *id*, following a `key=value;` format (Figure 2).
+
+|The structure of an OBITools sequence record and its representation in fasta and fastq formats|
+
+Taxonomical aspects
+...................
+
+Filtering and annotation steps in the processing of DNA metabarcoding sequence
+data are greatly eased by the explicit association of taxonomic information to
+sequences together with an easy access to the taxonomy. Taxonomic information,
+including a taxonomic identifier, can thus be stored in the set of attributes of
+each sequence record. Specifically, the `taxid` attribute is used by the
+``OBITools`` when querying taxonomic information of a sequence record, nevertheless
+several ``OBITools`` commands can annotate sequence records with taxonomy-related
+attributes for the user's convenience. The value of the `taxid` attribute must
+be a unique integer referring unambiguously to one taxon in the taxonomic
+associated database. Although this is not mandatory, the NCBI taxonomy is a
+preferred source of taxonomic information as the ``OBITools`` provide commands to
+easily extract the full taxonomic information from it. The command `obitaxonomy`
+is useful to build a taxonomic database in the ``OBITools`` format from a dump of
+the NCBI taxonomic database (downloadable at the following URL:
+ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz).
+
+Implemented algorithms
+......................
+
+Most of the algorithms implemented in the ``OBITools`` are basic algorithms allowing
+sampling, filtering and annotation of sequence records based on their associated
+attribute set or sequence (e.g. `obisample`, `obigrep`, `obiannotate`). Some
+others implement algorithms directly related to NGS or to DNA metabarcoding
+(e.g. `illuminapairedend`, `ngsfilter`, `ecotag`). Finally, a few of them do not
+run on sequence records and/or do not provide their results as sequence records.
+Amongst them, `oligotag` (Coissac 2012) generates a set of short oligonucleotide
+sequences (hereafter referred to as `tags`) useful to uniquely identify
+individual samples within a single NGS library containing many samples. Hereby,
+we will describe some of the implemented algorithms pertaining directly to DNA
+metabarcoding, as well as the corresponding programs. A full description of all
+programs included in the ``OBITools`` suite is available on the web
+http://metabarcoding.org/obitools/doc.
+
+Implementation of the ``OBITools``
+...................................
+
+The ``OBITools`` are a set of Python programs relying on an eponymous Python library.
+The ``OBITools`` library is mainly developed in Python (version 2.7, see
+http://www.python.org). For increasing the speed of execution, many parts of
+the ``OBITools`` library are developed using `cython` (http://cython.org/, a Python
+to C compiler) or the C language directly. The ``OBITools`` compile on Unix systems
+including Linux and MacOSX.
+
+References
+..........
+
+Coissac E (2012) Oligotag: a program for designing sets of tags for
+next-generation sequencing of multiplexed samples. In: Data Production and
+Analysis in Population Genomics: Methods and Protocols (eds. Pompanon F, Bonin
+A), pp. 13-31. Springer Science+Business Media, New York.
+
+Cornish-Bowden A (1985) Nomenclature for incompletely specified bases in nucleic
+acid sequences: recommendations 1984. Nucleic Acids Research, 13, 3021-3030.
+
+Ficetola GF, Coissac E, Zundel S, Riaz T, Shehzad W, Bessière J, Taberlet P,
+Pompanon F (2010) An in silico approach for the evaluation of DNA barcodes. BMC
+Genomics, 11, 434.
+
+Riaz T, Shehzad W, Viari A, Pompanon F, Taberlet P, Coissac E (2011) ecoPrimers:
+inference of new DNA barcode markers from whole genome sequence analysis.
+Nucleic Acids Research, 39, e145.
+
+.. |Pipeline example for a standard biodiversity survey| image:: fig-Pipeline.*
+
+.. |The structure of an OBITools sequence record and its representation in fasta and fastq formats| image:: fig-Record.*
+
+
diff --git a/doc/sphinx/source/wolves.rst b/doc/sphinx/source/wolves.rst
new file mode 100644
index 0000000..34aa594
--- /dev/null
+++ b/doc/sphinx/source/wolves.rst
@@ -0,0 +1,648 @@
+Wolves' diet based on DNA metabarcoding
+=======================================
+
+
+Here is a tutorial on how to analyze DNA metabarcoding data produced on Illumina
+sequencers using:
+
+ - the *OBITools*
+ - some basic *Unix* commands
+
+The data used in this tutorial correspond to the analysis of four wolf scats, using the
+protocol published in Shehzad et al. (2012) for assessing carnivore diet.
+After extracting DNA from the faeces, the DNA amplifications were carried out using the
+primers TTAGATACCCCACTATGC and TAGAACAGGCTCCTCTAG amplifying the 12S-V5 region
+(Riaz et al. 2011), together with a wolf blocking oligonucleotide.
+
+The complete data set can be downloaded here: :download:`the tutorial dataset<../../../wolf_tutorial.zip>`
+
+
++-------------------------------------------------------------+
+| Good to remember: I am working with tons of sequences |
++-------------------------------------------------------------+
+| It is always a good idea to have a look at the intermediate |
+| results or to evaluate the best parameter for each step. |
+| Some commands are designed for that purpose, for example |
+| you can use : |
+| |
+| - :doc:`obicount <scripts/obicount>` to count the number |
+| of sequence records in a file |
+| - :doc:`obihead <scripts/obihead>` and |
+| :doc:`obitail <scripts/obitail>` to view the first |
+| or last sequence records of a file |
+| - :doc:`obistat <scripts/obistat>` to get some basic |
+| statistics (count, mean, standard deviation) on the |
+| attributes (key=value combinations) in the header of each |
+| sequence record (see The `extended OBITools fasta format` |
+| in the :doc:`fasta format <fasta>` description) |
+| - any *Unix* command such as ``less``, ``awk``, ``sort``, |
+| ``wc`` to check your files |
++-------------------------------------------------------------+
+
+
+Data
+----
+
+The data needed to run the tutorial are the following:
+
+
+- :doc:`fastq <fastq>` files resulting from a GA IIx (Illumina) paired-end (2 x 108 bp)
+ sequencing assay of DNA extracted and amplified from
+ four wolf faeces:
+
+ * ``wolf_F.fastq``
+ * ``wolf_R.fastq``
+
+- the file describing the primers and tags used for all samples sequenced:
+
+ * ``wolf_diet_ngsfilter.txt``
+ The tags correspond to short and specific sequences added on the 5' end of each
+ primer to distinguish the different samples
+
+- the file containing the reference database in a fasta format:
+
+ * ``db_v05_r117.fasta``
+ This reference database has been extracted from the release 117 of EMBL using
+ :doc:`ecoPCR <scripts/ecoPCR>`
+
+- the NCBI taxonomy formatted in the :doc:`ecoPCR <scripts/ecoPCR>` format (see the
+ :doc:`obiconvert <scripts/obiconvert>` utility for details) :
+
+ * ``embl_r117.ndx``
+ * ``embl_r117.rdx``
+ * ``embl_r117.tdx``
+
+
+Step by step analysis
+---------------------
+
+
+Recover full sequence reads from forward and reverse partial reads
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When using the result of a paired-end sequencing assay with supposedly overlapping forward
+and reverse reads, the first step is to recover the assembled sequence.
+
+The forward and reverse reads of the same fragment are *at the same line position* in the
+two fastq files obtained after sequencing.
+Based on these two files, the assembly of the forward and reverse reads is done with the
+:doc:`illuminapairedend <scripts/illuminapairedend>` utility that aligns the two reads
+and returns the reconstructed sequence.
+
+In our case, the command is:
+
+.. code-block:: bash
+
+ > illuminapairedend --score-min=40 -r wolf_R.fastq wolf_F.fastq > wolf.fastq
+
+The :py:mod:`--score-min` option allows discarding sequences with low alignment quality.
+If the alignment score is below 40, the forward and reverse reads are not aligned but
+concatenated, and the value of the :py:mod:`mode` attribute in the sequence header is set
+to :py:mod:`joined` instead of :py:mod:`alignment`.
+
+Remove unaligned sequence records
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Unaligned sequences (:py:mod:`mode=joined`) cannot be used. The following command allows
+removing them from the dataset:
+
+.. code-block:: bash
+
+ > obigrep -p 'mode!="joined"' wolf.fastq > wolf.ali.fastq
+
+The :py:mod:`-p` option requires a *python* expression. :py:mod:`mode!="joined"` means that if
+the value of the :py:mod:`mode` attribute is different from :py:mod:`joined`, the
+corresponding sequence record will be kept.
+
+The first sequence record of ``wolf.ali.fastq`` can be obtained using the following
+command line:
+
+.. code-block:: bash
+
+ > obihead --without-progress-bar -n 1 wolf.ali.fastq
+
+And the result is:
+
+.. code-block:: bash
+
+ @HELIUM_000100422_612GNAAXX:7:119:14871:19157#0/1_CONS ali_length=61;
+ direction=left; seq_ab_match=47; sminR=40.0; seq_a_mismatch=7; seq_b_deletion=1;
+ seq_b_mismatch=7; seq_a_deletion=1; score_norm=1.89772607661;
+ score=115.761290673; seq_a_insertion=0; mode=alignment; sminL=40.0;
+ seq_a_single=46; seq_b_single=46; seq_b_insertion=0;
+ ccgcctcctttagataccccactatgcttagccctaaacacaagtaattattataacaaaatcattcgccagagtgtagc
+ gggagtaggttaaaactcaaaggacttggcggtgctttatacccttctagaggagcctgttctaaggaggcgg
+ +
+ ddddddddddddddddddddddcddddcacdddddddddddddc\d~b~~~b~~~~~~b`ryK~|uxyXk`}~ccBccBc
+ ccBcBcccBcBccccccc~~~~b|~~xdbaddaaWcccdaaddddadacddddddcddadbbddddddddddd
+
+
+
+Assign each sequence record to the corresponding sample/marker combination
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Each sequence record is assigned to its corresponding sample and marker using the data
+provided in a text file (here ``wolf_diet_ngsfilter.txt``). This text file contains one
+line per sample, with the name of the experiment (several experiments can be included in
+the same file), the name of the tags (for example: ``aattaac`` if the same tag has been
+used on each extremity of the PCR products, or ``aattaac:gaagtag`` if the tags were
+different), the sequence of the forward primer, the sequence of the reverse primer, the
+letter ``T`` or ``F`` for sample identification using the forward primer and tag only or
+using both primers and both tags, respectively (see :doc:`ngsfilter <scripts/ngsfilter>`
+for details).
+
+.. code-block:: bash
+
+ > ngsfilter -t wolf_diet_ngsfilter.txt -u unidentified.fastq wolf.ali.fastq > \
+ wolf.ali.assigned.fastq
+
+This command creates two files:
+
+- ``unidentified.fastq`` containing all the sequence records that were not assigned to a
+ sample/marker combination
+
+- ``wolf.ali.assigned.fastq`` containing all the sequence records that were properly
+ assigned to a sample/marker combination
+
+Note that each sequence record of the ``wolf.ali.assigned.fastq`` file contains only the
+barcode sequence as the sequences of primers and tags are removed by the
+:doc:`ngsfilter <scripts/ngsfilter>` program. Information concerning the experiment,
+sample, primers and tags is added as attributes in the sequence header.
+
+For instance, the first sequence record of ``wolf.ali.assigned.fastq`` is:
+
+.. code-block:: bash
+
+ @HELIUM_000100422_612GNAAXX:7:119:14871:19157#0/1_CONS_SUB_SUB status=full;
+ seq_ab_match=47; sminR=40.0; ali_length=61; tail_quality=67.0;
+ reverse_match=tagaacaggctcctctag; seq_a_deletion=1; sample=29a_F260619;
+ forward_match=ttagataccccactatgc; forward_primer=ttagataccccactatgc;
+ reverse_primer=tagaacaggctcctctag; sminL=40.0; forward_score=72.0;
+ score=115.761290673; seq_a_mismatch=7; forward_tag=gcctcct; seq_b_mismatch=7;
+ experiment=wolf_diet; mid_quality=69.4210526316; avg_quality=69.1045751634;
+ seq_a_single=46; score_norm=1.89772607661; reverse_score=72.0;
+ direction=forward; seq_b_insertion=0; seq_b_deletion=1; seq_a_insertion=0;
+ seq_length_ori=153; reverse_tag=gcctcct; seq_length=99; mode=alignment;
+ head_quality=67.0; seq_b_single=46;
+ ttagccctaaacacaagtaattattataacaaaatcattcgccagagtgtagcgggagtaggttaaaactcaaaggact
+ tggcggtgctttataccctt
+ +
+ cacdddddddddddddc\d~b~~~b~~~~~~b`ryK~|uxyXk`}~ccBccBcccBcBcccBcBccccccc~~~~b|~~
+ xdbaddaaWcccdaadddda
+
+
+
+
+
+Dereplicate reads into unique sequences
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The same DNA molecule can be sequenced several times. In order to reduce both file size
+and computation time, and to get more easily interpretable results,
+it is convenient to work with unique *sequences* instead of *reads*. To *dereplicate* such
+*reads* into unique *sequences*, we use the :doc:`obiuniq <scripts/obiuniq>` command.
+
++-------------------------------------------------------------+
+| Definition: Dereplicate reads into unique sequences |
++-------------------------------------------------------------+
+| 1. compare all the reads in a data set to each other |
+| 2. group strictly identical reads together |
+| 3. output the sequence for each group and its count in the |
+| original dataset (in this way, all duplicated reads are |
+| removed) |
+| |
+| Definition adapted from Seguritan and Rohwer (2001) |
++-------------------------------------------------------------+
+
+
+For dereplication, we use the :doc:`obiuniq <scripts/obiuniq>` command with the `-m
+sample` option. This option is used to keep the information of the samples of origin
+for each unique sequence.
+
+.. code-block:: bash
+
+ > obiuniq -m sample wolf.ali.assigned.fastq > wolf.ali.assigned.uniq.fasta
+
+Note that :doc:`obiuniq <scripts/obiuniq>` returns a fasta file.
+
+The first sequence record of ``wolf.ali.assigned.uniq.fasta`` is:
+
+.. code-block:: bash
+
+ >HELIUM_000100422_612GNAAXX:7:119:14871:19157#0/1_CONS_SUB_SUB_CMP ali_length=61;
+ seq_ab_match=47; sminR=40.0; tail_quality=67.0; reverse_match=ttagataccccactatgc;
+ seq_a_deletion=1; forward_match=tagaacaggctcctctag; forward_primer=tagaacaggctcctctag;
+ reverse_primer=ttagataccccactatgc; sminL=40.0; merged_sample={'29a_F260619': 1};
+ forward_score=72.0; seq_a_mismatch=7; forward_tag=gcctcct; seq_b_mismatch=7;
+ score=115.761290673; mid_quality=69.4210526316; avg_quality=69.1045751634;
+ seq_a_single=46; score_norm=1.89772607661; reverse_score=72.0; direction=reverse;
+ seq_b_insertion=0; experiment=wolf_diet; seq_b_deletion=1; seq_a_insertion=0;
+ seq_length_ori=153; reverse_tag=gcctcct; count=1; seq_length=99; status=full;
+ mode=alignment; head_quality=67.0; seq_b_single=46;
+ aagggtataaagcaccgccaagtcctttgagttttaacctactcccgctacactctggcg
+ aatgattttgttataataattacttgtgtttagggctaa
+
+The run of :doc:`obiuniq <scripts/obiuniq>` has added two key=values entries in the header
+of the fasta sequence:
+
+ - :py:mod:`merged_sample={'29a_F260619': 1}`: this sequence has been found once in a
+ single sample called 29a_F260619
+ - :py:mod:`count=1` : the total count for this sequence is 1
+
+To keep only these two ``key=value`` attributes, we can use the
+:doc:`obiannotate <scripts/obiannotate>` command:
+
+
+.. code-block:: bash
+
+ > obiannotate -k count -k merged_sample \
+ wolf.ali.assigned.uniq.fasta > $$ ; mv $$ wolf.ali.assigned.uniq.fasta
+
+
+The first five sequence records of ``wolf.ali.assigned.uniq.fasta`` become:
+
+.. code-block:: bash
+
+ >HELIUM_000100422_612GNAAXX:7:119:14871:19157#0/1_CONS_SUB_SUB_CMP merged_sample={'29a_F260619': 1}; count=1;
+ aagggtataaagcaccgccaagtcctttgagttttaacctactcccgctacactctggcg
+ aatgattttgttataataattacttgtgtttagggctaa
+ >HELIUM_000100422_612GNAAXX:7:108:5640:3823#0/1_CONS_SUB_SUB_CMP merged_sample={'29a_F260619': 7, '15a_F730814': 2}; count=9;
+ aagggtataaagcaccgccaagtcctttgagttttaagctattgccggtagtactctggc
+ gaacaattttgttatattaattacttgtgtttagggctaa
+ >HELIUM_000100422_612GNAAXX:7:97:14311:19299#0/1_CONS_SUB_SUB_CMP merged_sample={'29a_F260619': 5, '15a_F730814': 4}; count=9;
+ aagggtataaagcaccgccaagtcctttgagttttaagctcttgccggtagtactctggc
+ gaataattttgttatattaattacttgtgtttagggctaa
+ >HELIUM_000100422_612GNAAXX:7:22:8540:14708#0/1_CONS_SUB_SUB merged_sample={'29a_F260619': 4697, '15a_F730814': 7638}; count=12335;
+ aagggtataaagcaccgccaagtcctttgagttttaagctattgccggtagtactctggc
+ gaataattttgttatattaattacttgtgtttagggctaa
+ >HELIUM_000100422_612GNAAXX:7:57:18459:16145#0/1_CONS_SUB_SUB_CMP merged_sample={'26a_F040644': 10490}; count=10490;
+ agggatgtaaagcaccgccaagtcctttgagtttcaggctgttgctagtagtactctggc
+ gaacattcttgtttattgaatgtttatgtttagggctaa
+
+
+Denoise the sequence dataset
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To have a set of sequences assigned to their corresponding samples does not mean that all
+sequences are *biologically* meaningful, i.e. some of these sequences can contain PCR
+and/or sequencing errors, or chimeras. To remove such sequences as much as possible, we
+first discard rare sequences and then sequence variants that likely correspond to
+artifacts.
+
+
+
+Get the count statistics
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+In that case, we use :doc:`obistat <scripts/obistat>` to get the counting statistics on
+the 'count' attribute (the count attribute has been added by the :doc:`obiuniq
+<scripts/obiuniq>` command). By piping the result in the *Unix* commands ``sort`` and
+``head``, we keep only the count statistics for the 20 lowest values of the 'count'
+attribute.
+
+.. code-block:: bash
+
+ > obistat -c count wolf.ali.assigned.uniq.fasta | \
+ sort -nk1 | head -20
+
+This prints the output:
+
+.. code-block:: bash
+
+ count count total
+ 1 3504 3504
+ 2 228 456
+ 3 136 408
+ 4 73 292
+ 5 61 305
+ 6 47 282
+ 7 34 238
+ 8 27 216
+ 9 26 234
+ 10 25 250
+ 11 13 143
+ 12 14 168
+ 13 10 130
+ 14 5 70
+ 15 9 135
+ 16 8 128
+ 17 4 68
+ 18 9 162
+ 19 5 95
+
+The dataset contains 3504 sequences occurring only once.
+
+
+
+Keep only the sequences having a count greater or equal to 10 and a length of at least 80 bp
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Based on the previous observation, we set the cut-off for keeping sequences for further
+analysis to a count of 10. To do this, we use the :doc:`obigrep <scripts/obigrep>`
+command.
+The ``-p 'count>=10'`` option means that the ``python`` expression :py:mod:`count>=10`
+must be evaluated to :py:mod:`True` for each sequence to be kept. Based on previous
+knowledge we also remove sequences with a length shorter than 80 bp (option -l) as we know
+that the amplified 12S-V5 barcode for vertebrates must have a length around 100bp.
+
+.. code-block:: bash
+
+ > obigrep -l 80 -p 'count>=10' wolf.ali.assigned.uniq.fasta \
+ > wolf.ali.assigned.uniq.c10.l80.fasta
+
+
+The first sequence record of ``wolf.ali.assigned.uniq.c10.l80.fasta`` is:
+
+.. code-block:: bash
+
+ >HELIUM_000100422_612GNAAXX:7:22:8540:14708#0/1_CONS_SUB_SUB count=12335; merged_sample={'29a_F260619': 4697, '15a_F730814': 7638};
+ aagggtataaagcaccgccaagtcctttgagttttaagctattgccggtagtactctggc
+ gaataattttgttatattaattacttgtgtttagggctaa
+
+
+Clean the sequences for PCR/sequencing errors (sequence variants)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As a final denoising step, using the :doc:`obiclean <scripts/obiclean>` program, we keep
+the `head` sequences (``-H`` option) that are sequences with no variants with a count
+greater than 5% of their own count (``-r 0.05`` option).
+
+.. code-block:: bash
+
+ > obiclean -s merged_sample -r 0.05 -H \
+ wolf.ali.assigned.uniq.c10.l80.fasta > wolf.ali.assigned.uniq.c10.l80.clean.fasta
+
+The first sequence record of ``wolf.ali.assigned.uniq.c10.l80.clean.fasta`` is:
+
+.. code-block:: bash
+
+ >HELIUM_000100422_612GNAAXX:7:22:8540:14708#0/1_CONS_SUB_SUB
+ merged_sample={'29a_F260619': 4697, '15a_F730814': 7638};
+ obiclean_count={'29a_F260619': 5438, '15a_F730814': 8642}; obiclean_head=True;
+ obiclean_cluster={'29a_F260619':
+ 'HELIUM_000100422_612GNAAXX:7:22:8540:14708#0/1_CONS_SUB_SUB', '15a_F730814':
+ 'HELIUM_000100422_612GNAAXX:7:22:8540:14708#0/1_CONS_SUB_SUB'};
+ count=12335; obiclean_internalcount=0; obiclean_status={'29a_F260619': 'h', '15a_F730814': 'h'};
+ obiclean_samplecount=2; obiclean_headcount=2; obiclean_singletoncount=0;
+ aagggtataaagcaccgccaagtcctttgagttttaagctattgccggtagtactctggc
+ gaataattttgttatattaattacttgtgtttagggctaa
+
+Taxonomic assignment of sequences
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Once denoising has been done, the next step in diet analysis is to assign the barcodes to
+the corresponding species in order to get the complete list of species associated to each
+sample.
+
+Taxonomic assignment of sequences requires a reference database compiling all possible
+species to be identified in the sample. Assignment is then done based on sequence
+comparison between sample sequences and reference sequences.
+
+
+Build a reference database
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+One way to build the reference database is to use the :doc:`ecoPCR <scripts/ecoPCR>`
+program to simulate a PCR and to extract all sequences from the EMBL that may be amplified
+`in silico` by the two primers (`TTAGATACCCCACTATGC` and `TAGAACAGGCTCCTCTAG`) used for
+PCR amplification.
+
+The full list of steps for building this reference database would then be:
+
+1. Download the whole set of EMBL sequences (available from:
+ ftp://ftp.ebi.ac.uk/pub/databases/embl/release/)
+2. Download the NCBI taxonomy (available from:
+ ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz)
+3. Format them into the ecoPCR format (see :doc:`obiconvert <scripts/obiconvert>` for how
+ you can produce ecoPCR compatible files)
+4. Use ecoPCR to simulate amplification and build a reference database based on putatively
+ amplified barcodes together with their recorded taxonomic information
+
+As step 1 and step 3 can be really time-consuming (about one day), we already provide the
+reference database produced by the following commands so that you can skip its
+construction. Note that as the EMBL database and taxonomic data can evolve daily, if you
+run the following commands you may end up with quite different results.
+
+
+Any utility allowing file downloading from a ftp site can be used. In the following
+commands, we use the commonly used ``wget`` *Unix* command.
+
+Download the sequences
+......................
+
+.. code-block:: bash
+
+ > mkdir EMBL
+ > cd EMBL
+ > wget -nH --cut-dirs=4 -Arel_std_\*.dat.gz -m ftp://ftp.ebi.ac.uk/pub/databases/embl/release/
+ > cd ..
+
+Download the taxonomy
+.....................
+
+.. code-block:: bash
+
+ > mkdir TAXO
+ > cd TAXO
+ > wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
+ > tar -zxvf taxdump.tar.gz
+ > cd ..
+
+Format the data
+...............
+
+.. code-block:: bash
+
+ > obiconvert --embl -t ./TAXO --ecopcrDB-output=embl_last ./EMBL/*.dat
+
+
+Use ecoPCR to simulate an `in silico` PCR
+.........................................
+
+.. code-block:: bash
+
+ > ecoPCR -d ./ECODB/embl_last -e 3 -l 50 -L 150 \
+ TTAGATACCCCACTATGC TAGAACAGGCTCCTCTAG > v05.ecopcr
+
+
+Note that the primers must be in the same order both in ``wolf_diet_ngsfilter.txt`` and in
+the :doc:`ecoPCR <scripts/ecoPCR>` command.
+
+
+Clean the database
+..................
+
+ 1. filter sequences so that they have a good taxonomic description at the species,
+ genus, and family levels (:doc:`obigrep <scripts/obigrep>` command below).
+ 2. remove redundant sequences (:doc:`obiuniq <scripts/obiuniq>` command below).
+ 3. ensure that the dereplicated sequences have a taxid at the family level
+ (:doc:`obigrep <scripts/obigrep>` command below).
+ 4. ensure that sequences each have a unique identification
+ (:doc:`obiannotate <scripts/obiannotate>` command below)
+
+.. code-block:: bash
+
+ > obigrep -d embl_last --require-rank=species \
+ --require-rank=genus --require-rank=family v05.ecopcr > v05_clean.fasta
+
+ > obiuniq -d embl_last \
+ v05_clean.fasta > v05_clean_uniq.fasta
+
+ > obigrep -d embl_last --require-rank=family \
+ v05_clean_uniq.fasta > v05_clean_uniq_clean.fasta
+
+ > obiannotate --uniq-id v05_clean_uniq_clean.fasta > db_v05.fasta
+
+
+.. warning::
+ From now on, for the sake of clarity, the following commands will use the filenames of
+ the files provided with the tutorial. If you decided to run the last steps and use the
+ files you have produced, you'll have to use ``db_v05.fasta`` instead of
+ ``db_v05_r117.fasta`` and ``embl_last`` instead of ``embl_r117``
+
+
+Assign each sequence to a taxon
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Once the reference database is built, taxonomic assignment can be carried out using
+the :doc:`ecotag <scripts/ecotag>` command.
+
+.. code-block:: bash
+
+ > ecotag -d embl_r117 -R db_v05_r117.fasta wolf.ali.assigned.uniq.c10.l80.clean.fasta > \
+ wolf.ali.assigned.uniq.c10.l80.clean.tag.fasta
+
+
+The :doc:`ecotag <scripts/ecotag>` adds several `key=value` attributes in the sequence
+record header, among them:
+
+- best_match=ACCESSION where ACCESSION is the id of the sequence in the reference database
+ that best aligns to the query sequence;
+- best_identity=FLOAT where FLOAT*100 is the percentage of identity between the best match
+ sequence and the query sequence;
+- taxid=TAXID where TAXID is the final assignation of the sequence by
+ :doc:`ecotag <scripts/ecotag>`
+- scientific_name=NAME where NAME is the scientific name of the assigned taxid.
+
+The first sequence record of ``wolf.ali.assigned.uniq.c10.l80.clean.tag.fasta`` is:
+
+
+.. code-block:: bash
+
+ >HELIUM_000100422_612GNAAXX:7:22:8540:14708#0/1_CONS_SUB_SUB_CMP
+ species_name=Capreolus capreolus; family=9850; scientific_name=Capreolus
+ capreolus; rank=species; taxid=9858; best_identity={'db_v05_r117': 1.0};
+ scientific_name_by_db={'db_v05_r117': 'Capreolus capreolus'};
+ obiclean_samplecount=2; species=9858; merged_sample={'29a_F260619': 4697,
+ '15a_F730814': 7638}; obiclean_count={'29a_F260619': 5438, '15a_F730814': 8642};
+ obiclean_singletoncount=0; obiclean_cluster={'29a_F260619':
+ 'HELIUM_000100422_612GNAAXX:7:22:8540:14708#0/1_CONS_SUB_SUB_CMP',
+ '15a_F730814':
+ 'HELIUM_000100422_612GNAAXX:7:22:8540:14708#0/1_CONS_SUB_SUB_CMP'};
+ species_list={'db_v05_r117': ['Capreolus capreolus']}; obiclean_internalcount=0;
+ match_count={'db_v05_r117': 1}; obiclean_head=True; taxid_by_db={'db_v05_r117':
+ 9858}; family_name=Cervidae; genus_name=Capreolus;
+ obiclean_status={'29a_F260619': 'h', '15a_F730814': 'h'}; obiclean_headcount=2;
+ count=12335; id_status={'db_v05_r117': True}; best_match={'db_v05_r117':
+ 'AJ885202'}; order_name=None; rank_by_db={'db_v05_r117': 'species'}; genus=9857;
+ order=None;
+ ttagccctaaacacaagtaattaatataacaaaattattcgccagagtactaccggcaat
+ agcttaaaactcaaaggacttggcggtgctttataccctt
+
+
+Generate the final result table
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some unneeded attributes can be removed at this stage.
+
+.. code-block:: bash
+
+ > obiannotate --delete-tag=scientific_name_by_db --delete-tag=obiclean_samplecount \
+ --delete-tag=obiclean_count --delete-tag=obiclean_singletoncount \
+ --delete-tag=obiclean_cluster --delete-tag=obiclean_internalcount \
+ --delete-tag=obiclean_head --delete-tag=taxid_by_db --delete-tag=obiclean_headcount \
+ --delete-tag=id_status --delete-tag=rank_by_db --delete-tag=order_name \
+ --delete-tag=order wolf.ali.assigned.uniq.c10.l80.clean.tag.fasta > \
+ wolf.ali.assigned.uniq.c10.l80.clean.tag.ann.fasta
+
+
+The first sequence record of ``wolf.ali.assigned.uniq.c10.l80.clean.tag.ann.fasta`` is
+then:
+
+.. code-block:: bash
+
+ >HELIUM_000100422_612GNAAXX:7:22:8540:14708#0/1_CONS_SUB_SUB_CMP
+ match_count={'db_v05_r117': 1}; count=12335; species_name=Capreolus capreolus;
+ best_match={'db_v05_r117': 'AJ885202'}; family=9850; family_name=Cervidae;
+ scientific_name=Capreolus capreolus; taxid=9858; rank=species;
+ obiclean_status={'29a_F260619': 'h', '15a_F730814': 'h'};
+ best_identity={'db_v05_r117': 1.0}; merged_sample={'29a_F260619': 4697,
+ '15a_F730814': 7638}; genus_name=Capreolus; genus=9857; species=9858;
+ species_list={'db_v05_r117': ['Capreolus capreolus']};
+ ttagccctaaacacaagtaattaatataacaaaattattcgccagagtactaccggcaat
+ agcttaaaactcaaaggacttggcggtgctttataccctt
+
+
+The sequences can be sorted by decreasing order of `count`.
+
+.. code-block:: bash
+
+ > obisort -k count -r wolf.ali.assigned.uniq.c10.l80.clean.tag.ann.fasta > \
+ wolf.ali.assigned.uniq.c10.l80.clean.tag.ann.sort.fasta
+
+The first sequence record of ``wolf.ali.assigned.uniq.c10.l80.clean.tag.ann.sort.fasta`` is then:
+
+.. code-block:: bash
+
+ >HELIUM_000100422_612GNAAXX:7:22:8540:14708#0/1_CONS_SUB_SUB_CMP count=12335;
+ match_count={'db_v05_r117': 1}; species_name=Capreolus capreolus;
+ best_match={'db_v05_r117': 'AJ885202'}; family=9850; family_name=Cervidae;
+ scientific_name=Capreolus capreolus; taxid=9858; rank=species;
+ obiclean_status={'29a_F260619': 'h', '15a_F730814': 'h'};
+ best_identity={'db_v05_r117': 1.0}; merged_sample={'29a_F260619': 4697,
+ '15a_F730814': 7638}; genus_name=Capreolus; genus=9857; species=9858;
+ species_list={'db_v05_r117': ['Capreolus capreolus']};
+ ttagccctaaacacaagtaattaatataacaaaattattcgccagagtactaccggcaat
+ agcttaaaactcaaaggacttggcggtgctttataccctt
+
+Finally, a tab-delimited file that can be opened by Excel or R is generated.
+
+.. code-block:: bash
+
+ > obitab -o wolf.ali.assigned.uniq.c10.l80.clean.tag.ann.sort.fasta > \
+ wolf.ali.assigned.uniq.c10.l80.clean.tag.ann.sort.tab
+
+
+This file contains 26 sequences. You can deduce the diet of each sample:
+ - 13a_F730603: Cervus elaphus
+ - 15a_F730814: Capreolus capreolus
+ - 26a_F040644: Marmota sp. (according to the location, it is Marmota marmota)
+ - 29a_F260619: Capreolus capreolus
+
+Note that we also obtained a few wolf sequences although a wolf-blocking oligonucleotide
+was used.
+
+
+References
+----------
+
+ - Shehzad W, Riaz T, Nawaz MA, Miquel C, Poillot C, Shah SA, Pompanon F, Coissac E,
+ Taberlet P (2012) Carnivore diet analysis based on next generation sequencing:
+ application to the leopard cat (Prionailurus bengalensis) in Pakistan. Molecular
+ Ecology, 21, 1951-1965.
+ - Riaz T, Shehzad W, Viari A, Pompanon F, Taberlet P, Coissac E (2011) ecoPrimers:
+ inference of new DNA barcode markers from whole genome sequence analysis. Nucleic
+ Acids Research, 39, e145.
+ - Seguritan V, Rohwer F. (2001) FastGroup: a program to dereplicate libraries of
+ 16S rDNA sequences. BMC Bioinformatics. 2001;2:9. Epub 2001 Oct 16.
+
+
+Contact
+-------
+
+For any suggestion or improvement, please contact :
+
+ - eric.coissac at metabarcoding.org
+ - frederic.boyer at metabarcoding.org
+
+
diff --git a/doc/sphinx/sphinxext/apigen.py b/doc/sphinx/sphinxext/apigen.py
new file mode 100644
index 0000000..1237409
--- /dev/null
+++ b/doc/sphinx/sphinxext/apigen.py
@@ -0,0 +1,427 @@
+"""Attempt to generate templates for module reference with Sphinx
+
+XXX - we exclude extension modules
+
+To include extension modules, first identify them as valid in the
+``_uri2path`` method, then handle them in the ``_parse_module`` script.
+
+We get functions and classes by parsing the text of .py files.
+Alternatively we could import the modules for discovery, and we'd have
+to do that for extension modules. This would involve changing the
+``_parse_module`` method to work via import and introspection, and
+might involve changing ``discover_modules`` (which determines which
+files are modules, and therefore which module URIs will be passed to
+``_parse_module``).
+
+NOTE: this is a modified version of a script originally shipped with the
+PyMVPA project, which we've adapted for NIPY use. PyMVPA is an MIT-licensed
+project."""
+
+# Stdlib imports
+import os
+import re
+
+# Functions and classes
+class ApiDocWriter(object):
+ ''' Class for automatic detection and parsing of API docs
+ to Sphinx-parsable reST format'''
+
+ # only separating first two levels
+ rst_section_levels = ['*', '=', '-', '~', '^']
+
+ def __init__(self,
+ package_name,
+ rst_extension='.rst',
+ package_skip_patterns=None,
+ module_skip_patterns=None,
+ ):
+ ''' Initialize package for parsing
+
+ Parameters
+ ----------
+ package_name : string
+ Name of the top-level package. *package_name* must be the
+ name of an importable package
+ rst_extension : string, optional
+ Extension for reST files, default '.rst'
+ package_skip_patterns : None or sequence of {strings, regexps}
+ Sequence of strings giving URIs of packages to be excluded
+ Operates on the package path, starting at (including) the
+ first dot in the package path, after *package_name* - so,
+ if *package_name* is ``sphinx``, then ``sphinx.util`` will
+ result in ``.util`` being passed for searching by these
+ regexps. If is None, gives default. Default is:
+ ['\.tests$']
+ module_skip_patterns : None or sequence
+ Sequence of strings giving URIs of modules to be excluded
+ Operates on the module name including preceding URI path,
+ back to the first dot after *package_name*. For example
+ ``sphinx.util.console`` results in the string to search of
+ ``.util.console``
+ If is None, gives default. Default is:
+ ['\.setup$', '\._']
+ '''
+ if package_skip_patterns is None:
+ package_skip_patterns = ['\\.tests$']
+ if module_skip_patterns is None:
+ module_skip_patterns = ['\\.setup$', '\\._']
+ self.package_name = package_name
+ self.rst_extension = rst_extension
+ self.package_skip_patterns = package_skip_patterns
+ self.module_skip_patterns = module_skip_patterns
+
+ def get_package_name(self):
+ return self._package_name
+
+ def set_package_name(self, package_name):
+ ''' Set package_name
+
+ >>> docwriter = ApiDocWriter('sphinx')
+ >>> import sphinx
+ >>> docwriter.root_path == sphinx.__path__[0]
+ True
+ >>> docwriter.package_name = 'docutils'
+ >>> import docutils
+ >>> docwriter.root_path == docutils.__path__[0]
+ True
+ '''
+ # It's also possible to imagine caching the module parsing here
+ self._package_name = package_name
+ self.root_module = __import__(package_name)
+ self.root_path = self.root_module.__path__[0]
+ self.written_modules = None
+
+ package_name = property(get_package_name, set_package_name, None,
+ 'get/set package_name')
+
+ def _get_object_name(self, line):
+ ''' Get second token in line
+ >>> docwriter = ApiDocWriter('sphinx')
+ >>> docwriter._get_object_name(" def func(): ")
+ 'func'
+ >>> docwriter._get_object_name(" class Klass(object): ")
+ 'Klass'
+ >>> docwriter._get_object_name(" class Klass: ")
+ 'Klass'
+ '''
+ name = line.split()[1].split('(')[0].strip()
+ # in case we have classes which are not derived from object
+ # ie. old style classes
+ return name.rstrip(':')
+
+ def _uri2path(self, uri):
+ ''' Convert uri to absolute filepath
+
+ Parameters
+ ----------
+ uri : string
+ URI of python module to return path for
+
+ Returns
+ -------
+ path : None or string
+ Returns None if there is no valid path for this URI
+ Otherwise returns absolute file system path for URI
+
+ Examples
+ --------
+ >>> docwriter = ApiDocWriter('sphinx')
+ >>> import sphinx
+ >>> modpath = sphinx.__path__[0]
+ >>> res = docwriter._uri2path('sphinx.builder')
+ >>> res == os.path.join(modpath, 'builder.py')
+ True
+ >>> res = docwriter._uri2path('sphinx')
+ >>> res == os.path.join(modpath, '__init__.py')
+ True
+ >>> docwriter._uri2path('sphinx.does_not_exist')
+
+ '''
+ if uri == self.package_name:
+ return os.path.join(self.root_path, '__init__.py')
+ path = uri.replace('.', os.path.sep)
+ path = path.replace(self.package_name + os.path.sep, '')
+ path = os.path.join(self.root_path, path)
+ # XXX maybe check for extensions as well?
+ if os.path.exists(path + '.py'): # file
+ path += '.py'
+ elif os.path.exists(os.path.join(path, '__init__.py')):
+ path = os.path.join(path, '__init__.py')
+ else:
+ return None
+ return path
+
+ def _path2uri(self, dirpath):
+ ''' Convert directory path to uri '''
+ relpath = dirpath.replace(self.root_path, self.package_name)
+ if relpath.startswith(os.path.sep):
+ relpath = relpath[1:]
+ return relpath.replace(os.path.sep, '.')
+
+ def _parse_module(self, uri):
+ ''' Parse module defined in *uri* '''
+ filename = self._uri2path(uri)
+ if filename is None:
+ # nothing that we could handle here.
+ return ([],[])
+ f = open(filename, 'rt')
+ functions, classes = self._parse_lines(f)
+ f.close()
+ return functions, classes
+
+ def _parse_lines(self, linesource):
+ ''' Parse lines of text for functions and classes '''
+ functions = []
+ classes = []
+ for line in linesource:
+ if line.startswith('def ') and line.count('('):
+ # exclude private stuff
+ name = self._get_object_name(line)
+ if not name.startswith('_'):
+ functions.append(name)
+ elif line.startswith('class '):
+ # exclude private stuff
+ name = self._get_object_name(line)
+ if not name.startswith('_'):
+ classes.append(name)
+ else:
+ pass
+ functions.sort()
+ classes.sort()
+ return functions, classes
+
+ def generate_api_doc(self, uri):
+ '''Make autodoc documentation template string for a module
+
+ Parameters
+ ----------
+ uri : string
+ python location of module - e.g 'sphinx.builder'
+
+ Returns
+ -------
+ S : string
+ Contents of API doc
+ '''
+ # get the names of all classes and functions
+ functions, classes = self._parse_module(uri)
+ if not len(functions) and not len(classes):
+ print 'WARNING: Empty -',uri # dbg
+ return ''
+
+ # Make a shorter version of the uri that omits the package name for
+ # titles
+ uri_short = re.sub(r'^%s\.' % self.package_name,'',uri)
+
+ ad = '.. AUTO-GENERATED FILE -- DO NOT EDIT!\n\n'
+
+ chap_title = uri_short
+ ad += (chap_title+'\n'+ self.rst_section_levels[1] * len(chap_title)
+ + '\n\n')
+
+ # Set the chapter title to read 'module' for all modules except for the
+ # main packages
+ if '.' in uri:
+ title = 'Module: :mod:`' + uri_short + '`'
+ else:
+ title = ':mod:`' + uri_short + '`'
+ ad += title + '\n' + self.rst_section_levels[2] * len(title)
+
+ if len(classes):
+ ad += '\nInheritance diagram for ``%s``:\n\n' % uri
+ ad += '.. inheritance-diagram:: %s \n' % uri
+ ad += ' :parts: 3\n'
+
+ ad += '\n.. automodule:: ' + uri + '\n'
+ ad += '\n.. currentmodule:: ' + uri + '\n'
+ multi_class = len(classes) > 1
+ multi_fx = len(functions) > 1
+ if multi_class:
+ ad += '\n' + 'Classes' + '\n' + \
+ self.rst_section_levels[2] * 7 + '\n'
+ elif len(classes) and multi_fx:
+ ad += '\n' + 'Class' + '\n' + \
+ self.rst_section_levels[2] * 5 + '\n'
+ for c in classes:
+ ad += '\n:class:`' + c + '`\n' \
+ + self.rst_section_levels[multi_class + 2 ] * \
+ (len(c)+9) + '\n\n'
+ ad += '\n.. autoclass:: ' + c + '\n'
+ # must NOT exclude from index to keep cross-refs working
+ ad += ' :members:\n' \
+ ' :undoc-members:\n' \
+ ' :show-inheritance:\n' \
+ ' :inherited-members:\n' \
+ '\n' \
+ ' .. automethod:: __init__\n'
+ if multi_fx:
+ ad += '\n' + 'Functions' + '\n' + \
+ self.rst_section_levels[2] * 9 + '\n\n'
+ elif len(functions) and multi_class:
+ ad += '\n' + 'Function' + '\n' + \
+ self.rst_section_levels[2] * 8 + '\n\n'
+ for f in functions:
+ # must NOT exclude from index to keep cross-refs working
+ ad += '\n.. autofunction:: ' + uri + '.' + f + '\n\n'
+ return ad
+
+ def _survives_exclude(self, matchstr, match_type):
+ ''' Returns True if *matchstr* does not match patterns
+
+ ``self.package_name`` removed from front of string if present
+
+ Examples
+ --------
+ >>> dw = ApiDocWriter('sphinx')
+ >>> dw._survives_exclude('sphinx.okpkg', 'package')
+ True
+ >>> dw.package_skip_patterns.append('^\\.badpkg$')
+ >>> dw._survives_exclude('sphinx.badpkg', 'package')
+ False
+ >>> dw._survives_exclude('sphinx.badpkg', 'module')
+ True
+ >>> dw._survives_exclude('sphinx.badmod', 'module')
+ True
+ >>> dw.module_skip_patterns.append('^\\.badmod$')
+ >>> dw._survives_exclude('sphinx.badmod', 'module')
+ False
+ '''
+ if match_type == 'module':
+ patterns = self.module_skip_patterns
+ elif match_type == 'package':
+ patterns = self.package_skip_patterns
+ else:
+ raise ValueError('Cannot interpret match type "%s"'
+ % match_type)
+ # Match to URI without package name
+ L = len(self.package_name)
+ if matchstr[:L] == self.package_name:
+ matchstr = matchstr[L:]
+ for pat in patterns:
+ try:
+ pat.search
+ except AttributeError:
+ pat = re.compile(pat)
+ if pat.search(matchstr):
+ return False
+ return True
+
+ def discover_modules(self):
+ ''' Return module sequence discovered from ``self.package_name``
+
+
+ Parameters
+ ----------
+ None
+
+ Returns
+ -------
+ mods : sequence
+ Sequence of module names within ``self.package_name``
+
+ Examples
+ --------
+ >>> dw = ApiDocWriter('sphinx')
+ >>> mods = dw.discover_modules()
+ >>> 'sphinx.util' in mods
+ True
+ >>> dw.package_skip_patterns.append('\.util$')
+ >>> 'sphinx.util' in dw.discover_modules()
+ False
+ >>>
+ '''
+ modules = [self.package_name]
+ # raw directory parsing
+ for dirpath, dirnames, filenames in os.walk(self.root_path):
+ # Check directory names for packages
+ root_uri = self._path2uri(os.path.join(self.root_path,
+ dirpath))
+ for dirname in dirnames[:]: # copy list - we modify inplace
+ package_uri = '.'.join((root_uri, dirname))
+ if (self._uri2path(package_uri) and
+ self._survives_exclude(package_uri, 'package')):
+ modules.append(package_uri)
+ else:
+ dirnames.remove(dirname)
+ # Check filenames for modules
+ for filename in filenames:
+ module_name = filename[:-3]
+ module_uri = '.'.join((root_uri, module_name))
+ if (self._uri2path(module_uri) and
+ self._survives_exclude(module_uri, 'module')):
+ modules.append(module_uri)
+ return sorted(modules)
+
+ def write_modules_api(self, modules,outdir):
+ # write the list
+ written_modules = []
+ for m in modules:
+ api_str = self.generate_api_doc(m)
+ if not api_str:
+ continue
+ # write out to file
+ outfile = os.path.join(outdir,
+ m + self.rst_extension)
+ fileobj = open(outfile, 'wt')
+ fileobj.write(api_str)
+ fileobj.close()
+ written_modules.append(m)
+ self.written_modules = written_modules
+
+ def write_api_docs(self, outdir):
+ """Generate API reST files.
+
+ Parameters
+ ----------
+ outdir : string
+ Directory name in which to store files
+ We create automatic filenames for each module
+
+ Returns
+ -------
+ None
+
+ Notes
+ -----
+ Sets self.written_modules to list of written modules
+ """
+ if not os.path.exists(outdir):
+ os.mkdir(outdir)
+ # compose list of modules
+ modules = self.discover_modules()
+ self.write_modules_api(modules,outdir)
+
+ def write_index(self, outdir, froot='gen', relative_to=None):
+ """Make a reST API index file from written files
+
+ Parameters
+ ----------
+ path : string
+ Filename to write index to
+ outdir : string
+ Directory to which to write generated index file
+ froot : string, optional
+ root (filename without extension) of filename to write to
+ Defaults to 'gen'. We add ``self.rst_extension``.
+ relative_to : string
+ path to which written filenames are relative. This
+ component of the written file path will be removed from
+ outdir, in the generated index. Default is None, meaning,
+ leave path as it is.
+ """
+ if self.written_modules is None:
+ raise ValueError('No modules written')
+ # Get full filename path
+ path = os.path.join(outdir, froot+self.rst_extension)
+ # Path written into index is relative to rootpath
+ if relative_to is not None:
+ relpath = outdir.replace(relative_to + os.path.sep, '')
+ else:
+ relpath = outdir
+ idx = open(path,'wt')
+ w = idx.write
+ w('.. AUTO-GENERATED FILE -- DO NOT EDIT!\n\n')
+ w('.. toctree::\n\n')
+ for f in self.written_modules:
+ w(' %s\n' % os.path.join(relpath,f))
+ idx.close()
diff --git a/doc/sphinx/sphinxext/docscrape.py b/doc/sphinx/sphinxext/docscrape.py
new file mode 100644
index 0000000..f374b3d
--- /dev/null
+++ b/doc/sphinx/sphinxext/docscrape.py
@@ -0,0 +1,497 @@
+"""Extract reference documentation from the NumPy source tree.
+
+"""
+
+import inspect
+import textwrap
+import re
+import pydoc
+from StringIO import StringIO
+from warnings import warn
+4
+class Reader(object):
+    """Cursor over the lines of a text.
+
+    The text is kept as a list of lines together with the index of the
+    current line; the ``read*`` and ``seek*`` methods move that index
+    forward.
+    """
+
+    def __init__(self, data):
+        """
+        Parameters
+        ----------
+        data : str or list
+            Either a string whose lines are separated by newline
+            characters, or a list of lines (stored as given).
+
+        """
+        # keep the text as a list of lines
+        self._str = data if isinstance(data, list) else data.split('\n')
+        self.reset()
+
+    def __getitem__(self, n):
+        """Return line ``n`` (or the slice ``n``) of the stored lines."""
+        return self._str[n]
+
+    def reset(self):
+        """Move the cursor back to the first line."""
+        self._l = 0  # index of the current line
+
+    def read(self):
+        """Return the current line and advance; return '' at the end."""
+        if self.eof():
+            return ''
+        current = self[self._l]
+        self._l += 1
+        return current
+
+    def seek_next_non_empty_line(self):
+        """Advance the cursor past blank lines."""
+        for candidate in self[self._l:]:
+            if candidate.strip():
+                return
+            self._l += 1
+
+    def eof(self):
+        """Tell whether the cursor is past the last line."""
+        return len(self._str) <= self._l
+
+    def read_to_condition(self, condition_func):
+        """Consume lines until ``condition_func(line)`` is true.
+
+        Returns the consumed lines; the matching line itself is left
+        unread.  When no line matches, everything up to the end is
+        returned.  An empty list is returned when the cursor is already
+        at the end.
+        """
+        begin = self._l
+        for offset, candidate in enumerate(self[begin:]):
+            if condition_func(candidate):
+                return self[begin:begin + offset]
+            self._l = begin + offset + 1
+            if self.eof():
+                return self[begin:]
+        return []
+
+    def read_to_next_empty_line(self):
+        """Skip blank lines, then consume lines up to the next blank one."""
+        self.seek_next_non_empty_line()
+        return self.read_to_condition(lambda text: not text.strip())
+
+    def read_to_next_unindented_line(self):
+        """Consume lines up to the next non-blank line without indent."""
+        def starts_at_margin(text):
+            return bool(text.strip()) and text.lstrip() == text
+        return self.read_to_condition(starts_at_margin)
+
+    def peek(self, n=0):
+        """Return the line ``n`` positions ahead without consuming it.
+
+        Returns '' when that position lies beyond the last line.
+        """
+        position = self._l + n
+        if position < len(self._str):
+            return self[position]
+        return ''
+
+    def is_empty(self):
+        """Tell whether the text contains nothing but whitespace."""
+        return ''.join(self._str).strip() == ''
+
+
+class NumpyDocString(object):
+    """Parser for docstrings written in the NumPy documentation format.
+
+    The docstring is split into the sections listed in ``_parsed_data``
+    ('Signature', 'Summary', 'Parameters', ...).  A section is read with
+    ``doc['Parameters']`` and the whole docstring is rendered back to
+    text with ``str(doc)``.
+    """
+
+    # A first paragraph such as ``y = f(x, k=1)`` or ``mod.func(a)`` is
+    # taken to be the call signature.
+    _signature_rgx = re.compile(r'^([\w., ]+=)?\s*[\w\.]+\(.*\)$')
+
+    def __init__(self,docstring):
+        """Dedent ``docstring``, then parse it into sections."""
+        docstring = textwrap.dedent(docstring).split('\n')
+
+        self._doc = Reader(docstring)
+        self._parsed_data = {
+            'Signature': '',
+            'Summary': [''],
+            'Extended Summary': [],
+            'Parameters': [],
+            'Returns': [],
+            'Raises': [],
+            'Warns': [],
+            'Other Parameters': [],
+            'Attributes': [],
+            'Methods': [],
+            'See Also': [],
+            'Notes': [],
+            'Warnings': [],
+            'References': '',
+            'Examples': '',
+            'index': {}
+            }
+
+        self._parse()
+
+    def __getitem__(self,key):
+        """Return the parsed content of section ``key``."""
+        return self._parsed_data[key]
+
+    def __setitem__(self,key,val):
+        """Store ``val`` for section ``key``; unknown names only warn."""
+        if key not in self._parsed_data:
+            warn("Unknown section %s" % key)
+        else:
+            self._parsed_data[key] = val
+
+    def _is_at_section(self):
+        """Skip blank lines and tell whether a section starts here.
+
+        A section starts either with ``.. index::`` or with a title line
+        followed by an underline of '-' or '=' at least as long as the
+        title.
+        """
+        self._doc.seek_next_non_empty_line()
+
+        if self._doc.eof():
+            return False
+
+        l1 = self._doc.peek().strip()  # e.g. Parameters
+
+        if l1.startswith('.. index::'):
+            return True
+
+        l2 = self._doc.peek(1).strip() # ---------- or ==========
+        return l2.startswith('-'*len(l1)) or l2.startswith('='*len(l1))
+
+    def _strip(self,doc):
+        """Drop leading and trailing blank lines from a list of lines."""
+        i = 0
+        j = 0
+        for i,line in enumerate(doc):
+            if line.strip(): break
+
+        for j,line in enumerate(doc[::-1]):
+            if line.strip(): break
+
+        return doc[i:len(doc)-j]
+
+    def _read_to_next_section(self):
+        """Consume and return all lines up to the next section start."""
+        section = self._doc.read_to_next_empty_line()
+
+        while not self._is_at_section() and not self._doc.eof():
+            if not self._doc.peek(-1).strip(): # previous line was empty
+                section += ['']
+
+            section += self._doc.read_to_next_empty_line()
+
+        return section
+
+    def _read_sections(self):
+        """Yield ``(name, content_lines)`` for each remaining section."""
+        while not self._doc.eof():
+            data = self._read_to_next_section()
+            name = data[0].strip()
+
+            if name.startswith('..'): # index section
+                yield name, data[1:]
+            elif len(data) < 2:
+                # A title without underline cannot be a section: end the
+                # generator.  (Yielding the StopIteration class here
+                # would hand callers a value they cannot unpack.)
+                return
+            else:
+                yield name, self._strip(data[2:])
+
+    def _parse_param_list(self,content):
+        """Parse ``name : type`` entries with indented descriptions.
+
+        Returns a list of ``(name, type, description_lines)`` tuples;
+        ``type`` is '' when the header line holds no ' : ' separator.
+        """
+        r = Reader(content)
+        params = []
+        while not r.eof():
+            header = r.read().strip()
+            if ' : ' in header:
+                arg_name, arg_type = header.split(' : ')[:2]
+            else:
+                arg_name, arg_type = header, ''
+
+            desc = r.read_to_next_unindented_line()
+            desc = dedent_lines(desc)
+
+            params.append((arg_name,arg_type,desc))
+
+        return params
+
+
+    _name_rgx = re.compile(r"^\s*(:(?P<role>\w+):`(?P<name>[a-zA-Z0-9_.-]+)`|"
+                           r" (?P<name2>[a-zA-Z0-9_.-]+))\s*", re.X)
+    def _parse_see_also(self, content):
+        """Parse the lines of a 'See Also' section.
+
+        Accepted forms::
+
+            func_name : Descriptive text
+                continued text
+            another_func_name : Descriptive text
+            func_name1, func_name2, :meth:`func_name`, func_name3
+
+        Returns a list of ``(name, description_lines, role)`` tuples;
+        ``role`` is None when the name carries no ``:role:`` prefix.
+        """
+        items = []
+
+        def parse_item_name(text):
+            """Match ':role:`name`' or 'name'"""
+            m = self._name_rgx.match(text)
+            if m:
+                g = m.groups()
+                if g[1] is None:
+                    return g[3], None
+                else:
+                    return g[2], g[1]
+            raise ValueError("%s is not a item name" % text)
+
+        def push_item(name, rest):
+            if not name:
+                return
+            name, role = parse_item_name(name)
+            items.append((name, list(rest), role))
+            del rest[:]
+
+        current_func = None
+        rest = []
+
+        for line in content:
+            if not line.strip(): continue
+
+            m = self._name_rgx.match(line)
+            if m and line[m.end():].strip().startswith(':'):
+                push_item(current_func, rest)
+                current_func, line = line[:m.end()], line[m.end():]
+                rest = [line.split(':', 1)[1].strip()]
+                if not rest[0]:
+                    rest = []
+            elif not line.startswith(' '):
+                push_item(current_func, rest)
+                current_func = None
+                if ',' in line:
+                    for func in line.split(','):
+                        push_item(func, [])
+                elif line.strip():
+                    current_func = line
+            elif current_func is not None:
+                rest.append(line.strip())
+        push_item(current_func, rest)
+        return items
+
+    def _parse_index(self, section, content):
+        """Parse an index directive into a dict.
+
+        Expected form::
+
+            .. index:: default
+               :refguide: something, else, and more
+
+        The text after ``::`` is stored under 'default'; every
+        ``:key: a, b`` line is stored under ``key`` as a list.
+        """
+        def strip_each_in(lst):
+            return [s.strip() for s in lst]
+
+        out = {}
+        section = section.split('::')
+        if len(section) > 1:
+            out['default'] = strip_each_in(section[1].split(','))[0]
+        for line in content:
+            line = line.split(':')
+            if len(line) > 2:
+                out[line[1]] = strip_each_in(line[2].split(','))
+        return out
+
+    def _parse_summary(self):
+        """Grab signature (if given) and summary"""
+        if self._is_at_section():
+            return
+
+        summary = self._doc.read_to_next_empty_line()
+        summary_str = " ".join([s.strip() for s in summary]).strip()
+        if self._signature_rgx.match(summary_str):
+            self['Signature'] = summary_str
+            if not self._is_at_section():
+                self['Summary'] = self._doc.read_to_next_empty_line()
+        else:
+            self['Summary'] = summary
+
+        if not self._is_at_section():
+            self['Extended Summary'] = self._read_to_next_section()
+
+    def _parse(self):
+        """Parse the whole docstring into ``self._parsed_data``."""
+        self._doc.reset()
+        self._parse_summary()
+
+        for (section,content) in self._read_sections():
+            if not section.startswith('..'):
+                # normalise e.g. 'see also' to 'See Also'
+                section = ' '.join([s.capitalize() for s in section.split(' ')])
+            if section in ('Parameters', 'Attributes', 'Methods',
+                           'Returns', 'Raises', 'Warns'):
+                self[section] = self._parse_param_list(content)
+            elif section.startswith('.. index::'):
+                self['index'] = self._parse_index(section, content)
+            elif section == 'See Also':
+                self['See Also'] = self._parse_see_also(content)
+            else:
+                self[section] = content
+
+    # string conversion routines
+
+    def _str_header(self, name, symbol='-'):
+        """Return the title line and its underline."""
+        return [name, len(name)*symbol]
+
+    def _str_indent(self, doc, indent=4):
+        """Return the lines of ``doc`` prefixed by ``indent`` spaces."""
+        out = []
+        for line in doc:
+            out += [' '*indent + line]
+        return out
+
+    def _str_signature(self):
+        """Return the signature line with '*' escaped for reST."""
+        if self['Signature']:
+            return [self['Signature'].replace('*','\\*')] + ['']
+        else:
+            return ['']
+
+    def _str_summary(self):
+        if self['Summary']:
+            return self['Summary'] + ['']
+        else:
+            return []
+
+    def _str_extended_summary(self):
+        if self['Extended Summary']:
+            return self['Extended Summary'] + ['']
+        else:
+            return []
+
+    def _str_param_list(self, name):
+        """Render a parameter-style section as ``name : type`` entries."""
+        out = []
+        if self[name]:
+            out += self._str_header(name)
+            for param,param_type,desc in self[name]:
+                out += ['%s : %s' % (param, param_type)]
+                out += self._str_indent(desc)
+            out += ['']
+        return out
+
+    def _str_section(self, name):
+        """Render a free-text section below its header."""
+        out = []
+        if self[name]:
+            out += self._str_header(name)
+            out += self[name]
+            out += ['']
+        return out
+
+    def _str_see_also(self, func_role):
+        """Render the 'See Also' section.
+
+        Each name is linked with its own role, else with ``func_role``,
+        else as a plain reST reference.  Names without description are
+        joined on one line, separated by commas.
+        """
+        if not self['See Also']: return []
+        out = []
+        out += self._str_header("See Also")
+        last_had_desc = True
+        for func, desc, role in self['See Also']:
+            if role:
+                link = ':%s:`%s`' % (role, func)
+            elif func_role:
+                link = ':%s:`%s`' % (func_role, func)
+            else:
+                link = "`%s`_" % func
+            if desc or last_had_desc:
+                out += ['']
+                out += [link]
+            else:
+                out[-1] += ", %s" % link
+            if desc:
+                out += self._str_indent([' '.join(desc)])
+                last_had_desc = True
+            else:
+                last_had_desc = False
+        out += ['']
+        return out
+
+    def _str_index(self):
+        """Render the index directive, or nothing when no index exists."""
+        idx = self['index']
+        if not idx:
+            # No index was parsed: emit nothing rather than an empty
+            # ``.. index::`` directive.
+            return []
+        out = ['.. index:: %s' % idx.get('default','')]
+        for section, references in idx.items():
+            if section == 'default':
+                continue
+            out += [' :%s: %s' % (section, ', '.join(references))]
+        return out
+
+    def __str__(self, func_role=''):
+        """Render the parsed docstring back to NumPy-style text."""
+        out = []
+        out += self._str_signature()
+        out += self._str_summary()
+        out += self._str_extended_summary()
+        for param_list in ('Parameters','Returns','Raises'):
+            out += self._str_param_list(param_list)
+        out += self._str_section('Warnings')
+        out += self._str_see_also(func_role)
+        for s in ('Notes','References','Examples'):
+            out += self._str_section(s)
+        out += self._str_index()
+        return '\n'.join(out)
+
+
+def indent(str,indent=4):
+    """Prefix every line of ``str`` with ``indent`` spaces.
+
+    When ``str`` is None only the padding itself is returned.
+    """
+    pad = ' ' * indent
+    if str is None:
+        return pad
+    return '\n'.join([pad + row for row in str.split('\n')])
+
+def dedent_lines(lines):
+    """Remove the indentation shared by all ``lines``; return a list."""
+    joined = "\n".join(lines)
+    return textwrap.dedent(joined).split("\n")
+
+def header(text, style='-'):
+    """Return ``text`` underlined with ``style``, newline-terminated."""
+    underline = style * len(text)
+    return '\n'.join([text, underline, ''])
+
+
+class FunctionDoc(NumpyDocString):
+    """Parsed docstring of a function or method.
+
+    When the docstring carries no signature, one is derived from the
+    callable with ``inspect``.
+    """
+
+    def __init__(self, func, role='func', doc=None):
+        """
+        Parameters
+        ----------
+        func : callable
+            Object whose docstring is parsed.
+        role : str
+            reST role used when rendering, e.g. "func" or "meth".
+        doc : str, optional
+            Docstring to parse instead of ``inspect.getdoc(func)``.
+        """
+        self._f = func
+        self._role = role # e.g. "func" or "meth"
+        if doc is None:
+            doc = inspect.getdoc(func) or ''
+        try:
+            NumpyDocString.__init__(self, doc)
+        except ValueError as e:
+            # A malformed docstring is reported, not fatal.
+            print('*'*78)
+            print("ERROR: '%s' while parsing `%s`" % (e, self._f))
+            print('*'*78)
+
+        if not self['Signature']:
+            func, func_name = self.get_func()
+            try:
+                # try to read signature
+                argspec = inspect.getargspec(func)
+                argspec = inspect.formatargspec(*argspec)
+                argspec = argspec.replace('*','\\*')
+                signature = '%s%s' % (func_name, argspec)
+            except TypeError:
+                # inspect cannot describe this object
+                signature = '%s()' % func_name
+            self['Signature'] = signature
+
+    def get_func(self):
+        """Return ``(callable, name)`` for the documented object.
+
+        For a class the callable is its ``__call__`` attribute, falling
+        back to ``__init__``.
+        """
+        func_name = getattr(self._f, '__name__', self.__class__.__name__)
+        if inspect.isclass(self._f):
+            func = getattr(self._f, '__call__', self._f.__init__)
+        else:
+            func = self._f
+        return func, func_name
+
+    def __str__(self):
+        """Render a reST directive for the object, then its docstring."""
+        out = ''
+
+        func, func_name = self.get_func()
+
+        roles = {'func': 'function',
+                 'meth': 'method'}
+
+        if self._role:
+            if self._role not in roles:
+                print("Warning: invalid role %s" % self._role)
+            out += '.. %s:: %s\n \n\n' % (roles.get(self._role,''),
+                                             func_name)
+
+        out += super(FunctionDoc, self).__str__(func_role=self._role)
+        return out
+
+
+class ClassDoc(NumpyDocString):
+    """Parsed docstring of a class."""
+
+    def __init__(self,cls,modulename='',func_doc=FunctionDoc,doc=None):
+        """
+        Parameters
+        ----------
+        cls : class
+            Class whose docstring is parsed; anything else raises
+            ValueError.
+        modulename : str, optional
+            Module prefix; a trailing '.' is appended when missing.
+        func_doc : class, optional
+            Docstring class kept in ``self._func_doc``.
+        doc : str, optional
+            Docstring to parse instead of ``pydoc.getdoc(cls)``.
+        """
+        if not inspect.isclass(cls):
+            raise ValueError("Initialise using a class. Got %r" % cls)
+        self._cls = cls
+
+        needs_dot = modulename and not modulename.endswith('.')
+        self._mod = (modulename + '.') if needs_dot else modulename
+        self._name = cls.__name__
+        self._func_doc = func_doc
+
+        text = pydoc.getdoc(cls) if doc is None else doc
+        NumpyDocString.__init__(self, text)
+
+    @property
+    def methods(self):
+        """Names of the public callable members of the class."""
+        public = []
+        for attr_name, member in inspect.getmembers(self._cls):
+            if attr_name.startswith('_'):
+                continue
+            if callable(member):
+                public.append(attr_name)
+        return public
+
+    def __str__(self):
+        """Render the class docstring followed by a blank line pair."""
+        # NOTE: rendering of the individual methods is disabled:
+        #for m in self.methods:
+        #    print "Parsing `%s`" % m
+        #    out += str(self._func_doc(getattr(self._cls,m), 'meth')) + '\n\n'
+        #    out += '.. index::\n single: %s; %s\n\n' % (self._name, m)
+        return super(ClassDoc, self).__str__() + "\n\n"
+
+
diff --git a/doc/sphinx/sphinxext/docscrape_sphinx.py b/doc/sphinx/sphinxext/docscrape_sphinx.py
new file mode 100644
index 0000000..77ed271
--- /dev/null
+++ b/doc/sphinx/sphinxext/docscrape_sphinx.py
@@ -0,0 +1,136 @@
+import re, inspect, textwrap, pydoc
+from docscrape import NumpyDocString, FunctionDoc, ClassDoc
+
+class SphinxDocString(NumpyDocString):
+    """NumPy-style docstring rendered as Sphinx-flavoured reST."""
+
+    # string conversion routines
+    def _str_header(self, name, symbol='`'):
+        """Render a section title as a ``.. rubric::`` directive."""
+        return ['.. rubric:: ' + name, '']
+
+    def _str_field_list(self, name):
+        """Render a section title as a reST field name."""
+        return [':' + name + ':']
+
+    def _str_indent(self, doc, indent=4):
+        """Return the lines of ``doc`` prefixed by ``indent`` spaces."""
+        out = []
+        for line in doc:
+            out += [' '*indent + line]
+        return out
+
+    def _str_signature(self):
+        """Return a single blank line.
+
+        The signature is deliberately left out of the rendered text.
+        The code that used to format it sat after an unconditional
+        return and could never run, so it has been removed.
+        """
+        return ['']
+
+    def _str_summary(self):
+        return self['Summary'] + ['']
+
+    def _str_extended_summary(self):
+        return self['Extended Summary'] + ['']
+
+    def _str_param_list(self, name):
+        """Render a parameter-style section as a reST field list."""
+        out = []
+        if self[name]:
+            out += self._str_field_list(name)
+            out += ['']
+            for param,param_type,desc in self[name]:
+                out += self._str_indent(['**%s** : %s' % (param.strip(),
+                                                          param_type)])
+                out += ['']
+                out += self._str_indent(desc,8)
+                out += ['']
+        return out
+
+    def _str_section(self, name):
+        """Render a free-text section, dedented, below a rubric."""
+        out = []
+        if self[name]:
+            out += self._str_header(name)
+            out += ['']
+            content = textwrap.dedent("\n".join(self[name])).split("\n")
+            out += content
+            out += ['']
+        return out
+
+    def _str_see_also(self, func_role):
+        """Render 'See Also' as a ``.. seealso::`` directive."""
+        out = []
+        if self['See Also']:
+            see_also = super(SphinxDocString, self)._str_see_also(func_role)
+            out = ['.. seealso::', '']
+            # drop the two header lines produced by the base class
+            out += self._str_indent(see_also[2:])
+        return out
+
+    def _str_warnings(self):
+        """Render 'Warnings' as a ``.. warning::`` directive."""
+        out = []
+        if self['Warnings']:
+            out = ['.. warning::', '']
+            out += self._str_indent(self['Warnings'])
+        return out
+
+    def _str_index(self):
+        """Render the index directive, or nothing when no index exists."""
+        idx = self['index']
+        out = []
+        if len(idx) == 0:
+            return out
+
+        out += ['.. index:: %s' % idx.get('default','')]
+        for section, references in idx.items():
+            if section == 'default':
+                continue
+            elif section == 'refguide':
+                out += [' single: %s' % (', '.join(references))]
+            else:
+                out += [' %s: %s' % (section, ','.join(references))]
+        return out
+
+    def _str_references(self):
+        """Render the 'References' section below a rubric."""
+        out = []
+        if self['References']:
+            out += self._str_header('References')
+            if isinstance(self['References'], str):
+                # a bare string is stored back as a one-line list
+                self['References'] = [self['References']]
+            out.extend(self['References'])
+            out += ['']
+        return out
+
+    def __str__(self, indent=0, func_role="obj"):
+        """Render the docstring as reST, indented by ``indent`` spaces."""
+        out = []
+        out += self._str_signature()
+        out += self._str_index() + ['']
+        out += self._str_summary()
+        out += self._str_extended_summary()
+        for param_list in ('Parameters', 'Attributes', 'Methods',
+                           'Returns','Raises'):
+            out += self._str_param_list(param_list)
+        out += self._str_warnings()
+        out += self._str_see_also(func_role)
+        out += self._str_section('Notes')
+        out += self._str_references()
+        out += self._str_section('Examples')
+        out = self._str_indent(out,indent)
+        return '\n'.join(out)
+
+class SphinxFunctionDoc(SphinxDocString, FunctionDoc):
+    """FunctionDoc combined with the SphinxDocString rendering methods."""
+
+class SphinxClassDoc(SphinxDocString, ClassDoc):
+    """ClassDoc combined with the SphinxDocString rendering methods."""
+
+def get_doc_object(obj, what=None, doc=None):
+    """Build the Sphinx docstring object that fits ``obj``.
+
+    ``what`` may be 'class', 'function', 'method' or anything else; when
+    it is None it is derived from ``obj`` (class, module, callable, or
+    plain object, tested in that order).  ``doc`` overrides the
+    docstring that would otherwise be read from ``obj``.
+    """
+    if what is None:
+        what = 'object'
+        probes = ((inspect.isclass, 'class'),
+                  (inspect.ismodule, 'module'),
+                  (callable, 'function'))
+        for matches, label in probes:
+            if matches(obj):
+                what = label
+                break
+
+    if what == 'class':
+        return SphinxClassDoc(obj, '', func_doc=SphinxFunctionDoc, doc=doc)
+    if what in ('function', 'method'):
+        return SphinxFunctionDoc(obj, '', doc=doc)
+
+    if doc is None:
+        doc = pydoc.getdoc(obj)
+    return SphinxDocString(doc)
+
diff --git a/doc/sphinx/sphinxext/ipython_console_highlighting.py b/doc/sphinx/sphinxext/ipython_console_highlighting.py
new file mode 100644
index 0000000..217b779
--- /dev/null
+++ b/doc/sphinx/sphinxext/ipython_console_highlighting.py
@@ -0,0 +1,114 @@
+"""reST directive for syntax-highlighting ipython interactive sessions.
+
+XXX - See what improvements can be made based on the new (as of Sept 2009)
+'pycon' lexer for the python console. At the very least it will give better
+highlighted tracebacks.
+"""
+
+#-----------------------------------------------------------------------------
+# Needed modules
+
+# Standard library
+import re
+
+# Third party
+from pygments.lexer import Lexer, do_insertions
+from pygments.lexers.agile import (PythonConsoleLexer, PythonLexer,
+ PythonTracebackLexer)
+from pygments.token import Comment, Generic
+
+from sphinx import highlighting
+
+#-----------------------------------------------------------------------------
+# Global constants
+line_re = re.compile('.*?\n')
+
+#-----------------------------------------------------------------------------
+# Code begins - classes and functions
+
+class IPythonConsoleLexer(Lexer):
+    """
+    For IPython console output or doctests, such as:
+
+    .. sourcecode:: ipython
+
+      In [1]: a = 'foo'
+
+      In [2]: a
+      Out[2]: 'foo'
+
+      In [3]: print a
+      foo
+
+      In [4]: 1 / 0
+
+    Notes:
+
+      - Tracebacks are not currently supported.
+
+      - It assumes the default IPython prompts, not customized ones.
+    """
+
+    name = 'IPython console session'
+    aliases = ['ipython']
+    mimetypes = ['text/x-ipython-console']
+    # Raw strings keep the regex backslashes out of Python's own string
+    # escape processing; the patterns themselves are unchanged.
+    input_prompt = re.compile(r"(In \[[0-9]+\]: )|( \.\.\.+:)")
+    output_prompt = re.compile(r"(Out\[[0-9]+\]: )|( \.\.\.+:)")
+    continue_prompt = re.compile(r" \.\.\.+:")
+    tb_start = re.compile(r"\-+")
+
+    def get_tokens_unprocessed(self, text):
+        """Yield ``(index, token, value)`` triples for ``text``.
+
+        Prompt and comment lines are recorded as insertions while the
+        code that follows the prompts accumulates in ``curcode``.  Any
+        other line first flushes the accumulated code through
+        ``PythonLexer`` and is then yielded as ``Generic.Output``.
+        """
+        pylexer = PythonLexer(**self.options)
+
+        curcode = ''
+        insertions = []
+        for match in line_re.finditer(text):
+            line = match.group()
+            input_prompt = self.input_prompt.match(line)
+            continue_prompt = self.continue_prompt.match(line.rstrip())
+            output_prompt = self.output_prompt.match(line)
+            if line.startswith("#"):
+                insertions.append((len(curcode),
+                                   [(0, Comment, line)]))
+            elif input_prompt is not None:
+                insertions.append((len(curcode),
+                                   [(0, Generic.Prompt, input_prompt.group())]))
+                curcode += line[input_prompt.end():]
+            elif continue_prompt is not None:
+                insertions.append((len(curcode),
+                                   [(0, Generic.Prompt, continue_prompt.group())]))
+                curcode += line[continue_prompt.end():]
+            elif output_prompt is not None:
+                # Use the 'error' token for output.  We should probably make
+                # our own token, but error is typically in a bright color like
+                # red, so it works fine for our output prompts.
+                insertions.append((len(curcode),
+                                   [(0, Generic.Error, output_prompt.group())]))
+                curcode += line[output_prompt.end():]
+            else:
+                if curcode:
+                    for item in do_insertions(insertions,
+                                              pylexer.get_tokens_unprocessed(curcode)):
+                        yield item
+                    curcode = ''
+                    insertions = []
+                yield match.start(), Generic.Output, line
+        # flush code still pending after the last line
+        if curcode:
+            for item in do_insertions(insertions,
+                                      pylexer.get_tokens_unprocessed(curcode)):
+                yield item
+
+
+def setup(app):
+    """Setup as a sphinx extension."""
+    # Nothing is registered through ``app``: this module only provides a
+    # lexer, and adding it to pygments further down appears sufficient.
+    # If the right API usage is to register it via sphinx, by all means
+    # fix it here.  At least having this setup() hook suppresses the
+    # sphinx warning we'd get without it.
+    return None
+
+#-----------------------------------------------------------------------------
+# Register the extension as a valid pygments lexer
+highlighting.lexers['ipython'] = IPythonConsoleLexer()
diff --git a/doc/sphinx/sphinxext/numpydoc.py b/doc/sphinx/sphinxext/numpydoc.py
new file mode 100644
index 0000000..ff6c44c
--- /dev/null
+++ b/doc/sphinx/sphinxext/numpydoc.py
@@ -0,0 +1,116 @@
+"""
+========
+numpydoc
+========
+
+Sphinx extension that handles docstrings in the Numpy standard format. [1]
+
+It will:
+
+- Convert Parameters etc. sections to field lists.
+- Convert See Also section to a See also entry.
+- Renumber references.
+- Extract the signature from the docstring, if it can't be determined otherwise.
+
+.. [1] http://projects.scipy.org/scipy/numpy/wiki/CodingStyleGuidelines#docstring-standard
+
+"""
+
+import os, re, pydoc
+from docscrape_sphinx import get_doc_object, SphinxDocString
+import inspect
+
+def mangle_docstrings(app, what, name, obj, options, lines,
+                      reference_offset=[0]):
+    """Rewrite the docstring ``lines`` of ``obj`` in place.
+
+    For modules the top title block is stripped; for everything else
+    the docstring is re-rendered through ``get_doc_object``.  When
+    ``app.config.numpydoc_edit_link`` is set, an ``htmlonly`` block with
+    that link is appended.  Finally numeric reference labels
+    (``.. [1]`` / ``[1]_``) are shifted by the running offset so that
+    labels do not repeat across docstrings.
+
+    ``reference_offset`` is a one-element list.  The mutable default is
+    intentional: it keeps the running offset between calls.
+    """
+    if what == 'module':
+        # Strip top title
+        title_re = re.compile(r'^\s*[#*=]{4,}\n[a-z0-9 -]+\n[#*=]{4,}\s*',
+                              re.I|re.S)
+        lines[:] = title_re.sub('', "\n".join(lines)).split("\n")
+    else:
+        doc = get_doc_object(obj, what, "\n".join(lines))
+        lines[:] = str(doc).split("\n")
+
+    if app.config.numpydoc_edit_link and hasattr(obj, '__name__') and \
+           obj.__name__:
+        if hasattr(obj, '__module__'):
+            v = dict(full_name="%s.%s" % (obj.__module__, obj.__name__))
+        else:
+            v = dict(full_name=obj.__name__)
+        lines += ['', '.. htmlonly::', '']
+        lines += [' %s' % x for x in
+                  (app.config.numpydoc_edit_link % v).split("\n")]
+
+    # replace reference numbers so that there are no duplicates
+    references = []
+    for l in lines:
+        l = l.strip()
+        if l.startswith('.. ['):
+            try:
+                references.append(int(l[len('.. ['):l.index(']')]))
+            except ValueError:
+                print("WARNING: invalid reference in %s docstring" % name)
+
+    # Start renaming from the biggest number, otherwise we may
+    # overwrite references: going upwards, [1] -> [2] would be rewritten
+    # again when label 2 is processed.
+    references.sort(reverse=True)
+    if references:
+        for i, line in enumerate(lines):
+            for r in references:
+                new_r = reference_offset[0] + r
+                lines[i] = lines[i].replace('[%d]_' % r,
+                                            '[%d]_' % new_r)
+                lines[i] = lines[i].replace('.. [%d]' % r,
+                                            '.. [%d]' % new_r)
+
+    reference_offset[0] += len(references)
+
+def mangle_signature(app, what, name, obj, options, sig, retann):
+    """Take the signature of ``obj`` from its docstring when it has one.
+
+    Returns ``('', '')`` for classes whose ``__init__`` docstring holds
+    the text 'initializes x; see ', ``(signature, '')`` when the
+    docstring of ``obj`` starts with a signature, and None otherwise.
+    """
+    # Do not try to inspect classes that don't define `__init__`
+    if inspect.isclass(obj):
+        if 'initializes x; see ' in pydoc.getdoc(obj.__init__):
+            return '', ''
+
+    inspectable = callable(obj) or hasattr(obj, '__argspec_is_invalid_')
+    if not inspectable:
+        return None
+    if not hasattr(obj, '__doc__'):
+        return None
+
+    parsed = SphinxDocString(pydoc.getdoc(obj))
+    if not parsed['Signature']:
+        return None
+    # keep only the part from the opening parenthesis onwards
+    sig = re.sub("^[^(]*", "", parsed['Signature'])
+    return sig, ''
+
+def initialize(app):
+    """Hook ``mangle_signature`` into autodoc's signature processing.
+
+    When connecting to the 'autodoc-process-signature' event fails,
+    ``sphinx.ext.autodoc`` is monkeypatched instead.
+    """
+    try:
+        app.connect('autodoc-process-signature', mangle_signature)
+    except Exception:
+        # Only real errors trigger the fallback; KeyboardInterrupt and
+        # SystemExit still propagate.
+        monkeypatch_sphinx_ext_autodoc()
+
+def setup(app, get_doc_object_=get_doc_object):
+    """Register the numpydoc hooks and configuration value with Sphinx.
+
+    ``get_doc_object_`` replaces the module-level ``get_doc_object``
+    factory used by ``mangle_docstrings``.
+    """
+    global get_doc_object
+    get_doc_object = get_doc_object_
+
+    hooks = (('autodoc-process-docstring', mangle_docstrings),
+             ('builder-inited', initialize))
+    for event, handler in hooks:
+        app.connect(event, handler)
+    app.add_config_value('numpydoc_edit_link', None, True)
+
+#------------------------------------------------------------------------------
+# Monkeypatch sphinx.ext.autodoc to accept argspecless autodocs (Sphinx < 0.5)
+#------------------------------------------------------------------------------
+
+def monkeypatch_sphinx_ext_autodoc():
+    """Swap ``sphinx.ext.autodoc.format_signature`` for our own version.
+
+    The previous function is kept in the module-level
+    ``_original_format_signature``.  Nothing happens when the patch is
+    already in place.
+    """
+    global _original_format_signature
+    import sphinx.ext.autodoc
+
+    autodoc = sphinx.ext.autodoc
+    already_patched = autodoc.format_signature is our_format_signature
+    if not already_patched:
+        print("[numpydoc] Monkeypatching sphinx.ext.autodoc ...")
+        _original_format_signature = autodoc.format_signature
+        autodoc.format_signature = our_format_signature
+
+def our_format_signature(what, obj):
+    """Return the docstring signature of ``obj`` when one is found.
+
+    Falls back to the original ``format_signature`` when
+    ``mangle_signature`` returns None.
+    """
+    mangled = mangle_signature(None, what, None, obj, None, None, None)
+    if mangled is None:
+        return _original_format_signature(what, obj)
+    return mangled[0]
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8aa4c99
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+--extra-index-url https://pypi.python.org/simple/
+Cython>=0.24
+Sphinx>=1.2.0
+wheel>=0.24.0
+virtualenv>=1.11.0
+ipython<6.0
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..861a9f5
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,5 @@
+[egg_info]
+tag_build =
+tag_date = 0
+tag_svn_revision = 0
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..9e9b5b6
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,70 @@
+#! /usr/bin/env python
+#
+# Install script
+#
+#
+
+import sys
+import os
+import os.path
+import re
+import glob
+
+from os import path
+
+#
+# Add to the python path the directory containing the extensions
+# of distutils
+#
+
+
+# Package metadata; handed to setup() at the bottom of this script.
+PACKAGE = "OBITools"
+VERSION = "1.2.11"
+AUTHOR = 'Eric Coissac'
+EMAIL = 'eric at coissac.eu'
+URL = 'http://metabarcoding.org/obitools'
+LICENSE = 'CeCILL-V2'
+
+# SRC is passed to setup() as `python_src`; CSRC is not referenced again
+# in this script.
+SRC = 'src'
+CSRC = None
+
+# Relative entry, so the `obidistutils` imports below only resolve when
+# the script is run from the directory that contains distutils.ext/.
+sys.path.append('distutils.ext')
+
+if __name__=="__main__":
+
+ from obidistutils.serenity import serenity_mode
+
+ # NOTE(review): serenity_mode is defined in obidistutils.serenity (not
+ # shown here); its result is only forwarded to setup() as `serenity`.
+ serenity=serenity_mode(PACKAGE,VERSION)
+
+ # Imported after the serenity_mode() call above; CTOOLS, CEXES and FILES
+ # are not referenced again in this script.
+ from obidistutils.core import setup
+ from obidistutils.core import CTOOLS
+ from obidistutils.core import CEXES
+ from obidistutils.core import FILES
+
+ # Defined but not passed to setup() or used anywhere below.
+ DEPRECATED_SCRIPTS=["fastaComplement", "fastaUniq","fasta2tab","fastaAnnotate",
+ "fastaSample","fastaGrep","fastaCount","fastaLength",
+ "fastaHead","fastaTail","fastaSplit","fastaStrand",
+ "fastaLocate","solexaPairEnd","ecoTag","obijoinpairedend"
+ ]
+
+ # `python_src`, `sse` and `serenity` are keywords of obidistutils.core.setup
+ # -- presumably extensions over the standard distutils setup(); confirm
+ # in obidistutils/core.py.
+ setup(name=PACKAGE,
+ description="Scripts and library for sequence analysis",
+ classifiers=[
+ 'Development Status :: 5 - Production/Stable',
+ 'Environment :: Console',
+ 'Intended Audience :: Science/Research',
+ 'License :: Other/Proprietary License',
+ 'Operating System :: Unix',
+ 'Programming Language :: Python',
+ 'Programming Language :: Python :: 2',
+ 'Topic :: Scientific/Engineering :: Bio-Informatics',
+ 'Topic :: Utilities',
+ ],
+ version=VERSION,
+ author=AUTHOR,
+ author_email=EMAIL,
+ license=LICENSE,
+ url=URL,
+ python_src=SRC,
+ sse='sse2',
+ serenity=serenity)
diff --git a/src/OBITools.egg-info/PKG-INFO b/src/OBITools.egg-info/PKG-INFO
new file mode 100644
index 0000000..91758e5
--- /dev/null
+++ b/src/OBITools.egg-info/PKG-INFO
@@ -0,0 +1,19 @@
+Metadata-Version: 1.1
+Name: OBITools
+Version: 1.2.11
+Summary: Scripts and library for sequence analysis
+Home-page: http://metabarcoding.org/obitools
+Author: Eric Coissac
+Author-email: eric at coissac.eu
+License: CeCILL-V2
+Description: UNKNOWN
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: Other/Proprietary License
+Classifier: Operating System :: Unix
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2
+Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
+Classifier: Topic :: Utilities
diff --git a/src/OBITools.egg-info/SOURCES.txt b/src/OBITools.egg-info/SOURCES.txt
new file mode 100644
index 0000000..1d07409
--- /dev/null
+++ b/src/OBITools.egg-info/SOURCES.txt
@@ -0,0 +1,591 @@
+MANIFEST.in
+README.txt
+requirements.txt
+setup.py
+distutils.ext/obidistutils/__init__.py
+distutils.ext/obidistutils/core.py
+distutils.ext/obidistutils/dist.py
+distutils.ext/obidistutils/command/__init__.py
+distutils.ext/obidistutils/command/build.py
+distutils.ext/obidistutils/command/build_cexe.py
+distutils.ext/obidistutils/command/build_ctools.py
+distutils.ext/obidistutils/command/build_exe.py
+distutils.ext/obidistutils/command/build_ext.py
+distutils.ext/obidistutils/command/build_files.py
+distutils.ext/obidistutils/command/build_filters.py
+distutils.ext/obidistutils/command/build_scripts.py
+distutils.ext/obidistutils/command/build_sphinx.py
+distutils.ext/obidistutils/command/install.py
+distutils.ext/obidistutils/command/install_scripts.py
+distutils.ext/obidistutils/command/install_sphinx.py
+distutils.ext/obidistutils/command/littlebigman.py
+distutils.ext/obidistutils/command/pidname.py
+distutils.ext/obidistutils/command/sdist.py
+distutils.ext/obidistutils/serenity/__init__.py
+distutils.ext/obidistutils/serenity/checkpackage.py
+distutils.ext/obidistutils/serenity/checkpip.py
+distutils.ext/obidistutils/serenity/checkpython.py
+distutils.ext/obidistutils/serenity/checksystem.py
+distutils.ext/obidistutils/serenity/getcython.py
+distutils.ext/obidistutils/serenity/globals.py
+distutils.ext/obidistutils/serenity/rerun.py
+distutils.ext/obidistutils/serenity/snake.py
+distutils.ext/obidistutils/serenity/util.py
+distutils.ext/obidistutils/serenity/virtual.py
+distutils.ext/obidistutils/serenity/pip/__init__.py
+distutils.ext/obidistutils/serenity/pip/__main__.py
+distutils.ext/obidistutils/serenity/pip/basecommand.py
+distutils.ext/obidistutils/serenity/pip/baseparser.py
+distutils.ext/obidistutils/serenity/pip/cmdoptions.py
+distutils.ext/obidistutils/serenity/pip/download.py
+distutils.ext/obidistutils/serenity/pip/exceptions.py
+distutils.ext/obidistutils/serenity/pip/index.py
+distutils.ext/obidistutils/serenity/pip/locations.py
+distutils.ext/obidistutils/serenity/pip/log.py
+distutils.ext/obidistutils/serenity/pip/pep425tags.py
+distutils.ext/obidistutils/serenity/pip/req.py
+distutils.ext/obidistutils/serenity/pip/runner.py
+distutils.ext/obidistutils/serenity/pip/status_codes.py
+distutils.ext/obidistutils/serenity/pip/util.py
+distutils.ext/obidistutils/serenity/pip/wheel.py
+distutils.ext/obidistutils/serenity/pip/_vendor/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/pkg_resources.py
+distutils.ext/obidistutils/serenity/pip/_vendor/re-vendor.py
+distutils.ext/obidistutils/serenity/pip/_vendor/six.py
+distutils.ext/obidistutils/serenity/pip/_vendor/_markerlib/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/_markerlib/markers.py
+distutils.ext/obidistutils/serenity/pip/_vendor/colorama/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/colorama/ansi.py
+distutils.ext/obidistutils/serenity/pip/_vendor/colorama/ansitowin32.py
+distutils.ext/obidistutils/serenity/pip/_vendor/colorama/initialise.py
+distutils.ext/obidistutils/serenity/pip/_vendor/colorama/win32.py
+distutils.ext/obidistutils/serenity/pip/_vendor/colorama/winterm.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/compat.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/database.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/index.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/locators.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/manifest.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/markers.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/metadata.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/resources.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/scripts.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/util.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/version.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/wheel.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/_backport/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/_backport/misc.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/_backport/shutil.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/_backport/sysconfig.py
+distutils.ext/obidistutils/serenity/pip/_vendor/distlib/_backport/tarfile.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/constants.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/html5parser.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/ihatexml.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/inputstream.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/sanitizer.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/tokenizer.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/utils.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/filters/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/filters/_base.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/filters/alphabeticalattributes.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/filters/inject_meta_charset.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/filters/lint.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/filters/optionaltags.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/filters/sanitizer.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/filters/whitespace.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/serializer/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/serializer/htmlserializer.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treeadapters/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treeadapters/sax.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treebuilders/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treebuilders/_base.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treebuilders/dom.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treebuilders/etree.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treebuilders/etree_lxml.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treewalkers/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treewalkers/_base.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treewalkers/dom.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treewalkers/etree.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treewalkers/genshistream.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treewalkers/lxmletree.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/treewalkers/pulldom.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/trie/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/trie/_base.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/trie/datrie.py
+distutils.ext/obidistutils/serenity/pip/_vendor/html5lib/trie/py.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/adapters.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/api.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/auth.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/cacert.pem
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/certs.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/compat.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/cookies.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/exceptions.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/hooks.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/models.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/sessions.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/status_codes.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/structures.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/utils.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/big5freq.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/big5prober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/chardetect.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/chardistribution.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/charsetgroupprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/charsetprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/codingstatemachine.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/compat.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/constants.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/cp949prober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/escprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/escsm.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/eucjpprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/euckrfreq.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/euckrprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/euctwfreq.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/euctwprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/gb2312freq.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/gb2312prober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/hebrewprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/jisfreq.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/jpcntx.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/langbulgarianmodel.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/langcyrillicmodel.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/langgreekmodel.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/langhebrewmodel.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/langhungarianmodel.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/langthaimodel.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/latin1prober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/mbcharsetprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/mbcsgroupprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/mbcssm.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/sbcharsetprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/sbcsgroupprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/sjisprober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/universaldetector.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/chardet/utf8prober.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/_collections.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/connection.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/connectionpool.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/exceptions.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/fields.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/filepost.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/poolmanager.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/request.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/response.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/contrib/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/contrib/ntlmpool.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/contrib/pyopenssl.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/packages/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/packages/ordered_dict.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/packages/six.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/packages/ssl_match_hostname/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/packages/ssl_match_hostname/_implementation.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/util/__init__.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/util/connection.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/util/request.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/util/response.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/util/ssl_.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/util/timeout.py
+distutils.ext/obidistutils/serenity/pip/_vendor/requests/packages/urllib3/util/url.py
+distutils.ext/obidistutils/serenity/pip/backwardcompat/__init__.py
+distutils.ext/obidistutils/serenity/pip/commands/__init__.py
+distutils.ext/obidistutils/serenity/pip/commands/bundle.py
+distutils.ext/obidistutils/serenity/pip/commands/completion.py
+distutils.ext/obidistutils/serenity/pip/commands/freeze.py
+distutils.ext/obidistutils/serenity/pip/commands/help.py
+distutils.ext/obidistutils/serenity/pip/commands/install.py
+distutils.ext/obidistutils/serenity/pip/commands/list.py
+distutils.ext/obidistutils/serenity/pip/commands/search.py
+distutils.ext/obidistutils/serenity/pip/commands/show.py
+distutils.ext/obidistutils/serenity/pip/commands/uninstall.py
+distutils.ext/obidistutils/serenity/pip/commands/unzip.py
+distutils.ext/obidistutils/serenity/pip/commands/wheel.py
+distutils.ext/obidistutils/serenity/pip/commands/zip.py
+distutils.ext/obidistutils/serenity/pip/vcs/__init__.py
+distutils.ext/obidistutils/serenity/pip/vcs/bazaar.py
+distutils.ext/obidistutils/serenity/pip/vcs/git.py
+distutils.ext/obidistutils/serenity/pip/vcs/mercurial.py
+distutils.ext/obidistutils/serenity/pip/vcs/subversion.py
+distutils.ext/src/littlebigman.c
+distutils.ext/src/pidname.c
+doc/sphinx/Makefile
+doc/sphinx/make.bat
+doc/sphinx/source/annotations.rst
+doc/sphinx/source/attributes.rst
+doc/sphinx/source/barcodes.rst
+doc/sphinx/source/conf.py
+doc/sphinx/source/conversions.rst
+doc/sphinx/source/embl.rst
+doc/sphinx/source/fasta.rst
+doc/sphinx/source/fastq.rst
+doc/sphinx/source/filtering.rst
+doc/sphinx/source/formats.rst
+doc/sphinx/source/genbank.rst
+doc/sphinx/source/index.rst
+doc/sphinx/source/introduction.rst
+doc/sphinx/source/iupac.rst
+doc/sphinx/source/manipulations.rst
+doc/sphinx/source/obitaxonomy.rst
+doc/sphinx/source/scripts.rst
+doc/sphinx/source/statistics.rst
+doc/sphinx/source/taxdump.rst
+doc/sphinx/source/tutorials.rst
+doc/sphinx/source/utilities.rst
+doc/sphinx/source/welcome.rst
+doc/sphinx/source/wolves.rst
+doc/sphinx/source/attributes/ali_dir.rst
+doc/sphinx/source/attributes/ali_length.rst
+doc/sphinx/source/attributes/avg_quality.rst
+doc/sphinx/source/attributes/best_identity.rst
+doc/sphinx/source/attributes/best_match.rst
+doc/sphinx/source/attributes/class.rst
+doc/sphinx/source/attributes/cluster.rst
+doc/sphinx/source/attributes/complemented.rst
+doc/sphinx/source/attributes/count.rst
+doc/sphinx/source/attributes/cut.rst
+doc/sphinx/source/attributes/direction.rst
+doc/sphinx/source/attributes/distance.rst
+doc/sphinx/source/attributes/error.rst
+doc/sphinx/source/attributes/experiment.rst
+doc/sphinx/source/attributes/family.rst
+doc/sphinx/source/attributes/family_name.rst
+doc/sphinx/source/attributes/forward_error.rst
+doc/sphinx/source/attributes/forward_match.rst
+doc/sphinx/source/attributes/forward_primer.rst
+doc/sphinx/source/attributes/forward_score.rst
+doc/sphinx/source/attributes/forward_tag.rst
+doc/sphinx/source/attributes/forward_tm.rst
+doc/sphinx/source/attributes/genus.rst
+doc/sphinx/source/attributes/genus_name.rst
+doc/sphinx/source/attributes/head_quality.rst
+doc/sphinx/source/attributes/id_status.rst
+doc/sphinx/source/attributes/merged.rst
+doc/sphinx/source/attributes/merged_star.rst
+doc/sphinx/source/attributes/mid_quality.rst
+doc/sphinx/source/attributes/mode.rst
+doc/sphinx/source/attributes/obiclean_cluster.rst
+doc/sphinx/source/attributes/obiclean_count.rst
+doc/sphinx/source/attributes/obiclean_head.rst
+doc/sphinx/source/attributes/obiclean_headcount.rst
+doc/sphinx/source/attributes/obiclean_internalcount.rst
+doc/sphinx/source/attributes/obiclean_samplecount.rst
+doc/sphinx/source/attributes/obiclean_singletoncount.rst
+doc/sphinx/source/attributes/obiclean_status.rst
+doc/sphinx/source/attributes/occurrence.rst
+doc/sphinx/source/attributes/order.rst
+doc/sphinx/source/attributes/order_name.rst
+doc/sphinx/source/attributes/pairend_limit.rst
+doc/sphinx/source/attributes/partial.rst
+doc/sphinx/source/attributes/rank.rst
+doc/sphinx/source/attributes/reverse_error.rst
+doc/sphinx/source/attributes/reverse_match.rst
+doc/sphinx/source/attributes/reverse_primer.rst
+doc/sphinx/source/attributes/reverse_score.rst
+doc/sphinx/source/attributes/reverse_tag.rst
+doc/sphinx/source/attributes/reverse_tm.rst
+doc/sphinx/source/attributes/sample.rst
+doc/sphinx/source/attributes/scientific_name.rst
+doc/sphinx/source/attributes/score.rst
+doc/sphinx/source/attributes/score_norm.rst
+doc/sphinx/source/attributes/select.rst
+doc/sphinx/source/attributes/seq_a_deletion.rst
+doc/sphinx/source/attributes/seq_a_insertion.rst
+doc/sphinx/source/attributes/seq_a_mismatch.rst
+doc/sphinx/source/attributes/seq_a_single.rst
+doc/sphinx/source/attributes/seq_ab_match.rst
+doc/sphinx/source/attributes/seq_b_deletion.rst
+doc/sphinx/source/attributes/seq_b_insertion.rst
+doc/sphinx/source/attributes/seq_b_mismatch.rst
+doc/sphinx/source/attributes/seq_b_single.rst
+doc/sphinx/source/attributes/seq_length.rst
+doc/sphinx/source/attributes/seq_length_ori.rst
+doc/sphinx/source/attributes/seq_rank.rst
+doc/sphinx/source/attributes/sminL.rst
+doc/sphinx/source/attributes/sminR.rst
+doc/sphinx/source/attributes/species.rst
+doc/sphinx/source/attributes/species_list.rst
+doc/sphinx/source/attributes/species_name.rst
+doc/sphinx/source/attributes/status.rst
+doc/sphinx/source/attributes/strand.rst
+doc/sphinx/source/attributes/tail_quality.rst
+doc/sphinx/source/attributes/taxid.rst
+doc/sphinx/source/optionsSet/defaultoptions.txt
+doc/sphinx/source/optionsSet/inputformat.txt
+doc/sphinx/source/optionsSet/outputformat.txt
+doc/sphinx/source/optionsSet/sequenceEdit.txt
+doc/sphinx/source/optionsSet/sequenceFilter.txt
+doc/sphinx/source/optionsSet/taxonomyDB.txt
+doc/sphinx/source/optionsSet/taxonomyFilter.txt
+doc/sphinx/source/scripts/ecoPCR.rst
+doc/sphinx/source/scripts/ecoPrimers.rst
+doc/sphinx/source/scripts/ecodbtaxstat.rst
+doc/sphinx/source/scripts/ecofind.rst
+doc/sphinx/source/scripts/ecotag.rst
+doc/sphinx/source/scripts/ecotaxspecificity.rst
+doc/sphinx/source/scripts/ecotaxstat.rst
+doc/sphinx/source/scripts/illuminapairedend.rst
+doc/sphinx/source/scripts/ngsfilter.rst
+doc/sphinx/source/scripts/obiaddtaxids.rst
+doc/sphinx/source/scripts/obiannotate.rst
+doc/sphinx/source/scripts/obiclean.rst
+doc/sphinx/source/scripts/obicomplement.rst
+doc/sphinx/source/scripts/obiconvert.rst
+doc/sphinx/source/scripts/obicount.rst
+doc/sphinx/source/scripts/obicut.rst
+doc/sphinx/source/scripts/obidistribute.rst
+doc/sphinx/source/scripts/obiextract.rst
+doc/sphinx/source/scripts/obigrep.rst
+doc/sphinx/source/scripts/obihead.rst
+doc/sphinx/source/scripts/obijoinpairedend.rst
+doc/sphinx/source/scripts/obipr2.rst
+doc/sphinx/source/scripts/obisample.rst
+doc/sphinx/source/scripts/obiselect.rst
+doc/sphinx/source/scripts/obisilva.rst
+doc/sphinx/source/scripts/obisort.rst
+doc/sphinx/source/scripts/obisplit.rst
+doc/sphinx/source/scripts/obistat.rst
+doc/sphinx/source/scripts/obisubset.rst
+doc/sphinx/source/scripts/obitab.rst
+doc/sphinx/source/scripts/obitail.rst
+doc/sphinx/source/scripts/obitaxonomy.rst
+doc/sphinx/source/scripts/obiuniq.rst
+doc/sphinx/source/scripts/oligotag.rst
+doc/sphinx/sphinxext/apigen.py
+doc/sphinx/sphinxext/docscrape.py
+doc/sphinx/sphinxext/docscrape_sphinx.py
+doc/sphinx/sphinxext/ipython_console_highlighting.py
+doc/sphinx/sphinxext/numpydoc.py
+src/ali2consensus.py
+src/ecodbtaxstat.py
+src/ecotag.py
+src/ecotaxspecificity.py
+src/ecotaxstat.py
+src/extractreads.py
+src/extractreads2.py
+src/illuminapairedend.py
+src/ngsfilter.py
+src/obiaddtaxids.py
+src/obiannotate.py
+src/obiclean.py
+src/obicomplement.py
+src/obiconvert.py
+src/obicount.py
+src/obicut.py
+src/obidistribute.py
+src/obiextract.py
+src/obigrep.py
+src/obihead.py
+src/obijoinpairedend.py
+src/obipr2.py
+src/obisample.py
+src/obiselect.py
+src/obisilva.py
+src/obisort.py
+src/obisplit.py
+src/obistat.py
+src/obisubset.py
+src/obitab.py
+src/obitail.py
+src/obitaxonomy.py
+src/obiuniq.py
+src/oligotag.py
+src/OBITools.egg-info/PKG-INFO
+src/OBITools.egg-info/SOURCES.txt
+src/OBITools.egg-info/dependency_links.txt
+src/OBITools.egg-info/not-zip-safe
+src/OBITools.egg-info/requires.txt
+src/OBITools.egg-info/top_level.txt
+src/obitools/SVGdraw.py
+src/obitools/__init__.py
+src/obitools/_obitools.h
+src/obitools/_obitools.pxd
+src/obitools/_obitools.pyx
+src/obitools/collections.py
+src/obitools/decorator.py
+src/obitools/fast.py
+src/obitools/gzip.py
+src/obitools/sample.py
+src/obitools/solexaPairEnd.py
+src/obitools/svg.py
+src/obitools/version.py
+src/obitools/zipfile.py
+src/obitools/align/__init__.py
+src/obitools/align/_assemble.pxd
+src/obitools/align/_assemble.pyx
+src/obitools/align/_codonnws.pxd
+src/obitools/align/_codonnws.pyx
+src/obitools/align/_dynamic.pxd
+src/obitools/align/_dynamic.pyx
+src/obitools/align/_freeendgap.pxd
+src/obitools/align/_freeendgap.pyx
+src/obitools/align/_freeendgapfm.pxd
+src/obitools/align/_freeendgapfm.pyx
+src/obitools/align/_gprofilenws.pxd
+src/obitools/align/_gprofilenws.pyx
+src/obitools/align/_lcs.cfiles
+src/obitools/align/_lcs.ext.1.c
+src/obitools/align/_lcs.ext.2.c
+src/obitools/align/_lcs.ext.3.c
+src/obitools/align/_lcs.ext.4.c
+src/obitools/align/_lcs.h
+src/obitools/align/_lcs.pxd
+src/obitools/align/_lcs.pyx
+src/obitools/align/_lcs_fast.h
+src/obitools/align/_nws.pxd
+src/obitools/align/_nws.pyx
+src/obitools/align/_nwsdnabyprot.pxd
+src/obitools/align/_nwsdnabyprot.pyx
+src/obitools/align/_profilenws.pxd
+src/obitools/align/_profilenws.pyx
+src/obitools/align/_qsassemble.pyx
+src/obitools/align/_qsrassemble.pyx
+src/obitools/align/_rassemble.pxd
+src/obitools/align/_rassemble.pyx
+src/obitools/align/_sse.h
+src/obitools/align/_upperbond.cfiles
+src/obitools/align/_upperbond.ext.1.c
+src/obitools/align/_upperbond.h
+src/obitools/align/_upperbond.pxd
+src/obitools/align/_upperbond.pyx
+src/obitools/align/homopolymere.py
+src/obitools/align/ssearch.py
+src/obitools/alignment/__init__.py
+src/obitools/alignment/ace.py
+src/obitools/barcodecoverage/__init__.py
+src/obitools/barcodecoverage/calcBc.py
+src/obitools/barcodecoverage/drawBcTree.py
+src/obitools/barcodecoverage/findErrors.py
+src/obitools/barcodecoverage/readFiles.py
+src/obitools/barcodecoverage/writeBcTree.py
+src/obitools/blast/__init__.py
+src/obitools/carto/__init__.py
+src/obitools/distances/__init__.py
+src/obitools/distances/observed.py
+src/obitools/distances/phylip.py
+src/obitools/distances/r.py
+src/obitools/dnahash/__init__.py
+src/obitools/ecobarcode/__init__.py
+src/obitools/ecobarcode/databases.py
+src/obitools/ecobarcode/ecotag.py
+src/obitools/ecobarcode/options.py
+src/obitools/ecobarcode/rawdata.py
+src/obitools/ecobarcode/taxonomy.py
+src/obitools/ecopcr/__init__.py
+src/obitools/ecopcr/annotation.py
+src/obitools/ecopcr/options.py
+src/obitools/ecopcr/sequence.py
+src/obitools/ecopcr/taxonomy.py
+src/obitools/ecotag/__init__.py
+src/obitools/ecotag/parser.py
+src/obitools/eutils/__init__.py
+src/obitools/fasta/__init__.py
+src/obitools/fasta/_fasta.pxd
+src/obitools/fasta/_fasta.pyx
+src/obitools/fastq/__init__.py
+src/obitools/fastq/_fastq.pyx
+src/obitools/fnaqual/__init__.py
+src/obitools/fnaqual/fasta.py
+src/obitools/fnaqual/quality.py
+src/obitools/format/__init__.py
+src/obitools/format/_format.pyx
+src/obitools/format/options.py
+src/obitools/format/genericparser/__init__.py
+src/obitools/format/genericparser/_genericparser.pyx
+src/obitools/format/ontology/__init__.py
+src/obitools/format/ontology/go_obo.py
+src/obitools/format/sequence/__init__.py
+src/obitools/format/sequence/embl.py
+src/obitools/format/sequence/fasta.py
+src/obitools/format/sequence/fastq.py
+src/obitools/format/sequence/fnaqual.py
+src/obitools/format/sequence/genbank.py
+src/obitools/format/sequence/tagmatcher.py
+src/obitools/goa/__init__.py
+src/obitools/goa/parser.py
+src/obitools/graph/__init__.py
+src/obitools/graph/dag.py
+src/obitools/graph/rootedtree.py
+src/obitools/graph/tree.py
+src/obitools/graph/algorithms/__init__.py
+src/obitools/graph/algorithms/clique.py
+src/obitools/graph/algorithms/compact.py
+src/obitools/graph/algorithms/component.py
+src/obitools/graph/layout/__init__.py
+src/obitools/graph/layout/radialtree.py
+src/obitools/interactive/__init__.py
+src/obitools/location/__init__.py
+src/obitools/location/feature.py
+src/obitools/metabarcoding/__init__.py
+src/obitools/metabarcoding/options.py
+src/obitools/obischemas/__init__.py
+src/obitools/obischemas/options.py
+src/obitools/obischemas/kb/__init__.py
+src/obitools/obischemas/kb/extern.py
+src/obitools/obo/__init__.py
+src/obitools/obo/parser.py
+src/obitools/obo/go/__init__.py
+src/obitools/obo/go/parser.py
+src/obitools/options/__init__.py
+src/obitools/options/_bioseqfilter.pyx
+src/obitools/options/_options.pyx
+src/obitools/options/bioseqcutter.py
+src/obitools/options/bioseqedittag.py
+src/obitools/options/bioseqfilter.py
+src/obitools/options/taxonomyfilter.py
+src/obitools/parallel/__init__.py
+src/obitools/parallel/jobqueue.py
+src/obitools/phylogeny/__init__.py
+src/obitools/phylogeny/newick.py
+src/obitools/profile/__init__.py
+src/obitools/profile/_profile.pxd
+src/obitools/profile/_profile.pyx
+src/obitools/seqdb/__init__.py
+src/obitools/seqdb/dnaparser.py
+src/obitools/seqdb/blastdb/__init__.py
+src/obitools/seqdb/embl/__init__.py
+src/obitools/seqdb/embl/parser.py
+src/obitools/seqdb/genbank/__init__.py
+src/obitools/seqdb/genbank/ncbi.py
+src/obitools/seqdb/genbank/parser.py
+src/obitools/sequenceencoder/__init__.py
+src/obitools/solexa/__init__.py
+src/obitools/statistics/__init__.py
+src/obitools/statistics/hypergeometric.py
+src/obitools/statistics/noncentralhypergeo.py
+src/obitools/table/__init__.py
+src/obitools/table/csv.py
+src/obitools/tagmatcher/__init__.py
+src/obitools/tagmatcher/options.py
+src/obitools/tagmatcher/parser.py
+src/obitools/thermo/__init__.py
+src/obitools/tools/__init__.py
+src/obitools/tools/_solexapairend.pyx
+src/obitools/tools/solexapairend.py
+src/obitools/tree/__init__.py
+src/obitools/tree/dot.py
+src/obitools/tree/layout.py
+src/obitools/tree/newick.py
+src/obitools/tree/svg.py
+src/obitools/tree/unrooted.py
+src/obitools/unit/__init__.py
+src/obitools/unit/obitools/__init__.py
+src/obitools/utils/__init__.py
+src/obitools/utils/_utils.pxd
+src/obitools/utils/_utils.pyx
+src/obitools/utils/bioseq.py
+src/obitools/utils/crc64.py
+src/obitools/utils/iterator.py
+src/obitools/word/__init__.py
+src/obitools/word/_binary.pyx
+src/obitools/word/_readindex.cfiles
+src/obitools/word/_readindex.ext.1.c
+src/obitools/word/_readindex.h
+src/obitools/word/_readindex.pyx
+src/obitools/word/options.py
+src/obitools/word/predicate.py
\ No newline at end of file
diff --git a/src/OBITools.egg-info/dependency_links.txt b/src/OBITools.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/OBITools.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/src/OBITools.egg-info/not-zip-safe b/src/OBITools.egg-info/not-zip-safe
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/src/OBITools.egg-info/not-zip-safe
@@ -0,0 +1 @@
+
diff --git a/src/OBITools.egg-info/requires.txt b/src/OBITools.egg-info/requires.txt
new file mode 100644
index 0000000..791bd73
--- /dev/null
+++ b/src/OBITools.egg-info/requires.txt
@@ -0,0 +1,5 @@
+Cython>=0.24
+Sphinx>=1.2.0
+wheel>=0.24.0
+virtualenv>=1.11.0
+ipython<6.0
diff --git a/src/OBITools.egg-info/top_level.txt b/src/OBITools.egg-info/top_level.txt
new file mode 100644
index 0000000..7ae40d8
--- /dev/null
+++ b/src/OBITools.egg-info/top_level.txt
@@ -0,0 +1 @@
+obitools
diff --git a/src/ali2consensus.py b/src/ali2consensus.py
new file mode 100644
index 0000000..e6438e6
--- /dev/null
+++ b/src/ali2consensus.py
@@ -0,0 +1,111 @@
+#!/usr/local/bin/python
+
+'''
+Created on 30 sept. 2011
+
+ at author: fboyer
+
+Used to get the consensus sequence of a nucleotide fasta alignment.
+
+Example:
+
+ali2consensus.py -t 75 myFastaAlignedSequences.fasta
+
+ at todo: Check input/output format options to suite with the script objective
+'''
+
+
+from obitools.fasta import fastFastaIterator
+from obitools.options import getOptionManager
+from obitools.alignment import Alignment, columnIterator
+from obitools import NucSequence
+from obitools.format.options import sequenceWriterGenerator, addInOutputOption
+
+def addAliOptions(optionManager):
+ optionManager.add_option('-t','--threshold',
+ action="store", dest="threshold",
+ metavar="",
+ type="int",
+ default=50,
+ help="Threshold parameter for consensus building")
+
+
+
+if __name__=='__main__':
+
+ optionParser = getOptionManager([addInOutputOption, addAliOptions],
+ entryIterator=fastFastaIterator
+ )
+
+ (options, entries) = optionParser()
+
+ assert options.threshold>=0 and options.threshold<=100, 'Threshold must belong to [0, 100]'
+ threshold = options.threshold/100.
+
+
+ #taken from http://www.dna.affrc.go.jp/misc/MPsrch/InfoIUPAC.html
+ iupacDNA = dict()
+ iupacDNA['-'] = ('-',)
+ iupacDNA['A'] = ('A',)
+ iupacDNA['C'] = ('C',)
+ iupacDNA['G'] = ('G',)
+ iupacDNA['T'] = ('T',)
+ iupacDNA['U'] = ('T',)
+ iupacDNA['M'] = ('A', 'C')
+ iupacDNA['R'] = ('A','G')
+ iupacDNA['W'] = ('A', 'T')
+ iupacDNA['S'] = ('C', 'G')
+ iupacDNA['Y'] = ('C', 'T')
+ iupacDNA['K'] = ('G', 'T')
+ iupacDNA['V'] = ('A', 'C', 'G')
+ iupacDNA['H'] = ('A', 'C', 'T')
+ iupacDNA['D'] = ('A', 'G', 'T')
+ iupacDNA['B'] = ('C', 'G', 'T')
+ iupacDNA['N'] = ('A', 'C', 'G', 'T')
+
+ reverse_iupacDNA = dict(map(lambda x : (x[1],x[0]), iupacDNA.items()))
+
+ alignedSequences = Alignment(entries)
+
+ consensusNtSeq = ""
+ def addCountInCol(t, columnCount):
+ lt = float(len(t))
+ for x in t:
+ columnCount[x]+= 1/lt
+
+ def cmpTuple(t1,t2):
+ return cmp(t1[1],t2[1])
+
+ thresholdCount = threshold*len(alignedSequences)
+ for c in columnIterator(alignedSequences):
+ colC = {'A':0., 'C':0., 'G':0., 'T':0., '-':0.}
+ map(lambda t: addCountInCol(t, colC), map(lambda nt: iupacDNA[nt.upper()], c))
+
+
+ counts = colC.items()
+ counts.sort(cmpTuple, reverse=True)
+
+ sumCounts = 0
+ symbols = list()
+ for nt, count in counts:
+ sumCounts += count
+ symbols.append(nt)
+
+ if sumCounts>=thresholdCount:
+ symbols.sort()
+ t = tuple(symbols)
+ try:
+ consensusNtSeq += reverse_iupacDNA[t]
+ except:
+ consensusNtSeq += '?'
+ finally:
+ break
+
+ consensusSeq = NucSequence('Consensus_%d'%(int(threshold*100,)),
+ consensusNtSeq,
+ 'Consensus sequence done on %d aligned sequences of length %d with a threshold of %d %%'%(len(alignedSequences),
+ len(alignedSequences[0]),
+ int(threshold*100)))
+ writer = sequenceWriterGenerator(options)
+ consensusSeq
+ writer(consensusSeq)
diff --git a/src/ecodbtaxstat.py b/src/ecodbtaxstat.py
new file mode 100644
index 0000000..7600fee
--- /dev/null
+++ b/src/ecodbtaxstat.py
@@ -0,0 +1,76 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`ecodbtaxstat`: gives taxonomic rank frequency of a given ``ecopcr`` database
+=====================================================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+The :py:mod:`ecodbtaxstat` command requires an ``ecopcr`` database and a taxonomic rank
+(specified by the ``--rank`` option, default *species*). The command outputs first
+the total number of sequence records in the database having taxonomic information at this rank,
+and then the number of sequence records for each value of this rank.
+
+'''
+
+from obitools.options import getOptionManager
+
+from obitools.options.taxonomyfilter import addTaxonomyFilterOptions, \
+ taxonomyFilterIteratorGenerator
+
+from obitools.ecopcr.taxonomy import EcoTaxonomyDB
+from obitools.ecopcr.sequence import EcoPCRDBSequenceIterator
+
+def addRankOptions(optionManager):
+    '''
+    Register the ``--rank`` option in its own option group.
+
+    @param optionManager: option manager receiving the new group; it must
+                          provide ``add_option_group``
+    '''
+    rank_help = ("The taxonomic rank at which frequencies have to be computed. "
+                 "Possible values are: "
+                 "class, family, forma, genus, infraclass, infraorder, kingdom, "
+                 "order, parvorder, phylum, species, species group, "
+                 "species subgroup, subclass, subfamily, subgenus, subkingdom, "
+                 "suborder, subphylum, subspecies, subtribe, superclass, "
+                 "superfamily, superkingdom, superorder, superphylum, tribe or varietas. "
+                 "(Default: species)")
+
+    rank_settings = dict(action="store",
+                         dest="rank",
+                         metavar="<taxonomic rank>",
+                         type="string",
+                         default="species",
+                         help=rank_help)
+
+    rank_group = optionManager.add_option_group('ecodbtaxstat specific option')
+    rank_group.add_option('--rank', **rank_settings)
+
+
+def cmptax(taxonomy):
+    '''
+    Build a ``cmp``-style comparison function ordering taxa by scientific name.
+
+    @param taxonomy: object providing ``getScientificName(taxid)``
+    @return: a function ``f(t1, t2)`` returning the ``cmp`` of both
+             scientific names
+    '''
+    def cmptaxon(t1,t2):
+        left = taxonomy.getScientificName(t1)
+        right = taxonomy.getScientificName(t2)
+        return cmp(left, right)
+
+    return cmptaxon
+
+# Script entry point: count, for every taxon found at the rank given by
+# --rank, the sequence records of the ecoPCR database that pass the
+# taxonomy filter, then print the totals and one line per taxon.
+if __name__=='__main__':
+
+ optionParser = getOptionManager([addRankOptions,addTaxonomyFilterOptions], progdoc=__doc__)
+
+
+ (options, entries) = optionParser()
+
+
+ # NOTE(review): this local name shadows the builtin ``filter``.
+ filter = taxonomyFilterIteratorGenerator(options)
+ # NOTE(review): options.ecodb and options.taxonomy are not assigned in this
+ # file; presumably they are set by the taxonomy filter options -- confirm.
+ seqdb = EcoPCRDBSequenceIterator(options.ecodb,options.taxonomy)
+
+ # stats maps a taxon (as returned by getTaxonAtRank) to its record count.
+ # tot counts every filtered record, i only those having a taxon at the rank.
+ stats = {}
+ i=0
+ tot=0
+ for seq in filter(seqdb):
+ tot+=1
+ t = options.taxonomy.getTaxonAtRank(seq['taxid'],options.rank)
+ if t is not None:
+ i+=1
+ stats[t]=stats.get(t,0)+1
+
+ print "#sequence count : %d" % tot
+ print "#considered sequences : %d" % i
+ print "# %s : %d" % (options.rank,len(stats))
+ # Report the taxa sorted by scientific name (Python 2 cmp-style sort).
+ taxons = stats.keys()
+ taxons.sort(cmptax(options.taxonomy))
+
+ for t in taxons:
+ print "%s\t%d" % (options.taxonomy.getScientificName(t),stats[t])
+
\ No newline at end of file
diff --git a/src/ecotag.py b/src/ecotag.py
new file mode 100755
index 0000000..5dacfe6
--- /dev/null
+++ b/src/ecotag.py
@@ -0,0 +1,460 @@
+#!/usr/local/OBITools-1.1.22/bin/python
+'''
+:py:mod:`ecotag`: assigns sequences to taxa
+===========================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`ecotag` is the tool that assigns sequences to a taxon based on
+sequence similarity. The program first searches the reference database for the
+reference sequence(s) (hereafter referred to as 'primary reference sequence(s)') showing the
+highest similarity with the query sequence. Then it looks for all other reference
+sequences (hereafter referred to as 'secondary reference sequences') whose
+similarity with the primary reference sequence(s) is equal or higher than the
+similarity between the primary reference and the query sequences. Finally, it
+assigns the query sequence to the most recent common ancestor of the primary and
+secondary reference sequences.
+
+As input, `ecotag` requires the sequences to be assigned, a reference database
+in :doc:`fasta <../fasta>` format, where each sequence is associated with a taxon identified
+by a unique *taxid*, and a taxonomy database where taxonomic information is stored
+for each *taxid*.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > ecotag -d embl_r113 -R ReferenceDB.fasta \\
+ --sort=count -m 0.95 -r seq.fasta > seq_tag.fasta
+
+ The above command specifies that each sequence stored in ``seq.fasta``
+ is compared to those in the reference database called ``ReferenceDB.fasta``
+ for taxonomic assignment. In the output file ``seq_tag.fasta``, the sequences
+ are sorted from highest to lowest counts. When there is no reference sequence
+ with a similarity equal or higher than 0.95 for a given sequence, no taxonomic
+ information is provided for this sequence in ``seq_tag.fasta``.
+
+'''
+
+from obitools.fasta import fastaNucIterator
+#from obitools.align.ssearch import ssearchIterator
+from obitools.utils.bioseq import uniqSequence,sortSequence
+
+from obitools.align import lenlcs,ALILEN
+
+from obitools.options.taxonomyfilter import addTaxonomyDBOptions,loadTaxonomyDatabase
+from obitools.options import getOptionManager
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+
+from collections import OrderedDict
+
+import sys
+import math
+import os.path
+
+
+def addSearchOptions(optionManager):
+    '''
+    Register the command line options specific to :py:mod:`ecotag`.
+
+    The commented-out options are kept in place as a record of options
+    that are currently disabled.
+
+    @param optionManager: option manager receiving the options; it must
+                          provide an optparse-like ``add_option`` method
+    '''
+
+    optionManager.add_option('-R','--ref-database',
+                             action="store", dest="database",
+                             metavar="<FILENAME>",
+                             type="string",
+                             help="fasta file containing reference "
+                                  "sequences")
+
+# optionManager.add_option('-s','--shape',
+# action="store", dest="shape",
+# metavar="shapeness",
+# type="float",
+# default=2.0,
+# help="selectivity on the ssearch results "
+# "1.0 is the higher selectivity. "
+# "values > 1.0 decrease selectivity.")
+
+    optionManager.add_option('-m','--minimum-identity',
+                             action="store", dest="minimum",
+                             metavar="identity",
+                             type="float",
+                             default=0.0,
+                             help="minimum identity to consider.")
+
+    optionManager.add_option('--minimum-circle',
+                             action="store", dest="circle",
+                             metavar="identity",
+                             type="float",
+                             default=1.0,
+                             help="minimum identity considered for the assignment circle.")
+
+# optionManager.add_option('-S','--normalized-smallest',
+# action="store_false", dest="large",
+# default=True,
+# help="normalize identity over the shortest sequence")
+#
+# optionManager.add_option('-L','--normalized-largest',
+# action="store_true", dest="large",
+# default=True,
+# help="normalize identity over the longest sequence")
+
+    optionManager.add_option('-x','--explain',
+                             action='store',dest='explain',
+                             type="string",
+                             default=None,
+                             help="Add in the output CD (complementary data) record "
+                                  "to explain identification decision")
+
+    optionManager.add_option('-u','--uniq',
+                             action='store_true',dest='uniq',
+                             default=False,
+                             help='Apply a uniq filter on query sequences before identification')
+
+# optionManager.add_option('-T','--table',
+# action='store_true',dest='table',
+# default=False,
+# help='Write results in a tabular format')
+
+# optionManager.add_option('--store-in-db',
+# action='store_true',dest='storeindb',
+# default=False,
+# help='Write results in an ecobarcode DB')
+#
+# optionManager.add_option('--update-db',
+# action='store_true',dest='updatedb',
+# default=False,
+# help='Run identification only on new sequences')
+
+    optionManager.add_option('--sort',
+                             action='store',dest='sort',
+                             type='string',
+                             default=None,
+                             help='Sort output on input sequence tag')
+
+    # The help text used to point at a '-S' option, which is not registered
+    # here: sorting is requested with '--sort'.
+    optionManager.add_option('-r','--reverse',
+                             action='store_true',dest='reverse',
+                             default=False,
+                             help='Sort in reverse order (should be used with --sort)')
+
+# optionManager.add_option('-o','--output-sequence',
+# action='store_true',dest='sequence',
+# default=False,
+# help='Add an extra column in the output with the query sequence')
+#
+# optionManager.add_option('--self-matches',
+# action='store_true',dest='selfmatches',
+# default=False,
+# help='Switch to the new match algorithm')
+
+    optionManager.add_option('-E','--errors',
+                             action='store',dest='error',
+                             type='float',
+                             default=0.0,
+                             help='Tolerated rate of wrong assignation')
+
+    optionManager.add_option('-M','--min-matches',
+                             action='store',dest='minmatches',
+                             type="int",
+                             default=1,
+                             help='Minimum congruent assignation')
+
+    optionManager.add_option('--cache-size',
+                             action='store',dest='cache',
+                             type='int',
+                             metavar='<SIZE>',
+                             default=1000000,
+                             help='Cache size for the aligment score')
+
+
+def count(data):
+    '''
+    Tally the items of *data* by key.
+
+    A list or tuple item with at least two elements contributes its second
+    element to a list stored under its first element. A one-element
+    list or tuple, or any other item, increments an integer counter.
+
+    @param data: iterable of keys, or of lists/tuples starting with a key
+    @return: a dictionary mapping each key to its counter or value list
+    '''
+    tally = {}
+    for item in data:
+        paired = isinstance(item, (list,tuple))
+        key = item[0] if paired else item
+        if paired and len(item) > 1:
+            increment, start = [item[1]], []
+        else:
+            increment, start = 1, 0
+        tally[key] = tally.get(key, start) + increment
+    return tally
+
+
+def myLenlcs(s1, s2, minid, normalized, reference):
+ '''
+ Compute the LCS score between a query *s1* and a reference *s2*.
+
+ When *s1* carries a ``pairend_limit`` key, both sequences are cut in a
+ 5' and a 3' fragment that are aligned separately with ``lenlcs``; the
+ returned score is then always ``raw_lcs / lali`` and the *normalized*
+ and *reference* arguments are not used. Otherwise the call is forwarded
+ unchanged to ``lenlcs``.
+
+ @return: the ``(lcs, lali)`` pair
+ '''
+
+ if s1.hasKey('pairend_limit') :
+
+ # NOTE(review): min(0, ...) is never positive, so this value is 0 or the
+ # negative length difference; it is subtracted from both sums below.
+ # Confirm that min (and not max) is intended.
+ overlap = min(0,len(s1) - len(s2))
+
+ # 5' and 3' fragments of the query, split at its pairend_limit
+ f5P1 = s1[0:s1['pairend_limit']]
+ f3P1 = s1[s1['pairend_limit']:]
+
+ # 5' fragment of the reference uses the query's limit
+ f5P2 = s2[0:s1['pairend_limit']]
+
+ # 3' fragment of the reference: its last len(f3P1) positions, or all of
+ # s2 when s2 is shorter than that
+ from2 = len(s2) - min(len(s2),len(f3P1))
+ f3P2 = s2[from2:]
+
+ # the error budget derived from minid is applied to each fragment
+ errors = int(math.ceil((1-minid) * len(s1)))
+ minid5P = max(len(f5P1),len(f5P2)) - errors
+ minid3P = max(len(f3P1),len(f3P2)) - errors
+
+ lcs5P, lali5P = lenlcs(f5P1,f5P2,minid5P,False)
+ lcs3P, lali3P = lenlcs(f3P1,f3P2,minid3P,False)
+
+ raw_lcs = lcs5P + lcs3P - overlap
+ lali = lali5P + lali3P - overlap
+ # NOTE(review): raises ZeroDivisionError if lali is 0.
+ lcs = raw_lcs / float(lali)
+
+ else:
+ lcs, lali = lenlcs(s1,s2,minid,normalized,reference)
+
+ return lcs, lali
+
+
+def cachedLenLCS(s1,s2,minid,normalized,reference):
+    '''
+    Return ``lenlcs(s1, s2, ...)`` through a bounded cache.
+
+    The cache key is the unordered pair of sequence ids, so the result does
+    not depend on the argument order and *minid*, *normalized* and
+    *reference* are not part of the key. A hit moves the entry to the most
+    recent end of the ordered cache; once the cache holds more than
+    ``__CACHE_SIZE__`` entries the oldest one is dropped. The hit and miss
+    counters ``__INCache__`` and ``__OUTCache__`` are updated.
+
+    @return: the value returned by ``lenlcs`` for this pair
+    '''
+    global __LCSCache__
+    global __INCache__
+    global __OUTCache__
+    global __CACHE_SIZE__
+
+    key = frozenset((s1.id,s2.id))
+
+    try:
+        # removing the entry lets the insertion below refresh its position
+        rep = __LCSCache__.pop(key)
+    except KeyError:
+        rep = lenlcs(s1,s2,minid,normalized,reference)
+        __OUTCache__ += 1.0
+    else:
+        __INCache__ += 1.0
+
+    __LCSCache__[key] = rep
+
+    if len(__LCSCache__) > __CACHE_SIZE__:
+        __LCSCache__.popitem(last=False)
+
+    return rep
+
+
+
+#def lcsIterator(entries,db,options):
+#
+# for seq in entries:
+# results = []
+# maxid = (None,0.0)
+# minid = options.minimum
+# for d in db:
+# lcs,lali = myLenlcs(seq, d, minid,normalized=True,reference=ALILEN)
+# if lcs > maxid[1]:
+# maxid = (d,lcs)
+# minid = maxid[1] ** options.shape
+# results.append((d,lcs))
+# minid = maxid[1] ** options.shape
+# results = [x for x in results if x[1]>=minid]
+# yield seq,([maxid[0]],maxid[1]),results
+
+def mostPreciseTaxid(taxidlist, options):
+    '''
+    Reduce a collection of taxids to a single taxid.
+
+    Taxids not greater than 1 are ignored; if nothing remains, 1 is
+    returned. Taxids are then merged two by two: when one is an ancestor of
+    the other the descendant is kept, otherwise both are replaced by their
+    last common taxon.
+
+    @param taxidlist: iterable of integer taxids
+    @param options: object whose ``taxonomy`` attribute provides
+                    ``isAncestor`` and ``lastCommonTaxon``
+    @return: the resulting taxid
+    '''
+    pending = set(t for t in taxidlist if t > 1) or set([1])
+
+    while len(pending) > 1:
+        first = pending.pop()
+        second = pending.pop()
+        if options.taxonomy.isAncestor(first,second):
+            merged = second
+        elif options.taxonomy.isAncestor(second,first):
+            merged = first
+        else:
+            merged = options.taxonomy.lastCommonTaxon(first,second)
+        pending.add(merged)
+
+    return pending.pop()
+
+def lcsIteratorSelf(entries,db,options):
+    '''
+    Yield, for each query sequence, its best references and their circle.
+
+    A first pass over *db* collects the reference sequence(s) having the
+    highest identity with the query, provided this identity is strictly
+    greater than ``options.minimum``. A second pass adds every reference
+    whose identity with one of these best references is at least the best
+    identity (capped to ``options.circle``).
+
+    @param entries: iterable of query sequences
+    @param db: reference sequences; scanned several times per query, so it
+               must be re-iterable
+    @param options: object providing ``minimum`` and ``circle``
+    @return: a generator of ``(seq, maxid, results)`` where ``maxid`` is
+             ``(list of best references, identity)`` and ``results`` is a
+             list of ``(reference, identity)`` pairs; both lists are empty
+             when no reference passes the threshold
+    '''
+
+    for seq in entries:
+        results = []
+        maxid = ([],0.0)
+        minid = options.minimum
+        for d in db:
+            lcs,lali = myLenlcs(seq,d,minid,normalized=True,reference=ALILEN)  # @UnusedVariable
+            if lcs > maxid[1] and lcs > options.minimum:
+                maxid = ([d],lcs)
+                # only better or equal references matter from now on
+                minid = maxid[1]
+            elif maxid[0] and lcs==maxid[1]:
+                # A tie only counts against an already accepted best match.
+                # Without the maxid[0] guard a score of exactly 0.0 matched
+                # the initial ([],0.0) state and was accepted although it
+                # never passed the options.minimum test above.
+                maxid[0].append(d)
+
+        if maxid[0]:
+            if maxid[1] > options.circle:
+                maxid=(maxid[0],options.circle)
+            results.extend([(s,maxid[1]) for s in maxid[0]])
+            for d in db:
+                for s in maxid[0]:
+                    if d.id != s.id:
+                        lcs,lali = cachedLenLCS(s,d,maxid[1],normalized=True,reference=ALILEN)  # @UnusedVariable
+                        if lcs >= maxid[1]:
+                            results.append((d,lcs))
+
+        yield seq,maxid,results
+
+# Script entry point: load the taxonomy and the reference database, search
+# the best references of every query sequence, annotate the query with the
+# resulting taxonomic assignment and write it.
+if __name__=='__main__':
+
+ # Module level state used by cachedLenLCS; both counters start at 1.0.
+ __LCSCache__=OrderedDict()
+ __INCache__=1.0
+ __OUTCache__=1.0
+
+
+ optionParser = getOptionManager([addSearchOptions,addTaxonomyDBOptions,addInOutputOption],progdoc=__doc__)
+
+ (options, entries) = optionParser()
+
+ __CACHE_SIZE__=options.cache
+
+ # the cache never holds fewer than 10 entries
+ if __CACHE_SIZE__ < 10:
+ __CACHE_SIZE__=10
+
+ taxonomy = loadTaxonomyDatabase(options)
+ writer = sequenceWriterGenerator(options)
+
+ print >>sys.stderr,"Reading reference DB ...",
+# if (hasattr(options, 'ecobarcodedb') and options.ecobarcodedb is not None):
+# try:
+# db = list(fastaNucIterator(options.database))
+# except IOError:
+# db = list(referenceDBIterator(options))
+# if options.primer is not None:
+# entries = sequenceIterator(options)
+# else:
+
+ # The whole reference database is kept in memory; dbname (file name
+ # without directory and extension) keys the per-database tags below.
+ db = list(fastaNucIterator(options.database))
+ dbname=os.path.splitext(os.path.basename(options.database))[0]
+
+ print >>sys.stderr," : %d" % len(db)
+
+ taxonlink = {}
+
+ # NOTE(review): rankid is not used anywhere else in this script.
+ rankid = taxonomy.findRankByName(options.explain)
+
+ # Reference ids are truncated to 46 characters and must stay unique;
+ # taxonlink maps each truncated id to the integer taxid of the reference.
+ for seq in db:
+ seqid = seq.id[0:46]
+ seq.id=seqid
+ assert seqid not in taxonlink
+ taxonlink[seqid]=int(seq['taxid'])
+
+
+ if options.uniq:
+ entries = uniqSequence(entries)
+
+ if options.sort is not None:
+ entries = sortSequence(entries, options.sort, options.reverse)
+
+# matcher = lcsIterator
+#
+# if options.selfmatches:
+# matcher= lcsIteratorSelf
+
+ search = lcsIteratorSelf(entries,db,options)
+
+ print >>sys.stderr,'\nCache size : %d\n' % __CACHE_SIZE__
+
+
+ for seq,best,match in search:
+ try:
+ seqcount = seq['count']
+ except KeyError:
+ seqcount=1
+
+ if best[0]:
+ taxlist = set(taxonlink[p[0].id] for p in match)
+ # The error tolerance is only applied when enough references matched.
+ # NOTE(review): options.error == 1.0 makes this division fail.
+ if options.error > 0.0 and len(match) >= int(options.minmatches / (1.0 - options.error)):
+ lca = taxonomy.betterCommonTaxon(options.error,
+ *tuple(taxlist))
+ else:
+ lca = taxonomy.betterCommonTaxon(0.0,*tuple(taxlist))
+
+ scname = taxonomy.getScientificName(lca)
+ rank = taxonomy.getRank(lca)
+ # the species list is only reported for fewer than 15 distinct taxids
+ if len(taxlist) < 15:
+ species_list = set(taxonomy.getSpecies(t) for t in taxlist)
+ species_list = [taxonomy.getScientificName(t) for t in species_list if t is not None]
+ else:
+ species_list = []
+
+
+ worst = min(x[1] for x in match)
+
+ # data layout: 0 status, 1 query id, 2 best reference id, 3 best identity,
+ # 4 worst identity, 5 unused, 6 count, 7 match count, 8 taxid,
+ # 9 scientific name, 10 rank
+ data =['ID',seq.id,best[0][0].id,best[1],worst,'NA',seqcount,len(match),lca,scname,rank]
+ else:
+ data =['UK',seq.id,'NA','NA','NA','NA',seqcount,0,1,'root','no rank']
+
+ # NOTE(review): none of the tag dictionaries below is assigned back to
+ # seq; this presumably relies on seq.get() storing the default it
+ # returns -- confirm, otherwise tags of new keys are lost.
+ tag = seq.get('id_status',{})
+ tag[dbname]=data[0]=='ID'
+
+ seq['count']=data[6]
+
+ tag = seq.get('match_count',{})
+ tag[dbname]=data[7]
+
+ # the global taxid is derived from the taxids recorded for every database
+ tag = seq.get('taxid_by_db',{})
+ tag[dbname]=data[8]
+ seq['taxid'] = mostPreciseTaxid(tag.values(), options)
+
+ # NOTE(review): options.taxonomy is not assigned in this file; presumably
+ # set by loadTaxonomyDatabase(options) -- confirm.
+ tag = seq.get('scientific_name_by_db',{})
+ tag[dbname]=data[9]
+ seq['scientific_name']=options.taxonomy.getScientificName(seq['taxid'])
+
+ tag = seq.get('rank_by_db',{})
+ tag[dbname]=data[10]
+ seq['rank']=options.taxonomy.getRank(seq['taxid'])
+
+
+ if data[0]=='ID':
+ tag = seq.get('best_match',{})
+ tag[dbname]=data[2]
+
+ tag = seq.get('best_identity',{})
+ tag[dbname]=data[3]
+
+ tag = seq.get('species_list',{})
+ tag[dbname]=species_list
+
+ if options.explain is not None:
+ tag = seq.get('explain',{})
+ tag[dbname]=dict((s[0].id,s[1]) for s in match)
+
+
+
+ # order, family, genus and species of the final taxid, each with its
+ # scientific name (None when the taxid has no taxon at that rank)
+ seq['order']=options.taxonomy.getOrder(seq['taxid'])
+ if seq['order']:
+ seq['order_name']=options.taxonomy.getScientificName(seq['order'])
+ else:
+ seq['order_name']=None
+
+ seq['family']=options.taxonomy.getFamily(seq['taxid'])
+ if seq['family']:
+ seq['family_name']=options.taxonomy.getScientificName(seq['family'])
+ else:
+ seq['family_name']=None
+
+ seq['genus']=options.taxonomy.getGenus(seq['taxid'])
+ if seq['genus']:
+ seq['genus_name']=options.taxonomy.getScientificName(seq['genus'])
+ else:
+ seq['genus_name']=None
+
+ seq['species']=options.taxonomy.getSpecies(seq['taxid'])
+ if seq['species']:
+ seq['species_name']=options.taxonomy.getScientificName(seq['species'])
+ else:
+ seq['species_name']=None
+
+
+ writer(seq)
+ # the counters start at 1.0, so this ratio is defined even without any alignment
+ print >>sys.stderr,'\n%5.3f%% of the alignments was cached' % (__INCache__/(__INCache__+__OUTCache__)*100)
+
+
+
diff --git a/src/ecotaxspecificity.py b/src/ecotaxspecificity.py
new file mode 100755
index 0000000..939cfce
--- /dev/null
+++ b/src/ecotaxspecificity.py
@@ -0,0 +1,239 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`ecotaxspecificity`: Evaluates barcode resolution
+=========================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+The :py:mod:`ecotaxspecificity` command evaluates barcode resolution at different
+taxonomic ranks.
+
+As inputs, it takes a sequence record file annotated with taxids in the sequence
+header, and a database formatted as an ecopcr database (see :doc:`obitaxonomy
+<obitaxonomy>`) or a NCBI taxdump (see NCBI ftp site).
+
+An example of output is reported below::
+
+ Number of sequences added in graph: 284
+ Number of nodes in all components: 269
+ Number of sequences lost: 15!
+ rank taxon_ok taxon_total percent
+ order 8 8 100.00
+ superfamily 1 1 100.00
+ parvorder 1 1 100.00
+ subkingdom 1 1 100.00
+ superkingdom 1 1 100.00
+ kingdom 3 3 100.00
+ phylum 5 5 100.00
+ infraorder 1 1 100.00
+ subfamily 3 3 100.00
+ class 6 6 100.00
+ species 35 176 19.89
+ superorder 1 1 100.00
+ suborder 1 1 100.00
+ subtribe 1 1 100.00
+ subclass 3 3 100.00
+ genus 9 15 60.00
+ superclass 1 1 100.00
+ family 10 10 100.00
+ tribe 2 2 100.00
+ subphylum 1 1 100.00
+
+
+In this example, the input sequence file contains 284 sequence records, but only
+269 have been examined, because taxonomic information was not recovered for the
+15 remaining ones.
+
+"Taxon_total" refers to the number of different taxa observed at this rank
+in the sequence record file (when taxonomic information is available at this
+rank), and "taxon_ok" corresponds to the number of taxa that the barcode sequence
+identifies unambiguously in the taxonomic database. In this example, the sequence
+records correspond to 176 different species, but only 35 of these have specific
+barcodes. "percent" is the percentage of unambiguously identified taxa among
+the total number of taxa (taxon_ok/taxon_total*100).
+
+'''
+
+import math
+import sys
+
+
+from obitools.graph import Graph
+from obitools.utils import progressBar
+from obitools.align import LCS
+from obitools.align import isLCSReachable
+from obitools.format.options import addInputFormatOption, sequenceWriterGenerator
+from obitools.options import getOptionManager
+from obitools.graph.algorithms.component import componentIterator
+from obitools.ecopcr.options import addTaxonomyDBOptions, loadTaxonomyDatabase
+
+
+
+def addSpecificityOptions(optionManager):
+    '''
+    Register the ``--errors`` and ``--quorum`` options in their own group.
+
+    @param optionManager: option manager receiving the new group; it must
+                          provide ``add_option_group``
+    '''
+    specificity = optionManager.add_option_group('ecotaxspecificity specific options')
+
+    declarations = (
+        (('-e','--errors'),
+         dict(action="store", dest="dist",
+              metavar="###",
+              type="int",
+              default=1,
+              help="Maximum errors between two sequences")),
+        (('-q','--quorum'),
+         dict(action="store", dest="quorum",
+              type="float",
+              default=0.0,
+              help="Quorum")),
+    )
+
+    for flags, settings in declarations:
+        specificity.add_option(*flags, **settings)
+
+
+# Script entry point: link sequences that differ by at most options.dist
+# errors, group them in connected components and report, for each rank, how
+# many taxa are identified unambiguously.
+if __name__=='__main__':
+
+ optionParser = getOptionManager([addInputFormatOption,addTaxonomyDBOptions,addSpecificityOptions])
+
+ (options, entries) = optionParser()
+
+ loadTaxonomyDatabase(options)
+ tax =options.taxonomy
+
+ # ranks is iterated below as (rank name, rank id) pairs
+ ranks = set(x for x in tax.rankIterator())
+ results = [seq for seq in entries]
+
+ # one graph node per input sequence, labelled with the sequence id
+ graph = Graph("error",directed=False)
+ xx = 0
+ for s in results:
+ #if options.sample is None:
+ # sample = {"XXX":s['count'] if 'count' in s else 1}
+ #else:
+ # sample = s[options.sample]
+ #graph.addNode(s.id,shape='circle',_sequence=s,_sample=sample)
+ graph.addNode(s.id,shape='circle',_sequence=s)
+ xx = xx + 1
+
+
+ ldb = len(results)
+ # NOTE(review): math.log10 raises ValueError when no sequence was read.
+ digit = int(math.ceil(math.log10(ldb)))
+ # NOTE(review): the loops below visit ldb*(ldb-1)/2 pairs, fewer than this total.
+ aligncount = ldb*(ldb+1)/2
+ edgecount = 0
+ print >>sys.stderr
+
+ header = "Alignment : %%0%dd x %%0%dd -> %%0%dd " % (digit,digit,digit)
+ progressBar(1,aligncount,True,"Alignment : %s x %s -> %s " % ('-'*digit,'-'*digit, '0'*digit))
+ pos=1
+ aligner = LCS()
+
+
+ # all against all comparison of the sequences (each unordered pair once)
+ for i in xrange(ldb):
+
+ inode = graph[results[i].id]
+
+ aligner.seqA = results[i]
+ li = len(results[i])
+
+ for j in xrange(i+1,ldb):
+ progressBar(pos,aligncount,head=header % (i,j,edgecount))
+ pos+=1
+
+ lj = len(results[j])
+
+ lm = max(li,lj)
+ lcsmin = lm - options.dist
+
+ # the alignment is only computed when isLCSReachable accepts the pair
+ if isLCSReachable(results[i],results[j],lcsmin):
+ aligner.seqB=results[j]
+ ali = aligner()
+ llcs=ali.score
+ lali = len(ali[0])
+ # observed distance: alignment length minus alignment score
+ obsdist = lali-llcs
+ if obsdist <= options.dist: # options.dist:
+ jnode = graph[results[j].id]
+ res=graph.addEdge(inode.label, jnode.label) # make links
+ edgecount+=1
+
+ indexbyseq={} # each element in this dict will be one component, with first seq of component as its key
+
+ yy = 0
+ for c in componentIterator(graph):
+ sub = graph.subgraph(c)
+ first = True
+ s = ""
+ for node in sub: #all nodes of a component should go with same key (taken as first sequence in comp)
+ #print node
+ seq = node["_sequence"]
+ if first == True: #we will take first seq of a component as key for that component
+ s = str(seq)
+ indexbyseq[s]=set([seq])
+ first = False
+ else:
+ indexbyseq[s].add(seq)
+ yy = yy + 1
+
+ #print "Number of sequences added in graph: " + str(xx)
+ #print "Number of nodes in all components: " + str (yy)
+ #print "Number of sequences lost: " + str (xx-yy) + "!"
+
+ print >>sys.stderr
+
+ # since multiple different sequences have one key, we need to know what that key is for each sequence
+ indexbykey={} #it will have elements like: {"seq1":key, "seq2":key, ...} where 'key' is the component key to which 'seqx' belongs
+ for key in indexbyseq.keys (): # loop on all components
+ for x in indexbyseq[key]: # loop on each seq in this component
+ v = str(x)
+ if v not in indexbykey:
+ indexbykey[v] = key
+
+ print '%-20s\t%10s\t%10s\t%7s' % ('rank','taxon_ok','taxon_total','percent')
+ lostSeqs = []
+ for rank,rankid in ranks:
+ if rank != 'no rank':
+ # indexbytaxid maps each taxon of this rank to the str() of its sequences
+ indexbytaxid={}
+ for seq in results:
+ t = tax.getTaxonAtRank(seq['taxid'],rankid)
+ if t is not None:
+ if t in indexbytaxid:
+ indexbytaxid[t].add(str(seq))
+ else:
+ indexbytaxid[t]=set([str(seq)])
+
+ taxoncount=0
+ taxonok=0
+ for taxon in indexbytaxid:
+ # taxlist collects the taxa of this rank found in the components
+ # reached by the sequences of the current taxon
+ taxlist = set()
+ taxonindividuals = {}
+ for tag in indexbytaxid[taxon]:
+ if tag in indexbykey:
+ key = indexbykey[tag] #get component key for this seq
+ if options.quorum > 0.0:
+ for x in indexbyseq[key]:
+ txn = tax.getTaxonAtRank(x['taxid'],rankid)
+ if txn not in taxonindividuals:
+ taxonindividuals[txn] = set([x['taxid']])
+ else:
+ taxonindividuals[txn].add(x['taxid'])
+ taxlist |=set(tax.getTaxonAtRank(x['taxid'],rankid) for x in indexbyseq[key])
+ else:
+ if tag not in lostSeqs:
+ lostSeqs.append(tag)
+
+ taxoncount+=1
+
+ if options.quorum > 0.0:
+ # NOTE(review): max and sum shadow the builtins from here on.
+ max = 0
+ sum = 0
+ for k in taxonindividuals.keys ():
+ if len(taxonindividuals[k]) > max:
+ max = len(taxonindividuals[k])
+ sum = sum + len(taxonindividuals[k])
+ # the taxon is accepted when its largest group reaches the quorum
+ if max >= (sum-sum*options.quorum):
+ taxonok += 1
+ else:
+ if len(taxlist)==1:
+ taxonok+=1
+ if taxoncount:
+ print '%-20s\t%10d\t%10d\t%8.2f' % (rank,taxonok,taxoncount,float(taxonok)/taxoncount*100)
+
+ # if len (lostSeqs) > 0:
+ # print "Lost Sequences:"
+ # print lostSeqs
+
+
+
+
+
diff --git a/src/ecotaxstat.py b/src/ecotaxstat.py
new file mode 100755
index 0000000..ea74390
--- /dev/null
+++ b/src/ecotaxstat.py
@@ -0,0 +1,109 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`ecotaxstat` : getting the coverage of an ecoPCR output compared to the original ecoPCR database
+========================================================================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+The :py:mod:`ecotaxstat` command requires two parameters : an *ecoPCR* formatted database (specified
+with the `-d` option; see :doc:`obiconvert <obiconvert>` for a description of the database format)
+and an ecoPCR output (ideally computed using the specified ecoPCR database).
+
+The command outputs, for every rank, the coverage (Bc) of the ecoPCR output. The coverage (Bc) is the
+fraction of *taxids* that have a sequence in the database and also have a sequence in the ecoPCR
+output file.
+
+Optionally, *taxids* can be specified to focus the coverage on a smaller part of the taxonomy.
+'''
+
+from obitools.ecopcr import taxonomy
+from obitools.ecopcr import sequence
+from obitools.ecopcr import EcoPCRFile
+
+from obitools.options import getOptionManager
+from obitools.ecopcr.options import loadTaxonomyDatabase
+
+import sys
+
+def addTaxonomyOptions(optionManager):
+    '''
+    Register the ``-d/--ecopcrdb`` and ``-r/--required`` options.
+
+    @param optionManager: option manager receiving the options; it must
+                          provide an optparse-like ``add_option`` method
+    '''
+    declarations = (
+        (('-d','--ecopcrdb'),
+         dict(action="store", dest="db",
+              metavar="<FILENAME>",
+              type="string",
+              help="ecoPCR Database "
+                   "name")),
+        (('-r','--required'),
+         dict(action="append",
+              dest='required',
+              metavar="<TAXID>",
+              type="int",
+              default=[],
+              help="required taxid")),
+    )
+
+    for flags, settings in declarations:
+        optionManager.add_option(*flags, **settings)
+
+# Script entry point: for every rank, compare the number of taxa seen in
+# the ecoPCR output with the number of taxa available in the ecoPCR
+# database under the required taxids.
+if __name__=='__main__':
+
+ optionParser = getOptionManager([addTaxonomyOptions],
+ entryIterator=EcoPCRFile)
+
+ (options, entries) = optionParser()
+
+ if (options.db is None):
+ print>>sys.stderr, "-d option is required"
+ sys.exit(1)
+
+ if len(options.required)==0:
+ print>>sys.stderr, "-r option is required"
+ sys.exit(1)
+
+ tax = taxonomy.EcoTaxonomyDB(options.db)
+ seqd= sequence.EcoPCRDBSequenceIterator(options.db,taxonomy=tax)
+
+ # ranks is iterated below as (rank name, rank id) pairs
+ ranks = set(x for x in tax.rankIterator())
+
+ listtaxonbyrank = {}
+
+ # First pass, over the database: keep the sequences for which
+ # tax.isAncestor(r, taxid) holds for at least one required taxid r and
+ # record their taxon at every rank.
+ for seq in seqd:
+ taxid = seq['taxid']
+ # options.required cannot be empty here (checked above)
+ if (options.required and
+ reduce(lambda x,y: x or y,
+ (tax.isAncestor(r,taxid) for r in options.required),
+ False)):
+
+ for rank,rankid in ranks:
+ if rank != 'no rank':
+ t = tax.getTaxonAtRank(seq['taxid'],rankid)
+ if t is not None:
+ if rank in listtaxonbyrank:
+ listtaxonbyrank[rank].add(t)
+ else:
+ listtaxonbyrank[rank]=set([t])
+
+ # NOTE(review): the names are easy to misread: stats holds the counts of
+ # the database pass above, dbstats those of the ecoPCR output pass below.
+ stats = dict((x,len(listtaxonbyrank[x])) for x in listtaxonbyrank)
+
+ listtaxonbyrank = {}
+
+ # Second pass, over the ecoPCR output given as input (no taxid filter).
+ for seq in entries:
+ for rank,rankid in ranks:
+ if rank != 'no rank':
+ t = tax.getTaxonAtRank(seq['taxid'],rankid)
+ if t is not None:
+ if rank in listtaxonbyrank:
+ listtaxonbyrank[rank].add(t)
+ else:
+ listtaxonbyrank[rank]=set([t])
+
+ dbstats= dict((x,len(listtaxonbyrank[x])) for x in listtaxonbyrank)
+
+ ranknames = [x[0] for x in ranks]
+ ranknames.sort()
+
+ print '%-20s\t%10s\t%10s\t%7s' % ('rank','ecopcr','db','percent')
+
+ # one line per rank present in both passes: ecoPCR count, database count
+ # and their ratio as a percentage
+ for r in ranknames:
+ if r in dbstats and r in stats and dbstats[r]:
+ print '%-20s\t%10d\t%10d\t%8.2f' % (r,dbstats[r],stats[r],float(dbstats[r])/stats[r]*100)
+
+
+
diff --git a/src/extractreads.py b/src/extractreads.py
new file mode 100644
index 0000000..04050d5
--- /dev/null
+++ b/src/extractreads.py
@@ -0,0 +1,243 @@
+'''
+Created on 9 juin 2012
+
+@author: coissac
+'''
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator,\
+ autoEntriesIterator
+from obitools.fasta import formatFasta
+from obitools.options import getOptionManager
+from obitools.options._options import allEntryIterator
+from obitools.word._readindex import ReadIndex,minword
+
+import sys
+import math
+
+def addWindowsOptions(optionManager):
+    '''
+    Register the window, reference, read and index dump options.
+
+    @param optionManager: option manager receiving the options; it must
+                          provide an optparse-like ``add_option`` method
+    '''
+    # (flags, settings) pairs, declared in registration order
+    declarations = (
+        (('-l','--window-length'),
+         dict(action="store", dest="length",
+              metavar="<WORD SIZE>",
+              type="int",
+              default=90,
+              help="size of the sliding window")),
+        (('-s','--step'),
+         dict(action="store", dest="step",
+              metavar="<STEP>",
+              type="int",
+              default=1,
+              help="position difference between two windows")),
+        (('-c','--circular'),
+         dict(action="store_true", dest="circular",
+              default=False,
+              help="set for circular sequence")),
+        (('-R','--reference'),
+         dict(action="store", dest="reffile",
+              metavar="<FILENAME>",
+              type="str",
+              default=None,
+              help="sequence file containing the reference sequences")),
+        (('-r','--reverse-reads'),
+         dict(action="store", dest="reverse",
+              metavar="<FILENAME>",
+              type="str",
+              default=None,
+              help="Filename containing reverse solexa reads ")),
+        (('-D','--write-dump'),
+         dict(action="store", dest="wdump",
+              metavar="<FILENAME>",
+              type="str",
+              default=None,
+              help="Save the index to a dump file")),
+        (('-d','--read-dump'),
+         dict(action="store", dest="rdump",
+              metavar="<FILENAME>",
+              type="str",
+              default=None,
+              help="Read the index from a dump file")),
+        (('-S','--singleton'),
+         dict(action="store", dest="singleton",
+              metavar="<FILENAME>",
+              type="str",
+              default=None,
+              help="Write singleton sequence in this file")),
+    )
+
+    for flags, settings in declarations:
+        optionManager.add_option(*flags, **settings)
+
+def cutQuality(s):
+ '''
+ Return the slice ``s[xmin:xmax]`` selected from the quality values of *s*.
+
+ Each value of ``s.quality`` is turned into ``-10*log10(value)``, the 10%
+ quantile of these scores is subtracted, and a running sum clamped at zero
+ is computed. The slice ends just after the last position where this sum
+ reaches its maximum and starts after the closest preceding position
+ where the sum is zero.
+
+ NOTE(review): presumably s.quality holds error probabilities in (0, 1] --
+ confirm. An empty quality list makes quantile() fail with IndexError.
+ '''
+ def quantile(x,q=0.1):
+ # value at rank int(q*len(x)) of the sorted copy of x
+ y = list(x)
+ y.sort()
+ return y[int(q*len(y))]
+
+ def cumsum0(x):
+ # in place cumulative sum, reset to 0 whenever it becomes negative
+ if x[0] < 0: x[0]=0
+ for i in xrange(1,len(x)):
+ x[i]+=x[i-1]
+ if x[i]<0: x[i]=0
+ return x
+
+ q = [- math.log10(a) * 10 for a in s.quality]
+ mq=quantile(q)
+ q = cumsum0([a - mq for a in q])
+
+
+ mx = max(q)
+
+ xmax = len(q)-1
+
+ # scan backwards down to the last position holding the maximum
+ while(q[xmax] < mx):
+ xmax-=1
+
+ xmin=xmax
+ # xmax becomes the exclusive end of the slice
+ xmax+=1
+
+ # walk back while the clamped sum stays positive
+ while(xmin>0 and q[xmin]>0):
+ xmin-=1
+
+ # a zero at xmin is left out of the slice
+ if q[xmin]==0:
+ xmin+=1
+
+ return s[xmin:xmax]
+
+
+
+def cutDirectReverse(entries):
+    '''
+    Split each concatenated read into its direct and reverse halves.
+
+    The length of a single read is deduced from the first entries (ten at
+    most): their most frequent length must be unique and even, and half of
+    it is used as the read length.
+
+    @param entries: an iterable of sliceable items having a length
+    @return: a generator of ``(direct, reverse)`` pairs, ``direct`` being
+             ``s[0:lread]`` and ``reverse`` being ``s[lread:]``; nothing is
+             yielded for an empty input
+    @raise AssertionError: if several lengths are equally frequent among
+                           the first entries, or if the retained length is
+                           odd
+    '''
+    # Local import: ``chain`` was used here without ever being imported by
+    # the module, so the function failed with a NameError.
+    from itertools import chain, islice
+
+    entries = iter(entries)
+
+    # islice copes with inputs shorter than ten entries; explicit next()
+    # calls ended the generator silently in that case.
+    first = list(islice(entries, 10))
+    if not first:
+        return
+
+    clen = {}
+    for x in first:
+        clen[len(x)] = clen.get(len(x), 0) + 1
+    top = max(clen.values())
+    freq = [k for k in clen if clen[k] == top]
+    assert len(freq)==1,"To many sequence length"
+    freq = freq[0]
+    assert freq % 2 == 0, ""
+    # integer division: lread is used as a slice bound
+    lread = freq // 2
+
+    for s in chain(first, entries):
+        yield (s[0:lread], s[lread:])
+
+def seqPairs(direct,reverse):
+    '''
+    Pair each direct read with the next reverse read, both quality trimmed.
+
+    @param direct: iterable of direct reads
+    @param reverse: iterator of reverse reads (its ``next`` method is used)
+    @return: a generator of ``(cutQuality(direct), cutQuality(reverse))`` pairs
+    '''
+    for forward in direct:
+        backward = reverse.next()
+        trimmed = (cutQuality(forward), cutQuality(backward))
+        yield trimmed
+
+
+def seq2words(seqs,options):
+    '''
+    Collect the ``minword`` form of every window of the given sequences.
+
+    Windows of ``options.length`` characters are taken every
+    ``options.step`` positions. With ``options.circular`` the first
+    ``options.length`` characters are appended to the sequence before it is
+    cut. Only words of exactly ``options.length`` characters are kept.
+
+    @param seqs: iterable of sequences; ``str()`` of each one is used
+    @param options: object providing ``length``, ``step`` and ``circular``
+    @return: the set of words
+    '''
+    words = set()
+
+    for seq in seqs:
+        text = str(seq)
+
+        if options.circular:
+            text += text[0:options.length]
+
+        stop = len(text) - options.length + 1
+
+        for start in xrange(0,stop,options.step):
+            word = minword(text[start:start+options.length])
+            if len(word) != options.length:
+                continue
+            words.add(word)
+
+    return words
+
+
+# Script entry point: index the reads, then starting from the words of the
+# reference sequences repeatedly look up the reads containing a word and
+# add the words of these reads to the list, until no new word is left.
+if __name__ == '__main__':
+
+ optionParser = getOptionManager([addWindowsOptions,addInOutputOption],progdoc=__doc__)
+
+ (options, direct) = optionParser()
+
+ # single reads are wrapped in 1-tuples, paired reads come as 2-tuples
+ if options.reverse is None:
+ sequences=((x,) for x in direct)
+ else:
+ reverse = allEntryIterator([options.reverse],options.readerIterator)
+ sequences=seqPairs(direct,reverse)
+
+ reader = autoEntriesIterator(options)
+ # NOTE(review): this file is never closed, and open() fails when
+ # -R/--reference is not given (options.reffile is then None).
+ rfile = open(options.reffile)
+ reference = reader(rfile)
+
+ # worddone: words already looked up; wordlist: words still to look up
+ worddone=set()
+ wordlist = seq2words(reference,options)
+
+ indexer = ReadIndex(readsize=105)
+
+ seqpair=0
+ nbseq=0
+
+ # NOTE(review): writer and nbseq are not used in the rest of this script.
+ writer = sequenceWriterGenerator(options)
+
+ # the index is either built from the reads or loaded from a dump file
+ if options.rdump is None:
+ print >>sys.stderr,"Indexing sequences..."
+ for seq in sequences:
+ indexer.add(seq)
+
+ print >>sys.stderr,"Indexing words..."
+ indexer.indexWords(options.length,True)
+
+ if options.wdump is not None:
+ print >>sys.stderr,"Saving index to file %s..." % options.wdump
+ indexer.save(options.wdump,True)
+ else:
+ print >>sys.stderr,"Loading index dump..."
+ indexer.load(options.rdump,True)
+
+
+
+ print >>sys.stderr,"Selecting sequences..."
+
+ while len(wordlist)>0:
+ w = wordlist.pop()
+ worddone.add(w)
+
+ # i counts the reads returned for the current word
+ i=0
+
+ #print >>sys.stderr,"Looking for word : %s..." % w
+
+ for seq in indexer.iterreads(w):
+ i+=1
+ #print formatFasta(seq)
+ s = str(seq)
+ sc = str(seq.complement())
+ assert w in s or w in sc,'Bug !!!! sequence %s (%d) %s sans %s' % (seq.id,i,s,w)
+ # only words that were not looked up yet are queued
+ words = seq2words((seq,),options) - worddone
+ wordlist|=words
+
+ seqpair+=i
+
+ if i:
+ print >>sys.stderr,"\rWrote extracted = %d/total = %d/word done = %d [wordlist=%d]" % (i,seqpair,len(worddone),len(wordlist)),
+
+ print >>sys.stderr,"\nWriting sequences..."
+
+ # marked pairs go to standard output, marked singletons to the -S file
+ for seq in indexer.itermarkedpairs():
+ print formatFasta(seq)
+
+ if options.singleton is not None:
+ s = open(options.singleton,'w')
+ for seq in indexer.itermarkedsingleton():
+ print >>s,formatFasta(seq)
+ s.close()
+
+
+
+
+
diff --git a/src/extractreads2.py b/src/extractreads2.py
new file mode 100644
index 0000000..41bec8b
--- /dev/null
+++ b/src/extractreads2.py
@@ -0,0 +1,119 @@
+'''
+Created on 9 juin 2012
+
+@author: coissac
+'''
+from esm import Index
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator,\
+ autoEntriesIterator
+from obitools.options import getOptionManager
+from obitools.options._options import allEntryIterator
+
+def addWindowsOptions(optionManager):
+
+ optionManager.add_option('-l','--window-length',
+ action="store", dest="length",
+ metavar="<WORD SIZE>",
+ type="int",
+ default=None,
+ help="size of the sliding window")
+
+ optionManager.add_option('-s','--step',
+ action="store", dest="step",
+ metavar="<STEP>",
+ type="int",
+ default=1,
+ help="position difference between two windows")
+
+ optionManager.add_option('-c','--circular',
+ action="store_true", dest="circular",
+ default=False,
+ help="set for circular sequence")
+
+ optionManager.add_option('-R','--reference',
+ action="store", dest="reffile",
+ metavar="<STEP>",
+ type="str",
+ default=None,
+ help="sequence file containing the reference sequences")
+ optionManager.add_option('-r','--reverse-reads',
+ action="store", dest="reverse",
+ metavar="<FILENAME>",
+ type="str",
+ default=None,
+ help="Filename containing reverse solexa reads "
+ )
+
+def cutDirectReverse(entries):
+ first = []
+
+ for i in xrange(10):
+ first.append(entries.next())
+
+ lens = [len(x) for x in first]
+ clen = {}
+ for i in lens:
+ clen[i]=clen.get(i,0)+1
+ freq = max(clen.values())
+ freq = [k for k in clen if clen[k]==freq]
+ assert len(freq)==1,"To many sequence length"
+ freq = freq[0]
+ assert freq % 2 == 0, ""
+ lread = freq/2
+
+ seqs = chain(first,entries)
+
+ for s in seqs:
+ d = s[0:lread]
+ r = s[lread:]
+ yield(d,r)
+
+def seqPairs(direct,reverse):
+ for d in direct:
+ r = reverse.next()
+ yield(d,r)
+
+if __name__ == '__main__':
+
+ optionParser = getOptionManager([addWindowsOptions,addInOutputOption],progdoc=__doc__)
+
+ (options, direct) = optionParser()
+
+ if options.reverse is None:
+ sequences=((x,) for x in direct)
+ else:
+ reverse = allEntryIterator([options.reverse],options.readerIterator)
+ sequences=seqPairs(direct,reverse)
+
+ reader = autoEntriesIterator(options)
+ rfile = open(options.reffile)
+ reference = reader(rfile)
+
+ words = Index()
+
+ for rs in reference:
+ ft = str(rs)
+ rt = str(rs.complement())
+
+ if options.circular:
+ ft = ft + ft[0:options.length]
+ rt = rt + rt[0:options.length]
+
+ for x in xrange(0,len(ft),options.step):
+ w = ft[x:(x+options.length)]
+ if len(w)==options.length:
+ words.enter(w)
+ w = rt[x:(x+options.length)]
+ if len(w)==options.length:
+ words.enter(w)
+
+ words.fix()
+
+
+ writer = sequenceWriterGenerator(options)
+
+ for seq in sequences:
+ t = "".join([str(x) for x in seq])
+ r = words.query(t)
+ if r:
+ writer(seq)
diff --git a/src/illuminapairedend.py b/src/illuminapairedend.py
new file mode 100644
index 0000000..6bc667c
--- /dev/null
+++ b/src/illuminapairedend.py
@@ -0,0 +1,280 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`illuminapairedend`: aligns paired-end Illumina reads
+=============================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+.. IMPORTANT::
+
+ :py:mod:`illuminapairedend` replaces ``solexapairend``.
+
+:py:mod:`illuminapairedend` aims at aligning the two reads of a pair-end library sequenced
+using an Illumina platform.
+
+ - If the two reads overlap, it returns the consensus sequence together with its quality
+
+ - Otherwise, it concatenates sequence merging the forward read and
+ the reversed-complemented reverse read.
+
+The program uses as input one or two :doc:`fastq <../fastq>` sequences reads files.
+
+ - If two files are used one of them must be specified using the ``-r`` option.
+ Sequence records corresponding to the same read pair must be in the same order
+ in the two files.
+
+ - If just one file is provided, sequence records are supposed to be all of the same length.
+ The first half of the sequence is used as forward read, the second half is used as the reverse
+ read.
+
+:py:mod:`illuminapairedend` aligns the forward sequence record with the reverse complement of the
+reverse sequence record. The alignment algorithm takes into account the base qualities.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > illuminapairedend -r seq3P.fastq seq5P.fastq > seq.fastq
+
+ The ``seq5P.fastq`` sequence file contains the forward sequence records.
+ The ``seq3P.fastq`` sequence file contains the reverse sequence records.
+ Pairs of reads are aligned together and the consensus sequence is stored in the
+ ``seq.fastq`` file.
+
+'''
+
+from obitools import NucSequence
+from obitools.options import getOptionManager, allEntryIterator
+from obitools.align import QSolexaReverseAssemble
+from obitools.align import QSolexaRightReverseAssemble
+from obitools.tools._solexapairend import buildConsensus
+from obitools.format.options import addOutputFormatOption,\
+ sequenceWriterGenerator
+
+from itertools import chain
+import cPickle
+import math
+from obitools.fastq._fastq import fastqIterator # @UnresolvedImport
+
+
+def addSolexaPairEndOptions(optionManager):
+ optionManager.add_option('-r','--reverse-reads',
+ action="store", dest="reverse",
+ metavar="<FILENAME>",
+ type="str",
+ default=None,
+ help="Filename containing reverse solexa reads "
+ )
+
+ optionManager.add_option('--index-file',
+ action="store", dest="indexfile",
+ metavar="<FILENAME>",
+ type="str",
+ default=None,
+ help="Filename containing illumina index reads "
+ )
+ optionManager.add_option('--sanger',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='sanger',
+ help="input file is in sanger fastq nucleic format (standard fastq)")
+
+ optionManager.add_option('--solexa',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='solexa',
+ help="input file is in fastq nucleic format produced by solexa sequencer")
+
+ optionManager.add_option('--illumina',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='illumina',
+ help="input file is in fastq nucleic format produced by old solexa sequencer")
+
+ optionManager.add_option('--score-min',
+ action="store", dest="smin",
+ metavar="#.###",
+ type="float",
+ default=None,
+ help="minimum score for keeping aligment")
+
+
+
+def cutDirectReverse(entries):
+ first = []
+
+ for i in xrange(10):
+ first.append(entries.next())
+
+ lens = [len(x) for x in first]
+ clen = {}
+ for i in lens:
+ clen[i]=clen.get(i,0)+1
+ freq = max(clen.values())
+ freq = [k for k in clen if clen[k]==freq]
+ assert len(freq)==1,"To many sequence length"
+ freq = freq[0]
+ assert freq % 2 == 0, ""
+ lread = freq/2
+
+ seqs = chain(first,entries)
+
+ for s in seqs:
+ d = s[0:lread]
+ r = s[lread:]
+ yield(d,r)
+
+def seqPairs(direct,reverse):
+ for d in direct:
+ r = reverse.next()
+ yield(d,r)
+
+def checkAlignOk(ali):
+ #print not (ali[0][0]=='-' or ali[1][len(ali[1])-1]=='-')
+ return not (ali[0][0]=='-' or ali[1][len(ali[1])-1]=='-')
+
+la = QSolexaReverseAssemble()
+ra = QSolexaRightReverseAssemble()
+
+def buildAlignment(direct,reverse):
+
+ if len(direct)==0 or len(reverse)==0:
+ return None
+
+ la.seqA=direct
+ la.seqB=reverse
+ ali=la()
+ ali.direction='left'
+
+ ra.seqA=direct
+ ra.seqB=reverse
+ rali=ra()
+ rali.direction='right'
+
+ if ali.score < rali.score:
+ ali=rali
+
+ return ali
+
+def alignmentIterator(sequences):
+
+ for d,r in sequences:
+ ali = buildAlignment(d,r)
+ if ali is None:
+ continue
+ yield ali
+
+
+def buildJoinedSequence(ali,options):
+ d = ali[0].getRoot()
+ r = ali[1].getRoot()
+
+
+ r=r.complement()
+
+ s = str(d) + str(r)
+
+ seq = NucSequence(d.id + '_PairEnd',s,d.definition,**d)
+
+ withqual = hasattr(d, 'quality') or hasattr(r, 'quality')
+
+ if withqual:
+ if hasattr(d, 'quality'):
+ quality = d.quality
+ else:
+ quality = [10**-4] * len(d)
+
+ if hasattr(r, 'quality'):
+ quality.extend(r.quality)
+ else:
+ quality.extend([10**-4] * len(r))
+
+ seq.quality=quality
+
+ seq['score']=ali.score
+ seq['ali_dir']=ali.direction
+ seq['mode']='joined'
+ seq['pairend_limit']=len(d)
+
+ return seq
+
+
+
+# Script entry point: aligns every read pair and writes, for each of them,
+# either the consensus sequence or the joined sequence.
+if __name__ == '__main__':
+ optionParser = getOptionManager([addSolexaPairEndOptions,addOutputFormatOption],checkFormat=True
+ )
+
+ (options, direct) = optionParser()
+
+ # WARNING, to be removed: dirty patch!
+ # options.proba is forced to None, so the null distribution branch below
+ # can currently never be executed.
+ options.proba = None
+ options.skip = None
+ options.only = None
+
+
+ options.sminL = None
+ options.sminR = None
+
+
+ # Score thresholds derived from pickled null distributions (dead code as
+ # long as the patch above is in place).
+ # NOTE(review): options.pvalue is not declared by addSolexaPairEndOptions;
+ # confirm where it is expected to come from before re-enabling this branch.
+ if options.proba is not None and options.smin is None:
+ p = open(options.proba)
+ options.nullLeft = cPickle.load(p)
+ options.nullRight = cPickle.load(p)
+
+ assert options.pvalue is not None, "You have to indicate a pvalue or an score min"
+
+ i = int(math.floor((1.0 - options.pvalue) * len(options.nullLeft)))
+
+ # keep the index inside the list when pvalue is 0
+ if i == len(options.nullLeft):
+ i-=1
+ options.sminL = options.nullLeft[i]
+
+ i = int(math.floor((1.0 - options.pvalue) * len(options.nullRight)))
+ if i == len(options.nullRight):
+ i-=1
+ options.sminR = options.nullRight[i]
+
+ # --score-min sets the same threshold for both alignment directions
+ if options.smin is not None:
+ options.sminL = options.smin
+ options.sminR = options.smin
+
+
+ # Without -r every input record is cut in two halves, otherwise the
+ # records of the two files are paired in order.
+ if options.reverse is None:
+ sequences=cutDirectReverse(direct)
+ else:
+ reverse = allEntryIterator([options.reverse],options.readerIterator)
+ sequences=seqPairs(direct,reverse)
+
+ if options.indexfile is not None:
+ indexfile = fastqIterator(options.indexfile)
+ else:
+ indexfile = None
+
+ writer = sequenceWriterGenerator(options)
+
+ ba = alignmentIterator(sequences)
+
+ for ali in ba:
+
+ if options.sminL is not None:
+ # NOTE(review): the second clause is not restricted to
+ # ali.direction=='right', so a 'left' alignment is also accepted when
+ # its score exceeds sminR - harmless while sminL == sminR; confirm intent.
+ if ( (ali.direction=='left' and ali.score > options.sminL)
+ or (ali.score > options.sminR)):
+ consensus = buildConsensus(ali)
+ else:
+ consensus = buildJoinedSequence(ali, options)
+
+ consensus['sminL']=options.sminL
+ consensus['sminR']=options.sminR
+ else:
+ consensus = buildConsensus(ali)
+
+ # one index record is consumed per written sequence
+ if indexfile is not None:
+ i = str(indexfile.next())
+ consensus['illumina_index']=i
+
+ writer(consensus)
+
+
+
+
+
diff --git a/src/ngsfilter.py b/src/ngsfilter.py
new file mode 100644
index 0000000..979bae3
--- /dev/null
+++ b/src/ngsfilter.py
@@ -0,0 +1,458 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`ngsfilter` : Assigns sequence records to the corresponding experiment/sample based on DNA tags and primers
+===================================================================================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+To distinguish between sequences from different PCR products pooled in the same sequencing library, pairs of small DNA
+sequences (called tags, see the :py:mod:`oligoTag` command and its associated paper for more information on the design
+of such tags) can be concatenated to the PCR primers.
+
+:py:mod:`ngsfilter` takes as input sequence record files and a file describing the DNA tag and primer sequences used
+for each PCR sample. :py:mod:`ngsfilter` demultiplexes the sequence record files by identifying these DNA tags and
+the primers.
+
+:py:mod:`ngsfilter` requires a sample description file containing the description of the primers and tags associated
+to each sample (specified by option ``-t``). The sample description file is a text file where each line describes one
+sample. Columns are separated by space or tab characters. Lines beginning with the '#' character will be considered
+as commentary lines and will simply be ignored by :py:mod:`ngsfilter`.
+
+Here is an example of a sample description file::
+
+ #exp sample tags forward_primer reverse_primer extra_information
+ gh 01_11a cacgcagtc:cacgcatcg GGGCAATCCTGAGCCAA CCATTGAGTCTCTGCACCTATC F @ community=Festuca; bucket=1; extraction=1;
+ gh 01_12a cacgcatcg:cacgcagtc GGGCAATCCTGAGCCAA CCATTGAGTCTCTGCACCTATC F @ community=Festuca; bucket=1; extraction=2;
+ gh 01_21a cacgcgcat:cacgctact GGGCAATCCTGAGCCAA CCATTGAGTCTCTGCACCTATC F @ community=Festuca; bucket=2; extraction=1;
+ gh 01_22a cacgctact:cacgcgcat GGGCAATCCTGAGCCAA CCATTGAGTCTCTGCACCTATC F @ community=Festuca; bucket=2; extraction=2;
+ gh 02_11a cacgctgag:cacgtacga GGGCAATCCTGAGCCAA CCATTGAGTCTCTGCACCTATC F @ community=Festuca; bucket=1; extraction=1;
+ gh 02_12a cacgtacga:cacgctgag GGGCAATCCTGAGCCAA CCATTGAGTCTCTGCACCTATC F @ community=Festuca; bucket=1; extraction=2;
+
+
+The results consist of sequence records, printed on the standard output, with their sequence trimmed of the primers and
+tags and annotated with the corresponding experiment and sample (and possibly some extra information). Sequences for
+which the tags and primers have not been well identified, and which are thus unassigned to any sample, are stored in a
+file if option ``-u`` is specified and tagged as erroneous sequences (``error`` attribute) by :py:mod:`ngsfilter`.
+'''
+
+from obitools import NucSequence, DNAComplementSequence
+from string import lower
+
+import sys
+
+import math
+
+from obitools.options import getOptionManager
+from obitools.utils import ColumnFile
+from obitools.align import FreeEndGapFullMatch
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+
+
+
+def addNGSOptions(optionManager):
+
+ group = optionManager.add_option_group('ngsfilter specific options')
+ group.add_option('-t','--tag-list',
+ action="store", dest="taglist",
+ metavar="<FILENAME>",
+ type="string",
+ default=None,
+ help="File containing the samples definition (with tags, primers, sample names,...)")
+
+ group.add_option('-u','--unidentified',
+ action="store", dest="unidentified",
+ metavar="<FILENAME>",
+ type="string",
+ default=None,
+ help="Filename used to store the sequences unassigned to any sample")
+
+ group.add_option('-e','--error',
+ action="store", dest="error",
+ metavar="###",
+ type="int",
+ default=2,
+ help="Number of errors allowed for matching primers [default = 2]")
+
+
+class Primer:
+ '''
+ A primer sequence bound to a FreeEndGapFullMatch aligner.
+
+ Calling an instance on a sequence aligns the primer against it and, when
+ the score reaches the minimum score, returns the match location and the
+ tag flanking it. Instances hash and compare on their raw primer string.
+ '''
+
+ # Shared by all instances: maps a raw primer string to the tag length it
+ # was declared with (consistency is checked in __init__).
+ collection={}
+
+ def __init__(self,sequence,taglength,direct=True,error=2,verbose=False):
+ '''
+ @param sequence:  raw primer sequence
+ @type sequence:   str
+ @param taglength: length of the tag associated with the primer, or None
+                   when no tag is used
+ @param direct:    stored as the C{direct} attribute; shown as 'D' (True)
+                   or 'R' (False) by C{__str__}
+ @type direct:     bool
+ @param error:     number of mismatches used to compute the minimum
+                   accepted alignment score
+ @param verbose:   when true, debugging traces are printed on stderr
+ @raise AssertionError: if the same primer sequence was already declared
+                        with another tag length
+ '''
+
+ assert sequence not in Primer.collection \
+ or Primer.collection[sequence]==taglength, \
+ "Primer %s must always be used with tags of the same length" % sequence
+
+ Primer.collection[sequence]=taglength
+
+ self.raw=sequence
+ self.sequence = NucSequence('primer',sequence)
+ self.lseq = len(self.sequence)
+ self.align=FreeEndGapFullMatch()
+ self.align.match=4
+ self.align.mismatch=-2
+ self.align.opengap=-2
+ self.align.extgap=-2
+ self.error=error
+ # score of a full length match containing exactly `error` mismatches
+ self.minscore = (self.lseq-error) * self.align.match + error * self.align.mismatch
+ if verbose:
+ print >>sys.stderr,sequence,":",self.lseq,"*",self.align.match,"+",error,"*",self.align.mismatch,"=",self.minscore
+ self.taglength=taglength
+
+ self.align.seqB=self.sequence
+
+ self.direct = direct
+ self.verbose=verbose
+
+ def complement(self):
+ # Returns a new Primer built from the same raw string, with the `direct`
+ # flag inverted and the complemented sequence installed in its aligner.
+ p = Primer(self.raw,
+ self.taglength,
+ not self.direct,verbose=self.verbose,
+ error=self.error)
+ p.sequence=p.sequence.complement()
+ p.align.seqB=p.sequence
+ return p
+
+ def __hash__(self):
+ return hash(str(self.raw))
+
+ def __eq__(self,primer):
+ # NOTE(review): assumes `primer` has a `raw` attribute; comparing with
+ # another kind of object raises AttributeError.
+ return self.raw==primer.raw
+
+ def __call__(self,sequence):
+ # Aligns the primer on `sequence`.
+ # Returns None when the sequence is not longer than the primer or when
+ # the alignment score is below self.minscore; otherwise returns the
+ # tuple (score, start, end, tag), tag being None when no tag is expected
+ # or when the sequence is too short to contain it.
+ if len(sequence) <= self.lseq:
+ return None
+ if self.verbose:
+ print >>sys.stderr,len(sequence) , self.lseq,len(sequence) < self.lseq
+ self.align.seqA=sequence
+ ali=self.align()
+ if self.verbose:
+ print >>sys.stderr,ali
+ print >>sys.stderr,"Score : %d Minscore : %d \n" %(ali.score,self.minscore)
+
+ if ali.score >= self.minscore:
+ score = ali.score
+ # presumably the leading/trailing gap lengths of the aligned primer give
+ # the match boundaries on the sequence - verify against the aligner API
+ start = ali[1].gaps[0][1]
+ end = len(ali[1])-ali[1].gaps[-1][1]
+ if self.taglength is not None:
+ if isinstance(self.sequence, DNAComplementSequence):
+ # complemented primer: the tag is read after the match and complemented
+ if (len(sequence)-end) >= self.taglength:
+ tag=str(sequence[end:end+self.taglength].complement())
+ else:
+ tag=None
+ else:
+ # plain primer: the tag is read just before the match
+ if start >= self.taglength:
+ tag=str(sequence[start - self.taglength:start])
+ else:
+ tag=None
+ else:
+ tag=None
+
+ return score,start,end,tag
+
+ return None
+
+ def __str__(self):
+ return "%s: %s" % ({True:'D',False:'R'}[self.direct],self.raw)
+
+ __repr__=__str__
+
+
+def tagpair(x):
+ x=tuple(lower(y.strip()) for y in x.split(':'))
+ if len(x)==1:
+ x = (x[0],x[0])
+ return x
+
+def readTagfile(filename):
+ """
+ Read the sample description file and build the primer lookup table.
+
+ The file is a space separated tabular file whose columns are
+
+ experiment sample tags forward_primer reverse_primer partial
+
+ optionally followed by '@' and extra information. The `tags` column is
+ parsed by tagpair (forward:reverse, or a single tag used for both);
+ a tag can be specified as - if no tag is used.
+
+ Returns a dictionary indexed by Primer; each value is a dictionary
+ indexed by the complemented partner Primer (or by None for the samples
+ flagged as partial) whose values map (forward_tag, reverse_tag) tuples to
+ the sample data ('experiment', 'sample' and the extra information).
+
+ NOTE: reads the module level `options` object (error and debug
+ attributes), so it only works once the command line has been parsed.
+ """
+
+ tab=ColumnFile(filename,strip=True,
+ types=(str,str,tagpair,lower,lower,bool),
+ head=('experiment','sample',
+ 'tags',
+ 'forward_primer','reverse_primer',
+ 'partial'),
+ skip="#",
+ extra="@")
+
+ primers = {}
+
+ for p in tab:
+ forward=Primer(p['forward_primer'],
+ len(p['tags'][0]) if p['tags'][0]!='-' else None,
+ True,
+ error=options.error,verbose=options.debug)
+
+ # get-or-create the sub-dictionary of the forward primer
+ fp = primers.get(forward,{})
+ primers[forward]=fp
+
+ reverse=Primer(p['reverse_primer'],
+ len(p['tags'][1]) if p['tags'][1]!='-' else None,
+ False,
+ error=options.error,verbose=options.debug)
+
+ rp = primers.get(reverse,{})
+ primers[reverse]=rp
+
+ cf=forward.complement()
+ cr=reverse.complement()
+
+ # tag tables: forward primer -> complemented reverse primer, and
+ # reverse primer -> complemented forward primer
+ dpp=fp.get(cr,{})
+ fp[cr]=dpp
+
+ rpp=rp.get(cf,{})
+ rp[cf]=rpp
+
+ tags = (p['tags'][0] if p['tags'][0]!='-' else None,
+ p['tags'][1] if p['tags'][1]!='-' else None)
+
+ assert tags not in dpp, \
+ "tag pair %s is already used with primer pairs : (%s,%s)" % (str(tags),forward,reverse)
+
+ extras = p.get('__extra__',{})
+ data ={'experiment':p['experiment'],
+ 'sample': p['sample']
+ }
+ data.update(extras)
+
+ # the same data dictionary is shared by both orientations
+ dpp[tags]=data
+ rpp[tags]=data
+
+
+ if p['partial']:
+ # samples usable with a single matched primer are stored under key None
+ dpartial = fp.get(None,{})
+ fp[None]=dpartial
+ rpartial = rp.get(None,{})
+ rp[None]=rpartial
+
+ # a partial sample must be identifiable from one tag alone
+ dt = [x for x in dpartial if x[0]==tags[0]]
+ rt = [x for x in rpartial if x[1]==tags[1]]
+
+ assert not(dt) and not(rt), \
+ "partial fragment are not usable with primer pair : (%s,%s)" % (forward,reverse)
+
+ dpartial[tags]=data
+ rpartial[tags]=data
+
+ return primers
+
+
+def annotate(sequence,options):
+ '''
+ Locate the primers and tags of a sequence record and assign it to a sample.
+
+ The first primer is searched among the keys of options.primers, the
+ second one among the partners registered for that first primer. The
+ returned record is trimmed at the matched primers and annotated with
+ the match description and, on success, with the sample data.
+
+ @param sequence: sequence record to process
+ @param options:  options object; its `primers` and `debug` attributes
+                  are read
+ @return: tuple (True, annotated record) when a sample is identified,
+          (False, record with an 'error' attribute) otherwise
+ '''
+
+ def sortMatch(m1,m2):
+ # cmp-style comparison of (primer, match) pairs: pairs without match
+ # (None) are sorted last.
+ if m1[1] is None and m2[1] is None:
+ return 0
+
+ if m1[1] is None:
+ return 1
+
+ if m2[1] is None:
+ return -1
+
+ # NOTE(review): compares index 1 of m1's match with index 2 of m2's
+ # match (start vs end in Primer.__call__'s result) - confirm that the
+ # asymmetry is intended.
+ return cmp(m1[1][1],m2[1][2])
+
+ # Quality summaries, all based on -log10 of the quality values: averages
+ # scaled by 10 for the whole record and its middle part, plain sums for
+ # the first and last 10 positions.
+ if hasattr(sequence, "quality"):
+ q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality),0)/len(sequence.quality)*10
+ sequence['avg_quality']=q
+ q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality[0:10]),0)
+ sequence['head_quality']=q
+ if len(sequence.quality[10:-10]) :
+ q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality[10:-10]),0)/len(sequence.quality[10:-10])*10
+ sequence['mid_quality']=q
+ q = -reduce(lambda x,y:x+y,(math.log10(z) for z in sequence.quality[-10:]),0)
+ sequence['tail_quality']=q
+
+ # First primer: try every declared primer and keep the best ranked match
+ primers = options.primers
+ if options.debug:
+ print >>sys.stderr,"directmatch"
+ directmatch = [(p,p(sequence)) for p in primers]
+
+
+ directmatch.sort(cmp=sortMatch)
+ directmatch=directmatch[0] if directmatch[0][1] is not None else None
+
+ if options.debug:
+ print >>sys.stderr,">>>>",directmatch
+ if directmatch is None:
+ sequence['error']='No primer match'
+ return False,sequence
+
+ match=str(sequence[directmatch[1][1]:directmatch[1][2]])
+
+ sequence['seq_length_ori']=len(sequence)
+
+ # drop everything up to the end of the first primer match
+ sequence = sequence[directmatch[1][2]:]
+
+ if directmatch[0].direct:
+ sequence['direction']='forward'
+ sequence['forward_score']=directmatch[1][0]
+ sequence['forward_primer']=directmatch[0].raw
+ sequence['forward_match']=match
+
+ else:
+ sequence['direction']='reverse'
+ sequence['reverse_score']=directmatch[1][0]
+ sequence['reverse_primer']=directmatch[0].raw
+ sequence['reverse_match']=match
+
+ # presumably removes an annotation added by the slicing above - verify
+ del sequence['cut']
+
+ # Second primer: only the partners registered for the first primer are
+ # tried (the None key marks partial samples and is not a primer)
+ primers = options.primers[directmatch[0]]
+ if options.debug:
+ print >>sys.stderr,"reverse match"
+ reversematch = [(p,p(sequence)) for p in primers if p is not None]
+ reversematch.sort(cmp=sortMatch)
+ reversematch = reversematch[0] if reversematch[0][1] is not None else None
+
+ if options.debug:
+ print >>sys.stderr,"<<<<",reversematch
+ if reversematch is None and None not in primers:
+ if directmatch[0].direct:
+ message = 'No reverse primer match'
+ else:
+ message = 'No direct primer match'
+
+ sequence['error']=message
+ return False,sequence
+
+ if reversematch is None:
+ # only one primer found, but partial samples exist for it
+ sequence['status']='partial'
+
+ if directmatch[0].direct:
+ tags=(directmatch[1][3],None)
+ else:
+ tags=(None,directmatch[1][3])
+
+ samples = primers[None]
+
+ else:
+ sequence['status']='full'
+
+ match=str(sequence[reversematch[1][1]:reversematch[1][2]].complement())
+ # keep only what precedes the second primer match
+ sequence = sequence[0:reversematch[1][1]]
+
+ if directmatch[0].direct:
+ tags=(directmatch[1][3],reversematch[1][3])
+ sequence['reverse_score']=reversematch[1][0]
+ sequence['reverse_primer']=reversematch[0].raw
+ sequence['reverse_match']=match
+ sequence['forward_tag']=tags[0]
+ sequence['reverse_tag']=tags[1]
+
+ else:
+ tags=(reversematch[1][3],directmatch[1][3])
+ sequence['forward_score']=reversematch[1][0]
+ sequence['forward_primer']=reversematch[0].raw
+ sequence['forward_match']=match
+
+ del sequence['cut']
+ sequence['forward_tag']=tags[0]
+ sequence['reverse_tag']=tags[1]
+
+ samples = primers[reversematch[0]]
+
+
+ # records read from the reverse primer are complemented before output
+ if not directmatch[0].direct:
+ sequence=sequence.complement()
+ del sequence['complemented']
+
+ sample=None
+
+ # Sample lookup: exact tag pair when both tags are known, otherwise the
+ # single sample compatible with the only known tag
+ if tags[0] is not None: # Direct tag known
+ if tags[1] is not None: # Reverse tag known
+ sample = samples.get(tags,None)
+ else: # Reverse tag unknown
+ s=[samples[x] for x in samples if x[0]==tags[0]]
+ if len(s)==1:
+ sample=s[0]
+ elif len(s)>1:
+ sequence['error']='multiple samples match tags'
+ return False,sequence
+ else:
+ sample=None
+ else: # Direct tag unknown
+ if tags[1] is not None: # Reverse tag known
+ s=[samples[x] for x in samples if x[1]==tags[1]]
+ if len(s)==1:
+ sample=s[0]
+ elif len(s)>1:
+ sequence['error']='multiple samples match tags'
+ return False,sequence
+ else: # Reverse tag unknown
+ sample=None
+
+
+ if sample is None:
+ sequence['error']="Cannot assign sequence to a sample"
+ return False,sequence
+
+ sequence._info.update(sample)
+ sequence['seq_length']=len(sequence)
+
+ return True,sequence
+
+
+if __name__ == '__main__':
+
+
+ optionParser = getOptionManager([addNGSOptions,addInOutputOption], progdoc=__doc__)
+
+
+ (options, entries) = optionParser()
+
+# assert options.direct is not None or options.taglist is not None, \
+# "you must specify option -d ou -t"
+
+ assert options.taglist is not None,"you must specify option -t"
+
+# if options.taglist is not None:
+ primers=readTagfile(options.taglist)
+#TODO: Patch when no taglists
+# else:
+# options.direct=options.direct.lower()
+# options.reverse=options.reverse.lower()
+# primers={options.direct:(options.taglength,{})}
+# if options.reverse is not None:
+# reverse = options.reverse
+# else:
+# reverse = '-'
+# primers[options.direct][1][reverse]={'-':('-','-',True,None)}
+
+ options.primers=primers
+
+ if options.unidentified is not None:
+ unidentified = open(options.unidentified,"w")
+
+ writer = sequenceWriterGenerator(options)
+
+ if options.unidentified is not None:
+ unidentified = sequenceWriterGenerator(options,open(options.unidentified,"w"))
+ else :
+ unidentified = None
+
+ for seq in entries:
+ good,seq = annotate(seq,options)
+ if good:
+ writer(seq)
+ elif unidentified is not None:
+ unidentified(seq)
+
+
+
diff --git a/src/obiaddtaxids.py b/src/obiaddtaxids.py
new file mode 100644
index 0000000..052fe1f
--- /dev/null
+++ b/src/obiaddtaxids.py
@@ -0,0 +1,424 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obiaddtaxids`: adds *taxids* to sequence records using an ecopcr database
+==================================================================================
+
+.. codeauthor:: Celine Mercier <celine.mercier at metabarcoding.org>
+
+The :py:mod:`obiaddtaxids` command annotates sequence records with a *taxid* based on
+a taxon scientific name stored in the sequence record header.
+
+Taxonomic information linking a *taxid* to a taxon scientific name is stored in a
+database formatted as an ecoPCR database (see :doc:`obitaxonomy <obitaxonomy>`) or
+a NCBI taxdump (see NCBI ftp site).
+
+The way to extract the taxon scientific name from the sequence record header can be
+specified by two options:
+
+ - By default, the sequence identifier is used. Underscore characters (``_``) are substituted
+ by spaces before looking for the taxon scientific name into the taxonomic
+ database.
+
+ - If the input file is an ``OBITools`` extended :doc:`fasta <../fasta>` format, the ``-k`` option
+ specifies the attribute containing the taxon scientific name.
+
+ - If the input file is a :doc:`fasta <../fasta>` file imported from the UNITE or from the SILVA web sites,
+ the ``-f`` option allows specifying this source and parsing correctly the associated
+ taxonomic information.
+
+
+For each sequence record, :py:mod:`obiaddtaxids` tries to match the extracted taxon scientific name
+with those stored in the taxonomic database.
+
+ - If a match is found, the sequence record is annotated with the corresponding *taxid*.
+
+Otherwise,
+
+ - If the ``-g`` option is set and the taxon name is composed of two words and only the
+ first one is found in the taxonomic database at the 'genus' rank, :py:mod:`obiaddtaxids`
+ considers that it found the genus associated with this sequence record and it stores this
+ sequence record in the file specified by the ``-g`` option.
+
+ - If the ``-u`` option is set and no taxonomic information is retrieved from the
+ scientific taxon name, the sequence record is stored in the file specified by the
+ ``-u`` option.
+
+ *Example*
+
+
+ .. code-block:: bash
+
+ > obiaddtaxids -k species_name -g genus_identified.fasta \\
+ -u unidentified.fasta -d my_ecopcr_database \\
+ my_sequences.fasta > identified.fasta
+
+ Tries to match the value associated with the ``species_name`` key of each sequence record
+ from the ``my_sequences.fasta`` file with a taxon name from the ecoPCR database ``my_ecopcr_database``.
+
+ - If there is an exact match, the sequence record is stored in the ``identified.fasta`` file.
+
+ - If not and the ``species_name`` value is composed of two words, :py:mod:`obiaddtaxids`
+ considers the first word as a genus name and tries to find it into the taxonomic database.
+
+ - If a genus is found, the sequence record is stored in the ``genus_identified.fasta``
+ file.
+
+ - Otherwise the sequence record is stored in the ``unidentified.fasta`` file.
+
+'''
+
+
+import re
+
+from obitools.fasta import fastaIterator,formatFasta
+from obitools.options import getOptionManager
+from obitools.options.taxonomyfilter import addTaxonomyDBOptions
+from obitools.options.taxonomyfilter import loadTaxonomyDatabase
+from obitools.format.genericparser import genericEntryIteratorGenerator
+from obitools import NucSequence
+
+
+def addObiaddtaxidsOptions(optionManager):
+
+ optionManager.add_option('-g','--genus_found',
+ action="store", dest="genus_found",
+ metavar="<FILENAME>",
+ type="string",
+ default=None,
+ help="(not with UNITE databases) file used to store sequences with the genus found.")
+
+ optionManager.add_option('-u','--unidentified',
+ action="store", dest="unidentified",
+ metavar="<FILENAME>",
+ type="string",
+ default=None,
+ help="file used to store completely unidentified sequences.")
+
+ optionManager.add_option('-s','--dirty',
+ action='store', dest="dirty",
+ metavar="<FILENAME>",
+ type="str",
+ default=None,
+ help="(not with UNITE databases) if chosen, ALL the words in the name used to identify the sequences will be searched"
+ " when neither the exact name nor the genus have been found."
+ " Only use if the sequences in your database are badly named with useless words or numbers"
+ " in the name etc."
+ " The sequences identified this way will be written in <FILENAME>.")
+
+ optionManager.add_option('-f','--format',
+ action="store", dest="db_type",
+ metavar="<FORMAT>",
+ type="string",
+ default='raw',
+ help="type of the database with the taxa to be added. Possibilities : 'raw', 'UNITE_FULL', 'UNITE_GENERAL' or 'SILVA'."
+ "The UNITE_FULL format is the one used for the 'Full UNITE+INSD dataset', and the UNITE_GENERAL format is the "
+ "one used for the 'General FASTA release'."
+ " Default : raw.")
+
+ optionManager.add_option('-k','--key-name',
+ action="store", dest="tagname",
+ metavar="<KEYNAME>",
+ type="string",
+ default='',
+ help="name of the key attribute containing the taxon name in databases of 'raw' type. Default : the taxon name is the id "
+ "of the sequence. The taxon name MUST have '_' between the words of the name when it's the id, and "
+ "CAN be of this form when it's in a field.")
+
+ optionManager.add_option('-a','--restricting_ancestor',
+ action="store", dest="res_anc",
+ type="str",
+ metavar="<ANCESTOR>",
+ default='',
+ help="can be a word or a taxid (number). Enables to restrict the search of taxids under a "
+ "specified ancestor. If it's a word, it's the field containing the ancestor's taxid "
+ "in each sequence's header (can be different for each sequence). If it's a number, "
+ "it's the taxid of the ancestor (in which case it's the same for all the sequences)")
+
+
+
+def numberInStr(s) :
+ containsNumber = False
+ for c in s :
+ if c.isdigit() :
+ containsNumber = True
+ return containsNumber
+
+
+def UNITEIterator_FULL(f):
+
+ fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
+ for entry in fastaEntryIterator(f) :
+ all = entry.split('\n')
+ header = all[0]
+ fields = header.split('|')
+ seq_id = fields[0][1:]
+ seq = all[1]
+ s = NucSequence(seq_id, seq)
+
+ path = fields[1]
+
+ species_name_loc = path.index('s__')
+ species_name_loc+=3
+ s['species_name'] = path[species_name_loc:]
+
+ genus_name_loc = path.index('g__')
+ genus_name_loc+=3
+ s['genus_name'] = path[genus_name_loc:species_name_loc-4]
+
+ path = re.sub('[a-z]__', '', path)
+ s['path'] = path.replace(';', ',')
+
+ yield s
+
+
+def UNITEIterator_GENERAL(f):
+
+ fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
+ for entry in fastaEntryIterator(f) :
+ all = entry.split('\n')
+ header = all[0]
+ fields = header.split('|')
+
+ seq_id = fields[0][1:]
+ seq = all[1]
+ s = NucSequence(seq_id, seq)
+
+ s['species_name'] = seq_id.replace("_", " ")
+
+ path = fields[4]
+ path = re.sub('[a-z]__', '', path)
+ path = path.replace(';', ',')
+ s['path'] = path.replace(',,', ',')
+
+ yield s
+
+
+def SILVAIterator(f, tax):
+ '''
+ Iterate over the records of a SILVA formatted file.
+
+ The header is split on ' | ': the first field (without its leading
+ character) is the identifier, the second one the name. When the name
+ ends with a parenthesised word group, that group is interpreted either
+ as an ancestor taxon known by `tax` or as a common name.
+ The second line of the entry is used as the sequence.
+
+ @param f:   input passed to the generic entry iterator
+ @param tax: taxonomy object; its findTaxonByName method is used
+ @return: generator of annotated NucSequence objects
+ '''
+
+ fastaEntryIterator = genericEntryIteratorGenerator(startEntry='>')
+ for entry in fastaEntryIterator(f) :
+ all = entry.split('\n')
+ header = all[0]
+ fields = header.split(' | ')
+ id = fields[0][1:]
+ seq = all[1]
+ s = NucSequence(id, seq)
+
+ # Parenthesised part accepted only if it is longer than 2 characters and
+ # contains neither a ')' nor a digit (its last character is dropped first)
+ if (
+ '(' in fields[1]
+ and len(fields[1].split('(')[1][:-1]) > 2
+ and ')' not in fields[1].split('(')[1][:-1]
+ and not numberInStr(fields[1].split('(')[1][:-1])
+ ) :
+ species_name = fields[1].split('(')[0][:-1]
+ other_name = fields[1].split('(')[1][:-1]
+
+ ancestor = None
+ notAnAncestor = False
+
+ # a single capitalised word is looked up as a possible ancestor taxon
+ if (len(other_name.split(' ')) == 1 and other_name[0].isupper()):
+
+ try:
+ ancestor = tax.findTaxonByName(other_name)
+ except KeyError :
+ notAnAncestor = True
+
+ # no lookup attempted: the parenthesised part is kept as a common name
+ if (ancestor == None and notAnAncestor == False):
+ s['common_name'] = other_name
+ s['original_silva_name'] = fields[1]
+ s['species_name'] = species_name
+
+ # lookup succeeded: the parenthesised part names an ancestor
+ elif (ancestor != None and notAnAncestor == False) :
+ s['ancestor_name'] = other_name
+ s['ancestor'] = ancestor[0]
+ s['original_silva_name'] = fields[1]
+ s['species_name'] = species_name
+
+ # lookup failed: the whole name is kept unchanged
+ elif notAnAncestor == True :
+ s['species_name'] = fields[1]
+
+ # NOTE(review): presumably paired with the outer parenthesis test (names
+ # without a usable parenthesised part) - the indentation is lost here, verify
+ else :
+ s['species_name'] = fields[1]
+
+ yield s
+
+
+def dirtyLookForSimilarNames(name, tax, ancestor):
+
+ similar_name = ''
+ taxid = None
+
+ try :
+ t = tax.findTaxonByName(name)
+ taxid = t[0]
+ similar_name = t[3]
+
+ except KeyError :
+ taxid = None
+
+ if ancestor != None and not tax.isAncestor(ancestor, taxid) :
+ taxid = None
+
+ return similar_name, taxid
+
+
+def getGenusTaxid(tax, species_name, ancestor):
+ genus_sp = species_name.split(' ')
+ genus_taxid = getTaxid(tax, genus_sp[0], ancestor)
+ if tax.getRank(genus_taxid) != 'genus' :
+ raise KeyError()
+ return genus_taxid
+
+
+def getTaxid(tax, name, ancestor):
+
+ taxid = tax.findTaxonByName(name)[0][0]
+ if ancestor != None and not tax.isAncestor(ancestor, taxid) :
+ raise KeyError()
+ return taxid
+
+
+def get_species_name(s, options) :
+
+ species_name = None
+ if options.tagname == '' or options.tagname in s :
+ if options.tagname == '' :
+ species_name = s.id
+ else :
+ species_name = s[options.tagname]
+
+ if "_" in species_name :
+ species_name = species_name.replace('_', ' ')
+
+ if len(species_name.split(' ')) == 2 and (species_name.split(' ')[1] == 'sp' or species_name.split(' ')[1] == 'sp.' or species_name.split(' ')[1] == 'unknown') :
+ species_name = species_name.split(' ')[0]
+
+ if options.tagname == '' :
+ s['species_name'] = species_name
+
+ return species_name
+
+
+def getVaguelySimilarNames(species_name, tax, restricting_ancestor) :
+
+ kindOfFound = False
+ uselessWords = ['sp', 'sp.', 'fungus', 'fungal', 'unknown', 'strain', 'associated', 'uncultured']
+ for word in species_name.split(' ') :
+ if word not in uselessWords :
+ similar_name, taxid = dirtyLookForSimilarNames(word, tax, restricting_ancestor)
+ if taxid != None :
+ if len(similar_name) > len(s['species_name']) or kindOfFound == False :
+ s['species_name'] = similar_name
+ kindOfFound = True
+ return kindOfFound
+
+
+def openFiles(options) :
+
+ if options.unidentified is not None:
+ options.unidentified=open(options.unidentified,'w')
+
+ if options.genus_found is not None:
+ options.genus_found=open(options.genus_found,'w')
+
+ if options.dirty is not None:
+ options.dirty = open(options.dirty, 'w')
+
+
+################################################################################################
+
+if __name__=='__main__':
+
+ optionParser = getOptionManager([addObiaddtaxidsOptions, addTaxonomyDBOptions], progdoc=__doc__)
+
+ (options,entries) = optionParser()
+
+ tax=loadTaxonomyDatabase(options)
+
+ if options.db_type == 'raw' :
+ entryIterator = fastaIterator
+ entries = entryIterator(entries)
+ elif options.db_type == 'UNITE_FULL' :
+ entryIterator = UNITEIterator_FULL
+ entries = entryIterator(entries)
+ elif options.db_type == 'UNITE_GENERAL' :
+ entryIterator = UNITEIterator_GENERAL
+ entries = entryIterator(entries)
+ elif options.db_type == 'SILVA' :
+ entryIterator = SILVAIterator
+ entries = entryIterator(entries, tax)
+ options.tagname = 'species_name'
+
+ openFiles(options)
+
+ if (options.db_type == 'raw') or (options.db_type == 'SILVA') :
+
+ if options.res_anc == '' :
+ restricting_ancestor = None
+ elif options.res_anc.isdigit() :
+ restricting_ancestor = int(options.res_anc)
+
+ for s in entries:
+
+ if options.res_anc != '' and not options.res_anc.isdigit():
+ restricting_ancestor = int(s[options.res_anc])
+
+ species_name = get_species_name(s, options)
+
+ if species_name != None :
+ try:
+ taxid = getTaxid(tax, species_name, restricting_ancestor)
+ s['taxid'] = taxid
+ print formatFasta(s)
+
+ except KeyError:
+
+ genusFound = False
+ if options.genus_found is not None and len(species_name.split(' ')) >= 2 :
+ try:
+ genusTaxid = getGenusTaxid(tax, species_name, restricting_ancestor)
+ s['genus_taxid'] = genusTaxid
+ print>>options.genus_found, formatFasta(s)
+ genusFound = True
+ except KeyError :
+ pass
+
+ kindOfFound = False
+ if options.dirty is not None and not genusFound :
+ kindOfFound = getVaguelySimilarNames(species_name, tax, restricting_ancestor)
+ if kindOfFound == True :
+ print>>options.dirty, formatFasta(s)
+
+ if options.unidentified is not None and not genusFound and not kindOfFound :
+ print>>options.unidentified,formatFasta(s)
+
+
+ elif ((options.db_type =='UNITE_FULL') or (options.db_type =='UNITE_GENERAL')) :
+
+ restricting_ancestor = tax.findTaxonByName('Fungi')[0][0]
+
+ for s in entries :
+
+ try :
+ species_name = s['species_name']
+ taxid = getTaxid(tax, species_name, restricting_ancestor)
+ s['taxid'] = taxid
+ s['rank'] = tax.getRank(taxid)
+ print formatFasta(s)
+
+
+ except KeyError:
+
+ genusFound = False
+ if options.genus_found is not None :
+ try:
+ genusTaxid = getGenusTaxid(tax, species_name, restricting_ancestor)
+ s['genus_taxid'] = genusTaxid
+ print>>options.genus_found, formatFasta(s)
+ genusFound = True
+
+ except KeyError:
+ pass
+
+ if options.unidentified is not None and not genusFound :
+ print>>options.unidentified,formatFasta(s)
diff --git a/src/obiannotate.py b/src/obiannotate.py
new file mode 100755
index 0000000..b04c129
--- /dev/null
+++ b/src/obiannotate.py
@@ -0,0 +1,85 @@
+#!/usr/local/bin/python
+
+'''
+:py:mod:`obiannotate`: adds/edits sequence record annotations
+=============================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obiannotate` is the command that allows adding/modifying/removing
+annotation attributes attached to sequence records.
+
+Once such attributes are added, they can be used by the other OBITools commands for
+filtering purposes or for statistics computing.
+
+*Example 1:*
+
+ .. code-block:: bash
+
+ > obiannotate -S short:'len(sequence)<100' seq1.fasta > seq2.fasta
+
+ The above command adds an attribute named *short* which has a boolean value indicating whether the sequence length is less than 100bp.
+
+*Example 2:*
+
+ .. code-block:: bash
+
+ > obiannotate --seq-rank seq1.fasta | \\
+ obiannotate -C --set-identifier '"'FungA'_%05d" % seq_rank' \\
+ > seq2.fasta
+
+ The above command adds a new attribute whose value is the sequence record
+ entry number in the file. Then it clears all the sequence record attributes
+ and sets the identifier to a string beginning with *FungA_* followed by a
+ suffix with 5 digits containing the sequence entry number.
+
+*Example 3:*
+
+ .. code-block:: bash
+
+ > obiannotate -d my_ecopcr_database \\
+ --with-taxon-at-rank=genus seq1.fasta > seq2.fasta
+
+ The above command adds taxonomic information at the *genus* rank to the
+ sequence records.
+
+*Example 4:*
+
+ .. code-block:: bash
+
+ > obiannotate -S 'new_seq:str(sequence).replace("a","t")' \\
+ seq1.fasta | obiannotate --set-sequence new_seq > seq2.fasta
+
+ The overall aim of the above command is to edit the *sequence* object itself,
+ by replacing all nucleotides *a* by nucleotides *t*. First, a new attribute
+ named *new_seq* is created, which contains the modified sequence, and then
+ the former sequence is replaced by the modified one.
+
+'''
+
+from obitools.options import getOptionManager
+from obitools.options.bioseqfilter import addSequenceFilteringOptions
+from obitools.options.bioseqfilter import filterGenerator
+from obitools.options.bioseqedittag import addSequenceEditTagOptions
+from obitools.options.bioseqedittag import sequenceTaggerGenerator
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+
+
+if __name__=='__main__':
+
+ optionParser = getOptionManager([addSequenceFilteringOptions,
+ addSequenceEditTagOptions,
+ addInOutputOption], progdoc=__doc__)
+
+ (options, entries) = optionParser()
+
+ writer = sequenceWriterGenerator(options)
+
+ sequenceTagger = sequenceTaggerGenerator(options)
+ goodFasta = filterGenerator(options)
+
+ for seq in entries:
+ if goodFasta(seq):
+ sequenceTagger(seq)
+ writer(seq)
+
diff --git a/src/obiclean.py b/src/obiclean.py
new file mode 100644
index 0000000..36af2dd
--- /dev/null
+++ b/src/obiclean.py
@@ -0,0 +1,416 @@
+#!/usr/local/bin/python
+
+'''
+:py:mod:`obiclean`: tags a set of sequences for PCR/sequencing errors identification
+====================================================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obiclean` is a command that classifies sequence records either as ``head``, ``internal`` or ``singleton``.
+
+For that purpose, two pieces of information are used:
+ - sequence record counts
+ - sequence similarities
+
+*S1* a sequence record is considered as a variant of *S2* another sequence record if and only if:
+ - ``count`` of *S1* divided by ``count`` of *S2* is less than or equal to the ratio *R*.
+ *R* default value is set to 0.5, and can be adjusted between 0 and 1 with the ``-r`` option.
+ - both sequences are *related* to one another (they can align with some differences,
+ the maximum number of differences can be specified by the ``-d`` option).
+
+Considering *S* a sequence record, the following properties hold for *S* tagged as:
+ - ``head``:
+ + there exists **at least one** sequence record in the dataset that is a variant of *S*
+ + there exists **no** sequence record in the dataset such that *S* is a variant of this
+ sequence record
+ - ``internal``:
+ + there exists **at least one** sequence record in the dataset such that *S* is a variant
+ of this sequence record
+ - ``singleton``:
+ + there exists **no** sequence record in the dataset that is a variant of *S*
+ + there exists **no** sequence record in the dataset such that *S* is a variant of this
+ sequence record
+
+By default, tagging is done once for the whole dataset, but it can also be done sample by sample
+by specifying the ``-s`` option. In such a case, the counts are extracted from the sample
+information.
+
+Finally, each sequence record is annotated with three new attributes ``obiclean_headcount``,
+``obiclean_internalcount`` and ``obiclean_singletoncount``. The attribute values are the numbers
+of samples in which the sequence record has been classified in this manner.
+'''
+
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+from obitools.options import getOptionManager
+from obitools.graph import UndirectedGraph,Indexer
+from obitools.graph.dag import DAG
+from obitools.utils import progressBar
+from obitools.align import LCS
+from obitools.align import isLCSReachable
+
+
+import sys
+import math
+
+
+def addCleanOptions(optionManager):
+ optionManager.add_option('-d','--distance',
+ action="store", dest="dist",
+ metavar="###",
+ type="int",
+ default=1,
+ help="Maximum numbers of errors between two variant sequences [default: 1]")
+ optionManager.add_option('-s','--sample',
+ action="store", dest="sample",
+ metavar="<TAGNAME>",
+ type="str",
+ default=None,
+ help="Tag containing sample descriptions")
+
+ optionManager.add_option('-g','--graph',
+ action="store", dest="graph",
+ metavar="<TAGNAME>",
+ type="str",
+ default=None,
+ help="File name where the clustering graphs are saved")
+
+ optionManager.add_option('-r','--ratio',
+ action="store", dest="ratio",
+ metavar="<FLOAT>",
+ type="float",
+ default="0.5",
+ help="Minimum ratio between counts of two sequence records so that the less abundant "
+ "one can be considered as a variant of the more abundant "
+ "[default: 0.5]")
+ optionManager.add_option('-H','--head',
+ action="store_true", dest="onlyhead",
+ default=False,
+ help="Outputs only head tagged sequence records")
+
+ optionManager.add_option('-C','--cluster',
+ action="store_true", dest="clustermode",
+ default=False,
+ help="Set obiclean in clustering mode")
+
+def lookforFather(node,sample):
+ father=set()
+
+ for neighbour in node.neighbourIterator():
+ if sample in neighbour['_sample']:
+ if neighbour['_sample'][sample] > node['_sample'][sample]:
+ gdfather = lookforFather(neighbour, sample)
+ father|=gdfather
+ if not father:
+ father.add(node)
+
+ return father
+
+def cmpseqcount(s1,s2):
+ if 'count' not in s1:
+ s1['count']=1
+ if 'count' not in s2:
+ s2['count']=1
+
+ return cmp(s2['count'],s1['count'])
+
+
+if __name__ == '__main__':
+
+ optionParser = getOptionManager([addCleanOptions,addInOutputOption], progdoc=__doc__)
+ (options, entries) = optionParser()
+
+ if (options.onlyhead):
+ options.clustermode=True
+
+ globalIndex = Indexer() # I keep correspondances for all graphs between
+ # node id and sequence
+
+ db = [] # sequences are stored in a list. The indexes in the list
+ # are corresponding to the node index in graphs
+
+ sampleOccurrences = [] # Contains the list of count distribution per samples
+ # The indexes in the list are corresponding to the node
+ # index in graphs
+
+ graph = UndirectedGraph("error",indexer=globalIndex)
+ pcr= {} # For each sample store a set of node id occuring in this PCR
+
+ if options.graph is not None:
+ graphfile=open(options.graph,"w")
+ else:
+ graphfile=None
+
+ for s in entries:
+ nodeid = globalIndex.getIndex(s.id)
+ db.append(s)
+
+ if options.sample is None:
+ sample = {"XXX":s['count'] if 'count' in s else 1}
+ else:
+ sample = s[options.sample]
+
+ sampleOccurrences.append(sample)
+
+ graph.addNode(s.id,shape='circle')
+
+ for sp in sample:
+ spcr = pcr.get(sp,set())
+ spcr.add(nodeid)
+ pcr[sp]=spcr
+
+
+ writer = sequenceWriterGenerator(options)
+
+ ldb = len(db)
+ digit = int(math.ceil(math.log10(ldb)))
+ aligncount = ldb*(ldb+1)/2
+ edgecount = 0
+ print >>sys.stderr
+
+ header = "Alignment : %%0%dd x %%0%dd -> %%0%dd " % (digit,digit,digit)
+ progressBar(1,aligncount,True,"Alignment : %s x %s -> %s " % ('-'*digit,'-'*digit, '0'*digit))
+ pos=1
+ aligner = LCS()
+
+ #
+ # We build the global Levenshtein graph
+ # Two sequences are linked if their distance is at most
+ # options.dist (usually 1)
+ #
+
+ for i in xrange(ldb):
+
+ aligner.seqA = db[i]
+ li = len(db[i])
+
+ for j in xrange(i+1,ldb):
+ progressBar(pos,aligncount,head=header % (i,j,edgecount))
+ pos+=1
+
+ lj = len(db[j])
+
+ lm = max(li,lj)
+ lcsmin = lm - options.dist
+
+ if isLCSReachable(db[i],db[j],lcsmin):
+ aligner.seqB=db[j]
+ ali = aligner()
+ llcs=ali.score
+ lali = len(ali[0])
+ obsdist = lali-llcs
+ if obsdist >= 1 and obsdist <= options.dist:
+ graph.addEdge(index1=i, index2=j)
+ edgecount+=1
+
+ print >>sys.stderr
+
+ header = "Clustering sample : %20s "
+ samplecount = len(pcr)
+
+ print >>sys.stderr,"Sample count : %d" % samplecount
+
+
+ progressBar(1,samplecount,True,head=header % "--")
+ isample=0
+
+ #
+ # We iterate through all PCR
+ #
+
+ for sample in pcr:
+
+ isample+=1
+ progressBar(isample,samplecount,head=header % sample)
+
+ seqids = list(pcr[sample])
+ nnodes = len(seqids)
+
+ #
+ # We build a sub DAG for each sample
+ #
+
+ sub = DAG(sample,indexer=globalIndex)
+ counts = []
+
+ for i in seqids:
+ c=sampleOccurrences[i][sample]
+ sub.addNode(index=i,count=c,oricount=c)
+ counts.append(c)
+
+ order = map(None,counts,seqids)
+ order.sort(key=lambda a : a[0],reverse=True)
+
+ for j in xrange(nnodes - 1):
+ count1,index1 = order[j]
+ for k in xrange(j+1,nnodes):
+ count2,index2 = order[k]
+ r = float(count2)/float(count1)
+ if r <= options.ratio and graph.hasEdge(index1=index1,index2=index2):
+ sub.addEdge(index1=index1,
+ index2=index2,
+ ratio=r,
+ arette = "%d -> %d" % (count1,count2))
+
+ if (options.clustermode):
+ # We transfer the weight of errors to the parent sequence
+ # when an error has several parents, we distribute its
+ # weight to each of its parent proportionally to the parent
+ # weight.
+
+ leaves = sub.getLeaves()
+
+ while leaves:
+ for l in leaves:
+ l['color']='red'
+ l['done']=True
+ c = l['count']
+ p = l.getParents()
+ pc = [float(x['count']) for x in p]
+ ps = sum(pc)
+ pc = [x / ps * c for x in pc]
+ for i in xrange(len(pc)):
+ p[i]['count']+=int(round(pc[i]))
+
+ leaves = [x for x in sub.nodeIterator(lambda n : 'done' not in n and not [y for y in n.neighbourIterator(lambda k : 'done' not in k)])]
+
+
+
+ # Just clean the done tag set by the precedent loop
+
+ for x in sub.nodeIterator():
+ del x["done"]
+
+
+ # Annotate each sequence with its most probable parent.
+ # When a sequence has several potential parents, it is
+ # annotated with the heaviest one
+
+ heads = sub.getRoots()
+ sons = []
+ for h in heads:
+ h['cluster']=h.label
+
+ if (options.clustermode):
+ h['head'] =True
+
+ sons.extend(h.neighbourIterator(lambda k : 'cluster' not in k))
+
+ #
+ # Annotate the corresponding sequence
+ #
+
+ seq = db[h.index]
+
+ # a sequence that is head in at least one PCR gets the
+ # obiclean_head attribute
+ seq['obiclean_head']=True
+
+ if (options.clustermode):
+ # Store for each sample the cluster center related to
+ # this sequence
+ if "obiclean_cluster" not in seq:
+ seq['obiclean_cluster']={}
+ seq['obiclean_cluster'][sample]=h.label
+
+
+ # Store for each sample the count of this sequence plus
+ # the count of all its related
+ if "obiclean_count" not in seq:
+ seq["obiclean_count"]={}
+
+ seq["obiclean_count"][sample]=h['count']
+
+ if "obiclean_status" not in seq:
+ seq["obiclean_status"]={}
+
+ if len(h) > 0:
+ seq["obiclean_status"][sample]='h'
+ else:
+ seq["obiclean_status"][sample]='s'
+
+
+ heads=sons
+ sons = []
+
+ while heads:
+ for h in heads:
+ parents = h.getParents()
+ maxp=None
+ for p in parents:
+ if maxp is None or p['count']>maxp['count']:
+ maxp=p
+
+ if 'cluster' in maxp:
+ cluster = maxp['cluster']
+ h['cluster']=cluster
+ sons.extend(h.neighbourIterator(lambda k : 'cluster' not in k))
+
+ #
+ # Annotate the corresponding sequence
+ #
+
+ seq = db[h.index]
+ if (options.clustermode):
+ if "obiclean_cluster" not in seq:
+ seq['obiclean_cluster']={}
+ seq['obiclean_cluster'][sample]=cluster
+
+ if "obiclean_count" not in seq:
+ seq["obiclean_count"]={}
+ seq["obiclean_count"][sample]=h['count']
+
+ if "obiclean_status" not in seq:
+ seq["obiclean_status"]={}
+
+ seq["obiclean_status"][sample]='i'
+
+ heads=sons
+ sons = []
+
+ if graphfile is not None:
+ print >>graphfile,sub
+
+ print >>sys.stderr
+
+ seqcount = len(db)
+ sc=0
+ progressBar(1,seqcount,True,head="Writing sequences")
+
+ for node in db:
+ sc+=1
+ progressBar(sc,seqcount,head="Writing sequences")
+
+ if (not options.onlyhead or 'obiclean_head' in node):
+ status = node["obiclean_status"]
+ i=0
+ h=0
+ s=0
+ for sample in status:
+ st=status[sample]
+ if st=="i":
+ i+=1
+ elif st=="s":
+ s+=1
+ else:
+ h+=1
+ node['obiclean_headcount']=h
+ node['obiclean_internalcount']=i
+ node['obiclean_singletoncount']=s
+ node['obiclean_samplecount']=s+i+h
+
+ if 'obiclean_head' not in node:
+ node['obiclean_head']=False
+
+# if (not options.clustermode):
+# del node["obiclean_status"]
+
+ writer(node)
+
+ print >>sys.stderr
+
+
+
+
+
+
+
diff --git a/src/obicomplement.py b/src/obicomplement.py
new file mode 100644
index 0000000..f4cb073
--- /dev/null
+++ b/src/obicomplement.py
@@ -0,0 +1,62 @@
+#!/usr/local/bin/python
+"""
+:py:mod:`obicomplement`: reverse-complements sequences
+======================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obicomplement` reverse-complements the sequence records.
+
+
+ .. TIP:: The identifiers of the sequence records are modified by appending
+ to them the ``_CMP`` suffix.
+
+ .. TIP:: an attribute with key ``complemented`` and value set to ``True`` is added
+ to each reverse-complemented sequence record.
+
+By using the selection option set, it is possible to reverse-complement only a subset of the
+sequence records included in the input file. The selected sequences are reverse-complemented;
+the others are stored without modification.
+
+ *Example 1:*
+
+ .. code-block:: bash
+
+ > obicomplement seq.fasta > seqRC.fasta
+
+ Reverse-complements all sequence records from the ``seq.fasta`` file and stores the
+ result in the ``seqRC.fasta`` file.
+
+ *Example 2:*
+
+ .. code-block:: bash
+
+ > obicomplement -s 'A{10,}$' seq.fasta > seqRC.fasta
+
+ Reverse-complements sequence records from the ``seq.fasta`` file only if they end
+ with at least 10 ``A``. Other sequences are stored without modification.
+
+"""
+
+from obitools.options import getOptionManager
+from obitools.options.bioseqfilter import addSequenceFilteringOptions
+from obitools.options.bioseqfilter import filterGenerator
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+
+
+if __name__=='__main__':
+
+ optionParser = getOptionManager([addSequenceFilteringOptions,addInOutputOption], progdoc=__doc__)
+
+ (options, entries) = optionParser()
+
+ goodFasta = filterGenerator(options)
+ writer = sequenceWriterGenerator(options)
+
+ for seq in entries:
+ if goodFasta(seq):
+ writer(seq.complement())
+ else:
+ writer(seq)
+
+
\ No newline at end of file
diff --git a/src/obiconvert.py b/src/obiconvert.py
new file mode 100644
index 0000000..bbfb786
--- /dev/null
+++ b/src/obiconvert.py
@@ -0,0 +1,54 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obiconvert`: converts sequence files to different output formats
+=========================================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obiconvert` converts sequence files to different output formats.
+:doc:`See the documentation for more details on the different formats. <../formats>`
+
+Input files can be in :
+
+ - *fasta* format
+ - *extended OBITools fasta* format
+ - Sanger *fastq* format
+ - Solexa *fastq* format
+ - *ecoPCR* format
+ - *ecoPCR* database format
+ - *GenBank* format
+ - *EMBL* format
+
+:py:mod:`obiconvert` converts those files to the :
+
+ - *extended OBITools fasta* format
+ - Sanger *fastq* format
+ - *ecoPCR* database format
+
+If no file name is specified, data is read from standard input.
+
+'''
+
+from obitools.options import getOptionManager
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+from obitools.ecopcr.options import addTaxonomyDBOptions
+
+from sys import stderr
+
+if __name__ == '__main__':
+
+ optionParser = getOptionManager([addInOutputOption,addTaxonomyDBOptions])
+
+ (options, entries) = optionParser()
+ writer = sequenceWriterGenerator(options)
+
+ for entry in entries:
+ if options.skiperror:
+ try:
+ writer(entry)
+ except:
+ print >>stderr,"Skip writing of sequence : %s" % entry.id
+ else:
+ writer(entry)
+
+
\ No newline at end of file
diff --git a/src/obicount.py b/src/obicount.py
new file mode 100644
index 0000000..bc431a5
--- /dev/null
+++ b/src/obicount.py
@@ -0,0 +1,59 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obicount`: counts the number of sequence records
+=========================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obicount` counts the number of sequence records and/or the sum of the ``count`` attributes.
+
+*Example:*
+
+ .. code-block:: bash
+
+ > obicount seq.fasta
+
+ Prints the number of sequence records contained in the ``seq.fasta``
+ file and the sum of their ``count`` attributes.
+'''
+
+from obitools.options import getOptionManager
+from obitools.format.options import addInputFormatOption
+
+def addCountOptions(optionManager):
+ group=optionManager.add_option_group('Obicount specific options')
+ group.add_option('-s','--sequence',
+ action="store_true", dest="sequence",
+ default=False,
+ help="Prints only the number of sequence records."
+ )
+
+ group.add_option('-a','--all',
+ action="store_true", dest="all",
+ default=False,
+ help="Prints only the total count of sequence records (if a sequence has no `count` attribute, its default count is 1) (default: False)."
+ )
+
+
+if __name__ == '__main__':
+ optionParser = getOptionManager([addCountOptions,addInputFormatOption], progdoc=__doc__)
+
+ (options, entries) = optionParser()
+
+ count1=0
+ count2=0
+
+ for s in entries:
+ count1+=1
+ if 'count' in s:
+ count2+=s['count']
+ else:
+ count2+=1
+
+ if options.all==options.sequence:
+ print count1,count2
+ elif options.all:
+ print count2
+ else:
+ print count1
+
\ No newline at end of file
diff --git a/src/obicut.py b/src/obicut.py
new file mode 100755
index 0000000..8d01734
--- /dev/null
+++ b/src/obicut.py
@@ -0,0 +1,53 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obicut`: trims sequences
+=================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obicut` is a command that trims sequence objects based on two integer
+values: the ``-b`` option gives the first position of the sequence to be kept,
+and the ``-e`` option gives the last position to be kept. Both values can be
+computed using a python expression.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obicut -b 50 -e seq_length seq1.fasta > seq2.fasta
+
+ Keeps only the sequence part from the fiftieth position to the end.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obicut -b 50 -e seq_length-50 seq1.fasta > seq2.fasta
+
+ Trims the first and last 50 nucleotides of the sequence object.
+'''
+
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+
+from obitools.options import getOptionManager
+from obitools.options.bioseqfilter import addSequenceFilteringOptions, sequenceFilterIteratorGenerator
+
+from obitools.options.bioseqcutter import addSequenceCuttingOptions, cutterIteratorGenerator
+
+if __name__=='__main__': # @UndefinedVariable
+
+
+ optionParser = getOptionManager([addSequenceCuttingOptions,
+ addSequenceFilteringOptions,
+ addInOutputOption],
+ progdoc=__doc__) # @UndefinedVariable
+ (options, entries) = optionParser()
+
+ filter = sequenceFilterIteratorGenerator(options)
+ cutter = cutterIteratorGenerator(options)
+
+ writer = sequenceWriterGenerator(options)
+
+ for seq in cutter(filter(entries)):
+ writer(seq)
+
\ No newline at end of file
diff --git a/src/obidistribute.py b/src/obidistribute.py
new file mode 100644
index 0000000..2c31d56
--- /dev/null
+++ b/src/obidistribute.py
@@ -0,0 +1,140 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obidistribute`: Distributes sequence records over several sequence records files
+=========================================================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obidistribute` distributes equitably a set of sequence records over several files
+(No sequence records are printed on standard output).
+
+The number of files is set using the ``-n`` option (required). File names are built with a prefix if
+provided (``-p`` option) and the file number (1 to ``n``).
+
+*Example:*
+
+ .. code-block:: bash
+
+ > obidistribute -n 10 -p 'part' seq.fastq
+
+ Distributes the sequence records contained in the ``seq.fastq``
+ file over files ``part_1.fastq`` to ``part_10.fastq``.
+'''
+
+from obitools.options import getOptionManager
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+import math
+from obitools.fasta import formatFasta
+from obitools.fastq import formatFastq
+
+
+def addDistributeOptions(optionManager):
+ group = optionManager.add_option_group('obidistribute specific options')
+
+ group.add_option('-n','--number',
+ action="store", dest="number",
+ metavar="###",
+ type="int",
+ default=None,
+ help="Number of files to distribute over")
+
+ group.add_option('-p','--prefix',
+ action="store", dest="prefix",
+ metavar="<PREFIX FILENAME>",
+ type="string",
+ default="",
+ help="prefix added at each file name")
+
+
+class OutFiles:
+ def __init__(self,options):
+ self._tags = options.tagname
+ self._undefined = None
+ if options.undefined is not None:
+ self._undefined=open(options.undefined,'w')
+ self._prefix=options.prefix
+ self._files = {}
+ self._first=None
+ self._last=None
+ self._extension=options.outputFormat
+ self._digit = math.ceil(math.log10(options.number))
+
+
+ def __getitem__(self,key):
+ if key in self._files:
+ data = self._files[key]
+ prev,current,next = data
+ if next is not None:
+ if prev is not None:
+ self._files[prev][2]=next
+ self._files[next][0]=prev
+ data[0]=self._last
+ data[2]=None
+ self._last=key
+ else:
+ name = key
+ if self._prefix is not None:
+ template = "%s_%%0%dd.%s" % (self._prefix,self._digit,self._extension)
+ else:
+ template = "%%0%dd.%s" % (self._digit,self._extension)
+
+ current = open(template % name,'a')
+ prev=self._last
+ self._last=key
+ next=None
+ self._files[key]=[prev,current,next]
+ if len(self._files)>100:
+ oprev,old,onext=self._files[self._first]
+ del(self._files[self._first])
+ old.close()
+ self._first=onext
+ if self._first is None:
+ self._first=key
+ return current
+
+ def __call__(self,seq):
+ ok = reduce(lambda x,y: x and y, (z in seq for z in self._tags),True)
+ if ok:
+ k = "_".join([str(seq[x]) for x in self._tags])
+ file=self[k]
+ else:
+ file=self._undefined
+ if file is not None and self._extension=="fasta":
+ print >>file,formatFasta(seq)
+ else:
+ print >>file,formatFastq(seq)
+
+ def __del__(self):
+ k=self._files.keys()
+ for x in k:
+ del(self._files[x])
+
+if __name__=='__main__':
+
+ optionParser = getOptionManager([addDistributeOptions,addInOutputOption], progdoc=__doc__)
+
+ (options, entries) = optionParser()
+
+ assert options.number is not None, "You must specify the number of parts"
+
+ digit = math.ceil(math.log10(options.number))
+ out=[]
+
+
+ i = 0
+ for seq in entries:
+ if not out:
+ template = "%s_%%0%dd.%s" % (options.prefix,digit,options.outputFormat)
+ out=[sequenceWriterGenerator(options,
+ open(template % (i+1),"w"))
+ for i in xrange(options.number)
+ ]
+
+ out[i](seq)
+ i+=1
+ i%=options.number
+
+ del out
+
+
+
diff --git a/src/obiextract.py b/src/obiextract.py
new file mode 100644
index 0000000..3908565
--- /dev/null
+++ b/src/obiextract.py
@@ -0,0 +1,81 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obiextract`: extract samples from a dataset
+====================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+The :py:mod:`obiextract` command extracts a subset of samples from a complete
+dataset.
+
+Extracted sample names can be specified either by giving them with the ``-e`` option
+on the command line or by giving, with the ``-E`` option, the name of a file containing one sample name per line.
+
+The count attribute of the sequence and the slot describing distribution of the sample
+occurrences among samples are modified according to the selected samples.
+
+A sequence not present in at least one of the selected samples is not conserved in the
+output of :py:mod:`obiextract`.
+
+'''
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+from obitools.options import getOptionManager
+
+def addExtractOptions(optionManager):
+ optionManager.add_option('-s','--sample',
+ action="store", dest="sample",
+ metavar="<TAGNAME>",
+ type="str",
+ default="merged_sample",
+ help="Tag containing sample descriptions")
+
+ optionManager.add_option('-e','--extract',
+ action="append",
+ type="string",
+ dest="sample_list",
+ default=[],
+ metavar="<SAMPLE_NAME>",
+ help="which <SAMPLE_NAME> have to be extracted")
+
+ optionManager.add_option('-E','--extract-list',
+ action="store", dest="sample_file",
+ metavar="<FILENAME>",
+ type="str",
+ default=None,
+ help="File name where a list of sample is stored")
+
+
+def selectSamples(entry,key,samples):
+ newsamples = {}
+ oldsamples = entry.get(key,{})
+ for k in samples:
+ if k in oldsamples:
+ newsamples[k]=oldsamples[k]
+ s = sum(newsamples.values())
+ if s > 0:
+ entry['count']=s
+ entry[key]=newsamples
+ if len(newsamples)==1 and key[0:7]=='merged_':
+ entry[key[7:]]=newsamples.keys()[0]
+ else:
+ entry=None
+
+ return entry
+
+
+if __name__ == '__main__':
+ optionParser = getOptionManager([addExtractOptions,addInOutputOption],progdoc=__doc__)
+
+ (options, entries) = optionParser()
+
+ if options.sample_file is not None:
+ s = [x.strip() for x in open(options.sample_file)]
+ options.sample_list.extend(s)
+
+ writer = sequenceWriterGenerator(options)
+
+ for seq in entries:
+ seq = selectSamples(seq,options.sample,options.sample_list)
+ if seq is not None:
+ writer(seq)
+
diff --git a/src/obigrep.py b/src/obigrep.py
new file mode 100644
index 0000000..fe85380
--- /dev/null
+++ b/src/obigrep.py
@@ -0,0 +1,45 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obigrep`: filters sequence file
+========================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+The :py:mod:`obigrep` command is in some way analog to the standard Unix `grep`
+command.
+It selects a subset of sequence records from a sequence file.
+
+A sequence record is a complex object composed of an identifier,
+a set of attributes (``key=value``), a definition, and the sequence itself.
+
+Instead of working text line by text line as the standard Unix tool, selection is
+done sequence record by sequence record.
+A large set of options allows refining selection on any of the sequence record
+elements.
+
+Moreover :py:mod:`obigrep` allows specifying simultaneously several conditions (that
+take the value ``TRUE`` or ``FALSE``) and only the sequence records that fulfill all
+the conditions (all conditions are ``TRUE``) are selected.
+
+'''
+
+
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+from obitools.options import getOptionManager
+from obitools.options.bioseqfilter import addSequenceFilteringOptions
+from obitools.options.bioseqfilter import sequenceFilterIteratorGenerator
+
+# Script entry point: build the record filter from the command line options
+# (sequenceFilterIteratorGenerator) and write every record it lets through.
+if __name__=='__main__':
+
+ optionParser = getOptionManager([addSequenceFilteringOptions,addInOutputOption],progdoc=__doc__)
+
+ (options, entries) = optionParser()
+
+ goodSeq = sequenceFilterIteratorGenerator(options)
+
+ writer = sequenceWriterGenerator(options)
+
+ for seq in goodSeq(entries):
+ writer(seq)
+
+
diff --git a/src/obihead.py b/src/obihead.py
new file mode 100644
index 0000000..f9b2a22
--- /dev/null
+++ b/src/obihead.py
@@ -0,0 +1,57 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obihead`: extracts the first sequence records
+======================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obihead` command is in some way analog to the standard Unix `head` command.
+It selects the head of a sequence file.
+But instead of working text line by text line as the standard Unix tool,
+selection is done at the sequence record level. You can specify the number of sequence records
+to select.
+
+ *Example:*
+
+
+ .. code-block:: bash
+
+ > obihead -n 150 seq1.fasta > seq2.fasta
+
+ Selects the 150 first sequence records from the ``seq1.fasta`` file and stores
+ them into the ``seq2.fasta`` file.
+
+
+'''
+import sys
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+from obitools.options import getOptionManager
+
+
+# Register the -n / --sequence-count option: the number of leading sequence
+# records to output, stored as an int in options.count (default 10).
+def addHeadOptions(optionManager):
+ optionManager.add_option('-n','--sequence-count',
+ action="store", dest="count",
+ metavar="###",
+ type="int",
+ default=10,
+ help="Count of first sequences to print")
+
+
+# Script entry point: write the first options.count records, then stop reading.
+if __name__ == '__main__':
+    optionParser = getOptionManager([addHeadOptions,addInOutputOption])
+    (options, entries) = optionParser()
+
+    writer = sequenceWriterGenerator(options)
+
+    for rank,record in enumerate(entries):
+        if rank >= options.count:
+            # Enough records written: emit an empty line on stderr and leave
+            # without consuming the rest of the input.
+            print >>sys.stderr,""
+            sys.exit(0)
+        writer(record)
+
+
+
diff --git a/src/obijoinpairedend.py b/src/obijoinpairedend.py
new file mode 100644
index 0000000..3d4f8f2
--- /dev/null
+++ b/src/obijoinpairedend.py
@@ -0,0 +1,134 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obijoinpairedend`: Joins paired-end reads
+==================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obijoinpairedend` aims at joining the two reads of a paired-end library.
+
+For this purpose, it concatenates the forward read and the
+reverse-complemented reverse read.
+
+The program uses as input one or two sequence read files.
+
+ - If two files are used one of them must be specified using the ``-r`` option.
+ Sequence records corresponding to the same read pair must be in the same order
+ in the two files.
+
+ - If just one file is provided, sequence records are supposed to be all of the same length.
+ The first half of the sequence is used as forward read, the second half is used as the reverse
+ read.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obijoinpairedend -r seq3P.fastq seq5P.fastq > seq.fastq
+
+ The ``seq5P.fastq`` sequence file contains the forward sequence records.
+ The ``seq3P.fastq`` sequence file contains the reverse sequence records.
+ Pairs of reads are joined together and the resulting sequence is stored in the
+ ``seq.fastq`` file.
+
+'''
+
+from obitools.options import getOptionManager
+
+from itertools import chain
+from obitools import NucSequence
+from obitools.format.options import sequenceWriterGenerator, autoEntriesIterator,\
+ addInOutputOption
+from obitools.utils import universalOpen
+
+# Register the -r / --reverse-reads option: name of the file holding the
+# reverse reads, stored in options.reverse (None when not given).
+def addPairEndOptions(optionManager):
+ optionManager.add_option('-r','--reverse-reads',
+ action="store", dest="reverse",
+ metavar="<FILENAME>",
+ type="string",
+ default=None,
+ help="Filename containing reverse solexa reads "
+ )
+
+
+def cutDirectReverse(entries):
+    '''
+    Split each record of a single-file paired-end library into its two reads.
+
+    The read length is half of the most frequent record length observed among
+    the first records (at most 10 are inspected). Every record, including the
+    inspected ones, is then cut at that position.
+
+    @param entries: iterable of records supporting ``len`` and slicing
+    @return: a generator of ``(direct, reverse)`` tuples; nothing is
+             generated for an empty input
+    @raise AssertionError: when several lengths are equally frequent among
+             the inspected records, or when the retained length is odd
+    '''
+    from itertools import islice
+
+    entries = iter(entries)
+
+    # islice simply stops on inputs shorter than 10 records, where repeated
+    # calls to entries.next() would abort the whole generator.
+    first = list(islice(entries,10))
+    if not first:
+        return
+
+    clen = {}
+    for x in first:
+        clen[len(x)]=clen.get(len(x),0)+1
+    best = max(clen.values())
+    freq = [k for k in clen if clen[k]==best]
+    assert len(freq)==1,"Too many sequence lengths"
+    freq = freq[0]
+    assert freq % 2 == 0, "Sequence length must be even"
+    lread = freq//2
+
+    # The inspected records are put back in front of the remaining ones
+    for s in chain(first,entries):
+        yield(s[0:lread],s[lread:])
+
+
+def seqPairs(direct,reverse):
+    '''
+    Pair each record of ``direct`` with the next record read from the
+    ``reverse`` iterator and generate the ``(direct, reverse)`` tuples.
+    '''
+    for forward in direct:
+        yield (forward,reverse.next())
+
+
+
+# Generate one joined NucSequence per (direct, reverse) pair: the text of the
+# direct read followed by the text of ``r.complement()``. The new record
+# takes the direct read id suffixed with '_PairEnd', its definition and its
+# attributes. ``options`` is not used by this function.
+# When at least one read has a ``quality`` attribute, a joined quality is
+# built, a missing one being replaced by 10**-4 for each position.
+# NOTE(review): ``quality = d.quality`` followed by ``quality.extend(...)``
+# extends the direct read's own quality object in place - confirm that this
+# aliasing is harmless for the sequence class.
+# NOTE(review): indentation was lost in this copy; whether 'pairend_limit' is
+# set only for reads with quality cannot be told from here.
+def buildJoinedSequence(sequences,options):
+
+ for d,r in sequences:
+ r=r.complement()
+
+ s = str(d) + str(r)
+
+ seq = NucSequence(d.id + '_PairEnd',s,d.definition,**d)
+
+ withqual = hasattr(d, 'quality') or hasattr(r, 'quality')
+
+ if withqual:
+ if hasattr(d, 'quality'):
+ quality = d.quality
+ else:
+ quality = [10**-4] * len(d)
+
+ if hasattr(r, 'quality'):
+ quality.extend(r.quality)
+ else:
+ quality.extend([10**-4] * len(r))
+
+ seq.quality=quality
+ seq['pairend_limit']=len(d)
+
+
+ yield seq
+
+
+
+# Script entry point. Without -r each input record is cut in two halves
+# (cutDirectReverse); with -r the reverse reads are read from the given file
+# with the same reader as the main input and paired in order (seqPairs).
+# The joined records are then written.
+if __name__ == '__main__':
+ optionParser = getOptionManager([addPairEndOptions,addInOutputOption])
+
+ (options, direct) = optionParser()
+
+ if options.reverse is None:
+ sequences=cutDirectReverse(direct)
+ else:
+ reader = autoEntriesIterator(options)
+ reverse = reader(universalOpen(options.reverse))
+ sequences=seqPairs(direct,reverse)
+
+ writer = sequenceWriterGenerator(options)
+
+ for seq in buildJoinedSequence(sequences,options):
+ writer(seq)
+
+
+
diff --git a/src/obipr2.py b/src/obipr2.py
new file mode 100644
index 0000000..ac8e9d8
--- /dev/null
+++ b/src/obipr2.py
@@ -0,0 +1,302 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obipr2`: converts the PR2 database into an ecoPCR database
+===================================================================
+
+:py:mod:`obipr2`: converts and optionally downloads the `PR2 database <http://ssu-rrna.org/pr2/>`_
+into an ecoPCR database. The formatted database includes the taxonomy as defined by the PR2 authors.
+
+.. warning::
+ Take care that the numeric taxids associated to the sequences are specific
+ to this **PR2** database and not compatible with the NCBI taxids.
+ The taxids present in a version of the **PR2** database are only valid for
+ this version of the database and not compatible with the taxids used in another version
+ downloaded at an other time.
+
+
+*Example:*
+
+ .. code-block:: bash
+
+ > obipr2
+
+ This command downloads and formats the latest version of the PR2 database from
+ the official `PR2 web site <http://ssu-rrna.org/pr2/>`_.
+
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+'''
+
+
+from obitools.options import getOptionManager
+from obitools.ecopcr.taxonomy import Taxonomy
+from obitools.fasta import fastaIterator
+import sys
+from obitools.utils import universalOpen, ColumnFile
+import re
+import urllib2
+from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
+from obitools.utils import progressBar
+from os.path import isfile, join
+from os import listdir
+
+
+
+
+def numberInStr(s) :
+    '''
+    Return True when at least one character of ``s`` satisfies ``isdigit()``,
+    False otherwise (including for an empty ``s``).
+    '''
+    return any(c.isdigit() for c in s)
+
+# Register the obipr2 command line options:
+#   --localdb         directory holding a local copy of the PR2 files
+#                     (options.local, None means use the web site)
+#   -m / --min-taxid  integer stored in options.taxashift (default 10000000)
+# NOTE(review): the name silvaOptions is shared with obisilva although these
+# options concern PR2.
+def silvaOptions(optionManager):
+ optionManager.add_option('--localdb',
+ action="store", dest="local",
+ type='str',
+ default=None,
+ help="Local copy of the files located in the specified directory "
+ "will be used instead of those present on the PR2 web site")
+
+ optionManager.add_option('-m','--min-taxid',
+ action="store", dest="taxashift",
+ type="int",
+ metavar="####",
+ default=10000000,
+ help="minimal taxid for the species taxid")
+
+# http://5.196.17.195/pr2/download/entire_database/gb203_pr2.fasta.gz
+siteurl="http://5.196.17.195/"
+# siteurl already ends with '/': no extra separator, to avoid '//pr2/...'
+baseurl="%spr2/download/entire_database" % siteurl
+
+
+def getHyperlink(url):
+    '''
+    Download the page located at ``url`` and extract its hyperlinks.
+
+    @param url: address of the HTML page to read
+    @return: a dictionary mapping the text enclosed in each ``<a ...>...</a>``
+             element to the value of its double-quoted ``href`` attribute
+    '''
+    furl = urllib2.urlopen(url)
+    try:
+        # Lines are stripped and glued together so that an anchor split over
+        # several lines is still matched by the patterns below.
+        data = "".join([l.strip() for l in furl])
+    finally:
+        furl.close()
+
+    href = re.compile('<a .*?</a>',re.IGNORECASE)
+    target=re.compile('href="(.*?)"',re.IGNORECASE)
+    filename=re.compile(">(.*?)</a>",re.IGNORECASE)
+
+    links = {}
+
+    for h in href.findall(data):
+        t = target.search(h)
+        f = filename.search(h)
+        # Anchors without a double-quoted href (named anchors for instance)
+        # describe no downloadable file: skip them instead of failing.
+        if t is None or f is None:
+            continue
+        links[f.group(1)]=t.group(1)
+
+    return links
+
+def _pr2ArchiveEntry(archive,name):
+    # Return archive[name]; when absent, retry with the '.gz' suffix toggled.
+    # A KeyError is raised when neither spelling is present.
+    try:
+        return archive[name]
+    except KeyError:
+        if name[-3:]=='.gz':
+            name=name[0:-3]
+        else:
+            name=name+'.gz'
+        return archive[name]
+
+def pr2URLS(options):
+    '''
+    Locate the sequence and taxonomy files of the latest PR2 release.
+
+    Files are looked for in the ``options.local`` directory when it is set,
+    otherwise among the hyperlinks of the page located at ``baseurl``.
+    The release number is the integer following ``gb`` at the beginning of
+    the file names; the highest one found among the ``pr2.fasta[.gz]`` files
+    is retained.
+
+    When ``options.local`` is set, the module level ``baseurl`` is replaced
+    by this directory.
+
+    @return: a tuple ``(sequence location, taxonomy location, output name)``
+             where the output name is ``pr2_gb<release>``
+    '''
+
+    global baseurl
+
+    if options.local is not None:
+        archive = dict((f,f) for f in listdir(options.local) if isfile(join(options.local,f)))
+        baseurl=options.local
+    else:
+        archive=getHyperlink(baseurl)
+
+    pr2file = [x.strip() for x in archive.keys()
+               if x.strip().endswith('pr2.fasta.gz') or x.strip().endswith('pr2.fasta')
+              ]
+
+    version_pattern = re.compile("^gb([0-9]*)", re.IGNORECASE)
+
+    versions = [int(version_pattern.search(x.strip()).group(1)) for x in pr2file]
+    latest = max(versions)
+
+    seqfile=pr2file[versions.index(latest)]
+
+    pr2txfile = [x for x in archive.keys()
+                 if x.endswith('pr2.tlf.gz') or x.endswith('pr2.tlf')
+                ]
+
+    # The taxonomy file must belong to the same release as the sequence file
+    versions = [int(version_pattern.search(x).group(1)) for x in pr2txfile]
+
+    taxfile = pr2txfile[versions.index(latest)]
+
+    sequrl = _pr2ArchiveEntry(archive,seqfile)
+    taxurl = _pr2ArchiveEntry(archive,taxfile)
+
+    output = "pr2_gb%d" % latest
+
+    return "%s/%s" %(baseurl,sequrl),"%s/%s" %(baseurl,taxurl),output
+
+
+pathElementPattern = re.compile("^ *(.*?) {(.*?)} *$", re.IGNORECASE)
+def pr2PathParser(path):
+    '''
+    Parse one ``rank {name}`` element of a PR2 taxonomic path.
+
+    French rank names (classe, ordre, famille, genre, espece) are translated
+    to their English equivalent and a blank rank becomes ``no rank``.
+
+    @return: the tuple ``(rank, name)``
+    '''
+    translation = {'classe' : 'class',
+                   'ordre'  : 'order',
+                   'famille': 'family',
+                   'genre'  : 'genus',
+                   'espece' : 'species'
+                  }
+
+    found = pathElementPattern.match(path)
+    rank = found.group(1)
+
+    if rank in translation:
+        rank = translation[rank]
+    elif rank.strip()=="":
+        rank="no rank"
+
+    return rank,found.group(2)
+
+# Taxonomy built from a PR2 taxonomy dump (tab separated file whose first
+# column is a sequence accession and whose later columns are path elements
+# parsed by pr2PathParser).
+# Taxids are local to the dump: 1 is the root and the other ones are
+# allocated sequentially, starting at 2, the first time a path is met.
+# After construction ``self.pr2ac`` maps accessions to taxids.
+# NOTE(review): indentation was lost in this copy; the description of
+# _readNodeTable below assumes that ``i+=1`` closes each pass of the inner
+# loop and that ``actaxid[ac]=taxid`` is run once per node - confirm.
+class Pr2Dump(Taxonomy):
+
+ def __init__(self,taxdump=None):
+
+ self._path=taxdump
+ # The tables must be filled before the generic initialisation runs
+ self._readNodeTable(taxdump)
+
+ Taxonomy.__init__(self)
+
+ # Old style comparison function ordering taxa on their first field
+ def _taxonCmp(t1,t2):
+ if t1[0] < t2[0]:
+ return -1
+ elif t1[0] > t2[0]:
+ return +1
+ return 0
+
+ _taxonCmp=staticmethod(_taxonCmp)
+
+ # Read the dump and fill _taxonomy, _ranks, _index, _name, _preferedName
+ # and pr2ac. Progress messages are written on stderr.
+ def _readNodeTable(self,dumpfile):
+
+
+ nodes = ColumnFile(dumpfile,
+ sep='\t',
+ types=(str,pr2PathParser))
+ print >>sys.stderr,"Reading taxonomy dump file..."
+ # (taxid,rank,parent)
+
+ nexttaxid = 2
+ taxidx={'root':1}
+ actaxid={}
+ taxonomy=[[1,'root',1,'root','pr2']]
+ for node in nodes:
+ ac = node[0]
+ path = [('root','root')] + node[2:]
+ # allpath lists every prefix of the path, from the root down to the
+ # full path; each prefix is then turned into a ';' joined string key
+ allpath = [[]]
+ for s in path:
+ allpath.append(allpath[-1]+[s[1]])
+
+ allpath.pop(0)
+ allpath=[";".join(x) for x in allpath]
+ i=0
+ for p in allpath:
+ try:
+ taxid = taxidx[p]
+ except KeyError:
+ # First occurrence of this path: allocate a taxid, its parent
+ # being the taxon registered for the path minus its last element
+ taxid=nexttaxid
+ taxidx[p]=taxid
+ nexttaxid+=1
+ parent=p.rsplit(";",1)[0]
+ ptaxid=taxidx[parent]
+ rank = path[i][0]
+ name = path[i][1]
+ taxonomy.append([taxid,rank,ptaxid,name,'pr2'])
+ i+=1
+ actaxid[ac]=taxid
+
+ print >>sys.stderr,"List all taxonomy rank..."
+ ranks =list(set(x[1] for x in taxonomy))
+ ranks.sort()
+ print >>sys.stderr,ranks
+ # Python 2 idiom: map(None,a,b) pairs the items like zip(a,b)
+ rankidx = dict(map(None,ranks,xrange(len(ranks))))
+
+ self._taxonomy=taxonomy
+ self._localtaxon=len(taxonomy)
+
+ print >>sys.stderr,"Indexing taxonomy..."
+ index = {}
+ for i in xrange(self._localtaxon):
+ index[self._taxonomy[i][0]]=i
+
+ print >>sys.stderr,"Indexing parent and rank..."
+ # Rank names and parent taxids are replaced by their index
+ for t in self._taxonomy:
+ t[1]=rankidx[t[1]]
+ t[2]=index[t[2]]
+
+ self._ranks=ranks
+ self._index=index
+ self._preferedName = []
+
+ self._name=[(n[3],'scientific name',self._index[n[0]]) for n in taxonomy]
+ self.pr2ac=actaxid
+
+
+
+def pr22obi(seq,taxonomy):
+    '''
+    Annotate a PR2 sequence record with its taxid.
+
+    The PR2 identifier has the form ``accession|description``: the record id
+    is reduced to the accession and the description becomes its definition.
+    An identifier without ``|`` is kept unchanged and used as the accession.
+    The ``taxid`` attribute is set only when the accession is known from
+    ``taxonomy.pr2ac``.
+
+    @param seq: the sequence record, modified in place
+    @param taxonomy: object exposing the ``pr2ac`` accession to taxid mapping
+    @return: ``seq``
+    '''
+    parts = seq.id.split("|",1)
+    if len(parts)==2:
+        seq.id,seq.definition=parts
+
+    try:
+        seq['taxid']=taxonomy.pr2ac[seq.id]
+    except KeyError:
+        # Unknown accession: the caller detects the missing taxid
+        pass
+
+    return seq
+
+
+
+# Script entry point: locate the PR2 files (pr2URLS), load the taxonomy
+# (Pr2Dump), then write every sequence carrying a taxid with an
+# EcoPCRDBSequenceWriter. Sequences without taxid are reported on stderr.
+# The progress bar length is the number of accessions of the taxonomy dump.
+if __name__ == '__main__':
+
+ optionParser = getOptionManager([silvaOptions])
+
+ (options, entries) = optionParser()
+
+ sequrl,taxurl,options.ecopcroutput = pr2URLS(options)
+
+ taxonomydata = universalOpen(taxurl)
+
+ options.taxonomy = Pr2Dump(taxonomydata)
+
+# if options.write != '' :
+# options.write = open(options.write, 'w')
+
+ # The entries returned by the option parser are replaced by the PR2 file
+ entries = fastaIterator(universalOpen(sequrl))
+ writer = EcoPCRDBSequenceWriter(options)
+
+ nseq = len(options.taxonomy.pr2ac)
+
+ progressBar(1,nseq,
+ head=options.ecopcroutput)
+
+ done=0
+ for e in entries:
+ e = pr22obi(e, options.taxonomy)
+ done+=1
+ progressBar(done,nseq,
+ head=options.ecopcroutput)
+
+ if 'taxid' in e:
+ writer.put(e)
+ else:
+ print >>sys.stderr,"\nCannot find taxon for entry : %s : %s" % (e.id,e.definition)
+
+ print >>sys.stderr
+
+
\ No newline at end of file
diff --git a/src/obisample.py b/src/obisample.py
new file mode 100644
index 0000000..7fb4654
--- /dev/null
+++ b/src/obisample.py
@@ -0,0 +1,119 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obisample`: randomly resamples sequence records
+========================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+
+:py:mod:`obisample` randomly resamples sequence records with or without replacement.
+
+'''
+
+
+from obitools.options import getOptionManager
+from obitools.sample import weigthedSample, weigthedSampleWithoutReplacement
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+import random
+
+# Register the obisample command line options:
+#   -s / --sample-size          float stored in options.size (default None)
+#   -a / --approx-sampling      flag stored in options.approx
+#   -w / --without-replacement  flag stored in options.woreplace
+def addSampleOptions(optionManager):
+ optionManager.add_option('-s','--sample-size',
+ action="store", dest="size",
+ metavar="###",
+ type="float",
+ default=None,
+ help="Size of the generated sample. "
+ "If -a option is set, size is expressed as fraction"
+ )
+ optionManager.add_option('-a','--approx-sampling',
+ action="store_true", dest="approx",
+ default=False,
+ help="Switch to an approximative algorithm, "
+ "useful for large files"
+ )
+
+ optionManager.add_option('-w','--without-replacement',
+ action="store_true", dest="woreplace",
+ default=False,
+ help="Ask for sampling without replacement"
+ )
+
+def rbinom(n,p):
+    '''
+    Count, over ``n`` draws of ``random.random()``, how many are lower
+    than ``p``.
+    '''
+    successes = 0
+    for _ in xrange(n):
+        if random.random() < p:
+            successes += 1
+    return successes
+
+# Script entry point.
+# Without -a every record is loaded in memory and the sample is drawn from
+# the per-record 'count' values with weigthedSample, or with
+# weigthedSampleWithoutReplacement when -w is set; the sample size defaults
+# to the number of records.
+# With -a records are streamed: each one is first thinned with
+# rbinom(count, p), p being 1.5 times the requested fraction capped at 1,
+# then int(total * fraction) occurrences are drawn without replacement from
+# what was kept.
+# Each sampled record is written with its 'count' set to its sampled count.
+# NOTE(review): every record is expected to carry a 'count' attribute (it is
+# read without guard) - confirm against the sequence class.
+# NOTE(review): the option checks below rely on assert statements, which are
+# removed when python runs with -O.
+if __name__ == '__main__':
+
+ optionParser = getOptionManager([addSampleOptions,addInOutputOption]
+ )
+
+ (options, entries) = optionParser()
+
+ if not options.approx:
+
+ db = [s for s in entries]
+
+ if options.size is None:
+ options.size=len(db)
+ else:
+ options.size=int(options.size)
+
+ # distribution maps the index of a record in db to its weight
+ distribution = {}
+ idx=0
+ total = 0
+ for s in db:
+ count = s['count']
+ total+=count
+ distribution[idx]=count
+ idx+=1
+
+ if options.woreplace:
+ assert options.size <= total
+ sp = weigthedSampleWithoutReplacement
+ else:
+ sp= weigthedSample
+
+ sample =sp(distribution, options.size)
+
+
+ else:
+ db = []
+ distribution = {}
+ idx = 0
+ total = 0
+
+ assert options.size is not None, \
+ "You cannot specify option -a without option -s"
+
+ assert options.size>=0 and options.size <=1, \
+ "When used with -a options -s must be a probability"
+
+ p = options.size * 1.5
+
+ if p > 1.:
+ p = 1.
+
+ for seq in entries:
+ count = seq['count']
+ total+=count
+
+ n = rbinom(count, p)
+
+ if n > 0:
+ db.append(seq)
+ distribution[idx]=n
+
+ idx+=1
+
+ size = int(total * options.size)
+ sample=weigthedSampleWithoutReplacement(distribution, size)
+
+ writer = sequenceWriterGenerator(options)
+
+ for idx in sample:
+ seq = db[idx]
+ seq['count']=sample[idx]
+ writer(seq)
+
+
+
+
diff --git a/src/obiselect.py b/src/obiselect.py
new file mode 100644
index 0000000..0e05e82
--- /dev/null
+++ b/src/obiselect.py
@@ -0,0 +1,281 @@
+#!/usr/local/bin/python
+"""
+:py:mod:`obiselect` : selects representative sequence records
+=============================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obiselect` command allows to select a subset of sequences records from a sequence
+file by describing sequence record groups and defining how many and which sequence records
+from each group must be retrieved.
+
+"""
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+from obitools.options import getOptionManager
+from obitools.ecopcr.options import addTaxonomyDBOptions, loadTaxonomyDatabase
+from random import random
+from obitools.utils import progressBar
+import math
+import sys
+from obitools.utils.bioseq import mergeTaxonomyClassification
+
+def minimum(seqs):
+    '''Return the lowest ``select`` value found among the records of ``seqs``.'''
+    values = [s['select'] for s in seqs]
+    return min(values)
+
+def maximum(seqs):
+    '''
+    Return the highest ``select`` value found among the records of ``seqs``.
+
+    @raise TypeError: when the values cannot be compared; the group is
+           written on stderr before the exception is propagated
+    '''
+    try:
+        return max(s['select'] for s in seqs)
+    except TypeError:
+        sys.stderr.write("%s\n" % (seqs,))
+        # A bare raise keeps the traceback of the original error
+        raise
+def mean(seqs):
+    '''
+    Return, as a float, the arithmetic mean of the ``select`` values of the
+    records of ``seqs``. An empty ``seqs`` raises ZeroDivisionError.
+    '''
+    total = sum(s['select'] for s in seqs)
+    return float(total) / len(seqs)
+
+def median(seqs):
+    '''
+    Return the median of the ``select`` values of the records of ``seqs``.
+
+    For an even number of records the upper of the two central values is
+    returned. An empty ``seqs`` raises IndexError.
+    '''
+    ss = sorted(s['select'] for s in seqs)
+    # Explicit floor division: the index must stay an integer
+    return ss[len(ss)//2]
+
+
+
+# Register the obiselect command line options in a dedicated option group.
+# -m, -M, -a and --median all write to options.method, which holds one of the
+# functions minimum, maximum, mean or median (maximum when none is given).
+# The other options set options.categories, options.number,
+# options.function, options.merge, options.sample and options.mergeids.
+# NOTE(review): the help of -s announces *merged_sample* as default value
+# whereas the declared default is None - confirm which one is intended.
+def addSelectOptions(optionManager):
+
+ group = optionManager.add_option_group('obiselect specific options')
+
+
+ group.add_option('-c','--category-attribute',
+ action="append", dest="categories",
+ metavar="<Attribute Name>",
+ default=[],
+ help="Add one attribute to the list of"
+ " attribute used for categorizing sequence records")
+
+ group.add_option('-n','--number',
+ action="store", dest="number",
+ metavar="",
+ type="int",
+ default=1,
+ help="number of sequence records to keep in each category")
+
+ group.add_option('-f','--function',
+ action="store", dest="function",
+ metavar="",
+ default="random",
+ help="python code evaluated for each sequence record [default: random value]")
+
+
+ group.add_option('-m','--min',
+ action="store_const", dest="method",
+ metavar="",
+ default=maximum,
+ const=minimum,
+ help="select sequence record in each group minimizing the function"
+ " (exclusive with -M, -a, --median)")
+
+ group.add_option('-M','--max',
+ action="store_const", dest="method",
+ metavar="",
+ default=maximum,
+ const=maximum,
+ help="select sequence record in each group maximizing the function"
+ " (exclusive with -m, -a, --median)")
+
+ group.add_option('-a','--mean',
+ action="store_const", dest="method",
+ metavar="",
+ default=maximum,
+ const=mean,
+ help="select sequence record in each group closest to the mean of the function"
+ " (exclusive with -m, -M, --median)")
+
+ group.add_option('--median',
+ action="store_const", dest="method",
+ metavar="<Attribute Name>",
+ default=maximum,
+ const=median,
+ help="select sequence record in each group closest to the median of the function"
+ " (exclusive with -m, -M, -a)")
+
+ group.add_option('--merge',
+ action="append", dest="merge",
+ metavar="<TAG NAME>",
+ type="string",
+ default=[],
+ help="attributes to merge within each group")
+
+ group.add_option('-s','--sample',
+ action="store", dest="sample",
+ metavar="<TAGNAME>",
+ type="str",
+ default=None,
+ help="Tag containing sample descriptions, the default value is set to *merged_sample*")
+
+ group.add_option('--merge-ids',
+ action="store_true", dest="mergeids",
+ default=False,
+ help="add the merged id data to output")
+
+
+
+def sortclass(seqs,options):
+    '''
+    Sort, in place, the records of ``seqs`` by increasing distance between
+    their ``select`` value and the target value of the group.
+
+    The target is ``options.method(seqs)``. The distance is stored in the
+    ``distance`` attribute of each record.
+    '''
+    cible = float(options.method(seqs))
+    for s in seqs:
+        s['distance']=math.sqrt((float(s['select'])-cible)**2)
+    # Key based sort: same stable ordering as comparing the distances
+    seqs.sort(key=lambda s: s['distance'])
+
+
+
+# Script entry point, in three passes:
+#  1. every record is read, assigned to the group identified by the values
+#     of the -c expressions, and its 'select' attribute is set to the value
+#     of the -f expression (None when the evaluation fails);
+#  2. each group, or each per-sample subset of it when -s is given, is
+#     ordered by sortclass, merged attributes are accumulated, and the
+#     retained records are flagged;
+#  3. the flagged records are written.
+# NOTE(review): indentation was lost in this copy and the nesting of the
+# second pass cannot be recovered with certainty, so the statements are kept
+# exactly as found; only the progress message of the third pass is fixed.
+# NOTE(review): the -c and -f expressions are run through eval(); they must
+# come from a trusted command line.
+if __name__ == '__main__':
+
+ optionParser = getOptionManager([addSelectOptions,addInOutputOption,addTaxonomyDBOptions])
+
+ (options, entries) = optionParser()
+
+ taxonomy=loadTaxonomyDatabase(options)
+
+ writer = sequenceWriterGenerator(options)
+
+ classes = {}
+
+ print >>sys.stderr,"\nLoading sequences...\n"
+
+ with_taxonomy=hasattr(options, 'taxonomy') and options.taxonomy is not None
+
+ nbseq=0
+
+ for s in entries:
+ nbseq+=1
+ category = []
+
+ if with_taxonomy:
+ environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()}
+ else:
+ environ = {'sequence':s,'random':random()}
+
+ for c in options.categories:
+ try:
+ v = eval(c,environ,s)
+ category.append(v)
+ except:
+ category.append(None)
+
+ category=tuple(category)
+ group = classes.get(category,[])
+ group.append(s)
+ classes[category]= group
+
+ try:
+ select = eval(options.function,environ,s)
+ s['select']=select
+ except:
+ s['select']=None
+
+ mergedKey = options.merge
+ mergeIds = options.mergeids
+
+ if mergedKey is not None:
+ mergedKey=set(mergedKey)
+ else:
+ mergedKey=set()
+
+ if taxonomy is not None:
+ mergedKey.add('taxid')
+
+
+ print >>sys.stderr,"\nSelecting sequences...\n"
+
+ lclasses=len(classes)
+ progressBar(1,lclasses,True,'Selecting')
+ i=0
+ for c in classes:
+ i+=1
+ progressBar(i,lclasses,False,"%15s" % ("/".join(map(str,c)),))
+ seqs = classes[c]
+ if options.sample is not None:
+ subsets = {}
+ for s in seqs:
+ for sid in s[options.sample]:
+ ss = subsets.get(sid,[])
+ ss.append(s)
+ subsets[sid]=ss
+ else:
+ subsets={"all":seqs}
+
+ for seqs in subsets.values():
+ sortclass(seqs, options)
+
+ if len(c)==1:
+ c=c[0]
+
+ if options.number==1 and options.sample is None:
+ s = seqs[0]
+
+ for key in mergedKey:
+ if key=='taxid' and mergeIds:
+ if 'taxid_dist' not in s:
+ s["taxid_dist"]={}
+ if 'taxid' in s:
+ s["taxid_dist"][s.id]=s['taxid']
+ mkey = "merged_%s" % key
+ if mkey not in s:
+ if key in s:
+ s[mkey]={s[key]:1}
+ else:
+ s[mkey]={}
+
+ if 'count' not in s:
+ s['count']=1
+ if mergeIds:
+ s['merged']=[s.id]
+
+ for seq in seqs[1:]:
+
+ if 'count' in seq:
+ s['count']+=seq['count']
+ else:
+ s['count']+=1
+
+ for key in mergedKey:
+ if key=='taxid' and mergeIds:
+ if 'taxid_dist' in seq:
+ s["taxid_dist"].update(seq["taxid_dist"])
+ if 'taxid' in seq:
+ s["taxid_dist"][seq.id]=seq['taxid']
+
+ mkey = "merged_%s" % key
+ # NOTE(review): when seq holds neither mkey nor key, m is not
+ # assigned here and a value left by a previous pass may be reused
+ if mkey in seq:
+ m = seq[mkey]
+ else:
+ if key in seq:
+ m={seq[key]:1}
+
+ allmkey = set(m.keys()) | set(s[mkey].keys())
+ s[mkey] = dict((k,m.get(k,0)+s[mkey].get(k,0)) for k in allmkey)
+
+ if mergeIds:
+ s['merged'].append(seq.id)
+
+ if taxonomy is not None:
+ mergeTaxonomyClassification(seqs, taxonomy)
+
+
+ for s in seqs[0:options.number]:
+ s['class']=c
+ s['__ at TOWRITE@__']=True
+
+ print >>sys.stderr,"\nWriting sequences...\n"
+ progressBar(1,nbseq,True,'Writing')
+
+ i=0
+ for c in classes:
+ seqs = classes[c]
+ for s in seqs:
+ i+=1
+ progressBar(i,nbseq,False,"Writing")
+ if '__ at TOWRITE@__' in s:
+ del s['__ at TOWRITE@__']
+ del s['select']
+ writer(s)
+
+ print >>sys.stderr
diff --git a/src/obisilva.py b/src/obisilva.py
new file mode 100644
index 0000000..9816815
--- /dev/null
+++ b/src/obisilva.py
@@ -0,0 +1,355 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obisilva`: converts silva database into an ecoPCR database
+===================================================================
+
+:py:mod:`obisilva`: converts and optionally downloads the `Silva database <http://www.arb-silva.de>`_
+into an ecoPCR database. The formatted database includes the taxonomy as defined by the Silva authors.
+
+.. warning::
+ Take care that the numeric taxids associated to the sequences are specific
+ to this Silva database and not compatible with the NCBI taxids.
+ The taxids present in a version of the Silva database (*i.e* ssu, lsu, parc, ref...)
+ are only valid for this version of the database and not compatible
+ with the taxids used in another version.
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+'''
+
+
+from obitools.options import getOptionManager
+from obitools.ecopcr.taxonomy import ecoTaxonomyWriter, Taxonomy
+from obitools.fasta import fastaIterator
+import sys
+from obitools.utils import universalOpen, ColumnFile
+import re
+import urllib2
+from obitools import NucSequence
+from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
+from obitools.utils import progressBar
+from os.path import isfile, join
+from os import listdir
+
+
+
+
+def numberInStr(s) :
+    '''
+    Return True when at least one character of ``s`` satisfies ``isdigit()``,
+    False otherwise (including for an empty ``s``).
+    '''
+    return any(c.isdigit() for c in s)
+
+# Register the obisilva command line options:
+#   -s / -l    set options.rrna to "ssu" or "lsu" (None when neither is given)
+#   -p / -r    set options.type to "parc" or "ref" (None when neither is given)
+#   -n, -t     flags stored in options.nr and options.trunc
+#   --localdb  directory holding a local copy of the files (options.local)
+#   -m         integer stored in options.taxashift (default 10000000)
+# NOTE(review): the metavar values of the store_const options describe a
+# "<taxon_name>:rank:parent" argument that these options do not take.
+def silvaOptions(optionManager):
+ optionManager.add_option('-s','--ssu',
+ action="store_const", dest="rrna",
+ metavar="<taxon_name>:rank:parent",
+ const = "ssu",
+ default=None,
+ help="specify that you are interested in the SSU database")
+
+ optionManager.add_option('-l','--lsu',
+ action="store_const", dest="rrna",
+ metavar="<taxon_name>:rank:parent",
+ const = "lsu",
+ default=None,
+ help="specify that you are interested in the LSU database")
+
+ optionManager.add_option('-p','--parc',
+ action="store_const", dest="type",
+ metavar="<taxon_name>:rank:parent",
+ const = "parc",
+ default=None,
+ help="specify that you are interested in the parc version of the database")
+
+ optionManager.add_option('-r','--ref',
+ action="store_const", dest="type",
+ metavar="<taxon_name>:rank:parent",
+ const = "ref",
+ default=None,
+ help="specify that you are interested in the reference version of the database")
+
+ optionManager.add_option('-n','--nr',
+ action="store_true", dest="nr",
+ default=False,
+ help="specify that you are interested in the non redundant version of the database")
+
+ optionManager.add_option('-t','--trunc',
+ action="store_true", dest="trunc",
+ default=False,
+ help="specify that you are interested in the truncated version of database")
+
+ optionManager.add_option('--localdb',
+ action="store", dest="local",
+ type='str',
+ default=None,
+ help="Local copy of the files located in the specified directory "
+ "will be used instead of those present on the ARB-Silva web site")
+
+ optionManager.add_option('-m','--min-taxid',
+ action="store", dest="taxashift",
+ type="int",
+ metavar="####",
+ default=10000000,
+ help="minimal taxid for the species taxid")
+
+# Root of the ARB-Silva web site and location of the current release exports
+siteurl="http://www.arb-silva.de/"
+baseurl="%sno_cache/download/archive/current/Exports" % siteurl
+
+# Name pattern of the sequence file for each combination of
+# (options.rrna,options.type,options.trunc,options.nr)
+# %s stands for the release version; None marks a combination for which no
+# file name is defined
+seqfilepattern={('lsu','parc',False,False) : "SILVA_%s_LSUParc_tax_silva.fasta.gz",
+ ('lsu','parc',False,True ) : None,
+ ('lsu','parc',True ,False) : "SILVA_%s_LSUParc_tax_silva_trunc.fasta.gz",
+ ('lsu','parc',True ,True ) : None,
+ ('lsu','ref' ,False,False) : "SILVA_%s_LSURef_tax_silva.fasta.gz",
+ ('lsu','ref' ,False,True ) : None,
+ ('lsu','ref' ,True ,False) : "SILVA_%s_LSURef_tax_silva_trunc.fasta.gz",
+ ('lsu','ref' ,True ,True ) : None,
+ ('ssu','parc',False,False) : "SILVA_%s_SSUParc_tax_silva.fasta.gz",
+ ('ssu','parc',False,True ) : None,
+ ('ssu','parc',True ,False) : "SILVA_%s_SSUParc_tax_silva_trunc.fasta.gz",
+ ('ssu','parc',True ,True ) : None,
+ ('ssu','ref' ,False,False) : "SILVA_%s_SSURef_tax_silva.fasta.gz",
+ ('ssu','ref' ,False,True ) : "SILVA_%s_SSURef_Nr99_tax_silva.fasta.gz",
+ ('ssu','ref' ,True ,False) : "SILVA_%s_SSURef_tax_silva_trunc.fasta.gz",
+ ('ssu','ref' ,True ,True ) : "SILVA_%s_SSURef_Nr99_tax_silva_trunc.fasta.gz"
+ }
+# Name pattern of the taxonomy file, keyed by options.rrna only
+taxfilepattern={'lsu' : "tax_slv_lsu_%s.txt",
+ 'ssu' : "tax_slv_ssu_%s.txt"
+ }
+
+def getHyperlink(url):
+    '''
+    Download the page located at ``url`` and extract its hyperlinks.
+
+    @param url: address of the HTML page to read
+    @return: a dictionary mapping the text enclosed in each ``<a ...>...</a>``
+             element to the value of its double-quoted ``href`` attribute
+    '''
+    furl = urllib2.urlopen(url)
+    try:
+        # Lines are stripped and glued together so that an anchor split over
+        # several lines is still matched by the patterns below.
+        data = "".join([l.strip() for l in furl])
+    finally:
+        furl.close()
+
+    href = re.compile('<a .*?</a>',re.IGNORECASE)
+    target=re.compile('href="(.*?)"',re.IGNORECASE)
+    filename=re.compile(">(.*?)</a>",re.IGNORECASE)
+
+    links = {}
+
+    for h in href.findall(data):
+        t = target.search(h)
+        f = filename.search(h)
+        # Anchors without a double-quoted href (named anchors for instance)
+        # describe no downloadable file: skip them instead of failing.
+        if t is None or f is None:
+            continue
+        links[f.group(1)]=t.group(1)
+
+    return links
+
+# Locate the sequence and taxonomy files of the Silva release to convert.
+# Files are looked for in options.local (and its 'taxonomy' sub-directory)
+# when it is set, in which case the module level siteurl is replaced by this
+# directory; otherwise the hyperlinks of the pages at baseurl and
+# baseurl + "/taxonomy" are used.
+# Returns (sequence location, taxonomy location, output name).
+# Raises AssertionError when seqfilepattern holds None for the requested
+# combination of options.
+def silvaURLS(options):
+ global siteurl
+
+ if options.local is not None:
+ archive = dict((f,f) for f in listdir(options.local) if isfile(join(options.local,f)))
+ taxonomy= dict((f,"taxonomy/"+f) for f in listdir(options.local+'/taxonomy') if isfile(join(options.local+'/taxonomy',f)))
+ siteurl=options.local
+ else:
+ archive=getHyperlink(baseurl)
+ taxonomy=getHyperlink(baseurl+"/taxonomy")
+
+ silvafile = [x for x in archive.keys()
+ if x.startswith('SILVA') and (x.endswith('fasta.gz') or x.endswith('fasta'))
+ ]
+ # The version is the second '_' separated field of the file name, read as
+ # a tuple of integers so that the highest release sorts first
+ versions = [tuple(map(int, x.split('_')[1].split('.'))) for x in silvafile]
+ versions.sort(reverse=True)
+ version='.'.join(map(str,versions[0]))
+ #if all(x==versions[0] for x in versions):
+ # version = int(versions[0])
+ #else:
+ # raise AssertionError("Unable to identify the database version")
+
+ whichfile = (options.rrna,options.type,options.trunc,options.nr)
+
+ seqfile = seqfilepattern[whichfile]
+
+ if seqfile is None:
+ raise AssertionError("Non existing version of Silva")
+
+ seqfile = seqfile % version
+ taxfile = taxfilepattern[options.rrna] % version
+
+ # Each file is looked up under its expected name, then with the '.gz'
+ # suffix removed or added
+ try:
+ sequrl = archive[seqfile]
+ except KeyError:
+ if seqfile[-3:]=='.gz':
+ seqfile=seqfile[0:-3]
+ else:
+ seqfile=seqfile+'.gz'
+ sequrl = archive[seqfile]
+
+ try:
+ taxurl = taxonomy[taxfile]
+ except KeyError:
+ if taxfile[-3:]=='.gz':
+ taxfile=taxfile[0:-3]
+ else:
+ taxfile=taxfile+'.gz'
+ taxurl = taxonomy[taxfile]
+
+ output = "silva_%s_%s%s_%s%s" % (version,options.rrna,options.type,
+ {True:"nr_" , False:""}[options.nr],
+ {True:"trunc" , False:"full"}[options.trunc]
+ )
+ return "%s/%s" %(siteurl,sequrl),"%s/%s" %(siteurl,taxurl),output
+
+
+def silvaPathParser(path):
+    '''
+    Split a ``;`` terminated Silva taxonomic path into the list
+    ``[parent path, taxon name]``.
+
+    A path made of a single element has ``root`` as parent path.
+    '''
+    parts = path.strip().rsplit(";",2)[0:2]
+    if parts[1]=="":
+        # Only one element before the final ';': the taxon hangs on the root
+        return ["root",parts[0]]
+    return parts
+
+# Taxonomy built from a Silva taxonomy dump (tab separated file read with
+# the column types str,int,str,str,int; the first three columns are used as
+# path, taxid and rank).
+# A root taxon with taxid 1 is added. After construction ``self.silvaname``
+# maps each taxonomic path, without its final ';', to its taxid.
+class SilvaDump(Taxonomy):
+
+ def __init__(self,taxdump=None):
+
+ self._path=taxdump
+ # The tables must be filled before the generic initialisation runs
+ self._readNodeTable(taxdump)
+
+ print >>sys.stderr,"Adding scientific name..."
+
+# self._nameidx = {}
+# for x in self._name :
+# if x[0] not in self._nameidx :
+# self._nameidx[x[0]] = [x[2]]
+# else :
+# self._nameidx[x[0]].append(x[2])
+
+ # self._bigestTaxid = max(x[0] for x in self._taxonomy)
+
+ Taxonomy.__init__(self)
+
+ # Old style comparison function ordering taxa on their first field
+ def _taxonCmp(t1,t2):
+ if t1[0] < t2[0]:
+ return -1
+ elif t1[0] > t2[0]:
+ return +1
+ return 0
+
+ _taxonCmp=staticmethod(_taxonCmp)
+
+ # Read the dump and fill _taxonomy, _ranks, _index, _name, _preferedName
+ # and silvaname. Progress messages are written on stderr.
+ def _readNodeTable(self,dumpfile):
+
+ nodes = ColumnFile(dumpfile,
+ sep='\t',
+ types=(str,int,str,str,int))
+ print >>sys.stderr,"Reading taxonomy dump file..."
+ # (taxid,rank,parent)
+
+ # At this stage the third field still holds the full path of the taxon
+ taxonomy=[[n[1],n[2],n[0]] for n in nodes]
+ taxonomy.append([1,'root','root;'])
+
+ print >>sys.stderr,"Sorting taxons..."
+
+ taxonomy.sort(SilvaDump._taxonCmp)
+
+ print >>sys.stderr,"Assigning parent taxids..."
+
+ # Path without its final ';' -> taxid
+ taxidx=dict((n[2][0:-1],n[0]) for n in taxonomy)
+
+ # The path is replaced by two fields: parent path and taxon name
+ taxonomy=[[n[0],n[1]]+ silvaPathParser(n[2]) for n in taxonomy]
+
+ print >>sys.stderr,"Extracting scientific name..."
+
+ taxonomy=[[n[0],n[1],taxidx[n[2]],n[3],'silva'] for n in taxonomy]
+
+ print >>sys.stderr,"List all taxonomy rank..."
+ # 'species' is always declared as a rank, even when absent from the dump
+ ranks =list(set(x[1] for x in taxonomy) | set(['species']))
+ ranks.sort()
+ # Python 2 idiom: map(None,a,b) pairs the items like zip(a,b)
+ rankidx = dict(map(None,ranks,xrange(len(ranks))))
+
+ self._taxonomy=taxonomy
+ self._localtaxon=len(taxonomy)
+
+ print >>sys.stderr,"Indexing taxonomy..."
+ index = {}
+ for i in xrange(self._localtaxon):
+ index[self._taxonomy[i][0]]=i
+
+ print >>sys.stderr,"Indexing parent and rank..."
+ # Rank names and parent taxids are replaced by their index
+ for t in self._taxonomy:
+ t[1]=rankidx[t[1]]
+ t[2]=index[t[2]]
+
+ self._ranks=ranks
+ self._index=index
+ self._preferedName = []
+
+ self._name=[(n[3],'scientific name',self._index[n[0]]) for n in taxonomy]
+ self.silvaname=taxidx
+
+
+# Convert a Silva record into a NucSequence: the sequence is lower-cased,
+# 'u' is replaced by 't', id and definition are kept.
+# The definition is split at its last ';' into the ancestor path and the
+# species name; the ancestor taxid is read from taxonomy.silvaname and a
+# local 'species' taxon is added with taxonomy.addLocalTaxon. An ancestor
+# unknown from taxonomy.silvaname leaves the record without 'taxid'.
+# Taxids of ancestors of rank genus are added to the ``state`` set.
+# NOTE(review): ``options`` is not a parameter; the function relies on the
+# variable defined by the script entry point and cannot be called from
+# another module.
+# NOTE(review): a definition without ';' makes the two-name unpacking fail
+# with a ValueError that is not handled here.
+# NOTE(review): indentation was lost in this copy; whether addLocalTaxon is
+# called for every known ancestor or only for genus ones cannot be told
+# from here.
+def silva2obi(seq,taxonomy,state):
+ s = str(seq).lower().replace('u', 't')
+ s = NucSequence(seq.id,s,seq.definition)
+ ancestor,species = [x.strip() for x in seq.definition.rsplit(';',1)]
+
+ try:
+ # parent = taxonomy.findTaxonByTaxid(taxonomy.silvaname[ancestor])
+ ptaxid=taxonomy.silvaname[ancestor]
+ if taxonomy.getRank(ptaxid)=="genus":
+ state.add(ptaxid)
+ taxid = taxonomy.addLocalTaxon(species,'species',ptaxid,options.taxashift)
+ s['taxid']=taxid
+ s['specie_name']=species
+ except KeyError:
+ pass
+
+ return s
+
+
+
+# Script entry point: check that the rRNA type and the library type were
+# given, locate the Silva files (silvaURLS), load the taxonomy (SilvaDump),
+# write every sequence carrying a taxid with an EcoPCRDBSequenceWriter, then
+# write the taxonomy with ecoTaxonomyWriter.
+# The progress bar compares the number of distinct genus taxids met so far
+# (``state``) with the number of genus rank taxa of the taxonomy.
+if __name__ == '__main__':
+
+ optionParser = getOptionManager([silvaOptions])
+
+ (options, entries) = optionParser()
+
+ if options.rrna is None:
+ raise AssertionError("rRNA type not specified (--ssu or --lsu)")
+
+ if options.type is None:
+ raise AssertionError("library type not specified (--parc or --ref)")
+
+
+ sequrl,taxurl,options.ecopcroutput = silvaURLS(options)
+
+ taxonomydata = universalOpen(taxurl)
+
+ options.taxonomy = SilvaDump(taxonomydata)
+
+# if options.write != '' :
+# options.write = open(options.write, 'w')
+
+ # The entries returned by the option parser are replaced by the Silva file
+ entries = fastaIterator(universalOpen(sequrl))
+ writer = EcoPCRDBSequenceWriter(options)
+
+ state = set()
+
+ gidx = options.taxonomy.findRankByName('genus')
+ ngenus = len([x for x in options.taxonomy._taxonomy if x[1]==gidx])
+
+ progressBar(max(1,len(state)),ngenus,
+ head=options.ecopcroutput)
+
+
+ for e in entries:
+ e = silva2obi(e, options.taxonomy,state)
+
+ progressBar(max(1,len(state)),ngenus,
+ head=options.ecopcroutput)
+
+ if 'taxid' in e:
+ writer.put(e)
+ else:
+ print >>sys.stderr,"\nCannot find taxon for entry : %s : %s" % (e.id,e.definition)
+
+
+ print >>sys.stderr
+
+ ecoTaxonomyWriter(options.ecopcroutput,options.taxonomy,onlyLocal=True)
+
diff --git a/src/obisort.py b/src/obisort.py
new file mode 100644
index 0000000..3ea093e
--- /dev/null
+++ b/src/obisort.py
@@ -0,0 +1,61 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obisort`: Sorts sequence records according to the value of a given attribute
+=====================================================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obisort` sorts sequence records according to the value of a given attribute, which can be either numeric or alphanumeric.
+
+'''
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+from obitools.options import getOptionManager
+
+def addSortOptions(optionManager):
+ # Register the obisort command line options in a dedicated option group:
+ #   -k/--key     : attribute name appended to the options.key list (default: empty list);
+ #                  the option may be repeated
+ #   -r/--reverse : sets options.reverse to True (default False)
+ group=optionManager.add_option_group('Obisort specific options')
+ group.add_option('-k','--key',
+ action="append", dest="key",
+ metavar="<TAG NAME>",
+ type="string",
+ default=[],
+ help="Attribute used to sort the sequence records.")
+
+ group.add_option('-r','--reverse',
+ action="store_true", dest="reverse",
+ default=False,
+ help="Sorts in reverse order.")
+
+def cmpGenerator(options):
+
+ keys=options.key
+ lk=len(keys)-1
+
+ def cmpkeys(x,y,i=0):
+ k=keys[i]
+ c=cmp(x[k],y[k])
+ if c==0 and i < lk:
+ i+=1
+ c=cmpkeys(x, y,i+1)
+ if i==lk:
+ i=0
+ return c
+
+ return cmpkeys
+
+
+
+if __name__ == '__main__':
+ # Script entry point: load every input record in memory, sort the list with the
+ # comparison function built from the -k options and write the records in that order.
+
+ optionParser = getOptionManager([addSortOptions,addInOutputOption])
+
+ (options, entries) = optionParser()
+
+ cmpk=cmpGenerator(options)
+
+ # the whole input is materialised because sorting needs all the records at once
+ seqs = [seq for seq in entries]
+ # first positional argument of list.sort is the comparison function (Python 2 signature)
+ seqs.sort(cmpk, reverse=options.reverse)
+
+ writer = sequenceWriterGenerator(options)
+
+ for seq in seqs:
+ writer(seq)
diff --git a/src/obisplit.py b/src/obisplit.py
new file mode 100755
index 0000000..d09ae62
--- /dev/null
+++ b/src/obisplit.py
@@ -0,0 +1,135 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obisplit`: Splits a sequence file in a set of subfiles
+===============================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obisplit` splits the input sequence file in a set of subfiles
+according to the values of a given attribute. The generated subfiles are named
+after the values of the attribute, possibly preceded by a prefix
+(``-p`` option). The sequence records for which the attribute is missing are discarded by default, or
+put in a file whose name is set using the ``-u`` option.
+
+ Example:
+
+ .. code-block:: bash
+
+ > obisplit -p experiment_ -t mode
+
+ The above command splits the sequence input file according to the ``mode`` attribute.
+ This attribute is created by the :py:mod:`solexapairend` tool and its value can be set to
+ either ``joined`` or ``alignment``. The prefix ``experiment_`` is put before
+ each subfile name. Two subfiles will thus be created: ``experiment_joined`` and
+ ``experiment_alignment``.
+
+
+'''
+
+from obitools.options import getOptionManager
+from obitools.format.options import addInOutputOption
+from obitools.fasta import formatFasta
+from obitools.fastq import formatFastq
+
+
+def addSplitOptions(optionManager):
+ # Register the obisplit command line options in a dedicated option group:
+ #   -p/--prefix    : string stored in options.prefix (default None)
+ #   -t/--tag-name  : attribute name appended to the options.tagname list; may be repeated
+ #   -u/--undefined : file name stored in options.undefined (default None)
+ group = optionManager.add_option_group('Obisplit specific options')
+ group.add_option('-p','--prefix',
+ action="store", dest="prefix",
+ metavar="<PREFIX FILENAME>",
+ type="string",
+ default=None,
+ help="Prefix added to each subfile name")
+
+
+ group.add_option('-t','--tag-name',
+ action="append", dest="tagname",
+ metavar="<tagname>",
+ type="string",
+ default=[],
+ help="Attribute used to split the sequence file")
+
+ group.add_option('-u','--undefined',
+ action="store", dest="undefined",
+ metavar="<FILENAME>",
+ type="string",
+ default=None,
+ help="Name of the file where undefined sequenced are stored")
+
+
+class OutFiles:
+ def __init__(self,options):
+ self._tags = options.tagname
+ self._undefined = None
+ if options.undefined is not None:
+ self._undefined=open(options.undefined,'w')
+ self._prefix=options.prefix
+ self._files = {}
+ self._first=None
+ self._last=None
+ self._extension=options.outputFormat
+
+
+ def __getitem__(self,key):
+ if key in self._files:
+ data = self._files[key]
+ prev,current,next = data
+ if next is not None:
+ if prev is not None:
+ self._files[prev][2]=next
+ self._files[next][0]=prev
+ data[0]=self._last
+ data[2]=None
+ self._last=key
+ else:
+ name = key
+ if self._prefix is not None:
+ name = '%s%s' % (options.prefix,name)
+ current = open('%s.%s' % (name,self._extension),'a')
+ prev=self._last
+ self._last=key
+ next=None
+ self._files[key]=[prev,current,next]
+ if len(self._files)>100:
+ oprev,old,onext=self._files[self._first]
+ del(self._files[self._first])
+ old.close()
+ self._first=onext
+ if self._first is None:
+ self._first=key
+ return current
+
+ def __call__(self,seq):
+ ok = reduce(lambda x,y: x and y, (z in seq for z in self._tags),True)
+ if ok:
+ k = "_".join([str(seq[x]) for x in self._tags])
+ file=self[k]
+ else:
+ file=self._undefined
+ if file is not None and self._extension=="fasta":
+ print >>file,formatFasta(seq)
+ else:
+ print >>file,formatFastq(seq)
+
+ def __del__(self):
+ k=self._files.keys()
+ for x in k:
+ del(self._files[x])
+
+
+if __name__=='__main__':
+ # Script entry point: send every input record to the OutFiles dispatcher.
+
+ optionParser = getOptionManager([addSplitOptions,addInOutputOption])
+
+ (options, entries) = optionParser()
+
+ out=None
+
+ for seq in entries:
+ # the dispatcher is created when the first record is seen; with an empty input no
+ # OutFiles instance is built
+ if out is None:
+ out = OutFiles(options)
+ out(seq)
+
+
+
+
diff --git a/src/obistat.py b/src/obistat.py
new file mode 100644
index 0000000..801f3e1
--- /dev/null
+++ b/src/obistat.py
@@ -0,0 +1,221 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obistat`: computes basic statistics for attribute values
+=================================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obistat` computes basic statistics for attribute values of sequence records.
+The sequence records can be categorized or not using one or several ``-c`` options.
+By default, only the number of sequence records and the total count are computed for each category.
+Additional statistics can be computed for attribute values in each category, like:
+
+ - minimum value (``-m`` option)
+ - maximum value (``-M`` option)
+ - mean value (``-a`` option)
+ - variance (``-v`` option)
+ - standard deviation (``-s`` option)
+
+The result is a contingency table with the different categories in rows, and the
+computed statistics in columns.
+
+'''
+from obitools.options import getOptionManager
+from obitools.format.options import addInputFormatOption
+from obitools.ecopcr.options import addTaxonomyDBOptions, loadTaxonomyDatabase
+import math
+
+def addStatOptions(optionManager):
+ # Register the obistat command line options in a dedicated option group. Every option
+ # uses action "append" with an empty list as default, so each may be repeated:
+ #   -c/--category-attribute -> options.categories
+ #   -m/--min                -> options.minimum
+ #   -M/--max                -> options.maximum
+ #   -a/--mean               -> options.mean
+ #   -v/--variance           -> options.var
+ #   -s/--std-dev            -> options.sd
+ group = optionManager.add_option_group('obistat specific options')
+ group.add_option('-c','--category-attribute',
+ action="append", dest="categories",
+ metavar="<Attribute Name>",
+ default=[],
+ help="Attribute used to categorize the sequence records.")
+
+ group.add_option('-m','--min',
+ action="append", dest="minimum",
+ metavar="<Attribute Name>",
+ default=[],
+ help="Computes the minimum value of attribute for each category.")
+
+ group.add_option('-M','--max',
+ action="append", dest="maximum",
+ metavar="<Attribute Name>",
+ default=[],
+ help="Computes the maximum value of attribute for each category.")
+
+ group.add_option('-a','--mean',
+ action="append", dest="mean",
+ metavar="<Attribute Name>",
+ default=[],
+ help="Computes the mean value of attribute for each category.")
+
+ group.add_option('-v','--variance',
+ action="append", dest="var",
+ metavar="<Attribute Name>",
+ default=[],
+ help="Computes the variance of attribute for each category.")
+
+ group.add_option('-s','--std-dev',
+ action="append", dest="sd",
+ metavar="<Attribute Name>",
+ default=[],
+ help="Computes the standard deviation of attribute for each category.")
+
+
+def statistics(values,attribute,func):
+ # Apply `func` to the values collected for each requested attribute, category by category.
+ #   values    : dict attribute name -> dict category -> collected values
+ #   attribute : iterable of attribute names; names absent from `values` are skipped
+ #   func      : callable turning one collection of values into a single statistic
+ # Returns (stat, lstat): stat[name][category] is the computed statistic and lstat[name]
+ # the length of the longest str() form among the statistics of that attribute.
+ stat={}
+ lstat={}
+
+ for var in attribute:
+ if var in values:
+ stat[var]={}
+ lstat[var]=0
+ for c in values[var]:
+ v = values[var][c]
+ m = func(v)
+ stat[var][c]=m
+ # track the widest printed form of this attribute's statistics
+ lm=len(str(m))
+ if lm > lstat[var]:
+ lstat[var]=lm
+
+ return stat,lstat
+
+def minimum(values,options):
+ # Per-category minimum of the attributes listed in options.minimum; see statistics()
+ # for the layout of `values` and of the returned (stat, lstat) pair.
+ return statistics(values, options.minimum, min)
+
+
+def maximum(values,options):
+ # Per-category maximum of the attributes listed in options.maximum; see statistics()
+ # for the layout of `values` and of the returned (stat, lstat) pair.
+ return statistics(values, options.maximum, max)
+
+def mean(values,options):
+ def average(v):
+ s = reduce(lambda x,y:x+y,v,0)
+ return float(s)/len(v)
+ return statistics(values, options.mean, average)
+
+
+def variance(v):
+ if len(v)==1:
+ return 0
+ s = reduce(lambda x,y:(x[0]+y,x[1]+y**2),v,(0.,0.))
+ return s[1]/(len(v)-1) - s[0]**2/len(v)/(len(v)-1)
+
+def varpop(values,options):
+ # Per-category variance of the attributes listed in options.var, computed by variance().
+ # NOTE(review): despite the name, variance() divides by len(v)-1, i.e. it is the sample
+ # variance rather than the population variance.
+ return statistics(values, options.var, variance)
+
+def sd(values,options):
+ # Per-category standard deviation of the attributes listed in options.sd, computed as
+ # the square root of variance().
+ def stddev(v):
+ return math.sqrt(variance(v))
+ return statistics(values, options.sd, stddev)
+
+
+if __name__ == "__main__":
+ optionParser = getOptionManager([addStatOptions,addInputFormatOption,addTaxonomyDBOptions],
+ progdoc=__doc__)
+
+ (options, entries) = optionParser()
+
+ loadTaxonomyDatabase(options)
+
+ options.statistics = set(options.minimum) | set(options.maximum) | set(options.mean)
+ total = 0
+ catcount={}
+ totcount={}
+ values={}
+ lcat=0
+
+ for s in entries:
+ category = []
+ for c in options.categories:
+ try:
+ if hasattr(options, 'taxonomy') and options.taxonomy is not None:
+ environ = {'taxonomy' : options.taxonomy,'sequence':s}
+ else:
+ environ = {'sequence':s}
+
+ v = eval(c,environ,s)
+ lv=len(str(v))
+ if lv > lcat:
+ lcat=lv
+ category.append(v)
+ except:
+ category.append(None)
+ if 4 > lcat:
+ lcat=4
+ category=tuple(category)
+ catcount[category]=catcount.get(category,0)+1
+ try:
+ totcount[category]=totcount.get(category,0)+s['count']
+ except KeyError:
+ totcount[category]=totcount.get(category,0)+1
+ for var in options.statistics:
+ if var in s:
+ v = s[var]
+ if var not in values:
+ values[var]={}
+ if category not in values[var]:
+ values[var][category]=[]
+ values[var][category].append(v)
+
+
+ mini,lmini = minimum(values, options)
+ maxi,lmaxi = maximum(values, options)
+ avg ,lavg = mean(values, options)
+ varp ,lvarp = varpop(values, options)
+ sigma,lsigma= sd(values, options)
+
+
+ pcat = "%%-%ds" % lcat
+ if options.minimum:
+ minvar= "min_%%-%ds" % max(len(x) for x in options.minimum)
+ else:
+ minvar= "%s"
+
+ if options.maximum:
+ maxvar= "max_%%-%ds" % max(len(x) for x in options.maximum)
+ else:
+ maxvar= "%s"
+
+ if options.mean:
+ meanvar= "mean_%%-%ds" % max(len(x) for x in options.mean)
+ else:
+ meanvar= "%s"
+
+ if options.var:
+ varvar= "var_%%-%ds" % max(len(x) for x in options.var)
+ else:
+ varvar= "%s"
+
+ if options.sd:
+ sdvar= "sd_%%-%ds" % max(len(x) for x in options.sd)
+ else:
+ sdvar= "%s"
+
+ hcat = "\t".join([pcat % x for x in options.categories]) + "\t" +\
+ "\t".join([minvar % x for x in options.minimum]) + "\t" +\
+ "\t".join([maxvar % x for x in options.maximum]) + "\t" +\
+ "\t".join([meanvar % x for x in options.mean]) + "\t" +\
+ "\t".join([varvar % x for x in options.var]) + "\t" +\
+ "\t".join([sdvar % x for x in options.sd]) + \
+ "\t count" + \
+ "\t total"
+ print hcat
+ for c in catcount:
+ for v in c:
+ print pcat % str(v)+"\t",
+ for m in options.minimum:
+ print (("%%%dd" % lmini[m]) % mini[m][c])+"\t",
+ for m in options.maximum:
+ print (("%%%dd" % lmaxi[m]) % maxi[m][c])+"\t",
+ for m in options.mean:
+ print (("%%%df" % lavg[m]) % avg[m][c])+"\t",
+ for m in options.var:
+ print (("%%%df" % lvarp[m]) % varp[m][c])+"\t",
+ for m in options.sd:
+ print (("%%%df" % lsigma[m]) % sigma[m][c])+"\t",
+ print "%7d" %catcount[c],
+ print "%9d" %totcount[c]
+
+
diff --git a/src/obisubset.py b/src/obisubset.py
new file mode 100644
index 0000000..19f96e7
--- /dev/null
+++ b/src/obisubset.py
@@ -0,0 +1,116 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obisubset`: extract a subset of samples
+================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+The :py:mod:`obisubset` command extracts a subset of samples from a sequence file
+after its dereplication using :py:mod:`obiuniq` program.
+'''
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+from obitools.options import getOptionManager
+import re
+
+def addSubsetOptions(optionManager):
+ # Register the obisubset command line options in a dedicated option group:
+ #   -s/--sample         : name of the tag holding the sample description
+ #                         (options.sample, default 'merged_sample')
+ #   -o/--other-tag      : tag name appended to options.taglist; may be repeated
+ #   -l/--sample-list    : file name stored in options.samplelist (default None)
+ #   -p/--sample-pattern : regular expression stored in options.samplepattern (default None)
+ #   -n/--sample-name    : sample id appended to options.samplename; may be repeated
+
+ group = optionManager.add_option_group('obisubset specific options')
+ group.add_option('-s','--sample',
+ action="store", dest="sample",
+ metavar="<TAGNAME>",
+ type="str",
+ default='merged_sample',
+ help="Tag containing sample descriptions, the default value is set to *merged_sample*")
+
+ group.add_option('-o','--other-tag',
+ action="append", dest="taglist",
+ metavar="<TAGNAME>",
+ type="string",
+ default=[],
+ help="Another tag to clean according to the sample subset")
+
+ group.add_option('-l','--sample-list',
+ action="store", dest="samplelist",
+ metavar="<FILENAME>",
+ type="string",
+ default=None,
+ help="File containing the samples names (one sample id per line)")
+
+ group.add_option('-p','--sample-pattern',
+ action="store", dest="samplepattern",
+ metavar="<REGEX>",
+ type="string",
+ default=None,
+ help="A regular expression pattern matching the sample ids to extract")
+
+ group.add_option('-n','--sample-name',
+ action="append", dest="samplename",
+ metavar="<SAMPLEIDS>",
+ type="string",
+ default=[],
+ help="A sample id to extract")
+
+def sequenceSelectorGenerator(options):
+ # Build the generator function restricting sequence records to a subset of samples.
+ # The wanted sample ids are those given with options.samplename, those read (one per
+ # line, stripped) from the options.samplelist file, and those matched by the
+ # options.samplepattern regular expression.
+ # Returns sequenceSelector(entries), a generator over the modified records.
+
+ samplename = set(options.samplename)
+ othertags = set(options.taglist)
+
+ if options.samplelist is not None:
+ with open(options.samplelist) as lname :
+ for name in lname:
+ name = name.strip()
+ samplename.add(name)
+
+ if options.samplepattern is not None:
+ samplepattern = re.compile(options.samplepattern)
+ else:
+ samplepattern = None
+
+ def sequenceSelector(entries):
+ for entry in entries:
+ # the sample tag is used as a mapping sample id -> count (keys() and indexing)
+ samples=entry[options.sample]
+
+ slist = set(samples.keys())
+ tokeep=slist & samplename
+
+ if samplepattern is not None:
+ for name in slist:
+ # re.match anchors the pattern at the beginning of the sample id only
+ if samplepattern.match(name):
+ tokeep.add(name)
+
+ if tokeep:
+ newsample={}
+ newcount=0
+ for name in tokeep:
+ c = samples[name]
+ newsample[name]= c
+ newcount+=c
+
+ # 'count' becomes the sum of the counts of the kept samples
+ entry['count']=newcount
+ entry[options.sample]=newsample
+
+ # the other per-sample tags are reduced to the kept sample ids as well
+ for t in othertags:
+ if t in entry:
+ d = entry[t]
+ newd={}
+ for name in tokeep:
+ if name in d:
+ newd[name] = d[name]
+ entry[t]=newd
+
+ # NOTE(review): presumably yielded only when `tokeep` is not empty; the original
+ # indentation is not recoverable from this listing — confirm against upstream
+ yield entry
+
+ return sequenceSelector
+
+if __name__=='__main__':
+ # Script entry point: filter the input records through the sample selector and write
+ # every record it yields.
+
+ optionParser = getOptionManager([addInOutputOption,addSubsetOptions],progdoc=__doc__)
+
+ (options, entries) = optionParser()
+
+ writer = sequenceWriterGenerator(options)
+
+ good = sequenceSelectorGenerator(options)
+
+ for seq in good(entries):
+ writer(seq)
diff --git a/src/obitab.py b/src/obitab.py
new file mode 100644
index 0000000..e021bbd
--- /dev/null
+++ b/src/obitab.py
@@ -0,0 +1,178 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obitab`: converts a sequence file to a tabular file
+============================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+The :py:mod:`obitab` command converts a sequence file to a tabular file that
+can be opened by a spreadsheet program or R.
+
+'''
+
+from obitools.options import getOptionManager
+from obitools.format.options import addInOutputOption
+
+def addTableOptions(optionManager):
+ # Register the obitab command line options:
+ #   -n/--na-string           : text written for missing values (options.NA, default "NA")
+ #   --output-field-separator : column separator (options.ofs, default tabulation)
+ #   -o/--output-seq          : sets options.sequence to True (default False)
+ #   -d/--no-definition       : sets options.definition to False (default True)
+ #   -a/--omit-attribute      : attribute name appended to options.omit; may be repeated
+ optionManager.add_option('-n','--na-string',
+ action="store", dest="NA",
+ metavar="<NOT AVAILABLE STRING>",
+ type="string",
+ default="NA",
+ help="String write in the table for not available value"
+ )
+ optionManager.add_option('','--output-field-separator',
+ action="store", dest="ofs",
+ metavar="STRING",
+ type="string",
+ default="\t",
+ help="Field separator for CSV file"
+ )
+ optionManager.add_option('-o','--output-seq',
+ action="store_true", dest="sequence",
+ default=False,
+ help="Add an extra column for sequence"
+ )
+ optionManager.add_option('-d','--no-definition',
+ action="store_false", dest="definition",
+ default=True,
+ help="Remove column for sequence definition"
+ )
+ optionManager.add_option('-a','--omit-attribute',
+ action="append", dest="omit",
+ metavar="<KEY>",
+ default=[],
+ help="Add attribute name to omit in the output tab"
+ )
+
+
+def headerCmp(h1,h2):
+ # Comparison function for column headers. A header is either a str (plain attribute)
+ # or a tuple (attribute name, sub-key). Headers are ordered on the attribute name; two
+ # tuples with the same attribute name are ordered on their sub-key.
+ # Returns the cmp() result; raises AssertionError for any other combination of types.
+ if type(h1) is str and type(h2) is str:
+ return cmp(h1, h2)
+ if type(h1) is str and type(h2) is tuple:
+ return cmp(h1, h2[0])
+ if type(h1) is tuple and type(h2) is str:
+ return cmp(h1[0], h2)
+ if type(h1) is tuple and type(h2) is tuple:
+ c = cmp(h1[0],h2[0])
+ if c==0:
+ c = cmp(h1[1],h2[1])
+ return c
+ raise AssertionError
+
+
+
+
+
+if __name__=='__main__':
+ # Script entry point. First pass: load every record and collect, for each attribute,
+ # the set of value types and, for dict values, the set of sub-keys. Second pass: print
+ # a header line followed by one line per record.
+
+ optionParser = getOptionManager([addTableOptions,addInOutputOption])
+
+ (options, entries) = optionParser()
+
+ column = {}
+ subcol = {}
+ db = []
+ for seq in entries:
+ db.append(seq)
+ keys = seq.keys()
+ for k in keys:
+ t=type(seq[k])
+ if k in column:
+ column[k].add(t)
+ else:
+ column[k]=set([t])
+ if t is dict:
+ if k not in subcol:
+ subcol[k]=set()
+ subcol[k]|=set(seq[k].keys())
+
+ headers = set()
+ for c in column:
+ # an attribute seen with a single type keeps it; mixed types fall back to str
+ if len(column[c])==1:
+ column[c]=column[c].pop()
+ else:
+ column[c]=str
+
+ if column[c] not in (str,int,float,dict,bool):
+ column[c]=str
+
+
+ # dict attributes are expanded into one (attribute, sub-key) column per sub-key
+ if column[c] is not dict:
+ headers.add(c)
+ else:
+ for sc in subcol[c]:
+ headers.add((c,sc))
+
+ omit = set(options.omit)
+ headers=list(headers)
+ headers.sort(headerCmp)
+
+
+ OFS = options.ofs
+
+ # header line
+ s = "id"
+ if options.definition:
+ s = '%s%sdefinition'%(s,OFS)
+
+ for k in headers:
+ if type(k) is str:
+ if k not in omit:
+ s = '%s%s%s'%(s,OFS,k)
+ else:
+ if k[0] not in omit:
+ if type(k[1]) is tuple:
+ sk = ":".join([str(x) for x in k[1]])
+ else:
+ sk = str(k[1])
+ # the 'merged_' prefix of the attribute name is dropped in the column title
+ if k[0][0:7]=='merged_':
+ s = '%s%s%s:%s' % (s,OFS,k[0][7:],sk)
+ else:
+ s = '%s%s%s:%s' % (s,OFS,k[0],sk)
+
+ if options.sequence:
+ s = "%s%ssequence"%(s,OFS)
+ print s
+
+
+ # one line per record, columns in header order
+ for seq in db:
+ s = seq.id
+
+ if options.definition:
+ s = '%s%s%s'%(s,OFS,seq.definition)
+
+ for k in headers:
+ if type(k) is str:
+ if k not in omit:
+ if k in seq:
+ v = seq[k]
+ if v is None:
+ v=options.NA
+ s = '%s%s%s'%(s,OFS,v)
+ else:
+ s = '%s%s%s'%(s,OFS,options.NA)
+ else:
+ if k[0] not in omit:
+ if k[0] in seq:
+ sk = seq[k[0]]
+ else:
+ sk={}
+ if k[1] in sk:
+ v = sk[k[1]]
+ if v is None:
+ v=options.NA
+ s = '%s%s%s'%(s,OFS,v)
+ else:
+ # a missing sub-key is written as 0 for 'merged_' attributes, as the NA string otherwise
+ if k[0][0:7]=='merged_':
+ s = '%s%s0'%(s,OFS)
+ else:
+ s = '%s%s%s'%(s,OFS,options.NA)
+ if options.sequence:
+ s = '%s%s%s'%(s,OFS,str(seq))
+
+ print s
+
+
+
+
+
diff --git a/src/obitail.py b/src/obitail.py
new file mode 100644
index 0000000..a89fa46
--- /dev/null
+++ b/src/obitail.py
@@ -0,0 +1,54 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obitail`: extracts the last sequence records
+=====================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+:py:mod:`obitail` command is in some way analog to the standard Unix `tail` command.
+It selects the tail of :doc:`a sequence file <../formats>`.
+But instead of working text line by text line as the standard Unix tool,
+selection is done at the sequence record level. You can specify the number of
+sequence records to select.
+
+ *Example:*
+
+ .. code-block:: bash
+
+ > obitail -n 150 seq1.fasta > seq2.fasta
+
+ Selects the 150 last sequence records from the ``seq1.fasta`` file and stores
+ them into the ``seq2.fasta`` file.
+
+
+'''
+
+from obitools.format.options import addInOutputOption, sequenceWriterGenerator
+from obitools.options import getOptionManager
+import collections
+
+def addHeadOptions(optionManager):
+ optionManager.add_option('-n','--sequence-count',
+ action="store", dest="count",
+ metavar="###",
+ type="int",
+ default=10,
+ help="Count of first sequences to print")
+
+
+if __name__ == '__main__':
+ # Script entry point: keep the last options.count records of the input and write them.
+ optionParser = getOptionManager([addHeadOptions,addInOutputOption])
+
+ (options, entries) = optionParser()
+ # NOTE(review): `i` is not used anywhere in the visible code
+ i=0
+
+ # a deque bounded to options.count items drops the oldest record each time a new one is
+ # appended, so only the tail of the input is held in memory
+ queue = collections.deque(entries,options.count)
+
+ writer = sequenceWriterGenerator(options)
+
+ while queue:
+ writer(queue.popleft())
+
+
+
+
diff --git a/src/obitaxonomy.py b/src/obitaxonomy.py
new file mode 100644
index 0000000..3f96362
--- /dev/null
+++ b/src/obitaxonomy.py
@@ -0,0 +1,350 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obitaxonomy`: manages taxonomic databases
+==================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org> and Celine Mercier <celine.mercier at metabarcoding.org>
+
+The :py:mod:`obitaxonomy` command can generate an ecoPCR database from a NCBI taxdump
+(see NCBI ftp site) and allows managing the taxonomic data contained in both types of
+database.
+
+Several types of editing are possible:
+
+**Adding a taxon to the database**
+
+ The new taxon is described by three values:
+ its scientific name, its taxonomic rank, and the *taxid* of its first ancestor.
+ Done by using the ``-a`` option.
+
+**Deleting a taxon from the database**
+
+ Erases a local taxon. Done by using the ``-D`` option and specifying a *taxid*.
+
+**Adding a species to the database**
+
+ The genus of the species must already exist in the database. The species will be
+ added under its genus. Done by using the ``-s`` option and specifying a species
+ scientific name.
+
+**Adding a preferred scientific name for a taxon in the database**
+
+ Adds a preferred name for a taxon in the taxonomy, by specifying the new favorite
+ name and the *taxid* of the taxon whose preferred name should be changed.
+ Done by using the ``-f`` option.
+
+**Adding all the taxa from a sequence file in the ``OBITools`` extended :doc:`fasta <../fasta>` format to the database**
+
+ All the taxa from a file in the ``OBITools`` extended :doc:`fasta <../fasta>` format, and possibly their ancestors, are added to the
+ taxonomy database.
+
+ The header of each sequence record must contain the attribute defined by the
+ ``-k`` option (default key: ``species_name``), whose value is the scientific name
+ of the taxon to be added.
+
+ A taxonomic path for each sequence record can be specified with the ``-p`` option,
+ as the attribute key that contains the taxonomic path of the taxon to be added.
+
+ A restricting ancestor can be specified with the ``-A`` option, either as a *taxid*
+ (integer) or a key (string). If it is a *taxid*, this *taxid* is the default *taxid*
+ under which the new taxon is added if none of his ancestors are specified or can
+ be found. If it is a key, :py:mod:`obitaxonomy` looks for the ancestor *taxid* in
+ the corresponding attribute, and the new taxon is systematically added under this
+ ancestor. By default, the restricting ancestor is the root of the taxonomic tree for
+ all the new taxa.
+
+ If neither a path nor an ancestor is specified in the header of the sequence record,
+ :py:mod:`obitaxonomy` tries to read the taxon name as a species name and to find the
+ genus in the taxonomic database. If the genus is found, the new taxon is added under it.
+ If not, it is added under the restricting ancestor.
+
+ It is highly recommended to check exactly what was done by reading the output,
+ since :py:mod:`obitaxonomy` uses *ad hoc* parsing and decision rules.
+
+ Done by using the ``-F`` option.
+
+**Notes:**
+
+- When a taxon is added, a new *taxid* is assigned to it. The minimum for the new *taxids*
+ can be specified by the ``-m`` option and is equal to 10000000 by default.
+
+- For each modification, a line is printed with details on what was done.
+
+'''
+
+
+from obitools.options.taxonomyfilter import addTaxonomyDBOptions,loadTaxonomyDatabase
+from obitools.options import getOptionManager
+from obitools.ecopcr.taxonomy import ecoTaxonomyWriter
+from obitools.fasta import fastaIterator
+import sys
+
+
+def addTaxonFromFile(name, rank, parent, options) :
+ # Add a local taxon to options.taxonomy and report it on stderr.
+ #   name, rank, parent : passed unchanged to addLocalTaxon together with options.taxashift
+ # Returns the taxid given back by addLocalTaxon.
+
+ taxid = options.taxonomy.addLocalTaxon(name, rank, parent, options.taxashift)
+ taxon = options.taxonomy.findTaxonByTaxid(taxid)
+ # field 2 of the taxon record is used as an index into the taxonomy table to reach the
+ # parent record; fields 0, 1 and 3 are printed as taxid, rank index and name
+ parent= options.taxonomy._taxonomy[taxon[2]]
+
+# if options.write == '' :
+ print>>sys.stderr, "added : %-40s\t%-15s\t%-8d\t->\t%s [%d] (%s)" % (taxon[3],options.taxonomy._ranks[taxon[1]],
+ taxon[0],
+ parent[3],parent[0],options.taxonomy._ranks[parent[1]])
+# else :
+# print>>options.write, "added : %-40s\t%-15s\t%-8d\t->\t%s [%d] (%s)" % (taxon[3],options.taxonomy._ranks[taxon[1]],
+# taxon[0],
+# parent[3],parent[0],options.taxonomy._ranks[parent[1]])
+ return taxid
+
+
+def numberInStr(s) :
+ containsNumber = False
+ for c in s :
+ if c.isdigit() :
+ containsNumber = True
+ return containsNumber
+
+
+def editTaxonomyOptions(optionManager):
+ # Register the obitaxonomy editing options:
+ #   -a/--add-taxon            : "<name>:rank:parent" appended to options.newtaxon
+ #   -D/--delete-local-taxon   : taxid appended to options.deltaxon
+ #   -s/--add-species          : species name appended to options.newspecies
+ #   -F/--add-file             : fasta file name stored in options.species_file (default None)
+ #   -k/--key_name             : attribute holding the taxon name (options.key_name,
+ #                               default 'species_name')
+ #   -f/--add-favorite-name    : "<name>:taxid" appended to options.newname
+ #   -m/--min-taxid            : integer stored in options.taxashift (default 10000000)
+ #   -A/--restricting_ancestor : taxid or attribute name stored in options.res_anc (default '')
+ #   -p/--path                 : attribute holding the taxonomic path (options.path, default '')
+ # The -w and -P options are present only as commented-out code.
+ optionManager.add_option('-a','--add-taxon',
+ action="append", dest="newtaxon",
+ metavar="<taxon_name>:rank:parent",
+ default=[],
+ help="Adds a new taxon to the taxonomy. The new taxon "
+ "is described by three values separated by colons: "
+ "the scientific name, the rank of the new taxon, "
+ "the taxid of the parent taxon")
+
+ optionManager.add_option('-D','--delete-local-taxon',
+ action="append", dest="deltaxon",
+ metavar="<TAXID>",
+ default=[],
+ help="Erase a local taxon")
+
+ optionManager.add_option('-s','--add-species',
+ action="append", dest="newspecies",
+ metavar="<SPECIES_NAME>",
+ default=[],
+ help="Adds a new species to the taxonomy. The new species "
+ "is described by its scientific name")
+
+ optionManager.add_option('-F','--add-file',
+ action="store", dest="species_file",
+ metavar="<file name>",
+ default=None,
+ help="Add all the species from a fasta file to the taxonomy. The header of"
+ " the sequences must contain the field defined by the -k option")
+
+ optionManager.add_option('-k','--key_name',
+ action="store", dest="key_name",
+ metavar="<key name>",
+ default='species_name',
+ help="Name of the attribute key used to find the species names in the headers "
+ "when the -F option is used. "
+ "Default = 'species_name'")
+
+ optionManager.add_option('-f','--add-favorite-name',
+ action="append", dest="newname",
+ metavar="<taxon_name>:taxid",
+ default=[],
+ help="Add a new favorite name to the taxonomy. The new name "
+ "is described by two values separated by a colon. "
+ "the new favorite name and the taxid of the taxon")
+
+ optionManager.add_option('-m','--min-taxid',
+ action="store", dest="taxashift",
+ type="int",
+ metavar="####",
+ default=10000000,
+ help="minimal taxid for the newly added taxid")
+
+ optionManager.add_option('-A','--restricting_ancestor',
+ action="store", dest="res_anc",
+ type="str",
+ metavar="<ANCESTOR>",
+ default='',
+ help="works with the -F option. Can be a word or a taxid (number). Enables to restrict the "
+ "adding of taxids under a specified ancestor. If it's a word, it's the field containing "
+ "the ancestor's taxid in each sequence's header (can be different for each sequence). If "
+ "it's a number, it's the taxid of the ancestor (in which case it's the same for all the sequences)."
+ " All the sequences in the file for which the genus can't be found will be added under this ancestor.")
+
+# optionManager.add_option('-w','--write_in_file',
+# action="store", dest="write",
+# metavar="<write_in_file>",
+# type = "str", default='',
+# help="works with the -F option. Writes all the taxa added in the specified file instead of in the console screen."
+# " Useful for big and/or problematic files.")
+
+ optionManager.add_option('-p','--path',
+ action="store", dest="path",
+ type="str",
+ metavar="<path>",
+ default='',
+ help="works with the -F option. Field name for the taxonomy path of the taxa if they are in the headers of the sequences. "
+ "Must be of the form 'Fungi,Agaricomycetes,Thelephorales,Thelephoraceae' with the highest ancestors"
+ " first and ',' as separators between ancestors")
+
+# optionManager.add_option('-P','--force_ancestor',
+# action="store_true", dest="force_ancestor",
+# metavar="<force_ancestor>",
+# default=False,
+# help="works with the -A option when the ancestor is in the header. Forces the adding of the species under the ancestor specified."
+# " /!\ the ancestor must exist. Use taxonomy paths (-p option) if you want the ancestor(s) to be created too.")
+
+if __name__ == '__main__':
+ # Script entry point: load the taxonomy, apply in turn the -a, -F, -s and -f edits and
+ # write the local taxa back with ecoTaxonomyWriter.
+ # NOTE(review): `localdata` is set by the edits but never read in the visible code, and
+ # the -D option is parsed although its handling is commented out below.
+
+ optionParser = getOptionManager([addTaxonomyDBOptions,editTaxonomyOptions])
+
+ (options, entries) = optionParser()
+
+ loadTaxonomyDatabase(options)
+
+ localdata=False
+
+# if options.write != '' :
+# options.write = open(options.write, 'w')
+
+ # -a : "<name>:rank:parent" triplets
+ # NOTE(review): tx[2] is passed as a string whereas the other addLocalTaxon calls of this
+ # file pass an integer parent taxid — confirm that addLocalTaxon accepts both.
+ for t in options.newtaxon:
+ tx = t.split(':')
+ taxid = options.taxonomy.addLocalTaxon(tx[0].strip(),tx[1],tx[2],options.taxashift)
+ taxon = options.taxonomy.findTaxonByTaxid(taxid)
+ parent= options.taxonomy._taxonomy[taxon[2]]
+ print "added : %-40s\t%-15s\t%-8d\t->\t%s [%d] (%s)" % (taxon[3],options.taxonomy._ranks[taxon[1]],
+ taxon[0],
+ parent[3],parent[0],options.taxonomy._ranks[parent[1]])
+ localdata=True
+
+# for t in options.deltaxon:
+# tx = int(t)
+# taxon = options.taxonomy.removeLocalTaxon(tx)
+# print "removed : %-40s\t%-15s\t%-8d" % (taxon[3],options.taxonomy._ranks[taxon[1]],
+# taxon[0])
+# localdata=True
+
+
+ # -F : add the taxon named in each record of a fasta file
+ if options.species_file != None :
+
+ # first words for which a name is not read as "<genus> <species>"
+ useless_words = ['fungal','fungi','endophyte','unknown','mycorrhizal','uncultured','Uncultured','ectomycorrhiza', \
+ 'ectomycorrhizal','mycorrhizal','vouchered','unidentified','bacterium','Bacterium']
+
+ # the restricting ancestor is taxid 1 by default, the -A value when it is a number, or
+ # is read from each record when -A names an attribute
+ if options.res_anc == '' :
+ restricting_ancestor = 1
+ resAncInHeader = False
+ elif options.res_anc.isdigit() :
+ restricting_ancestor = int(options.res_anc)
+ resAncInHeader = False
+ else :
+ resAncInHeader = True
+
+ for seq in fastaIterator(options.species_file) :
+
+ if resAncInHeader :
+ if options.res_anc in seq :
+ restricting_ancestor = int(seq[options.res_anc])
+ else :
+ restricting_ancestor = 1
+
+ # NOTE(review): not guarded — a record lacking the key attribute stops the run
+ t = seq[options.key_name]
+
+ key_error = False
+ taxid = None
+ # check if the taxon isn't already in the taxonomy with the right ancestor
+ try :
+ possible_taxids = options.taxonomy.findTaxonByName(t)
+ for p in possible_taxids :
+ if options.taxonomy.isAncestor(restricting_ancestor, p[0]) :
+ taxid = p[0]
+
+ except KeyError :
+ key_error = True
+
+ if key_error or taxid is None :
+
+ # case 1: the ancestor comes from the record itself, the taxon is added directly under it
+ if (resAncInHeader and options.res_anc in seq) :
+ taxid = addTaxonFromFile(t,'species',restricting_ancestor,options)
+
+ # case 2: a taxonomic path attribute was given, every missing level of the path is created
+ elif options.path != '' :
+ previous = options.taxonomy.findTaxonByTaxid(restricting_ancestor)
+ if seq[options.path] != '' :
+ ancestors = [a for a in seq[options.path].split(',')]
+ if ancestors[-1] != t :
+ ancestors.append(t)
+ else : # useful when data is from UNITE databases but could disappear
+ if len(t.split(' ')) >= 2 and not numberInStr(t) :
+ genus, trash = t.split(" ",1)
+ ancestors = [genus, t]
+ else :
+ ancestors = [t]
+ for a in ancestors :
+ try:
+ possible_previous = options.taxonomy.findTaxonByName(a)
+ # stays True when no taxon of that name lies under the restricting ancestor
+ keyError = True
+ for p in possible_previous :
+ if options.taxonomy.isAncestor(restricting_ancestor, p[0]) :
+ previous = p
+ keyError = False
+ if keyError :
+ raise KeyError()
+
+ except KeyError :
+ if (len(ancestors) > 1 and a == ancestors[-2] and len(ancestors[-1].split(' ')) >= 2 and ((not numberInStr(a)) or 'sp' in a.split(' '))) : # to be checked, too restrictive?
+ rank = 'genus'
+ elif a == ancestors[-1] :
+ rank = 'species'
+ else :
+ rank = 'no rank'
+ taxid = addTaxonFromFile(a,rank,previous[0],options)
+ # only the first item (the taxid) of `previous` is used at the next level
+ previous = (taxid, options.taxonomy.findRankByName(rank))
+
+ # case 3: neither ancestor nor path, try to read the name as "<genus> <species>"
+ else :
+
+ if (len(t.split(' ')) >= 2 and (not numberInStr(t) or 'sp' in t.split(' ') or t[0].isupper()) \
+ and t.split(' ')[0] not in useless_words) :
+
+ genus,species = t.split(" ",1)
+
+ try :
+ possible_genuses = options.taxonomy.findTaxonByName(genus)
+ genus_taxid = None
+ for g in possible_genuses :
+ if options.taxonomy.isAncestor(restricting_ancestor, g[0]) :
+ genus_taxid = g[0]
+ except KeyError :
+ genus_taxid = addTaxonFromFile(genus,'genus',restricting_ancestor,options)
+
+ if genus_taxid is None : # Genuses matching the name were found but they weren't under the restricting ancestor
+ parent = restricting_ancestor
+ else :
+ parent = genus_taxid
+ taxid = addTaxonFromFile(t, 'species', parent, options)
+
+ else :
+ taxid = addTaxonFromFile(t, 'species', restricting_ancestor, options)
+
+ localdata=True
+
+# seq['taxid'] = taxid
+# print formatFasta(seq)
+
+
+
+ # -s : add a species under the genus given by the first word of its name
+ # NOTE(review): above, the result of findTaxonByName is iterated and p[0] of each item is
+ # used as a taxid; here parent[0] is passed directly as the parent taxid — confirm which
+ # of the two usages matches the return value of findTaxonByName.
+ for t in options.newspecies:
+ genus,species = t.split(" ",1)
+ parent = options.taxonomy.findTaxonByName(genus)
+ taxid = options.taxonomy.addLocalTaxon(t,'species',parent[0],options.taxashift)
+ taxon = options.taxonomy.findTaxonByTaxid(taxid)
+ parent= options.taxonomy._taxonomy[taxon[2]]
+ print "added : %-40s\t%-15s\t%-8d\t->\t%s [%d] (%s)" % (taxon[3],options.taxonomy._ranks[taxon[1]],
+ taxon[0],
+ parent[3],parent[0],options.taxonomy._ranks[parent[1]])
+ localdata=True
+
+ # -f : "<name>:taxid" pairs setting a preferred name
+ for t in options.newname:
+ tx = t.split(':')
+ taxid = options.taxonomy.addPreferedName(int(tx[1]), tx[0].strip())
+ print "name : %8d\t->\t%s" % (taxid,options.taxonomy.getPreferedName(taxid))
+
+ ecoTaxonomyWriter(options.ecodb,options.taxonomy,onlyLocal=True)
+
+
\ No newline at end of file
diff --git a/src/obitools/SVGdraw.py b/src/obitools/SVGdraw.py
new file mode 100644
index 0000000..521f750
--- /dev/null
+++ b/src/obitools/SVGdraw.py
@@ -0,0 +1,1054 @@
+#!/usr/bin/env python
+##Copyright (c) 2002, Fedor Baart & Hans de Wit (Stichting Farmaceutische Kengetallen)
+##All rights reserved.
+##
+##Redistribution and use in source and binary forms, with or without modification,
+##are permitted provided that the following conditions are met:
+##
+##Redistributions of source code must retain the above copyright notice, this
+##list of conditions and the following disclaimer.
+##
+##Redistributions in binary form must reproduce the above copyright notice,
+##this list of conditions and the following disclaimer in the documentation and/or
+##other materials provided with the distribution.
+##
+##Neither the name of the Stichting Farmaceutische Kengetallen nor the names of
+##its contributors may be used to endorse or promote products derived from this
+##software without specific prior written permission.
+##
+##THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+##AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+##IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+##DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+##FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+##DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+##SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+##CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+##OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+##OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+##Thanks to Gerald Rosennfellner for his help and useful comments.
+
+__doc__="""Use SVGdraw to generate your SVGdrawings.
+
+SVGdraw uses an object model drawing and a method toXml to create SVG graphics
+by using easy to use classes and methods. Usually you start by creating a drawing, e.g.
+
+ d=drawing()
+ #then you create a SVG root element
+ s=svg()
+ #then you add some elements eg a circle and add it to the svg root element
+ c=circle()
+ #you can supply attributes by using named arguments.
+ c=circle(fill='red',stroke='blue')
+ #or by updating the attributes attribute:
+ c.attributes['stroke-width']=1
+ s.addElement(c)
+ #then you add the svg root element to the drawing
+ d.setSVG(s)
+ #and finally you xmlify the drawing
+ d.toXml()
+
+
+this results in the svg source of the drawing, which consists of a circle
+on a white background. It's as easy as that ;)
+This module was created using the SVG specification of www.w3c.org and the
+O'Reilly (www.oreilly.com) python books as information sources. A svg viewer
+is available from www.adobe.com"""
+
+__version__="1.0"
+
+# there are two possibilities to generate svg:
+# via a dom implementation and directly using <element>text</element> strings
+# the latter is way faster (and shorter in coding)
+# the former is only used in debugging svg programs
+# maybe it will be removed alltogether after a while
+# with the following variable you indicate whether to use the dom implementation
+# Note that PyXML is required for using the dom implementation.
+# It is also possible to use the standard minidom. But I didn't try that one.
+# Anyway the text based approach is about 60 times faster than using the full dom implementation.
+use_dom_implementation=0
+
+
+import exceptions
+if use_dom_implementation<>0:
+ try:
+ from xml.dom import implementation
+ from xml.dom.ext import PrettyPrint
+ except:
+ raise exceptions.ImportError, "PyXML is required for using the dom implementation"
+#The implementation is used for the creating the XML document.
+#The prettyprint module is used for converting the xml document object to a xml file
+
+import sys
+#Refuse to run on a major version older than 2.
+assert sys.version_info[0]>=2
+#When the minor version is below 2 the names True, False and file are
+#defined here so the rest of the module can use them.
+if sys.version_info[1]<2:
+ True=1
+ False=0
+ file=open
+
+#NOTE(review): the statement below assigns the integer 50 to the attribute
+#sys.setrecursionlimit instead of calling the function. The recursion limit
+#is therefore not changed, and the name is rebound to an int. Calling it with
+#50 at import time would affect every importer - confirm intent before fixing.
+sys.setrecursionlimit=50
+#The recursion limit is set conservative so mistakes like s=svg() s.addElement(s)
+#won't eat up too much processor time.
+
+#the following code is pasted form xml.sax.saxutils
+#it makes it possible to run the code without the xml sax package installed
+#To make it possible to have <rubbish> in your text elements, it is necessary to escape the texts
+def _escape(data, entities={}):
+    """Escape &, <, and > in a string of data.
+
+    You can escape other strings of data by passing a dictionary as
+    the optional entities parameter.  The keys and values must all be
+    strings; each key will be replaced with its corresponding value.
+
+    Returns the escaped string; the entities dictionary is only read.
+    """
+    # "&" has to be handled first: otherwise the ampersands introduced by
+    # the "&lt;" and "&gt;" replacements would be escaped a second time.
+    data = data.replace("&", "&amp;")
+    data = data.replace("<", "&lt;")
+    data = data.replace(">", "&gt;")
+    for chars, entity in entities.items():
+        data = data.replace(chars, entity)
+    return data
+
+def _quoteattr(data, entities={}):
+    """Escape and quote an attribute value.
+
+    Escape &, <, and > in a string of data, then quote it for use as
+    an attribute value.  Double quotes are used unless the value contains
+    a double quote; in that case single quotes are used, and when the value
+    contains both kinds of quote the double quotes are replaced by the
+    &quot; entity and the value is wrapped in double quotes.
+
+    You can escape other strings of data by passing a dictionary as
+    the optional entities parameter.  The keys and values must all be
+    strings; each key will be replaced with its corresponding value.
+    """
+    data = _escape(data, entities)
+    if '"' in data:
+        if "'" in data:
+            # both quote characters occur: keep the single quotes and
+            # turn the double quotes into an entity
+            data = '"%s"' % data.replace('"', "&quot;")
+        else:
+            data = "'%s'" % data
+    else:
+        data = '"%s"' % data
+    return data
+
+
+
+def _xypointlist(a):
+    """Format a sequence of xy pairs as text.
+
+    Each pair is rendered with str(), its first and last character (the
+    brackets) are dropped, and a single space is appended after every pair.
+    """
+    return ''.join([str(pair)[1:-1]+' ' for pair in a])
+
+def _viewboxlist(a):
+    """Format the items of a tuple as text, each followed by one space."""
+    return ''.join([str(item)+' ' for item in a])
+
+def _pointlist(a):
+    """Format a list of numbers: its str() form without the enclosing brackets."""
+    rendered=str(a)
+    return rendered[1:-1]
+
+class pathdata:
+    """Collects SVG path commands that can be used as the data of a path.
+
+    Every method appends one command string to the list self.path; the
+    string form of the object is these commands joined by single spaces.
+    Most methods are straightforward, but it may help to look at the SVG
+    specification for the meaning of the individual commands."""
+    #The original author notes that the methods below were not tested.
+    def __init__(self,x=None,y=None):
+        self.path=[]
+        #start with an absolute move only when both coordinates are given
+        if not (x is None or y is None):
+            self.path.append('M %s %s' % (x,y))
+    def closepath(self):
+        """ends the path"""
+        self.path.append('z')
+    def move(self,x,y):
+        """move to absolute"""
+        self.path.append('M %s %s' % (x,y))
+    def relmove(self,x,y):
+        """move to relative"""
+        self.path.append('m %s %s' % (x,y))
+    def line(self,x,y):
+        """line to absolute"""
+        self.path.append('L %s %s' % (x,y))
+    def relline(self,x,y):
+        """line to relative"""
+        self.path.append('l %s %s' % (x,y))
+    def hline(self,x):
+        """horizontal line to absolute"""
+        self.path.append('H%s' % (x,))
+    def relhline(self,x):
+        """horizontal line to relative"""
+        self.path.append('h%s' % (x,))
+    def vline(self,y):
+        """vertical line to absolute"""
+        self.path.append('V%s' % (y,))
+    def relvline(self,y):
+        """vertical line to relative"""
+        self.path.append('v%s' % (y,))
+    def bezier(self,x1,y1,x2,y2,x,y):
+        """bezier with xy1 and xy2 to xy absolute"""
+        self.path.append('C%s,%s %s,%s %s,%s' % (x1,y1,x2,y2,x,y))
+    def relbezier(self,x1,y1,x2,y2,x,y):
+        """bezier with xy1 and xy2 to xy relative"""
+        self.path.append('c%s,%s %s,%s %s,%s' % (x1,y1,x2,y2,x,y))
+    def smbezier(self,x2,y2,x,y):
+        """smooth bezier with xy2 to xy absolute"""
+        self.path.append('S%s,%s %s,%s' % (x2,y2,x,y))
+    def relsmbezier(self,x2,y2,x,y):
+        """smooth bezier with xy2 to xy relative"""
+        self.path.append('s%s,%s %s,%s' % (x2,y2,x,y))
+    def qbezier(self,x1,y1,x,y):
+        """quadratic bezier with xy1 to xy absolute"""
+        self.path.append('Q%s,%s %s,%s' % (x1,y1,x,y))
+    def relqbezier(self,x1,y1,x,y):
+        """quadratic bezier with xy1 to xy relative"""
+        self.path.append('q%s,%s %s,%s' % (x1,y1,x,y))
+    def smqbezier(self,x,y):
+        """smooth quadratic bezier to xy absolute"""
+        self.path.append('T%s,%s' % (x,y))
+    def relsmqbezier(self,x,y):
+        """smooth quadratic bezier to xy relative"""
+        self.path.append('t%s,%s' % (x,y))
+    def ellarc(self,rx,ry,xrot,laf,sf,x,y):
+        """elliptical arc with rx and ry rotating with xrot using large-arc-flag and sweep-flag to xy absolute"""
+        self.path.append('A%s,%s %s %s %s %s %s' % (rx,ry,xrot,laf,sf,x,y))
+    def relellarc(self,rx,ry,xrot,laf,sf,x,y):
+        """elliptical arc with rx and ry rotating with xrot using large-arc-flag and sweep-flag to xy relative"""
+        self.path.append('a%s,%s %s %s %s %s %s' % (rx,ry,xrot,laf,sf,x,y))
+    def __repr__(self):
+        separator=' '
+        return separator.join(self.path)
+
+
+
+
+class SVGelement:
+    """SVGelement(type,attributes,elements,text,namespace,**args)
+
+    Generic svg element, meant to be subclassed rather than used directly.
+    It models an xml element: a type (tag name), a dictionary of attributes,
+    optional child elements, optional text, optional cdata and an optional
+    namespace.  Because the serialisation lives here, subclasses do not need
+    their own toXml method.  attributes and elements default to None and are
+    replaced by a fresh dict/list per instance, because a mutable default
+    would be shared between all objects."""
+    def __init__(self,type='',attributes=None,elements=None,text='',namespace='',cdata=None,**args):
+        self.type=type
+        if attributes is None:
+            attributes={}
+        if elements is None:
+            elements=[]
+        self.attributes=attributes
+        self.elements=elements
+        self.text=text
+        self.namespace=namespace
+        self.cdata=cdata
+        #any extra keyword argument becomes an attribute
+        self.attributes.update(args)
+    def addElement(self,SVGelement):
+        """adds an element to a SVGelement
+
+        SVGelement.addElement(SVGelement)
+        """
+        self.elements.append(SVGelement)
+
+    def toXml(self,level,f, **kwargs):
+        """Write this element and its children to the file-like object f.
+
+        level is the nesting depth used for tab indentation.  When the
+        keyword preserveWhitespace is true no tabs or newlines are emitted.
+        """
+        preserve = kwargs.get("preserveWhitespace", False)
+        if preserve:
+            NEWLINE, TAB = "", ""
+        else:
+            NEWLINE, TAB = "\n", "\t"
+        closing = '</'+self.type+'>'+NEWLINE
+        f.write(TAB*level)
+        f.write('<'+self.type)
+        for attkey, attvalue in self.attributes.items():
+            f.write(' '+_escape(str(attkey))+'='+_quoteattr(str(attvalue)))
+        if self.namespace:
+            f.write(' xmlns="'+ _escape(str(self.namespace))+'" ')
+        if self.elements or self.text or self.cdata:
+            f.write('>')
+        if self.elements:
+            f.write(NEWLINE)
+            for child in self.elements:
+                child.toXml(level+1,f, preserveWhitespace=preserve)
+        if self.cdata:
+            inner = TAB*(level+1)
+            f.write(NEWLINE+inner+'<![CDATA[')
+            for cdataline in self.cdata.splitlines():
+                f.write(NEWLINE+TAB*(level+2)+cdataline)
+            f.write(NEWLINE+inner+']]>'+NEWLINE)
+        if self.text:
+            if type(self.text) is str: #plain text is escaped
+                f.write(_escape(str(self.text)))
+            else: #anything else (e.g. a spannedtext) is written as is
+                f.write(str(self.text))
+        #the closing tag is indented except when it directly follows text
+        if self.elements or (not self.text and self.cdata):
+            f.write(TAB*level+closing)
+        elif self.text:
+            f.write(closing)
+        else:
+            f.write('/>'+NEWLINE)
+
+class tspan(SVGelement):
+    """ts=tspan(text='',**args)
+
+    a tspan element can be used for applying formatting to a textsection
+    usage:
+    ts=tspan('this text is bold')
+    ts.attributes['font-weight']='bold'
+    st=spannedtext()
+    st.addtspan(ts)
+    t=text(3,5,st)
+    """
+    def __init__(self,text=None,**args):
+        SVGelement.__init__(self,'tspan',**args)
+        #Test the argument, not self.text: SVGelement.__init__ has already set
+        #self.text to '', so testing self.text always succeeded and replaced
+        #it by None when no text was given, which broke __repr__.
+        if text is not None:
+            self.text=text
+    def __repr__(self):
+        """Return the element as inline markup: <tspan attr="value"...>text</tspan>."""
+        s="<tspan"
+        for key,value in self.attributes.items():
+            s+= ' %s="%s"' % (key,value)
+        s+='>'
+        s+=self.text
+        s+='</tspan>'
+        return s
+
+class tref(SVGelement):
+    """tr=tref(link='',**args)
+
+    a tref element references text through a link to the id of that text.
+    usage:
+    tr=tref('#linktotext')
+    st=spannedtext()
+    st.addtref(tr)
+    t=text(3,5,st)
+    """
+    def __init__(self,link,**args):
+        attributes={'xlink:href':link}
+        SVGelement.__init__(self,'tref',attributes,**args)
+    def __repr__(self):
+        #rendered as an empty element carrying all attributes
+        parts=[' %s="%s"' % (key,value) for key,value in self.attributes.items()]
+        return '<tref'+''.join(parts)+'/>'
+
+class spannedtext:
+    """st=spannedtext(textlist=[])
+
+    a spannedtext holds a sequence of plain texts, tspan's and tref's.
+    Add it to a text element or path element; don't add it directly
+    to a svg or a group element.
+    usage:
+
+    ts=tspan('this text is bold')
+    ts.attributes['font-weight']='bold'
+    tr=tref('#linktotext')
+    tr.attributes['fill']='red'
+    st=spannedtext()
+    st.addtspan(ts)
+    st.addtref(tr)
+    st.addtext('This text is not bold')
+    t=text(3,5,st)
+    """
+    def __init__(self,textlist=None):
+        #a list passed in is used as is (not copied); otherwise start empty
+        if textlist is None:
+            textlist=[]
+        self.textlist=textlist
+    def addtext(self,text=''):
+        self.textlist.append(text)
+    def addtspan(self,tspan):
+        self.textlist.append(tspan)
+    def addtref(self,tref):
+        self.textlist.append(tref)
+    def __repr__(self):
+        #concatenation of the string forms of all parts, in order
+        return ''.join([str(part) for part in self.textlist])
+
+class rect(SVGelement):
+    """r=rect(x,y,width,height,fill,stroke,stroke_width,**args)
+
+    a rectangle is defined by a width and height and a xy pair.
+    width and height are required: ValueError is raised when one of them
+    (or both) is None.
+    """
+    def __init__(self,x=None,y=None,width=None,height=None,fill=None,stroke=None,stroke_width=None,**args):
+        if width is None and height is None:
+            raise ValueError('both height and width are required')
+        if height is None:
+            raise ValueError('height is required')
+        if width is None:
+            raise ValueError('width is required')
+        SVGelement.__init__(self,'rect',{'width':width,'height':height},**args)
+        #optional attributes are only set when a value was supplied
+        for key,value in (('x',x),('y',y),('fill',fill),('stroke',stroke),('stroke-width',stroke_width)):
+            if value is not None:
+                self.attributes[key]=value
+
+class ellipse(SVGelement):
+    """e=ellipse(cx,cy,rx,ry,fill,stroke,stroke_width,**args)
+
+    an ellipse is defined as a center and a x and y radius.
+    rx and ry are required: ValueError is raised when one of them (or both)
+    is None.
+    """
+    def __init__(self,cx=None,cy=None,rx=None,ry=None,fill=None,stroke=None,stroke_width=None,**args):
+        if rx is None or ry is None:
+            #name the radius that is actually missing (the two messages
+            #used to be swapped)
+            if rx is not None:
+                raise ValueError('ry is required')
+            if ry is not None:
+                raise ValueError('rx is required')
+            raise ValueError('both rx and ry are required')
+        SVGelement.__init__(self,'ellipse',{'rx':rx,'ry':ry},**args)
+        if cx is not None:
+            self.attributes['cx']=cx
+        if cy is not None:
+            self.attributes['cy']=cy
+        if fill is not None:
+            self.attributes['fill']=fill
+        if stroke is not None:
+            self.attributes['stroke']=stroke
+        if stroke_width is not None:
+            self.attributes['stroke-width']=stroke_width
+
+
+class circle(SVGelement):
+    """c=circle(cx,cy,r,fill,stroke,stroke_width,**args)
+
+    A circle is created from a center (cx, cy) and a radius r.
+    The radius is required: ValueError is raised when r is None.
+    """
+    def __init__(self,cx=None,cy=None,r=None,fill=None,stroke=None,stroke_width=None,**args):
+        if r is None:
+            raise ValueError('r is required')
+        SVGelement.__init__(self,'circle',{'r':r},**args)
+        #optional attributes are only set when a value was supplied
+        for key,value in (('cx',cx),('cy',cy),('fill',fill),('stroke',stroke),('stroke-width',stroke_width)):
+            if value is not None:
+                self.attributes[key]=value
+
+class point(circle):
+    """p=point(x,y,color)
+
+    A point is a circle with radius 1. When many points are drawn a very
+    small rectangle may be more efficient, because a circle is difficult
+    to render.
+    """
+    def __init__(self,x,y,fill='black',**args):
+        circle.__init__(self,cx=x,cy=y,r=1,fill=fill,**args)
+
+class line(SVGelement):
+    """l=line(x1,y1,x2,y2,stroke,stroke_width,**args)
+
+    A line runs from a begin x,y pair to an end x,y pair
+    """
+    def __init__(self,x1=None,y1=None,x2=None,y2=None,stroke=None,stroke_width=None,**args):
+        SVGelement.__init__(self,'line',**args)
+        #only arguments that were supplied become attributes
+        for key,value in (('x1',x1),('y1',y1),('x2',x2),('y2',y2),('stroke-width',stroke_width),('stroke',stroke)):
+            if value is not None:
+                self.attributes[key]=value
+
+class polyline(SVGelement):
+    """pl=polyline([[x1,y1],[x2,y2],...],fill,stroke,stroke_width,**args)
+
+    a polyline is defined by a list of xy pairs
+    """
+    def __init__(self,points,fill=None,stroke=None,stroke_width=None,**args):
+        attributes={'points':_xypointlist(points)}
+        SVGelement.__init__(self,'polyline',attributes,**args)
+        for key,value in (('fill',fill),('stroke-width',stroke_width),('stroke',stroke)):
+            if value is not None:
+                self.attributes[key]=value
+
+class polygon(SVGelement):
+    """pg=polygon([[x1,y1],[x2,y2],...],fill,stroke,stroke_width,**args)
+
+    a polygon is defined by a list of xy pairs
+    """
+    def __init__(self,points,fill=None,stroke=None,stroke_width=None,**args):
+        attributes={'points':_xypointlist(points)}
+        SVGelement.__init__(self,'polygon',attributes,**args)
+        for key,value in (('fill',fill),('stroke-width',stroke_width),('stroke',stroke)):
+            if value is not None:
+                self.attributes[key]=value
+
+class path(SVGelement):
+    """p=path(pathdata,fill,stroke,stroke_width,id,**args)
+
+    a path is defined by a path object (stored as its string form in the
+    'd' attribute) and an optional fill color, stroke, stroke width and id
+    """
+    def __init__(self,pathdata,fill=None,stroke=None,stroke_width=None,id=None,**args):
+        SVGelement.__init__(self,'path',{'d':str(pathdata)},**args)
+        for key,value in (('stroke',stroke),('fill',fill),('stroke-width',stroke_width),('id',id)):
+            if value is not None:
+                self.attributes[key]=value
+
+
+class text(SVGelement):
+    """t=text(x,y,text,font_size,font_family,text_anchor,**args)
+
+    a text element can be used for displaying text on the screen
+    """
+    def __init__(self,x=None,y=None,text=None,font_size=None,font_family=None,text_anchor=None,**args):
+        SVGelement.__init__(self,'text',**args)
+        for key,value in (('x',x),('y',y),('font-size',font_size),('font-family',font_family)):
+            if value is not None:
+                self.attributes[key]=value
+        if text is not None:
+            self.text=text
+        if text_anchor is not None:
+            self.attributes['text-anchor']=text_anchor
+
+    def toXml(self,level,f, **kwargs):
+        #whitespace is preserved only when this element itself carries
+        #xml:space="preserve"; the keyword arguments received are not used
+        keep = self.attributes.get("xml:space", None) == "preserve"
+        SVGelement.toXml(self, level, f, preserveWhitespace=keep)
+
+class textpath(SVGelement):
+    """tp=textpath(link,text,**args)
+
+    a textpath places a text on a path which is referenced by a link.
+    """
+    def __init__(self,link,text=None,**args):
+        attributes={'xlink:href':link}
+        SVGelement.__init__(self,'textPath',attributes,**args)
+        if text is not None:
+            self.text=text
+
+class pattern(SVGelement):
+    """p=pattern(x,y,width,height,patternUnits,**args)
+
+    A pattern fills or strokes an object with a pre-defined graphic object
+    that is replicated ("tiled") at fixed intervals in x and y to cover
+    the areas to be painted.
+    """
+    def __init__(self,x=None,y=None,width=None,height=None,patternUnits=None,**args):
+        SVGelement.__init__(self,'pattern',**args)
+        for key,value in (('x',x),('y',y),('width',width),('height',height),('patternUnits',patternUnits)):
+            if value is not None:
+                self.attributes[key]=value
+
+class title(SVGelement):
+    """t=title(text,**args)
+
+    a title is a text element. The text is displayed in the title bar;
+    add at least one to the root svg element
+    """
+    def __init__(self,text=None,**args):
+        SVGelement.__init__(self,type='title',**args)
+        if text is not None:
+            self.text=text
+
+class description(SVGelement):
+    """d=description(text,**args)
+
+    a description (a 'desc' element) can be added to any element and is
+    used for a tooltip. Add this element before adding other elements.
+    """
+    def __init__(self,text=None,**args):
+        SVGelement.__init__(self,type='desc',**args)
+        if text is not None:
+            self.text=text
+
+class lineargradient(SVGelement):
+    """lg=lineargradient(x1,y1,x2,y2,id,**args)
+
+    defines a lineargradient using two xy pairs.
+    stop elements can be added to define the gradient colors.
+    """
+    def __init__(self,x1=None,y1=None,x2=None,y2=None,id=None,**args):
+        SVGelement.__init__(self,'linearGradient',**args)
+        for key,value in (('x1',x1),('y1',y1),('x2',x2),('y2',y2),('id',id)):
+            if value is not None:
+                self.attributes[key]=value
+
+class radialgradient(SVGelement):
+    """rg=radialgradient(cx,cy,r,fx,fy,id,**args)
+
+    defines a radial gradient using an outer circle given by cx, cy and r
+    and a focal point given by fx and fy.
+    stop elements can be added to define the gradient colors.
+    """
+    def __init__(self,cx=None,cy=None,r=None,fx=None,fy=None,id=None,**args):
+        SVGelement.__init__(self,'radialGradient',**args)
+        for key,value in (('cx',cx),('cy',cy),('r',r),('fx',fx),('fy',fy),('id',id)):
+            if value is not None:
+                self.attributes[key]=value
+
+class stop(SVGelement):
+    """st=stop(offset,stop_color,**args)
+
+    Puts a stop color at the specified offset
+    """
+    def __init__(self,offset,stop_color=None,**args):
+        attributes={'offset':offset}
+        SVGelement.__init__(self,'stop',attributes,**args)
+        if stop_color is not None:
+            self.attributes['stop-color']=stop_color
+
+class style(SVGelement):
+    """st=style(type,cdata=None,**args)
+
+    A style element; pass CDATA to it for defining inline stylesheets etc.
+    """
+    def __init__(self,type,cdata=None,**args):
+        attributes={'type':type}
+        SVGelement.__init__(self,'style',attributes,cdata=cdata,**args)
+
+
+class image(SVGelement):
+    """im=image(url,x,y,width,height,**args)
+
+    adds an image to the drawing. Supported formats are .png, .jpg and .svg.
+    width and height are required: ValueError is raised when one of them
+    (or both) is None.
+    """
+    def __init__(self,url,x=None,y=None,width=None,height=None,**args):
+        if width is None and height is None:
+            raise ValueError('both height and width are required')
+        if height is None:
+            raise ValueError('height is required')
+        if width is None:
+            raise ValueError('width is required')
+        SVGelement.__init__(self,'image',{'xlink:href':url,'width':width,'height':height},**args)
+        for key,value in (('x',x),('y',y)):
+            if value is not None:
+                self.attributes[key]=value
+
+class cursor(SVGelement):
+    """c=cursor(url,**args)
+
+    defines a custom cursor for an element or a drawing
+    """
+    def __init__(self,url,**args):
+        attributes={'xlink:href':url}
+        SVGelement.__init__(self,'cursor',attributes,**args)
+
+
+class marker(SVGelement):
+    """m=marker(id,viewBox,refx,refy,markerWidth,markerHeight,**args)
+
+    defines a marker which can be used as an endpoint for a line or other
+    path types. Add to it the element that should be drawn as the marker.
+    """
+    def __init__(self,id=None,viewBox=None,refx=None,refy=None,markerWidth=None,markerHeight=None,**args):
+        SVGelement.__init__(self,'marker',**args)
+        if id is not None:
+            self.attributes['id']=id
+        if viewBox is not None:
+            #the view box is stored in its formatted text form
+            self.attributes['viewBox']=_viewboxlist(viewBox)
+        for key,value in (('refX',refx),('refY',refy),('markerWidth',markerWidth),('markerHeight',markerHeight)):
+            if value is not None:
+                self.attributes[key]=value
+
+class group(SVGelement):
+    """g=group(id,**args)
+
+    a group (a 'g' element) is defined by an id and is used to contain elements
+    g.addElement(SVGelement)
+    """
+    def __init__(self,id=None,**args):
+        SVGelement.__init__(self,type='g',**args)
+        if id is not None:
+            self.attributes['id']=id
+
+class symbol(SVGelement):
+    """sy=symbol(id,viewBox,**args)
+
+    defines a symbol which can be used in different places in your graph
+    through the use element. A symbol is not rendered by itself; 'use'
+    elements display it by referencing its id.
+    sy.addElement(SVGelement)
+    """
+
+    def __init__(self,id=None,viewBox=None,**args):
+        SVGelement.__init__(self,type='symbol',**args)
+        if id is not None:
+            self.attributes['id']=id
+        if viewBox is not None:
+            #the view box is stored in its formatted text form
+            self.attributes['viewBox']=_viewboxlist(viewBox)
+
+class defs(SVGelement):
+    """d=defs(**args)
+
+    container element for definitions
+    """
+    def __init__(self,**args):
+        SVGelement.__init__(self,type='defs',**args)
+
+class switch(SVGelement):
+    """sw=switch(**args)
+
+    Elements added to a switch element are "switched" by the attributes
+    requiredFeatures, requiredExtensions and systemLanguage.
+    Refer to the SVG specification for details.
+    """
+    def __init__(self,**args):
+        SVGelement.__init__(self,type='switch',**args)
+
+
+class use(SVGelement):
+    """u=use(link,x,y,width,height,**args)
+
+    references a symbol by linking to its id, with an optional position,
+    width and height
+    """
+    def __init__(self,link,x=None,y=None,width=None,height=None,**args):
+        SVGelement.__init__(self,'use',{'xlink:href':link},**args)
+        for key,value in (('x',x),('y',y),('width',width),('height',height)):
+            if value is not None:
+                self.attributes[key]=value
+
+
+class link(SVGelement):
+    """a=link(url,**args)
+
+    a link (an 'a' element) is defined by a hyperlink. Add the elements
+    which have to be linked:
+    a.addElement(SVGelement)
+    """
+    def __init__(self,link='',**args):
+        attributes={'xlink:href':link}
+        SVGelement.__init__(self,'a',attributes,**args)
+
+class view(SVGelement):
+    """v=view(id,**args)
+
+    a view can be used to create a view with different attributes"""
+    def __init__(self,id=None,**args):
+        SVGelement.__init__(self,type='view',**args)
+        if id is not None:
+            self.attributes['id']=id
+
+class script(SVGelement):
+    """sc=script(type,cdata,**args)
+
+    adds a script element which contains CDATA to the SVG drawing
+
+    """
+    def __init__(self,type,cdata=None,**args):
+        attributes={'type':type}
+        SVGelement.__init__(self,'script',attributes,cdata=cdata,**args)
+
+class animate(SVGelement):
+    """an=animate(attribute,fr,to,dur,**args)
+
+    animates an attribute; fr, to and dur are written as the 'from', 'to'
+    and 'dur' attributes.
+    """
+    def __init__(self,attribute,fr=None,to=None,dur=None,**args):
+        SVGelement.__init__(self,'animate',{'attributeName':attribute},**args)
+        for key,value in (('from',fr),('to',to),('dur',dur)):
+            if value is not None:
+                self.attributes[key]=value
+
+class animateMotion(SVGelement):
+    """an=animateMotion(pathdata,dur,**args)
+
+    animates a SVGelement over the given path; dur is written as the
+    'dur' attribute
+    """
+    def __init__(self,pathdata,dur,**args):
+        SVGelement.__init__(self,type='animateMotion',**args)
+        if pathdata is not None:
+            #the path is stored in its string form
+            self.attributes['path']=str(pathdata)
+        if dur is not None:
+            self.attributes['dur']=dur
+
+class animateTransform(SVGelement):
+    """antr=animateTransform(type,fr,to,dur,**args)
+
+    transform an element from and to a value.
+    """
+    def __init__(self,type=None,fr=None,to=None,dur=None,**args):
+        #As far as the original author knows the attributeName is always transform
+        SVGelement.__init__(self,'animateTransform',{'attributeName':'transform'},**args)
+        for key,value in (('type',type),('from',fr),('to',to),('dur',dur)):
+            if value is not None:
+                self.attributes[key]=value
+class animateColor(SVGelement):
+    """ac=animateColor(attribute,type,fr,to,dur,**args)
+
+    Animates the color of an element
+    """
+    def __init__(self,attribute,type=None,fr=None,to=None,dur=None,**args):
+        SVGelement.__init__(self,'animateColor',{'attributeName':attribute},**args)
+        for key,value in (('type',type),('from',fr),('to',to),('dur',dur)):
+            if value is not None:
+                self.attributes[key]=value
+class set(SVGelement):
+    """st=set(attribute,to,dur,**args)
+
+    sets an attribute to a value; to and dur are written as the 'to' and
+    'dur' attributes
+    """
+    def __init__(self,attribute,to=None,dur=None,**args):
+        SVGelement.__init__(self,'set',{'attributeName':attribute},**args)
+        for key,value in (('to',to),('dur',dur)):
+            if value is not None:
+                self.attributes[key]=value
+
+
+
+class svg(SVGelement):
+    """s=svg(viewBox,width,height,**args)
+
+    a svg element is the root of a drawing; add all elements to a svg element.
+    You can have different svg elements in one svg file
+    s.addElement(SVGelement)
+
+    eg
+    d=drawing()
+    s=svg((0,0,100,100),'100%','100%')
+    c=circle(50,50,20)
+    s.addElement(c)
+    d.setSVG(s)
+    d.toXml()
+    """
+    def __init__(self,viewBox=None, width=None, height=None,**args):
+        SVGelement.__init__(self,type='svg',**args)
+        if viewBox is not None:
+            #the view box is stored in its formatted text form
+            self.attributes['viewBox']=_viewboxlist(viewBox)
+        for key,value in (('width',width),('height',height)):
+            if value is not None:
+                self.attributes[key]=value
+        self.namespace="http://www.w3.org/2000/svg"
+
+class drawing:
+    """d=drawing()
+
+    this is the actual SVG document. It needs a svg element as a root.
+    Use the setSVG method to set the svg to the root. Use the toXml method to write the SVG
+    source to the screen or to a file
+    d=drawing()
+    d.setSVG(svg)
+    d.toXml(optionalfilename)
+    """
+
+    def __init__(self):
+        self.svg=None
+    def setSVG(self,svg):
+        """Set the root svg element of the drawing."""
+        self.svg=svg
+        #Add an element to the drawing.
+    #Which toXml gets defined depends on the module-level flag use_dom_implementation.
+    if use_dom_implementation==0:
+        def toXml(self, filename='',compress=False):
+            """drawing.toXml()        ---->returns the svg source
+            drawing.toXml(filename)---->writes the svg source to the file
+
+            Without a filename the source is returned, gzip-compressed when
+            compress is true. With a filename the file is gzip-compressed
+            when the name ends with svgz; compress is not looked at then.
+            """
+            import cStringIO
+            xml=cStringIO.StringIO()
+            xml.write('<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n')
+            xml.write("""<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.0//EN"
+ "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd"
+ [<!ATTLIST svg xmlns:xlink CDATA #FIXED "http://www.w3.org/1999/xlink">]>\n""")
+            self.svg.toXml(0,xml)
+            if not filename:
+                if compress:
+                    import gzip
+                    f=cStringIO.StringIO()
+                    zf=gzip.GzipFile(fileobj=f,mode='wb')
+                    zf.write(xml.getvalue())
+                    zf.close()
+                    f.seek(0)
+                    return f.read()
+                else:
+                    return xml.getvalue()
+            else:
+                if filename[-4:]=='svgz':
+                    import gzip
+                    f=gzip.GzipFile(filename=filename,mode="wb", compresslevel=9)
+                else:
+                    f=file(filename,'w')
+                #close the file even when writing fails
+                try:
+                    f.write(xml.getvalue())
+                finally:
+                    f.close()
+
+    else:
+        def toXml(self,filename='',compress=False):
+            """drawing.toXml()        ---->to the screen
+            drawing.toXml(filename)---->to the file
+            writes a svg drawing to the screen or to a file
+            compresses if filename ends with svgz or if compress is true
+            """
+            doctype = implementation.createDocumentType('svg',"-//W3C//DTD SVG 1.0//EN""",'http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd ')
+
+            global root
+            #root is defined global so it can be used by the appender. It is also possible to use it as an argument but
+            #that is a bit messy.
+            root=implementation.createDocument(None,None,doctype)
+            #Create the xml document.
+            global appender
+            def appender(element,elementroot):
+                """This recursive function appends elements to an element and sets the attributes
+                and type. It stops when all elements have been appended"""
+                if element.namespace:
+                    e=root.createElementNS(element.namespace,element.type)
+                else:
+                    e=root.createElement(element.type)
+                if element.text:
+                    textnode=root.createTextNode(element.text)
+                    e.appendChild(textnode)
+                for attribute in element.attributes.keys(): #in element.attributes is supported from python 2.2
+                    e.setAttribute(attribute,str(element.attributes[attribute]))
+                if element.elements:
+                    for el in element.elements:
+                        e=appender(el,e)
+                elementroot.appendChild(e)
+                return elementroot
+            root=appender(self.svg,root)
+            if not filename:
+                import cStringIO
+                xml=cStringIO.StringIO()
+                PrettyPrint(root,xml)
+                if compress:
+                    import gzip
+                    f=cStringIO.StringIO()
+                    zf=gzip.GzipFile(fileobj=f,mode='wb')
+                    zf.write(xml.getvalue())
+                    zf.close()
+                    f.seek(0)
+                    return f.read()
+                else:
+                    return xml.getvalue()
+            else:
+                #best effort: a failure to write is reported, not raised
+                try:
+                    if filename[-4:]=='svgz':
+                        import gzip
+                        import cStringIO
+                        xml=cStringIO.StringIO()
+                        PrettyPrint(root,xml)
+                        f=gzip.GzipFile(filename=filename,mode='wb',compresslevel=9)
+                        f.write(xml.getvalue())
+                        f.close()
+                    else:
+                        f=open(filename,'w')
+                        PrettyPrint(root,f)
+                        f.close()
+                except:
+                    print "Cannot write SVG file: " + filename
+    def validate(self):
+        """Feed the svg source of the drawing to the PyXML validating parser.
+
+        Raises ImportError when xml.parsers.xmlproc.xmlval cannot be imported
+        and ValueError when feeding the source to the validator fails.
+        """
+        try:
+            import xml.parsers.xmlproc.xmlval
+        except ImportError:
+            raise ImportError('PyXml is required for validating SVG')
+        svg=self.toXml()
+        xv=xml.parsers.xmlproc.xmlval.XMLValidator()
+        try:
+            xv.feed(svg)
+        except Exception:
+            #a plain string cannot be raised as an exception (TypeError on
+            #Python >= 2.6), so a real exception class carries the message
+            raise ValueError("SVG is not well formed, see messages above")
+        else:
+            print "SVG well formed"
+#Demo, run only when the module is executed as a script: builds a drawing
+#and prints its svg source.
+if __name__=='__main__':
+
+
+ d=drawing()
+ s=svg((0,0,100,100))
+ #background rectangle
+ r=rect(-100,-100,300,300,'cyan')
+ s.addElement(r)
+
+ t=title('SVGdraw Demo')
+ s.addElement(t)
+ #group with id 'animations': an ellipse, a circle and a motion animation
+ g=group('animations')
+ e=ellipse(0,0,5,2)
+ g.addElement(e)
+ c=circle(0,0,1,'red')
+ g.addElement(c)
+ #path for the animation, built from relative smooth bezier segments
+ pd=pathdata(0,-10)
+ for i in range(6):
+ pd.relsmbezier(10,5,0,10)
+ pd.relsmbezier(-10,5,0,10)
+ an=animateMotion(pd,10)
+ an.attributes['rotate']='auto-reverse'
+ an.attributes['repeatCount']="indefinite"
+ g.addElement(an)
+ s.addElement(g)
+ #reference the group through use elements at several x positions
+ for i in range(20,120,20):
+ u=use('#animations',i,0)
+ s.addElement(u)
+ #red circles with a black stroke
+ for i in range(0,120,20):
+ for j in range(5,105,10):
+ c=circle(i,j,1,'red','black',.5)
+ s.addElement(c)
+ d.setSVG(s)
+
+ print d.toXml()
+
diff --git a/src/obitools/__init__.py b/src/obitools/__init__.py
new file mode 100644
index 0000000..e3f8c87
--- /dev/null
+++ b/src/obitools/__init__.py
@@ -0,0 +1,57 @@
+'''
+**obitools** main module
+------------------------
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+
+
+The obitools module provides the base classes for sequence manipulation.
+
+All biological sequences must be subclasses of :py:class:`obitools.BioSequence`.
+Some biological sequences are defined as transformations of other
+biological sequences. For example, reverse-complemented sequences
+are a transformation of a :py:class:`obitools.NucSequence`. Sequences of this
+particular type are subclasses of :py:class:`obitools.WrappedBioSequence`.
+
+.. inheritance-diagram:: BioSequence NucSequence AASequence WrappedBioSequence SubSequence DNAComplementSequence
+ :parts: 1
+
+
+'''
+
+from _obitools import BioSequence,NucSequence,AASequence, \
+ WrappedBioSequence,SubSequence, \
+ DNAComplementSequence,_default_raw_parser, \
+ _isNucSeq,bioSeqGenerator
+
+#try:
+# from functools import partial
+#except:
+# #
+# # Add for compatibility purpose with Python < 2.5
+# #
+# def partial(func, *args, **keywords):
+# def newfunc(*fargs, **fkeywords):
+# newkeywords = keywords.copy()
+# newkeywords.update(fkeywords)
+# return func(*(args + fargs), **newkeywords)
+# newfunc.func = func
+# newfunc.args = args
+# newfunc.keywords = keywords
+# return newfunc
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/obitools/_obitools.h b/src/obitools/_obitools.h
new file mode 100644
index 0000000..d9f7958
--- /dev/null
+++ b/src/obitools/_obitools.h
@@ -0,0 +1,25 @@
+/* Public C declarations of the obitools._obitools extension module.
+ * NOTE(review): the __PYX_ prefixes suggest this header is generated by
+ * Cython from _obitools.pyx/.pxd and should not be edited by hand - confirm. */
+#ifndef __PYX_HAVE__obitools___obitools
+#define __PYX_HAVE__obitools___obitools
+
+
+/* The variable declaration below is skipped when the API variant of the
+ * header has already defined __PYX_HAVE_API__obitools___obitools. */
+#ifndef __PYX_HAVE_API__obitools___obitools
+
+/* Linkage specifier: extern "C" when compiled as C++, plain extern otherwise. */
+#ifndef __PYX_EXTERN_C
+ #ifdef __cplusplus
+ #define __PYX_EXTERN_C extern "C"
+ #else
+ #define __PYX_EXTERN_C extern
+ #endif
+#endif
+
+/* Module-level object corresponding to the `cdef public bytes
+ * __default_raw_parser` declaration of _obitools.pxd. */
+__PYX_EXTERN_C DL_IMPORT(PyObject) *__pyx_v_8obitools_9_obitools___default_raw_parser;
+
+#endif /* !__PYX_HAVE_API__obitools___obitools */
+
+/* Module initialisation entry point; its name depends on the Python major version. */
+#if PY_MAJOR_VERSION < 3
+PyMODINIT_FUNC init_obitools(void);
+#else
+PyMODINIT_FUNC PyInit__obitools(void);
+#endif
+
+#endif /* !__PYX_HAVE__obitools___obitools */
diff --git a/src/obitools/_obitools.pxd b/src/obitools/_obitools.pxd
new file mode 100644
index 0000000..3fa7895
--- /dev/null
+++ b/src/obitools/_obitools.pxd
@@ -0,0 +1,109 @@
+# memchr() is declared by the standard <string.h>. <strings.h> only covers
+# the BSD-style helpers (strcasecmp, index, bcopy, ...), so including it does
+# not guarantee that a prototype for memchr() is visible to the C compiler.
+cdef extern from "string.h":
+    void *memchr(char *s, int c, int n)
+
+cdef public bytes __default_raw_parser = b" %s *= *([^;]*);"
+
+
+# C-level attribute layout and cpdef method table of BioSequence.
+# The implementation lives in _obitools.pyx.
+cdef class BioSequence(object):
+
+ # Storage behind the `quality` property (None when no quality is set).
+ cdef object __quality
+ # Sequence text as stored by set_seq() (lower-cased) and the length
+ # recorded at the same time.
+ cdef public bytes __seq
+ cdef public int __len
+ # Header text not yet parsed into _info; None when absent or once
+ # iteritems() has parsed all of it.
+ cdef public bytes __rawinfo
+ # Parsed key / value annotations.
+ cdef public dict _info
+ # Regular expression template ("%s" receives the key) used on __rawinfo.
+ cdef public bytes _rawparser
+ cdef public bytes _definition
+ cdef public bytes _id
+ # Set to True by __init__ and updated by extractTaxon()/__setitem__/__delitem__.
+ cdef public bint _hasTaxid
+ # Lazily created WrapperSet of weak references to wrapping sequences.
+ cdef public object _wrappers
+ # Initialised to None / 0 by __init__; not used elsewhere in _obitools.pyx.
+ cdef public object word4table
+ cdef public int word4over
+
+ cpdef bytes get_seq(self)
+ cpdef set_seq(self, object value)
+ cpdef object clone(self)
+ cpdef bytes getDefinition(self)
+ cpdef setDefinition(self, bytes value)
+ cpdef bytes getId(self)
+ cpdef setId(self, bytes value)
+ cpdef bytes getStr(self)
+ cpdef getSymbolAt(self, int position)
+ cpdef object getSubSeq(self, object location)
+ cpdef object getKey(self, bytes key)
+ cpdef extractTaxon(self)
+ cpdef bint hasKey(self,bytes key)
+ cpdef list items(self)
+ cpdef list keys(self)
+ cpdef dict getTags(self)
+ cpdef object getRoot(self)
+ cpdef int _getTaxid(self)
+ cpdef _setTaxid(self,int taxid)
+ cpdef bytes _getRawInfo(self)
+
+# Nucleotide specialisation: adds complement() and isNucleotide().
+cdef class NucSequence(BioSequence):
+ cpdef object complement(self)
+ cpdef bint isNucleotide(self)
+
+# Protein specialisation: only adds isNucleotide().
+cdef class AASequence(BioSequence):
+ cpdef bint isNucleotide(self)
+
+# Base class of sequences defined as a view over another sequence.
+cdef class WrappedBioSequence(BioSequence):
+
+ # The sequence this object wraps.
+ cdef object _wrapped
+ # Declared so that instances can be weakly referenced
+ # (BioSequence.register() stores weakref.ref(wrapper)).
+ cdef object __weakref__
+
+ cpdef object clone(self)
+ cpdef object getWrapped(self)
+ cpdef bytes getDefinition(self)
+ cpdef setDefinition(self, bytes value)
+ cpdef bytes getId(self)
+ cpdef setId(self, bytes value)
+ cpdef bint isNucleotide(self)
+ cpdef object getKey(self,bytes key)
+ cpdef bint hasKey(self,bytes key)
+ cpdef getSymbolAt(self, int position)
+ # posInWrapped() maps a position relative to `reference` (the directly
+ # wrapped sequence by default); _posInWrapped() is the one-level mapping
+ # that subclasses override.
+ cpdef int posInWrapped(self, int position, object reference=? ) except *
+ cpdef int _posInWrapped(self, int position) except *
+ cpdef bytes getStr(self)
+ cpdef object getRoot(self)
+ cpdef object complement(self)
+ cpdef bytes _getRawInfo(self)
+
+cdef int _sign(int x)
+
+# View over a slice of another sequence.
+cdef class SubSequence(WrappedBioSequence):
+
+ # The slice describing the cut, its resolved (start, stop, step) triple
+ # and the matching xrange over positions of the wrapped sequence.
+ cdef public object _location
+ cdef public object _indices
+ cdef public object _xrange
+
+ cpdef bytes getId(self)
+ cpdef setId(self, bytes value)
+ cpdef object clone(self)
+ cpdef bytes getStr(self)
+ cpdef int _posInWrapped(self, int position) except *
+
+# Reverse-complemented view over a nucleotide sequence.
+cdef class DNAComplementSequence(WrappedBioSequence):
+
+ # NOTE(review): _obitools.pyx looks symbols up in a module-level `_comp`
+ # dictionary and never assigns this instance attribute - confirm whether
+ # it is still needed.
+ cdef dict _comp
+
+ cpdef bytes getId(self)
+ cpdef setId(self, bytes value)
+ cpdef bytes getStr(self)
+ cpdef int _posInWrapped(self, int position) except *
+ cpdef getSymbolAt(self, int position)
+ cpdef object complement(self)
+
+cpdef bint _isNucSeq(bytes text)
+
+# C-level factory behind bioSeqGenerator(): returns a NucSequence or an
+# AASequence depending on _isNucSeq(seq). `info` carries the extra
+# annotations as a plain dict.
+cdef object _bioSeqGenerator(bytes id,
+ bytes seq,
+ bytes definition,
+ bytes rawinfo,
+ bytes rawparser,
+ dict info)
+
+
+
+
+
diff --git a/src/obitools/_obitools.pyx b/src/obitools/_obitools.pyx
new file mode 100644
index 0000000..55fc456
--- /dev/null
+++ b/src/obitools/_obitools.pyx
@@ -0,0 +1,800 @@
+# cython: profile=True
+
+
+from _obitools cimport *
+
+#from cython.parallel import parallel, prange
+
+from weakref import ref
+import re
+from itertools import chain
+import array
+
+
+from obitools.utils.iterator import uniqueChain
+from obitools.sequenceencoder import DNAComplementEncoder
+from obitools.location import Location
+
+__default_raw_parser = b" %s *= *([^;]*);"
+_default_raw_parser=__default_raw_parser
+
+cdef class WrapperSetIterator(object):
+    """
+    Iterator over a :py:class:`WrapperSet`.
+
+    The set stores weak references; each step calls the next reference and
+    returns the object it points to.
+    """
+
+    # Extension types have no instance ``__dict__``: the attribute assigned
+    # in ``__init__`` has to be declared, otherwise the assignment fails.
+    cdef object _i
+
+    def __init__(self,s):
+        self._i = set.__iter__(s)
+
+    def __next__(self):
+        # Extension types implement the iterator protocol through
+        # ``__next__``; Cython supplies the Python 2 ``next`` method itself.
+        return next(self._i)()
+
+    def __iter__(self):
+        return self
+
+# Set whose iteration goes through WrapperSetIterator, i.e. yields the
+# result of calling each stored element (the elements added by
+# BioSequence.register() are weak references).
+cdef class WrapperSet(set):
+ def __iter__(self): # @DuplicatedSignature
+ return WrapperSetIterator(self)
+
+
+cdef class BioSequence(object):
+    '''
+    BioSequence class is the base class for biological
+    sequence representation.
+
+    It provides storage of:
+
+        - the sequence itself,
+        - an identifier,
+        - a definition,
+        - a set of complementary information managed on a key / value principle.
+
+    Key / value pairs may be given explicitly or as a raw header text
+    (``rawinfo``) which is parsed lazily, key by key, with the ``rawparser``
+    regular expression template. Values already present in the parsed
+    dictionary always take precedence over the raw text.
+
+    .. warning::
+
+        :py:class:`obitools.BioSequence` is an abstract class, this constructor
+        can only be called by a subclass constructor.
+    '''
+
+    def __init__(self,bytes id, bytes seq, # @DuplicatedSignature
+                 bytes definition=None,
+                 bytes rawinfo=None,
+                 bytes rawparser=__default_raw_parser,**info):
+        '''
+
+        :param id: sequence identifier
+        :type id: `str`
+
+        :param seq: the sequence
+        :type seq: `str`
+
+        :param definition: sequence definition (optional)
+        :type definition: `str`
+
+        :param rawinfo: a text containing a set of key=value; patterns
+        :type rawinfo: `str`
+
+        :param rawparser: a text describing a regular pattern template
+                          used to parse rawinfo
+        :type rawparser: `str`
+
+        :param info: extra named parameters can be added to associate complementary
+                     data to the sequence
+
+        '''
+
+        assert type(self)!=BioSequence,"obitools.BioSequence is an abstract class"
+
+        # Goes through the `_seq` property, i.e. set_seq().
+        self._seq=seq
+        self._info = dict(info)
+        if rawinfo is not None:
+            # The default parser template starts with a space, so a leading
+            # space lets the first key=value pair match as well.
+            self.__rawinfo=b' ' + rawinfo
+        else:
+            self.__rawinfo=None
+        self._rawparser=rawparser
+        self._definition=definition
+        self._id=id
+        self._hasTaxid=True
+        self.__quality=None
+        self.word4table=None
+        self.word4over=0
+
+    cpdef bytes get_seq(self):
+        '''
+        :return: the stored (lower-cased) sequence
+        '''
+        return self.__seq
+
+    cpdef set_seq(self, object value):
+        '''
+        Store `value` as the sequence, converted to `bytes` if needed and
+        lower-cased, and record its length.
+        '''
+        cdef bytes s
+
+        if not isinstance(value, bytes):
+            s=bytes(value)
+        else:
+            s=value
+
+        self.__seq = s.lower()
+        self.__len = len(s)
+
+    cpdef object clone(self):
+        '''
+        Build a new instance of the same class with the same identifier,
+        sequence and definition, a copy of the tags, and the same raw
+        information, parser and taxid flag.
+        '''
+        seq = type(self)(self.id,
+                         str(self),
+                         definition=self.definition
+                         )
+        seq._info=dict(self.getTags())
+        seq.__rawinfo=self.__rawinfo
+        seq._rawparser=self._rawparser
+        seq._hasTaxid=self._hasTaxid
+        return seq
+
+    cpdef bytes getDefinition(self):
+        '''
+        Sequence definition getter.
+
+        :return: the sequence definition
+        :rtype: str
+
+        '''
+        return self._definition
+
+    cpdef setDefinition(self, bytes value):
+        '''
+        Sequence definition setter.
+
+        :param value: the new sequence definition
+        :type value: C{str}
+        :return: C{None}
+        '''
+        self._definition = value
+
+    cpdef bytes getId(self):
+        '''
+        Sequence identifier getter
+
+        :return: the sequence identifier
+        :rtype: C{str}
+        '''
+        return self._id
+
+    cpdef setId(self, bytes value):
+        '''
+        Sequence identifier setter.
+
+        :param value: the new sequence identifier
+        :type value: C{str}
+        :return: C{None}
+        '''
+        self._id = value
+
+    cpdef bytes getStr(self):
+        '''
+        Return the sequence as a string
+
+        :return: the string representation of the sequence
+        :rtype: str
+        '''
+        return self._seq
+
+    cpdef getSymbolAt(self, int position):
+        '''
+        Return the symbol at C{position} in the sequence
+
+        :param position: the desired position. Positions start from 0;
+                         negative positions are counted from the end
+                         of the sequence.
+        :type position: `int`
+
+        :return: a one letter string
+        :rtype: `str`
+        '''
+        return str(self)[position]
+
+    cpdef object getSubSeq(self, object location):
+        '''
+        Return a subsequence as described by C{location}.
+
+        The C{location} parameter can be a L{obitools.location.Location} instance,
+        an integer or a python C{slice} instance. If C{location}
+        is an integer this method is equivalent to L{getSymbolAt}.
+
+        :param location: the positions of the subsequence to return
+        :type location: C{Location} or C{int} or C{slice}
+        :return: the subsequence
+        :rtype: a single character as a C{str} if C{location} is an integer,
+                a L{obitools.SubSequence} instance if it is a slice, and the
+                result of C{location.extractSequence} for a C{Location}.
+        :raise TypeError: for any other kind of C{location}
+        '''
+        if isinstance(location,Location):
+            return location.extractSequence(self)
+        elif isinstance(location, int):
+            return self.getSymbolAt(location)
+        elif isinstance(location, slice):
+            return SubSequence(self,location)
+
+        raise TypeError('key must be a Location, an integer or a slice')
+
+    cpdef object getKey(self, bytes key):
+        '''
+        Return the value associated to `key`.
+
+        The parsed dictionary is looked up first, then the raw header text;
+        a value found in the raw text is moved to the dictionary. A missing
+        ``count`` key defaults to 1.
+
+        :raise KeyError: if the key is defined nowhere
+        '''
+        if key not in self._info:
+            if self.__rawinfo is None:
+                if key==b'count':
+                    return 1
+                elif key==b'taxid' and self._hasTaxid:
+                    # Gives subclasses a chance to fill in the taxid.
+                    self.extractTaxon()
+                    return self._info['taxid']
+                else:
+                    raise KeyError(key)
+            p = re.compile(self._rawparser % key)
+            m = p.search(self.__rawinfo)
+            if m is not None:
+                v=m.group(1)
+                # Remove the consumed pattern from the raw text.
+                self.__rawinfo=b' ' + self.__rawinfo[0:m.start(0)]+self.__rawinfo[m.end(0):]
+                # NOTE(review): eval() is applied to text coming from the
+                # sequence header; this executes arbitrary expressions when
+                # the input file is not trusted. The bare except is kept on
+                # purpose: any failure means "keep the raw text".
+                try:
+                    v = eval(v)
+                except:
+                    pass
+                self._info[key]=v
+            else:
+                if key==b'count':
+                    v=1
+                else:
+                    raise KeyError(key)
+        else:
+            v=self._info[key]
+        return v
+
+    cpdef extractTaxon(self):
+        '''
+        Extract Taxonomy information from the sequence header.
+        This method by default returns None after refreshing the taxid
+        flag. It should be overridden if necessary as in
+        L{obitools.seqdb.AnnotatedSequence}.
+
+        :return: None
+        '''
+        self._hasTaxid=self.hasKey(b'taxid')
+        return None
+
+    def get(self,key,default):
+        '''
+        Return the value of `key`; when the key is missing, `default` is
+        stored under that key and returned.
+        '''
+        try:
+            v = self.getKey(key)
+        except KeyError:
+            v=default
+            self[key]=v
+        return v
+
+    def __str__(self):
+        return self.getStr()
+
+    def __getitem__(self,key):
+        # bytes keys address annotations, anything else addresses positions.
+        if isinstance(key, bytes):
+            return self.getKey(key)
+        else:
+            return self.getSubSeq(key)
+
+    def __setitem__(self,key,value):
+        # The membership test moves a value possibly present in the raw
+        # header to _info, so that it cannot reappear after being replaced.
+        self.__contains__(key)
+        self._info[key]=value
+        if key=='taxid':
+            self._hasTaxid=value is not None
+
+    def __delitem__(self,key):
+        if isinstance(key, bytes):
+            if key in self:
+                del self._info[key]
+            else:
+                raise KeyError(key)
+
+            if key=='taxid':
+                self._hasTaxid=False
+        else:
+            raise TypeError(key)
+
+    def __iter__(self): # @DuplicatedSignature
+        '''
+        Iterate through the sequence symbols
+        '''
+        return iter(str(self))
+
+    def __len__(self):
+        return self.__len
+
+    cpdef bint hasKey(self,bytes key):
+        '''
+        Tell whether `key` is defined, either in the parsed dictionary or in
+        the raw header text. A value found in the raw text is moved to the
+        dictionary.
+        '''
+        cdef bint rep
+
+        rep = key in self._info
+
+        if not rep and self.__rawinfo is not None:
+            p = re.compile(self._rawparser % key)
+            m = p.search(self.__rawinfo)
+            if m is not None:
+                v=m.group(1)
+                self.__rawinfo=b' ' + self.__rawinfo[0:m.start(0)]+self.__rawinfo[m.end(0):]
+                # NOTE(review): same eval() on header text as in getKey().
+                try:
+                    v = eval(v)
+                except:
+                    pass
+                self._info[key]=v
+                rep=True
+
+        return rep
+
+    def __contains__(self,key):
+        '''
+        Method allowing to use the C{in} operator on a C{BioSequence}.
+
+        The C{in} operator tests if the C{key} value is defined for this
+        sequence.
+
+        :param key: the name of the checked value
+        :type key: str
+        :return: C{True} if the value is defined, C{False} otherwise.
+        :rtype: C{bool}
+        '''
+        # NOTE(review): _hasTaxid is declared `bint` and is therefore never
+        # None; this branch looks unreachable - confirm the intent.
+        if key=='taxid' and self._hasTaxid is None:
+            self.extractTaxon()
+        return self.hasKey(key)
+
+    def rawiteritems(self):
+        return self.iteritems()
+
+    def iteritems(self):
+        '''
+        Iterate over the items of the dictionary storing the values
+        associated to the sequence. It works similarly to
+        the iteritems function of C{dict}.
+
+        The remaining raw header text is parsed completely on the first
+        call. Keys already present in the dictionary are left untouched,
+        which is the precedence applied by L{getKey} and L{hasKey}.
+
+        :return: an iterator over the items (key,value)
+                 linked to a sequence
+        :rtype: iterator over tuple
+        :see: L{items}
+        '''
+        if self.__rawinfo is not None:
+            p = re.compile(self._rawparser % "([a-zA-Z]\w*)")
+            for k,v in p.findall(self.__rawinfo):
+                if k in self._info:
+                    # An explicit or previously extracted value wins over
+                    # a leftover raw occurrence of the same key.
+                    continue
+                # NOTE(review): same eval() on header text as in getKey().
+                try:
+                    self._info[k]=eval(v)
+                except:
+                    self._info[k]=v
+            self.__rawinfo=None
+        return self._info.iteritems()
+
+    cpdef list items(self):
+        return [x for x in self.iteritems()]
+
+    def iterkeys(self):
+        return (k for k,v in self.iteritems())
+
+    cpdef list keys(self):
+        return [x for x in self.iterkeys()]
+
+    cpdef dict getTags(self):
+        '''
+        Return the dictionary of annotations itself (not a copy), after the
+        raw header text has been parsed completely.
+        '''
+        # Called only for its side effect: parsing the raw header text.
+        self.iteritems()
+        return self._info
+
+    cpdef object getRoot(self):
+        return self
+
+    def getWrappers(self):
+        # The set is created on first access only.
+        if self._wrappers is None:
+            self._wrappers=WrapperSet()
+        return self._wrappers
+
+    def register(self,wrapper):
+        # Only a weak reference is kept; _unregister is its callback.
+        self.wrappers.add(ref(wrapper,self._unregister))
+
+    def _unregister(self,ref):
+        self.wrappers.remove(ref)
+
+    wrappers = property(getWrappers,None,None,'')
+
+    definition = property(getDefinition, setDefinition, None, "Sequence Definition")
+
+    id = property(getId, setId, None, 'Sequence identifier')
+
+    cpdef int _getTaxid(self):
+        return self['taxid']
+
+    cpdef _setTaxid(self,int taxid):
+        self['taxid']=taxid
+
+    cpdef bytes _getRawInfo(self):
+        return self.__rawinfo
+
+    _rawinfo = property(_getRawInfo)
+
+
+    taxid = property(_getTaxid,_setTaxid,None,'NCBI Taxonomy identifier')
+    _seq = property(get_seq, set_seq, None, None)
+
+    def _getQuality(self):
+        # Raising AttributeError makes hasattr(seq,'quality') False when
+        # no quality is attached.
+        if self.__quality is None:
+            raise AttributeError
+        else:
+            return self.__quality
+
+    def _setQuality(self,qual):
+        self.__quality=qual
+
+    def _delQuality(self):
+        self.__quality=None
+
+    quality = property(_getQuality,_setQuality,_delQuality,'Quality associated to the sequence')
+
+cdef class NucSequence(BioSequence):
+    """
+    Specialisation of :py:class:`BioSequence` holding DNA sequences.
+
+    Instances are built with the same arguments as :py:class:`BioSequence`.
+    """
+
+    cpdef bint isNucleotide(self):
+        """
+        :return: always `True`
+        """
+        return True
+
+    cpdef object complement(self):
+        """
+        :return: this sequence wrapped in a :py:class:`DNAComplementSequence`,
+                 i.e. its reverse complement
+        :rtype: :py:class:`DNAComplementSequence`
+        """
+        return DNAComplementSequence(self)
+
+
+cdef class AASequence(BioSequence):
+    """
+    Specialisation of :py:class:`BioSequence` holding protein sequences.
+
+    Instances are built with the same arguments as :py:class:`BioSequence`.
+    """
+
+    cpdef bint isNucleotide(self):
+        """
+        :return: always `False`
+        """
+        return False
+
+
+
+cdef class WrappedBioSequence(BioSequence):
+    """
+    Base class of the sequences defined as a view over another sequence.
+
+    Identifier, definition and annotations set on the wrapper take
+    precedence; everything else is delegated to the wrapped sequence.
+
+    .. warning::
+
+        :py:class:`obitools.WrappedBioSequence` is an abstract class, this constructor
+        can only be called by a subclass constructor.
+    """
+
+
+    def __init__(self, object reference, # @DuplicatedSignature
+                 bytes id=None,
+                 bytes definition=None,
+                 **info):
+        """
+        :param reference: the sequence to wrap; the wrapper registers itself
+                          on it through ``reference.register``
+        :param id: identifier overriding the one of the wrapped sequence
+        :param definition: definition overriding the one of the wrapped sequence
+        :param info: annotations specific to the wrapper
+        """
+
+        assert type(self)!=WrappedBioSequence,"obitools.WrappedBioSequence is an abstract class"
+
+        self._wrapped = reference
+        reference.register(self)
+        self._id=id
+        self.definition=definition
+        self._info=info
+
+    cpdef object clone(self):
+        """
+        Build a new wrapper of the same class over the same wrapped sequence,
+        with the same own identifier / definition and a copy of the own tags.
+        """
+        seq = type(self)(self.wrapped,
+                         id=self._id,
+                         definition=self._definition
+                         )
+        seq._info=dict(self._info)
+
+        return seq
+
+    cpdef object getWrapped(self):
+        return self._wrapped
+
+    cpdef bytes getDefinition(self):
+        # Falls back on the wrapped sequence when no own definition is set.
+        d = self._definition or self.wrapped.definition
+        return d
+
+    cpdef setDefinition(self, bytes value):
+        '''
+        Sequence definition setter.
+
+        :param value: the new sequence definition
+        :type value: C{str}
+        :return: C{None}
+        '''
+        self._definition=value
+
+    cpdef bytes getId(self):
+        # Falls back on the wrapped sequence when no own identifier is set.
+        d = self._id or self.wrapped.id
+        return d
+
+    cpdef setId(self, bytes value):
+        '''
+        Sequence identifier setter.
+
+        :param value: the new sequence identifier
+        :type value: C{str}
+        :return: C{None}
+        '''
+        self._id = value
+
+    cpdef bint isNucleotide(self):
+        return self.wrapped.isNucleotide()
+
+
+    def iterkeys(self): # @DuplicatedSignature
+        return uniqueChain(self._info.iterkeys(),
+                           self.wrapped.iterkeys())
+
+    def rawiteritems(self): # @DuplicatedSignature
+        # Own items first, then the items of the wrapped sequence that are
+        # not masked by an own key.
+        return chain(self._info.iteritems(),
+                     (x for x in self.wrapped.rawiteritems()
+                      if x[0] not in self._info))
+
+    def iteritems(self): # @DuplicatedSignature
+        for x in self.iterkeys():
+            yield (x,self[x])
+
+    cpdef object getKey(self,bytes key):
+        if key in self._info:
+            return self._info[key]
+        else:
+            return self.wrapped.getKey(key)
+
+    cpdef bint hasKey(self,bytes key):
+        return key in self._info or self.wrapped.hasKey(key)
+
+    cpdef getSymbolAt(self, int position):
+        return self.wrapped.getSymbolAt(self.posInWrapped(position))
+
+    cpdef int posInWrapped(self, int position, object reference=None) except *:
+        """
+        Translate `position` on this sequence into a position on `reference`,
+        which defaults to the directly wrapped sequence. For any other
+        reference the translation is chained through the wrapped sequence.
+        """
+        if reference is None or reference is self.wrapped:
+            return self._posInWrapped(position)
+        else:
+            return self.wrapped.posInWrapped(self._posInWrapped(position),reference)
+
+
+    cpdef bytes getStr(self):
+        return str(self.wrapped)
+
+    cpdef object getRoot(self):
+        return self.wrapped.getRoot()
+
+    cpdef object complement(self):
+        """
+        The :py:meth:`complement` method of the :py:class:`WrappedBioSequence` class
+        raises an exception :py:exc:`AttributeError` if the method is called and the
+        wrapped sequence does not correspond to a nucleic acid sequence.
+        """
+
+        if self.wrapped.isNucleotide():
+            return DNAComplementSequence(self)
+        raise AttributeError
+
+
+    cpdef int _posInWrapped(self, int position) except *:
+        # Identity mapping; subclasses override it.
+        return position
+
+
+    definition = property(getDefinition,setDefinition, None)
+    id = property(getId,setId, None)
+
+    wrapped = property(getWrapped, None, None, "A pointer to the wrapped sequence")
+
+    cpdef bytes _getRawInfo(self):
+        # Goes through the `_rawinfo` property rather than the private slot
+        # of the wrapped object: when the wrapped object is itself a wrapper
+        # its own slot is never filled, while its property delegates further
+        # down, like getKey() and getRoot() do.
+        return self.wrapped._rawinfo
+
+    _rawinfo = property(_getRawInfo)
+
+
+cdef int _sign(int x):
+    # Sign of x as an integer: 1 for positive, -1 for negative, 0 for zero.
+    if x > 0:
+        return 1
+    if x < 0:
+        return -1
+    return 0
+
+cdef class SubSequence(WrappedBioSequence):
+    """
+    View over a slice of another sequence.
+
+    The cut is described either by a python `slice` (`location`) or by the
+    `start` / `stop` pair. The resolved indices are recorded in the ``cut``
+    annotation.
+    """
+
+    def __init__(self, object reference, # @DuplicatedSignature
+                 object location=None,
+                 int start=0, object stop=None,
+                 object id=None,
+                 object definition=None,
+                 **info):
+        """
+        :param reference: the sequence to cut
+        :param location: a `slice`; when it is not a slice, `start` and
+                         `stop` are used instead
+        :param start: first position of the cut (used without `location`)
+        :param stop: end of the cut; defaults to the length of `reference`
+        :param id: identifier overriding the default ``<wrapped id>_SUB``
+        :param definition: definition overriding the one of `reference`
+        :param info: annotations specific to the subsequence
+        """
+        # id and definition are forwarded: they used to be replaced by None,
+        # which silently discarded the values given by the caller (clone()
+        # relies on them being honoured).
+        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
+
+        if isinstance(location, slice):
+            self._location = location
+        else:
+            # `start` is no longer reset to 0 here, so the argument is honoured.
+            step = 1
+            if not isinstance(stop,int):
+                stop = len(reference)
+            self._location=slice(start,stop,step)
+
+        self._indices=self._location.indices(len(self.wrapped))
+        self._xrange=xrange(*self._indices)
+
+        self._info['cut']='[%d,%d,%s]' % self._indices
+
+        # The quality property raises AttributeError when unset, so hasattr
+        # is False for sequences without quality.
+        if hasattr(reference,'quality'):
+            self.quality = reference.quality[self._location]
+
+    cpdef bytes getId(self):
+        d = self._id or ("%s_SUB" % self.wrapped.id)
+        return d
+
+    cpdef setId(self, bytes value):
+        '''
+        Sequence identifier setter.
+
+        :param value: the new sequence identifier
+        :type value: C{str}
+        :return: C{None}
+        '''
+        WrappedBioSequence.setId(self,value)
+
+
+    cpdef object clone(self):
+        """
+        Clone the wrapper and give the clone the same location.
+        """
+        seq = WrappedBioSequence.clone(self)
+        seq._location=self._location
+        seq._indices=seq._location.indices(len(seq.wrapped))
+        seq._xrange=xrange(*seq._indices)
+        return seq
+
+
+    def __len__(self): # @DuplicatedSignature
+        return len(self._xrange)
+
+    cpdef bytes getStr(self):
+        return b''.join([x for x in self])
+
+    def __iter__(self): # @DuplicatedSignature
+        return (self.wrapped.getSymbolAt(x) for x in self._xrange)
+
+    cpdef int _posInWrapped(self, int position) except *:
+        return self._xrange[position]
+
+
+    id = property(getId,setId, None)
+
+cdef dict _comp={b'a': b't', b'c': b'g', b'g': b'c', b't': b'a',
+ b'r': b'y', b'y': b'r', b'k': b'm', b'm': b'k',
+ b's': b's', b'w': b'w', b'b': b'v', b'd': b'h',
+ b'h': b'd', b'v': b'b', b'n': b'n', b'u': b'a',
+ b'-': b'-'}
+
+
+cdef class DNAComplementSequence(WrappedBioSequence):
+    """
+    Class used to represent a reverse complemented DNA sequence. Usually instances
+    of this class are produced by using the :py:meth:`NucSequence.complement` method.
+    """
+
+    def __init__(self, object reference, # @DuplicatedSignature
+                 bytes id=None,
+                 bytes definition=None,
+                 **info):
+        """
+        :param reference: the nucleotide sequence to reverse complement
+        :param id: identifier overriding the default ``<wrapped id>_CMP``
+        :param definition: definition overriding the one of `reference`
+        :param info: annotations specific to the complemented sequence
+        """
+        # id and definition are forwarded: they used to be replaced by None,
+        # which silently discarded the values given by the caller (clone()
+        # relies on them being honoured).
+        WrappedBioSequence.__init__(self,reference,id=id,definition=definition,**info)
+        assert reference.isNucleotide()
+        self._info[b'complemented']=True
+        # Quality values follow the sequence: they are reversed as well.
+        if hasattr(reference,'quality'):
+            self.quality = reference.quality[::-1]
+
+
+    cpdef bytes getId(self):
+        d = self._id or (b"%s_CMP" % self.wrapped.id)
+        return d
+
+    cpdef setId(self, bytes value):
+        '''
+        Sequence identifier setter.
+
+        :param value: the new sequence identifier
+        :type value: C{str}
+        :return: C{None}
+        '''
+        WrappedBioSequence.setId(self,value)
+
+    def __len__(self): # @DuplicatedSignature
+        return len(self._wrapped)
+
+    cpdef bytes getStr(self):
+        return b''.join([x for x in self])
+
+    def __iter__(self): # @DuplicatedSignature
+        return (self.getSymbolAt(x) for x in xrange(len(self)))
+
+    cpdef int _posInWrapped(self, int position) except *:
+        # Position p maps to the p-th symbol counted from the end.
+        return -(position+1)
+
+    cpdef getSymbolAt(self, int position):
+        # `_comp` is the module-level complement table.
+        return _comp[self.wrapped.getSymbolAt(self.posInWrapped(position))]
+
+    cpdef object complement(self):
+        """
+        The :py:meth:`complement` method of the :py:class:`DNAComplementSequence` class actually
+        returns the wrapped sequence. Effectively the reverse complement of a reverse
+        complemented sequence is the initial sequence.
+        """
+        return self.wrapped
+
+    id = property(getId,setId, None)
+
+cdef set _iupac=set([b'r', b'y', b'k', b'm',
+ b's', b'w', b'b', b'd',
+ b'h', b'v', b'n',
+ b'R', b'Y', b'K', b'M',
+ b'S', b'W', b'B', b'D',
+ b'H', b'V', b'N'])
+
+#cdef char *_iupac=b"acgtrykmswbdhvnu-"
+
+cdef set _nuc = set([b'a', b'c', b'g', b't',b'u',b'A', b'C', b'G', b'T',b'U',b'-'])
+
+#cdef char *_nuc=b"acgt-"
+
+cpdef bint _isNucSeq(bytes text):
+    # Tell whether `text` looks like a nucleotide sequence: every symbol must
+    # belong to `_nuc` or `_iupac`, and the number of `_nuc` symbols must
+    # exceed four fifths of the length (integer arithmetic).
+    cdef int plain
+    cdef int total
+    cdef int threshold
+
+    total = len(text)
+    threshold = total * 4 / 5
+    plain = 0
+
+    for symbol in text:
+        if symbol in _nuc:
+            plain+=1
+        elif symbol not in _iupac:
+            # One symbol outside both alphabets settles the answer.
+            return False
+    return plain > threshold
+
+
+cdef object _bioSeqGenerator(bytes id,
+                             bytes seq,
+                             bytes definition,
+                             bytes rawinfo,
+                             bytes rawparser,
+                             dict info):
+    # Pick the sequence class from the content of `seq`, then build the
+    # instance with the arguments passed through unchanged.
+    seqclass = NucSequence if _isNucSeq(seq) else AASequence
+    return seqclass(id,seq,definition,rawinfo,rawparser,**info)
+
+
+def bioSeqGenerator(bytes id,
+ bytes seq,
+ bytes definition=None,
+ bytes rawinfo=None,
+ bytes rawparser=__default_raw_parser,
+ **info):
+ """
+ Build a new sequence instance of the appropriate class, chosen between:
+
+ - :py:class:`NucSequence`
+ - :py:class:`AASequence`
+
+ The sequence is instantiated as a :py:class:`NucSequence` if `seq` contains
+ only nucleotide or IUPAC ambiguity symbols and more than 80% of
+ *A*, *C*, *G*, *T*, *U* or *-* symbols, in upper or lower case.
+ Otherwise, the new sequence is instantiated as an
+ :py:class:`AASequence`.
+
+
+
+ :param id: sequence identifier
+ :type id: `str`
+
+ :param seq: the sequence
+ :type seq: `str`
+
+ :param definition: sequence definition (optional)
+ :type definition: `str`
+
+ :param rawinfo: a text containing a set of key=value; patterns
+ :type rawinfo: `str`
+
+ :param rawparser: a text describing a regular pattern template
+ used to parse rawinfo
+ :type rawparser: `str`
+
+ :param info: extra named parameters can be added to associate complementary
+ data to the sequence
+ """
+ return _bioSeqGenerator(id,seq,definition,rawinfo,rawparser,info)
diff --git a/src/obitools/align/__init__.py b/src/obitools/align/__init__.py
new file mode 100644
index 0000000..1e3e8d6
--- /dev/null
+++ b/src/obitools/align/__init__.py
@@ -0,0 +1,15 @@
+
+
+from _nws import NWS
+from _upperbond import indexSequences
+from _lcs import LCS,lenlcs,ALILEN,MAXLEN,MINLEN
+from _assemble import DirectAssemble, ReverseAssemble
+from _qsassemble import QSolexaDirectAssemble,QSolexaReverseAssemble
+from _rassemble import RightDirectAssemble as RightReverseAssemble
+from _qsrassemble import QSolexaRightDirectAssemble,QSolexaRightReverseAssemble
+from _freeendgap import FreeEndGap
+from _freeendgapfm import FreeEndGapFullMatch
+from _upperbond import isLCSReachable
+from _codonnws import CodonNWS
+
+
diff --git a/src/obitools/align/_assemble.pxd b/src/obitools/align/_assemble.pxd
new file mode 100644
index 0000000..2e4359f
--- /dev/null
+++ b/src/obitools/align/_assemble.pxd
@@ -0,0 +1,10 @@
+from _nws cimport *
+
+# Declaration of the NWS subclass implemented in _assemble.pyx.
+cdef class DirectAssemble(NWS):
+ # Best score found in the last column of the matrix (j == hSeq.length)
+ # and the row at which it was found; both are set by doAlignment().
+ cdef double ysmax
+ cdef int ymax
+
+ # `except? 0`: a return value of 0 makes the caller check for a pending exception.
+ cdef double doAlignment(self) except? 0
+
+# Adds no C-level attribute or method to DirectAssemble.
+cdef class ReverseAssemble(DirectAssemble):
+ pass
\ No newline at end of file
diff --git a/src/obitools/align/_assemble.pyx b/src/obitools/align/_assemble.pyx
new file mode 100644
index 0000000..849cd0a
--- /dev/null
+++ b/src/obitools/align/_assemble.pyx
@@ -0,0 +1,169 @@
+'''
+Created on 6 Nov. 2009
+
+ at author: coissac
+'''
+#@PydevCodeAnalysisIgnore
+
+from _assemble cimport *
+
+
+# Alignment class derived from NWS. Compared with the initialisation done
+# here for the first column, the first row is free (score 0), and the final
+# score is the best value found in the last column rather than the value of
+# the bottom-right cell.
+cdef class DirectAssemble(NWS):
+
+ def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2):
+ NWS.__init__(self,match,mismatch,opengap,extgap)
+ self.ysmax=0
+ self.ymax=0
+
+ # Fill the matrix if needed and return the best score of the last column
+ # (self.ysmax); the row where it was found is kept in self.ymax.
+ cdef double doAlignment(self) except? 0:
+ cdef int i # vertical index
+ cdef int j # horizontal index
+ cdef int idx
+ cdef int idx0
+ cdef int idx1
+ cdef int jump
+ cdef int delta
+ cdef double score
+ cdef double scoremax
+ cdef int path
+
+
+ if self.needToCompute:
+ self.allocate()
+ self.reset()
+ self.ysmax=0
+ self.ymax=0
+
+ # First row: score 0 for every column, path = number of columns back
+ # to the origin.
+ for j in range(1,self.hSeq.length+1):
+ idx = self.index(j,0)
+ self.matrix.matrix[idx].score = 0
+ self.matrix.matrix[idx].path = j
+
+ # First column: one gap opening plus (i-1) extensions.
+ for i in range(1,self.vSeq.length+1):
+ idx = self.index(0,i)
+ self.matrix.matrix[idx].score = self._opengap + (self._extgap * (i-1))
+ self.matrix.matrix[idx].path = -i
+
+ # idx0 / idx1 are running indices replacing the index() calls kept in
+ # the commented-out lines below: idx0 stands for index(j-1,i-1) and
+ # idx1 for index(j,i).
+ idx0=self.index(-1,0)
+ idx1=self.index(0,1)
+ for i in range(1,self.vSeq.length+1):
+ idx0+=1
+ idx1+=1
+ for j in range(1,self.hSeq.length+1):
+
+ # 1 - came from diagonal
+ #idx = self.index(j-1,i-1)
+ idx = idx0
+ # print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),
+ scoremax = self.matrix.matrix[idx].score + \
+ self.matchScore(j,i)
+ path = 0
+
+ # print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),
+
+ # 2 - open horizontal gap
+ # idx = self.index(j-1,i)
+ idx = idx1 - 1
+ score = self.matrix.matrix[idx].score+ \
+ self._opengap
+ if score > scoremax :
+ scoremax = score
+ path = +1
+
+ # 3 - open vertical gap
+ # idx = self.index(j,i-1)
+ idx = idx0 + 1
+ score = self.matrix.matrix[idx].score + \
+ self._opengap
+ if score > scoremax :
+ scoremax = score
+ path = -1
+
+ # 4 - extend horizontal gap
+ # bestHJump[i] is the column where a horizontal gap was last opened
+ # on row i (set below when path == +1); negative values are skipped.
+ jump = self.matrix.bestHJump[i]
+ if jump >= 0:
+ idx = self.index(jump,i)
+ delta = j-jump
+ score = self.matrix.matrix[idx].score + \
+ self._extgap * delta
+ if score > scoremax :
+ scoremax = score
+ path = delta+1
+
+ # 5 - extend vertical gap
+ # bestVJump[j] is the row where a vertical gap was last opened on
+ # column j (set below when path == -1).
+ jump = self.matrix.bestVJump[j]
+ if jump >= 0:
+ idx = self.index(j,jump)
+ delta = i-jump
+ score = self.matrix.matrix[idx].score + \
+ self._extgap * delta
+ if score > scoremax :
+ scoremax = score
+ path = -delta-1
+
+ # idx = self.index(j,i)
+ idx = idx1
+ self.matrix.matrix[idx].score = scoremax
+ self.matrix.matrix[idx].path = path
+
+ if path == -1:
+ self.matrix.bestVJump[j]=i
+ elif path == +1 :
+ self.matrix.bestHJump[i]=j
+
+ # Track the best score of the last column and its row.
+ if j==self.hSeq.length and scoremax > self.ysmax:
+ self.ysmax=scoremax
+ self.ymax=i
+ idx0+=1
+ idx1+=1
+
+ self.sequenceChanged=False
+ self.scoreChanged=False
+
+ return self.ysmax
+
+ # Rebuild the alignment path into self.path, walking back from the cell
+ # (hSeq.length, ymax) to the origin. A path value of 0 is a diagonal step,
+ # a negative value moves up by that many rows and a positive value moves
+ # left by that many columns.
+ cdef void backtrack(self):
+ #cdef list path=[]
+ cdef int i
+ cdef int j
+ cdef int p
+
+ self.doAlignment()
+ i=self.ymax
+ j=self.hSeq.length
+ self.path=allocatePath(i,j+1,self.path)
+
+ # Rows below ymax are recorded as one (negative) vertical step.
+ # NOTE(review): assumes allocatePath() leaves path.length at 0 - confirm.
+ if self.ymax<self.vSeq.length:
+ self.path.path[self.path.length]=self.ymax-self.vSeq.length
+ self.path.length+=1
+
+ while (i or j):
+ p=self.matrix.matrix[self.index(j,i)].path
+ self.path.path[self.path.length]=p
+ self.path.length+=1
+ #path.append(p)
+ if p==0:
+ i-=1
+ j-=1
+ elif p < 0:
+ i+=p
+ else:
+ j-=p
+
+ #path.reverse()
+ #reversePath(self.path)
+ self.path.hStart=0
+ self.path.vStart=0
+ #return 0,0,path
+
+
+# Variant of DirectAssemble whose second sequence is stored complemented.
+cdef class ReverseAssemble(DirectAssemble):
+
+ # seqB: reading returns the sequence wrapped by the stored vertical
+ # sequence; writing stores seq.complement() as the vertical sequence,
+ # refreshes its C representation and flags the sequences as changed.
+ property seqB:
+ def __get__(self):
+ return self.verticalSeq.wrapped
+
+ def __set__(self, seq):
+ self.sequenceChanged=True
+ self.verticalSeq=seq.complement()
+ self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
diff --git a/src/obitools/align/_codonnws.pxd b/src/obitools/align/_codonnws.pxd
new file mode 100644
index 0000000..ddd338c
--- /dev/null
+++ b/src/obitools/align/_codonnws.pxd
@@ -0,0 +1,15 @@
+from _nws cimport *
+
+# Declaration of the NWS subclass implemented in _codonnws.pyx.
+cdef class CodonNWS(NWS):
+ #cdef double* _aamatrix
+ # NOTE(review): presumably flags telling whether each of the two sequences
+ # is in phase with a reading frame - confirm in _codonnws.pyx.
+ cdef int _phasedA
+ cdef int _phasedB
+
+ cdef double matchCodon(self,int h, int v)
+ # `except? 0`: a return value of 0 makes the caller check for a pending exception.
+ cdef double doAlignment(self) except? 0
+ cdef void backtrack(self)
+ cdef inline int colindex(self, int idx)
+ cdef inline int rowindex(self, int idx)
+
+
+
diff --git a/src/obitools/align/_codonnws.pyx b/src/obitools/align/_codonnws.pyx
new file mode 100644
index 0000000..f8dec6b
--- /dev/null
+++ b/src/obitools/align/_codonnws.pyx
@@ -0,0 +1,1589 @@
+'''
+Created on 6 Nov. 2009
+
+@author: coissac
+'''
+#@PydevCodeAnalysisIgnore
+
+from _codonnws cimport *
+
+
+#TODO: change functions for translation and BLOSUM scores
+
+#ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt
+#
+#Standard genetic code
+# name "Standard" ,
+# name "SGC0" ,
+# id 1 ,
+# ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
+# sncbieaa "---M---------------M---------------M----------------------------"
+# -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
+# -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
+# -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
+
+#TODO: the translation is hard-wired to the standard genetic code; make it configurable.
+cdef inline int _baseRank(char base):
+    # Rank of a lower-case nucleotide in the order used by the NCBI
+    # genetic-code tables (t, c, a, g); -1 for any other character.
+    if base == 't':
+        return 0
+    elif base == 'c':
+        return 1
+    elif base == 'a':
+        return 2
+    elif base == 'g':
+        return 3
+    return -1
+
+
+cdef char _translate(char c1, char c2, char c3):
+    # Translate one codon with the standard genetic code (NCBI table 1).
+    #
+    # c1, c2, c3 are the first, second and third base of the codon as
+    # lower-case characters ('a', 'c', 'g', 't').  The result is the
+    # lower-case one-letter amino acid, or '*' for a stop codon and for
+    # any codon containing another character.
+    #
+    # The table is the `ncbieaa` string of the standard code, lower-cased;
+    # its entries are ordered by Base1, then Base2, then Base3, each in
+    # the order t, c, a, g.  Using the table also restores the four
+    # leucine codons ct?, which the former if/elif chain never reached
+    # (its c1=='c' branch tested c2=='g' twice and never c2=='t').
+    cdef char* aminoacids = b"ffllssssyy**cc*wllllpppphhqqrrrriiimttttnnkkssrrvvvvaaaaddeegggg"
+    cdef int r1 = _baseRank(c1)
+    cdef int r2 = _baseRank(c2)
+    cdef int r3 = _baseRank(c3)
+
+    if r1 < 0 or r2 < 0 or r3 < 0:
+        return '*'
+
+    return aminoacids[16*r1 + 4*r2 + r3]
+
+#http://www.ncbi.nlm.nih.gov/Class/FieldGuide/BLOSUM62.txt
+#
+## Matrix made by matblas from blosum62.iij
+## * column uses minimum score
+## BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
+## Blocks Database = /data/blocks_5.0/blocks.dat
+## Cluster Percentage: >= 62
+## Entropy = 0.6979, Expected = -0.5209
+# A R N D C Q E G H I L K M F P S T W Y V B Z X *
+#A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
+#R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
+#N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
+#D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
+#C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
+#Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
+#E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
+#G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
+#H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
+#I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
+#L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
+#K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
+#M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
+#F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
+#P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
+#S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
+#T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
+#W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
+#Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
+#V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
+#B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
+#Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
+#X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
+#* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
+#
+
+# BLOSUM62 scores (1/2 bit units).  Rows and columns are both in the order
+#   a r n d c q e g h i l k m f p s t w y v b z x *
+# i.e. the order of the NCBI BLOSUM62.txt file, in lower case.
+cdef tuple _BLOSUM62_ROWS = (
+    #  a   r   n   d   c   q   e   g   h   i   l   k   m   f   p   s   t   w   y   v   b   z   x   *
+    (  4, -1, -2, -2,  0, -1, -1,  0, -2, -1, -1, -1, -1, -2, -1,  1,  0, -3, -2,  0, -2, -1,  0, -4),  # a
+    ( -1,  5,  0, -2, -3,  1,  0, -2,  0, -3, -2,  2, -1, -3, -2, -1, -1, -3, -2, -3, -1,  0, -1, -4),  # r
+    ( -2,  0,  6,  1, -3,  0,  0,  0,  1, -3, -3,  0, -2, -3, -2,  1,  0, -4, -2, -3,  3,  0, -1, -4),  # n
+    ( -2, -2,  1,  6, -3,  0,  2, -1, -1, -3, -4, -1, -3, -3, -1,  0, -1, -4, -3, -3,  4,  1, -1, -4),  # d
+    (  0, -3, -3, -3,  9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4),  # c
+    ( -1,  1,  0,  0, -3,  5,  2, -2,  0, -3, -2,  1,  0, -3, -1,  0, -1, -2, -1, -2,  0,  3, -1, -4),  # q
+    ( -1,  0,  0,  2, -4,  2,  5, -2,  0, -3, -3,  1, -2, -3, -1,  0, -1, -3, -2, -2,  1,  4, -1, -4),  # e
+    (  0, -2,  0, -1, -3, -2, -2,  6, -2, -4, -4, -2, -3, -3, -2,  0, -2, -2, -3, -3, -1, -2, -1, -4),  # g
+    ( -2,  0,  1, -1, -3,  0,  0, -2,  8, -3, -3, -1, -2, -1, -2, -1, -2, -2,  2, -3,  0,  0, -1, -4),  # h
+    ( -1, -3, -3, -3, -1, -3, -3, -4, -3,  4,  2, -3,  1,  0, -3, -2, -1, -3, -1,  3, -3, -3, -1, -4),  # i
+    ( -1, -2, -3, -4, -1, -2, -3, -4, -3,  2,  4, -2,  2,  0, -3, -2, -1, -2, -1,  1, -4, -3, -1, -4),  # l
+    ( -1,  2,  0, -1, -3,  1,  1, -2, -1, -3, -2,  5, -1, -3, -1,  0, -1, -3, -2, -2,  0,  1, -1, -4),  # k
+    ( -1, -1, -2, -3, -1,  0, -2, -3, -2,  1,  2, -1,  5,  0, -2, -1, -1, -1, -1,  1, -3, -1, -1, -4),  # m
+    ( -2, -3, -3, -3, -2, -3, -3, -3, -1,  0,  0, -3,  0,  6, -4, -2, -2,  1,  3, -1, -3, -3, -1, -4),  # f
+    ( -1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4,  7, -1, -1, -4, -3, -2, -2, -1, -2, -4),  # p
+    (  1, -1,  1,  0, -1,  0,  0,  0, -1, -2, -2,  0, -1, -2, -1,  4,  1, -3, -2, -2,  0,  0,  0, -4),  # s
+    (  0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1,  1,  5, -2, -2,  0, -1, -1,  0, -4),  # t
+    ( -3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1,  1, -4, -3, -2, 11,  2, -3, -4, -3, -2, -4),  # w
+    ( -2, -2, -2, -3, -2, -1, -2, -3,  2, -1, -1, -2, -1,  3, -3, -2, -2,  2,  7, -1, -3, -2, -1, -4),  # y
+    (  0, -3, -3, -3, -1, -2, -2, -3, -3,  3,  1, -2,  1, -1, -2, -2,  0, -3, -1,  4, -3, -2, -1, -4),  # v
+    ( -2, -1,  3,  4, -3,  0,  1, -1,  0, -3, -4,  0, -3, -3, -2,  0, -1, -4, -3, -3,  4,  1, -1, -4),  # b
+    ( -1,  0,  0,  1, -3,  3,  4, -2,  0, -3, -3,  1, -1, -3, -1,  0, -1, -3, -2, -2,  1,  4, -1, -4),  # z
+    (  0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2,  0,  0, -2, -1, -1, -1, -1, -1, -4),  # x
+    ( -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,  1),  # *
+)
+
+
+cdef inline int _aminoAcidRank(char aa):
+    # Row/column of a lower-case amino-acid letter in _BLOSUM62_ROWS;
+    # -1 when the character is not part of the matrix alphabet.
+    cdef char* alphabet = b"arndcqeghilkmfpstwyvbzx*"
+    cdef int rank
+
+    for rank in range(24):
+        if alphabet[rank] == aa:
+            return rank
+    return -1
+
+
+cdef double _blosum62(char c1_1, char c1_2, char c1_3, char c2_1, char c2_2, char c2_3):
+    # BLOSUM62 score of the amino acids encoded by two codons.
+    #
+    # (c1_1, c1_2, c1_3) and (c2_1, c2_2, c2_3) are the three bases of
+    # each codon, passed to _translate() in that order.  The two amino
+    # acids are then looked up in the BLOSUM62 matrix.
+    #
+    # Returns 0 if a translated letter is not in the matrix alphabet;
+    # this is the value the former if-chain returned when no branch
+    # matched.
+    cdef char aa1 = _translate(c1_1, c1_2, c1_3)
+    cdef char aa2 = _translate(c2_1, c2_2, c2_3)
+    cdef int row = _aminoAcidRank(aa1)
+    cdef int col = _aminoAcidRank(aa2)
+
+    if row < 0 or col < 0:
+        return 0
+
+    return _BLOSUM62_ROWS[row][col]
+
+
+cdef class CodonNWS(NWS):
+    # Global aligner extending NWS with one extra move: three bases can be
+    # aligned against three bases in a single step, scored by matchCodon()
+    # (per-base match/mismatch score plus the BLOSUM62 score of the two
+    # translated codons).
+    #
+    # In this class the `path` field of a matrix cell holds the linear
+    # index of the cell it was reached from.
+
+    def __init__(self,match=2,mismatch=-3,opengap=-4,extgap=-1, phasedA = -1, phasedB = -1):
+        # match, mismatch, opengap, extgap are forwarded to NWS.__init__.
+        # phasedA / phasedB: -1 disables the phase constraint; any other
+        # value is stored modulo 3.  doAlignment() allows a codon step
+        # only where the horizontal index j satisfies j%3 == phasedA and
+        # the vertical index i satisfies i%3 == phasedB.
+        NWS.__init__(self,match, mismatch, opengap, extgap)
+        self._phasedA = -1 if phasedA == -1 else phasedA%3
+        self._phasedB = -1 if phasedB == -1 else phasedB%3
+
+    cdef double matchCodon(self, int h, int v):
+        # Score of aligning the codon ending at horizontal position h
+        # (bases h-3, h-2, h-1 of hSeq) with the codon ending at vertical
+        # position v (bases v-3, v-2, v-1 of vSeq).
+        cdef double score = 0
+        cdef double match
+        cdef double bl
+        cdef int i
+
+        # Nucleotide part: the three base pairs scored independently.
+        for i in range(3):
+            match = iupacPartialMatch(self.hSeq.sequence[h-i-1],self.vSeq.sequence[v-i-1])
+            score += match * self._match + (1-match) * self._mismatch
+
+        # Amino-acid part: _blosum62 expects the bases of each codon in
+        # reading order (first, second, third base).
+        bl = _blosum62(self.hSeq.sequence[h-3], self.hSeq.sequence[h-2], self.hSeq.sequence[h-1],
+                       self.vSeq.sequence[v-3], self.vSeq.sequence[v-2], self.vSeq.sequence[v-1])
+
+        score += bl
+        return score
+
+    cdef inline int colindex(self, int idx):
+        # Column of the cell with linear index idx, for rows that are
+        # _hlen()+1 cells wide.
+        return idx%(self._hlen()+1)
+
+    cdef inline int rowindex(self, int idx):
+        # Row of the cell with linear index idx, for rows that are
+        # _hlen()+1 cells wide.
+        return idx//(self._hlen()+1)
+
+
+    # The meaning of the data stored in the path matrix is changed here:
+    # each cell stores the index of the cell it originates from.
+
+    cdef double doAlignment(self) except? 0:
+        # Fill the matrix (only when needToCompute is true) and return the
+        # score stored in the cell (_hlen(), _vlen()).
+        cdef int i  # vertical index
+        cdef int j  # horizontal index
+        cdef int idx
+        cdef int jump
+        cdef int delta
+        cdef double score
+        cdef double scoremax
+        cdef double contrib
+        cdef int path
+
+
+        if self.needToCompute:
+            self.allocate()
+            self.reset()
+
+            # First row and first column: one gap of growing length,
+            # each cell pointing back to cell index 0.
+            for j in range(1,self._hlen()+1):
+                idx = self.index(j,0)
+                self.matrix.matrix[idx].score = self._opengap + (self._extgap * (j-1))
+                self.matrix.matrix[idx].path  = 0
+
+            for i in range(1,self._vlen()+1):
+                idx = self.index(0,i)
+                self.matrix.matrix[idx].score = self._opengap + (self._extgap * (i-1))
+                self.matrix.matrix[idx].path  = 0
+
+            for i in range(1,self._vlen()+1):
+                for j in range(1,self._hlen()+1):
+
+                    # 1 - came from diagonal
+                    idx = self.index(j-1,i-1)
+                    scoremax = self.matrix.matrix[idx].score + \
+                               self.matchScore(j,i)
+                    path = idx
+
+                    # 1.1 - came from diagonal by aligning a codon with a codon
+                    #       (only where both phase constraints are satisfied)
+                    if (j-3)>=0 and (i-3)>=0 and (self._phasedB==-1 or (i%3)==self._phasedB) and (self._phasedA==-1 or (j%3)==self._phasedA):
+                        idx = self.index(j-3,i-3)
+                        contrib = self.matchCodon(j,i)
+                        score = self.matrix.matrix[idx].score + \
+                                contrib
+
+                        if score > scoremax :
+                            scoremax = score
+                            path = idx
+
+                    # 2 - open horizontal gap
+                    idx = self.index(j-1,i)
+                    score = self.matrix.matrix[idx].score + \
+                            self._opengap
+                    if score > scoremax :
+                        scoremax = score
+                        path = idx
+
+                    # 3 - open vertical gap
+                    idx = self.index(j,i-1)
+                    score = self.matrix.matrix[idx].score + \
+                            self._opengap
+                    if score > scoremax :
+                        scoremax = score
+                        path = idx
+
+                    # 4 - extend horizontal gap
+                    jump = self.matrix.bestHJump[i]
+                    if jump >= 0:
+                        idx = self.index(jump,i)
+                        delta = j-jump
+                        score = self.matrix.matrix[idx].score + \
+                                self._extgap * delta
+                        if score > scoremax :
+                            scoremax = score
+                            path = idx
+
+                    # 5 - extend vertical gap
+                    jump = self.matrix.bestVJump[j]
+                    if jump >= 0:
+                        idx = self.index(j,jump)
+                        delta = i-jump
+                        score = self.matrix.matrix[idx].score + \
+                                self._extgap * delta
+                        if score > scoremax :
+                            scoremax = score
+                            path = idx
+
+                    idx = self.index(j,i)
+                    self.matrix.matrix[idx].score = scoremax
+                    self.matrix.matrix[idx].path  = path
+
+                    # if the retained move comes from the adjacent cell of
+                    # a gap opening, remember this position for later
+                    # gap extensions
+                    if path == self.index(j,i-1):
+                        self.matrix.bestVJump[j]=i
+                    elif path == self.index(j-1,i):
+                        self.matrix.bestHJump[i]=j
+
+            self.sequenceChanged=False
+            self.scoreChanged=False
+
+        idx = self.index(self._hlen(),self._vlen())
+        return self.matrix.matrix[idx].score
+
+
+    cdef void backtrack(self):
+        # Rebuild self.path by following the origin-cell indices from the
+        # cell (_hlen(), _vlen()) back to (0, 0).  Each move is converted
+        # to a path entry:
+        #   0  -> one diagonal position (a codon step gives three 0 entries),
+        #   >0 -> horizontal gap of that length,
+        #   <0 -> vertical gap of that length.
+        # Entries are appended in walk order (end of the alignment first).
+        cdef int i
+        cdef int j
+        cdef int p
+        cdef int idx
+        cdef int ori_i
+        cdef int ori_j
+
+        self.doAlignment()
+        i=self._vlen()
+        j=self._hlen()
+        self.path=allocatePath(i,j,self.path)
+
+        # Defined value in case the inconsistency branch below is hit on
+        # the very first move.
+        p = 0
+
+        while (i or j):
+            idx=self.matrix.matrix[self.index(j,i)].path
+            ori_j = self.colindex(idx)
+            ori_i = self.rowindex(idx)
+
+            if i-ori_i == 3 and j-ori_j == 3:
+                # codon step: two extra 0 entries here, the third one is
+                # written by the common code at the end of the loop
+                p = 0
+
+                self.path.path[self.path.length]=p
+                self.path.length+=1
+
+                self.path.path[self.path.length]=p
+                self.path.length+=1
+
+            elif i-ori_i == 1 and j-ori_j == 1:
+                # single-base match / mismatch
+                p = 0
+            elif i-ori_i == 0:
+                # horizontal gap
+                p = (j-ori_j)
+            elif j-ori_j == 0:
+                # vertical gap
+                p = -(i-ori_i)
+            else:
+                # The origin cell is not reachable by any known move:
+                # the matrix is inconsistent.  p keeps its previous value.
+                print "badaboum !"
+
+            i = ori_i
+            j = ori_j
+
+            self.path.path[self.path.length]=p
+            self.path.length+=1
+
+        self.path.hStart=0
+        self.path.vStart=0
+
+
+    property match:
+        # Score of a matching base pair; changing it marks the scores as changed.
+        def __get__(self):
+            return self._match
+
+        def __set__(self,match):
+            self._match=match
+            self.scoreChanged=True
+
+    property mismatch:
+        # Score of a mismatching base pair; changing it marks the scores as changed.
+        def __get__(self):
+            return self._mismatch
+
+        def __set__(self,mismatch):
+            self._mismatch=mismatch
+            self.scoreChanged=True
+
+
+
+
diff --git a/src/obitools/align/_dynamic.pxd b/src/obitools/align/_dynamic.pxd
new file mode 100644
index 0000000..c268c64
--- /dev/null
+++ b/src/obitools/align/_dynamic.pxd
@@ -0,0 +1,90 @@
+# C memory primitives used by the alignment code.
+# `except NULL` tells Cython to treat a NULL return value as an error return.
+# NOTE(review): the C prototypes take size_t sizes; they are declared as int
+# here - confirm this is acceptable for the sequence lengths in use.
+cdef import from "stdlib.h":
+ void* malloc(int size) except NULL
+ void* realloc(void* chunk,int size) except NULL
+ void free(void* chunk)
+
+# Raw memory fill / copy helpers.
+cdef import from "string.h":
+ void bzero(void *s, size_t n)
+ void memset(void* chunk,int car,int length)
+ void memcpy(void* s1, void* s2, int n)
+
+# One cell of the dynamic programming matrix: the score reached in the cell
+# and the path information used when backtracking (its encoding is chosen
+# by each aligner).
+cdef struct AlignCell :
+ double score
+ int path
+
+# Dynamic programming workspace. `matrix` holds `msize` cells; bestVJump is
+# sized by `hsize` and bestHJump by `vsize` (see allocateMatrix in
+# _dynamic.pyx).
+cdef struct AlignMatrix :
+ AlignCell* matrix
+ int* bestVJump
+ int* bestHJump
+ int msize
+ int vsize
+ int hsize
+
+
+
+# Create a workspace, or grow the one passed as `matrix`.
+cdef AlignMatrix* allocateMatrix(int hsize, int vsize,AlignMatrix *matrix=?)
+
+# Release a workspace and its buffers.
+cdef void freeMatrix(AlignMatrix* matrix)
+
+# Clear the cells and the jump tables of a workspace.
+cdef void resetMatrix(AlignMatrix* matrix)
+
+
+# C copy of a sequence: `length` characters stored in `sequence`, optional
+# per-position `quality` values (valid when `hasQuality` is set), both buffers
+# having room for `buffsize` items.
+cdef struct alignSequence:
+ long length
+ long buffsize
+ bint hasQuality
+ char* sequence
+ double* quality
+
+# Copy `bioseq` into a new or reused alignSequence.
+cdef alignSequence* allocateSequence(object bioseq, alignSequence* seq=?) except *
+
+cdef void freeSequence(alignSequence* seq)
+
+# Alignment path: `length` used entries in `path`, which has room for
+# `buffsize` entries; vStart/hStart are start offsets in each sequence.
+cdef struct alignPath:
+ long length
+ long buffsize
+ long vStart
+ long hStart
+ long *path
+
+# Create or grow a path able to hold l1+l2 entries; the path is emptied.
+cdef alignPath* allocatePath(long l1,long l2,alignPath* path=?)
+
+# Reverse the used part of the path in place.
+cdef void reversePath(alignPath* path)
+
+
+cdef void freePath(alignPath* path)
+
+
+# IUPAC nucleotide code helpers (implemented in _dynamic.pyx).
+cdef int bitCount(int x)
+cpdef bint iupacMatch(unsigned char a, unsigned char b)
+cpdef double iupacPartialMatch(unsigned char a, unsigned char b)
+cpdef unsigned char encodeBase(unsigned char lettre)
+
+# C-level layout and method table of the base class of the aligners.
+cdef class DynamicProgramming:
+ # score matrix workspace
+ cdef AlignMatrix* matrix
+
+ # sequences as given by the caller (Python objects)
+ cdef object horizontalSeq
+ cdef object verticalSeq
+
+ # C copies of the two sequences and the last computed path
+ cdef alignSequence* hSeq
+ cdef alignSequence* vSeq
+ cdef alignPath* path
+
+ # gap opening / gap extension scores
+ cdef double _opengap
+ cdef double _extgap
+
+ # cached alignment object returned by __call__
+ cdef object alignment
+
+ # dirty flags: set when a sequence or a score parameter changes
+ cdef bint sequenceChanged
+ cdef bint scoreChanged
+
+ cdef int _vlen(self)
+ cdef int _hlen(self)
+ cdef int allocate(self) except -1
+ cdef double doAlignment(self) except? 0
+ cdef void reset(self)
+ cdef inline int index(self, int x, int y)
+ cdef inline bint _needToCompute(self)
+ cdef void backtrack(self)
+ cdef void clean(self)
+
diff --git a/src/obitools/align/_dynamic.pyx b/src/obitools/align/_dynamic.pyx
new file mode 100644
index 0000000..ada9f52
--- /dev/null
+++ b/src/obitools/align/_dynamic.pyx
@@ -0,0 +1,365 @@
+#@PydevCodeAnalysisIgnore
+'''
+Created on 14 sept. 2009
+
+@author: coissac
+'''
+
+from obitools import BioSequence
+from obitools.alignment import AlignedSequence
+from obitools.alignment import Alignment
+
+
+######
+#
+# Import standard memory management function to improve
+# efficiency of the alignment code
+#
+#
+
+from _dynamic cimport *
+
+cdef AlignMatrix* allocateMatrix(int hsize, int vsize,AlignMatrix *matrix=NULL):
+    # Return a workspace able to hold a (hsize+1) x (vsize+1) matrix.
+    # When `matrix` is NULL a new, empty workspace is created first; an
+    # existing one is only ever grown, never shrunk.
+    cdef int cells
+
+    # one extra row and column for the borders of the matrix
+    vsize+=1
+    hsize+=1
+    cells = hsize * vsize
+
+    if matrix is NULL:
+        matrix = <AlignMatrix*>malloc(sizeof(AlignMatrix))
+        matrix.matrix=NULL
+        matrix.bestVJump=NULL
+        matrix.bestHJump=NULL
+        matrix.msize=0
+        matrix.vsize=0
+        matrix.hsize=0
+
+    # bestVJump has one entry per column
+    if matrix.hsize < hsize:
+        matrix.bestVJump = <int*>realloc(matrix.bestVJump,hsize * sizeof(int))
+        matrix.hsize=hsize
+
+    # bestHJump has one entry per row
+    if matrix.vsize < vsize:
+        matrix.bestHJump = <int*>realloc(matrix.bestHJump,vsize * sizeof(int))
+        matrix.vsize=vsize
+
+    if matrix.msize < cells:
+        matrix.msize = cells
+        matrix.matrix = <AlignCell*>realloc(matrix.matrix, matrix.msize * sizeof(AlignCell))
+
+    return matrix
+
+cdef void freeMatrix(AlignMatrix* matrix):
+    # Release a workspace and its three buffers; a NULL workspace is ignored.
+    if matrix is NULL:
+        return
+
+    # free() ignores NULL pointers, so buffers that were never allocated
+    # need no special case
+    free(matrix.matrix)
+    free(matrix.bestVJump)
+    free(matrix.bestHJump)
+    free(matrix)
+
+cdef void resetMatrix(AlignMatrix* matrix):
+    # Zero every cell and fill both jump tables with 0xFF bytes (each int
+    # then reads as -1 on two's complement machines). NULL pointers are skipped.
+    if matrix is NULL:
+        return
+
+    if matrix.matrix is not NULL:
+        bzero(<void*>matrix.matrix, matrix.msize * sizeof(AlignCell))
+
+    if matrix.bestHJump is not NULL:
+        memset(<void*>matrix.bestHJump,255,matrix.vsize * sizeof(int))
+
+    if matrix.bestVJump is not NULL:
+        memset(<void*>matrix.bestVJump,255,matrix.hsize * sizeof(int))
+
+
+cdef alignSequence* allocateSequence(object bioseq, alignSequence* seq=NULL) except *:
+    # Copy `bioseq` into a C alignSequence.
+    #
+    # bioseq : object supporting len(), str(), `'quality' in bioseq` and
+    #          bioseq['quality'] (indexable by position).
+    # seq    : structure to reuse, or NULL to allocate a new one; its buffers
+    #          are grown when too small and never shrunk.
+    #
+    # Returns the filled structure. The sequence is stored lower-cased.
+    # Python exceptions raised while reading `bioseq` propagate (`except *`).
+    cdef bytes strseq
+    cdef int i
+
+    if seq is NULL:
+        seq = <alignSequence*>malloc(sizeof(alignSequence))
+        seq.length=0
+        seq.buffsize=0
+        seq.sequence=NULL
+        seq.quality=NULL
+        seq.hasQuality=False
+
+    seq.length=len(bioseq)
+    if seq.length > seq.buffsize:
+        seq.sequence = <char*>realloc(seq.sequence,sizeof(char)*seq.length)
+        seq.quality = <double*>realloc(seq.quality,sizeof(double)*seq.length)
+        seq.buffsize = seq.length
+
+    strseq = str(bioseq).lower()
+    memcpy(seq.sequence,<char*>strseq,seq.length)
+
+    if 'quality' in bioseq:
+        seq.hasQuality=True
+        quality=bioseq['quality']
+        for i in range(0,seq.length):
+            seq.quality[i]=<double>quality[i]
+    else:
+        # A reused structure may still hold the quality values of a previous
+        # sequence: clear the flag so they are not taken for the new one.
+        seq.hasQuality=False
+
+    return seq
+
+cdef void freeSequence(alignSequence* seq):
+    # Release a sequence copy and its buffers; a NULL pointer is ignored.
+    if seq is NULL:
+        return
+
+    # free() accepts NULL, so buffers that were never allocated are harmless
+    free(<void*>seq.sequence)
+    free(<void*>seq.quality)
+    free(seq)
+
+cdef alignPath* allocatePath(long l1,long l2,alignPath* path=NULL):
+    # Return an emptied path with room for at least l1+l2 entries.
+    # A NULL `path` is replaced by a new structure; an existing buffer is
+    # kept when it is already large enough.
+    cdef long needed=l1+l2
+
+    if path is NULL:
+        path = <alignPath*>malloc(sizeof(alignPath))
+        path.path=NULL
+        path.buffsize=0
+        path.length=0
+
+    if path.buffsize < needed:
+        path.buffsize=needed
+        path.path=<long*>realloc(path.path,sizeof(long)*needed)
+
+    # the path always starts empty, with both offsets at 0
+    path.length=0
+    path.vStart=0
+    path.hStart=0
+
+    return path
+
+cdef void reversePath(alignPath* path):
+    # Reverse in place the `length` used entries of path.path.
+    cdef long i
+    cdef long j
+
+    j=path.length
+    # `//` keeps this an integer division whatever division semantics the
+    # module is compiled with (`/` becomes a float under true division).
+    for i in range(path.length//2):
+        j-=1
+        path.path[i],path.path[j]=path.path[j],path.path[i]
+
+cdef void freePath(alignPath* path):
+    # Release a path and its entry buffer; a NULL pointer is ignored.
+    if path is NULL:
+        return
+
+    # free() accepts a NULL buffer
+    free(<void*>path.path)
+    free(<void*>path)
+
+
+# ASCII code of 'a': letters are mapped to table slots with `letter - aascii`.
+cdef int aascii = ord(b'a')
+# One bit-mask per lower-case letter 'a'..'z'; filled at the end of this module.
+cdef int _basecode[26]
+
+cdef int bitCount(int x):
+    # Number of bits set in x.
+    cdef int count=0
+
+    while x != 0:
+        # x & (x-1) clears the lowest bit still set
+        x = x & (x-1)
+        count+=1
+    return count
+
+cpdef bint iupacMatch(unsigned char a, unsigned char b):
+    # Tell whether two lower-case nucleotide codes are compatible, i.e.
+    # whether their _basecode bit-masks share at least one bit.
+    # '*' is handled as 'n'. Any character outside 'a'..'z' never matches
+    # (it used to index outside the 26-entry table).
+    cdef int ia
+    cdef int ib
+
+    if a==42: # * ascii code
+        a=110 # n ascii code
+
+    if b==42: # * ascii code
+        b=110 # n ascii code
+
+    ia = a - aascii
+    ib = b - aascii
+
+    if ia < 0 or ia > 25 or ib < 0 or ib > 25:
+        return False
+
+    return (_basecode[ia] & _basecode[ib]) != 0
+
+cpdef unsigned char encodeBase(unsigned char lettre):
+    # Return the _basecode bit-mask of a lower-case letter.
+    # Characters outside 'a'..'z' get 0, the value the table already uses
+    # for letters without a code, instead of an out-of-bounds read.
+    cdef int slot = lettre - aascii
+
+    if slot < 0 or slot > 25:
+        return 0
+    return _basecode[slot]
+
+cpdef double iupacPartialMatch(unsigned char a, unsigned char b):
+    # Partial match score of two lower-case nucleotide codes:
+    #     bits shared by both masks / (bits of a * bits of b)
+    # '*' is handled as 'n'. Returns 0.0 when a character is outside
+    # 'a'..'z' or has an empty mask, instead of reading outside the table or
+    # dividing by zero.
+    cdef int ia
+    cdef int ib
+    cdef int codeA
+    cdef int codeB
+    cdef int good
+    cdef int total
+
+    if a==42: # * ascii code
+        a=110 # n ascii code
+
+    if b==42: # * ascii code
+        b=110 # n ascii code
+
+    ia = a - aascii
+    ib = b - aascii
+    if ia < 0 or ia > 25 or ib < 0 or ib > 25:
+        return 0.0
+
+    codeA = _basecode[ia]
+    codeB = _basecode[ib]
+    good = bitCount(codeA & codeB)
+    total = bitCount(codeA) * bitCount(codeB)
+
+    # letters without a code have an empty mask
+    if total == 0:
+        return 0.0
+
+    return <double>good / total
+
+
+cdef class DynamicProgramming:
+
+    def __init__(self,opengap,extgap):
+        # Start with no sequence, no workspace and both dirty flags raised,
+        # then record the gap opening / extension scores.
+        self.sequenceChanged=True
+        self.scoreChanged=True
+
+        # C buffers are created lazily
+        self.matrix=NULL
+        self.hSeq=NULL
+        self.vSeq=NULL
+        self.path=NULL
+
+        # Python-side sequences, set through the seqA / seqB properties
+        self.horizontalSeq=None
+        self.verticalSeq=None
+
+        self._opengap=opengap
+        self._extgap=extgap
+
+ cdef int _vlen(self):
+ # Length of the vertical sequence. Reads self.vSeq without a NULL check,
+ # so seqB must have been set before.
+ return self.vSeq.length
+
+ cdef int _hlen(self):
+ # Length of the horizontal sequence. Reads self.hSeq without a NULL
+ # check, so seqA must have been set before.
+ return self.hSeq.length
+
+ cdef int allocate(self) except -1:
+ # Size the score matrix for the two current sequences.
+ # Returns 0; an AssertionError propagates when a sequence is missing.
+
+ assert self.horizontalSeq is not None,'Sequence A must be set'
+ assert self.verticalSeq is not None,'Sequence B must be set'
+
+ cdef long lenH=self._hlen()
+ cdef long lenV=self._vlen()
+
+ self.matrix=allocateMatrix(lenH,lenV,self.matrix)
+ return 0
+
+
+ cdef double doAlignment(self) except? 0:
+ # Empty in this base class.
+ pass
+
+ cdef bint _needToCompute(self):
+ # True when a sequence or a score parameter changed.
+ return self.scoreChanged or self.sequenceChanged
+
+ cdef void backtrack(self):
+ # Empty in this base class.
+ pass
+
+ property seqA:
+ # Horizontal sequence. Setting it raises sequenceChanged and refreshes
+ # the C copy held in self.hSeq.
+ def __get__(self):
+ return self.horizontalSeq
+
+ def __set__(self, seq):
+ self.sequenceChanged=True
+ self.horizontalSeq=seq
+ self.hSeq=allocateSequence(self.horizontalSeq,self.hSeq)
+
+ property seqB:
+ # Vertical sequence. Setting it raises sequenceChanged and refreshes
+ # the C copy held in self.vSeq.
+ def __get__(self):
+ return self.verticalSeq
+
+ def __set__(self, seq):
+ self.sequenceChanged=True
+ self.verticalSeq=seq
+ self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
+
+ property opengap:
+ # Gap opening score; setting it raises scoreChanged.
+ def __get__(self):
+ return self._opengap
+
+ def __set__(self,opengap):
+ self._opengap=opengap
+ self.scoreChanged=True
+
+ property extgap:
+ # Gap extension score; setting it raises scoreChanged.
+ def __get__(self):
+ return self._extgap
+
+ def __set__(self,extgap):
+ self._extgap=extgap
+ self.scoreChanged=True
+
+ property needToCompute:
+ # Read-only: same test as _needToCompute().
+ def __get__(self):
+ return self.scoreChanged or self.sequenceChanged
+
+ property score:
+ # Read-only: value returned by doAlignment().
+ def __get__(self):
+ return self.doAlignment()
+
+ cdef void reset(self):
+ # Raise scoreChanged and clear the score matrix and its jump tables.
+ self.scoreChanged=True
+ resetMatrix(self.matrix)
+
+ cdef inline int index(self, int x, int y):
+ # Linear index of cell (column x, row y); a row holds hlen+1 cells.
+ return (self._hlen()+1) * y + x
+
+    cdef void clean(self):
+        # Release every C buffer owned by the instance.
+        # Each pointer is set back to NULL so that calling clean() again
+        # (for instance explicitly, then from __dealloc__) cannot free the
+        # same memory twice.
+        freeMatrix(self.matrix)
+        self.matrix=NULL
+
+        freeSequence(self.hSeq)
+        self.hSeq=NULL
+
+        freeSequence(self.vSeq)
+        self.vSeq=NULL
+
+        freePath(self.path)
+        self.path=NULL
+
+    def __dealloc__(self):
+        # Release the C buffers when the object is destroyed.
+        self.clean()
+
+ def __call__(self):
+ # Compute the alignment when needed and return a clone of it.
+ # The path produced by backtrack() is read from its last entry to its
+ # first one: 0 is one aligned column, p>0 consumes p positions of the
+ # horizontal sequence and p<0 consumes -p positions of the vertical one.
+ # hgaps / vgaps collect [run length, gap length] pairs for the horizontal
+ # and vertical sequences, stored in the `gaps` attribute of the
+ # AlignedSequence objects.
+ # NOTE(review): the archive lost the indentation of this patch; the
+ # alignment is presumably built inside the `if self._needToCompute()`
+ # branch and only the final clone/return runs on every call - confirm
+ # against the original file.
+ cdef list hgaps=[]
+ cdef list vgaps=[]
+ cdef list b
+ cdef int hp=0
+ cdef int vp=0
+ cdef int lenh=0
+ cdef int lenv=0
+ cdef int h,v,p
+ cdef int i
+ cdef object ali
+ cdef double score
+
+ if self._needToCompute():
+ score = self.doAlignment()
+ self.backtrack()
+ for i in range(self.path.length-1,-1,-1):
+ p=self.path.path[i]
+ if p==0:
+ hp+=1
+ vp+=1
+ lenh+=1
+ lenv+=1
+ elif p>0:
+ hp+=p
+ lenh+=p
+ vgaps.append([vp,p])
+ vp=0
+ else:
+ vp-=p
+ lenv-=p
+ hgaps.append([hp,-p])
+ hp=0
+
+ # trailing run without gap
+ if hp:
+ hgaps.append([hp,0])
+ if vp:
+ vgaps.append([vp,0])
+
+ # keep only the aligned part when the path does not cover the sequence
+ if lenh < self._hlen():
+ hseq=self.horizontalSeq[self.path.hStart:self.path.hStart+lenh]
+ else:
+ hseq=self.horizontalSeq
+
+ hseq=AlignedSequence(hseq)
+ hseq.gaps=hgaps
+
+ if lenv < self._vlen():
+ vseq=self.verticalSeq[self.path.vStart:self.path.vStart+lenv]
+ else:
+ vseq=self.verticalSeq
+
+ vseq=AlignedSequence(vseq)
+ vseq.gaps=vgaps
+
+ ali=Alignment()
+ ali.append(hseq)
+ ali.append(vseq)
+
+ ali.score=score
+ self.alignment=ali
+ # callers get a copy, the cached alignment stays untouched
+ ali=self.alignment.clone()
+ ali.score=self.alignment.score
+ return ali
+
+
+
+
+# initialize iupac carray
+
+# Bit-mask of each letter 'a'..'z', copied into the C array _basecode at
+# import time; letters without a code get 0.
+__basecode=[1,14,2,13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,8,7,9,0,10,0]
+for i in range(26):
+ _basecode[i]=__basecode[i]
+
+
\ No newline at end of file
diff --git a/src/obitools/align/_freeendgap.pxd b/src/obitools/align/_freeendgap.pxd
new file mode 100644
index 0000000..d829b33
--- /dev/null
+++ b/src/obitools/align/_freeendgap.pxd
@@ -0,0 +1,9 @@
+from _nws cimport *
+
+# Declaration of the FreeEndGap aligner (implemented in _freeendgap.pyx).
+cdef class FreeEndGap(NWS):
+ # best score found on the last row of the matrix, and its column
+ cdef double xsmax
+ cdef int xmax
+
+
+ cdef double doAlignment(self) except? 0
+
diff --git a/src/obitools/align/_freeendgap.pyx b/src/obitools/align/_freeendgap.pyx
new file mode 100644
index 0000000..783fad7
--- /dev/null
+++ b/src/obitools/align/_freeendgap.pyx
@@ -0,0 +1,161 @@
+'''
+Created on 6 Nov. 2009
+
+@author: coissac
+'''
+#@PydevCodeAnalysisIgnore
+
+from _freeendgap cimport *
+
+
+cdef class FreeEndGap(NWS):
+
+ def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2):
+ # Forward the four scores to NWS and reset the best last-row score
+ # (xsmax) and its column (xmax).
+ NWS.__init__(self,match,mismatch,opengap,extgap)
+ self.xsmax=0
+ self.xmax=0
+
+ cdef double doAlignment(self) except? 0:
+ # Fill the score matrix and return the best score found on its last row
+ # (self.xsmax); self.xmax records the column where it was reached.
+ # Row 0 is initialised with score 0, column 0 with gap scores.
+ # Cell `.path` encoding: 0 = diagonal step, +n = n columns to the left,
+ # -n = n rows up.
+ # The matrix is recomputed only when self.needToCompute is true;
+ # otherwise the stored xsmax is returned as is.
+ cdef int i # vertical index
+ cdef int j # horizontal index
+ cdef int idx
+ cdef int idx0
+ cdef int idx1
+ cdef int jump
+ cdef int delta
+ cdef double score
+ cdef double scoremax
+ cdef int path
+
+ # NOTE(review): the test is strict, so sequences of equal length are
+ # rejected too.
+ assert self.hSeq.length > self.vSeq.length, \
+ "Sequence B must be shorter than sequence A"
+
+
+ if self.needToCompute:
+ self.allocate()
+ self.reset()
+ self.xsmax=0
+ self.xmax=0
+
+ # first row: score 0 for every column
+ for j in range(1,self.hSeq.length+1):
+ idx = self.index(j,0)
+ self.matrix.matrix[idx].score = 0
+ self.matrix.matrix[idx].path = j
+
+ # first column: one gap opening plus extensions
+ for i in range(1,self.vSeq.length+1):
+ idx = self.index(0,i)
+ self.matrix.matrix[idx].score = self._opengap + (self._extgap * (i-1))
+ self.matrix.matrix[idx].path = -i
+
+ # idx0 / idx1 are running linear indices that replace the index()
+ # calls kept in the comments below: inside the inner loop idx0 is
+ # cell (j-1,i-1) and idx1 is cell (j,i).
+ idx0=self.index(-1,0)
+ idx1=self.index(0,1)
+ for i in range(1,self.vSeq.length+1):
+ idx0+=1
+ idx1+=1
+ for j in range(1,self.hSeq.length+1):
+
+ # 1 - came from diagonal
+ #idx = self.index(j-1,i-1)
+ idx = idx0
+ # print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),
+ scoremax = self.matrix.matrix[idx].score + \
+ self.matchScore(j,i)
+ path = 0
+
+ # print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),
+
+ # 2 - open horizontal gap
+ # idx = self.index(j-1,i)
+ idx = idx1 - 1
+ score = self.matrix.matrix[idx].score+ \
+ self._opengap
+ if score > scoremax :
+ scoremax = score
+ path = +1
+
+ # 3 - open vertical gap
+ # idx = self.index(j,i-1)
+ idx = idx0 + 1
+ score = self.matrix.matrix[idx].score + \
+ self._opengap
+ if score > scoremax :
+ scoremax = score
+ path = -1
+
+ # 4 - extend horizontal gap
+ # bestHJump[i] is the column of the last gap opened on row i
+ # (negative when none).
+ jump = self.matrix.bestHJump[i]
+ if jump >= 0:
+ idx = self.index(jump,i)
+ delta = j-jump
+ score = self.matrix.matrix[idx].score + \
+ self._extgap * delta
+ if score > scoremax :
+ scoremax = score
+ path = delta+1
+
+ # 5 - extend vertical gap
+ # bestVJump[j] is the row of the last gap opened on column j
+ # (negative when none).
+ jump = self.matrix.bestVJump[j]
+ if jump >= 0:
+ idx = self.index(j,jump)
+ delta = i-jump
+ score = self.matrix.matrix[idx].score + \
+ self._extgap * delta
+ if score > scoremax :
+ scoremax = score
+ path = -delta-1
+
+ # idx = self.index(j,i)
+ idx = idx1
+ self.matrix.matrix[idx].score = scoremax
+ self.matrix.matrix[idx].path = path
+
+ # remember where a gap was opened so that it can be extended
+ if path == -1:
+ self.matrix.bestVJump[j]=i
+ elif path == +1 :
+ self.matrix.bestHJump[i]=j
+
+ # free end gap: track the best score of the last row.
+ # NOTE(review): xsmax starts at 0, so a last row without any
+ # positive score leaves xmax at 0.
+ if i==self.vSeq.length and scoremax > self.xsmax:
+ self.xsmax=scoremax
+ self.xmax=j
+ idx0+=1
+ idx1+=1
+
+ self.sequenceChanged=False
+ self.scoreChanged=False
+
+ return self.xsmax
+
+    cdef void backtrack(self):
+        # Build self.path from the best cell of the last row (column
+        # self.xmax) back to the origin. Entries are appended in
+        # backtracking order and the path is left unreversed.
+        cdef int i
+        cdef int j
+        cdef int p
+
+        self.doAlignment()
+        i=self.vSeq.length
+        j=self.xmax
+        # one extra slot for the trailing end gap
+        self.path=allocatePath(i,j+1,self.path)
+
+        # columns right of xmax are not aligned: record them as one jump
+        if self.xmax<self.hSeq.length:
+            self.path.path[self.path.length]=self.hSeq.length-self.xmax
+            self.path.length+=1
+
+        while i!=0 or j!=0:
+            p=self.matrix.matrix[self.index(j,i)].path
+            self.path.path[self.path.length]=p
+            self.path.length+=1
+
+            if p > 0:
+                # p columns to the left
+                j-=p
+            elif p < 0:
+                # -p rows up
+                i+=p
+            else:
+                # diagonal step
+                i-=1
+                j-=1
+
+        self.path.hStart=0
+        self.path.vStart=0
+
diff --git a/src/obitools/align/_freeendgapfm.pxd b/src/obitools/align/_freeendgapfm.pxd
new file mode 100644
index 0000000..2cf35b9
--- /dev/null
+++ b/src/obitools/align/_freeendgapfm.pxd
@@ -0,0 +1,5 @@
+from _freeendgap cimport *
+
+# FreeEndGap variant providing its own matchScore() (see _freeendgapfm.pyx).
+cdef class FreeEndGapFullMatch(FreeEndGap):
+ cdef double matchScore(self,int h, int v)
+
diff --git a/src/obitools/align/_freeendgapfm.pyx b/src/obitools/align/_freeendgapfm.pyx
new file mode 100644
index 0000000..5a2c858
--- /dev/null
+++ b/src/obitools/align/_freeendgapfm.pyx
@@ -0,0 +1,19 @@
+'''
+Created on 6 Nov. 2009
+
+@author: coissac
+'''
+#@PydevCodeAnalysisIgnore
+
+from _freeendgapfm cimport *
+
+
+cdef class FreeEndGapFullMatch(FreeEndGap):
+
+    cdef double matchScore(self,int h, int v):
+        # Score of aligning position h of the horizontal sequence with
+        # position v of the vertical one (both 1-based): the full match score
+        # when iupacMatch() accepts the pair, the mismatch score otherwise.
+        if iupacMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1]):
+            return self._match
+        return self._mismatch
diff --git a/src/obitools/align/_gprofilenws.pxd b/src/obitools/align/_gprofilenws.pxd
new file mode 100644
index 0000000..22fd47f
--- /dev/null
+++ b/src/obitools/align/_gprofilenws.pxd
@@ -0,0 +1,8 @@
+from _profilenws cimport *
+
+# Declaration of the GProfileNWS aligner (implemented in _gprofilenws.pyx).
+cdef class GProfileNWS(ProfileNWS):
+
+ cdef double matchScore(self,int h, int v)
+ # the two profiles built by the last call
+ cdef object alignment1
+ cdef object alignment2
+ #cdef double doAlignment(self) except? 0
\ No newline at end of file
diff --git a/src/obitools/align/_gprofilenws.pyx b/src/obitools/align/_gprofilenws.pyx
new file mode 100644
index 0000000..e615dd2
--- /dev/null
+++ b/src/obitools/align/_gprofilenws.pyx
@@ -0,0 +1,167 @@
+'''
+Created on 16 Feb. 2011
+
+@author: celine
+'''
+#@PydevCodeAnalysisIgnore
+
+from _gprofilenws cimport *
+
+
+cdef class GProfileNWS(ProfileNWS):
+
+    cdef double matchScore(self,int h, int v):
+        # Score of aligning column h of the horizontal profile with column v
+        # of the vertical one (both 1-based).
+        # The frequency arrays hold six consecutive blocks of `length` values;
+        # pmatch sums, block by block, the products of the two columns and
+        # weights the match / mismatch scores.
+        cdef double* hfreq = self.hProf.frequency
+        cdef double* vfreq = self.vProf.frequency
+        cdef int hlen = self.hProf.length
+        cdef int vlen = self.vProf.length
+        cdef double pmatch = 0
+        cdef int k
+
+        h-=1
+        v-=1
+        for k in range(6):
+            pmatch += hfreq[h+k*hlen]*vfreq[v+k*vlen]
+
+        return self._match * pmatch + (1-pmatch) * self._mismatch
+
+
+ def __call__(self,pseudocounts=0):
+ # Align the two profiles and return them as a pair of new DNAProfile
+ # objects of identical length (gap columns added where needed).
+ # The path is read in backtracking order, so the output profiles are
+ # filled from their last column (rp1 / rp2) towards the first one while
+ # hp / vp walk the input profiles backwards.
+ # NOTE(review): the archive lost the indentation of this patch; the
+ # profiles are presumably built inside the `if self._needToCompute()`
+ # branch - confirm against the original file.
+
+ cdef list hgaps=[]
+ cdef list vgaps=[]
+ cdef list b
+ cdef int hp
+ cdef int vp
+ # NOTE(review): `rp` is declared but the code uses rp1 / rp2, which are
+ # untyped.
+ cdef int rp
+ cdef int lenh=0
+ cdef int lenv=0
+ cdef int h,v,p
+ cdef int i
+ cdef object ali
+ cdef double score
+ cdef DNAProfile newProfile1
+ cdef DNAProfile newProfile2
+ cdef DNAProfile horizontalSeq=self.horizontalSeq
+ cdef DNAProfile verticalSeq=self.verticalSeq
+
+ if self._needToCompute():
+
+ score = self.doAlignment()
+ self.backtrack()
+ # total number of columns of the alignment (`sum` shadows the builtin)
+ sum = 0
+ for p in xrange(self.path.length) :
+ v = self.path.path[p]
+ if v == 0 :
+ sum += 1
+ else :
+ sum += abs(v)
+
+ newProfile1 = DNAProfile(size=sum,pseudo=pseudocounts)
+ newProfile1.profile.weight = horizontalSeq.profile.weight
+
+ newProfile2 = DNAProfile(size=sum,pseudo=pseudocounts)
+ newProfile2.profile.weight = verticalSeq.profile.weight
+
+ hp=horizontalSeq.profile.length-1
+ vp=verticalSeq.profile.length-1
+ rp1=newProfile1.profile.length-1
+ rp2=newProfile2.profile.length-1
+
+ # NOTE(review): this loop header appears twice; the first copy looks
+ # like a leftover that only re-reads the path - confirm.
+ for i in range(self.path.length):
+ p=self.path.path[i]
+
+ for i in range(self.path.length):
+ p=self.path.path[i]
+
+ if p==0:
+ # aligned column: copy both input columns
+
+ newProfile1.A[rp1] = horizontalSeq.A[hp]
+ newProfile1.C[rp1] = horizontalSeq.C[hp]
+ newProfile1.G[rp1] = horizontalSeq.G[hp]
+ newProfile1.T[rp1] = horizontalSeq.T[hp]
+ newProfile1.Og[rp1] = horizontalSeq.Og[hp]
+ newProfile1.Eg[rp1] = horizontalSeq.Eg[hp]
+
+ newProfile2.A[rp2] = verticalSeq.A[vp]
+ newProfile2.C[rp2] = verticalSeq.C[vp]
+ newProfile2.G[rp2] = verticalSeq.G[vp]
+ newProfile2.T[rp2] = verticalSeq.T[vp]
+ newProfile2.Og[rp2] = verticalSeq.Og[vp]
+ newProfile2.Eg[rp2] = verticalSeq.Eg[vp]
+
+ hp-=1
+ vp-=1
+ rp1-=1
+ rp2-=1
+
+ elif p>0:
+ # gap of p columns in the vertical profile: p-1 columns get the
+ # whole weight in Eg, the last one written gets it in Og
+
+ for x in xrange(p-1) :
+
+ newProfile1.A[rp1] = horizontalSeq.A[hp]
+ newProfile1.C[rp1] = horizontalSeq.C[hp]
+ newProfile1.G[rp1] = horizontalSeq.G[hp]
+ newProfile1.T[rp1] = horizontalSeq.T[hp]
+ newProfile1.Og[rp1] = horizontalSeq.Og[hp]
+ newProfile1.Eg[rp1] = horizontalSeq.Eg[hp]
+
+ newProfile2.Eg[rp2] = verticalSeq.profile.weight
+
+ hp-=1
+ rp1-=1
+ rp2-=1
+
+ newProfile1.A[rp1] = horizontalSeq.A[hp]
+ newProfile1.C[rp1] = horizontalSeq.C[hp]
+ newProfile1.G[rp1] = horizontalSeq.G[hp]
+ newProfile1.T[rp1] = horizontalSeq.T[hp]
+ newProfile1.Og[rp1] = horizontalSeq.Og[hp]
+ newProfile1.Eg[rp1] = horizontalSeq.Eg[hp]
+
+ newProfile2.Og[rp2] = verticalSeq.profile.weight
+
+ hp-=1
+ rp1-=1
+ rp2-=1
+
+ else:
+ # gap of -p columns in the horizontal profile (mirror of the case
+ # above)
+
+ for x in xrange(abs(p)-1) :
+
+ newProfile2.A[rp2] = verticalSeq.A[vp]
+ newProfile2.C[rp2] = verticalSeq.C[vp]
+ newProfile2.G[rp2] = verticalSeq.G[vp]
+ newProfile2.T[rp2] = verticalSeq.T[vp]
+ newProfile2.Og[rp2] = verticalSeq.Og[vp]
+ newProfile2.Eg[rp2] = verticalSeq.Eg[vp]
+
+ newProfile1.Eg[rp1] = horizontalSeq.profile.weight
+
+ vp-=1
+ rp1-=1
+ rp2-=1
+
+ newProfile2.A[rp2] = verticalSeq.A[vp]
+ newProfile2.C[rp2] = verticalSeq.C[vp]
+ newProfile2.G[rp2] = verticalSeq.G[vp]
+ newProfile2.T[rp2] = verticalSeq.T[vp]
+ newProfile2.Og[rp2] = verticalSeq.Og[vp]
+ newProfile2.Eg[rp2] = verticalSeq.Eg[vp]
+
+ newProfile1.Og[rp1] = horizontalSeq.profile.weight
+
+ vp-=1
+ rp1-=1
+ rp2-=1
+
+ self.alignment1 = newProfile1
+ self.alignment2 = newProfile2
+
+ # callers get fresh DNAProfile objects built from the cached ones
+ ali1=DNAProfile(self.alignment1,pseudo=pseudocounts)
+ ali2=DNAProfile(self.alignment2,pseudo=pseudocounts)
+
+ return ali1, ali2
+
diff --git a/src/obitools/align/_lcs.cfiles b/src/obitools/align/_lcs.cfiles
new file mode 100644
index 0000000..0e27863
--- /dev/null
+++ b/src/obitools/align/_lcs.cfiles
@@ -0,0 +1 @@
+_lcs.h
\ No newline at end of file
diff --git a/src/obitools/align/_lcs.ext.1.c b/src/obitools/align/_lcs.ext.1.c
new file mode 100644
index 0000000..7614125
--- /dev/null
+++ b/src/obitools/align/_lcs.ext.1.c
@@ -0,0 +1,168 @@
+#include "_lcs.h"
+
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+
+#include <stdio.h>
+
+
+
+// Allocate a band allowing to align sequences of length : 'length'
+
+/*
+ * Return a column whose `data` and `score` buffers can hold (length+1)
+ * cells of 8 bits (mode8bits true) or 16 bits (mode8bits false).
+ *
+ * column : column to reuse, or NULL to allocate a new one. The buffers of
+ *          an existing column are replaced only when they are too small;
+ *          their previous content is not preserved.
+ *
+ * Returns the column, or NULL on allocation failure. On failure a column
+ * passed by the caller is left untouched and nothing is leaked.
+ *
+ * NOTE(review): column->size is declared int16_t in _lcs.h while `size` is
+ * an int, so sizes above 32767 bytes are truncated when stored.
+ */
+column_t* allocateColumn(int length,column_t *column, bool mode8bits)
+{
+  int      size;
+  bool     newc = false;
+  int16_t *newdata;
+  int16_t *newscore;
+
+  // size in bytes of each of the two buffers
+  size = (length+1) * ((mode8bits) ? sizeof(int8_t):sizeof(int16_t));
+
+  // If the pointer to the old column is NULL we allocate
+  // a new column
+  if (column==NULL)
+  {
+    column = malloc(sizeof(column_t));
+    if (!column)
+      return NULL;
+
+    column->size = 0;
+    column->data.shrt=NULL;
+    column->score.shrt=NULL;
+    newc = true;
+  }
+
+  // Otherwise we check if its size is sufficient
+  // or if it should be extended
+  if (size > column->size)
+  {
+    newdata  = malloc(size);
+    newscore = malloc(size);
+
+    if (newdata==NULL || newscore==NULL)
+    {
+      fprintf(stderr,"Allocation Error on column for a size of %d\n" , size);
+
+      // release whichever of the two allocations succeeded
+      free(newdata);
+      free(newscore);
+
+      if (newc)
+        free(column);
+
+      return NULL;
+    }
+
+    // the previous buffers are no longer referenced: release them
+    free(column->data.shrt);
+    free(column->score.shrt);
+
+    column->data.shrt = newdata;
+    column->score.shrt= newscore;
+    column->size      = size;
+  }
+
+  return column;
+}
+
+/* Release a column and its two buffers; a NULL column is ignored. */
+void freeColumn(column_p column)
+{
+  if (column == NULL)
+    return;
+
+  /* free() accepts NULL, so buffers that were never allocated are harmless */
+  free(column->data.shrt);
+  free(column->score.shrt);
+  free(column);
+}
+
+// Public entry point: forwards every argument to the 16 bit implementation
+// fastLCSScore16 and returns its result.
+int fastLCSScore(const char* seq1, const char* seq2,column_pp column,int32_t* lpath)
+{
+ return fastLCSScore16(seq1,seq2,column,lpath);
+}
+
+/*
+ * Plain (non vectorised) computation of the length of the longest common
+ * subsequence of two NUL-terminated strings, using one row of scores.
+ *
+ * ppcolumn : optional reusable work column; when not NULL, *ppcolumn is
+ *            used (and updated) instead of a temporary one.
+ * lpath    : when not NULL, receives the length of the alignment path
+ *            leading to the returned score.
+ *
+ * Returns the LCS length, or -1 when the work column cannot be allocated.
+ *
+ * Score ties keep the horizontal move, then the vertical one (strict
+ * comparisons, as in the original code).
+ */
+int simpleLCS(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath)
+{
+  int lseq1,lseq2;       // length of the both sequences
+  int lcs;
+  int itmp;              // tmp variables for swap
+  const char* stmp;      //
+  int32_t *score;
+  int32_t *path;
+  column_t *column;
+  int32_t i,j;
+  int32_t sl,su,sd;      // scores coming from left, up and diagonal
+  int32_t pl,pu,pd;      // matching path lengths
+
+  // Made seq1 the longest sequences
+  lseq1=strlen(seq1);
+  lseq2=strlen(seq2);
+
+  if (lseq1 < lseq2)
+  {
+    itmp=lseq1;
+    lseq1=lseq2;
+    lseq2=itmp;
+
+    stmp=seq1;
+    seq1=seq2;
+    seq2=stmp;
+  }
+
+  // one extra cell for the border of the matrix
+  lseq1++;
+  lseq2++;
+
+  column = (ppcolumn) ? *ppcolumn : NULL;
+
+  // The 16 bit buffers are reused as int32_t arrays of lseq1 cells,
+  // hence the doubled length.
+  column = allocateColumn(lseq1*2,column,0);
+  if (column==NULL)
+    return -1;
+
+  score = (int32_t*) column->score.shrt;
+  path  = (int32_t*) column->data.shrt;
+
+  // row 0: score 0 everywhere, path length equal to the column index
+  memset(score,0,lseq1 * sizeof(int32_t));
+
+  for (j=0; j < lseq1; j++)
+    path[j]=j;
+
+  // result when the shortest sequence is empty (the loop below is skipped)
+  sl=0;
+  pl=lseq1-1;
+
+  for (i=1; i< lseq2; i++)
+  {
+    // column 0 of row i
+    sl=0;
+    pl=i;
+
+    for (j=1; j < lseq1; j++)
+    {
+      // read the previous row before cell j-1 is overwritten
+      sd=score[j-1] + (seq2[i-1]==seq1[j-1] ? 1:0);
+      pd=path[j-1] + 1;
+
+      su=score[j];
+      pu=path[j] + 1;
+
+      // store cell (i,j-1), computed at the previous iteration
+      score[j-1]=sl;
+      path[j-1]=pl;
+
+      // a horizontal move adds one column to the path
+      pl++;
+
+      if (su > sl) sl=su, pl=pu;
+      if (sd > sl) sl=sd, pl=pd;
+    }
+
+    // store the last cell of the row: the next row reads it
+    score[lseq1-1]=sl;
+    path[lseq1-1]=pl;
+  }
+
+  lcs = sl;
+  if(lpath) *lpath=pl;
+
+  if (ppcolumn)
+    *ppcolumn=column;
+  else
+    freeColumn(column);
+
+  return lcs;
+}
+
diff --git a/src/obitools/align/_lcs.ext.2.c b/src/obitools/align/_lcs.ext.2.c
new file mode 100644
index 0000000..381dc6a
--- /dev/null
+++ b/src/obitools/align/_lcs.ext.2.c
@@ -0,0 +1,34 @@
+#include "_lcs.h"
+
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+
+#include <stdio.h>
+
+
+
+
+// Parameters of the 16 bit instance of the template _lcs_fast.h:
+// the template included at the end is compiled with these macros and
+// defines the function named by FASTLCSSCORE (fastLCSScore16).
+#define VSIZE (8)
+#define VTYPE vInt16
+#define STYPE int16_t
+#define CMENB shrt
+#define VMODE false
+#define FASTLCSSCORE fastLCSScore16
+#define INSERT_REG _MM_INSERT_EPI16
+#define EXTRACT_REG _MM_EXTRACT_EPI16
+#define EQUAL_REG _MM_CMPEQ_EPI16
+#define GREATER_REG _MM_CMPGT_EPI16
+#define SMALLER_REG _MM_CMPLT_EPI16
+#define ADD_REG _MM_ADD_EPI16
+#define SUB_REG _MM_SUB_EPI16
+#define AND_REG _MM_AND_SI128
+#define ANDNOT_REG _MM_ANDNOT_SI128
+#define OR_REG _MM_OR_SI128
+#define SET_CONST _MM_SET1_EPI16
+#define GET_MAX _MM_MAX_EPI16
+#define GET_MIN _MM_MIN_EPI16
+#define MIN_SCORE INT16_MIN
+#define MAX_SCORE 32000
+
+#include "_lcs_fast.h"
diff --git a/src/obitools/align/_lcs.ext.3.c b/src/obitools/align/_lcs.ext.3.c
new file mode 100644
index 0000000..5c3a150
--- /dev/null
+++ b/src/obitools/align/_lcs.ext.3.c
@@ -0,0 +1,34 @@
+#include "_lcs.h"
+
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+
+#include <stdio.h>
+
+
+
+
+// Parameters of the 8 bit instance of the template _lcs_fast.h:
+// the template included at the end is compiled with these macros and
+// defines the function named by FASTLCSSCORE (fastLCSScore8).
+#define VSIZE (16)
+#define VTYPE vInt8
+#define STYPE int8_t
+#define CMENB byte
+#define VMODE true
+#define FASTLCSSCORE fastLCSScore8
+#define INSERT_REG _MM_INSERT_EPI8
+#define EXTRACT_REG _MM_EXTRACT_EPI8
+#define EQUAL_REG _MM_CMPEQ_EPI8
+#define GREATER_REG _MM_CMPGT_EPI8
+#define SMALLER_REG _MM_CMPLT_EPI8
+#define ADD_REG _MM_ADD_EPI8
+#define SUB_REG _MM_SUB_EPI8
+#define AND_REG _MM_AND_SI128
+#define ANDNOT_REG _MM_ANDNOT_SI128
+#define OR_REG _MM_OR_SI128
+#define SET_CONST _MM_SET1_EPI8
+#define GET_MAX _MM_MAX_EPI8
+#define GET_MIN _MM_MIN_EPI8
+#define MIN_SCORE INT8_MIN
+#define MAX_SCORE 127
+
+#include "_lcs_fast.h"
diff --git a/src/obitools/align/_lcs.ext.4.c b/src/obitools/align/_lcs.ext.4.c
new file mode 100644
index 0000000..ed2d060
--- /dev/null
+++ b/src/obitools/align/_lcs.ext.4.c
@@ -0,0 +1,225 @@
+#include "_sse.h"
+#include <stdio.h>
+#include <math.h>
+
+
+// Turn 16 sequence characters into one-byte codes of 4-letter words.
+// Each character is first reduced to 2 bits (shift right by 1, mask 0x03);
+// the code of a word is then built by three rounds of "shift the
+// accumulated code left by 2 bits, OR in the next character".
+// Only the first bytes of the result describe complete words; buildTable()
+// uses 12 of them per fragment.
+inline static uchar_v hash4m128(uchar_v frag)
+{
+ uchar_v words;
+
+ vUInt8 mask_03= _MM_SET1_EPI8(0x03); // load the register with 16 copies of the same byte
+ vUInt8 mask_FC= _MM_SET1_EPI8(0xFC);
+
+ frag.m = _MM_SRLI_EPI64(frag.m,1); // logical right shift on 2 x 64 bits
+ frag.m = _MM_AND_SI128(frag.m,mask_03); // AND over the 128 bits
+
+
+ words.m= _MM_SLLI_EPI64(frag.m,2);
+ words.m= _MM_AND_SI128(words.m,mask_FC);
+ frag.m = _MM_SRLI_SI128(frag.m,1);
+ words.m= _MM_OR_SI128(words.m,frag.m);
+
+ words.m= _MM_SLLI_EPI64(words.m,2);
+ words.m= _MM_AND_SI128(words.m,mask_FC);
+ frag.m = _MM_SRLI_SI128(frag.m,1);
+ words.m= _MM_OR_SI128(words.m,frag.m);
+
+ words.m= _MM_SLLI_EPI64(words.m,2);
+ words.m= _MM_AND_SI128(words.m,mask_FC);
+ frag.m = _MM_SRLI_SI128(frag.m,1);
+ words.m= _MM_OR_SI128(words.m,frag.m);
+
+ return words;
+}
+
+// Return 1 when at least one of the 16 bytes of `data` is zero, 0 otherwise.
+// Used to detect the terminating NUL of a sequence inside a fragment.
+inline static int anyzerom128(vUInt8 data)
+{
+ vUInt8 mask_00= _MM_SETZERO_SI128();
+ uint64_v tmp;
+ tmp.m = _MM_CMPEQ_EPI8(data,mask_00);
+ // the comparison result is tested through its two 64 bit halves
+ return (int)(tmp.c[0]!=0 || tmp.c[1]!=0);
+}
+
+// Copy the 16 bytes of `data` into `table`.
+// NOTE(review): memcpy is used but this file does not include <string.h>
+// itself - presumably it comes through _sse.h; confirm.
+inline static void dumpm128(unsigned short *table,vUInt8 data)
+{
+ memcpy(table,&data,16);
+}
+
+// Count the 4-letter words of `sequence` into `table` (256 one-byte
+// counters indexed by the word code computed by hash4m128).
+// Counters saturate at 255; every increment lost to saturation is counted
+// in the returned overflow value. When `count` is not NULL it receives the
+// number of words processed.
+// NOTE(review): the sequence is read by unaligned 16 byte loads, so bytes
+// located after the terminating NUL may be read - confirm callers provide
+// enough readable memory.
+int buildTable(const char* sequence, unsigned char *table, int *count)
+{
+ int overflow = 0;
+ int wc=0;
+ int i;
+ vUInt8 mask_00= _MM_SETZERO_SI128();
+
+ uchar_v frag;
+ uchar_v words;
+ uchar_v zero;
+
+ char* s;
+
+ s=(char*)sequence;
+
+ memset(table,0,256*sizeof(unsigned char));
+
+ // encode ascii sequence with A : 00 C : 01 T: 10 G : 11
+
+ // Main loop: while the 16 byte fragment contains no NUL, count its first
+ // 12 words and move 12 characters forward.
+ for(frag.m=_MM_LOADU_SI128((vUInt8*)s);
+ ! anyzerom128(frag.m);
+ s+=12,frag.m=_MM_LOADU_SI128((vUInt8*)s))
+ {
+ words= hash4m128(frag);
+
+ // printf("%d %d %d %d\n",words.c[0],words.c[1],words.c[2],words.c[3]);
+
+ if (table[words.c[0]]<255) table[words.c[0]]++; else overflow++;
+ if (table[words.c[1]]<255) table[words.c[1]]++; else overflow++;
+ if (table[words.c[2]]<255) table[words.c[2]]++; else overflow++;
+ if (table[words.c[3]]<255) table[words.c[3]]++; else overflow++;
+ if (table[words.c[4]]<255) table[words.c[4]]++; else overflow++;
+ if (table[words.c[5]]<255) table[words.c[5]]++; else overflow++;
+ if (table[words.c[6]]<255) table[words.c[6]]++; else overflow++;
+ if (table[words.c[7]]<255) table[words.c[7]]++; else overflow++;
+ if (table[words.c[8]]<255) table[words.c[8]]++; else overflow++;
+ if (table[words.c[9]]<255) table[words.c[9]]++; else overflow++;
+ if (table[words.c[10]]<255) table[words.c[10]]++; else overflow++;
+ if (table[words.c[11]]<255) table[words.c[11]]++; else overflow++;
+
+ wc+=12;
+ }
+
+ // Last fragment: it contains the terminating NUL. `zero` marks the NUL
+ // bytes; words are counted as long as their 4th letter is not a NUL.
+ zero.m=_MM_CMPEQ_EPI8(frag.m,mask_00);
+ //printf("frag=%d %d %d %d\n",frag.c[0],frag.c[1],frag.c[2],frag.c[3]);
+ //printf("zero=%d %d %d %d\n",zero.c[0],zero.c[1],zero.c[2],zero.c[3]);
+ words = hash4m128(frag);
+
+ // at least one complete word must remain
+ if (zero.c[0]+zero.c[1]+zero.c[2]+zero.c[3]==0)
+ for(i=0;zero.c[i+3]==0;i++,wc++)
+ if (table[words.c[i]]<255) table[words.c[i]]++; else overflow++;
+
+ if (count) *count=wc;
+ return overflow;
+}
+
+// For two blocks of 16 one-byte counters, take the byte-wise minimum, widen
+// it to 16 bit lanes (low and high halves) and return the saturating sum of
+// the two halves: 8 partial sums of minima.
+static inline vUInt16 partialminsum(vUInt8 ft1,vUInt8 ft2)
+{
+ vUInt8 mini;
+ vUInt16 minilo;
+ vUInt16 minihi;
+ vUInt8 mask_00= _MM_SETZERO_SI128();
+
+ mini = _MM_MIN_EPU8(ft1,ft2);
+ // interleaving with zero bytes widens each byte to 16 bits
+ minilo = _MM_UNPACKLO_EPI8(mini,mask_00);
+ minihi = _MM_UNPACKHI_EPI8(mini,mask_00);
+
+ return _MM_ADDS_EPU16(minilo,minihi);
+}
+
+// Number of 4-letter words shared by two word tables built by buildTable():
+// the sum over the 256 counters of min(t1[w],t2[w]), plus the smaller of the
+// two overflow counts.
+int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2)
+{
+ vUInt8 ft1;
+ vUInt8 ft2;
+ vUInt8 *table1=(vUInt8*)t1;
+ vUInt8 *table2=(vUInt8*)t2;
+ ushort_v summini;
+ int i;
+ int total;
+
+ // first block of 16 counters
+ ft1 = _MM_LOADU_SI128(table1);
+ ft2 = _MM_LOADU_SI128(table2);
+ summini.m = partialminsum(ft1,ft2);
+ table1++;
+ table2++;
+
+
+ // the 15 remaining blocks (16 x 16 = 256 counters)
+ for (i=1;i<16;i++,table1++,table2++)
+ {
+ ft1 = _MM_LOADU_SI128(table1);
+ ft2 = _MM_LOADU_SI128(table2);
+ summini.m = _MM_ADDS_EPU16(summini.m,partialminsum(ft1,ft2));
+
+ }
+
+ // Finishing the sum process
+
+ summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,8)); // sum the 4 firsts with the 4 lasts
+ // then fold lanes 2 and 3 onto lanes 0 and 1
+ summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,4));
+
+ total = summini.c[0]+summini.c[1];
+ total+= (over1 < over2) ? over1:over2;
+
+ return total;
+}
+
+/*
+ * Threshold on the number of shared 4-letter words computed from a word
+ * count and an identity level.
+ *
+ * wordcount : number of 4-letter words (3 is added back to work on the
+ *             underlying length).
+ * identity  : identity level, expected in [0,1].
+ *
+ * Returns 0 when the longest guaranteed fragment is shorter than 4.
+ */
+int threshold4(int wordcount,double identity)
+{
+  int total  = wordcount + 3;
+  int error  = (int)floor((double)total * ((double)1.0-identity));
+  int groups = error + 1;        /* fragments delimited by the errors */
+  int kept   = total - error;    /* positions left without error      */
+  int lmax   = kept / groups;
+
+  if (lmax < 4)
+    return 0;
+
+  return (lmax - 3) * groups + (kept % groups);
+}
+
+/* Fallback definitions, placed before their first use below. */
+#ifndef MAX
+#define MAX(x,y) (((x)>(y)) ? (x):(y))
+#endif
+
+#ifndef MIN
+#define MIN(x,y) (((x)<(y)) ? (x):(y))
+#endif
+
+/*
+ * Threshold on the number of shared 4-letter words for an LCS of length
+ * `lcs` on a reference of length `reflen`.
+ *
+ * The LCS is split into nbfrag = 2*(reflen-lcs)+1 fragments: R of them have
+ * length smin+1 and the others length smin; a fragment of length l counts
+ * for max(l-3,0) words.
+ *
+ * Callers must ensure lcs <= reflen so that nbfrag is positive.
+ */
+int thresholdLCS4(int32_t reflen,int32_t lcs)
+{
+  int nbfrag;
+  int smin;
+  int R;
+  int common;
+
+  nbfrag = (reflen - lcs)*2 + 1;
+  smin = lcs/nbfrag;
+  R = lcs - smin * nbfrag;
+  common = MAX(smin - 2,0) * R + MAX(smin - 3,0) * (nbfrag - R);
+  return common;
+}
+
+/*
+ * Quick filter: tell whether two sequences, described by their lengths and
+ * word tables, share enough 4-letter words to possibly reach `minimum`.
+ *
+ * minimum    : LCS length, or a fraction of the reference length when
+ *              `normalized` is set.
+ * large      : reference length is the longest (non zero) or the shortest
+ *              (zero) of the two lengths.
+ *
+ * Returns 0 when the threshold cannot be reached, non zero otherwise.
+ */
+int ispossible(int len1, unsigned char *t1, int over1,
+               int len2, unsigned char* t2, int over2,
+               double minimum, int normalized, int large)
+{
+  int32_t reflen;
+  int32_t lcs;
+  int32_t mincount;
+
+  /* the reference length is chosen the same way in both modes */
+  reflen = (large) ? MAX(len1,len2) : MIN(len1,len2);
+
+  if (normalized)
+    lcs = (int32_t)floor((double)reflen * minimum);
+  else
+    lcs = (int32_t) minimum;
+
+  /* an LCS cannot be longer than the shortest sequence */
+  if (lcs > MIN(len1,len2))
+    return 0;
+
+  mincount = thresholdLCS4(reflen,lcs);
+
+//  fprintf(stderr,"MaxLCS %d %d %d : %d\n",reflen,lcs,compareTable(t1,over1,t2,over2),mincount);
+
+  return compareTable(t1,over1,t2,over2) >=mincount;
+}
+
+
diff --git a/src/obitools/align/_lcs.h b/src/obitools/align/_lcs.h
new file mode 100644
index 0000000..b9d5f15
--- /dev/null
+++ b/src/obitools/align/_lcs.h
@@ -0,0 +1,29 @@
+/* Shared declarations of the LCS alignment code. */
+#ifndef OBITOOLS_ALIGN_LCS_H
+#define OBITOOLS_ALIGN_LCS_H
+
+#include "_sse.h"
+
+/* NOTE(review): these macros clash with <stdbool.h> if a file includes both. */
+#define bool char
+#define false (1==0)
+#define true (1==1)
+
+/*
+ * Work column shared by the LCS functions: two buffers of `size` bytes,
+ * seen either as 16 bit or as 8 bit cells.
+ * NOTE(review): `size` is an int16_t while allocateColumn() computes sizes
+ * as int, so sizes above 32767 bytes are truncated when stored.
+ */
+typedef struct {
+  int16_t size;
+
+  union { int16_t *shrt;
+          int8_t  *byte;
+        } data;
+
+  union { int16_t *shrt;
+          int8_t  *byte;
+        } score;
+
+
+} column_t, **column_pp, *column_p;
+
+/* Create or grow a column able to hold length+1 cells. */
+column_p allocateColumn(int length,column_t *column, bool mode8bits);
+
+void freeColumn(column_p column);
+
+/* LCS score of two sequences; *lpath receives the alignment path length. */
+int fastLCSScore16(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath);
+int fastLCSScore8(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath);
+int simpleLCS(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath);
+
+int fastLCSScore(const char* seq1, const char* seq2,column_pp column,int32_t* lpath);
+
+#endif /* OBITOOLS_ALIGN_LCS_H */
diff --git a/src/obitools/align/_lcs.pxd b/src/obitools/align/_lcs.pxd
new file mode 100644
index 0000000..e76a9c5
--- /dev/null
+++ b/src/obitools/align/_lcs.pxd
@@ -0,0 +1,9 @@
+# C pointer types spelled so that Cython emits `const char*`.
+cdef extern from *:
+ ctypedef char* const_char_ptr "const char*"
+ ctypedef int* int32_ptr
+
+
+# Cython view of _lcs.h; column_t is kept opaque.
+# NOTE(review): _lcs.h declares the third parameter of fastLCSScore as
+# column_pp (column_t**) while it is declared column_t* here - confirm what
+# callers pass (NULL works with both).
+cdef import from "_lcs.h":
+ struct column_t:
+ pass
+ int fastLCSScore(const_char_ptr seq1, const_char_ptr seq2,column_t* column,int32_ptr length)
diff --git a/src/obitools/align/_lcs.pyx b/src/obitools/align/_lcs.pyx
new file mode 100644
index 0000000..c809d47
--- /dev/null
+++ b/src/obitools/align/_lcs.pyx
@@ -0,0 +1,206 @@
+'''
+Created on 6 Nov. 2009
+
+@author: coissac
+'''
+#@PydevCodeAnalysisIgnore
+
+from cpython cimport array
+
+from obitools import BioSequence
+from _lcs cimport *
+from _upperbond cimport *
+from _dynamic cimport *
+
+from _upperbond import *
+
+# Longest-common-subsequence aligner: a DynamicProgramming subclass created
+# with both gap penalties set to 0.
+cdef class LCS(DynamicProgramming):
+
+ def __init__(self):
+ DynamicProgramming.__init__(self,opengap=0,extgap=0)
+
+ # Getter returning self._opengap.
+ property opengap:
+ def __get__(self):
+ return self._opengap
+
+ # Getter returning self._extgap.
+ property extgap:
+ def __get__(self): # @DuplicatedSignature
+ return self._extgap
+
+ # Score for aligning horizontal position h with vertical position v
+ # (both 1-based): iupacPartialMatch of the two symbols.
+ cdef double matchScore(self,int h, int v):
+ return iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
+
+ # Fill the score/path matrix and return the score stored in the cell
+ # (hSeq.length, vSeq.length).
+ # NOTE(review): jump and delta are declared but never used here.
+ cdef double doAlignment(self) except? 0:
+ cdef int i # vertical index
+ cdef int j # horizontal index
+ cdef int idx
+ cdef int jump
+ cdef int delta
+ cdef double score
+ cdef double scoremax
+ cdef int path
+
+
+ if self.needToCompute:
+ self.allocate()
+ self.reset()
+
+ # First row: score 0, path = +j (one horizontal jump of length j).
+ for j in range(1,self.hSeq.length+1):
+ idx = self.index(j,0)
+ self.matrix.matrix[idx].score = 0
+ self.matrix.matrix[idx].path = j
+
+ # First column: score 0, path = -i (one vertical jump of length i).
+ for i in range(1,self.vSeq.length+1):
+ idx = self.index(0,i)
+ self.matrix.matrix[idx].score = 0
+ self.matrix.matrix[idx].path = -i
+
+ for i in range(1,self.vSeq.length+1):
+ for j in range(1,self.hSeq.length+1):
+
+ # 1 - came from diagonal
+ idx = self.index(j-1,i-1)
+ # print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),
+ scoremax = self.matrix.matrix[idx].score + \
+ self.matchScore(j,i)
+ path = 0
+
+ # print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),
+
+ # 2 - open horizontal gap
+ # No penalty is added; a non-negative path in the left cell is
+ # extended by one, otherwise the path restarts at +1.
+ idx = self.index(j-1,i)
+ score = self.matrix.matrix[idx].score
+ if score > scoremax :
+ scoremax = score
+ path = self.matrix.matrix[idx].path
+ if path >=0:
+ path+=1
+ else:
+ path=+1
+
+ # 3 - open vertical gap
+ # Mirror of step 2 with negative path values.
+ idx = self.index(j,i-1)
+ score = self.matrix.matrix[idx].score
+ if score > scoremax :
+ scoremax = score
+ path = self.matrix.matrix[idx].path
+ if path <=0:
+ path-=1
+ else:
+ path=-1
+
+ idx = self.index(j,i)
+ self.matrix.matrix[idx].score = scoremax
+ self.matrix.matrix[idx].path = path
+
+ self.sequenceChanged=False
+ self.scoreChanged=False
+
+ idx = self.index(self.hSeq.length,self.vSeq.length)
+ return self.matrix.matrix[idx].score
+
+ # Walk the path matrix from the last cell back to (0,0) and record each
+ # step in self.path (0 = diagonal, p<0 = vertical jump of -p,
+ # p>0 = horizontal jump of p). Steps are stored end-to-start.
+ cdef void backtrack(self):
+ #cdef list path=[]
+ cdef int i
+ cdef int j
+ cdef int p
+
+ self.doAlignment()
+ i=self.vSeq.length
+ j=self.hSeq.length
+ self.path=allocatePath(i,j,self.path)
+
+ while (i or j):
+ p=self.matrix.matrix[self.index(j,i)].path
+ self.path.path[self.path.length]=p
+ self.path.length+=1
+# path.append(p)
+ if p==0:
+ i-=1
+ j-=1
+ elif p < 0:
+ i+=p
+ else:
+ j-=p
+
+ #path.reverse()
+ #reversePath(self.path)
+ self.path.hStart=0
+ self.path.vStart=0
+ #return 0,0,path
+
+# Values accepted by the `reference` argument of lenlcs(): which length a
+# normalised score is divided by.
+ALILEN=0 # alignment length
+MAXLEN=1 # longer of the two sequences
+MINLEN=2 # shorter of the two sequences
+
+# Compute the LCS score of seq1 and seq2 and return (lcs, alilength).
+#
+# minimum    - when > 0, a word-table filter (ispossible) is run first and
+#              the score is reported as -1.0 if the threshold cannot be met.
+# normalized - when true, a non-negative score is divided by the length
+#              selected by `reference`.
+# reference  - ALILEN, MAXLEN or MINLEN.
+#
+# Sequences with min length < 8 are scored with the LCS class; longer ones
+# go through the C routine fastLCSScore.
+def lenlcs(seq1,seq2,double minimum=0.,bint normalized=False, int reference=ALILEN):
+ cdef double lcs
+ cdef bytes se1=bytes(str(seq1))
+ cdef bytes se2=bytes(str(seq2))
+ cdef int l1 = len(seq1)
+ cdef int l2 = len(seq2)
+ cdef int o1
+ cdef int o2
+ cdef int wordcount
+ # Only fastLCSScore writes alilength. Start from 0 so the value tested
+ # and returned below is defined on the paths that never call it.
+ cdef int alilength = 0
+ cdef bint possible
+ cdef bint large
+
+ cdef array.array[unsigned char] w1
+ cdef array.array[unsigned char] w2
+
+ cdef char *s1
+ cdef char *s2
+ s1=se1
+ s2=se2
+
+
+ if min(l1,l2) < 8:
+ lcsali = LCS()
+ lcsali.seqA = seq1
+ lcsali.seqB = seq2
+ lcs = lcsali.doAlignment()
+ else:
+ if minimum > 0.:
+ # Reuse the 4-mer table cached on a BioSequence, or build and cache it.
+ if isinstance(seq1, BioSequence) and hasattr(seq1, "word4table") and seq1.word4table is not None:
+ w1 = seq1.word4table
+ o1 = seq1.word4over
+ else:
+ w1 = newtable()
+ o1 = buildTable(s1,w1.data.as_uchars,&wordcount)
+ if isinstance(seq1, BioSequence):
+ seq1.word4table=w1
+ seq1.word4over=o1
+ if isinstance(seq2, BioSequence) and hasattr(seq2, "word4table") and seq2.word4table is not None:
+ w2 = seq2.word4table
+ o2 = seq2.word4over
+ else:
+ w2 = newtable()
+ o2 = buildTable(s2,w2.data.as_uchars,&wordcount)
+ if isinstance(seq2, BioSequence) :
+ seq2.word4table=w2
+ seq2.word4over=o2
+
+ large = reference==ALILEN or reference==MAXLEN
+ possible = ispossible(l1, w1.data.as_uchars, o1,
+ l2, w2.data.as_uchars, o2,
+ minimum,normalized,large)
+ if possible:
+ lcs = fastLCSScore(s1,s2,NULL,&alilength)
+ else:
+ # Filter says the threshold is unreachable: skip the alignment.
+ lcs = -1.0
+ else:
+ lcs = fastLCSScore(s1,s2,NULL,&alilength)
+
+ if lcs >= 0 and normalized:
+ if reference==ALILEN:
+ if alilength > 0:
+ lcs /=alilength
+ else:
+ lcs = 0
+ elif reference==MAXLEN:
+ lcs /=max(l1,l2)
+ elif reference==MINLEN:
+ lcs /=min(l1,l2)
+
+ return lcs,alilength
diff --git a/src/obitools/align/_lcs_fast.h b/src/obitools/align/_lcs_fast.h
new file mode 100644
index 0000000..115cf26
--- /dev/null
+++ b/src/obitools/align/_lcs_fast.h
@@ -0,0 +1,597 @@
+
+/*
+ * Print a SSE register for debug purpose
+ */
+
+#ifdef __SSE2__
+
+/*
+ * Debug helper: print every lane of register r on one line.
+ * 8 lanes are printed, or 16 when VMODE is non-zero.
+ */
+static void printreg(VTYPE r)
+{
+ STYPE a0,a1,a2,a3,a4,a5,a6,a7;
+#if VMODE
+ STYPE a8,a9,a10,a11,a12,a13,a14,a15;
+#endif
+
+ a0= EXTRACT_REG(r,0);
+ a1= EXTRACT_REG(r,1);
+ a2= EXTRACT_REG(r,2);
+ a3= EXTRACT_REG(r,3);
+ a4= EXTRACT_REG(r,4);
+ a5= EXTRACT_REG(r,5);
+ a6= EXTRACT_REG(r,6);
+ a7= EXTRACT_REG(r,7);
+#if VMODE
+ a8= EXTRACT_REG(r,8);
+ a9= EXTRACT_REG(r,9);
+ a10= EXTRACT_REG(r,10);
+ a11= EXTRACT_REG(r,11);
+ a12= EXTRACT_REG(r,12);
+ a13= EXTRACT_REG(r,13);
+ a14= EXTRACT_REG(r,14);
+ a15= EXTRACT_REG(r,15);
+#endif
+
+printf( "a00 :-> %7d %7d %7d %7d "
+ " %7d %7d %7d %7d "
+#if VMODE
+ "%7d %7d %7d %7d "
+ " %7d %7d %7d %7d "
+#endif
+ "\n"
+ , a0,a1,a2,a3,a4,a5,a6,a7
+#if VMODE
+ , a8,a9,a10,a11,a12,a13,a14,a15
+#endif
+);
+}
+
+/*
+ * set position p of a SSE register with the value v
+ */
+
+/*
+ * Return r with lane p replaced by v. The switch turns the run-time
+ * index p into the literal lane index handed to INSERT_REG.
+ * Lanes 8-15 are only handled when VMODE is non-zero.
+ * An out-of-range p yields an all-zero register.
+ */
+static inline VTYPE insert_reg(VTYPE r, STYPE v, int p)
+{
+ switch (p) {
+ case 0: return INSERT_REG(r,v,0);
+ case 1: return INSERT_REG(r,v,1);
+ case 2: return INSERT_REG(r,v,2);
+ case 3: return INSERT_REG(r,v,3);
+ case 4: return INSERT_REG(r,v,4);
+ case 5: return INSERT_REG(r,v,5);
+ case 6: return INSERT_REG(r,v,6);
+ case 7: return INSERT_REG(r,v,7);
+#if VMODE
+ case 8: return INSERT_REG(r,v,8);
+ case 9: return INSERT_REG(r,v,9);
+ case 10: return INSERT_REG(r,v,10);
+ case 11: return INSERT_REG(r,v,11);
+ case 12: return INSERT_REG(r,v,12);
+ case 13: return INSERT_REG(r,v,13);
+ case 14: return INSERT_REG(r,v,14);
+ case 15: return INSERT_REG(r,v,15);
+#endif
+ }
+ return _MM_SETZERO_SI128();
+}
+
+/*
+ * Return the value held in lane p of r. The switch turns the run-time
+ * index p into the literal lane index handed to EXTRACT_REG.
+ * Lanes 8-15 are only handled when VMODE is non-zero.
+ * An out-of-range p yields 0.
+ */
+static inline STYPE extract_reg(VTYPE r, int p)
+{
+ switch (p) {
+ case 0: return EXTRACT_REG(r,0);
+ case 1: return EXTRACT_REG(r,1);
+ case 2: return EXTRACT_REG(r,2);
+ case 3: return EXTRACT_REG(r,3);
+ case 4: return EXTRACT_REG(r,4);
+ case 5: return EXTRACT_REG(r,5);
+ case 6: return EXTRACT_REG(r,6);
+ case 7: return EXTRACT_REG(r,7);
+#if VMODE
+ case 8: return EXTRACT_REG(r,8);
+ case 9: return EXTRACT_REG(r,9);
+ case 10: return EXTRACT_REG(r,10);
+ case 11: return EXTRACT_REG(r,11);
+ case 12: return EXTRACT_REG(r,12);
+ case 13: return EXTRACT_REG(r,13);
+ case 14: return EXTRACT_REG(r,14);
+ case 15: return EXTRACT_REG(r,15);
+#endif
+ }
+ return 0;
+}
+
+/*
+ * Helper macros for FASTLCSSCORE. They rely on the locals lseq1, lseq2,
+ * seq1 and seq2 of the function that expands them.
+ *
+ * GET_H_SYMBOLE / GET_V_SYMBOLE: symbol at 1-based position p of s, or a
+ * filler (255 for H, 0 for V) when p is 0 or not below the length, so the
+ * two fillers never compare equal. The argument p is parenthesised so an
+ * expression argument keeps its meaning.
+ */
+#define GET_H_SYMBOLE(s,p) (((p) && (p) < lseq1) ? (s)[(p)-1]:255)
+#define GET_V_SYMBOLE(s,p) (((p) && (p) < lseq2) ? (s)[(p)-1]:0)
+
+/* Shift register r left by one lane (sizeof(STYPE) bytes). */
+#define LSHIFT_SCORE(r) { r = _MM_SLLI_SI128((r),sizeof(STYPE)); }
+/* Store the seq1 symbol of position s into lane p of r. */
+#define SET_H_SYMBOLE(r,p,s) { r = insert_reg((r),(STYPE)GET_H_SYMBOLE(seq1,(s)),(p)); }
+/* Shift r left by one lane and put the seq2 symbol of position s in lane 0. */
+#define PUSH_V_SYMBOLE(r,s) { r = insert_reg(_MM_SLLI_SI128((r),sizeof(STYPE)),(STYPE)GET_V_SYMBOLE(seq2,(s)),0); }
+/* Lane-wise equality reduced to 1 (equal) or 0 (different). */
+#define EQUAL(f1,f2) _MM_AND_SI128(EQUAL_REG((f1),(f2)),SET_CONST(1))
+
+/*
+ * SSE2 LCS kernel (the concrete name comes from the FASTLCSSCORE macro).
+ *
+ * seq1, seq2 - NUL-terminated sequences; they are swapped internally so
+ *              that seq1 is the longer one.
+ * ppcolumn   - optional reusable column buffer; when NULL a buffer is
+ *              allocated and freed locally.
+ * lpath      - optional output receiving the alignment path length.
+ *
+ * Returns the LCS score, or -1 when the column buffer cannot be allocated.
+ * Sequences shorter than 10 symbols are delegated to simpleLCS().
+ */
+int FASTLCSSCORE(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath)
+{
+ int lseq1,lseq2; // length of the both sequences
+
+ int itmp; // tmp variables for swap
+ const char* stmp; //
+
+ int nbands; // Number of bands of width VSIZE in the score matrix
+ int lastband; // width of the last band
+
+ // Register for scanning the score matrix
+ VTYPE minus1;
+ VTYPE minus2;
+ VTYPE current;
+
+ VTYPE left;
+ VTYPE top;
+ VTYPE diag;
+
+
+ // Same roles as above, but for the path-length bookkeeping
+ VTYPE sminus1;
+ VTYPE sminus2;
+ VTYPE scurrent;
+
+ VTYPE sleft;
+ VTYPE stop;
+ VTYPE sdiag;
+
+ VTYPE way;
+ VTYPE onevect;
+ VTYPE maxvect;
+
+ VTYPE fhseq; // The fragment of the horizontal sequence
+ // to consider for alignment
+ VTYPE fvseq; // The fragment of the vertical sequence
+ // to consider for alignment
+ VTYPE match;
+
+ int band;
+ int line;
+ int limit;
+
+ int lcs;
+
+ int h;
+ int i;
+
+ column_t *column;
+
+
+ // Make seq1 the longest sequence
+ lseq1=strlen(seq1);
+ lseq2=strlen(seq2);
+
+ if (lseq1 < 10 || lseq2 < 10)
+ return simpleLCS(seq1,seq2,ppcolumn,lpath);
+
+ if (lseq1 < lseq2)
+ {
+ itmp=lseq1;
+ lseq1=lseq2;
+ lseq2=itmp;
+
+ stmp=seq1;
+ seq1=seq2;
+ seq2=stmp;
+ }
+
+ // we add one to the both length for taking into
+ // account the extra line and column in the score
+ // matrix
+
+ lseq1++;
+ lseq2++;
+
+ // a band sized to the smallest sequence is allocated
+
+ if (ppcolumn)
+ column = *ppcolumn;
+ else
+ column=NULL;
+
+ column = allocateColumn(lseq2,column,VMODE);
+
+ // Check memory allocation
+ if (column == NULL)
+ return -1;
+
+ for (i=0; i<lseq2;i++)
+ {
+ column->data.CMENB[i]=MIN_SCORE;
+ column->score.CMENB[i]=-1;
+ }
+
+ nbands = lseq1 / VSIZE; // You have VSIZE element in one SSE register
+ // Alignment will be realized in nbands
+
+ lastband = lseq1 - (nbands * VSIZE); // plus one of width lastband except if
+ // lastband==0
+
+ if (lastband) nbands++;
+ else lastband=VSIZE;
+
+ // From here on lastband is the lane index of the final column
+ lastband--;
+
+// printf("seq1 : %s seq2 : %s\n",seq1,seq2);
+
+
+ minus2 = SET_CONST(MIN_SCORE);
+ minus1 = _MM_SETZERO_SI128();
+
+ sminus1= _MM_SETZERO_SI128();
+ sminus2= _MM_SETZERO_SI128();
+ onevect= SET_CONST(1);
+ maxvect= SET_CONST(MAX_SCORE);
+
+ h=0;
+
+ fhseq = _MM_SETZERO_SI128();
+ fvseq = _MM_SETZERO_SI128();
+
+ //
+ // Beginning of the first band
+ //
+
+ for (line = 0; line < VSIZE; line++,h++) // previously VSIZE - 1
+ {
+// printf("line= %4d h= %4d\n",line,h);
+ SET_H_SYMBOLE(fhseq,line,h)
+ PUSH_V_SYMBOLE(fvseq,line)
+ minus2 = insert_reg(minus2,0,h);
+ minus1 = insert_reg(minus1,MIN_SCORE,line); // previously 0
+ match = EQUAL(fhseq,fvseq);
+
+ if (lpath)
+ {
+ sminus2 = insert_reg(sminus2,line-1,line); // NOTE: the original author was not sure about this initialisation
+ sminus1 = insert_reg(sminus1,0,line);
+ }
+
+// printreg(fvseq);
+// printreg(fhseq);
+// printreg(match);
+// printf("================================\n");
+
+ current = minus1; // The best score is the upper one
+ // It cannot be the best as set to MIN_SCORE
+
+ left = minus1;
+
+// printf("Vert = "); printreg(current);
+
+
+ LSHIFT_SCORE(minus1) // I shift minus1 so now I'll compare with the left position
+ minus1=insert_reg(minus1,(column)->data.CMENB[line],0);
+
+ top=minus1;
+
+ if (lpath)
+ {
+ sleft=sminus1; // I store the path length corresponding to the upper path
+ LSHIFT_SCORE(sminus1) // I shift to prepare the score coming from the left side
+ sminus1=insert_reg(sminus1,(column)->score.CMENB[line],0);
+ stop=sminus1;
+ sdiag=sminus2;
+
+ }
+
+// printf("Horz = "); printreg(minus1);
+
+ current = GET_MAX(current,minus1); // Look for the best between upper and left
+
+// printf("BstHV= "); printreg(current);
+//
+// printf("Diag = "); printreg(ADD_REG(minus2,match));
+
+ diag=minus2;
+
+ // minus2 = ; // Minus2 contains the diagonal score, so I add the match reward
+ // Diag score are setup to 0 so this one will win on the first iteration
+ current = GET_MAX(current,ADD_REG(minus2,match));
+
+ if (lpath)
+ {
+// printf("\n");
+// printf("current: ");
+// printreg(current);
+// printf("current: ");
+// printreg(SUB_REG(current,match));
+// printf("diag : ");
+// printreg(diag);
+// printf("left : ");
+// printreg(left);
+// printf("top : ");
+// printreg(top);
+
+
+ // For each of the three origins that reproduces the best score, take
+ // its path length (MAX_SCORE otherwise), keep the smallest, add one.
+ way = EQUAL_REG(SUB_REG(current,match),diag);
+ scurrent= OR_REG(AND_REG(way,sdiag),
+ ANDNOT_REG(way,maxvect));
+// printf("sdiag : ");
+// printreg(scurrent);
+ way = EQUAL_REG(current,left);
+ scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,sleft),
+ ANDNOT_REG(way,maxvect)));
+
+// printf("sleft : ");
+// printreg(scurrent);
+ way = EQUAL_REG(current,top);
+ scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,stop),
+ ANDNOT_REG(way,maxvect)));
+// printf("stop : ");
+// printreg(scurrent);
+
+ scurrent= ADD_REG(scurrent,onevect);
+
+ sminus2=sminus1;
+ sminus1=scurrent;
+ }
+// printf("line %d :Best = ",line); printreg(current);
+//
+// printf("================================\n");
+
+ minus2=minus1;
+ minus1=current;
+
+// printf("min2 = "); printreg(minus2);
+// printf("min1 = "); printreg(minus1);
+// printf("================================\n");
+
+// printf("\n");
+// printf("sdiag : ");
+// printreg(sminus2);
+// printf("scur : ");
+// printreg(scurrent);
+// printf("current: ");
+// printreg(current);
+// printf("%8s\n",seq1);
+// printf("%8s\n",seq2);
+// printf("================================\n");
+
+
+ } ///// <<<<<<<<------- End of the start of the first band
+
+
+// printf("================================\n");
+
+ // NOTE(review): line == VSIZE here, so the index below is lseq2, while
+ // the buffer was requested with length lseq2 -- confirm allocateColumn()
+ // reserves room for it.
+ (column)->data.CMENB[lseq2-VSIZE+line]=EXTRACT_REG(current,VSIZE-1);
+
+
+ if (lpath)
+ (column)->score.CMENB[lseq2-VSIZE+line]=EXTRACT_REG(scurrent,VSIZE-1);
+
+
+
+ for (band=0; band < nbands; band++)
+ {
+// SET_H_SYMBOLE(fhseq,line,h)
+// minus2 = insert_reg(minus2,0,line);
+// minus1 = insert_reg(minus1,MIN_SCORE,line); // previously 0
+// h++;
+
+ // Body of the band: `line` continues from where the previous loop left it
+ for (; line < lseq2; line++)
+ {
+// printf("Je tourne avec line= %d \n",line);
+ PUSH_V_SYMBOLE(fvseq,line)
+
+ match = EQUAL(fhseq,fvseq);
+
+// printreg(fvseq);
+// printreg(fhseq);
+// printreg(match);
+// printf("================================\n");
+
+ current = minus1;
+
+ left = minus1;
+
+ // Store the last current score in extra column
+ (column)->data.CMENB[line-VSIZE]=EXTRACT_REG(current,VSIZE-1);
+ LSHIFT_SCORE(minus1)
+ minus1=insert_reg(minus1,(column)->data.CMENB[line],0);
+
+ top = minus1;
+
+// printf("Vert = "); printreg(current);
+
+ if (lpath)
+ {
+ sleft= sminus1;
+ (column)->score.CMENB[line-VSIZE]=EXTRACT_REG(scurrent,VSIZE-1);
+ LSHIFT_SCORE(sminus1)
+ sminus1=insert_reg(sminus1,(column)->score.CMENB[line],0);
+ stop=sminus1;
+ sdiag=sminus2;
+ }
+
+// printf("line = %d --> get = %d\n",line,(column)->data.CMENB[line]);
+
+// printf("Horz = "); printreg(minus1);
+
+ current = GET_MAX(current,minus1);
+
+ diag=minus2;
+
+ current = GET_MAX(current,ADD_REG(minus2,match));
+
+ if (lpath)
+ {
+// printf("\n");
+// printf("current: ");
+// printreg(current);
+// printf("current: ");
+// printreg(SUB_REG(current,match));
+// printf("diag : ");
+// printreg(diag);
+// printf("left : ");
+// printreg(left);
+// printf("top : ");
+// printreg(top);
+
+ way = EQUAL_REG(SUB_REG(current,match),diag);
+ scurrent= OR_REG(AND_REG(way,sdiag),
+ ANDNOT_REG(way,maxvect));
+
+// printf("sdiag : ");
+// printreg(scurrent);
+
+ way = EQUAL_REG(current,left);
+ scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,sleft),
+ ANDNOT_REG(way,maxvect)));
+
+// printf("sleft : ");
+// printreg(scurrent);
+
+ way = EQUAL_REG(current,top);
+ scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,stop),
+ ANDNOT_REG(way,maxvect)));
+
+// printf("stop : ");
+// printreg(scurrent);
+
+ scurrent= ADD_REG(scurrent,onevect);
+
+ sminus2=sminus1;
+ sminus1=scurrent;
+ }
+
+ minus2=minus1;
+ minus1=current;
+
+// printf("\n");
+// printf("sdiag : ");
+// printreg(sminus2);
+// printf("scur : ");
+// printreg(scurrent);
+// printf("current: ");
+// printreg(current);
+// printf("%8s\n",seq1);
+// printf("%8s\n",seq2);
+ }
+// printf("================================\n");
+
+ // end of the band and beginning of the next one
+
+ limit=(band==(nbands-1)) ? lastband:VSIZE;
+
+ for (line = 0; line < limit; line++,h++)
+ {
+// printf("Je fini avec line= %d \n",line);
+
+ SET_H_SYMBOLE(fhseq,line,h)
+ PUSH_V_SYMBOLE(fvseq,line)
+
+
+ minus2 = insert_reg(minus2,MIN_SCORE,line);
+ minus1 = insert_reg(minus1,MIN_SCORE,line);
+ current = minus1;
+ left=minus1;
+
+ match = EQUAL(fhseq,fvseq);
+
+ if (lpath)
+ {
+ sminus2 = insert_reg(sminus2,lseq2-VSIZE+line,line);
+ sminus1 = insert_reg(sminus1,h,line);
+ sleft= sminus1;
+ }
+
+
+// printf("\n");
+// printf("fhseq = "); printreg(fhseq);
+// printf("fvseq = "); printreg(fvseq);
+// printf("----------------------------------------------------------------\n");
+// printf("match = "); printreg(match);
+
+
+ (column)->data.CMENB[lseq2-VSIZE+line]=EXTRACT_REG(current,VSIZE-1);
+ LSHIFT_SCORE(minus1)
+ minus1=insert_reg(minus1,(column)->data.CMENB[line],0);
+ top=minus1;
+
+ current = GET_MAX(current,minus1);
+
+ if (lpath)
+ {
+ (column)->score.CMENB[lseq2-VSIZE+line]=EXTRACT_REG(scurrent,VSIZE-1);
+ LSHIFT_SCORE(sminus1)
+ sminus1=insert_reg(sminus1,(column)->score.CMENB[line],0);
+ stop=sminus1;
+ sdiag=sminus2;
+
+ way = EQUAL_REG(current,minus1);
+
+ // NOTE(review): this scurrent is overwritten unconditionally in the
+ // next if (lpath) block before being read.
+ scurrent= OR_REG(AND_REG(way,sminus1),
+ ANDNOT_REG(way,scurrent));
+ }
+
+
+ diag=minus2;
+
+ current = GET_MAX(current,ADD_REG(minus2,match));
+
+ if (lpath)
+ {
+ way = EQUAL_REG(SUB_REG(current,match),diag);
+ scurrent= OR_REG(AND_REG(way,sdiag),
+ ANDNOT_REG(way,maxvect));
+
+ way = EQUAL_REG(current,left);
+ scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,sleft),
+ ANDNOT_REG(way,maxvect)));
+
+ way = EQUAL_REG(current,top);
+ scurrent= GET_MIN(scurrent,OR_REG(AND_REG(way,stop),
+ ANDNOT_REG(way,maxvect)));
+
+ scurrent= ADD_REG(scurrent,onevect);
+
+ sminus2=sminus1;
+ sminus1=scurrent;
+ }
+
+// printf("currt = "); printreg(current);
+
+ minus2=minus1;
+ minus1=current;
+
+// printf("\n");
+// printf("sdiag : ");
+// printreg(sminus2);
+// printf("scur : ");
+// printreg(scurrent);
+// printf("current: ");
+// printreg(current);
+// printf("%8s\n",seq1);
+// printf("%8s\n",seq2);
+
+// printf("Je stocke line= %d la valeur %d\n",lseq2-VSIZE+line,(column)->data.CMENB[lseq2-VSIZE+line]);
+ }
+
+ }
+
+// printf("\n");
+// printf("line = %d, h= %d, lastband = %d\n",line,h,lastband);
+// printf("currt = "); printreg(current);
+ // The result sits in lane `lastband` of the last computed register
+ lcs = extract_reg(current,lastband);
+
+ if(lpath)
+ *lpath= extract_reg(scurrent,lastband);
+// printf("lastband = %d (%d) lcs = %d\n",lastband,lseq2,lcs);
+
+ // Hand the buffer back to the caller for reuse, or release it
+ if (ppcolumn)
+ *ppcolumn=column;
+ else
+ freeColumn(column);
+
+ return lcs;
+}
+
+#else
+/* Build without SSE2: forward every call to the scalar simpleLCS(). */
+int FASTLCSSCORE(const char* seq1, const char* seq2,column_pp ppcolumn,int32_t* lpath)
+{
+ return simpleLCS(seq1,seq2,ppcolumn,lpath);
+}
+
+#endif /* __SSE2__ */
+
diff --git a/src/obitools/align/_nws.pxd b/src/obitools/align/_nws.pxd
new file mode 100644
index 0000000..9ed1e7f
--- /dev/null
+++ b/src/obitools/align/_nws.pxd
@@ -0,0 +1,10 @@
+from _dynamic cimport *
+
+# C-level declaration of the NWS aligner (implemented in _nws.pyx).
+cdef class NWS(DynamicProgramming):
+ cdef double _match # score factor applied to the matching fraction
+ cdef double _mismatch # score factor applied to the non-matching fraction
+
+ cdef double matchScore(self,int h, int v)
+ cdef double doAlignment(self) except? 0
+
+
diff --git a/src/obitools/align/_nws.pyx b/src/obitools/align/_nws.pyx
new file mode 100644
index 0000000..5c97f31
--- /dev/null
+++ b/src/obitools/align/_nws.pyx
@@ -0,0 +1,162 @@
+'''
+Created on 6 Nov. 2009
+
+@author: coissac
+'''
+#@PydevCodeAnalysisIgnore
+
+from _nws cimport *
+
+
+# Global pairwise aligner with separate gap-opening and gap-extension
+# penalties, built on DynamicProgramming.
+cdef class NWS(DynamicProgramming):
+
+ def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2):
+ DynamicProgramming.__init__(self,opengap,extgap)
+ self._match=match
+ self._mismatch=mismatch
+
+ # Score for aligning horizontal position h with vertical position v
+ # (both 1-based): the iupacPartialMatch value s of the two symbols
+ # blended as s*match + (1-s)*mismatch.
+ cdef double matchScore(self,int h, int v):
+ cdef double score
+ score = iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
+ return score * self._match + (1-score) * self._mismatch
+
+ # Fill the score/path matrix and return the score stored in the cell
+ # (_hlen(), _vlen()).
+ cdef double doAlignment(self) except? 0:
+ cdef int i # vertical index
+ cdef int j # horizontal index
+ cdef int idx
+ cdef int jump
+ cdef int delta
+ cdef double score
+ cdef double scoremax
+ cdef int path
+
+
+ if self.needToCompute:
+ self.allocate()
+ self.reset()
+
+ # First row: one gap of length j (one opening, j-1 extensions).
+ for j in range(1,self._hlen()+1):
+ idx = self.index(j,0)
+ self.matrix.matrix[idx].score = self._opengap + (self._extgap * (j-1))
+ self.matrix.matrix[idx].path = j
+
+ # First column: one gap of length i.
+ for i in range(1,self._vlen()+1):
+ idx = self.index(0,i)
+ self.matrix.matrix[idx].score = self._opengap + (self._extgap * (i-1))
+ self.matrix.matrix[idx].path = -i
+
+ for i in range(1,self._vlen()+1):
+ for j in range(1,self._hlen()+1):
+
+ # 1 - came from diagonal
+ idx = self.index(j-1,i-1)
+ # print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),
+ scoremax = self.matrix.matrix[idx].score + \
+ self.matchScore(j,i)
+ path = 0
+
+ # print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),
+
+ # 2 - open horizontal gap
+ idx = self.index(j-1,i)
+ score = self.matrix.matrix[idx].score + \
+ self._opengap
+ if score > scoremax :
+ scoremax = score
+ path = +1
+
+ # 3 - open vertical gap
+ idx = self.index(j,i-1)
+ score = self.matrix.matrix[idx].score + \
+ self._opengap
+ if score > scoremax :
+ scoremax = score
+ path = -1
+
+ # 4 - extend horizontal gap
+ # bestHJump[i] is the column where a horizontal gap was last opened
+ # on row i; a negative value means there is none.
+ jump = self.matrix.bestHJump[i]
+ if jump >= 0:
+ idx = self.index(jump,i)
+ delta = j-jump
+ score = self.matrix.matrix[idx].score + \
+ self._extgap * delta
+ if score > scoremax :
+ scoremax = score
+ path = delta+1
+
+ # 5 - extend vertical gap
+ # bestVJump[j] plays the same role for column j.
+ jump = self.matrix.bestVJump[j]
+ if jump >= 0:
+ idx = self.index(j,jump)
+ delta = i-jump
+ score = self.matrix.matrix[idx].score + \
+ self._extgap * delta
+ if score > scoremax :
+ scoremax = score
+ path = -delta-1
+
+ idx = self.index(j,i)
+ self.matrix.matrix[idx].score = scoremax
+ self.matrix.matrix[idx].path = path
+
+ # Remember freshly opened gaps so later cells can extend them.
+ if path == -1:
+ self.matrix.bestVJump[j]=i
+ elif path == +1 :
+ self.matrix.bestHJump[i]=j
+
+ self.sequenceChanged=False
+ self.scoreChanged=False
+
+ idx = self.index(self._hlen(),self._vlen())
+ return self.matrix.matrix[idx].score
+
+ # Walk the path matrix from the last cell back to (0,0) and record each
+ # step in self.path (0 = diagonal, p<0 = vertical jump of -p,
+ # p>0 = horizontal jump of p). Steps are stored end-to-start.
+ cdef void backtrack(self):
+ #cdef list path=[]
+ cdef int i
+ cdef int j
+ cdef int p
+
+ self.doAlignment()
+ i=self._vlen()
+ j=self._hlen()
+ self.path=allocatePath(i,j,self.path)
+
+ while (i or j):
+ p=self.matrix.matrix[self.index(j,i)].path
+ self.path.path[self.path.length]=p
+ self.path.length+=1
+ #path.append(p)
+ if p==0:
+ i-=1
+ j-=1
+ elif p < 0:
+ i+=p
+ else:
+ j-=p
+
+ #path.reverse()
+ #reversePath(self.path)
+ self.path.hStart=0
+ self.path.vStart=0
+
+ #return 0,0,path
+
+ # Match factor; assigning it flags the scores as changed.
+ property match:
+ def __get__(self):
+ return self._match
+
+ def __set__(self,match):
+ self._match=match
+ self.scoreChanged=True
+
+ # Mismatch factor; assigning it flags the scores as changed.
+ property mismatch:
+ def __get__(self):
+ return self._mismatch
+
+ def __set__(self,mismatch):
+ self._mismatch=mismatch
+ self.scoreChanged=True
+
+
+
+
diff --git a/src/obitools/align/_nwsdnabyprot.pxd b/src/obitools/align/_nwsdnabyprot.pxd
new file mode 100644
index 0000000..18987e9
--- /dev/null
+++ b/src/obitools/align/_nwsdnabyprot.pxd
@@ -0,0 +1,36 @@
+from _dynamic cimport *
+
+# One cell of the codon-aware alignment matrix.
+cdef struct CodonAlignCell :
+ double score # best score reaching this cell
+ int path # 0 = diagonal, >0 horizontal jump, <0 vertical jump
+ int frame # reading frame recorded for this cell
+
+# Alignment matrix plus its gap-jump tables and allocated sizes.
+cdef struct CodonAlignMatrix :
+ CodonAlignCell* matrix
+ int* bestVJump # hsize entries
+ int* bestHJump # vsize entries
+ int msize # number of cells allocated in `matrix`
+ int vsize
+ int hsize
+
+cdef CodonAlignMatrix* allocateCodonMatrix(int hsize, int vsize,CodonAlignMatrix *matrix=?)
+cdef void freeCodonMatrix(CodonAlignMatrix* matrix)
+cdef void resetCodonMatrix(CodonAlignMatrix* matrix)
+
+# Mean iupacPartialMatch over the three positions of two codons.
+cdef double iupacPartialCodonMatch(char[3] c1, char[3] c2)
+
+# C-level declaration of the codon-aware aligner (see _nwsdnabyprot.pyx).
+cdef class NWSDNAByProt(DynamicProgramming):
+ cdef double _match
+ cdef double _mismatch
+ cdef int _sframe # starting frame given to the constructor
+ cdef object _gc # codon -> amino-acid lookup, indexed with []
+
+ cdef void getPossibleCodon(self,char[3] codon,int h,int v,int frame)
+ cdef double aaScore(self,char aa1,char aa2)
+ cdef double matchScore(self,int h, int v, int qframe)
+ cdef double doAlignment(self) except? 0
+ cdef void reset(self)
+ cdef int allocate(self) except -1
+ cdef void clean(self)
+
+
diff --git a/src/obitools/align/_nwsdnabyprot.pyx b/src/obitools/align/_nwsdnabyprot.pyx
new file mode 100644
index 0000000..918a1ce
--- /dev/null
+++ b/src/obitools/align/_nwsdnabyprot.pyx
@@ -0,0 +1,516 @@
+'''
+Created on 6 Nov. 2009
+
+@author: coissac
+'''
+#@PydevCodeAnalysisIgnore
+
+import sys
+
+from _nwsdnabyprot cimport *
+
+from obitools.sequenceencoder.geneticcode import TranslationEncoder
+from obitools.translate import GeneticCode
+from obitools import BioSequence
+from obitools.alignment import AlignedSequence
+from obitools.alignment import Alignment
+
+
+
+# Return a CodonAlignMatrix able to hold (hsize+1) x (vsize+1) cells.
+# With matrix == NULL a new structure is created; otherwise the given one
+# is reused and its buffers are only grown, never shrunk.
+# NOTE(review): the results of malloc/realloc are not checked for NULL.
+cdef CodonAlignMatrix* allocateCodonMatrix(int hsize, int vsize,CodonAlignMatrix *matrix=NULL):
+
+ # One extra row and column for the empty-prefix border.
+ vsize+=1
+ hsize+=1
+
+ if matrix is NULL:
+ matrix = <CodonAlignMatrix*>malloc(sizeof(CodonAlignMatrix))
+ matrix.vsize=0
+ matrix.hsize=0
+ matrix.msize=0
+ matrix.matrix=NULL
+ matrix.bestVJump=NULL
+ matrix.bestHJump=NULL
+
+ if hsize > matrix.hsize:
+ matrix.bestVJump = <int*>realloc(matrix.bestVJump,hsize * sizeof(int))
+ matrix.hsize=hsize
+
+ if vsize > matrix.vsize:
+ matrix.bestHJump = <int*>realloc(matrix.bestHJump,vsize * sizeof(int))
+ matrix.vsize=vsize
+
+ if (hsize * vsize) > matrix.msize:
+ matrix.msize = hsize * vsize
+ matrix.matrix = <CodonAlignCell*>realloc(matrix.matrix, matrix.msize * sizeof(CodonAlignCell))
+
+ return matrix
+
+# Release a CodonAlignMatrix and the three buffers it owns.
+# A NULL matrix is accepted.
+cdef void freeCodonMatrix(CodonAlignMatrix* matrix):
+ if matrix is not NULL:
+ if matrix.matrix is not NULL:
+ free(matrix.matrix)
+ if matrix.bestVJump is not NULL:
+ free(matrix.bestVJump)
+ if matrix.bestHJump is not NULL:
+ free(matrix.bestHJump)
+ free(matrix)
+
+# Clear a CodonAlignMatrix before a new alignment: cells are zeroed and
+# both jump tables are filled with 0xFF bytes, so every int entry reads as
+# negative, which doAlignment treats as "no jump" (its `jump >= 0` test).
+cdef void resetCodonMatrix(CodonAlignMatrix* matrix):
+ if matrix is not NULL:
+ if matrix.matrix is not NULL:
+ bzero(<void*>matrix.matrix, matrix.msize * sizeof(CodonAlignCell))
+ if matrix.bestHJump is not NULL:
+ memset(<void*>matrix.bestHJump,255,matrix.vsize * sizeof(int))
+ if matrix.bestVJump is not NULL:
+ memset(<void*>matrix.bestVJump,255,matrix.hsize * sizeof(int))
+
+
+# Mean of iupacPartialMatch over the three positions of codons c1 and c2.
+cdef double iupacPartialCodonMatch(char[3] c1, char[3] c2):
+ return (iupacPartialMatch(c1[0],c2[0]) +
+ iupacPartialMatch(c1[1],c2[1]) +
+ iupacPartialMatch(c1[2],c2[2])) / 3.0
+
+cdef class NWSDNAByProt(DynamicProgramming):
+
+ # Build a codon-aware aligner.
+ #
+ # match, mismatch - nucleotide / amino-acid match and mismatch scores
+ # opengap, extgap - gap penalties forwarded to DynamicProgramming
+ # geneticCode     - codon -> amino-acid lookup indexed with []; a
+ #                   TranslationEncoder() is created when it is None
+ # startingFrame   - frame of the first position of the horizontal sequence
+ def __init__(self,match=4,
+ mismatch=-6,
+ opengap=-8,
+ extgap=-2,
+ geneticCode=None,
+ startingFrame=0):
+ DynamicProgramming.__init__(self,opengap,extgap)
+ self._match=match
+ self._mismatch=mismatch
+
+ if geneticCode is None:
+ self._gc = TranslationEncoder()
+ else:
+ # Keep the caller's object (the original stored the GeneticCode class).
+ self._gc = geneticCode
+
+ self._sframe = startingFrame
+
+ cdef double aaScore(self,char aa1,char aa2):
+     # Amino-acid level score: 'X' is a wildcard matching anything, so a
+     # mismatch needs two different residues, neither of them 'X'.
+     if aa1!=aa2 and aa1!='X' and aa2!='X':
+         return self._mismatch
+     return self._match
+
+
+
+ # Fill `codon` with the three vertical-sequence symbols forming the codon
+ # that contains vertical position v (1-based) when v sits at codon
+ # position `frame` (0, 1 or 2). Positions falling outside the sequence
+ # are written as '*'. For frames 1 and 2 the preceding symbols are
+ # looked up by following the path already stored in the matrix from
+ # cell (h,v).
+ cdef void getPossibleCodon(self,char[3] codon,int h, int v,int frame):
+ cdef CodonAlignMatrix* matrix
+ cdef CodonAlignCell* smatrix
+ cdef int path
+ cdef int vv
+
+ matrix = <CodonAlignMatrix*>self.matrix
+ smatrix= matrix.matrix
+ path = smatrix[self.index(h,v)].path
+
+
+ # Frame 0: v is the first base; take the next two when they exist.
+ if frame == 0:
+ codon[0]=self.vSeq.sequence[v-1]
+ if v < (self.vSeq.length):
+ codon[1]=self.vSeq.sequence[v]
+ else:
+ codon[1]='*'
+ if v < (self.vSeq.length-1):
+ codon[2]=self.vSeq.sequence[v+1]
+ else:
+ codon[2]='*'
+
+ # Frame 1: v is the middle base; the first base comes from the path.
+ elif frame==1 :
+ vv=v
+ if v>1:
+ if path==0:
+ vv-=1
+ while(path!=0):
+ if path < 0:
+ vv+=path
+ else:
+ h-=path
+ path = smatrix[self.index(h,vv)].path
+ codon[0]=self.vSeq.sequence[vv-1]
+ else:
+ codon[0]='*'
+ codon[1]=self.vSeq.sequence[v-1]
+ if v < (self.vSeq.length):
+ codon[2]=self.vSeq.sequence[v]
+ else:
+ codon[2]='*'
+ # Frame 2: v is the last base; both previous bases come from the path.
+ else:
+ vv=v
+ if v>1:
+ if path==0:
+ vv-=1
+ while(path!=0):
+ if path < 0:
+ vv+=path
+ else:
+ h-=path
+ path = smatrix[self.index(h,vv)].path
+ codon[1]=self.vSeq.sequence[vv-1]
+ vv-=1
+ h-=1
+ path = smatrix[self.index(h,vv)].path
+ else:
+ codon[1]='*'
+ if v>2:
+ if path==0:
+ vv-=1
+ while(path!=0):
+ if path < 0:
+ vv+=path
+ else:
+ h-=path
+ path = smatrix[self.index(h,vv)].path
+ # NOTE(review): the next two lines both assign codon[0], so the
+ # path-derived value is immediately replaced by the fixed offset
+ # v-3. Confirm which of the two is intended.
+ codon[0]=self.vSeq.sequence[vv-1]
+ codon[0]=self.vSeq.sequence[v-3]
+ else:
+ codon[0]='*'
+ codon[2]=self.vSeq.sequence[v-1]
+
+
+ # Score for aligning horizontal position h with vertical position v
+ # (both 1-based) when v is taken at codon position `qframe`.
+ # The result is the nucleotide score (iupacPartialMatch blended with
+ # match/mismatch) plus aaScore() of the two translated codons.
+ cdef double matchScore(self,int h, int v, int qframe):
+ cdef double score
+ cdef int frame
+ cdef char[3] codon
+ cdef char[3] qcodon
+ cdef char aa
+ cdef char qaa
+
+ # Codon position of h in the reference, from the starting frame.
+ frame=((h - 1 + self._sframe) % 3)
+
+ # extract reference codon
+
+ if frame==0:
+ codon[0]=self.hSeq.sequence[h-1]
+ if h < (self.hSeq.length):
+ codon[1]=self.hSeq.sequence[h]
+ else:
+ codon[1]='*'
+ if h < (self.hSeq.length-1):
+ codon[2]=self.hSeq.sequence[h+1]
+ else:
+ codon[2]='*'
+ elif frame==1 :
+ if h>1:
+ codon[0]=self.hSeq.sequence[h-2]
+ else:
+ codon[0]='*'
+ codon[1]=self.hSeq.sequence[h-1]
+ if h < (self.hSeq.length):
+ codon[2]=self.hSeq.sequence[h]
+ else:
+ codon[2]='*'
+ else:
+ if h>2:
+ codon[0]=self.hSeq.sequence[h-3]
+ else:
+ codon[0]='*'
+ if h>1:
+ codon[1]=self.hSeq.sequence[h-2]
+ else:
+ codon[1]='*'
+ codon[2]=self.hSeq.sequence[h-1]
+
+ # The codon buffers hold exactly 3 chars and no NUL terminator, so they
+ # are converted with an explicit length instead of as C strings.
+ aa=ord(self._gc[str(codon[:3])])
+
+ self.getPossibleCodon(qcodon,h,v,qframe)
+ qaa=ord(self._gc[str(qcodon[:3])])
+ score = iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
+ score = self._match * score + self._mismatch * (1-score) + self.aaScore(aa,qaa)
+
+# print >>sys.stderr, h,frame,chr(aa),chr(codon[0])+chr(codon[1])+chr(codon[2]),
+# print >>sys.stderr, chr(qaa),chr(qcodon[0])+chr(qcodon[1])+chr(qcodon[2]), score
+
+ return score
+
+ # Fill the codon-aware score/path/frame matrix and return the score
+ # stored in the cell (hSeq.length, vSeq.length).
+ # Each diagonal move is tried in the three query frames; a frame other
+ # than the one following the previous cell's frame costs `fscost`.
+ cdef double doAlignment(self) except? 0:
+ cdef int i # vertical index
+ cdef int j # horizontal index
+ cdef int idx
+ cdef int jump
+ cdef int delta
+ cdef double score
+ cdef double scoremax
+ cdef int path
+ cdef int frame
+ cdef bint sframe
+ cdef int fframe
+ cdef CodonAlignMatrix* matrix
+ cdef CodonAlignCell* smatrix
+
+ # Frame-shift penalty. Typed as a C int so the innermost loop does not
+ # go through Python object arithmetic.
+ cdef int fscost=-10
+
+
+ if self.needToCompute:
+ self.allocate()
+ self.reset()
+
+ matrix = <CodonAlignMatrix*>self.matrix
+ smatrix= matrix.matrix
+ smatrix[0].frame=(self._sframe-1) % 3
+
+ # First row: one gap of length j, frame inherited from the origin.
+ for j in range(1,self.hSeq.length+1):
+ idx = self.index(j,0)
+ smatrix[idx].score = self._opengap + (self._extgap * (j-1))
+ smatrix[idx].path = j
+ smatrix[idx].frame = smatrix[0].frame
+
+ # First column: one gap of length i, frame inherited from the origin.
+ for i in range(1,self.vSeq.length+1):
+ idx = self.index(0,i)
+ smatrix[idx].score = self._opengap + (self._extgap * (i-1))
+ smatrix[idx].path = -i
+ smatrix[idx].frame = smatrix[0].frame
+
+ for i in range(1,self.vSeq.length+1):
+ for j in range(1,self.hSeq.length+1):
+
+ # 1 - came from diagonal
+ idx = self.index(j-1,i-1)
+ # fframe is the frame that follows the diagonal predecessor's frame.
+ fframe=smatrix[idx].frame
+ fframe=(fframe + 1) % 3
+ # print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),
+ scoremax = smatrix[idx].score + \
+ self.matchScore(j,i,0) + \
+ (fframe > -1 and fframe != 0) * fscost
+ path = 0
+ frame= 0
+
+ # On a tie, the frame equal to fframe wins.
+ score = smatrix[idx].score + \
+ self.matchScore(j,i,1) + \
+ (fframe > -1 and fframe != 1) * fscost
+ if score > scoremax or (fframe==1 and score==scoremax):
+ scoremax = score
+ frame = 1
+
+ score = smatrix[idx].score + \
+ self.matchScore(j,i,2) + \
+ (fframe > -1 and fframe != 2) * fscost
+ if score > scoremax or (fframe==2 and score==scoremax) :
+ scoremax = score
+ frame = 2
+
+ # print >>sys.stderr,j,i,frame,scoremax
+ # print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),
+
+ # 2 - open horizontal gap
+ idx = self.index(j-1,i)
+ score = smatrix[idx].score + \
+ self._opengap
+ if score > scoremax :
+ scoremax = score
+ path = +1
+ frame= smatrix[idx].frame
+
+ # 3 - open vertical gap
+ idx = self.index(j,i-1)
+ score = smatrix[idx].score + \
+ self._opengap
+ if score > scoremax :
+ scoremax = score
+ path = -1
+ frame= smatrix[idx].frame
+
+ # 4 - extend horizontal gap
+ jump = matrix.bestHJump[i]
+ if jump >= 0:
+ idx = self.index(jump,i)
+ delta = j-jump
+ score = smatrix[idx].score + \
+ self._extgap * delta
+ if score > scoremax :
+ scoremax = score
+ path = delta+1
+ frame= smatrix[idx].frame
+
+ # 5 - extend vertical gap
+ jump = matrix.bestVJump[j]
+ if jump >= 0:
+ idx = self.index(j,jump)
+ delta = i-jump
+ score = smatrix[idx].score + \
+ self._extgap * delta
+ if score > scoremax :
+ scoremax = score
+ path = -delta-1
+ frame= smatrix[idx].frame
+
+ idx = self.index(j,i)
+ smatrix[idx].score = scoremax
+ smatrix[idx].path = path
+ smatrix[idx].frame = frame
+
+ # Remember freshly opened gaps so later cells can extend them.
+ if path == -1:
+ matrix.bestVJump[j]=i
+ elif path == +1 :
+ matrix.bestHJump[i]=j
+
+ self.sequenceChanged=False
+ self.scoreChanged=False
+
+ idx = self.index(self.hSeq.length,self.vSeq.length)
+ return smatrix[idx].score
+
+ # Walk the path matrix from the last cell back to (0,0) and record each
+ # step in self.path (0 = diagonal, p<0 = vertical jump of -p,
+ # p>0 = horizontal jump of p). Steps are stored end-to-start.
+ cdef void backtrack(self):
+ #cdef list path=[]
+ cdef int i
+ cdef int j
+ cdef int p
+ cdef CodonAlignMatrix* matrix
+ cdef CodonAlignCell* smatrix
+
+
+ self.doAlignment()
+
+ matrix = <CodonAlignMatrix*>self.matrix
+ smatrix= matrix.matrix
+
+ i=self.vSeq.length
+ j=self.hSeq.length
+ self.path=allocatePath(i,j,self.path)
+
+ while (i or j):
+ p=smatrix[self.index(j,i)].path
+ self.path.path[self.path.length]=p
+ self.path.length+=1
+ #path.append(p)
+ if p==0:
+ i-=1
+ j-=1
+ elif p < 0:
+ i+=p
+ else:
+ j-=p
+
+ #path.reverse()
+ #reversePath(self.path)
+ self.path.hStart=0
+ self.path.vStart=0
+
+ #return 0,0,path
+
+ # Match score; assigning it flags the scores as changed.
+ property match:
+ def __get__(self):
+ return self._match
+
+ def __set__(self,match):
+ self._match=match
+ self.scoreChanged=True
+
+ # Mismatch score; assigning it flags the scores as changed.
+ property mismatch:
+ def __get__(self):
+ return self._mismatch
+
+ def __set__(self,mismatch):
+ self._mismatch=mismatch
+ self.scoreChanged=True
+
+ # Make self.matrix a CodonAlignMatrix large enough for the two current
+ # sequences (stored through an AlignMatrix* cast). Returns 0; the
+ # assertions fail when either sequence is unset.
+ cdef int allocate(self) except -1:
+
+ assert self.horizontalSeq is not None,'Sequence A must be set'
+ assert self.verticalSeq is not None,'Sequence B must be set'
+
+ cdef long lenH=self.hSeq.length
+ cdef long lenV=self.vSeq.length
+
+ self.matrix=<AlignMatrix*>allocateCodonMatrix(lenH,lenV,<CodonAlignMatrix*>self.matrix)
+ return 0
+
+ # Flag the scores as changed and clear the codon matrix.
+ cdef void reset(self):
+ self.scoreChanged=True
+ resetCodonMatrix(<CodonAlignMatrix*>self.matrix)
+
+ # Release the C-level resources held by this aligner: the codon matrix,
+ # both encoded sequences and the path buffer.
+ cdef void clean(self):
+ freeCodonMatrix(<CodonAlignMatrix*>self.matrix)
+ freeSequence(self.hSeq)
+ freeSequence(self.vSeq)
+ freePath(self.path)
+
+ # Run the alignment and return it as an Alignment holding the horizontal
+ # then the vertical sequence (as AlignedSequence objects with their gap
+ # lists), with `score` set and the per-position frames stored under
+ # ali[1]['frame'].
+ # NOTE(review): `score` is only assigned after the _needToCompute() test,
+ # and the original nesting cannot be seen in this listing; confirm that
+ # `ali.score=score` is never reached without that assignment.
+ def __call__(self):
+ cdef list hgaps=[]
+ cdef list vgaps=[]
+ cdef list vframe=[]
+ cdef list b
+ cdef int hp=0
+ cdef int vp=0
+ cdef int lenh=0
+ cdef int lenv=0
+ cdef int h,v,p
+ cdef int i
+ cdef object ali
+ cdef double score
+ cdef CodonAlignMatrix* matrix
+ cdef CodonAlignCell* smatrix
+
+
+ if self._needToCompute():
+ score = self.doAlignment()
+ self.backtrack()
+
+ h=self.path.hStart
+ v=self.path.vStart
+ matrix = <CodonAlignMatrix*>self.matrix
+ smatrix= matrix.matrix
+
+
+ # The path is stored end-to-start, so replay it backwards.
+ for i in range(self.path.length-1,-1,-1):
+ p=self.path.path[i]
+ if p==0:
+ # Diagonal step: one symbol consumed on each sequence.
+ hp+=1
+ vp+=1
+ lenh+=1
+ lenv+=1
+ v+=1
+ h+=1
+ vframe.append(smatrix[self.index(h,v)].frame)
+ elif p>0:
+ # Horizontal jump of p: gap of length p in the vertical sequence.
+ hp+=p
+ lenh+=p
+ vgaps.append([vp,p])
+ vp=0
+ h+=p
+ #vframe.extend(['*']*p)
+ else:
+ # Vertical jump of -p: gap of length -p in the horizontal sequence.
+ vp-=p
+ lenv-=p
+ hgaps.append([hp,-p])
+ hp=0
+ v-=p
+ vframe.extend([smatrix[self.index(h,v)].frame]*-p)
+
+ # Trailing ungapped run, recorded with a gap length of 0.
+ if hp:
+ hgaps.append([hp,0])
+ if vp:
+ vgaps.append([vp,0])
+
+ if lenh < self.hSeq.length:
+ hseq=self.horizontalSeq[self.path.hStart:self.path.hStart+lenh]
+ else:
+ hseq=self.horizontalSeq
+
+ hseq=AlignedSequence(hseq)
+ hseq.gaps=hgaps
+
+ if lenv < self.vSeq.length:
+ vseq=self.verticalSeq[self.path.vStart:self.path.vStart+lenv]
+ else:
+ vseq=self.verticalSeq
+
+ vseq=AlignedSequence(vseq)
+ vseq.gaps=vgaps
+
+ ali=Alignment()
+ ali.append(hseq)
+ ali.append(vseq)
+
+ ali.score=score
+ self.alignment=ali
+ # Hand out a clone so the cached alignment is not modified by callers.
+ ali=self.alignment.clone()
+ ali[1]['frame']=vframe
+ ali.score=self.alignment.score
+ return ali
+
+
+
diff --git a/src/obitools/align/_profilenws.pxd b/src/obitools/align/_profilenws.pxd
new file mode 100644
index 0000000..c1a779d
--- /dev/null
+++ b/src/obitools/align/_profilenws.pxd
@@ -0,0 +1,23 @@
+from _nws cimport *
+from obitools.profile._profile cimport *
+
+cdef struct alignProfile:
+ long length
+ long buffsize
+ double* frequency
+
+cdef alignProfile* allocateProfile(object profile, alignProfile* prof=?)
+
+cdef void freeProfile(alignProfile* prof)
+
+cdef class ProfileNWS(NWS):
+
+ cdef alignProfile* hProf
+ cdef alignProfile* vProf
+
+ cdef double matchScore(self,int h, int v)
+ cdef void clean(self)
+
+ cdef int _vlen(self)
+ cdef int _hlen(self)
+ cdef double doAlignment(self) except? 0
\ No newline at end of file
diff --git a/src/obitools/align/_profilenws.pyx b/src/obitools/align/_profilenws.pyx
new file mode 100644
index 0000000..2380aa4
--- /dev/null
+++ b/src/obitools/align/_profilenws.pyx
@@ -0,0 +1,211 @@
+'''
+Created on 01 Feb. 2011
+
+@author: celine
+'''
+#@PydevCodeAnalysisIgnore
+
+from _profilenws cimport *
+from obitools.profile._profile import DNAProfile
+
+
+# Fill a C alignProfile from a Python profile object, allocating the
+# structure and growing its frequency buffer when needed.
+#
+# profile: object supporting len() and the per-position accessors fA, fC,
+#          fG, fT, fOg and fEg.
+# prof:    structure to reuse, or NULL to malloc a fresh one.
+# Returns the filled structure (the pointer passed in when prof was not NULL).
+#
+# prof.frequency is one flat array holding six consecutive runs of
+# prof.length doubles, in the order fA, fC, fG, fT, fOg, fEg.
+cdef alignProfile* allocateProfile(object profile, alignProfile* prof=NULL):
+ # NOTE: j is declared but never used in this function.
+ cdef int i,j
+ cdef int buffsize
+ cdef double* freq
+
+ if prof is NULL:
+ # NOTE(review): the malloc result is used without a NULL check.
+ prof = <alignProfile*>malloc(sizeof(alignProfile))
+ prof.length=0
+ prof.buffsize=0
+ prof.frequency=NULL
+
+ prof.length=len(profile)
+ # buffsize is expressed in bytes; the buffer is only ever grown.
+ buffsize = 6 * prof.length * sizeof(double)
+ if buffsize > prof.buffsize:
+ # NOTE(review): the realloc result is stored without a NULL check.
+ prof.frequency = <double*>realloc(prof.frequency,buffsize)
+ prof.buffsize = buffsize
+
+ freq = prof.frequency
+ # Track k of position i lives at freq[i + k*prof.length].
+ for i in range(prof.length):
+ freq[i] = profile.fA(i)
+ freq[i+prof.length] = profile.fC(i)
+ freq[i+prof.length*2]= profile.fG(i)
+ freq[i+prof.length*3]= profile.fT(i)
+ freq[i+prof.length*4]= profile.fOg(i)
+ freq[i+prof.length*5]= profile.fEg(i)
+
+ return prof
+
+cdef void freeProfile(alignProfile* prof):
+ if prof is not NULL:
+ if prof.frequency is not NULL:
+ free(<void*>prof.frequency)
+ free(prof)
+
+cdef class ProfileNWS(NWS):
+
+ def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2):
+ DynamicProgramming.__init__(self,opengap,extgap)
+ self._match=match
+ self._mismatch=mismatch
+ self.hProf=NULL
+ self.vProf=NULL
+
+ # Score the pairing of column h of the horizontal profile with column v
+ # of the vertical profile; h and v are 1-based (both are decremented
+ # before indexing).
+ cdef double matchScore(self,int h, int v):
+ cdef double pmatch
+ cdef double* hp = self.hProf.frequency
+ cdef double* vp = self.vProf.frequency
+ cdef int hl = self.hProf.length
+ cdef int vl = self.vProf.length
+
+ h-=1
+ v-=1
+ # Sum, over the first four frequency tracks (filled from fA, fC, fG and
+ # fT by allocateProfile), of the product of the two columns' values.
+ # The two gap tracks (offsets 4 and 5) are not used here.
+ pmatch = hp[h]*vp[v] + hp[h+hl]*vp[v+vl] + hp[h+2*hl]*vp[v+2*vl] + hp[h+3*hl]*vp[v+3*vl]
+ # Linear blend of the match and mismatch scores weighted by pmatch.
+ # Presumably the tracks are normalised frequencies so that pmatch lies
+ # in [0,1] -- TODO confirm against DNAProfile.
+ return self._match * pmatch + (1-pmatch) * self._mismatch
+
+ cdef int _vlen(self):
+ return self.vProf.length
+
+ cdef int _hlen(self):
+ return self.hProf.length
+
+
+ property seqA:
+ def __get__(self):
+ return self.horizontalSeq
+
+ def __set__(self, seq):
+ self.sequenceChanged=True
+ if not isinstance(seq,DNAProfile):
+ seq=DNAProfile(seq)
+ self.horizontalSeq=seq
+ self.hProf=allocateProfile(seq,self.hProf)
+
+ property seqB:
+ def __get__(self):
+ return self.verticalSeq
+
+ def __set__(self, seq):
+ self.sequenceChanged=True
+ if not isinstance(seq,DNAProfile):
+ seq=DNAProfile(seq)
+ self.verticalSeq=seq
+ self.vProf=allocateProfile(seq,self.vProf)
+
+ cdef void clean(self):
+ freeProfile(self.hProf)
+ freeProfile(self.vProf)
+ freeMatrix(self.matrix)
+ freePath(self.path)
+
+
+ def __call__(self):
+ cdef list hgaps=[]
+ cdef list vgaps=[]
+ cdef list b
+ cdef int hp
+ cdef int vp
+ cdef int rp
+ cdef int lenh=0
+ cdef int lenv=0
+ cdef int h,v,p
+ cdef int i
+ cdef object ali
+ cdef double score
+ cdef DNAProfile newProfile
+ cdef DNAProfile horizontalSeq=self.horizontalSeq
+ cdef DNAProfile verticalSeq=self.verticalSeq
+
+ if self._needToCompute():
+
+ score = self.doAlignment()
+ self.backtrack()
+
+ sum = 0
+ for p in xrange(self.path.length) :
+ v = self.path.path[p]
+ if v == 0 :
+ sum += 1
+ else :
+ sum += abs(v)
+
+ newProfile = DNAProfile(size=sum)
+ newProfile.profile.weight = horizontalSeq.profile.weight+verticalSeq.profile.weight
+
+ hp=horizontalSeq.profile.length-1
+ vp=verticalSeq.profile.length-1
+ rp=newProfile.profile.length-1
+
+ for i in range(self.path.length):
+ p=self.path.path[i]
+
+ for i in range(self.path.length):
+ p=self.path.path[i]
+
+ if p==0:
+
+ newProfile.A[rp] = horizontalSeq.A[hp] + verticalSeq.A[vp]
+ newProfile.C[rp] = horizontalSeq.C[hp] + verticalSeq.C[vp]
+ newProfile.G[rp] = horizontalSeq.G[hp] + verticalSeq.G[vp]
+ newProfile.T[rp] = horizontalSeq.T[hp] + verticalSeq.T[vp]
+ newProfile.Og[rp] = horizontalSeq.Og[hp] + verticalSeq.Og[vp]
+ newProfile.Eg[rp] = horizontalSeq.Eg[hp] + verticalSeq.Eg[vp]
+
+ hp-=1
+ vp-=1
+ rp-=1
+
+ elif p>0:
+
+ for x in xrange(abs(p)-1) :
+
+ newProfile.A[rp] = horizontalSeq.A[hp]
+ newProfile.C[rp] = horizontalSeq.C[hp]
+ newProfile.G[rp] = horizontalSeq.G[hp]
+ newProfile.T[rp] = horizontalSeq.T[hp]
+ newProfile.Og[rp] = horizontalSeq.Og[hp]
+ newProfile.Eg[rp] = horizontalSeq.Eg[hp] + verticalSeq.profile.weight
+
+ hp-=1
+ rp-=1
+
+ newProfile.A[rp] = horizontalSeq.A[hp]
+ newProfile.C[rp] = horizontalSeq.C[hp]
+ newProfile.G[rp] = horizontalSeq.G[hp]
+ newProfile.T[rp] = horizontalSeq.T[hp]
+ newProfile.Og[rp] = horizontalSeq.Og[hp] + verticalSeq.profile.weight
+ newProfile.Eg[rp] = horizontalSeq.Eg[hp]
+
+ hp-=1
+ rp-=1
+
+ else:
+
+ for x in xrange(abs(p)-1) :
+
+ newProfile.A[rp] = verticalSeq.A[vp]
+ newProfile.C[rp] = verticalSeq.C[vp]
+ newProfile.G[rp] = verticalSeq.G[vp]
+ newProfile.T[rp] = verticalSeq.T[vp]
+ newProfile.Og[rp] = verticalSeq.Og[vp]
+ newProfile.Eg[rp] = verticalSeq.Eg[vp] + horizontalSeq.profile.weight
+
+ vp-=1
+ rp-=1
+
+ newProfile.A[rp] = verticalSeq.A[vp]
+ newProfile.C[rp] = verticalSeq.C[vp]
+ newProfile.G[rp] = verticalSeq.G[vp]
+ newProfile.T[rp] = verticalSeq.T[vp]
+ newProfile.Og[rp] = verticalSeq.Og[vp] + horizontalSeq.profile.weight
+ newProfile.Eg[rp] = verticalSeq.Eg[vp]
+
+ vp-=1
+ rp-=1
+
+ self.alignment = newProfile
+
+ ali=DNAProfile(self.alignment)
+
+ return ali
diff --git a/src/obitools/align/_qsassemble.pyx b/src/obitools/align/_qsassemble.pyx
new file mode 100644
index 0000000..aa5d625
--- /dev/null
+++ b/src/obitools/align/_qsassemble.pyx
@@ -0,0 +1,89 @@
+#@PydevCodeAnalysisIgnore
+'''
+Created on 6 Nov. 2009
+
+@author: coissac
+'''
+
+from _dynamic cimport *
+from _assemble cimport DirectAssemble
+
+
+cdef class QSolexaDirectAssemble(DirectAssemble):
+
+ cdef double* hError
+ cdef double* vError
+
+ def __init__(self,match=4,mismatch=-4,opengap=-8,extgap=-2):
+ """
+ Relationship between the match and mismatch scores:
+ if mismatch = - match / 3
+ then, when the score tends towards 0 and no decision is possible,
+ there is no penalty (s'=0)
+ if mismatch < - match / 3 the undecidability is penalised.
+ """
+ DirectAssemble.__init__(self,match,mismatch,opengap,extgap)
+
+ cdef double matchScore(self,int h, int v):
+ cdef double score
+ cdef double smatch
+ cdef double smismatch
+ cdef double hok=1-self.hError[h-1]
+ cdef double vok=1-self.vError[v-1]
+ score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
+ smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3
+ smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9
+ return smatch * score + smismatch * (1. - score)
+
+ property seqA:
+     # Horizontal sequence of the alignment. The assigned object must have
+     # a `quality` attribute; the address reported by its buffer_info() is
+     # kept in hError and read as doubles by matchScore.
+     def __get__(self):
+         return self.horizontalSeq
+
+     def __set__(self, seq):
+         # The declaration used to be spelled `oaddresse`, which left the
+         # name actually assigned below undeclared.
+         cdef object oaddress,olength
+         assert hasattr(seq, "quality"),"You must use sequence with quality indices"
+         self.sequenceChanged=True
+         self.horizontalSeq=seq
+         self.hSeq=allocateSequence(self.horizontalSeq,self.hSeq)
+         (oaddress,olength)=seq.quality.buffer_info()
+         # Cast through size_t: `unsigned long` is narrower than a pointer
+         # on LLP64 platforms and would truncate the address there.
+         # NOTE(review): hError borrows the buffer behind seq.quality and is
+         # valid only while that buffer is alive and not reallocated.
+         self.hError=<double*><size_t>oaddress
+
+ property seqB:
+     # Vertical sequence of the alignment. The assigned object must have a
+     # `quality` attribute; the address reported by its buffer_info() is
+     # kept in vError and read as doubles by matchScore.
+     def __get__(self):
+         return self.verticalSeq
+
+     def __set__(self, seq):
+         # The declaration used to be spelled `oaddresse`, which left the
+         # name actually assigned below undeclared.
+         cdef object oaddress,olength
+         assert hasattr(seq, "quality"),"You must use sequence with quality indices"
+         self.sequenceChanged=True
+         self.verticalSeq=seq
+         self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
+         (oaddress,olength)=seq.quality.buffer_info()
+         # Cast through size_t: `unsigned long` is narrower than a pointer
+         # on LLP64 platforms and would truncate the address there.
+         # NOTE(review): vError borrows the buffer behind seq.quality and is
+         # valid only while that buffer is alive and not reallocated.
+         self.vError=<double*><size_t>oaddress
+
+
+cdef class QSolexaReverseAssemble(QSolexaDirectAssemble):
+
+ cdef double matchScore(self,int h, int v):
+ cdef double score
+ cdef double smatch
+ cdef double smismatch
+ cdef double hok=1-self.hError[h-1]
+ cdef double vok=1-self.vError[self.vSeq.length - v]
+ score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
+ smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3
+ smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9
+ return smatch * score + smismatch * (1. - score)
+
+ property seqB:
+     # Vertical sequence, stored as the complement of the assigned object;
+     # the getter returns the `wrapped` attribute of that complement.
+     def __get__(self):
+         return self.verticalSeq.wrapped
+
+     def __set__(self, seq):
+         # The declaration used to be spelled `oaddresse`, which left the
+         # name actually assigned below undeclared.
+         cdef object oaddress,olength
+         assert hasattr(seq, "quality"),"You must use sequence with quality indices"
+         self.sequenceChanged=True
+         self.verticalSeq=seq.complement()
+         self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
+         # The quality buffer is taken from the object passed in, not from
+         # its complement; matchScore of this class indexes it from the end
+         # (vSeq.length - v).
+         (oaddress,olength)=seq.quality.buffer_info()
+         # Cast through size_t: `unsigned long` is narrower than a pointer
+         # on LLP64 platforms and would truncate the address there.
+         # NOTE(review): vError borrows the buffer behind seq.quality and is
+         # valid only while that buffer is alive and not reallocated.
+         self.vError=<double*><size_t>oaddress
diff --git a/src/obitools/align/_qsrassemble.pyx b/src/obitools/align/_qsrassemble.pyx
new file mode 100644
index 0000000..01f6ddb
--- /dev/null
+++ b/src/obitools/align/_qsrassemble.pyx
@@ -0,0 +1,88 @@
+#@PydevCodeAnalysisIgnore
+'''
+Created on 6 Nov. 2009
+
+@author: coissac
+'''
+
+from _dynamic cimport *
+from _rassemble cimport RightDirectAssemble
+
+cdef class QSolexaRightDirectAssemble(RightDirectAssemble):
+
+ cdef double* hError
+ cdef double* vError
+
+ def __init__(self,match=4,mismatch=-4,opengap=-8,extgap=-2):
+ """
+ Relationship between the match and mismatch scores:
+ if mismatch = - match / 3
+ then, when the score tends towards 0 and no decision is possible,
+ there is no penalty (s'=0)
+ if mismatch < - match / 3 the undecidability is penalised.
+ """
+ RightDirectAssemble.__init__(self,match,mismatch,opengap,extgap)
+
+ cdef double matchScore(self,int h, int v):
+ cdef double score
+ cdef double smatch
+ cdef double smismatch
+ cdef double hok=1-self.hError[h-1]
+ cdef double vok=1-self.vError[v-1]
+ score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
+ smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3
+ smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9
+ return smatch * score + smismatch * (1. - score)
+
+ property seqA:
+     # Horizontal sequence of the alignment. The assigned object must have
+     # a `quality` attribute; the address reported by its buffer_info() is
+     # kept in hError and read as doubles by matchScore.
+     def __get__(self):
+         return self.horizontalSeq
+
+     def __set__(self, seq):
+         # The declaration used to be spelled `oaddresse`, which left the
+         # name actually assigned below undeclared.
+         cdef object oaddress,olength
+         assert hasattr(seq, "quality"),"You must use sequence with quality indices"
+         self.sequenceChanged=True
+         self.horizontalSeq=seq
+         self.hSeq=allocateSequence(self.horizontalSeq,self.hSeq)
+         (oaddress,olength)=seq.quality.buffer_info()
+         # Cast through size_t: `unsigned long` is narrower than a pointer
+         # on LLP64 platforms and would truncate the address there.
+         # NOTE(review): hError borrows the buffer behind seq.quality and is
+         # valid only while that buffer is alive and not reallocated.
+         self.hError=<double*><size_t>oaddress
+
+ property seqB:
+     # Vertical sequence of the alignment. The assigned object must have a
+     # `quality` attribute; the address reported by its buffer_info() is
+     # kept in vError and read as doubles by matchScore.
+     def __get__(self):
+         return self.verticalSeq
+
+     def __set__(self, seq):
+         # The declaration used to be spelled `oaddresse`, which left the
+         # name actually assigned below undeclared.
+         cdef object oaddress,olength
+         assert hasattr(seq, "quality"),"You must use sequence with quality indices"
+         self.sequenceChanged=True
+         self.verticalSeq=seq
+         self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
+         (oaddress,olength)=seq.quality.buffer_info()
+         # Cast through size_t: `unsigned long` is narrower than a pointer
+         # on LLP64 platforms and would truncate the address there.
+         # NOTE(review): vError borrows the buffer behind seq.quality and is
+         # valid only while that buffer is alive and not reallocated.
+         self.vError=<double*><size_t>oaddress
+
+
+cdef class QSolexaRightReverseAssemble(QSolexaRightDirectAssemble):
+
+ cdef double matchScore(self,int h, int v):
+ cdef double score
+ cdef double smatch
+ cdef double smismatch
+ cdef double hok=1-self.hError[h-1]
+ cdef double vok=1-self.vError[self.vSeq.length - v]
+ score=iupacPartialMatch(self.hSeq.sequence[h-1],self.vSeq.sequence[v-1])
+ smatch=((4*hok*vok-hok-vok)*(self._match-self._mismatch)+self._match+2*self._mismatch)/3
+ smismatch=((hok+vok-4*hok*vok)*(self._match-self._mismatch)+2*self._match+7*self._mismatch)/9
+ return smatch * score + smismatch * (1. - score)
+
+ property seqB:
+     # Vertical sequence, stored as the complement of the assigned object;
+     # the getter returns the `wrapped` attribute of that complement.
+     def __get__(self):
+         return self.verticalSeq.wrapped
+
+     def __set__(self, seq):
+         # The declaration used to be spelled `oaddresse`, which left the
+         # name actually assigned below undeclared.
+         cdef object oaddress,olength
+         assert hasattr(seq, "quality"),"You must use sequence with quality indices"
+         self.sequenceChanged=True
+         self.verticalSeq=seq.complement()
+         self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
+         # The quality buffer is taken from the object passed in, not from
+         # its complement; matchScore of this class indexes it from the end
+         # (vSeq.length - v).
+         (oaddress,olength)=seq.quality.buffer_info()
+         # Cast through size_t: `unsigned long` is narrower than a pointer
+         # on LLP64 platforms and would truncate the address there.
+         # NOTE(review): vError borrows the buffer behind seq.quality and is
+         # valid only while that buffer is alive and not reallocated.
+         self.vError=<double*><size_t>oaddress
diff --git a/src/obitools/align/_rassemble.pxd b/src/obitools/align/_rassemble.pxd
new file mode 100644
index 0000000..2b32986
--- /dev/null
+++ b/src/obitools/align/_rassemble.pxd
@@ -0,0 +1,10 @@
+from _nws cimport *
+
+cdef class RightDirectAssemble(NWS):
+ cdef double xsmax
+ cdef int xmax
+
+ cdef double doAlignment(self) except? 0
+
+cdef class RightReverseAssemble(RightDirectAssemble):
+ pass
\ No newline at end of file
diff --git a/src/obitools/align/_rassemble.pyx b/src/obitools/align/_rassemble.pyx
new file mode 100644
index 0000000..544ac77
--- /dev/null
+++ b/src/obitools/align/_rassemble.pyx
@@ -0,0 +1,157 @@
+#@PydevCodeAnalysisIgnore
+'''
+Created on 6 Nov. 2009
+
+@author: coissac
+'''
+
+from _rassemble cimport *
+
+
+cdef class RightDirectAssemble(NWS):
+
+ def __init__(self,match=4,mismatch=-6,opengap=-8,extgap=-2):
+ NWS.__init__(self,match,mismatch,opengap,extgap)
+ self.xsmax=0
+ self.xmax=0
+
+ cdef double doAlignment(self) except? 0:
+ cdef int i # vertical index
+ cdef int j # horizontal index
+ cdef int idx
+ cdef int jump
+ cdef int delta
+ cdef double score
+ cdef double scoremax
+ cdef int path
+
+
+ if self.needToCompute:
+ self.allocate()
+ self.reset()
+ self.xsmax=0
+ self.xmax=0
+
+ for j in range(1,self.hSeq.length+1):
+ idx = self.index(j,0)
+ self.matrix.matrix[idx].score = self._opengap + (self._extgap * (j-1))
+ self.matrix.matrix[idx].path = j
+
+ for i in range(1,self.vSeq.length+1):
+ idx = self.index(0,i)
+ self.matrix.matrix[idx].score = 0
+ self.matrix.matrix[idx].path = -i
+
+ for i in range(1,self.vSeq.length+1):
+ for j in range(1,self.hSeq.length+1):
+
+ # 1 - came from diagonal
+ idx = self.index(j-1,i-1)
+ # print "computing cell : %d,%d --> %d/%d" % (j,i,self.index(j,i),self.matrix.msize),
+ scoremax = self.matrix.matrix[idx].score + \
+ self.matchScore(j,i)
+ path = 0
+
+ # print "so=%f sd=%f sm=%f" % (self.matrix.matrix[idx].score,self.matchScore(j,i),scoremax),
+
+ # 2 - open horizontal gap
+ idx = self.index(j-1,i)
+ score = self.matrix.matrix[idx].score+ \
+ self._opengap
+ if score > scoremax :
+ scoremax = score
+ path = +1
+
+ # 3 - open vertical gap
+ idx = self.index(j,i-1)
+ score = self.matrix.matrix[idx].score + \
+ self._opengap
+ if score > scoremax :
+ scoremax = score
+ path = -1
+
+ # 4 - extend horizontal gap
+ jump = self.matrix.bestHJump[i]
+ if jump >= 0:
+ idx = self.index(jump,i)
+ delta = j-jump
+ score = self.matrix.matrix[idx].score + \
+ self._extgap * delta
+ if score > scoremax :
+ scoremax = score
+ path = delta+1
+
+ # 5 - extend vertical gap
+ jump = self.matrix.bestVJump[j]
+ if jump >= 0:
+ idx = self.index(j,jump)
+ delta = i-jump
+ score = self.matrix.matrix[idx].score + \
+ self._extgap * delta
+ if score > scoremax :
+ scoremax = score
+ path = -delta-1
+
+ idx = self.index(j,i)
+ self.matrix.matrix[idx].score = scoremax
+ self.matrix.matrix[idx].path = path
+
+ if path == -1:
+ self.matrix.bestVJump[j]=i
+ elif path == +1 :
+ self.matrix.bestHJump[i]=j
+
+ if i==self.vSeq.length and scoremax > self.xsmax:
+ self.xsmax=scoremax
+ self.xmax=j
+
+ self.sequenceChanged=False
+ self.scoreChanged=False
+
+ return self.xsmax
+
+ cdef void backtrack(self):
+ cdef list path=[]
+ cdef int i
+ cdef int j
+ cdef int p
+
+ self.doAlignment()
+ j=self.xmax
+ i=self.vSeq.length
+ self.path=allocatePath(i,j+1,self.path)
+
+ if self.xmax<self.hSeq.length:
+ self.path.path[self.path.length]=self.hSeq.length-self.xmax
+ self.path.length+=1
+
+ while (i or j):
+ p=self.matrix.matrix[self.index(j,i)].path
+ self.path.path[self.path.length]=p
+ self.path.length+=1
+ #path.append(p)
+ if p==0:
+ i-=1
+ j-=1
+ elif p < 0:
+ i+=p
+ else:
+ j-=p
+
+ #path.reverse()
+ self.path.hStart=0
+ self.path.vStart=0
+ #reversePath(self.path)
+ #return 0,0,path
+
+
+cdef class RightReverseAssemble(RightDirectAssemble):
+
+ property seqB:
+ def __get__(self):
+ return self.verticalSeq.wrapped
+
+ def __set__(self, seq):
+ self.sequenceChanged=True
+ self.verticalSeq=seq.complement()
+ self.vSeq=allocateSequence(self.verticalSeq,self.vSeq)
diff --git a/src/obitools/align/_sse.h b/src/obitools/align/_sse.h
new file mode 100644
index 0000000..bf1bd64
--- /dev/null
+++ b/src/obitools/align/_sse.h
@@ -0,0 +1,929 @@
+#include <string.h>
+
+#include <inttypes.h>
+#ifdef __SSE2__
+#include <xmmintrin.h>
+#else
+typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+#endif /* __SSE2__ */
+
+#ifndef MAX
+#define MAX(x,y) (((x)>(y)) ? (x):(y))
+#define MIN(x,y) (((x)<(y)) ? (x):(y))
+#endif
+
+#define ALIGN __attribute__((aligned(16)))
+typedef __m128i vUInt8;
+typedef __m128i vInt8;
+
+typedef __m128i vUInt16;
+typedef __m128i vInt16;
+
+typedef __m128i vUInt64;
+
+typedef union
+{
+ __m128i i;
+ int64_t s64[ 2];
+ int16_t s16[ 8];
+ int8_t s8 [16];
+ uint8_t u8 [16];
+ uint16_t u16[8 ];
+ uint32_t u32[4 ];
+ uint64_t u64[2 ];
+} um128;
+
+typedef union
+ {
+ vUInt8 m;
+ uint8_t c[16];
+ } uchar_v;
+
+typedef union
+ {
+ vUInt16 m;
+ uint16_t c[8];
+ } ushort_v;
+
+typedef union
+ {
+ vUInt64 m;
+ uint64_t c[2];
+ } uint64_v;
+
+
+#ifdef __SSE2__
+
+static inline int8_t _s2_extract_epi8(__m128i r, const int p)
+{
+#define ACTIONP(r,x) return _mm_extract_epi16(r,x) & 0xFF
+#define ACTIONI(r,x) return _mm_extract_epi16(r,x) >> 8
+ switch (p) {
+ case 0: ACTIONP(r,0);
+ case 1: ACTIONI(r,0);
+ case 2: ACTIONP(r,1);
+ case 3: ACTIONI(r,1);
+ case 4: ACTIONP(r,2);
+ case 5: ACTIONI(r,2);
+ case 6: ACTIONP(r,3);
+ case 7: ACTIONI(r,3);
+ case 8: ACTIONP(r,4);
+ case 9: ACTIONI(r,4);
+ case 10: ACTIONP(r,5);
+ case 11: ACTIONI(r,5);
+ case 12: ACTIONP(r,6);
+ case 13: ACTIONI(r,6);
+ case 14: ACTIONP(r,7);
+ case 15: ACTIONI(r,7);
+ }
+#undef ACTIONP
+#undef ACTIONI
+
+ return 0;
+}
+
+static inline __m128i _s2_max_epi8(__m128i a, __m128i b)
+{
+ __m128i mask = _mm_cmpgt_epi8( a, b );
+ a = _mm_and_si128 (a,mask );
+ b = _mm_andnot_si128(mask,b);
+ return _mm_or_si128(a,b);
+}
+
+static inline __m128i _s2_min_epi8(__m128i a, __m128i b)
+{
+ __m128i mask = _mm_cmplt_epi8( a, b );
+ a = _mm_and_si128 (a,mask );
+ b = _mm_andnot_si128(mask,b);
+ return _mm_or_si128(a,b);
+}
+
+static inline __m128i _s2_insert_epi8(__m128i r, int b, const int p)
+{
+#define ACTIONP(r,x) return _mm_insert_epi16(r,(_mm_extract_epi16(r,x) & 0xFF00) | (b & 0x00FF),x)
+#define ACTIONI(r,x) return _mm_insert_epi16(r,(_mm_extract_epi16(r,x) & 0x00FF) | ((b << 8)& 0xFF00),x)
+ switch (p) {
+ case 0: ACTIONP(r,0);
+ case 1: ACTIONI(r,0);
+ case 2: ACTIONP(r,1);
+ case 3: ACTIONI(r,1);
+ case 4: ACTIONP(r,2);
+ case 5: ACTIONI(r,2);
+ case 6: ACTIONP(r,3);
+ case 7: ACTIONI(r,3);
+ case 8: ACTIONP(r,4);
+ case 9: ACTIONI(r,4);
+ case 10: ACTIONP(r,5);
+ case 11: ACTIONI(r,5);
+ case 12: ACTIONP(r,6);
+ case 13: ACTIONI(r,6);
+ case 14: ACTIONP(r,7);
+ case 15: ACTIONI(r,7);
+ }
+#undef ACTIONP
+#undef ACTIONI
+
+ return _mm_setzero_si128();
+}
+
+// Fill a SSE Register with 16 time the same 8bits integer value
+#define _MM_SET1_EPI8(x) _mm_set1_epi8(x)
+#define _MM_INSERT_EPI8(r,x,i) _s2_insert_epi8((r),(x),(i))
+#define _MM_CMPEQ_EPI8(x,y) _mm_cmpeq_epi8((x),(y))
+#define _MM_CMPGT_EPI8(x,y) _mm_cmpgt_epi8((x),(y))
+#define _MM_CMPLT_EPI8(x,y) _mm_cmplt_epi8((x),(y))
+#define _MM_MAX_EPI8(x,y) _s2_max_epi8((x),(y))
+#define _MM_MIN_EPI8(x,y) _s2_min_epi8((x),(y))
+#define _MM_ADD_EPI8(x,y) _mm_add_epi8((x),(y))
+#define _MM_SUB_EPI8(x,y) _mm_sub_epi8((x),(y))
+#define _MM_EXTRACT_EPI8(r,p) _s2_extract_epi8((r),(p))
+
+#define _MM_MIN_EPU8(x,y) _mm_min_epu8((x),(y))
+
+// Fill a SSE Register with 8 time the same 16bits integer value
+#define _MM_SET1_EPI16(x) _mm_set1_epi16(x)
+#define _MM_INSERT_EPI16(r,x,i) _mm_insert_epi16((r),(x),(i))
+#define _MM_CMPEQ_EPI16(x,y) _mm_cmpeq_epi16((x),(y))
+#define _MM_CMPGT_EPI16(x,y) _mm_cmpgt_epi16((x),(y))
+#define _MM_CMPLT_EPI16(x,y) _mm_cmplt_epi16((x),(y))
+#define _MM_MAX_EPI16(x,y) _mm_max_epi16((x),(y))
+#define _MM_MIN_EPI16(x,y) _mm_min_epi16((x),(y))
+#define _MM_ADD_EPI16(x,y) _mm_add_epi16((x),(y))
+#define _MM_SUB_EPI16(x,y) _mm_sub_epi16((x),(y))
+#define _MM_EXTRACT_EPI16(r,p) _mm_extract_epi16((r),(p))
+#define _MM_UNPACKLO_EPI8(a,b) _mm_unpacklo_epi8((a),(b))
+#define _MM_UNPACKHI_EPI8(a,b) _mm_unpackhi_epi8((a),(b))
+#define _MM_ADDS_EPU16(x,y) _mm_adds_epu16((x),(y))
+
+
+#define _MM_SRLI_EPI64(r,x) _mm_srli_epi64((r),(x))
+#define _MM_SLLI_EPI64(r,x) _mm_slli_epi64((r),(x))
+
+// Set a SSE Register to 0
+#define _MM_SETZERO_SI128() _mm_setzero_si128()
+
+#define _MM_AND_SI128(x,y) _mm_and_si128((x),(y))
+#define _MM_ANDNOT_SI128(x,y) _mm_andnot_si128((x),(y))
+#define _MM_OR_SI128(x,y) _mm_or_si128((x),(y))
+#define _MM_XOR_SI128(x,y) _mm_xor_si128((x),(y))
+#define _MM_SLLI_SI128(r,s) _mm_slli_si128((r),(s))
+#define _MM_SRLI_SI128(r,s) _mm_srli_si128((r),(s))
+
+// Load a SSE register from an unaligned address
+#define _MM_LOADU_SI128(x) _mm_loadu_si128(x)
+
+// #define _MM_UNPACKLO_EPI8(x,y) _mm_unpacklo_epi8((x),(y))
+
+#else /* __SSE2__ Not defined */
+
+static inline __m128i _em_set1_epi8(int x)
+{
+ um128 a;
+
+ x&=0xFF;
+ a.s8[0]=x;
+ a.s8[1]=x;
+ a.u16[1]=a.u16[0];
+ a.u32[1]=a.u32[0];
+ a.u64[1]=a.u64[0];
+
+ return a.i;
+}
+
+static inline __m128i _em_insert_epi8(__m128i r, int x, const int i)
+{
+ um128 a;
+ a.i=r;
+ a.s8[i]=x & 0xFF;
+ return a.i;
+}
+
+static inline __m128i _em_cmpeq_epi8(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.s8[z]=(x.s8[z]==y.s8[z]) ? 0xFF:0
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+ R(8);
+ R(9);
+ R(10);
+ R(11);
+ R(12);
+ R(13);
+ R(14);
+ R(15);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_cmpgt_epi8(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.s8[z]=(x.s8[z]>y.s8[z]) ? 0xFF:0
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+ R(8);
+ R(9);
+ R(10);
+ R(11);
+ R(12);
+ R(13);
+ R(14);
+ R(15);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_cmplt_epi8(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.s8[z]=(x.s8[z]<y.s8[z]) ? 0xFF:0
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+ R(8);
+ R(9);
+ R(10);
+ R(11);
+ R(12);
+ R(13);
+ R(14);
+ R(15);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_max_epi8(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.s8[z]=MAX(x.s8[z],y.s8[z])
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+ R(8);
+ R(9);
+ R(10);
+ R(11);
+ R(12);
+ R(13);
+ R(14);
+ R(15);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_min_epi8(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.s8[z]=MIN(x.s8[z],y.s8[z])
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+ R(8);
+ R(9);
+ R(10);
+ R(11);
+ R(12);
+ R(13);
+ R(14);
+ R(15);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_add_epi8(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.s8[z]=x.s8[z]+y.s8[z]
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+ R(8);
+ R(9);
+ R(10);
+ R(11);
+ R(12);
+ R(13);
+ R(14);
+ R(15);
+#undef R
+
+ return r.i;
+}
+
+/*
+ * Portable replacement for _mm_sub_epi8: subtracts b from a independently
+ * in each of the sixteen 8-bit lanes and returns the packed differences.
+ */
+static inline __m128i _em_sub_epi8(__m128i a, __m128i b)
+{
+    um128 x;
+    um128 y;
+    um128 r;
+    int   z;
+
+    x.i=a;
+    y.i=b;
+
+    /*
+     * The previous body was a copy of _em_add_epi8 and summed the lanes.
+     * Working on the unsigned view keeps the wrap-around well defined; the
+     * resulting bit pattern is the same as for a signed subtraction.
+     */
+    for (z=0; z<16; z++)
+        r.u8[z]=(uint8_t)(x.u8[z]-y.u8[z]);
+
+    return r.i;
+}
+
+
+static inline int _em_extract_epi8(__m128i r, const int i)
+{
+ um128 a;
+
+ a.i=r;
+
+ return a.s8[i] & 0xFF;
+}
+
+static inline __m128i _em_min_epu8(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.u8[z]=MIN(x.u8[z],y.u8[z])
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+ R(8);
+ R(9);
+ R(10);
+ R(11);
+ R(12);
+ R(13);
+ R(14);
+ R(15);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_set1_epi16(int x)
+{
+ um128 a;
+
+ x&=0xFFFF;
+ a.s16[0]=x;
+ a.s16[1]=x;
+ a.u32[1]=a.u32[0];
+ a.u64[1]=a.u64[0];
+
+ return a.i;
+}
+
+static inline __m128i _em_insert_epi16(__m128i r, int x, const int i)
+{
+ um128 a;
+ a.i=r;
+ a.s16[i]=x & 0xFFFF;
+ return a.i;
+}
+
+static inline __m128i _em_cmpeq_epi16(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.s16[z]=(x.s16[z]==y.s16[z]) ? 0xFFFF:0
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_cmpgt_epi16(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.s16[z]=(x.s16[z]>y.s16[z]) ? 0xFFFF:0
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_cmplt_epi16(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.s16[z]=(x.s16[z]<y.s16[z]) ? 0xFFFF:0
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_max_epi16(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+
+#define R(z) r.s16[z]=MAX(x.s16[z],y.s16[z])
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_min_epi16(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+
+#define R(z) r.s16[z]=MIN(x.s16[z],y.s16[z])
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_add_epi16(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.s16[z]=x.s16[z]+y.s16[z]
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+#undef R
+
+ return r.i;
+}
+
+/*
+ * Portable replacement for _mm_sub_epi16: subtracts b from a independently
+ * in each of the eight 16-bit lanes and returns the packed differences.
+ */
+static inline __m128i _em_sub_epi16(__m128i a, __m128i b)
+{
+    um128 x;
+    um128 y;
+    um128 r;
+    int   z;
+
+    x.i=a;
+    y.i=b;
+
+    /*
+     * The previous body was a copy of _em_add_epi16 and summed the lanes.
+     * Working on the unsigned view keeps the wrap-around well defined; the
+     * resulting bit pattern is the same as for a signed subtraction.
+     */
+    for (z=0; z<8; z++)
+        r.u16[z]=(uint16_t)(x.u16[z]-y.u16[z]);
+
+    return r.i;
+}
+
+static inline int _em_extract_epi16(__m128i r, const int i)
+{
+ um128 a;
+ a.i=r;
+ return a.s16[i] & 0xFFFF;
+}
+
+/*
+ * Portable replacement for _mm_unpacklo_epi8: interleaves the eight low
+ * bytes of a and b. 16-bit lane z of the result holds byte z of a in its
+ * low half and byte z of b in its high half.
+ */
+static inline __m128i _em_unpacklo_epi8(__m128i a, __m128i b)
+{
+    um128 x;
+    um128 y;
+    um128 r;
+    int   z;
+
+    x.i=a;
+    y.i=b;
+
+    /*
+     * Combine the unsigned bytes. The previous version widened the signed
+     * bytes, so a negative byte of a was sign-extended to 0xFFxx and its
+     * OR overwrote the half reserved for the byte of b.
+     */
+    for (z=0; z<8; z++)
+        r.u16[z]=(uint16_t)(((unsigned int)y.u8[z] << 8) | x.u8[z]);
+
+    return r.i;
+}
+
+/*
+ * Portable replacement for _mm_unpackhi_epi8: interleaves the eight high
+ * bytes of a and b. 16-bit lane z of the result holds byte z+8 of a in its
+ * low half and byte z+8 of b in its high half.
+ */
+static inline __m128i _em_unpackhi_epi8(__m128i a, __m128i b)
+{
+    um128 x;
+    um128 y;
+    um128 r;
+    int   z;
+
+    x.i=a;
+    y.i=b;
+
+    /*
+     * Combine the unsigned bytes. The previous version widened the signed
+     * bytes, so a negative byte of a was sign-extended to 0xFFxx and its
+     * OR overwrote the half reserved for the byte of b.
+     */
+    for (z=0; z<8; z++)
+        r.u16[z]=(uint16_t)(((unsigned int)y.u8[z+8] << 8) | x.u8[z+8]);
+
+    return r.i;
+}
+
+/*
+ * Portable replacement for _mm_adds_epu16: unsigned saturating addition of
+ * the eight 16-bit lanes. A lane whose sum exceeds 0xFFFF is clamped to
+ * 0xFFFF instead of wrapping around.
+ */
+static inline __m128i _em_adds_epu16(__m128i a, __m128i b)
+{
+    um128    x;
+    um128    y;
+    um128    r;
+    uint32_t sum;
+    int      z;
+
+    x.i=a;
+    y.i=b;
+
+    /*
+     * The previous version stored the plain 16-bit sum, which wraps on
+     * overflow; compute in 32 bits and clamp to get the saturation.
+     */
+    for (z=0; z<8; z++) {
+        sum=(uint32_t)x.u16[z]+(uint32_t)y.u16[z];
+        r.u16[z]=(uint16_t)((sum>0xFFFFu) ? 0xFFFFu : sum);
+    }
+
+    return r.i;
+}
+
+/*
+ * Portable replacement for _mm_srli_epi64: logical right shift of each of
+ * the two 64-bit lanes by b bits, shifting in zeros.
+ */
+static inline __m128i _em_srli_epi64(__m128i a, int b)
+{
+    um128 x;
+
+    x.i=a;
+
+    /*
+     * A count outside 0..63 is undefined for the C shift operators; the
+     * intrinsic returns zero for counts above 63, so do the same here.
+     */
+    if (b<0 || b>63) {
+        x.u64[0]=x.u64[1]=0;
+        return x.i;
+    }
+
+    /*
+     * Shift the unsigned view: >> on the int64_t members is an arithmetic
+     * shift with the usual compilers and copies the sign bit into the
+     * vacated positions instead of zeros.
+     */
+    x.u64[0]>>=b;
+    x.u64[1]>>=b;
+
+    return x.i;
+}
+
+/*
+ * Portable replacement for _mm_slli_epi64: left shift of each of the two
+ * 64-bit lanes by b bits, shifting in zeros.
+ */
+static inline __m128i _em_slli_epi64(__m128i a, int b)
+{
+    um128 x;
+
+    x.i=a;
+
+    /*
+     * A count outside 0..63 is undefined for the C shift operators; the
+     * intrinsic returns zero for counts above 63, so do the same here.
+     */
+    if (b<0 || b>63) {
+        x.u64[0]=x.u64[1]=0;
+        return x.i;
+    }
+
+    /*
+     * Shift the unsigned view: << on a negative int64_t, or one that
+     * overflows, is undefined behaviour in C.
+     */
+    x.u64[0]<<=b;
+    x.u64[1]<<=b;
+
+    return x.i;
+}
+
+static inline __m128i _em_setzero_si128()
+{
+ um128 x;
+
+ x.s64[0]=x.s64[1]=0;
+
+ return x.i;
+}
+
+static inline __m128i _em_and_si128(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+
+#define R(z) r.u64[z]=x.u64[z] & y.u64[z]
+ R(0);
+ R(1);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_andnot_si128(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+
+#define R(z) r.u64[z]=(~x.u64[z]) & y.u64[z]
+ R(0);
+ R(1);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_or_si128(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.u64[z]=x.u64[z] | y.u64[z]
+ R(0);
+ R(1);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_xor_si128(__m128i a, __m128i b)
+{
+ um128 x;
+ um128 y;
+ um128 r;
+
+ x.i=a;
+ y.i=b;
+
+#define R(z) r.u64[z]=x.u64[z] ^ y.u64[z]
+ R(0);
+ R(1);
+#undef R
+
+ return r.i;
+}
+
+static inline __m128i _em_slli_si128(__m128i a, int b)
+{
+ um128 x;
+
+ x.i=a;
+
+#define R(z) x.u8[z]=(z>=b) ? x.u8[z-b]:0
+ R(15);
+ R(14);
+ R(13);
+ R(12);
+ R(11);
+ R(10);
+ R(9);
+ R(8);
+ R(7);
+ R(6);
+ R(5);
+ R(4);
+ R(3);
+ R(2);
+ R(1);
+ R(0);
+#undef R
+
+ return x.i;
+}
+
+static inline __m128i _em_srli_si128(__m128i a, int b)
+{
+ um128 x;
+
+ x.i=a;
+
+#define R(z) x.u8[z]=((b+z) > 15) ? 0:x.u8[z+b]
+ R(0);
+ R(1);
+ R(2);
+ R(3);
+ R(4);
+ R(5);
+ R(6);
+ R(7);
+ R(8);
+ R(9);
+ R(10);
+ R(11);
+ R(12);
+ R(13);
+ R(14);
+ R(15);
+#undef R
+
+ return x.i;
+}
+
+inline static __m128i _em_loadu_si128(__m128i const *P)
+{
+ __m128i tmp;
+ memcpy(&tmp,(const char *)P,16);
+ return tmp;
+}
+
+
+#define _MM_SET1_EPI8(x) _em_set1_epi8(x)
+#define _MM_INSERT_EPI8(r,x,i) _em_insert_epi8((r),(x),(i))
+#define _MM_CMPEQ_EPI8(x,y) _em_cmpeq_epi8((x),(y))
+#define _MM_CMPGT_EPI8(x,y) _em_cmpgt_epi8((x),(y))
+#define _MM_CMPLT_EPI8(x,y) _em_cmplt_epi8((x),(y))
+#define _MM_MAX_EPI8(x,y) _em_max_epi8((x),(y))
+#define _MM_MIN_EPI8(x,y) _em_min_epi8((x),(y))
+#define _MM_ADD_EPI8(x,y) _em_add_epi8((x),(y))
+#define _MM_SUB_EPI8(x,y) _em_sub_epi8((x),(y))
+#define _MM_EXTRACT_EPI8(r,p) _em_extract_epi8((r),(p))
+
+#define _MM_MIN_EPU8(x,y) _em_min_epu8((x),(y))
+
+#define _MM_SET1_EPI16(x) _em_set1_epi16(x)
+#define _MM_INSERT_EPI16(r,x,i) _em_insert_epi16((r),(x),(i))
+#define _MM_CMPEQ_EPI16(x,y) _em_cmpeq_epi16((x),(y))
+#define _MM_CMPGT_EPI16(x,y) _em_cmpgt_epi16((x),(y))
+#define _MM_CMPLT_EPI16(x,y) _em_cmplt_epi16((x),(y))
+#define _MM_MAX_EPI16(x,y) _em_max_epi16((x),(y))
+#define _MM_MIN_EPI16(x,y) _em_min_epi16((x),(y))
+#define _MM_ADD_EPI16(x,y) _em_add_epi16((x),(y))
+#define _MM_SUB_EPI16(x,y) _em_sub_epi16((x),(y))
+#define _MM_EXTRACT_EPI16(r,p) _em_extract_epi16((r),(p))
+#define _MM_UNPACKLO_EPI8(a,b) _em_unpacklo_epi8((a),(b))
+#define _MM_UNPACKHI_EPI8(a,b) _em_unpackhi_epi8((a),(b))
+#define _MM_ADDS_EPU16(x,y) _em_adds_epu16((x),(y))
+
+#define _MM_SRLI_EPI64(r,x) _em_srli_epi64((r),(x))
+#define _MM_SLLI_EPI64(r,x) _em_slli_epi64((r),(x))
+
+#define _MM_SETZERO_SI128() _em_setzero_si128()
+
+#define _MM_AND_SI128(x,y) _em_and_si128((x),(y))
+#define _MM_ANDNOT_SI128(x,y) _em_andnot_si128((x),(y))
+#define _MM_OR_SI128(x,y) _em_or_si128((x),(y))
+/* The SSE2 branch of this header calls this wrapper _MM_XOR_SI128; it was
+ * misspelled _MM_WOR_SI128 here, leaving _MM_XOR_SI128 undefined in builds
+ * without SSE2. */
+#define _MM_XOR_SI128(x,y) _em_xor_si128((x),(y))
+/* Former spelling, kept so that any existing user still compiles. */
+#define _MM_WOR_SI128(x,y) _MM_XOR_SI128((x),(y))
+#define _MM_SLLI_SI128(r,s) _em_slli_si128((r),(s))
+#define _MM_SRLI_SI128(r,s) _em_srli_si128((r),(s))
+
+#define _MM_LOADU_SI128(x) _em_loadu_si128(x)
+
+
+#endif /* __SSE2__ */
diff --git a/src/obitools/align/_upperbond.cfiles b/src/obitools/align/_upperbond.cfiles
new file mode 100644
index 0000000..51cb8ca
--- /dev/null
+++ b/src/obitools/align/_upperbond.cfiles
@@ -0,0 +1 @@
+_sse.h
diff --git a/src/obitools/align/_upperbond.ext.1.c b/src/obitools/align/_upperbond.ext.1.c
new file mode 100644
index 0000000..ed2d060
--- /dev/null
+++ b/src/obitools/align/_upperbond.ext.1.c
@@ -0,0 +1,225 @@
+#include "_sse.h"
+#include <stdio.h>
+#include <math.h>
+
+
+/*
+ * Pack the 2-bit codes of four neighbouring sequence bytes into one byte,
+ * for every byte position of frag at once.
+ *
+ * Each input byte is first reduced to (byte >> 1) & 0x03.  Three rounds of
+ * "shift the accumulated word left by 2 bits, then OR in the codes moved
+ * down by one byte" assemble the 8-bit word.
+ * NOTE(review): assumes _MM_SRLI_SI128 shifts by whole bytes, like the SSE2
+ * intrinsic of the same name -- confirm in _sse.h.
+ */
+inline static uchar_v hash4m128(uchar_v frag)
+{
+ uchar_v words;
+
+ vUInt8 mask_03= _MM_SET1_EPI8(0x03); // load the register with 16 copies of the same byte
+ vUInt8 mask_FC= _MM_SET1_EPI8(0xFC);
+
+ frag.m = _MM_SRLI_EPI64(frag.m,1); // logical right shift applied to the 2 x 64-bit lanes
+ frag.m = _MM_AND_SI128(frag.m,mask_03); // bitwise AND over the 128 bits
+
+
+ words.m= _MM_SLLI_EPI64(frag.m,2);
+ words.m= _MM_AND_SI128(words.m,mask_FC);
+ frag.m = _MM_SRLI_SI128(frag.m,1);
+ words.m= _MM_OR_SI128(words.m,frag.m);
+
+ words.m= _MM_SLLI_EPI64(words.m,2);
+ words.m= _MM_AND_SI128(words.m,mask_FC);
+ frag.m = _MM_SRLI_SI128(frag.m,1);
+ words.m= _MM_OR_SI128(words.m,frag.m);
+
+ words.m= _MM_SLLI_EPI64(words.m,2);
+ words.m= _MM_AND_SI128(words.m,mask_FC);
+ frag.m = _MM_SRLI_SI128(frag.m,1);
+ words.m= _MM_OR_SI128(words.m,frag.m);
+
+ return words;
+}
+
+/*
+ * Return non-zero when at least one byte of data is equal to zero.
+ * The bytes are compared with an all-zero register and the comparison mask
+ * is then inspected through the two elements tmp.c[0] and tmp.c[1]
+ * (presumably the two 64-bit halves of the register -- see uint64_v in _sse.h).
+ */
+inline static int anyzerom128(vUInt8 data)
+{
+ vUInt8 mask_00= _MM_SETZERO_SI128();
+ uint64_v tmp;
+ tmp.m = _MM_CMPEQ_EPI8(data,mask_00);
+ return (int)(tmp.c[0]!=0 || tmp.c[1]!=0);
+}
+
+/*
+ * Copy the 16 bytes of data to the memory pointed to by table.
+ * NOTE(review): not called anywhere in this file; memcpy needs <string.h>,
+ * which is not included here directly (maybe through _sse.h -- confirm).
+ */
+inline static void dumpm128(unsigned short *table,vUInt8 data)
+{
+ memcpy(table,&data,16);
+}
+
+/*
+ * Count the 4-letter words of a NUL terminated sequence.
+ *
+ *   sequence : NUL terminated ASCII sequence
+ *   table    : 256 counters (one per 8-bit word code), reset to 0 here;
+ *              every counter saturates at 255
+ *   count    : if not NULL, receives the number of words processed
+ *
+ * Returns the number of increments dropped because the targeted counter
+ * had already reached 255.
+ *
+ * NOTE(review): the sequence is read 16 bytes at a time with unaligned loads,
+ * so bytes located after the terminating NUL may be read -- confirm that the
+ * callers' buffers tolerate this.
+ */
+int buildTable(const char* sequence, unsigned char *table, int *count)
+{
+ int overflow = 0;
+ int wc=0;
+ int i;
+ vUInt8 mask_00= _MM_SETZERO_SI128();
+
+ uchar_v frag;
+ uchar_v words;
+ uchar_v zero;
+
+ char* s;
+
+ s=(char*)sequence;
+
+ memset(table,0,256*sizeof(unsigned char));
+
+ // encode ascii sequence with A : 00 C : 01 T: 10 G : 11
+
+ // Main loop: runs while the 16 loaded bytes contain no NUL.
+ // 12 words are counted per iteration and the window advances by 12 bytes.
+ for(frag.m=_MM_LOADU_SI128((vUInt8*)s);
+ ! anyzerom128(frag.m);
+ s+=12,frag.m=_MM_LOADU_SI128((vUInt8*)s))
+ {
+ words= hash4m128(frag);
+
+ // printf("%d %d %d %d\n",words.c[0],words.c[1],words.c[2],words.c[3]);
+
+ if (table[words.c[0]]<255) table[words.c[0]]++; else overflow++;
+ if (table[words.c[1]]<255) table[words.c[1]]++; else overflow++;
+ if (table[words.c[2]]<255) table[words.c[2]]++; else overflow++;
+ if (table[words.c[3]]<255) table[words.c[3]]++; else overflow++;
+ if (table[words.c[4]]<255) table[words.c[4]]++; else overflow++;
+ if (table[words.c[5]]<255) table[words.c[5]]++; else overflow++;
+ if (table[words.c[6]]<255) table[words.c[6]]++; else overflow++;
+ if (table[words.c[7]]<255) table[words.c[7]]++; else overflow++;
+ if (table[words.c[8]]<255) table[words.c[8]]++; else overflow++;
+ if (table[words.c[9]]<255) table[words.c[9]]++; else overflow++;
+ if (table[words.c[10]]<255) table[words.c[10]]++; else overflow++;
+ if (table[words.c[11]]<255) table[words.c[11]]++; else overflow++;
+
+ wc+=12;
+ }
+
+ // Tail: the last loaded fragment contains the terminating NUL.
+ // zero.c[k] is non-zero exactly where byte k of the fragment is NUL.
+ zero.m=_MM_CMPEQ_EPI8(frag.m,mask_00);
+ //printf("frag=%d %d %d %d\n",frag.c[0],frag.c[1],frag.c[2],frag.c[3]);
+ //printf("zero=%d %d %d %d\n",zero.c[0],zero.c[1],zero.c[2],zero.c[3]);
+ words = hash4m128(frag);
+
+ // Word i is counted only while its 4th letter (byte i+3) is not the NUL;
+ // the loop stops there because the fragment is known to contain a NUL.
+ if (zero.c[0]+zero.c[1]+zero.c[2]+zero.c[3]==0)
+ for(i=0;zero.c[i+3]==0;i++,wc++)
+ if (table[words.c[i]]<255) table[words.c[i]]++; else overflow++;
+
+ if (count) *count=wc;
+ return overflow;
+}
+
+/*
+ * Take the byte-wise unsigned minimum of ft1 and ft2, widen it to 16-bit
+ * values by interleaving with zero bytes, and return the saturated sum of
+ * the low and high halves (8 partial sums on 16 bits).
+ */
+static inline vUInt16 partialminsum(vUInt8 ft1,vUInt8 ft2)
+{
+ vUInt8 mini;
+ vUInt16 minilo;
+ vUInt16 minihi;
+ vUInt8 mask_00= _MM_SETZERO_SI128();
+
+ mini = _MM_MIN_EPU8(ft1,ft2);
+ minilo = _MM_UNPACKLO_EPI8(mini,mask_00);
+ minihi = _MM_UNPACKHI_EPI8(mini,mask_00);
+
+ return _MM_ADDS_EPU16(minilo,minihi);
+}
+
+/*
+ * Sum, over the 256 entries of two word tables, the minimum of the two
+ * counters, then add the smaller of the two overflow counts.
+ *
+ *   t1, t2       : 256-byte word tables (read as 16 blocks of 16 bytes)
+ *   over1, over2 : overflow counts associated with each table
+ */
+int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2)
+{
+ vUInt8 ft1;
+ vUInt8 ft2;
+ vUInt8 *table1=(vUInt8*)t1;
+ vUInt8 *table2=(vUInt8*)t2;
+ ushort_v summini;
+ int i;
+ int total;
+
+ // First block initialises the accumulator
+ ft1 = _MM_LOADU_SI128(table1);
+ ft2 = _MM_LOADU_SI128(table2);
+ summini.m = partialminsum(ft1,ft2);
+ table1++;
+ table2++;
+
+
+ // Remaining 15 blocks are accumulated with saturating 16-bit additions
+ for (i=1;i<16;i++,table1++,table2++)
+ {
+ ft1 = _MM_LOADU_SI128(table1);
+ ft2 = _MM_LOADU_SI128(table2);
+ summini.m = _MM_ADDS_EPU16(summini.m,partialminsum(ft1,ft2));
+
+ }
+
+ // Finishing the sum process
+
+ summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,8)); // sum the 4 firsts with the 4 lasts
+ summini.m = _MM_ADDS_EPU16(summini.m,_MM_SRLI_SI128(summini.m,4));
+
+ // After the two folds the total is spread over the first two 16-bit lanes
+ total = summini.c[0]+summini.c[1];
+ total+= (over1 < over2) ? over1:over2;
+
+ return total;
+}
+
+/*
+ * Compute a word-count threshold from a number of words and an identity.
+ *
+ * wordcount + 3 is used as a length; error = floor(length * (1 - identity)).
+ * The (length - error) remaining positions are split into (error + 1)
+ * fragments of size lmax, plus a remainder.  Returns 0 when lmax < 4,
+ * otherwise (lmax - 3) * (error + 1) + remainder.
+ */
+int threshold4(int wordcount,double identity)
+{
+ int error;
+ int lmax;
+
+ wordcount+=3;
+ error = (int)floor((double)wordcount * ((double)1.0-identity));
+ lmax = (wordcount - error) / (error + 1);
+ if (lmax < 4)
+ return 0;
+ return (lmax - 3) \
+ * (error + 1) \
+ + ((wordcount - error) % (error + 1));
+}
+
+/*
+ * Compute a word-count threshold from a reference length and an LCS length.
+ *
+ * nbfrag = 2 * (reflen - lcs) + 1 fragments share the lcs positions:
+ * R of them have smin + 1 positions and the others smin positions.
+ * A fragment of k positions contributes max(k - 3, 0) words.
+ *
+ * NOTE(review): MAX is used here, but its fallback #define only appears
+ * after this function; this compiles as intended only if _sse.h (or another
+ * header) already defines MAX -- confirm.
+ */
+int thresholdLCS4(int32_t reflen,int32_t lcs)
+{
+ int nbfrag;
+ int smin;
+ int R;
+ int common;
+
+ nbfrag = (reflen - lcs)*2 + 1;
+ smin = lcs/nbfrag;
+ R = lcs - smin * nbfrag;
+ common = MAX(smin - 2,0) * R + MAX(smin - 3,0) * (nbfrag - R);
+ return common;
+}
+
+#ifndef MAX
+#define MAX(x,y) (((x)>(y)) ? (x):(y))
+#define MIN(x,y) (((x)<(y)) ? (x):(y))
+#endif
+
+/*
+ * Tell whether two sequences share enough 4-letter words for the requested
+ * LCS to be reachable.
+ *
+ *   len1, len2   : sequence lengths
+ *   t1, t2       : word tables of the two sequences
+ *   over1, over2 : overflow counts of the two tables
+ *   minimum      : LCS length (normalized == 0) or fraction of the
+ *                  reference length (normalized != 0)
+ *   large        : reference length is the longest (non-zero) or the
+ *                  shortest (zero) of the two lengths
+ *
+ * Returns 0 when the requested LCS is longer than the shortest sequence,
+ * otherwise the result of compareTable(...) >= thresholdLCS4(reflen,lcs).
+ */
+int ispossible(int len1, unsigned char *t1, int over1,
+ int len2, unsigned char* t2, int over2,
+ double minimum, int normalized, int large)
+{
+ int32_t reflen;
+ int32_t lcs;
+ int32_t mincount;
+
+ // reflen is computed the same way in both branches; only lcs differs
+ if (normalized)
+ {
+ if (large)
+ reflen = MAX(len1,len2);
+ else
+ reflen = MIN(len1,len2);
+
+ lcs = (int32_t)floor((double)reflen * minimum);
+ }
+ else
+ {
+ if (large)
+ reflen = MAX(len1,len2);
+ else
+ reflen = MIN(len1,len2);
+
+ lcs = (int32_t) minimum;
+ }
+
+ if (lcs > MIN(len1,len2))
+ return 0;
+
+ mincount = thresholdLCS4(reflen,lcs);
+
+// fprintf(stderr,"MaxLCS %d %d %d : %d\n",reflen,lcs,compareTable(t1,over1,t2,over2),mincount);
+
+ return compareTable(t1,over1,t2,over2) >=mincount;
+}
+
+
diff --git a/src/obitools/align/_upperbond.h b/src/obitools/align/_upperbond.h
new file mode 100644
index 0000000..873584b
--- /dev/null
+++ b/src/obitools/align/_upperbond.h
@@ -0,0 +1,7 @@
+/* Prototypes of the functions defined in _upperbond.ext.1.c.
+ * NOTE(review): int32_t is used without including <stdint.h> here. */
+int buildTable(const char *sequence, unsigned char *table, int *count);
+int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2);
+int threshold4(int wordcount,double identity);
+int thresholdLCS4(int32_t reflen,int32_t lcs);
+int ispossible(int len1, unsigned char *t1, int over1,
+ int len2, unsigned char* t2, int over2,
+ double minimum, int normalized, int large);
diff --git a/src/obitools/align/_upperbond.pxd b/src/obitools/align/_upperbond.pxd
new file mode 100644
index 0000000..3f7ec02
--- /dev/null
+++ b/src/obitools/align/_upperbond.pxd
@@ -0,0 +1,16 @@
+from cpython cimport array
+
+# Alias giving Cython a name for the C type "const char*"
+cdef extern from *:
+ ctypedef char* const_char_ptr "const char*"
+
+
+# C functions exported by _upperbond.h
+cdef import from "_upperbond.h":
+ int buildTable(const_char_ptr sequence, unsigned char *table, int *count)
+ int compareTable(unsigned char *t1, int over1, unsigned char* t2, int over2)
+ int threshold4(int wordcount,double identity)
+ int thresholdLCS4(int reflen,int lcs)
+ bint ispossible(int len1, unsigned char *t1, int over1,
+ int len2, unsigned char* t2, int over2,
+ double minimum, bint normalized, bint large)
+
+# Allocator of a 256-entry word table, implemented in _upperbond.pyx
+cdef array.array[unsigned char] newtable()
diff --git a/src/obitools/align/_upperbond.pyx b/src/obitools/align/_upperbond.pyx
new file mode 100644
index 0000000..b77358d
--- /dev/null
+++ b/src/obitools/align/_upperbond.pyx
@@ -0,0 +1,90 @@
+'''
+Created on 6 Nov. 2009
+
+@author: coissac
+'''
+#@PydevCodeAnalysisIgnore
+
+from _dynamic cimport *
+
+from obitools import BioSequence
+from _upperbond cimport *
+#from libupperbond import buildTable
+
+# Return a new unsigned-byte array resized to 256 entries.
+# Only the first entry is explicitly initialised here; buildTable() resets
+# the whole table before filling it.
+cdef array.array[unsigned char] newtable():
+ table = array.array('B',[0])
+ array.resize(table,256)
+ return table
+
+
+# Build the 4-letter word table of a sequence.
+#   seq       : object whose str() is the sequence
+#   threshold : identity passed to threshold4()
+# Returns (table, overflow, wordmin): the word table, the overflow count
+# returned by buildTable() and the threshold returned by threshold4().
+def indexSequences(seq,double threshold=0.95):
+ cdef bytes sequence
+ cdef array.array[unsigned char] table
+ cdef int overflow
+ cdef int wordcount
+ cdef int wordmin
+
+ table = newtable()
+ sequence=bytes(str(seq))
+ overflow = buildTable(sequence,table.data.as_uchars,&wordcount)
+ wordmin = threshold4(wordcount,threshold)
+ return (table,overflow,wordmin)
+
+# Thin wrapper around the C function compareTable(): hands the raw buffers
+# of the two word tables and their overflow counts to it and returns its result.
+cpdef int countCommonWords(array.array table1,
+ int overflow1,
+ array.array table2,
+ int overflow2):
+ return compareTable(table1.data.as_uchars,overflow1,
+ table2.data.as_uchars,overflow2)
+
+# Tell whether seq1 and seq2 share enough 4-letter words for the requested
+# LCS to be reachable (see the C function ispossible()).
+#   minimum    : LCS length, or fraction of the reference length when
+#                normalized is true
+#   normalized : interpret minimum as a fraction
+#   large      : use the longest (True) or shortest (False) length as reference
+# For BioSequence arguments the word table and overflow count are cached on
+# the object (attributes word4table / word4over) and reused on later calls.
+cpdef bint isLCSReachable(object seq1,
+ object seq2,
+ double minimum,
+ bint normalized=False,
+ bint large=True):
+
+ cdef bytes se1
+ cdef bytes se2
+ cdef int l1 = len(seq1)
+ cdef int l2 = len(seq2)
+ cdef array.array[unsigned char] w1
+ cdef array.array[unsigned char] w2
+ cdef int o1
+ cdef int o2
+ cdef int wordcount # @DuplicatedSignature
+ cdef bint possible
+
+ cdef char *s1
+ cdef char *s2
+
+ # First sequence: reuse the cached table or build (and cache) a new one
+ if isinstance(seq1, BioSequence) and seq1.word4table is not None:
+ w1 = seq1.word4table
+ o1 = seq1.word4over
+ else:
+ se1=bytes(str(seq1))
+ s1=se1
+
+ w1 = newtable()
+ o1 = buildTable(s1,w1.data.as_uchars,&wordcount)
+ if isinstance(seq1, BioSequence):
+ seq1.word4table=w1
+ seq1.word4over=o1
+
+ # Second sequence: same processing
+ if isinstance(seq2, BioSequence) and seq2.word4table is not None:
+ w2 = seq2.word4table
+ o2 = seq2.word4over
+ else:
+ se2=bytes(str(seq2))
+ s2=se2
+
+ w2 = newtable()
+ o2 = buildTable(s2,w2.data.as_uchars,&wordcount)
+ if isinstance(seq2, BioSequence) :
+ seq2.word4table=w2
+ seq2.word4over=o2
+
+ possible = ispossible(l1, w1.data.as_uchars, o1,
+ l2, w2.data.as_uchars, o2,
+ minimum,normalized,large)
+
+ return possible
diff --git a/src/obitools/align/homopolymere.py b/src/obitools/align/homopolymere.py
new file mode 100644
index 0000000..5efcbff
--- /dev/null
+++ b/src/obitools/align/homopolymere.py
@@ -0,0 +1,56 @@
+'''
+Created on 14 mai 2009
+
+@author: coissac
+'''
+
+from obitools import WrappedBioSequence
+
+class HomoNucBioSeq(WrappedBioSequence):
+ '''
+ classdocs
+ '''
+
+
+ def __init__(self,reference,id=None,definition=None,**info):
+ '''
+ Constructor
+ '''
+ assert reference.isNucleotide(),"reference must be a nucleic sequence"
+ WrappedBioSequence.__init__(self,reference,id=None,definition=None,**info)
+ self.__cleanHomopolymer()
+
+ def __cleanHomopolymer(self):
+ s = []
+ c = []
+ old=None
+ nc=0
+ for n in self._wrapped:
+ if old is not None and n!=old:
+ s.append(old)
+ c.append(nc)
+ nc=0
+ old=n
+ nc+=1
+ self._cached=''.join(s)
+ self['homopolymer']=c
+ self._cumulative=[]
+ sum=0
+ for c in self._count:
+ sum+=c
+ self._cumulative.append(sum)
+
+ def __len__(self):
+ return len(self._cached)
+
+ def getStr(self):
+ return self._cached
+
+ def __iter__(self):
+ return iter(self._cached)
+
+ def _posInWrapped(self,position):
+ return self._cumulative[position]
+
+
+
\ No newline at end of file
diff --git a/src/obitools/align/ssearch.py b/src/obitools/align/ssearch.py
new file mode 100755
index 0000000..55a74ce
--- /dev/null
+++ b/src/obitools/align/ssearch.py
@@ -0,0 +1,46 @@
+import os
+import re
+
+from obitools.fasta import formatFasta
+
+class SsearchParser(object):
+
+ _matchQuery = re.compile("^Query:.+\n.+?>+([^ ]+)", re.MULTILINE)
+ _matchLQuery = re.compile("^Query:.+\n.+?(\d+)(?= nt\n)", re.MULTILINE)
+ _matchProp = re.compile("^The best scores are:.*\n(.+?)>>>", re.DOTALL+re.MULTILINE)
+ def __init__(self,file):
+ if isinstance(file,str):
+ file = open(file,'rU')
+ self.data = file.read()
+ self.query= SsearchParser._matchQuery.search(self.data).group(1)
+ self.queryLength= int(SsearchParser._matchLQuery.search(self.data).group(1))
+ props = SsearchParser._matchProp.search(self.data)
+ if props:
+ props=props.group(0).split('\n')[1:-2]
+ self.props=[]
+ for line in props:
+ subject,tab = line.split('\t')
+ tab=tab.split()
+ ssp = subject.split()
+ ac = ssp[0]
+ dbl= int(ssp[-5][:-1])
+ ident = float(tab[0])
+ matchlen = abs(int(tab[5]) - int(tab[4])) +1
+ self.props.append({"ac" :ac,
+ "identity" :ident,
+ "subjectlength":dbl,
+ 'matchlength' : matchlen})
+
+# Run `program` on `database` with `seq` (written in fasta format on the
+# standard input of the program) and return (seq, SsearchParser result).
+# NOTE(review): the command line is built by plain string interpolation of
+# program, opts and database, and the stderr pipe (ssearcherr) is never read
+# nor closed.  The default program here ('fasta35') differs from the default
+# of ssearchIterator ('ssearch35') -- confirm that this is intended.
+def run(seq,database,program='fasta35',opts=''):
+ ssearchin,ssearchout,ssearcherr = os.popen3("%s %s %s" % (program,opts,database))
+ print >>ssearchin,formatFasta(seq)
+ ssearchin.close()
+ result = SsearchParser(ssearchout)
+
+ return seq,result
+
+# Generator calling run() on every sequence of sequenceIterator and
+# yielding its (sequence, result) pairs.
+def ssearchIterator(sequenceIterator,database,program='ssearch35',opts=''):
+ for seq in sequenceIterator:
+ yield run(seq,database,program,opts)
+
+
diff --git a/src/obitools/alignment/__init__.py b/src/obitools/alignment/__init__.py
new file mode 100644
index 0000000..a89793a
--- /dev/null
+++ b/src/obitools/alignment/__init__.py
@@ -0,0 +1,175 @@
+from obitools import BioSequence
+from obitools import WrappedBioSequence
+from copy import deepcopy
+
+# Raised by AlignedSequence._posInWrapped when the requested position
+# falls in a gap.
+class GappedPositionException(Exception):
+ pass
+
+class AlignedSequence(WrappedBioSequence):
+
+ def __init__(self,reference,
+ id=None,definition=None,**info):
+ WrappedBioSequence.__init__(self,reference,id=None,definition=None,**info)
+ self._length=len(reference)
+ self._gaps=[[self._length,0]]
+
+ def clone(self):
+ seq = WrappedBioSequence.clone(self)
+ seq._gaps=deepcopy(self._gaps)
+ seq._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)
+ return seq
+
+ def setGaps(self, value):
+ '''
+ Set gap vector to an AlignedSequence.
+
+ Gap vector describes the gap positions on a sequence.
+ It is a gap of couple. The first couple member is the count
+ of sequence letter, the second one is the gap length.
+ @param value: a list of length 2 list describing gap positions
+ @type value: list of couple
+ '''
+ assert isinstance(value, list),'Gap vector must be a list'
+ assert reduce(lambda x,y: x and y,
+ (isinstance(z, list) and len(z)==2 for z in value),
+ True),"Value must be a list of length 2 list"
+
+ lseq = reduce(lambda x,y:x+y, (z[0] for z in value),0)
+ assert lseq==len(self.wrapped),"Gap vector incompatible with the sequence"
+ self._gaps = value
+ self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in value),0)
+
+ def getGaps(self):
+ return tuple(self._gaps)
+ gaps = property(getGaps, setGaps, None, "Gaps's Docstring")
+
+ def _getIndice(self,pos):
+ i=0
+ cpos=0
+ for s,g in self._gaps:
+ cpos+=s
+ if cpos>pos:
+ return i,pos-cpos+s
+ cpos+=g
+ if cpos>pos:
+ return i,-pos+cpos-g-1
+ i+=1
+ raise IndexError
+
+ def getId(self):
+ d = self._id or ("%s_ALN" % self.wrapped.id)
+ return d
+
+ def __len__(self):
+ return self._length
+
+ def getStr(self):
+ return ''.join([x for x in self])
+
+ def __iter__(self):
+ def isymb():
+ cpos=0
+ for s,g in self._gaps:
+ for x in xrange(s):
+ yield self.wrapped[cpos+x]
+ for x in xrange(g):
+ yield '-'
+ cpos+=s
+ return isymb()
+
+ def _posInWrapped(self,position):
+ i,s=self._getIndice(position)
+ if s<0:
+ raise GappedPositionException
+ value=self._gaps
+ p=reduce(lambda x,y:x+y, (z[0] for z in value[:i]),0)+s
+ return p
+
+ def getSymbolAt(self,position):
+ try:
+ return self.wrapped.getSymbolAt(self.posInWrapped(position))
+ except GappedPositionException:
+ return '-'
+
+ def insertGap(self,position,count=1):
+ if position==self._length:
+ idx=len(self._gaps)-1
+ p=-1
+ else:
+ idx,p = self._getIndice(position)
+
+ if p >= 0:
+ self._gaps.insert(idx, [p,count])
+ self._gaps[idx+1][0]-=p
+ else:
+ self._gaps[idx][1]+=count
+ self._length=reduce(lambda x,y:x+y, (z[0]+z[1] for z in self._gaps),0)
+
+
+ id = property(getId,BioSequence.setId, None, "Sequence Identifier")
+
+
+# List of BioSequence objects that all have the same length.
+# The length constraint is only enforced by append() and __setitem__();
+# the other list methods are inherited unchanged.
+class Alignment(list):
+
+ # Check that data is a BioSequence whose length matches the alignment
+ # length (recorded in self._alignlen by the first checked sequence).
+ def _assertData(self,data):
+ assert isinstance(data, BioSequence),'You must only add bioseq to an alignement'
+ if hasattr(self, '_alignlen'):
+ assert self._alignlen==len(data),'All aligned sequences must have the same length'
+ else:
+ self._alignlen=len(data)
+ return data
+
+ # New Alignment built from clones of the sequences
+ def clone(self):
+ ali = Alignment(x.clone() for x in self)
+ return ali
+
+ def append(self,data):
+ data = self._assertData(data)
+ list.append(self,data)
+
+ def __setitem__(self,index,data):
+
+ data = self._assertData(data)
+ list.__setitem__(self,index,data)
+
+ # Column `key` of the alignment as a list; returns None when key is not an int
+ def getSite(self,key):
+ if isinstance(key,int):
+ return [x[key] for x in self]
+
+ # Insert the same gap in every sequence of the alignment
+ def insertGap(self,position,count=1):
+ for s in self:
+ s.insertGap(position,count)
+
+ # True when every symbol of column `key` is a gap
+ def isFullGapSite(self,key):
+ return reduce(lambda x,y: x and y,(z=='-' for z in self.getSite(key)),True)
+
+ # True when at least one symbol of column `key` is a gap
+ def isGappedSite(self,key):
+ return '-' in self.getSite(key)
+
+ # Text rendering by blocks of 60 columns, one "id sequence" line per sequence
+ def __str__(self):
+ l = len(self[0])
+ rep=""
+ idmax = max(len(x.id) for x in self)+2
+ template= "%%-%ds %%-60s" % idmax
+ for p in xrange(0,l,60):
+ for s in self:
+ rep+= (template % (s.id,s[p:p+60])).strip() + '\n'
+ rep+="\n"
+ return rep
+
+# Build an Alignment from the sequences produced by sequenceIterator(file).
+def alignmentReader(file,sequenceIterator):
+ seqs = sequenceIterator(file)
+ alignement = Alignment()
+ for seq in seqs:
+ alignement.append(seq)
+ return alignement
+
+
+
+
+
+# Generator yielding the columns of an alignment, each one as the list of
+# the symbols found at this position in every sequence.
+# The number of columns is the length of the first sequence.
+def columnIterator(alignment):
+ lali = len(alignment[0])
+ for p in xrange(lali):
+ c = [x[p] for x in alignment]
+ yield c
\ No newline at end of file
diff --git a/src/obitools/alignment/ace.py b/src/obitools/alignment/ace.py
new file mode 100644
index 0000000..59cc8f6
--- /dev/null
+++ b/src/obitools/alignment/ace.py
@@ -0,0 +1,47 @@
+from obitools.format.genericparser import GenericParser
+from obitools.utils import universalOpen
+from obitools.fasta import parseFastaDescription
+from obitools import NucSequence
+
+
+import sys
+
+# Parser splitting a file into entries starting with a 'CO ' line and
+# extracting the AF, RD, DS and CO records of each entry.
+_contigIterator=GenericParser('^CO ')
+
+_contigIterator.addParseAction('AF', '\nAF +(\S+) +([UC]) +(-?[0-9]+)')
+_contigIterator.addParseAction('RD', '\nRD +(\S+) +([0-9]+) +([0-9]+) +([0-9]+) *\n([A-Za-z\n*]+?)\n\n')
+_contigIterator.addParseAction('DS', '\nDS +(.+)')
+_contigIterator.addParseAction('CO', '^CO (\S+)')
+
+# Generator over the contigs of a file.
+# For each entry, yields (contig name, list of NucSequence) where every
+# read has been padded with '-' on both sides so that all the returned
+# sequences have the same length.  The tags 'shift' and 'tail' record the
+# padding applied to each read.
+def contigIterator(file):
+ file = universalOpen(file)
+ for entry in _contigIterator(file):
+ contig=[]
+ # RD, DS and AF records are consumed in parallel, in file order
+ for rd,ds,af in map(None,entry['RD'],entry['DS'],entry['AF']):
+ id = rd[0]
+ shift = int(af[2])
+ if shift < 0:
+ # Only a warning: the 'continue' below is disabled, so the read is kept
+ print >> sys.stderr,"Sequence %s in contig %s has a negative paddng value %d : skipped" % (id,entry['CO'][0],shift)
+ #continue
+
+ definition,info = parseFastaDescription(ds)
+ info['shift']=shift
+ # '*' symbols of the read are turned into gaps
+ seq = rd[4].replace('\n','').replace('*','-').strip()
+ contig.append(NucSequence(id,seq,definition,**info))
+
+ maxlen = max(len(x)+x['shift'] for x in contig)
+ minshift=min(x['shift'] for x in contig)
+ rep = []
+
+ for s in contig:
+ info = s.getTags()
+ # Shifts are made relative to the smallest one, starting at 1
+ info['shift']-=minshift-1
+ head = '-' * (info['shift']-1)
+
+ tail = (maxlen + minshift - len(s) - info['shift'] - 1)
+ info['tail']=tail
+ newseq = NucSequence(s.id,head + str(s)+ '-' * tail,s.definition,**info)
+ rep.append(newseq)
+
+ yield entry['CO'][0],rep
+
\ No newline at end of file
diff --git a/src/obitools/barcodecoverage/__init__.py b/src/obitools/barcodecoverage/__init__.py
new file mode 100644
index 0000000..09e542e
--- /dev/null
+++ b/src/obitools/barcodecoverage/__init__.py
@@ -0,0 +1,7 @@
+'''
+
+@author: merciece
+Creates the tree representing the coverage of 2 primers from an ecoPCR output file and an ecoPCR database.
+
+
+'''
\ No newline at end of file
diff --git a/src/obitools/barcodecoverage/calcBc.py b/src/obitools/barcodecoverage/calcBc.py
new file mode 100644
index 0000000..13b0401
--- /dev/null
+++ b/src/obitools/barcodecoverage/calcBc.py
@@ -0,0 +1,62 @@
+#!/usr/local/bin/python
+'''
+Created on 24 nov. 2011
+
+@author: merciece
+'''
+
+
+# Compute, for each taxonomic group, the percentage of its taxa that are
+# amplified.
+#   amplifiedSeqs : amplified sequences (each with a 'taxid' tag and an id)
+#   seqsFromDB    : reference sequences (each with a 'taxid' tag)
+#   keptRanks     : ranks of the ancestors used as groups
+#   errors        : dict indexed by sequence id; element [2] is compared to 3
+#   tax           : taxonomy providing parentalTreeIterator(taxid)
+# Returns a dict {group: percentage rounded to 2 digits}; groups without
+# any amplified taxon get 0.0.
+def main(amplifiedSeqs, seqsFromDB, keptRanks, errors, tax) :
+ '''
+ error threshold is set to 3
+ '''
+
+ # Set of taxids observed in the database for each group
+ listtaxabygroupinDB = {}
+
+ for seq in seqsFromDB :
+ taxid = seq['taxid']
+ p = [a for a in tax.parentalTreeIterator(taxid)]
+ for a in p :
+ # The first element of the path (the taxon itself) is not a group
+ if a != p[0] :
+ if a[1] in keptRanks :
+ group = a[0]
+ if group in listtaxabygroupinDB and taxid not in listtaxabygroupinDB[group] :
+ listtaxabygroupinDB[group].add(taxid)
+ elif group not in listtaxabygroupinDB :
+ listtaxabygroupinDB[group]=set([taxid])
+
+ taxabygroup = dict((x,len(listtaxabygroupinDB[x])) for x in listtaxabygroupinDB)
+
+ # Same computation restricted to amplified sequences with at most 3 errors
+ listamplifiedtaxabygroup = {}
+
+ for seq in amplifiedSeqs :
+ if errors[seq.id][2] <= 3 :
+ taxid = seq['taxid']
+ p = [a for a in tax.parentalTreeIterator(taxid)]
+ for a in p :
+ if a != p[0] :
+ if a[1] in keptRanks :
+ group = a[0]
+ if group in listamplifiedtaxabygroup and taxid not in listamplifiedtaxabygroup[group] :
+ listamplifiedtaxabygroup[group].add(taxid)
+ elif group not in listamplifiedtaxabygroup :
+ listamplifiedtaxabygroup[group]=set([taxid])
+
+ amplifiedtaxabygroup = dict((x,len(listamplifiedtaxabygroup[x])) for x in listamplifiedtaxabygroup)
+
+ BcValues = {}
+
+ groups = [g for g in taxabygroup.keys()]
+
+ for g in groups :
+ if g in amplifiedtaxabygroup :
+ BcValues[g] = float(amplifiedtaxabygroup[g])/taxabygroup[g]*100
+ BcValues[g] = round(BcValues[g], 2)
+ else :
+ BcValues[g] = 0.0
+
+ return BcValues
+
+
+
+
diff --git a/src/obitools/barcodecoverage/drawBcTree.py b/src/obitools/barcodecoverage/drawBcTree.py
new file mode 100644
index 0000000..9b1e215
--- /dev/null
+++ b/src/obitools/barcodecoverage/drawBcTree.py
@@ -0,0 +1,108 @@
+#!/usr/local/bin/python
+'''
+Created on 25 nov. 2011
+
+@author: merciece
+'''
+
+from obitools.graph.rootedtree import nexusFormat
+
+
+figtree="""\
+begin figtree;
+ set appearance.backgroundColorAttribute="User Selection";
+ set appearance.backgroundColour=#-1;
+ set appearance.branchColorAttribute="bc";
+ set appearance.branchLineWidth=2.0;
+ set appearance.foregroundColour=#-16777216;
+ set appearance.selectionColour=#-2144520576;
+ set branchLabels.colorAttribute="User Selection";
+ set branchLabels.displayAttribute="errors";
+ set branchLabels.fontName="sansserif";
+ set branchLabels.fontSize=10;
+ set branchLabels.fontStyle=0;
+ set branchLabels.isShown=true;
+ set branchLabels.significantDigits=4;
+ set layout.expansion=2000;
+ set layout.layoutType="RECTILINEAR";
+ set layout.zoom=0;
+ set nodeBars.barWidth=4.0;
+ set nodeLabels.colorAttribute="User Selection";
+ set nodeLabels.displayAttribute="label";
+ set nodeLabels.fontName="sansserif";
+ set nodeLabels.fontSize=10;
+ set nodeLabels.fontStyle=0;
+ set nodeLabels.isShown=true;
+ set nodeLabels.significantDigits=4;
+ set polarLayout.alignTipLabels=false;
+ set polarLayout.angularRange=0;
+ set polarLayout.rootAngle=0;
+ set polarLayout.rootLength=100;
+ set polarLayout.showRoot=true;
+ set radialLayout.spread=0.0;
+ set rectilinearLayout.alignTipLabels=false;
+ set rectilinearLayout.curvature=0;
+ set rectilinearLayout.rootLength=100;
+ set scale.offsetAge=0.0;
+ set scale.rootAge=1.0;
+ set scale.scaleFactor=1.0;
+ set scale.scaleRoot=false;
+ set scaleAxis.automaticScale=true;
+ set scaleAxis.fontSize=8.0;
+ set scaleAxis.isShown=false;
+ set scaleAxis.lineWidth=2.0;
+ set scaleAxis.majorTicks=1.0;
+ set scaleAxis.origin=0.0;
+ set scaleAxis.reverseAxis=false;
+ set scaleAxis.showGrid=true;
+ set scaleAxis.significantDigits=4;
+ set scaleBar.automaticScale=true;
+ set scaleBar.fontSize=10.0;
+ set scaleBar.isShown=true;
+ set scaleBar.lineWidth=1.0;
+ set scaleBar.scaleRange=0.0;
+ set scaleBar.significantDigits=4;
+ set tipLabels.colorAttribute="User Selection";
+ set tipLabels.displayAttribute="Names";
+ set tipLabels.fontName="sansserif";
+ set tipLabels.fontSize=10;
+ set tipLabels.fontStyle=0;
+ set tipLabels.isShown=true;
+ set tipLabels.significantDigits=4;
+ set trees.order=false;
+ set trees.orderType="increasing";
+ set trees.rooting=false;
+ set trees.rootingType="User Selection";
+ set trees.transform=false;
+ set trees.transformType="cladogram";
+end;
+"""
+
+
+# Return a predicate telling whether a node has a 'rank' entry equal to `rank`.
+def cartoonRankGenerator(rank):
+ def cartoon(node):
+ return 'rank' in node and node['rank']==rank
+
+ return cartoon
+
+
+# Return a predicate telling whether a node has a 'bc' entry lower than or
+# equal to Bclimit.
+def collapseBcGenerator(Bclimit):
+ def collapse(node):
+ return 'bc' in node and node['bc']<=Bclimit
+ return collapse
+
+
+# Label of a node: its name, preceded by its signed 'bc' value between
+# parentheses when the node has one.
+def label(node):
+ if 'bc' in node:
+ return "(%+3.1f) %s" % (node['bc'],node['name'])
+ else:
+ return " %s" % node['name']
+
+
+# Print coverageTree in nexus format on the standard output, with the
+# figtree block appended and the nodes of rank 'family' passed as cartoon
+# predicate.  The collapse option is currently disabled.
+def main(coverageTree) :
+ print nexusFormat(coverageTree,
+ label=label,
+ blocks=figtree,
+ cartoon=cartoonRankGenerator('family'))
+ #collapse=collapseBcGenerator(70))
+
diff --git a/src/obitools/barcodecoverage/findErrors.py b/src/obitools/barcodecoverage/findErrors.py
new file mode 100644
index 0000000..dae20a0
--- /dev/null
+++ b/src/obitools/barcodecoverage/findErrors.py
@@ -0,0 +1,56 @@
+#!/usr/local/bin/python
+'''
+Created on 24 nov. 2011
+
+@author: merciece
+'''
+
+
+# Return (errors by sequence, errors by taxonomic group) for the given sequences.
+def main(seqs, keptRanks, tax):
+ errorsBySeq = getErrorsOnLeaves(seqs)
+ errorsByTaxon = propagateErrors(errorsBySeq, keptRanks, tax)
+ return errorsBySeq, errorsByTaxon
+
+
+# Build a dict indexed by sequence id whose values are the lists
+# [forward errors, reverse errors, total errors, 1, taxid]
+# read from the 'forward_error', 'reverse_error' and 'taxid' tags.
+def getErrorsOnLeaves(seqs) :
+ errors = {}
+ for s in seqs :
+ taxid = s['taxid']
+ forErrs = s['forward_error']
+ revErrs = s['reverse_error']
+ total = forErrs + revErrs
+ seqNb = 1
+ errors[s.id] = [forErrs,revErrs,total,seqNb,taxid]
+ return errors
+
+
+def propagateErrors(errorsOnLeaves, keptRanks, tax) :
+ allErrors = {}
+ for seq in errorsOnLeaves :
+ taxid = errorsOnLeaves[seq][4]
+ p = [a for a in tax.parentalTreeIterator(taxid)]
+ for a in p :
+ if a[1] in keptRanks :
+ group = a[0]
+ if group in allErrors :
+ allErrors[group][0] += errorsOnLeaves[seq][0]
+ allErrors[group][1] += errorsOnLeaves[seq][1]
+ allErrors[group][2] += errorsOnLeaves[seq][2]
+ allErrors[group][3] += 1
+ else :
+ allErrors[group] = errorsOnLeaves[seq]
+
+ for group in allErrors :
+ allErrors[group][0] /= float(allErrors[group][3])
+ allErrors[group][1] /= float(allErrors[group][3])
+ allErrors[group][2] /= float(allErrors[group][3])
+
+ allErrors[group][0] = round(allErrors[group][0], 2)
+ allErrors[group][1] = round(allErrors[group][1], 2)
+ allErrors[group][2] = round(allErrors[group][2], 2)
+
+ return allErrors
+
+
+
+
diff --git a/src/obitools/barcodecoverage/readFiles.py b/src/obitools/barcodecoverage/readFiles.py
new file mode 100644
index 0000000..b03e72a
--- /dev/null
+++ b/src/obitools/barcodecoverage/readFiles.py
@@ -0,0 +1,69 @@
+#!/usr/local/bin/python
+'''
+Created on 23 nov. 2011
+
+@author: merciece
+'''
+
+from obitools.ecopcr import sequence
+from obitools.ecopcr import taxonomy
+
+
+# Return (data read from the ecoPCR database, entries belonging to its ingroup).
+def main(entries,options):
+ filteredDataFromDB = ecoPCRDatabaseReader(options)
+ filteredData = ecoPCRFileReader(entries,filteredDataFromDB)
+ return filteredDataFromDB,filteredData
+
+
+# Read the ecoPCR database designated by options.taxonomy.
+# Returns a dict with keys:
+#   'ingroup'  : {sequence id: sequence} for taxa having options.ingroup as ancestor
+#   'outgroup' : {sequence id: sequence} for the other sequences
+#   'ranks'    : set of rank ids whose usage ratio exceeds options.rankthresold
+#   'taxonomy' : the EcoTaxonomyDB instance
+# Only sequences carrying a 'taxid' tag are considered.
+def ecoPCRDatabaseReader(options):
+
+ tax = taxonomy.EcoTaxonomyDB(options.taxonomy)
+ seqs = sequence.EcoPCRDBSequenceIterator(options.taxonomy,taxonomy=tax)
+
+ norankid = tax.findRankByName('no rank')
+ speciesid = tax.findRankByName('species')
+ genusid = tax.findRankByName('genus')
+ familyid = tax.findRankByName('family')
+
+ # Ranks that the lineage of a sequence must contain to be counted
+ minrankseq = set([speciesid,genusid,familyid])
+
+ # Number of counted sequences whose lineage contains each rank
+ usedrankid = {}
+
+ ingroup = {}
+ outgroup= {}
+
+ for s in seqs :
+ if 'taxid' in s :
+ taxid = s['taxid']
+ allrank = set()
+ for p in tax.parentalTreeIterator(taxid):
+ if p[1]!=norankid:
+ allrank.add(p[1])
+ if len(minrankseq & allrank) == 3:
+ for r in allrank:
+ usedrankid[r]=usedrankid.get(r,0) + 1
+
+ if tax.isAncestor(options.ingroup,taxid):
+ ingroup[s.id] = s
+ else:
+ outgroup[s.id] = s
+
+ # NOTE(review): raises ZeroDivisionError when usedrankid is not empty
+ # but no sequence belongs to the ingroup -- confirm whether callers
+ # can hit this case.
+ keptranks = set(r for r in usedrankid
+ if float(usedrankid[r])/float(len(ingroup)) > options.rankthresold)
+
+ return { 'ingroup' : ingroup,
+ 'outgroup': outgroup,
+ 'ranks' : keptranks,
+ 'taxonomy': tax
+ }
+
+
+# Return the list of entries that carry a 'taxid' tag and whose id is a key
+# of filteredDataFromDB['ingroup'].
+def ecoPCRFileReader(entries,filteredDataFromDB) :
+ filteredData = []
+ for s in entries :
+ if 'taxid' in s :
+ seqId = s.id
+ if seqId in filteredDataFromDB['ingroup'] :
+ filteredData.append(s)
+ return filteredData
+
diff --git a/src/obitools/barcodecoverage/writeBcTree.py b/src/obitools/barcodecoverage/writeBcTree.py
new file mode 100644
index 0000000..7c8243e
--- /dev/null
+++ b/src/obitools/barcodecoverage/writeBcTree.py
@@ -0,0 +1,42 @@
+#!/usr/local/bin/python
+'''
+Created on 25 nov. 2011
+
+@author: merciece
+'''
+
+from obitools.graph.rootedtree import RootedTree
+
+
+# Build a RootedTree with one node per taxon of BcValues.
+#   BcValues : dict {taxon: coverage value}
+#   errors   : dict {taxon: [forward, reverse, total, ...]}; taxa missing
+#              from it get -1.0 for the three values
+#   tax      : taxonomy providing getRank, getScientificName and
+#              parentalTreeIterator
+# Each node is linked to its closest ancestor that is also a key of BcValues.
+def main(BcValues,errors,tax) :
+
+ tree = RootedTree()
+ # Taxa that have not been linked to a parent yet
+ tset = set(BcValues)
+
+ for taxon in BcValues:
+ if taxon in errors :
+ forErr = errors[taxon][0]
+ revErr = errors[taxon][1]
+ totErr = errors[taxon][2]
+ else :
+ forErr = -1.0
+ revErr = -1.0
+ totErr = -1.0
+
+ tree.addNode(taxon, rank=tax.getRank(taxon),
+ name=tax.getScientificName(taxon),
+ bc = BcValues[taxon],
+ errors = str(forErr)+' '+str(revErr),
+ totError = totErr
+ )
+
+ for taxon in BcValues:
+ piter = tax.parentalTreeIterator(taxon)
+ # First item of the iterator is the starting taxon; then walk up the lineage
+ taxon = piter.next()
+ for parent in piter:
+ if taxon[0] in tset and parent[0] in BcValues:
+ tset.remove(taxon[0])
+ tree.addEdge(parent[0], taxon[0])
+ taxon=parent
+
+ return tree
diff --git a/src/obitools/blast/__init__.py b/src/obitools/blast/__init__.py
new file mode 100644
index 0000000..11b5274
--- /dev/null
+++ b/src/obitools/blast/__init__.py
@@ -0,0 +1,207 @@
+from os import popen2
+from itertools import imap,count
+
+from obitools.table import iTableIterator,TableRow,Table,SelectionIterator
+from obitools.utils import ColumnFile
+from obitools.location import SimpleLocation
+from obitools.fasta import formatFasta
+import sys
+
+# Callable wrapper around a command line blast program.
+# Calling an instance with a sequence runs the program and returns a
+# BlastResultIterator over its tabular output.
+class Blast(object):
+ '''
+ Run blast
+ '''
+
+ # mode    : value of the -p option
+ # db      : value of the -d option
+ # program : executable name
+ # options : extra options, each keyword k=v being rendered as '-k v'
+ def __init__(self,mode,db,program='blastall',**options):
+ self._mode = mode
+ self._db = db
+ self._program = program
+ self._options = options
+
+ def getMode(self):
+ return self._mode
+
+
+ def getDb(self):
+ return self._db
+
+
+ def getProgram(self):
+ return self._program
+
+ # Build the command line (tabular output is requested with '-m 8').
+ # NOTE(review): the command is assembled by plain string interpolation,
+ # without any quoting of db or option values.
+ def _blastcmd(self):
+ tmp = """%(program)s \\
+ -p %(mode)s \\
+ -d %(db)s \\
+ -m 8 \\
+ %(options)s \\
+ """
+ options = ' '.join(['-%s %s' % (x[0],str(x[1]))
+ for x in self._options.iteritems()])
+ data = {
+ 'program' : self.program,
+ 'db' : self.db,
+ 'mode' : self.mode,
+ 'options' : options
+ }
+
+ return tmp % data
+
+ def __call__(self,sequence):
+ '''
+ Run blast with one sequence object
+ @param sequence:
+ @type sequence:
+ '''
+ cmd = self._blastcmd()
+
+ (blast_in,blast_out) = popen2(cmd)
+
+ # The query is sent in fasta format on the standard input of the program
+ print >>blast_in,formatFasta(sequence)
+ blast_in.close()
+
+ blast = BlastResultIterator(blast_out)
+
+ return blast
+
+ mode = property(getMode, None, None, "Mode's Docstring")
+
+ db = property(getDb, None, None, "Db's Docstring")
+
+ program = property(getProgram, None, None, "Program's Docstring")
+
+
+# Blast variant using the 'blastcl3' executable instead of 'blastall'.
+class NetBlast(Blast):
+ '''
+ Run blast on ncbi servers
+ '''
+
+ def __init__(self,mode,db,**options):
+ '''
+
+ @param mode:
+ @param db:
+ '''
+ Blast.__init__(self, mode, db, 'blastcl3',**options)
+
+
+# Iterator over the lines of a tabular blast output, each one being
+# returned as a BlastMatch row.
+class BlastResultIterator(iTableIterator):
+
+ def __init__(self,blastoutput,query=None):
+ '''
+
+ @param blastoutput:
+ @type blastoutput:
+ '''
+ # Tab separated columns, lines starting with '#' skipped,
+ # values converted according to self.types
+ self._blast = ColumnFile(blastoutput,
+ strip=True,
+ skip="#",
+ sep="\t",
+ types=self.types
+ )
+ self._query = query
+ # Map each column header to its column index
+ self._hindex = dict((k,i) for i,k in imap(None,count(),self._getHeaders()))
+
+ def _getHeaders(self):
+ return ('Query id','Subject id',
+ '% identity','alignment length',
+ 'mismatches', 'gap openings',
+ 'q. start', 'q. end',
+ 's. start', 's. end',
+ 'e-value', 'bit score')
+
+ # Column types, in the same order as the headers
+ def _getTypes(self):
+ return (str,str,
+ float,int,
+ int,int,
+ int,int,
+ int,int,
+ float,float)
+
+ def _getRowFactory(self):
+ return BlastMatch
+
+ def _getSubrowFactory(self):
+ return TableRow
+
+ def _getQuery(self):
+ return self._query
+
+
+ headers = property(_getHeaders,None,None)
+ types = property(_getTypes,None,None)
+ rowFactory = property(_getRowFactory,None,None)
+ subrowFactory = property(_getSubrowFactory,None,None)
+ query = property(_getQuery,None,None)
+
+ def next(self):
+ '''
+
+ '''
+ value = self._blast.next()
+ return self.rowFactory(self,value)
+
+
+
+# Table subclass adding no behaviour of its own.
+class BlastResult(Table):
+ '''
+ Results of a blast run
+ '''
+
+# Row of a tabular blast output.  Columns are accessed by index following
+# the header order of BlastResultIterator (6/7: query start/end,
+# 8/9: subject start/end, 1: subject id).
+class BlastMatch(TableRow):
+ '''
+ Blast high scoring pair between two sequences
+ '''
+
+ def getQueryLocation(self):
+ l = SimpleLocation(self[6], self[7])
+ return l
+
+ def getSubjectLocation(self):
+ l = SimpleLocation(self[8], self[9])
+ return l
+
+ # Look up the subject sequence in `database` by its subject id
+ def getSubjectSequence(self,database):
+ return database[self[1]]
+
+ def queryCov(self,query=None):
+ '''
+ Compute coverage of match on query sequence.
+
+ @param query: the query sequence. Default is None.
+ In this case the query sequence associated
+ to this blast result is used.
+ @type query: L{obitools.BioSequence}
+
+ @return: coverage fraction
+ @rtype: float
+ '''
+ if query is None:
+ query = self.table.query
+ assert query is not None
+ return float(self[7]-self[6]+1)/float(len(query))
+
+ # The virtual column 'query coverage' is computed on the fly when the
+ # table knows its query; every other key is delegated to TableRow
+ def __getitem__(self,key):
+ if key=='query coverage' and self.table.query is not None:
+ return self.queryCov()
+ else:
+ return TableRow.__getitem__(self,key)
+
+# SelectionIterator keeping only the rows whose query coverage is at least
+# covmin, in addition to the conditions handled by SelectionIterator.
+class BlastCovMinFilter(SelectionIterator):
+
+ def __init__(self,blastiterator,covmin,query=None,**conditions):
+ # NOTE(review): the default query is read from blastiterator.table.query;
+ # BlastResultIterator itself only defines a 'query' property -- confirm
+ # that iTableIterator provides a 'table' attribute.
+ if query is None:
+ query = blastiterator.table.query
+ assert query is not None
+ SelectionIterator.__init__(self,blastiterator,**conditions)
+ self._query = query
+ self._covmin=covmin
+
+ def _covMinPredicat(self,row):
+ return row.queryCov(self._query)>=self._covmin
+
+ def _checkCondition(self,row):
+ return self._covMinPredicat(row) and SelectionIterator._checkCondition(self, row)
+
+
+
\ No newline at end of file
diff --git a/src/obitools/carto/__init__.py b/src/obitools/carto/__init__.py
new file mode 100644
index 0000000..b7ac176
--- /dev/null
+++ b/src/obitools/carto/__init__.py
@@ -0,0 +1,376 @@
+# -*- coding: latin1 -*-
+
+
+
+from obitools import SVGdraw
+import math
+
+class Map(object):
+ """
+ Map represente une instance d'une carte genetique physique.
+ Une telle carte est definie par la longueur de la sequence
+ qui lui est associe.
+
+ A une carte est associe un certain nombre de niveaux (Level)
+ eux meme decoupe en sous-niveau (SubLevel)
+ Les sous niveaux contiennent eux des features
+ """
+ def __init__(self,name,seqlength,scale=1):
+ """
+ Constructeur d'une nouvelle carte
+
+ *Param*:
+
+ name
+ nom de la carte
+
+ seqlength
+ longueur de la sequence associee a la carte
+
+ scale
+ echelle de la carte indicant combien de pixel
+ correspondent a une unite de la carte
+ """
+ self.name = name
+ self.seqlength = seqlength
+ self.scale = scale
+ self.levels = {}
+ self.basicHSize = 10
+
+ def __str__(self):
+ return '<%s>' % self.name
+
+ def __getitem__(self,level):
+ """
+ retourne le niveau *level* de la carte et
+ le cree s'il n'existe pas
+ """
+ if not isinstance(level,int):
+ raise TypeError('level must be an non Zero integer value')
+ elif level==0:
+ raise AssertionError('Level cannot be set to 0')
+ try:
+ return self.levels[level]
+ except KeyError:
+ self.levels[level] = Level(level,self)
+ return self.levels[level]
+
+ def getBasicHSize(self):
+ """
+ retourne la hauteur de base d'un element de cartographie
+ exprimee en pixel
+ """
+ return self.basicHSize
+
+ def getScale(self):
+ """
+ Retourne l'echelle de la carte en nombre de pixels par
+ unite physique de la carte
+ """
+ return self.scale
+
+
+
+ def getNegativeBase(self):
+ return reduce(lambda x,y:x-y,[self.levels[z].getHeight()
+ for z in self.levels
+ if z < 0],self.getHeight())
+
+ def getPositiveBase(self):
+ return self.getNegativeBase() - 3 * self.getBasicHSize()
+
+ def getHeight(self):
+ return reduce(lambda x,y:x+y,[z.getHeight() for z in self.levels.values()],0) \
+ + 4 * self.getBasicHSize()
+
+ def toXML(self,file=None,begin=0,end=None):
+ dessin = SVGdraw.drawing()
+ if end==None:
+ end = self.seqlength
+ hauteur= self.getHeight()
+ largeur=(end-begin+1)*self.scale
+ svg = SVGdraw.svg((begin*self.scale,0,largeur,hauteur),
+ '%fpx' % (self.seqlength * self.scale),
+ '%dpx' % hauteur)
+
+ centre = self.getPositiveBase() + (1 + 1/4) * self.getBasicHSize()
+ svg.addElement(SVGdraw.rect(0,centre,self.seqlength * self.scale,self.getBasicHSize()/2))
+ for e in self.levels.values():
+ svg.addElement(e.getElement())
+ dessin.setSVG(svg)
+ return dessin.toXml(file)
+
+class Feature(object):
+ pass
+
+class Level(object):
+
+ def __init__(self,level,map):
+ if not isinstance(map,Map):
+ raise AssertionError('map is not an instance of class Map')
+ if level in map.levels:
+ raise AssertionError('Level %d already define for map %s' % (level,map))
+ else:
+ map.levels[level] = self
+ self.map = map
+ self.level = level
+ self.sublevels = {}
+
+ def __getitem__(self,sublevel):
+ """
+ retourne le niveau *sublevel* du niveau en
+ le creant s'il n'existe pas
+ """
+ if not isinstance(sublevel,int):
+ raise TypeError('sublevel must be a positive integer value')
+ elif sublevel<0:
+ raise AssertionError('Level cannot be negative')
+ try:
+ return self.sublevels[sublevel]
+ except KeyError:
+ self.sublevels[sublevel] = SubLevel(sublevel,self)
+ return self.sublevels[sublevel]
+
+ def getBase(self):
+ if self.level < 0:
+ base = self.map.getNegativeBase()
+ base += reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
+ for z in self.map.levels
+ if z <0 and z >= self.level],0)
+ return base
+ else:
+ base = self.map.getPositiveBase()
+ base -= reduce(lambda x,y:x+y,[self.map.levels[z].getHeight()
+ for z in self.map.levels
+ if z >0 and z < self.level],0)
+ return base
+
+ def getElement(self):
+ objet = SVGdraw.group('level%d' % self.level)
+ for e in self.sublevels.values():
+ objet.addElement(e.getElement())
+ return objet
+
+
+
+ def getHeight(self):
+ return reduce(lambda x,y:x+y,[z.getHeight() for z in self.sublevels.values()],0) \
+ + 2 * self.map.getBasicHSize()
+
+class SubLevel(object):
+
+ def __init__(self,sublevel,level):
+ if not isinstance(level,Level):
+ raise AssertionError('level is not an instance of class Level')
+ if level in level.sublevels:
+ raise AssertionError('Sublevel %d already define for level %s' % (sublevel,level))
+ else:
+ level.sublevels[sublevel] = self
+ self.level = level
+ self.sublevel = sublevel
+ self.features = {}
+
+ def getHeight(self):
+ return max([x.getHeight() for x in self.features.values()]+[0]) + 4 * self.level.map.getBasicHSize()
+
+ def getBase(self):
+ base = self.level.getBase()
+ if self.level.level < 0:
+ base -= self.level.getHeight() - 2 * self.level.map.getBasicHSize()
+ base += reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
+ for z in self.level.sublevels
+ if z <= self.sublevel],0)
+ base -= 2* self.level.map.getBasicHSize()
+ else:
+ base -= reduce(lambda x,y:x+y,[self.level.sublevels[z].getHeight()
+ for z in self.level.sublevels
+ if z < self.sublevel],0)
+ base -= self.level.map.getBasicHSize()
+ return base
+
+ def getElement(self):
+ base = self.getBase()
+ objet = SVGdraw.group('sublevel%d' % self.sublevel)
+ for e in self.features.values():
+ objet.addElement(e.getElement(base))
+ return objet
+
+ def add(self,feature):
+ if not isinstance(feature,Feature):
+ raise TypeError('feature must be an instance oof Feature')
+ if feature.name in self.features:
+ raise AssertionError('A feature with the same name (%s) have already be insert in this sublevel'
+ % feature.name)
+ self.features[feature.name]=feature
+ feature.sublevel=self
+
+class SimpleFeature(Feature):
+
+ def __init__(self,name,begin,end,visiblename=False,color=0):
+ self.begin = begin
+ self.end = end
+ self.name = name
+ self.color = color
+ self.sublevel = None
+ self.visiblename=visiblename
+
+ def getHeight(self):
+ if not self.sublevel:
+ raise AssertionError('Not affected Simple feature')
+ if self.visiblename:
+ return self.sublevel.level.map.getBasicHSize() * 2
+ else:
+ return self.sublevel.level.map.getBasicHSize()
+
+ def getElement(self,base):
+ scale = self.sublevel.level.map.getScale()
+ y = base - self.sublevel.level.map.getBasicHSize()
+ x = self.begin * scale
+ width = (self.end - self.begin + 1) * scale
+ heigh = self.sublevel.level.map.getBasicHSize()
+
+ objet = SVGdraw.rect(x,y,width,heigh,stroke=self.color)
+ objet.addElement(SVGdraw.description(self.name))
+
+ return objet
+
+class BoxFeature(SimpleFeature):
+
+ def getHeight(self):
+ if not self.sublevel:
+ raise AssertionError('Not affected Box feature')
+ if self.visiblename:
+ return self.sublevel.level.map.getBasicHSize() * 4
+ else:
+ return self.sublevel.level.map.getBasicHSize() * 3
+
+ def getElement(self,base):
+ scale = self.sublevel.level.map.getScale()
+ y = base - self.sublevel.level.map.getBasicHSize() * 2
+ x = self.begin * scale
+ width = (self.end - self.begin + 1) * scale
+ height = self.sublevel.level.map.getBasicHSize() * 3
+
+ objet = SVGdraw.rect(x,y,width,height,stroke=self.color,fill="none")
+ objet.addElement(SVGdraw.description(self.name))
+
+ return objet
+
+class MultiPartFeature(Feature):
+
+ def __init__(self,name,*args,**kargs):
+ self.limits = args
+ self.name = name
+ try:
+ self.color = kargs['color']
+ except KeyError:
+ self.color = "black"
+
+ try:
+ self.visiblename=kargs['visiblename']
+ except KeyError:
+ self.visiblename=None
+
+ try:
+ self.flatlink=kargs['flatlink']
+ except KeyError:
+ self.flatlink=False
+
+ try:
+ self.roundlink=kargs['roundlink']
+ except KeyError:
+ self.roundlink=False
+
+ self.sublevel = None
+
+
+ def getHeight(self):
+ if not self.sublevel:
+ raise AssertionError('Not affected Simple feature')
+ if self.visiblename:
+ return self.sublevel.level.map.getBasicHSize() * 3
+ else:
+ return self.sublevel.level.map.getBasicHSize() * 2
+
+ def getElement(self,base):
+ scale = self.sublevel.level.map.getScale()
+
+ y = base - self.sublevel.level.map.getBasicHSize()
+ height = self.sublevel.level.map.getBasicHSize()
+ objet = SVGdraw.group(self.name)
+ for (debut,fin) in self.limits:
+ x = debut * scale
+ width = (fin - debut + 1) * scale
+ part = SVGdraw.rect(x,y,width,height,fill=self.color)
+ objet.addElement(part)
+
+ debut = self.limits[0][1]
+ for (fin,next) in self.limits[1:]:
+ debut*=scale
+ fin*=scale
+ path = SVGdraw.pathdata(debut,y + height / 2)
+ delta = height / 2
+ if self.roundlink:
+ path.qbezier((debut+fin)/2, y - delta,fin,y + height / 2)
+ else:
+ if self.flatlink:
+ delta = - height / 2
+ path.line((debut+fin)/2, y - delta)
+ path.line(fin,y + height / 2)
+ path = SVGdraw.path(path,fill="none",stroke=self.color)
+ objet.addElement(path)
+ debut = next
+
+ objet.addElement(SVGdraw.description(self.name))
+
+ return objet
+
+class TagFeature(Feature):
+
+ def __init__(self,name,begin,length,ratio,visiblename=False,color=0):
+ self.begin = begin
+ self.length = length
+ self.ratio = ratio
+ self.name = name
+ self.color = color
+ self.sublevel = None
+ self.visiblename=visiblename
+
+ def getHeight(self):
+ if not self.sublevel:
+ raise AssertionError('Not affected Tag feature')
+
+ return self.sublevel.level.map.getBasicHSize()*11
+
+ def getElement(self,base):
+ scale = self.sublevel.level.map.getScale()
+ height = math.floor(max(1,self.sublevel.level.map.getBasicHSize()* 10 * self.ratio))
+ y = base + self.sublevel.level.map.getBasicHSize() - height
+ x = self.begin * scale
+ width = self.length * scale
+ objet = SVGdraw.rect(x,y,width,height,stroke=self.color)
+ objet.addElement(SVGdraw.description(self.name))
+
+ return objet
+
+if __name__ == '__main__':
+ carte = Map('essai',20000,scale=0.5)
+ carte[-1][0].add(SimpleFeature('toto',100,300))
+ carte[1][0].add(SimpleFeature('toto',100,300))
+ carte[1][1].add(SimpleFeature('toto',200,1000))
+
+ carte[1][0].add(MultiPartFeature('bout',(1400,1450),(1470,1550),(1650,1800),color='red',flatlink=True))
+ carte[1][0].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='red',flatlink=True))
+ carte[-1][1].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='green'))
+ carte[-1][2].add(MultiPartFeature('titi',(400,450),(470,550),(650,800),color='purple',roundlink=True))
+
+ carte[-1][1].add(BoxFeature('tutu',390,810,color='purple'))
+ carte[1][0].add(BoxFeature('tutu',390,810,color='red'))
+ carte[2][0].add(TagFeature('t1',1400,20,0.8))
+ carte[2][0].add(TagFeature('t2',1600,20,0.2))
+ carte.basicHSize=6
+ print carte.toXML('truc.svg',begin=0,end=1000)
+ print carte.toXML('truc2.svg',begin=460,end=2000)
+
+
+
diff --git a/src/obitools/collections.py b/src/obitools/collections.py
new file mode 100644
index 0000000..96c4512
--- /dev/null
+++ b/src/obitools/collections.py
@@ -0,0 +1,190 @@
+from operator import itemgetter
+from heapq import nlargest
+from itertools import repeat, ifilter
+
+class Counter(dict):
+ '''Dict subclass for counting hashable objects. Sometimes called a bag
+ or multiset. Elements are stored as dictionary keys and their counts
+ are stored as dictionary values.
+
+ >>> Counter('zyzygy')
+ Counter({'y': 3, 'z': 2, 'g': 1})
+
+ '''
+
+ def __init__(self, iterable=None, **kwds):
+ '''Create a new, empty Counter object. And if given, count elements
+ from an input iterable. Or, initialize the count from another mapping
+ of elements to their counts.
+
+ >>> c = Counter() # a new, empty counter
+ >>> c = Counter('gallahad') # a new counter from an iterable
+ >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping
+ >>> c = Counter(a=4, b=2) # a new counter from keyword args
+
+ '''
+ self.update(iterable, **kwds)
+
+ def __missing__(self, key):
+ return 0
+
+ def most_common(self, n=None):
+ '''List the n most common elements and their counts from the most
+ common to the least. If n is None, then list all element counts.
+
+ >>> Counter('abracadabra').most_common(3)
+ [('a', 5), ('r', 2), ('b', 2)]
+
+ '''
+ if n is None:
+ return sorted(self.iteritems(), key=itemgetter(1), reverse=True)
+ return nlargest(n, self.iteritems(), key=itemgetter(1))
+
+ def elements(self):
+ '''Iterator over elements repeating each as many times as its count.
+
+ >>> c = Counter('ABCABC')
+ >>> sorted(c.elements())
+ ['A', 'A', 'B', 'B', 'C', 'C']
+
+ If an element's count has been set to zero or is a negative number,
+ elements() will ignore it.
+
+ '''
+ for elem, count in self.iteritems():
+ for _ in repeat(None, count):
+ yield elem
+
+ # Override dict methods where the meaning changes for Counter objects.
+
+ @classmethod
+ def fromkeys(cls, iterable, v=None):
+ raise NotImplementedError(
+ 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.')
+
+ def update(self, iterable=None, **kwds):
+ '''Like dict.update() but add counts instead of replacing them.
+
+ Source can be an iterable, a dictionary, or another Counter instance.
+
+ >>> c = Counter('which')
+ >>> c.update('witch') # add elements from another iterable
+ >>> d = Counter('watch')
+ >>> c.update(d) # add elements from another counter
+ >>> c['h'] # four 'h' in which, witch, and watch
+ 4
+
+ '''
+ if iterable is not None:
+ if hasattr(iterable, 'iteritems'):
+ if self:
+ self_get = self.get
+ for elem, count in iterable.iteritems():
+ self[elem] = self_get(elem, 0) + count
+ else:
+ dict.update(self, iterable) # fast path when counter is empty
+ else:
+ self_get = self.get
+ for elem in iterable:
+ self[elem] = self_get(elem, 0) + 1
+ if kwds:
+ self.update(kwds)
+
+ def copy(self):
+ 'Like dict.copy() but returns a Counter instance instead of a dict.'
+ return Counter(self)
+
+ def __delitem__(self, elem):
+ 'Like dict.__delitem__() but does not raise KeyError for missing values.'
+ if elem in self:
+ dict.__delitem__(self, elem)
+
+ def __repr__(self):
+ if not self:
+ return '%s()' % self.__class__.__name__
+ items = ', '.join(map('%r: %r'.__mod__, self.most_common()))
+ return '%s({%s})' % (self.__class__.__name__, items)
+
+ # Multiset-style mathematical operations discussed in:
+ # Knuth TAOCP Volume II section 4.6.3 exercise 19
+ # and at http://en.wikipedia.org/wiki/Multiset
+ #
+ # Outputs guaranteed to only include positive counts.
+ #
+ # To strip negative and zero counts, add-in an empty counter:
+ # c += Counter()
+
+ def __add__(self, other):
+ '''Add counts from two counters.
+
+ >>> Counter('abbb') + Counter('bcc')
+ Counter({'b': 4, 'c': 2, 'a': 1})
+
+
+ '''
+ if not isinstance(other, Counter):
+ return NotImplemented
+ result = Counter()
+ for elem in set(self) | set(other):
+ newcount = self[elem] + other[elem]
+ if newcount > 0:
+ result[elem] = newcount
+ return result
+
+ def __sub__(self, other):
+ ''' Subtract count, but keep only results with positive counts.
+
+ >>> Counter('abbbc') - Counter('bccd')
+ Counter({'b': 2, 'a': 1})
+
+ '''
+ if not isinstance(other, Counter):
+ return NotImplemented
+ result = Counter()
+ for elem in set(self) | set(other):
+ newcount = self[elem] - other[elem]
+ if newcount > 0:
+ result[elem] = newcount
+ return result
+
+ def __or__(self, other):
+ '''Union is the maximum of value in either of the input counters.
+
+ >>> Counter('abbb') | Counter('bcc')
+ Counter({'b': 3, 'c': 2, 'a': 1})
+
+ '''
+ if not isinstance(other, Counter):
+ return NotImplemented
+ _max = max
+ result = Counter()
+ for elem in set(self) | set(other):
+ newcount = _max(self[elem], other[elem])
+ if newcount > 0:
+ result[elem] = newcount
+ return result
+
+ def __and__(self, other):
+ ''' Intersection is the minimum of corresponding counts.
+
+ >>> Counter('abbb') & Counter('bcc')
+ Counter({'b': 1})
+
+ '''
+ if not isinstance(other, Counter):
+ return NotImplemented
+ _min = min
+ result = Counter()
+ if len(self) < len(other):
+ self, other = other, self
+ for elem in ifilter(self.__contains__, other):
+ newcount = _min(self[elem], other[elem])
+ if newcount > 0:
+ result[elem] = newcount
+ return result
+
+
+if __name__ == '__main__':
+ import doctest
+ print doctest.testmod()
+
diff --git a/src/obitools/decorator.py b/src/obitools/decorator.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/distances/__init__.py b/src/obitools/distances/__init__.py
new file mode 100644
index 0000000..1542fa9
--- /dev/null
+++ b/src/obitools/distances/__init__.py
@@ -0,0 +1,29 @@
+class DistanceMatrix(object):
+
+ def __init__(self,alignment):
+ '''
+ DistanceMatrix constructor.
+
+ @param alignment: aligment used to compute distance matrix
+ @type alignment: obitools.align.Alignment
+ '''
+ self.aligment = alignment
+ self.matrix = [[None] * (x+1) for x in xrange(len(alignment))]
+
+ def evaluateDist(self,x,y):
+ raise NotImplementedError
+
+ def __getitem__(self,key):
+ assert isinstance(key,(tuple,list)) and len(key)==2, \
+ 'key must be a tuple or a list of two integers'
+ x,y = key
+ if y < x:
+ z=x
+ x=y
+ y=z
+ rep = self.matrix[y][x]
+ if rep is None:
+ rep = self.evaluateDist(x,y)
+ self.matrix[y][x] = rep
+
+ return rep
\ No newline at end of file
diff --git a/src/obitools/distances/observed.py b/src/obitools/distances/observed.py
new file mode 100644
index 0000000..8828d92
--- /dev/null
+++ b/src/obitools/distances/observed.py
@@ -0,0 +1,77 @@
+'''
+Module dedicated to compute observed divergeances from
+an alignment. No distance correction is applied at all
+'''
+
+from itertools import imap
+
+from obitools.distances import DistanceMatrix
+
+class PairewiseGapRemoval(DistanceMatrix):
+ '''
+ Observed divergeance matrix from an alignment.
+ Gap are removed from the alignemt on a pairewise
+ sequence base
+ '''
+
+ def evaluateDist(self,x,y):
+ '''
+ Compute the observed divergeance from two sequences
+ of an aligment.
+
+ @attention: For performance purpose this method should
+ be directly used. use instead the __getitem__
+ method from DistanceMatrix.
+
+ @see: L{__getitem__}
+
+ @param x: number of the fisrt sequence in the aligment
+ @type x: int
+ @param y: umber of the second sequence in the aligment
+ @type y: int
+
+
+ '''
+
+ seq1 = self.aligment[x]
+ seq2 = self.aligment[y]
+
+ diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
+ (z[0]!=z[1] for z in imap(None,seq1,seq2)
+ if '-' not in z),(0,0))
+ return float(diff)/tot
+
+
+class Pairewise(DistanceMatrix):
+ '''
+ Observed divergeance matrix from an alignment.
+ Gap are kept from the alignemt
+ '''
+
+ def evaluateDist(self,x,y):
+ '''
+ Compute the observed divergeance from two sequences
+ of an aligment.
+
+ @attention: For performance purpose this method should
+ be directly used. use instead the __getitem__
+ method from DistanceMatrix.
+
+ @see: L{__getitem__}
+
+ @param x: number of the fisrt sequence in the aligment
+ @type x: int
+ @param y: umber of the second sequence in the aligment
+ @type y: int
+
+
+ '''
+
+ seq1 = self.aligment[x]
+ seq2 = self.aligment[y]
+
+ diff,tot = reduce(lambda x,y: (x[0]+y,x[1]+1),
+ (z[0]!=z[1] for z in imap(None,seq1,seq2)),
+ (0,0))
+ return float(diff)/tot
+
\ No newline at end of file
diff --git a/src/obitools/distances/phylip.py b/src/obitools/distances/phylip.py
new file mode 100644
index 0000000..e2043fa
--- /dev/null
+++ b/src/obitools/distances/phylip.py
@@ -0,0 +1,35 @@
+import sys
+
+from itertools import imap,count
+
+def writePhylipMatrix(matrix):
+ names = [x.id for x in matrix.aligment]
+ pnames= [x[:10] for x in names]
+ unicity={}
+ redundent=[]
+ for n in pnames:
+ unicity[n]=unicity.get(n,0)+1
+ redundent.append(unicity[n])
+
+ for i,n,r in imap(None,count(),pnames,redundent):
+ alternate = n
+ if r > 1:
+ while alternate in pnames:
+ lcut = 9 - len(str(r))
+ alternate = n[:lcut]+ '_%d' % r
+ r+=1
+ pnames[i]='%-10s' % alternate
+
+ firstline = '%5d' % len(matrix.aligment)
+ rep = [firstline]
+ for i,n in imap(None,count(),pnames):
+ line = [n]
+ for j in xrange(i):
+ line.append('%5.4f' % matrix[(j,i)])
+ rep.append(' '.join(line))
+ return '\n'.join(rep)
+
+
+
+
+
\ No newline at end of file
diff --git a/src/obitools/distances/r.py b/src/obitools/distances/r.py
new file mode 100644
index 0000000..f674a4c
--- /dev/null
+++ b/src/obitools/distances/r.py
@@ -0,0 +1,25 @@
+import sys
+
+from itertools import imap,count
+
+def writeRMatrix(matrix):
+ names = [x.id for x in matrix.aligment]
+ lmax = max(max(len(x) for x in names),5)
+ lali = len(matrix.aligment)
+
+ nformat = '%%-%ds' % lmax
+ dformat = '%%%d.4f' % lmax
+
+ pnames=[nformat % x for x in names]
+
+ rep = [' '.join(pnames)]
+
+ for i in xrange(lali):
+ line=[]
+ for j in xrange(lali):
+ line.append('%5.4f' % matrix[(j,i)])
+ rep.append(' '.join(line))
+ return '\n'.join(rep)
+
+
+
\ No newline at end of file
diff --git a/src/obitools/dnahash/__init__.py b/src/obitools/dnahash/__init__.py
new file mode 100644
index 0000000..ca02e35
--- /dev/null
+++ b/src/obitools/dnahash/__init__.py
@@ -0,0 +1,100 @@
+_A=[0]
+_C=[1]
+_G=[2]
+_T=[3]
+_R= _A + _G
+_Y= _C + _T
+_M= _C + _A
+_K= _T + _G
+_W= _T + _A
+_S= _C + _G
+_B= _C + _G + _T
+_D= _A + _G + _T
+_H= _A + _C + _T
+_V= _A + _C + _G
+_N= _A + _C + _G + _T
+
+_dnahash={'a':_A,
+ 'c':_C,
+ 'g':_G,
+ 't':_T,
+ 'r':_R,
+ 'y':_Y,
+ 'm':_M,
+ 'k':_K,
+ 'w':_W,
+ 's':_S,
+ 'b':_B,
+ 'd':_D,
+ 'h':_H,
+ 'v':_V,
+ 'n':_N,
+ }
+
+def hashCodeIterator(sequence,wsize,degeneratemax=0,offset=0):
+ errors = 0
+ emask = [0] * wsize
+ epointer = 0
+ size = 0
+ position = offset
+ hashs = set([0])
+ hashmask = 0
+ for i in xrange(wsize):
+ hashmask <<= 2
+ hashmask +=3
+
+ for l in sequence:
+ l = l.lower()
+ hl = _dnahash[l]
+
+ if emask[epointer]:
+ errors-=1
+ emask[epointer]=0
+
+ if len(hl) > 1:
+ errors +=1
+ emask[epointer]=1
+
+ epointer+=1
+ epointer%=wsize
+
+ if errors > degeneratemax:
+ hl=set([hl[0]])
+
+ hashs=set((((hc<<2) | cl) & hashmask)
+ for hc in hashs
+ for cl in hl)
+
+ if size < wsize:
+ size+=1
+
+ if size==wsize:
+ if errors <= degeneratemax:
+ yield (position,hashs,errors)
+ position+=1
+
+def hashSequence(sequence,wsize,degeneratemax=0,offset=0,hashs=None):
+ if hashs is None:
+ hashs=[[] for x in xrange(4**wsize)]
+
+ for pos,keys,errors in hashCodeIterator(sequence, wsize, degeneratemax, offset):
+ for k in keys:
+ hashs[k].append(pos)
+
+ return hashs
+
+def hashSequences(sequences,wsize,maxpos,degeneratemax=0):
+ hashs=None
+ offsets=[]
+ offset=0
+ for s in sequences:
+ offsets.append(offset)
+ hashSequence(s,wsize,degeneratemax=degeneratemax,offset=offset,hashs=hashs)
+ offset+=len(s)
+
+ return hashs,offsets
+
+
+
+
+
\ No newline at end of file
diff --git a/src/obitools/ecobarcode/__init__.py b/src/obitools/ecobarcode/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/ecobarcode/databases.py b/src/obitools/ecobarcode/databases.py
new file mode 100644
index 0000000..70d2319
--- /dev/null
+++ b/src/obitools/ecobarcode/databases.py
@@ -0,0 +1,32 @@
+'''
+Created on 25 sept. 2010
+
+ at author: coissac
+'''
+from obitools import NucSequence
+
+def referenceDBIterator(options):
+
+ cursor = options.ecobarcodedb.cursor()
+
+ cursor.execute("select id from databases.database where name='%s'" % options.database)
+ options.dbid = cursor.fetchone()[0]
+
+ cursor.execute('''
+ select s.accession,r.id,r.taxid,r.sequence
+ from databases.database d,
+ databases.reference r,
+ databases.relatedsequences s
+ where r.database = d.id
+ and s.reference= r.id
+ and s.mainac
+ and d.name = '%s'
+ ''' % options.database
+ )
+
+ for ac,id,taxid,sequence in cursor:
+ s = NucSequence(ac,sequence)
+ s['taxid']=taxid
+ s['refdbid']=id
+ yield s
+
\ No newline at end of file
diff --git a/src/obitools/ecobarcode/ecotag.py b/src/obitools/ecobarcode/ecotag.py
new file mode 100644
index 0000000..2ebd3fb
--- /dev/null
+++ b/src/obitools/ecobarcode/ecotag.py
@@ -0,0 +1,50 @@
+'''
+Created on 25 sept. 2010
+
+ at author: coissac
+'''
+
+def alreadyIdentified(seqid,options):
+ cursor = options.ecobarcodedb.cursor()
+ cursor.execute('''
+ select count(*)
+ from ecotag.identification
+ where sequence=%s
+ and database=%s
+ ''',(int(seqid),int(options.dbid)))
+
+ return int(cursor.fetchone()[0]) > 0;
+
+def storeIdentification(seqid,
+ idstatus,taxid,
+ matches,
+ options
+ ):
+
+ cursor = options.ecobarcodedb.cursor()
+
+ if not options.updatedb:
+ cursor.execute('''
+ delete from ecotag.identification where sequence=%s and database=%s
+ ''',(int(seqid),int(options.dbid)))
+
+ cursor.execute('''
+ insert into ecotag.identification (sequence,database,idstatus,taxid)
+ values (%s,%s,%s,%s)
+ returning id
+ ''' , (int(seqid),int(options.dbid),idstatus,int(taxid)))
+
+ idid = cursor.fetchone()[0]
+
+ for seq,identity in matches.iteritems():
+ cursor.execute('''
+ insert into ecotag.evidence (identification,reference,identity)
+ values (%s,
+ %s,
+ %s)
+ ''',(idid,seq,identity))
+
+
+ cursor.close()
+
+ options.ecobarcodedb.commit()
diff --git a/src/obitools/ecobarcode/options.py b/src/obitools/ecobarcode/options.py
new file mode 100644
index 0000000..6086423
--- /dev/null
+++ b/src/obitools/ecobarcode/options.py
@@ -0,0 +1,64 @@
+'''
+Created on 23 sept. 2010
+
+ at author: coissac
+'''
+import psycopg2
+
+from obitools.ecobarcode.taxonomy import EcoTaxonomyDB
+
+def addEcoBarcodeDBOption(optionManager):
+ optionManager.add_option('--dbname',
+ action="store", dest="ecobarcodedb",
+ type='str',
+ default=None,
+ help="Specify the name of the ecobarcode database")
+
+ optionManager.add_option('--server',
+ action="store", dest="dbserver",
+ type='str',
+ default="localhost",
+ help="Specify the adress of the ecobarcode database server")
+
+ optionManager.add_option('--user',
+ action="store", dest="dbuser",
+ type='str',
+ default='postgres',
+ help="Specify the user of the ecobarcode database")
+
+ optionManager.add_option('--port',
+ action="store", dest="dbport",
+ type='str',
+ default=5432,
+ help="Specify the port of the ecobarcode database")
+
+ optionManager.add_option('--passwd',
+ action="store", dest="dbpasswd",
+ type='str',
+ default='',
+ help="Specify the passwd of the ecobarcode database")
+
+ optionManager.add_option('--primer',
+ action="store", dest="primer",
+ type='str',
+ default=None,
+ help="Specify the primer used for amplification")
+
+
+def ecobarcodeDatabaseConnection(options):
+ if options.ecobarcodedb is not None:
+ connection = psycopg2.connect(database=options.ecobarcodedb,
+ user=options.dbuser,
+ password=options.dbpasswd,
+ host=options.dbserver,
+ port=options.dbport)
+ options.dbname=options.ecobarcodedb
+ else:
+ connection=None
+ if connection is not None:
+ options.ecobarcodedb=connection
+ taxonomy = EcoTaxonomyDB(connection)
+ else:
+ taxonomy=None
+ return taxonomy
+
diff --git a/src/obitools/ecobarcode/rawdata.py b/src/obitools/ecobarcode/rawdata.py
new file mode 100644
index 0000000..a5f58cf
--- /dev/null
+++ b/src/obitools/ecobarcode/rawdata.py
@@ -0,0 +1,38 @@
+'''
+Created on 25 sept. 2010
+
+ at author: coissac
+'''
+
+from obitools import NucSequence
+from obitools.utils import progressBar
+from obitools.ecobarcode.ecotag import alreadyIdentified
+
+import sys
+
+def sequenceIterator(options):
+ cursor = options.ecobarcodedb.cursor()
+
+ cursor.execute('''
+ select s.id,sum(o.count),s.sequence
+ from rawdata.sequence s,
+ rawdata.occurrences o
+ where o.sequence= s.id
+ and s.primers = '%s'
+ group by s.id,s.sequence
+ ''' % options.primer
+ )
+
+ nbseq = cursor.rowcount
+ progressBar(1, nbseq, True, head=options.dbname)
+ for id,count,sequence in cursor:
+ progressBar(cursor.rownumber+1, nbseq, head=options.dbname)
+ if not options.updatedb or not alreadyIdentified(id,options):
+ s = NucSequence(id,sequence)
+ s['count']=count
+ print >>sys.stderr,' +', cursor.rownumber+1,
+ yield s
+ else:
+ print >>sys.stderr,' @', cursor.rownumber+1,
+
+ print >>sys.stderr
diff --git a/src/obitools/ecobarcode/taxonomy.py b/src/obitools/ecobarcode/taxonomy.py
new file mode 100644
index 0000000..c7d0185
--- /dev/null
+++ b/src/obitools/ecobarcode/taxonomy.py
@@ -0,0 +1,120 @@
+'''
+Created on 24 sept. 2010
+
+ at author: coissac
+'''
+
+from obitools.ecopcr.taxonomy import TaxonomyDump
+from obitools.ecopcr.taxonomy import Taxonomy
+import sys
+
+class EcoTaxonomyDB(TaxonomyDump) :
+
+ def __init__(self,dbconnect):
+ self._dbconnect=dbconnect
+
+ print >> sys.stderr,"Reading ecobarcode taxonomy database..."
+
+ self._readNodeTable()
+ print >> sys.stderr," ok"
+
+ print >>sys.stderr,"Adding scientific name..."
+
+ self._name=[]
+ for taxid,name,classname in self._nameIterator():
+ self._name.append((name,classname,self._index[taxid]))
+ if classname == 'scientific name':
+ self._taxonomy[self._index[taxid]].append(name)
+
+ print >>sys.stderr,"Adding taxid alias..."
+ for taxid,current in self._mergedNodeIterator():
+ self._index[taxid]=self._index[current]
+
+ print >>sys.stderr,"Adding deleted taxid..."
+ for taxid in self._deletedNodeIterator():
+ self._index[taxid]=None
+
+
+ Taxonomy.__init__(self)
+
+ #####
+ #
+ # Iterator functions
+ #
+ #####
+
+ def _readNodeTable(self):
+
+ cursor = self._dbconnect.cursor()
+
+ cursor.execute("""
+ select taxid,rank,parent
+ from ncbitaxonomy.nodes
+ """)
+
+ print >>sys.stderr,"Reading taxonomy nodes..."
+ taxonomy=[list(n) for n in cursor]
+
+ print >>sys.stderr,"List all taxonomy rank..."
+ ranks =list(set(x[1] for x in taxonomy))
+ ranks.sort()
+ rankidx = dict(map(None,ranks,xrange(len(ranks))))
+
+ print >>sys.stderr,"Sorting taxons..."
+ taxonomy.sort(TaxonomyDump._taxonCmp)
+
+ self._taxonomy=taxonomy
+
+ print >>sys.stderr,"Indexing taxonomy..."
+ index = {}
+ for t in self._taxonomy:
+ index[t[0]]=self._bsearchTaxon(t[0])
+
+ print >>sys.stderr,"Indexing parent and rank..."
+ for t in self._taxonomy:
+ t[1]=rankidx[t[1]]
+ t[2]=index[t[2]]
+
+ self._ranks=ranks
+ self._index=index
+
+ cursor.close()
+
+ def _nameIterator(self):
+ cursor = self._dbconnect.cursor()
+
+ cursor.execute("""
+ select taxid,name,nameclass
+ from ncbitaxonomy.names
+ """)
+
+ for taxid,name,nameclass in cursor:
+ yield taxid,name,nameclass
+
+ cursor.close()
+
+ def _mergedNodeIterator(self):
+ cursor = self._dbconnect.cursor()
+
+ cursor.execute("""
+ select oldtaxid,newtaxid
+ from ncbitaxonomy.merged
+ """)
+
+ for oldtaxid,newtaxid in cursor:
+ yield oldtaxid,newtaxid
+
+ cursor.close()
+
+ def _deletedNodeIterator(self):
+ cursor = self._dbconnect.cursor()
+
+ cursor.execute("""
+ select taxid
+ from ncbitaxonomy.delnodes
+ """)
+
+ for taxid in cursor:
+ yield taxid[0]
+
+ cursor.close()
diff --git a/src/obitools/ecopcr/__init__.py b/src/obitools/ecopcr/__init__.py
new file mode 100644
index 0000000..111c8ac
--- /dev/null
+++ b/src/obitools/ecopcr/__init__.py
@@ -0,0 +1,69 @@
+from obitools import utils
+from obitools import NucSequence
+from obitools.utils import universalOpen, universalTell, fileSize, progressBar
+import struct
+import sys
+
+
+class EcoPCRFile(utils.ColumnFile):
+ def __init__(self,stream):
+ utils.ColumnFile.__init__(self,
+ stream, ' | ', True,
+ (str,int,int,
+ str,int,str,
+ int,str,int,
+ str,int,str,
+ str,str,int,float,
+ str,int,float,
+ int,
+ str,str), "#")
+
+
+ def next(self):
+ data = utils.ColumnFile.next(self)
+ seq = NucSequence(data[0],data[20],data[21])
+ seq['seq_length_ori']=data[1]
+ seq['taxid']=data[2]
+ seq['rank']=data[3]
+ seq['species']=data[4]
+ seq['species_name']=data[5]
+ seq['genus']=data[6]
+ seq['genus_name']=data[7]
+ seq['family']=data[8]
+ seq['family_name']=data[9]
+ seq['strand']=data[12]
+ seq['forward_match']=data[13]
+ seq['forward_error']=data[14]
+ seq['forward_tm']=data[15]
+ seq['reverse_match']=data[16]
+ seq['reverse_error']=data[17]
+ seq['reverse_tm']=data[18]
+
+ return seq
+
+
+
+class EcoPCRDBFile(object):
+
+ def _ecoRecordIterator(self,file,noError=False):
+ file = universalOpen(file,noError)
+ (recordCount,) = struct.unpack('> I',file.read(4))
+ self._recover=False
+
+ if recordCount:
+ for i in xrange(recordCount):
+ (recordSize,)=struct.unpack('>I',file.read(4))
+ record = file.read(recordSize)
+ yield record
+ else:
+ print >> sys.stderr,"\n\n WARNING : EcoPCRDB reading set into recover data mode\n"
+ self._recover=True
+ ok=True
+ while(ok):
+ try:
+ (recordSize,)=struct.unpack('>I',file.read(4))
+ record = file.read(recordSize)
+ yield record
+ except:
+ ok=False
+
\ No newline at end of file
diff --git a/src/obitools/ecopcr/annotation.py b/src/obitools/ecopcr/annotation.py
new file mode 100644
index 0000000..7c76fb2
--- /dev/null
+++ b/src/obitools/ecopcr/annotation.py
@@ -0,0 +1,104 @@
+import struct
+
+class EcoPCRDBAnnotationWriter(object):
+    '''
+    Class used to write Annotation description in EcoPCRDB format.
+
+    EcoPCRDBAnnotationWriter is often called through the EcoPCRDBSequenceWriter class
+
+    @see: L{ecopcr.sequence.EcoPCRDBSequenceWriter}
+    '''
+
+    def __init__(self,dbname,id,fileidx=1,type=('CDS',),definition=None):
+        '''
+        class constructor; creates the <dbname>_NNN.adx and <dbname>.fdx files
+
+        @param dbname: name of ecoPCR database
+        @type dbname: C{str}
+        @param id: name of the qualifier used as feature id
+        @type id: C{str}
+        @param fileidx: number used as the NNN part of the .adx file name
+        @type fileidx: C{int}
+        @param type: feature types to store (a single C{str} is accepted too)
+        @type type: C{list} or C{tuple}
+        @param definition: name of the qualifier used as feature definition
+        @type definition: C{str}
+        '''
+        self._type = type = (type,) if isinstance(type,str) else tuple(type)
+        self._definition = definition
+        self._id = id
+        self._filename="%s_%03d.adx" % (dbname,fileidx)
+        self._file = open(self._filename,'wb')
+        self._sequenceIdx=0
+
+        # feature type table: a count followed by one length-prefixed name per type
+        ftname ="%s.fdx" % (dbname)
+        ft = open(ftname,'wb')
+
+        self._fttypeidx=dict((t,i) for i,t in enumerate(type))
+
+        ft.write(struct.pack('> I',len(type)))
+
+        for t in type:
+            ft.write(self._ecoFtTypePacker(t))
+
+        ft.close()
+
+        self._annotationCount=0
+        self._file.write(struct.pack('> I',self._annotationCount))    # placeholder, rewritten by __del__
+
+
+    def _ecoFtTypePacker(self,type):
+        totalSize = len(type)
+        packed = struct.pack('> I %ds' % totalSize,totalSize,type)
+
+        assert len(packed) == totalSize+4, "error in feature type packing"
+
+        return packed
+
+    def _ecoAnnotationPacker(self,feature,seqidx):
+        begin = feature.begin-1
+        end = feature.end
+        type = self._fttypeidx[feature.ftType]
+        strand = feature.isDirect()
+        id = feature[self._id][0]
+        if self._definition in feature:
+            definition = feature[self._definition][0]
+        else:
+            definition = ''
+
+        assert strand is not None,"Only strand defined features can be stored"
+
+        deflength = len(definition)
+        # seqidx, begin, end, type, strand (5 x 4 bytes) + 20-byte id + deflength field + definition
+        totalSize = 4 + 4 + 4 + 4 + 4 + 20 + 4 + deflength
+
+        packed = struct.pack('> I I I I I I 20s I %ds' % (deflength),    # one field per value below
+                             totalSize,
+                             seqidx,
+                             begin,
+                             end,
+                             type,
+                             int(strand),
+                             id,
+                             deflength,
+                             definition)
+
+        assert len(packed) == totalSize+4, "error in annotation packing"
+
+        return packed
+
+
+    def put(self,sequence,seqidx=None):
+        if seqidx is None:
+            seqidx = self._sequenceIdx
+            self._sequenceIdx+=1
+        for feature in sequence.getFeatureTable():
+            if feature.ftType in self._type:
+                self._annotationCount+=1
+                self._file.write(self._ecoAnnotationPacker(feature,seqidx))
+
+    def __del__(self):
+        self._file.seek(0,0)
+        self._file.write(struct.pack('> I',self._annotationCount))
+        self._file.close()
diff --git a/src/obitools/ecopcr/options.py b/src/obitools/ecopcr/options.py
new file mode 100644
index 0000000..d9329ac
--- /dev/null
+++ b/src/obitools/ecopcr/options.py
@@ -0,0 +1,140 @@
+'''
+Created on 13 fevr. 2011
+
+ at author: coissac
+'''
+
+from obitools.ecopcr.taxonomy import Taxonomy, EcoTaxonomyDB, TaxonomyDump, ecoTaxonomyWriter
+
+#try:
+# from obitools.ecobarcode.options import addEcoBarcodeDBOption,ecobarcodeDatabaseConnection
+#except ImportError:
+# def addEcoBarcodeDBOption(optionmanager):
+# pass
+# def ecobarcodeDatabaseConnection(options):
+# return None
+
+def addTaxonomyDBOptions(optionManager):
+ # addEcoBarcodeDBOption(optionManager)
+
+ group = optionManager.add_option_group('Taxonomy loading options')
+ group.add_option('-d','--database',
+ action="store", dest="taxonomy",
+ metavar="<FILENAME>",
+ type="string",
+ help="ecoPCR taxonomy Database "
+ "name")
+ group.add_option('-t','--taxonomy-dump',
+ action="store", dest="taxdump",
+ metavar="<FILENAME>",
+ type="string",
+ help="NCBI Taxonomy dump repository "
+ "name")
+
+
+def addTaxonomyFilterOptions(optionManager):
+ addTaxonomyDBOptions(optionManager)
+ group = optionManager.add_option_group('Taxonomy-related filtering options')
+
+ group.add_option('--require-rank',
+ action="append",
+ dest='requiredRank',
+ metavar="<RANK_NAME>",
+ type="string",
+ default=[],
+ help="select sequence with taxid tag containing "
+ "a parent of rank <RANK_NAME>")
+
+ group.add_option('-r','--required',
+ action="append",
+ dest='required',
+ metavar="<TAXID>",
+ type="int",
+ default=[],
+ help="Select the sequences having "
+ "the ancestor of taxid <TAXID>. If several ancestors are specified "
+ "(with \n'-r taxid1 -r taxid2'), the sequences "
+ "having at least one of them are selected")
+
+ group.add_option('-i','--ignore',
+ action="append",
+ dest='ignored',
+ metavar="<TAXID>",
+ type="int",
+ default=[],
+ help="ignored taxid")
+
+
+def loadTaxonomyDatabase(options):
+ assert hasattr(options, 'taxonomy'), 'No options to load Taxonomy available'
+
+ if isinstance(options.taxonomy, Taxonomy):
+ return options.taxonomy
+
+ taxonomy = None
+ if (options.taxonomy is not None or
+ options.taxdump is not None):
+ if options.taxdump is not None:
+ taxonomy = TaxonomyDump(options.taxdump)
+ if taxonomy is not None and isinstance(options.taxonomy, str):
+ ecoTaxonomyWriter(options.taxonomy,taxonomy)
+ options.ecodb=options.taxonomy
+ if isinstance(options.taxonomy, Taxonomy):
+ taxonomy = options.taxonomy
+ if taxonomy is None and isinstance(options.taxonomy, str):
+ import sys
+ taxonomy = EcoTaxonomyDB(options.taxonomy)
+ options.ecodb=options.taxonomy
+ options.taxonomy=taxonomy
+ return options.taxonomy
+
+def taxonomyFilterGenerator(options):
+ loadTaxonomyDatabase(options)
+ if options.taxonomy is not None:
+ taxonomy=options.taxonomy
+ def taxonomyFilter(seq):
+ def annotateAtRank(seq,rank):
+ if 'taxid' in seq and seq['taxid'] is not None:
+ rtaxid= taxonomy.getTaxonAtRank(seq['taxid'],rank)
+ return rtaxid
+ return None
+ good = True
+ if 'taxid' in seq:
+ taxid = seq['taxid']
+# print taxid,
+ if options.requiredRank:
+ taxonatrank = reduce(lambda x,y: x and y,
+ (annotateAtRank(seq,rank) is not None
+ for rank in options.requiredRank),True)
+ good = good and taxonatrank
+# print >>sys.stderr, " Has rank : ",good,
+ if options.required:
+ good = good and reduce(lambda x,y: x or y,
+ (taxonomy.isAncestor(r,taxid) for r in options.required),
+ False)
+# print " Required : ",good,
+ if options.ignored:
+ good = good and not reduce(lambda x,y: x or y,
+ (taxonomy.isAncestor(r,taxid) for r in options.ignored),
+ False)
+# print " Ignored : ",good,
+# print " Global : ",good
+
+ return good
+
+
+ else:
+ def taxonomyFilter(seq):
+ return True
+
+ return taxonomyFilter
+
+def taxonomyFilterIteratorGenerator(options):
+ taxonomyFilter = taxonomyFilterGenerator(options)
+
+ def filterIterator(seqiterator):
+ for seq in seqiterator:
+ if taxonomyFilter(seq):
+ yield seq
+
+ return filterIterator
\ No newline at end of file
diff --git a/src/obitools/ecopcr/sequence.py b/src/obitools/ecopcr/sequence.py
new file mode 100644
index 0000000..e1a5627
--- /dev/null
+++ b/src/obitools/ecopcr/sequence.py
@@ -0,0 +1,183 @@
+from obitools import NucSequence
+from obitools.ecopcr import EcoPCRDBFile
+from obitools.ecopcr.taxonomy import EcoTaxonomyDB, ecoTaxonomyWriter
+from obitools.ecopcr.options import loadTaxonomyDatabase
+from obitools.ecopcr.annotation import EcoPCRDBAnnotationWriter
+from obitools.utils import universalOpen
+from glob import glob
+import struct
+import gzip
+import sys
+import re
+
+
+class EcoPCRDBSequenceIterator(EcoPCRDBFile):
+ '''
+ Build an iterator over the sequences include in a sequence database
+ formated for ecoPCR
+ '''
+
+ def __init__(self,path,taxonomy=None):
+ '''
+ ecoPCR data iterator constructor
+
+ @param path: path to the ecoPCR database including the database prefix name
+ @type path: C{str}
+ @param taxonomy: a taxonomy can be given to the reader to decode the taxonomic data
+ associated to the sequences. If no Taxonomy is furnish, it will be read
+ before the sequence database files using the same path.
+ @type taxonomy: L{obitools.ecopcr.taxonomy.Taxonomy}
+ '''
+ self._path = path
+
+ if taxonomy is not None:
+ self._taxonomy=taxonomy
+ else:
+ self._taxonomy=EcoTaxonomyDB(path)
+
+ self._seqfilesFiles = glob('%s_???.sdx' % self._path)
+ self._seqfilesFiles.sort()
+
+ def __ecoSequenceIterator(self,file):
+ for record in self._ecoRecordIterator(file):
+ lrecord = len(record)
+ lnames = lrecord - (4*4+20)
+ (taxid,seqid,deflength,seqlength,cptseqlength,string)=struct.unpack('> I 20s I I I %ds' % lnames, record) # @UnusedVariable
+ seqid=seqid.strip('\x00')
+ de = string[:deflength]
+ seq = gzip.zlib.decompress(string[deflength:])
+ bioseq = NucSequence(seqid,seq,de,taxid=self._taxonomy._taxonomy[taxid][0])
+ yield bioseq
+
+ def __iter__(self):
+ for seqfile in self._seqfilesFiles:
+ for seq in self.__ecoSequenceIterator(seqfile):
+ yield seq
+
+ @property
+ def taxonomy(self):
+ """Return the taxonomy associated to the ecoPCRDB reader"""
+ return self._taxonomy
+
+
+
+class EcoPCRDBSequenceWriter(object):
+
+    def __init__(self,options,fileidx=None,ftid=None,type=None,definition=None,append=False):
+        '''Open <options.ecopcroutput>_NNN.sdx for writing or appending, plus an annotation writer when type is given.'''
+        from obitools.options import currentInputFileName
+        self.currentInputFileName=currentInputFileName
+        self._currentfile=None
+        # Take care of the taxonomy associated to the database
+        self._taxonomy= loadTaxonomyDatabase(options)
+        dbname = options.ecopcroutput
+
+        if (self._taxonomy is not None
+            and (not hasattr(options,'ecodb') or options.ecodb!=dbname)):
+            print >> sys.stderr,"Writing the taxonomy file...",
+            ecoTaxonomyWriter(dbname,self._taxonomy)
+            print >> sys.stderr,"Ok"
+
+        # Identify the next sequence file number
+        if fileidx is None:
+            p = re.compile(r'([0-9]{3})\.sdx')
+            fileidx = max(list(int(p.search(i).group(1))
+                               for i in glob('%s_[0-9][0-9][0-9].sdx' % dbname))+[0]
+                          ) +1
+
+        self._fileidx=fileidx
+        self._dbname=dbname
+
+
+        self._filename="%s_%03d.sdx" % (dbname,fileidx)
+        if append:
+            f = universalOpen(self._filename)
+            (recordCount,) = struct.unpack('> I',f.read(4))
+            del f
+            self.open('r+b')    # open() zeroes _sequenceFileCount and the header, so restore the counts afterwards
+            self._sequenceCount=recordCount
+            self._sequenceFileCount=recordCount    # close() must write existing + appended records
+            self._file.seek(0,2)
+
+        else:
+            self._sequenceCount=0
+            self._sequenceFileCount=0
+            self.open("wb")
+
+        if type is not None:
+            assert ftid is not None,"You must specify an id attribute for features"
+            self._annotation = EcoPCRDBAnnotationWriter(dbname, ftid, fileidx, type, definition)
+        else:
+            self._annotation = None
+
+ def _ecoSeqPacker(self,seq):
+
+ compactseq = gzip.zlib.compress(str(seq).upper(),9)
+ cptseqlength = len(compactseq)
+ delength = len(seq.definition)
+
+ totalSize = 4 + 20 + 4 + 4 + 4 + cptseqlength + delength
+
+ if self._taxonomy is None or 'taxid' not in seq :
+ taxon=-1
+ else:
+ taxon=self._taxonomy.findIndex(seq['taxid'])
+
+ if taxon==-1:
+ raise Exception("Taxonomy error for %s: %s"%(seq.id, "taxonomy is missing" if self._taxonomy is None else "sequence has no taxid" if 'taxid' not in seq else "wrong taxid"))
+
+ try:
+ packed = struct.pack('> I i 20s I I I %ds %ds' % (delength,cptseqlength),
+ totalSize,
+ taxon,
+ seq.id,
+ delength,
+ len(seq),
+ cptseqlength,
+ seq.definition,
+ compactseq)
+ except struct.error as e:
+ print >>sys.stderr,"\n\n============\n\nError on sequence : %s\n\n" % seq.id
+ raise e
+
+ assert len(packed) == totalSize+4, "error in sequence packing"
+
+ return packed
+
+
+ def close(self):
+ self._file.seek(0,0)
+ self._file.write(struct.pack('> I',self._sequenceFileCount))
+ self._file.close()
+
+ def open(self,mode):
+ self._filename="%s_%03d.sdx" % (self._dbname,self._fileidx)
+ self._file=open(self._filename,mode)
+ self._sequenceFileCount=0
+ self._file.write(struct.pack('> I',self._sequenceFileCount))
+
+ def put(self,sequence):
+ if self._currentfile is None:
+ self._currentfile=self.currentInputFileName()
+ if self.currentInputFileName() != self._currentfile:
+ self._currentfile=self.currentInputFileName()
+
+ self.close()
+
+ self._fileidx+=1
+ self.open('wb')
+
+ if self._taxonomy is not None:
+ if 'taxid' not in sequence and hasattr(sequence, 'extractTaxon'):
+ sequence.extractTaxon()
+ self._file.write(self._ecoSeqPacker(sequence))
+ if self._annotation is not None:
+ self._annotation.put(sequence, self._sequenceCount)
+ self._sequenceCount+=1
+ self._sequenceFileCount+=1
+
+ def __del__(self):
+ self.close()
+
+
+
diff --git a/src/obitools/ecopcr/taxonomy.py b/src/obitools/ecopcr/taxonomy.py
new file mode 100644
index 0000000..b9af63e
--- /dev/null
+++ b/src/obitools/ecopcr/taxonomy.py
@@ -0,0 +1,704 @@
+import struct
+import sys
+
+from itertools import count,imap,combinations
+
+from obitools.ecopcr import EcoPCRDBFile
+from obitools.utils import universalOpen
+from obitools.utils import ColumnFile
+import math
+try:
+ from collections import Counter
+except ImportError:
+ from obitools.collections import Counter
+
+
+class Taxonomy(object):
+ def __init__(self):
+ '''
+ The taxonomy database constructor
+
+ @param path: path to the ecoPCR database including the database prefix name
+ @type path: C{str}
+ '''
+
+ self._ranks.append('obi')
+
+ self._speciesidx = self._ranks.index('species')
+ self._genusidx = self._ranks.index('genus')
+ self._familyidx = self._ranks.index('family')
+ self._orderidx = self._ranks.index('order')
+
+ self._nameidx = {}
+ for x in self._name :
+ if x[0] not in self._nameidx :
+ self._nameidx[x[0]] = [x[2]]
+ else :
+ self._nameidx[x[0]].append(x[2])
+
+ self._nameidx.update(dict((x[0],x[2]) for x in self._preferedName))
+ self._preferedidx=dict((x[2],x[1]) for x in self._preferedName)
+
+ self._bigestTaxid = max(x[0] for x in self._taxonomy)
+
+
+ def findTaxonByIdx(self,idx):
+ if idx is None:
+ return None
+ return self._taxonomy[idx]
+
+ def findIndex(self,taxid):
+ if taxid is None:
+ return None
+ return self._index[taxid]
+
+ def findTaxonByTaxid(self,taxid):
+ return self.findTaxonByIdx(self.findIndex(taxid))
+
+ def findTaxonByName(self,name):
+ taxa = []
+ for i in self._nameidx[name] :
+ taxa.append(self._taxonomy[i])
+ return taxa
+
+ def findRankByName(self,rank):
+ try:
+ return self._ranks.index(rank)
+ except ValueError:
+ return None
+
+    def __contains__(self,taxid):
+        try:    # True when taxid resolves to a stored taxon
+            return self.findTaxonByTaxid(taxid) is not None
+        except (KeyError,IndexError):    # taxid absent from the index: not contained
+            return False
+
+
+
+
+ #####
+ #
+ # PUBLIC METHODS
+ #
+ #####
+
+
+    def subTreeIterator(self, taxid):
+        "return subtree for given taxonomic id "
+
+        idx = self.findIndex(taxid)    # index lookup instead of a full scan; KeyError on unknown taxid
+        if idx is None or idx < 0:     # no taxon stored for this taxid: empty subtree
+            return
+
+        yield self._taxonomy[idx]
+        for t in self._taxonomy:
+            if t[2] == idx and t is not self._taxonomy[idx]:    # skip a node that is its own parent (the root)
+                for subt in self.subTreeIterator(t[0]):
+                    yield subt
+
+    def parentalTreeIterator(self, taxid):
+        """
+        return parental tree for given taxonomic id starting from
+        the taxon itself up to the root (the taxon stored at index 0).
+        """
+        taxon=self.findTaxonByTaxid(taxid)
+        if taxon is not None:
+            while taxon is not self._taxonomy[0]:    # testing the parent index would drop direct children of the root
+                yield taxon
+                taxon = self._taxonomy[taxon[2]]
+            yield self._taxonomy[0]
+        else:
+            return    # taxid without a stored taxon: empty lineage
+
+ def isAncestor(self,parent,taxid):
+ return parent in [x[0] for x in self.parentalTreeIterator(taxid)]
+
+ def lastCommonTaxon(self,*taxids):
+ if not taxids:
+ return None
+ if len(taxids)==1:
+ return taxids[0]
+
+ if len(taxids)==2:
+ t1 = [x[0] for x in self.parentalTreeIterator(taxids[0])]
+ t2 = [x[0] for x in self.parentalTreeIterator(taxids[1])]
+ t1.reverse()
+ t2.reverse()
+
+ count = min(len(t1),len(t2))
+ i=0
+ while(i < count and t1[i]==t2[i]):
+ i+=1
+ i-=1
+
+ return t1[i]
+
+ ancetre = taxids[0]
+ for taxon in taxids[1:]:
+ ancetre = self.lastCommonTaxon(ancetre,taxon)
+
+ return ancetre
+
+ def depth(self,taxid):
+ return len([x for x in self.parentalTreeIterator(taxid)])
+
+ def betterCommonTaxon(self,error=0.2,*taxids):
+
+ def permanentIterator(x):
+ for i in x:
+ yield i
+ while(1):
+ yield None
+
+ taxids = set(taxids)
+
+ if len(taxids)==1: return taxids.pop()
+
+
+ allLineage = [[x[0] for x in self.parentalTreeIterator(y)]
+ for y in taxids]
+
+ for x in allLineage: x.reverse()
+
+ allLineage=[permanentIterator(x) for x in allLineage]
+
+ c=True
+ while(c):
+
+ lcas = Counter([x.next() for x in allLineage])
+ #print lcas
+ if len(lcas) > 1:
+ main = float(max(lcas.values()))/len(taxids)
+ c = main > (1 - error)
+ if c:
+ lca = lcas.most_common(1)[0][0]
+
+ #print lca
+
+ return lca
+
+
+ def getPreferedName(self,taxid):
+ idx = self.findIndex(taxid)
+ return self._preferedidx.get(idx,self._taxonomy[idx][3])
+
+
+ def getScientificName(self,taxid):
+ return self.findTaxonByTaxid(taxid)[3]
+
+ def getRankId(self,taxid):
+ return self.findTaxonByTaxid(taxid)[1]
+
+ def getRank(self,taxid):
+ return self._ranks[self.getRankId(taxid)]
+
+ def getTaxonAtRank(self,taxid,rankid):
+ if isinstance(rankid, str):
+ rankid=self._ranks.index(rankid)
+ try:
+ return [x[0] for x in self.parentalTreeIterator(taxid)
+ if x[1]==rankid][0]
+ except IndexError:
+ return None
+
+ def getSpecies(self,taxid):
+ return self.getTaxonAtRank(taxid, self._speciesidx)
+
+ def getGenus(self,taxid):
+ return self.getTaxonAtRank(taxid, self._genusidx)
+
+ def getFamily(self,taxid):
+ return self.getTaxonAtRank(taxid, self._familyidx)
+
+ def getOrder(self,taxid):
+ return self.getTaxonAtRank(taxid, self._orderidx)
+
+ def rankIterator(self):
+ for x in imap(None,self._ranks,xrange(len(self._ranks))):
+ yield x
+
+ def groupTaxa(self,taxa,groupname):
+ t=[self.findTaxonByTaxid(x) for x in taxa]
+ a=set(x[2] for x in t)
+ assert len(a)==1,"All taxa must have the same parent"
+ newtaxid=max([2999999]+[x[0] for x in self._taxonomy if x[0]>=3000000 and x[0]<4000000])+1
+ newidx=len(self._taxonomy)
+ if 'GROUP' not in self._ranks:
+ self._ranks.append('GROUP')
+ rankid=self._ranks.index('GROUP')
+ self._taxonomy.append((newtaxid,rankid,a.pop(),groupname))
+ for x in t:
+ x[2]=newidx
+
+
+ def addLocalTaxon(self,name,rank,parent,minimaltaxid=10000000):
+ newtaxid = minimaltaxid if (self._bigestTaxid < minimaltaxid) else self._bigestTaxid+1
+
+ rankid=self.findRankByName(rank)
+ parentidx = self.findIndex(int(parent))
+ tx = (newtaxid,rankid,parentidx,name,'local')
+ self._taxonomy.append(tx)
+ newidx=len(self._taxonomy)-1
+ self._name.append((name,'scientific name',newidx))
+
+ if name not in self._nameidx :
+ self._nameidx[name]=[newidx]
+ else :
+ self._nameidx[name].append(newidx)
+
+ self._index[newtaxid]=newidx
+
+ self._bigestTaxid=newtaxid
+
+ return newtaxid
+
+
+    def removeLocalTaxon(self,taxid):
+        raise NotImplementedError    # NotImplemented is a comparison sentinel, not an exception
+        txidx = self.findIndex(taxid)
+        taxon = self.findTaxonByIdx(txidx)
+        # NOTE: the statements after the raise are unreachable until the method is implemented
+        assert txidx >= self._localtaxon,"Only local taxon can be deleted"
+
+        for t in self._taxonomy:
+            if t[2] == txidx:
+                self.removeLocalTaxon(t[0])
+
+        return taxon
+
+
+ def addPreferedName(self,taxid,name):
+ idx = self.findIndex(taxid)
+ self._preferedName.append([name,'obi',idx])
+ self._preferedidx[idx]=name
+ return taxid
+
+
+class EcoTaxonomyDB(Taxonomy,EcoPCRDBFile):
+ '''
+ A taxonomy database class
+ '''
+
+
+ def __init__(self,path):
+ '''
+ The taxonomy database constructor
+
+ @param path: path to the ecoPCR database including the database prefix name
+ @type path: C{str}
+ '''
+ self._path = path
+ self._taxonFile = "%s.tdx" % self._path
+ self._localTaxonFile = "%s.ldx" % self._path
+ self._ranksFile = "%s.rdx" % self._path
+ self._namesFile = "%s.ndx" % self._path
+ self._preferedNamesFile = "%s.pdx" % self._path
+ self._aliasFile = "%s.adx" % self._path
+
+ print >> sys.stderr,"Reading binary taxonomy database..."
+
+ self.__readNodeTable()
+
+ print >> sys.stderr," ok"
+
+ Taxonomy.__init__(self)
+
+
+ #####
+ #
+ # Iterator functions
+ #
+ #####
+
+ def __ecoNameIterator(self,file):
+ for record in self._ecoRecordIterator(file):
+ lrecord = len(record)
+ lnames = lrecord - 16
+ (isScientificName,namelength,classLength,indextaxid,names)=struct.unpack('> I I I I %ds' % lnames, record)
+ name=names[:namelength]
+ classname=names[namelength:]
+ yield (name,classname,indextaxid)
+
+
+ def __ecoTaxonomicIterator(self):
+ for record in self._ecoRecordIterator(self._taxonFile):
+ lrecord = len(record)
+ lnames = lrecord - 16
+ (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record)
+ yield (taxid,rankid,parentidx,name,'ncbi')
+
+ try :
+ lt=0
+ for record in self._ecoRecordIterator(self._localTaxonFile,noError=True):
+ lrecord = len(record)
+ lnames = lrecord - 16
+ (taxid,rankid,parentidx,nameLength,name)=struct.unpack('> I I I I %ds' % lnames, record) # @UnusedVariable
+ lt+=1
+ yield (taxid,rankid,parentidx,name,'local')
+ print >> sys.stderr, " [INFO : Local taxon file found] : %d added taxa" % lt
+ except:
+ print >> sys.stderr, " [INFO : Local taxon file not found] "
+
+ def __ecoRankIterator(self):
+ for record in self._ecoRecordIterator(self._ranksFile):
+ yield record
+
+ def __ecoAliasIterator(self):
+ for record in self._ecoRecordIterator(self._aliasFile,noError=True):
+ (taxid,index) = struct.unpack('> I i',record)
+ yield taxid,index
+
+ #####
+ #
+ # Indexes
+ #
+ #####
+
+ def __ecoNameIndex(self):
+ indexName = [x for x in self.__ecoNameIterator(self._namesFile)]
+ return indexName
+
+ def __ecoRankIndex(self):
+ rank = [r for r in self.__ecoRankIterator()]
+ return rank
+
+ def __ecoTaxonomyIndex(self):
+ taxonomy = []
+
+ try :
+ index = dict(self.__ecoAliasIterator())
+ print >> sys.stderr, " [INFO : Taxon alias file found] "
+ buildIndex=False
+ except:
+ print >> sys.stderr, " [INFO : Taxon alias file not found] "
+ index={}
+ buildIndex=True
+
+ localtaxon=0
+ i=0
+ for x in self.__ecoTaxonomicIterator():
+ taxonomy.append(x)
+ if x[4]!='local':
+ localtaxon+=1
+
+ if buildIndex or x[4]=='local':
+ index[x[0]] = i
+ i+=1
+
+
+ print >> sys.stderr,"Taxonomical tree read"
+ return taxonomy, index, localtaxon
+
+ def __readNodeTable(self):
+ self._taxonomy, self._index, self._localtaxon= self.__ecoTaxonomyIndex()
+ self._ranks = self.__ecoRankIndex()
+ self._name = self.__ecoNameIndex()
+
+ # Add local taxon tame to the name index
+ i=self._localtaxon
+ for t in self._taxonomy[self._localtaxon:]:
+ self._name.append((t[3],'scientific name',i))
+ i+=1
+
+ try :
+ self._preferedName = [(x[0],'obi',x[2])
+ for x in self.__ecoNameIterator(self._preferedNamesFile)]
+ print >> sys.stderr, " [INFO : Preferred taxon name file found] : %d added taxa" % len(self._preferedName)
+ except:
+ print >> sys.stderr, " [INFO : Preferred taxon name file not found]"
+ self._preferedName = []
+
+
+
+
+class TaxonomyDump(Taxonomy):
+
+ def __init__(self,taxdir):
+
+ self._path=taxdir
+ self._readNodeTable('%s/nodes.dmp' % taxdir)
+
+ print >>sys.stderr,"Adding scientific name..."
+
+ self._name=[]
+ for taxid,name,classname in self._nameIterator('%s/names.dmp' % taxdir):
+ self._name.append((name,classname,self._index[taxid]))
+ if classname == 'scientific name':
+ self._taxonomy[self._index[taxid]].extend([name,'ncbi'])
+
+ print >>sys.stderr,"Adding taxid alias..."
+ for taxid,current in self._mergedNodeIterator('%s/merged.dmp' % taxdir):
+ self._index[taxid]=self._index[current]
+
+ print >>sys.stderr,"Adding deleted taxid..."
+ for taxid in self._deletedNodeIterator('%s/delnodes.dmp' % taxdir):
+ self._index[taxid]=None
+
+ Taxonomy.__init__(self)
+
+# self._nameidx = {}
+# for x in self._name :
+# if x[0] not in self._nameidx :
+# self._nameidx[x[0]] = [x[2]]
+# else :
+# self._nameidx[x[0]].append(x[2])
+
+
+ def _taxonCmp(t1,t2):
+ if t1[0] < t2[0]:
+ return -1
+ elif t1[0] > t2[0]:
+ return +1
+ return 0
+
+ _taxonCmp=staticmethod(_taxonCmp)
+
+    def _bsearchTaxon(self,taxid):
+        '''Binary search of self._taxonomy (must be sorted by taxid); return the index of taxid or None.'''
+        taxCount = len(self._taxonomy)
+        begin, end = 0, taxCount
+        oldcheck=taxCount
+        check = (begin + end) // 2    # parenthesised midpoint, integer division
+        while check != oldcheck and self._taxonomy[check][0]!=taxid :
+            if self._taxonomy[check][0] < taxid:
+                begin=check
+            else:
+                end=check
+            oldcheck=check
+            check = (begin + end) // 2
+
+        # the loop also stops when the midpoint no longer moves, i.e. taxid is absent
+        if taxCount and self._taxonomy[check][0]==taxid:
+            return check
+        else:
+            return None
+
+
+
+ def _readNodeTable(self,file):
+
+ file = universalOpen(file)
+
+ nodes = ColumnFile(file,
+ sep='|',
+ types=(int,int,str,
+ str,str,bool,
+ int,bool,int,
+ bool,bool,bool,str))
+ print >>sys.stderr,"Reading taxonomy dump file..."
+ # (taxid,rank,parent)
+ taxonomy=[[n[0],n[2],n[1]] for n in nodes]
+
+ print >>sys.stderr,"List all taxonomy rank..."
+ ranks =list(set(x[1] for x in taxonomy))
+ ranks.sort()
+ rankidx = dict(map(None,ranks,xrange(len(ranks))))
+
+ # EC: Taxa are sorted by taxid in node.dmp file
+ # No need to sort them
+
+ #print >>sys.stderr,"Sorting taxons..."
+ #taxonomy.sort(TaxonomyDump._taxonCmp)
+
+ self._taxonomy=taxonomy
+ self._localtaxon=len(taxonomy)
+
+ print >>sys.stderr,"Indexing taxonomy..."
+ index = {}
+ for i in xrange(self._localtaxon):
+ index[self._taxonomy[i][0]]=i
+
+ print >>sys.stderr,"Indexing parent and rank..."
+ for t in self._taxonomy:
+ t[1]=rankidx[t[1]]
+ t[2]=index[t[2]]
+
+ self._ranks=ranks
+ self._index=index
+ self._preferedName = []
+
+ def _nameIterator(self,file):
+ file = universalOpen(file)
+ names = ColumnFile(file,
+ sep='|',
+ types=(int,str,
+ str,str))
+ for taxid,name,unique,classname,white in names:
+ yield taxid,name,classname
+
+ def _mergedNodeIterator(self,file):
+ file = universalOpen(file)
+ merged = ColumnFile(file,
+ sep='|',
+ types=(int,int,str))
+ for taxid,current,white in merged:
+ yield taxid,current
+
+ def _deletedNodeIterator(self,file):
+ file = universalOpen(file)
+ deleted = ColumnFile(file,
+ sep='|',
+ types=(int,str))
+ for taxid,white in deleted:
+ yield taxid
+
+#####
+#
+#
+# Binary writer
+#
+#
+#####
+
+def ecoTaxonomyWriter(prefix, taxonomy,onlyLocal=False):
+
+ def ecoTaxPacker(tx):
+
+ namelength = len(tx[3])
+
+ totalSize = 4 + 4 + 4 + 4 + namelength
+
+ try:
+ packed = struct.pack('> I I I I I %ds' % namelength,
+ totalSize,
+ tx[0],
+ tx[1],
+ tx[2],
+ namelength,
+ tx[3])
+ except :
+ raise TypeError,"Cannot convert %s" % tx[3]
+
+ return packed
+
+ def ecoRankPacker(rank):
+
+ namelength = len(rank)
+
+ packed = struct.pack('> I %ds' % namelength,
+ namelength,
+ rank)
+
+ return packed
+
+ def ecoAliasPacker(taxid,index):
+
+ totalSize = 4 + 4
+ try:
+ packed = struct.pack('> I I i',
+ totalSize,
+ taxid,
+ index)
+ except struct.error,e:
+ print >>sys.stderr,(totalSize,taxid,index)
+ print >>sys.stderr,"Total size : %d taxid : %d index : %d" %(totalSize,taxid,index)
+ raise e
+
+ return packed
+
+ def ecoNamePacker(name):
+
+ namelength = len(name[0])
+ classlength= len(name[1])
+ totalSize = namelength + classlength + 4 + 4 + 4 + 4
+
+ packed = struct.pack('> I I I I I %ds %ds' % (namelength,classlength),
+ totalSize,
+ int(name[1]=='scientific name'),
+ namelength,
+ classlength,
+ name[2],
+ name[0],
+ name[1])
+
+ return packed
+
+
+ def ecoTaxWriter(file,taxonomy):
+ output = open(file,'wb')
+ nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]!='local'),0)
+
+ output.write(struct.pack('> I',nbtaxon))
+
+ for tx in taxonomy:
+ if tx[4]!='local':
+ output.write(ecoTaxPacker(tx))
+
+ output.close()
+ return nbtaxon < len(taxonomy)
+
+ def ecoLocalTaxWriter(file,taxonomy):
+ nbtaxon = reduce(lambda x,y:x+y,(1 for t in taxonomy if t[4]=='local'),0)
+
+ if nbtaxon:
+ output = open(file,'wb')
+
+ output.write(struct.pack('> I',nbtaxon))
+
+ for tx in taxonomy:
+ if tx[4]=='local':
+ output.write(ecoTaxPacker(tx))
+
+ output.close()
+
+
+ def ecoRankWriter(file,ranks):
+ output = open(file,'wb')
+ output.write(struct.pack('> I',len(ranks)))
+
+ for rank in ranks:
+ output.write(ecoRankPacker(rank))
+
+ output.close()
+
+ def ecoAliasWriter(file,index):
+ output = open(file,'wb')
+ output.write(struct.pack('> I',len(index)))
+
+ for taxid in index:
+ i=index[taxid]
+ if i is None:
+ i=-1
+ output.write(ecoAliasPacker(taxid, i))
+
+ output.close()
+
+ def nameCmp(n1,n2):
+ name1=n1[0].upper()
+ name2=n2[0].upper()
+ if name1 < name2:
+ return -1
+ elif name1 > name2:
+ return 1
+ return 0
+
+
+ def ecoNameWriter(file,names):
+ output = open(file,'wb')
+ output.write(struct.pack('> I',len(names)))
+
+ names.sort(nameCmp)
+
+ for name in names:
+ output.write(ecoNamePacker(name))
+
+ output.close()
+
+ def ecoPreferedNameWriter(file,names):
+ output = open(file,'wb')
+ output.write(struct.pack('> I',len(names)))
+ for name in names:
+ output.write(ecoNamePacker(name))
+
+ output.close()
+
+ localtaxon=True
+ if not onlyLocal:
+ ecoRankWriter('%s.rdx' % prefix, taxonomy._ranks)
+ localtaxon = ecoTaxWriter('%s.tdx' % prefix, taxonomy._taxonomy)
+ ecoNameWriter('%s.ndx' % prefix, [x for x in taxonomy._name if x[2] < taxonomy._localtaxon])
+ ecoAliasWriter('%s.adx' % prefix, taxonomy._index)
+ if localtaxon:
+ ecoLocalTaxWriter('%s.ldx' % prefix, taxonomy._taxonomy)
+ if taxonomy._preferedName:
+ ecoNameWriter('%s.pdx' % prefix, taxonomy._preferedName)
diff --git a/src/obitools/ecotag/__init__.py b/src/obitools/ecotag/__init__.py
new file mode 100644
index 0000000..26c94d3
--- /dev/null
+++ b/src/obitools/ecotag/__init__.py
@@ -0,0 +1,2 @@
+class EcoTagResult(dict):
+ pass
\ No newline at end of file
diff --git a/src/obitools/ecotag/parser.py b/src/obitools/ecotag/parser.py
new file mode 100644
index 0000000..ff6865b
--- /dev/null
+++ b/src/obitools/ecotag/parser.py
@@ -0,0 +1,150 @@
+from itertools import imap
+from obitools import utils
+
+from obitools.ecotag import EcoTagResult
+
+class EcoTagFileIterator(utils.ColumnFile):
+
+ @staticmethod
+ def taxid(x):
+ x = int(x)
+ if x < 0:
+ return None
+ else:
+ return x
+
+ @staticmethod
+ def scientificName(x):
+ if x=='--':
+ return None
+ else:
+ return x
+
+ @staticmethod
+ def value(x):
+ if x=='--':
+ return None
+ else:
+ return float(x)
+
+ @staticmethod
+ def count(x):
+ if x=='--':
+ return None
+ else:
+ return int(x)
+
+
+ def __init__(self,stream):
+ utils.ColumnFile.__init__(self,
+ stream, '\t', True,
+ (str,str,str,
+ EcoTagFileIterator.value,
+ EcoTagFileIterator.value,
+ EcoTagFileIterator.value,
+ EcoTagFileIterator.count,
+ EcoTagFileIterator.count,
+ EcoTagFileIterator.taxid,
+ EcoTagFileIterator.scientificName,
+ str,
+ EcoTagFileIterator.taxid,
+ EcoTagFileIterator.scientificName,
+ EcoTagFileIterator.taxid,
+ EcoTagFileIterator.scientificName,
+ EcoTagFileIterator.taxid,
+ EcoTagFileIterator.scientificName,
+ str
+ ))
+ self._memory=None
+
+ _colname = ['identification',
+ 'seqid',
+ 'best_match_ac',
+ 'max_identity',
+ 'min_identity',
+ 'theorical_min_identity',
+ 'count',
+ 'match_count',
+ 'taxid',
+ 'scientific_name',
+ 'rank',
+ 'order_taxid',
+ 'order_name',
+ 'family_taxid',
+ 'family_name',
+ 'genus_taxid',
+ 'genus_name',
+ 'species_taxid',
+ 'species_name',
+ 'sequence']
+
+ def next(self):
+ if self._memory is not None:
+ data=self._memory
+ self._memory=None
+ else:
+ data = utils.ColumnFile.next(self)
+ data = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(data)],data))
+
+ if data['identification']=='ID':
+ data.cd=[]
+ try:
+ nextone = utils.ColumnFile.next(self)
+ nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone))
+ except StopIteration:
+ nextone = None
+ while nextone is not None and nextone['identification']=='CD':
+ data.cd.append(nextone)
+ try:
+ nextone = utils.ColumnFile.next(self)
+ nextone = EcoTagResult(imap(None,EcoTagFileIterator._colname[:len(nextone)],nextone))
+ except StopIteration:
+ nextone = None
+ self._memory=nextone
+
+ return data
+
+def ecoTagIdentifiedFilter(ecoTagIterator):
+ for x in ecoTagIterator:
+ if x['identification']=='ID':
+ yield x
+
+
+class EcoTagAbstractIterator(utils.ColumnFile):
+
+ _colname = ['scientific_name',
+ 'taxid',
+ 'rank',
+ 'count',
+ 'max_identity',
+ 'min_identity']
+
+
+ @staticmethod
+ def taxid(x):
+ x = int(x)
+ if x < 0:
+ return None
+ else:
+ return x
+
+ def __init__(self,stream):
+ utils.ColumnFile.__init__(self,
+ stream, '\t', True,
+ (str,
+ EcoTagFileIterator.taxid,
+ str,
+ int,
+ float,float,float))
+
+ def next(self):
+ data = utils.ColumnFile.next(self)
+ data = dict(imap(None,EcoTagAbstractIterator._colname,data))
+
+ return data
+
+def ecoTagAbstractFilter(ecoTagAbsIterator):
+ for x in ecoTagAbsIterator:
+ if x['taxid'] is not None:
+ yield x
+
\ No newline at end of file
diff --git a/src/obitools/eutils/__init__.py b/src/obitools/eutils/__init__.py
new file mode 100644
index 0000000..1e7d3b2
--- /dev/null
+++ b/src/obitools/eutils/__init__.py
@@ -0,0 +1,54 @@
+import time
+from urllib2 import urlopen
+import shelve
+from threading import Lock
+import sys
+
+class EUtils(object):
+ '''
+
+ '''
+
+ _last_request=0
+ _interval=3
+
+ def __init__(self):
+ self._lock = Lock()
+
+ def wait(self):
+ now=time.time()
+ delta = now - EUtils._last_request
+ while delta < EUtils._interval:
+ time.sleep(delta)
+ now=time.time()
+ delta = now - EUtils._last_request
+
+ def _sendRequest(self,url):
+ self.wait()
+ EUtils._last_request=time.time()
+ t = EUtils._last_request
+ print >>sys.stderr,"Sending request to NCBI @ %f" % t
+ data = urlopen(url).read()
+ print >>sys.stderr,"Data red from NCBI @ %f (%f)" % (t,time.time()-t)
+ return data
+
+ def setInterval(self,seconde):
+ EUtils._interval=seconde
+
+
+class EFetch(EUtils):
+ '''
+
+ '''
+ def __init__(self,db,tool='OBITools',
+ retmode='text',rettype="native",
+ server='eutils.ncbi.nlm.nih.gov'):
+ EUtils.__init__(self)
+ self._url = "http://%s/entrez/eutils/efetch.fcgi?db=%s&tool=%s&retmode=%s&rettype=%s"
+ self._url = self._url % (server,db,tool,retmode,rettype)
+
+
+ def get(self,**args):
+ key = "&".join(['%s=%s' % x for x in args.items()])
+ return self._sendRequest(self._url +"&" + key)
+
diff --git a/src/obitools/fast.py b/src/obitools/fast.py
new file mode 100644
index 0000000..760f493
--- /dev/null
+++ b/src/obitools/fast.py
@@ -0,0 +1,56 @@
+"""
+ implement fastn/fastp similarity search algorithm for BioSequence.
+"""
+
+class Fast(object):
+
+ def __init__(self,seq,kup=2):
+ '''
+ @param seq: sequence to hash
+ @type seq: BioSequence
+ @param kup: word size used for hashing process
+ @type kup: int
+ '''
+ hash={}
+ seq = str(seq)
+ for word,pos in ((seq[i:i+kup].upper(),i) for i in xrange(len(seq)-kup)):
+ if word in hash:
+ hash[word].append(pos)
+ else:
+ hash[word]=[pos]
+
+ self._kup = kup
+ self._hash= hash
+ self._seq = seq
+
+ def __call__(self,seq):
+ '''
+ Align one sequence with the fast hash table.
+
+ @param seq: the sequence to align
+ @type seq: BioSequence
+
+ @return: where smax is the
+ score of the largest diagonal and pmax the
+ associated shift
+ @rtype: a int tuple (smax,pmax)
+ '''
+ histo={}
+ seq = str(seq).upper()
+ hash= self._hash
+ kup = self._kup
+
+ for word,pos in ((seq[i:i+kup],i) for i in xrange(len(seq)-kup)):
+ matchedpos = hash.get(word,[])
+ for p in matchedpos:
+ delta = pos - p
+ histo[delta]=histo.get(delta,0) + 1
+ smax = max(histo.values())
+ pmax = [x for x in histo if histo[x]==smax]
+ return smax,pmax
+
+ def __len__(self):
+ return len(self._seq)
+
+
+
diff --git a/src/obitools/fasta/__init__.py b/src/obitools/fasta/__init__.py
new file mode 100644
index 0000000..45a3042
--- /dev/null
+++ b/src/obitools/fasta/__init__.py
@@ -0,0 +1,13 @@
+"""
+fasta module provides functions to read and write sequences in fasta format.
+
+
+"""
+
+from _fasta import parseFastaDescription, \
+ fastaParser, fastaNucParser,fastaAAParser, fastFastaParser, \
+ fastaIterator,fastFastaIterator, rawFastaIterator, \
+ fastaNucIterator, fastaAAIterator, \
+ formatFasta, formatSAPFastaGenerator
+
+
diff --git a/src/obitools/fasta/_fasta.pxd b/src/obitools/fasta/_fasta.pxd
new file mode 100644
index 0000000..e6057d1
--- /dev/null
+++ b/src/obitools/fasta/_fasta.pxd
@@ -0,0 +1,13 @@
+# Declarations of the cpdef functions of _fasta.pyx, so that other
+# Cython modules can cimport them.
+
+# Parse one fasta record; joinseq is optional (=? : default set in the .pyx).
+cpdef object fastaParser(bytes seq,
+ object bioseqfactory,
+ object tagparser,
+ bytes rawparser,
+ object joinseq=?)
+
+# Parse one fasta record with the built-in tag pattern.
+cpdef object fastFastaParser(bytes seq,
+ object tagparser,
+ bytes rawparser)
+
+# Both return the tuple (definition, info) extracted from a title line.
+cpdef tuple fastParseFastaDescription(bytes ds)
+cpdef tuple parseFastaDescription(bytes ds, object tagparser)
+
\ No newline at end of file
diff --git a/src/obitools/fasta/_fasta.pyx b/src/obitools/fasta/_fasta.pyx
new file mode 100644
index 0000000..a0e5714
--- /dev/null
+++ b/src/obitools/fasta/_fasta.pyx
@@ -0,0 +1,515 @@
+# cython: profile=True
+"""
+fasta module provides functions to read and write sequences in fasta format.
+
+
+"""
+from _fasta cimport *
+
+from obitools._obitools cimport _bioSeqGenerator,BioSequence,AASequence,NucSequence
+from obitools._obitools cimport __default_raw_parser
+
+from obitools.format.genericparser import genericEntryIteratorGenerator
+
+#from obitools.alignment import alignmentReader
+#from obitools.utils import universalOpen
+
+import re
+from obitools.ecopcr.options import loadTaxonomyDatabase
+from obitools.format import SequenceFileIterator
+
+#from _fasta import parseFastaDescription,fastaParser
+#from _fasta import _fastaJoinSeq
+#from _fasta import _parseFastaTag
+
+
+cdef extern from "regex.h":
+ struct regex_t:
+ pass
+
+ struct regmatch_t:
+ int rm_so # start of match
+ int rm_eo # end of match
+
+ enum REG_EXTENDED:
+ pass
+
+
+
+ int regcomp(regex_t *preg, char *pattern, int cflags)
+ int regexec(regex_t *preg, char *string, int nmatch, regmatch_t *pmatch, int eflags)
+ void regfree(regex_t *preg)
+
+#fastaEntryIterator=fastGenericEntryIteratorGenerator(startEntry='>')
+# Entry iterators: an entry starts at '>' ...
+fastaEntryIterator=genericEntryIteratorGenerator(startEntry='>')
+# ... or at '>' possibly preceded by blank characters for the raw variant.
+rawFastaEntryIterator=genericEntryIteratorGenerator(startEntry='\s*>')
+
+cdef bytes _fastaJoinSeq(list seqarray):
+    # Default way to rebuild a sequence: strip each line and concatenate.
+    cdef list stripped = []
+    for line in seqarray:
+        stripped.append(line.strip())
+    return b''.join(stripped)
+
+
+cpdef tuple parseFastaDescription(bytes ds, object tagparser):
+    '''
+    Split the description part of a title line into definition and tags.
+
+    A space is added in front of C{ds} before C{tagparser} is searched.
+
+    @param ds: the description part of the title line
+    @type ds: C{bytes}
+    @param tagparser: compiled regular expression matching the tags
+    @type tagparser: regex instance
+
+    @return: C{(definition,info)}. On a match, info is the matched text
+             and definition the right-stripped text following it;
+             otherwise info is C{None} and definition is the
+             space-prefixed C{ds}.
+    @rtype: C{tuple}
+    '''
+    cdef bytes padded = b' '+ds
+    cdef object hit = tagparser.search(padded)
+
+    if hit is None:
+        return padded,None
+
+    return padded[hit.end(0):].rstrip(),hit.group(0)
+
+# Pattern used by fastParseFastaDescription: one or more 'key = value;' tags
+# anchored at the start of the text, separated by at least one space.
+cdef bytes _fastTagParser=b'^[a-zA-Z][a-zA-Z.0-9_]* *= *[^;]*;( +[a-zA-Z][a-zA-Z.0-9_]* *= *[^;]*;)*'
+# Compiled once at module load.
+cdef object _cfastTagParser=re.compile(_fastTagParser)
+
+#cdef regex_t cfastTagParser
+#cdef int regerror=regcomp(&cfastTagParser, fastTagParser, REG_EXTENDED)
+
+cpdef tuple fastParseFastaDescription(bytes ds):
+    '''
+    Split the description part of a title line into definition and tags
+    using the module level compiled pattern C{_cfastTagParser}.
+
+    @param ds: the description part of the title line
+    @type ds: C{bytes}
+
+    @return: C{(definition,info)}. On a match, info is the matched text
+             and definition the right-stripped text following it;
+             otherwise info is C{None} and definition is C{ds}.
+    @rtype: C{tuple}
+    '''
+    cdef object hit = _cfastTagParser.search(ds)
+
+    if hit is None:
+        return ds,None
+
+    return ds[hit.end(0):].rstrip(),hit.group(0)
+
+
+
+cpdef object fastFastaParser(bytes seq,
+                             object tagparser,
+                             bytes rawparser):
+    '''
+    Parse a fasta record using the built-in tag pattern.
+
+    @attention: internal purpose function
+
+    @param seq: the whole text of one fasta record, title line first,
+                lines separated by newlines
+    @type seq: C{bytes}
+
+    @param tagparser: not used by this function; the title line is
+                      parsed by L{fastParseFastaDescription}
+    @type tagparser: regex instance
+
+    @param rawparser: passed unchanged to C{_bioSeqGenerator}
+    @type rawparser: C{bytes}
+
+    @return: the object built by C{_bioSeqGenerator}
+    '''
+    cdef list lseq = seq.split(b'\n')
+    cdef list title = lseq.pop(0).split(None,1)
+    # identifier = first word of the title line without its leading '>'
+    cdef bytes id = title[0][1:]
+    # was declared as 'defintion', which left 'definition' untyped
+    cdef bytes definition,info
+
+    if len(title) == 2:
+        definition,info=fastParseFastaDescription(title[1])
+    else:
+        info= None
+        definition=None
+
+    seq=b''.join([x.rstrip() for x in lseq])
+
+    return _bioSeqGenerator(id, seq, definition,info,rawparser,{})
+
+
+cpdef object fastaParser(bytes seq,
+                         object bioseqfactory,
+                         object tagparser,
+                         bytes rawparser,
+                         object joinseq=None):
+    '''
+    Parse a fasta record.
+
+    @attention: internal purpose function
+
+    @param seq: the whole text of one fasta record, title line first,
+                lines separated by newlines
+    @type seq: C{bytes}
+
+    @param bioseqfactory: a callable object return a BioSequence
+                          instance; when C{None}, C{_bioSeqGenerator}
+                          is used.
+    @type bioseqfactory: a callable object or C{None}
+
+    @param tagparser: a compiled regular expression usable
+                      to identify key, value couples from
+                      title line.
+    @type tagparser: regex instance
+
+    @param rawparser: passed unchanged to the sequence factory
+    @type rawparser: C{bytes}
+
+    @param joinseq: callable building the sequence from the list of
+                    sequence lines; when C{None}, lines are stripped
+                    and concatenated.
+    @type joinseq: a callable object or C{None}
+
+    @return: the object built by the sequence factory
+    '''
+    cdef list lseq = seq.split(b'\n')
+    cdef list title = lseq.pop(0).split(None,1)
+    # identifier = first word of the title line without its leading '>'
+    cdef bytes id = title[0][1:]
+    # was declared as 'defintion', which left 'definition' untyped
+    cdef bytes definition,info
+
+    if len(title) == 2:
+        definition,info=parseFastaDescription(title[1], tagparser)
+    else:
+        info= None
+        definition=None
+
+    if joinseq is None:
+        seq=_fastaJoinSeq(lseq)
+    else:
+        seq=joinseq(lseq)
+
+    if bioseqfactory is None:
+        return _bioSeqGenerator(id, seq, definition,info,rawparser,{})
+    else:
+        return bioseqfactory(id, seq, definition,info,rawparser)
+
+def fastaNucParser(seq,tagparser=__default_raw_parser,joinseq=None):
+    '''
+    Parse one fasta record into a C{NucSequence}.
+
+    @param seq: the whole text of one fasta record
+    @param tagparser: raw tag pattern with one C{%s} placeholder
+                      for the tag name
+    @param joinseq: passed unchanged to L{fastaParser}
+
+    @return: the object built by L{fastaParser}
+    '''
+    # fastaParser needs both the compiled pattern and the raw one; the
+    # former call omitted the required rawparser argument. The pattern is
+    # compiled the same way as in fastaIterator.
+    allparser = tagparser % b'[a-zA-Z][a-zA-Z0-9_]*'
+    rtagparser = re.compile('( *%s)+' % allparser)
+    return fastaParser(seq,NucSequence,rtagparser,tagparser,joinseq)
+
+def fastaAAParser(seq,tagparser=__default_raw_parser,joinseq=None):
+    '''
+    Parse one fasta record into an C{AASequence}.
+
+    @param seq: the whole text of one fasta record
+    @param tagparser: raw tag pattern with one C{%s} placeholder
+                      for the tag name
+    @param joinseq: passed unchanged to L{fastaParser}
+
+    @return: the object built by L{fastaParser}
+    '''
+    # fastaParser needs both the compiled pattern and the raw one; the
+    # former call omitted the required rawparser argument. The pattern is
+    # compiled the same way as in fastaIterator.
+    allparser = tagparser % b'[a-zA-Z][a-zA-Z0-9_]*'
+    rtagparser = re.compile('( *%s)+' % allparser)
+    return fastaParser(seq,AASequence,rtagparser,tagparser,joinseq)
+
+def fastFastaIterator(object file,bytes tagparser=__default_raw_parser):
+    '''
+    Iterate through a fasta file entry by entry, parsing each entry
+    with L{fastFastaParser}.
+
+    @param file: the input handed over to C{fastaEntryIterator}
+
+    @param tagparser: raw tag pattern with one C{%s} placeholder for
+                      the tag name; also passed as raw parser to the
+                      sequences built.
+    @type tagparser: C{bytes}
+
+    @return: a generator on the parsed sequences
+
+    @see: L{fastaIterator}
+
+    >>> from obitools.fasta import fastFastaIterator
+    >>> f = fastFastaIterator('monfichier')
+    >>> s = f.next()
+    >>> print s
+    gctagctagcatgctagcatgcta
+    >>>
+    '''
+    cdef bytes tagpattern = tagparser % b'[a-zA-Z][a-zA-Z0-9_]*'
+
+    compiled = re.compile('( *%s)+' % tagpattern)
+
+    for record in fastaEntryIterator(file):
+        yield fastFastaParser(record,compiled,tagparser)
+
+def fastaIterator(object file,
+                  object bioseqfactory=None,
+                  bytes tagparser=__default_raw_parser,
+                  object joinseq=None):
+    '''
+    Iterate through a fasta file entry by entry, parsing each entry
+    with L{fastaParser}.
+
+    @param file: the input handed over to C{fastaEntryIterator}
+
+    @param bioseqfactory: a callable building the sequence object;
+                          passed unchanged to L{fastaParser}
+    @type bioseqfactory: a callable object or C{None}
+
+    @param tagparser: raw tag pattern with one C{%s} placeholder for
+                      the tag name; also passed as raw parser to the
+                      sequences built.
+    @type tagparser: C{bytes}
+
+    @param joinseq: passed unchanged to L{fastaParser}
+    @type joinseq: a callable object or C{None}
+
+    @return: a generator on the parsed sequences
+
+    @see: L{fastaNucIterator}
+    @see: L{fastaAAIterator}
+
+    >>> from obitools.fasta import fastaIterator
+    >>> f = fastaIterator('monfichier')
+    >>> s = f.next()
+    >>> print s
+    gctagctagcatgctagcatgcta
+    >>>
+    '''
+    cdef bytes tagpattern = tagparser % b'[a-zA-Z][a-zA-Z0-9_]*'
+
+    compiled = re.compile('( *%s)+' % tagpattern)
+
+    for record in fastaEntryIterator(file):
+        yield fastaParser(record,bioseqfactory,compiled,tagparser,joinseq)
+
+def rawFastaIterator(file,bioseqfactory=None,
+                     tagparser=__default_raw_parser,
+                     joinseq=None):
+    '''
+    Iterate through a fasta file with C{rawFastaEntryIterator}; each
+    entry is stripped before being parsed with L{fastaParser}.
+
+    @param file: the input handed over to C{rawFastaEntryIterator}
+    @param bioseqfactory: passed unchanged to L{fastaParser}
+    @param tagparser: raw tag pattern with one C{%s} placeholder for
+                      the tag name (here tag names may contain dots)
+    @param joinseq: passed unchanged to L{fastaParser}
+
+    @return: a generator on the parsed sequences
+    '''
+    rawparser = tagparser
+    compiled = re.compile('( *%s)+' % (rawparser % '[a-zA-Z][a-zA-Z.0-9_]*'))
+
+    for entry in rawFastaEntryIterator(file):
+        yield fastaParser(entry.strip(),bioseqfactory,compiled,rawparser,joinseq)
+
+def fastaNucIterator(file,tagparser=__default_raw_parser):
+    '''
+    Iterate through a fasta file, building each sequence with
+    C{NucSequence}.
+
+    Shortcut for C{fastaIterator(file, NucSequence, tagparser)}.
+
+    @param file: passed unchanged to L{fastaIterator}
+    @param tagparser: raw tag pattern passed unchanged to L{fastaIterator}
+
+    @return: the generator returned by L{fastaIterator}
+    @rtype: a generator object
+
+    @see: L{fastaIterator}
+    @see: L{fastaAAIterator}
+    '''
+    nucleic = fastaIterator(file, NucSequence,tagparser)
+    return nucleic
+
+def fastaAAIterator(file,tagparser=__default_raw_parser):
+    '''
+    Iterate through a fasta file, building each sequence with
+    C{AASequence}.
+
+    Shortcut for C{fastaIterator(file, AASequence, tagparser)}.
+
+    @param file: passed unchanged to L{fastaIterator}
+    @param tagparser: raw tag pattern passed unchanged to L{fastaIterator}
+
+    @return: the generator returned by L{fastaIterator}
+    @rtype: a generator object
+
+    @see: L{fastaIterator}
+    @see: L{fastaNucIterator}
+    '''
+    proteic = fastaIterator(file, AASequence,tagparser)
+    return proteic
+
+def formatFasta(data,gbmode=False,upper=False,restrict=None):
+    '''
+    Format one sequence or several sequences as fasta text.
+
+    Each record is a title line C{>id info definition} followed by the
+    sequence cut in lines of 60 characters. The info part lists the
+    C{key=value;} pairs of the sequence followed by its raw,
+    not yet parsed, tags.
+
+    @param data: sequence or a set of sequences
+    @type data: BioSequence instance or an iterable object
+                on BioSequence instances
+
+    @param gbmode: if C{True} the identifier is written C{gi|<gi>|<id>}
+                   when the sequence has a C{gi} key and C{lcl|<id>|}
+                   otherwise.
+    @type gbmode: bool
+
+    @param upper: if C{True} the sequence is written in upper case
+    @type upper: bool
+
+    @param restrict: the keys allowed in the C{key=value;} pairs;
+                     C{None} (default) allows every key.
+    @type restrict: any iterable value or None
+
+    @return: a fasta formated string
+    @rtype: str
+    '''
+    if isinstance(data, BioSequence):
+        data = [data]
+
+    if not (restrict is None or isinstance(restrict, set)):
+        restrict = set(restrict)
+
+    records = []
+    for sequence in data:
+        seq = str(sequence)
+
+        definition = sequence.definition
+        if definition is None:
+            definition = ''
+
+        chunks = [seq[start:start+60] for start in xrange(0,len(seq),60)]
+        if upper:
+            chunks = [chunk.upper() for chunk in chunks]
+        frgseq = '\n'.join(chunks)
+
+        tags = []
+        for item in sequence.rawiteritems():
+            if restrict is None or item[0] in restrict:
+                tags.append('%s=%s' % item)
+        info = '; '.join(tags)
+        if info:
+            info += ';'
+
+        rawinfo = sequence._rawinfo
+        if rawinfo is not None and rawinfo:
+            info += " " + rawinfo.strip()
+
+        id = sequence.id
+        if gbmode:
+            if 'gi' in sequence:
+                id = "gi|%s|%s" % (sequence['gi'],id)
+            else:
+                id = "lcl|%s|" % (id)
+
+        records.append('>%s %s %s\n%s' % (id,info,definition,frgseq))
+
+    return '\n'.join(records)
+
+def formatSAPFastaGenerator(options):
+    '''
+    Build a fasta formatting function for the SAP software.
+
+    The taxonomy is loaded with C{loadTaxonomyDatabase(options)} and
+    read from C{options.taxonomy}.
+
+    @param options: the option object holding the taxonomy
+
+    @return: the formatting function
+
+    @raise AssertionError: if no taxonomy is available
+    '''
+    loadTaxonomyDatabase(options)
+
+    taxonomy = options.taxonomy
+
+    assert taxonomy is not None,"SAP formating require indication of a taxonomy database"
+
+    ranknames = ('superkingdom', 'kingdom', 'subkingdom', 'superphylum',
+                 'phylum', 'subphylum', 'superclass', 'class', 'subclass',
+                 'infraclass', 'superorder', 'order', 'suborder', 'infraorder',
+                 'parvorder', 'superfamily', 'family', 'subfamily', 'supertribe', 'tribe',
+                 'subtribe', 'supergenus', 'genus', 'subgenus', 'species group',
+                 'species subgroup', 'species', 'subspecies')
+
+    # keep, in the order above, the indices of the ranks known by the taxonomy
+    known = set(taxonomy._ranks)
+    ranks = [taxonomy._ranks.index(name) for name in ranknames if name in known]
+
+    def formatSAPFasta(data,gbmode=False,upper=False,restrict=None):
+        '''
+        Format one sequence or several sequences as fasta text with
+        the title line recommended for the SAP software
+
+        http://ib.berkeley.edu/labs/slatkin/munch/StatisticalAssignmentPackage.html
+
+        The title line holds the identifier followed by the list of
+        C{rank: name} pairs of the taxon lineage and the name of the
+        taxon itself.
+
+        @param data: sequence or a set of sequences
+        @type data: BioSequence instance or an iterable object
+                    on BioSequence instances
+
+        @param gbmode: if C{True} the identifier is written
+                       C{gi|<gi>|<id>} when the sequence has a C{gi}
+                       key and C{lcl|<id>|} otherwise.
+        @type gbmode: bool
+
+        @param upper: if C{True} the sequence is written in upper case
+        @type upper: bool
+
+        @param restrict: converted to a C{set} but not used further
+                         by this function
+        @type restrict: any iterable value or None
+
+        @return: a fasta formated string
+        @rtype: str
+
+        @raise AssertionError: if a sequence has no C{taxid} key
+        '''
+        if isinstance(data, BioSequence):
+            data = [data]
+
+        if not (restrict is None or isinstance(restrict, set)):
+            restrict = set(restrict)
+
+        records = []
+        for sequence in data:
+            seq = str(sequence)
+
+            chunks = [seq[start:start+60] for start in xrange(0,len(seq),60)]
+            if upper:
+                chunks = [chunk.upper() for chunk in chunks]
+            frgseq = '\n'.join(chunks)
+
+            try:
+                taxid = sequence["taxid"]
+            except KeyError:
+                raise AssertionError('All sequence must have a taxid')
+
+            lineage = []
+            for r in ranks:
+                taxon = taxonomy.getTaxonAtRank(taxid,r)
+                if taxon is not None:
+                    lineage.append(' %s: %s,' % (taxonomy._ranks[r],taxonomy.getPreferedName(taxon)))
+
+            # the last character (trailing comma) is dropped below
+            definition = ' ;' + ''.join(lineage)
+            definition = '%s ; %s' % (definition[0:-1],taxonomy.getPreferedName(taxid))
+
+            id = sequence.id
+            if gbmode:
+                if 'gi' in sequence:
+                    id = "gi|%s|%s" % (sequence['gi'],id)
+                else:
+                    id = "lcl|%s|" % (id)
+
+            records.append('>%s%s\n%s' % (id,definition,frgseq))
+
+        return '\n'.join(records)
+
+    return formatSAPFasta
+
+#class FastaIterator(SequenceFileIterator):
+#
+#
+# entryIterator = genericEntryIteratorGenerator(startEntry='>')
+# classmethod(entryIterator)
+#
+# def __init__(self,inputfile,bioseqfactory=bioSeqGenerator,
+# tagparser=_default_raw_parser,
+# joinseq=_fastaJoinSeq):
+#
+# SequenceFileIterator.__init__(self, inputfile, bioseqfactory)
+#
+# self.__file = FastaIterator.entryIterator(self._inputfile)
+#
+# self._tagparser = tagparser
+# self._joinseq = joinseq
+#
+# def get_tagparser(self):
+# return self.__tagparser
+#
+#
+# def set_tagparser(self, value):
+# self._rawparser = value
+# allparser = value % '[a-zA-Z][a-zA-Z0-9_]*'
+# self.__tagparser = re.compile('( *%s)+' % allparser)
+#
+# def _parseFastaDescription(self,ds):
+#
+# m = self._tagparser.search(' '+ds)
+# if m is not None:
+# info=m.group(0)
+# definition = ds[m.end(0):].strip()
+# else:
+# info=None
+# definition=ds
+#
+# return definition,info
+#
+#
+# def _parser(self):
+# '''
+# Parse a fasta record.
+#
+# @attention: internal purpose function
+#
+# @return: a C{BioSequence} instance
+# '''
+# seq = self._seq.split('\n')
+# title = seq[0].strip()[1:].split(None,1)
+# id=title[0]
+# if len(title) == 2:
+# definition,info=self._parseFastaDescription(title[1])
+# else:
+# info= None
+# definition=None
+#
+# seq=self._joinseq(seq[1:])
+#
+# return self._bioseqfactory(id, seq, definition,info,self._rawparser)
+#
+# _tagparser = property(get_tagparser, set_tagparser, None, "_tagparser's docstring")
diff --git a/src/obitools/fastq/__init__.py b/src/obitools/fastq/__init__.py
new file mode 100644
index 0000000..aa492c2
--- /dev/null
+++ b/src/obitools/fastq/__init__.py
@@ -0,0 +1,19 @@
+'''
+Created on 29 aout 2009
+
+ at author: coissac
+'''
+
+from _fastq import fastqQualitySangerDecoder,fastqQualitySolexaDecoder
+from _fastq import qualityToSangerError,qualityToSolexaError
+from _fastq import errorToSangerFastQStr
+from _fastq import formatFastq
+from _fastq import fastqParserGenetator
+from _fastq import fastqAAIterator,fastqIlluminaIterator,fastqSolexaIterator, \
+ fastqSangerIterator, fastqIterator, fastqEntryIterator
+from _fastq import fastFastqParserGenetator
+from _fastq import fastFastqIlluminaIterator,fastFastqSolexaIterator, \
+ fastFastqSangerIterator, fastFastqIterator
+
+
+
diff --git a/src/obitools/fastq/_fastq.pyx b/src/obitools/fastq/_fastq.pyx
new file mode 100644
index 0000000..4d2f318
--- /dev/null
+++ b/src/obitools/fastq/_fastq.pyx
@@ -0,0 +1,530 @@
+# cython: profile=True
+
+'''
+Created on 16 sept. 2009
+
+ at author: coissac
+'''
+
+
+#from obitools.fasta._fasta cimport *
+from cpython cimport array
+
+
+from obitools.fasta._fasta cimport fastParseFastaDescription
+from obitools.fasta._fasta cimport parseFastaDescription
+from obitools._obitools cimport BioSequence
+from obitools._obitools cimport __default_raw_parser
+from obitools._obitools cimport AASequence,NucSequence
+
+from obitools import bioSeqGenerator
+from obitools.format.genericparser import genericEntryIteratorGenerator
+from obitools.utils import universalOpen
+
+import re
+import sys
+
+cdef import from "math.h" :
+ double log10(double x)
+ double rint(double x)
+
+cdef import from "string.h":
+ int strlen(char* s)
+ void bzero(void *s, size_t n)
+
+cdef import from "stdlib.h":
+ void* malloc(int size) except NULL
+ void* realloc(void* chunk,int size) except NULL
+ void free(void* chunk)
+
+
+# Entry iterator for fastq: an entry starts at a line beginning with '@' and
+# ends at a line beginning with '+'. The quality lines following the '+' line
+# are read separately by fastqIterator / fastFastqIterator.
+fastqEntryIterator=genericEntryIteratorGenerator(startEntry=b'^@',endEntry=b"^\+",strip=True,join=False)
+
+
+
+# Decode a fastq quality string into an array of doubles: each character
+# code minus 'base'. With base=0 the offset is guessed from the string:
+# 33 when its smallest character code is below 59, 64 otherwise.
+cpdef array.array[double] fastqQualityDecoder(char* qualstring, int base=0):
+ cdef int i=0
+ cdef int mq=255
+ cdef object oaddresse,olength
+ cdef int length
+ cdef array.array quality
+ cdef double* bdouble
+
+# quality = array.array(b'd',[0]*strlen(qualstring))
+ # a one element array is created then resized to the string length
+ quality = array.array(b'd',[0])
+
+# print >>sys.stderr,"+@@> ",sys.getrefcount(quality)
+
+ array.resize(quality,strlen(qualstring))
+# (oaddress,olength)=quality.buffer_info()
+ # direct access to the array buffer as C doubles
+ bdouble=quality.data.as_doubles
+
+ if base==0:
+ # first pass: look for the smallest character code
+ mq = 255
+ while (qualstring[i]!=0):
+ if qualstring[i]<mq:
+ mq=qualstring[i]
+ i+=1
+ if mq < 59:
+ base=33
+ else:
+ base=64
+
+ # second pass: store the decoded values
+ i=0
+ while (qualstring[i]!=0):
+ bdouble[i]=qualstring[i]-base
+ i+=1
+
+ return quality
+
+cpdef array.array[double] fastqQualitySangerDecoder(char* qualstring):
+    # Decode a quality string with the fixed offset 33.
+    cdef int offset = 33
+    return fastqQualityDecoder(qualstring,offset)
+
+cpdef array.array[double] fastqQualitySolexaDecoder(char* qualstring):
+    # Decode a quality string with the fixed offset 64.
+    cdef int offset = 64
+    return fastqQualityDecoder(qualstring,offset)
+
+cpdef array.array[double] qualityToSolexaError(array.array quality):
+    '''
+    Convert, in place, Solexa quality scores to error probabilities.
+
+    Each value q is replaced by C{1/(1+10**(q/10))}.
+
+    @param quality: the quality scores; the buffer is accessed as C
+                    doubles, so the array must have typecode C{'d'}
+    @type quality: C{array.array}
+
+    @return: the same array instance, now holding the probabilities
+    @rtype: C{array.array}
+    '''
+    cdef int i
+    cdef int lq = len(quality)
+    # Direct buffer access, as done in fastqQualityDecoder, instead of
+    # casting the buffer_info() address through an unsigned long, which
+    # truncates the pointer where long is narrower than a pointer.
+    cdef double* bdouble = quality.data.as_doubles
+
+    for i in range(lq):
+        bdouble[i]=1/(1+10.**(bdouble[i]/10.))
+
+    return quality
+
+cpdef array.array[double] qualityToSangerError(array.array quality):
+    '''
+    Convert, in place, Sanger quality scores to error probabilities.
+
+    Each value q is replaced by C{10**(-q/10)}.
+
+    @param quality: the quality scores; the buffer is accessed as C
+                    doubles, so the array must have typecode C{'d'}
+    @type quality: C{array.array}
+
+    @return: the same array instance, now holding the probabilities
+    @rtype: C{array.array}
+    '''
+    cdef int i
+    cdef int lq = len(quality)
+    # Direct buffer access, as done in fastqQualityDecoder, instead of
+    # casting the buffer_info() address through an unsigned long, which
+    # truncates the pointer where long is narrower than a pointer.
+    cdef double* bdouble = quality.data.as_doubles
+
+    for i in range(lq):
+        bdouble[i]=10.**(-bdouble[i]/10.)
+
+    return quality
+
+cpdef array.array[double] errorToSangerQuality(array.array quality):
+    '''
+    Convert, in place, error probabilities to Sanger quality scores.
+
+    Each value p is replaced by C{-rint(log10(p)*10)}.
+
+    @param quality: the error probabilities; the buffer is accessed as
+                    C doubles, so the array must have typecode C{'d'}
+    @type quality: C{array.array}
+
+    @return: the same array instance, now holding the quality scores
+    @rtype: C{array.array}
+    '''
+    cdef int i
+    cdef int lq = len(quality)
+    # Direct buffer access, as done in fastqQualityDecoder, instead of
+    # casting the buffer_info() address through an unsigned long, which
+    # truncates the pointer where long is narrower than a pointer.
+    cdef double* bdouble = quality.data.as_doubles
+
+    for i in range(lq):
+        bdouble[i]=-rint(log10(bdouble[i])*10)
+
+    return quality
+
+cpdef array.array[double] solexaToSangerQuality(array.array quality):
+    '''
+    Convert, in place, Solexa quality scores to Sanger quality scores.
+
+    Each value q is replaced by C{-rint(log10(1/(1+10**(q/10)))*10)}.
+
+    @param quality: the Solexa scores; the buffer is accessed as C
+                    doubles, so the array must have typecode C{'d'}
+    @type quality: C{array.array}
+
+    @return: the same array instance, now holding the Sanger scores
+    @rtype: C{array.array}
+    '''
+    cdef int i
+    cdef int lq = len(quality)
+    # Direct buffer access, as done in fastqQualityDecoder, instead of
+    # casting the buffer_info() address through an unsigned long, which
+    # truncates the pointer where long is narrower than a pointer.
+    cdef double* bdouble = quality.data.as_doubles
+
+    for i in range(lq):
+        bdouble[i]=-rint(log10(1/(1+10.**(bdouble[i]/10.)))*10)
+
+    return quality
+
+cpdef bytes errorToSangerFastQStr(array.array quality):
+    '''
+    Encode error probabilities as a fastq quality string (offset 33).
+
+    Each value p gives the character of code C{33 + q} where q is
+    C{-rint(log10(p)*10)} limited to the range 0..93.
+
+    @param quality: the error probabilities; the buffer is accessed as
+                    C doubles, so the array must have typecode C{'d'}
+    @type quality: C{array.array}
+
+    @return: the encoded quality string
+    @rtype: C{bytes}
+    '''
+    cdef int i
+    cdef int lq = len(quality)
+    cdef double proba
+    # Direct buffer access, as done in fastqQualityDecoder, instead of
+    # casting the buffer_info() address through an unsigned long.
+    cdef double* bdouble = quality.data.as_doubles
+    cdef char* result
+    cdef bytes code
+
+    result=<char *>malloc(lq+1)
+    result[lq]=0
+
+    for i in range(lq):
+        proba=-rint(log10(bdouble[i])*10)
+        if proba > 93.:
+            proba=93.
+        elif proba < 0.:
+            # a value above 1 would give a character below '!'
+            proba=0.
+        result[i]=33 + <int>proba
+    # copy the C string into a Python object before releasing the buffer
+    code=result
+    free(<void *>result)
+    return code
+
+cpdef bytes formatFastq(object data, bint gbmode=False, bint upper=False):
+    '''
+    Format one sequence or several sequences as fastq text.
+
+    Each record is a title line C{@id info definition}, the sequence,
+    a C{+} line and the quality line. The quality line is computed by
+    L{errorToSangerFastQStr} from the C{quality} attribute of the
+    sequence, or filled with C{I} when the sequence has none.
+
+    @param data: sequence or a set of sequences
+    @type data: BioSequence instance or an iterable object
+                on BioSequence instances
+
+    @param gbmode: if C{True} the identifier is written C{gi|<gi>|<id>}
+                   when the sequence has a C{gi} key and C{lcl|<id>|}
+                   otherwise.
+    @type gbmode: bool
+
+    @param upper: if C{True} the sequence is written in upper case
+    @type upper: bool
+
+    @return: a fastq formated string
+    @rtype: C{bytes}
+    '''
+    cdef list records=[]
+    cdef bytes seq
+    cdef bytes definition
+    cdef bytes info
+    cdef bytes quality
+    cdef bytes id
+
+    if isinstance(data, BioSequence):
+        data = [data]
+
+    for sequence in data:
+        seq = str(sequence)
+        if upper:
+            seq = seq.upper()
+
+        definition = sequence.definition
+        if definition is None:
+            definition = b''
+
+        tags = []
+        for item in sequence.rawiteritems():
+            tags.append(b'%s=%s' % item)
+        info = b'; '.join(tags)
+        if info:
+            info = info+b';'
+
+        rawinfo = sequence._rawinfo
+        if rawinfo is not None and rawinfo:
+            info += b" " + rawinfo.strip()
+
+        id = sequence.id
+        if gbmode:
+            if b'gi' in sequence:
+                id = bytes(b"gi|%s|%s" % (sequence[b'gi'],id))
+            else:
+                id = b"lcl|%s|" % (id)
+
+        if not hasattr(sequence, b"quality"):
+            quality = b"I"*len(sequence)
+        else:
+            quality = errorToSangerFastQStr(sequence.quality)
+
+        records.append(b"@%s %s %s\n%s\n+\n%s" % (id,info,definition,seq,quality))
+
+    return b'\n'.join(records)
+
+
+# Names of the two fastq quality encodings.
+# NOTE(review): no use of this enum is visible nearby -- confirm it is needed.
+cdef enum FastqType:
+ sanger,solexa
+
+cdef class fastqParserGenetator:
+    '''
+    Callable object converting one fastq entry, given as the list
+    C{[title, sequence, '+', quality]}, into a sequence object with a
+    C{quality} attribute holding error probabilities.
+    '''
+
+    cdef object bioseqfactory
+    cdef object tagparser
+    cdef object rawparser
+    # True selects the Sanger function, False the Solexa one
+    cdef bint _qualityDecoder
+    cdef bint _errorDecoder
+
+    def __init__(self,fastqvariant=b'sanger',bioseqfactory=NucSequence,tagparser=__default_raw_parser):
+        '''
+        @param fastqvariant: C{'sanger'}, C{'solexa'} or C{'illumina'}
+        @param bioseqfactory: a callable building the sequence object
+        @param tagparser: raw tag pattern with one C{%s} placeholder
+                          for the tag name
+
+        @raise KeyError: if C{fastqvariant} is not a known variant
+        '''
+        self.bioseqfactory = bioseqfactory
+        self.rawparser = tagparser
+        self.tagparser = re.compile(b'( *%s)+' % (tagparser % b'[a-zA-Z][a-zA-Z.0-9_]*'))
+
+        # variant -> (quality decoder flag, error decoder flag)
+        variants = {b'sanger'   : (True,True),
+                    b'solexa'   : (False,False),
+                    b'illumina' : (False,True)}
+        self._qualityDecoder, self._errorDecoder = variants[fastqvariant]
+
+    cdef errorDecoder(self,object qualstring):
+        if not self._errorDecoder:
+            return qualityToSolexaError(qualstring)
+        return qualityToSangerError(qualstring)
+
+    cdef qualityDecoder(self,char* qualstring):
+        if not self._qualityDecoder:
+            return fastqQualitySolexaDecoder(qualstring)
+        return fastqQualitySangerDecoder(qualstring)
+
+    def __call__(self,seq):
+        '''
+        @param seq: the entry as C{[title, sequence, '+', quality]}
+
+        @return: the sequence built by the sequence factory
+        '''
+        cdef str definition
+        cdef str info
+        cdef str id
+        cdef str s0
+        cdef str tseq
+        cdef bytes tqual
+
+        s0 = seq[0]
+        # the first character of the title line ('@') is dropped
+        title = s0[1:].split(None,1)
+        id = title[0]
+        if len(title) != 2:
+            info = None
+            definition = None
+        else:
+            definition,info = parseFastaDescription(title[1], self.tagparser)
+
+        tqual = seq[3]
+        quality = self.errorDecoder(self.qualityDecoder(tqual))
+
+        tseq = seq[1]
+
+        record = self.bioseqfactory(id, tseq, definition,info,self.rawparser)
+        record.quality = quality
+
+        return record
+
+def fastqIterator(file,fastqvariant=b'sanger',bioseqfactory=NucSequence,tagparser=__default_raw_parser):
+    '''
+    Iterate through a fastq file sequence by sequence.
+
+    Each entry returned by C{fastqEntryIterator} gives the title and
+    the sequence lines; quality lines are then read from the file
+    until as many quality characters as sequence characters are
+    collected.
+
+    @param file: the input handed over to C{universalOpen}
+
+    @param fastqvariant: C{'sanger'}, C{'solexa'} or C{'illumina'}
+
+    @param bioseqfactory: a callable object return a BioSequence
+                          instance.
+    @type bioseqfactory: a callable object
+
+    @param tagparser: raw tag pattern with one C{%s} placeholder for
+                      the tag name
+
+    @return: a generator on the sequences built by
+             L{fastqParserGenetator}
+
+    @see: L{fastqSangerIterator}
+    @see: L{fastqAAIterator}
+    '''
+    parse = fastqParserGenetator(fastqvariant, bioseqfactory, tagparser)
+    stream = universalOpen(file)
+    for entry in fastqEntryIterator(stream):
+        title = entry[0]
+        seq = b"".join(entry[1:-1])
+        expected = len(seq)
+
+        # the quality may span several lines, as the sequence does
+        pieces = []
+        collected = 0
+        while collected < expected:
+            piece = stream.next().strip()
+            pieces.append(piece)
+            collected += len(piece)
+
+        yield parse([title,seq,b'+',b''.join(pieces)])
+
+def fastqSangerIterator(file,tagparser=__default_raw_parser):
+    '''
+    Iterate through a fastq file with the C{'sanger'} variant,
+    building each sequence with C{NucSequence}.
+
+    @param file: passed unchanged to L{fastqIterator}
+    @param tagparser: raw tag pattern passed unchanged to L{fastqIterator}
+
+    @return: the generator returned by L{fastqIterator}
+
+    @see: L{fastqIterator}
+    @see: L{fastqAAIterator}
+    '''
+    reader = fastqIterator(file,b'sanger',NucSequence,tagparser)
+    return reader
+
+def fastqSolexaIterator(file,tagparser=__default_raw_parser):
+    '''
+    Iterate through a fastq file with the C{'solexa'} variant,
+    building each sequence with C{NucSequence}.
+
+    @param file: passed unchanged to L{fastqIterator}
+    @param tagparser: raw tag pattern passed unchanged to L{fastqIterator}
+
+    @return: the generator returned by L{fastqIterator}
+
+    @see: L{fastqIterator}
+    @see: L{fastqAAIterator}
+    '''
+    reader = fastqIterator(file,b'solexa',NucSequence,tagparser)
+    return reader
+
+def fastqIlluminaIterator(file,tagparser=__default_raw_parser):
+    '''
+    Iterate through a fastq file with the C{'illumina'} variant,
+    building each sequence with C{NucSequence}.
+
+    @param file: passed unchanged to L{fastqIterator}
+    @param tagparser: raw tag pattern passed unchanged to L{fastqIterator}
+
+    @return: the generator returned by L{fastqIterator}
+
+    @see: L{fastqIterator}
+    @see: L{fastqAAIterator}
+    '''
+    reader = fastqIterator(file,b'illumina',NucSequence,tagparser)
+    return reader
+
+def fastqAAIterator(file,tagparser=__default_raw_parser):
+    '''
+    Iterate through a fastq file with the C{'sanger'} variant,
+    building each sequence with C{AASequence}.
+
+    @param file: passed unchanged to L{fastqIterator}
+    @param tagparser: raw tag pattern passed unchanged to L{fastqIterator}
+
+    @return: the generator returned by L{fastqIterator}
+
+    @see: L{fastqIterator}
+    @see: L{fastqSangerIterator}
+    '''
+    reader = fastqIterator(file,b'sanger',AASequence,tagparser)
+    return reader
+
+
+
+cdef class fastFastqParserGenetator(fastqParserGenetator):
+    '''
+    Variant of L{fastqParserGenetator} always building C{NucSequence}
+    instances and parsing the title line with
+    C{fastParseFastaDescription}.
+    '''
+
+    def __init__(self,fastqvariant=b'sanger'):
+        '''
+        @param fastqvariant: C{'sanger'}, C{'solexa'} or C{'illumina'}
+
+        @raise KeyError: if C{fastqvariant} is not a known variant
+        '''
+
+        self.rawparser=__default_raw_parser
+
+        # Sanger = True
+        # Solexa = False
+        self._qualityDecoder, self._errorDecoder = {b'sanger'   : (True,True),
+                                                    b'solexa'   : (False,False),
+                                                    b'illumina' : (False,True)}[fastqvariant]
+
+
+    def __call__(self, list seq):
+        '''
+        @param seq: the entry as C{[title, sequence, '+', quality]}
+
+        @return: a C{NucSequence} with its C{quality} attribute set
+        '''
+
+        cdef bytes s0 = seq[0]
+        cdef list title = s0.split(None,1)
+        # identifier = first word of the title line without its leading '@'
+        cdef bytes id = title[0][1:]
+        # was declared as 'defintion', which left 'definition' untyped
+        cdef bytes definition,info
+        cdef bytes tqual = seq[3]
+        cdef bytes tseq = seq[1]
+        cdef object sseq
+
+        if len(title) == 2:
+            definition,info=fastParseFastaDescription(title[1])
+        else:
+            info= None
+            definition=None
+#FIXME: look here
+        quality=self.errorDecoder(self.qualityDecoder(tqual))
+
+#        print >>sys.stderr,b"@@@> ",sys.getrefcount(quality)
+
+        sseq = NucSequence(id, tseq, definition,info,__default_raw_parser)
+        sseq.quality = quality
+
+        return sseq
+
+def fastFastqIterator(file,fastqvariant=b'sanger'):
+    '''
+    Iterate through a fastq file sequence by sequence, building the
+    sequences with L{fastFastqParserGenetator}.
+
+    Each entry returned by C{fastqEntryIterator} gives the title and
+    the sequence lines; quality lines are then read from the file
+    until as many quality characters as sequence characters are
+    collected.
+
+    @param file: the input handed over to C{universalOpen}
+
+    @param fastqvariant: C{'sanger'}, C{'solexa'} or C{'illumina'}
+
+    @return: a generator on the sequences built
+
+    @see: L{fastqIterator}
+    '''
+    parse = fastFastqParserGenetator(fastqvariant)
+    stream = universalOpen(file)
+    for entry in fastqEntryIterator(stream):
+        title = entry[0]
+        seq = b"".join(entry[1:-1])
+        expected = len(seq)
+
+        # the quality may span several lines, as the sequence does
+        pieces = []
+        collected = 0
+        while collected < expected:
+            piece = stream.next().strip()
+            pieces.append(piece)
+            collected += len(piece)
+
+        yield parse([title,seq,b'+',b''.join(pieces)])
+
+def fastFastqSangerIterator(file):
+    '''
+    Iterate through a fastq file with the C{'sanger'} variant of
+    L{fastFastqIterator}.
+
+    @param file: passed unchanged to L{fastFastqIterator}
+
+    @return: the generator returned by L{fastFastqIterator}
+
+    @see: L{fastFastqIterator}
+    '''
+    reader = fastFastqIterator(file,b'sanger')
+    return reader
+
+def fastFastqSolexaIterator(file):
+    '''
+    Iterate through a fastq file with the C{'solexa'} variant of
+    L{fastFastqIterator}.
+
+    @param file: passed unchanged to L{fastFastqIterator}
+
+    @return: the generator returned by L{fastFastqIterator}
+
+    @see: L{fastFastqIterator}
+    '''
+    reader = fastFastqIterator(file,b'solexa')
+    return reader
+
+def fastFastqIlluminaIterator(file):
+    '''
+    Iterate through a fastq file with the C{'illumina'} variant of
+    L{fastFastqIterator}.
+
+    @param file: passed unchanged to L{fastFastqIterator}
+
+    @return: the generator returned by L{fastFastqIterator}
+
+    @see: L{fastFastqIterator}
+    '''
+    reader = fastFastqIterator(file,b'illumina')
+    return reader
diff --git a/src/obitools/fnaqual/__init__.py b/src/obitools/fnaqual/__init__.py
new file mode 100644
index 0000000..384eb96
--- /dev/null
+++ b/src/obitools/fnaqual/__init__.py
@@ -0,0 +1,2 @@
+
+fnaTag=' %s *= *([^\s]+)'
diff --git a/src/obitools/fnaqual/fasta.py b/src/obitools/fnaqual/fasta.py
new file mode 100644
index 0000000..102a13e
--- /dev/null
+++ b/src/obitools/fnaqual/fasta.py
@@ -0,0 +1,8 @@
+from obitools.fasta import fastaNucIterator
+from obitools.fnaqual import fnaTag
+
+def fnaFastaIterator(file):
+    '''
+    Build a nucleic fasta iterator over C{file} that uses the
+    C{fnaTag} pattern as its tag parser.
+
+    @param file: input passed unchanged to C{fastaNucIterator}
+
+    @return: the iterator returned by C{fastaNucIterator}
+    '''
+    return fastaNucIterator(file, fnaTag)
\ No newline at end of file
diff --git a/src/obitools/fnaqual/quality.py b/src/obitools/fnaqual/quality.py
new file mode 100644
index 0000000..092f610
--- /dev/null
+++ b/src/obitools/fnaqual/quality.py
@@ -0,0 +1,137 @@
+"""
+
+
+"""
+
+from obitools import _default_raw_parser
+from obitools.fasta import fastaIterator
+from obitools.fnaqual import fnaTag
+from obitools.location import Location
+
+import re
+
+
+class QualitySequence(list):
+    '''
+    List of quality values carrying the identifier, the definition
+    and the tags of the sequence record they were read from.
+
+    Tags given as keyword arguments are stored directly. Other tags
+    are searched for in the raw tag text, with the C{rawparser}
+    pattern, the first time they are requested.
+    '''
+
+    def __init__(self,id,seq,definition=None,rawinfo=None,rawparser=_default_raw_parser,**info):
+        '''
+        @param id: sequence identifier
+        @param seq: iterable of values used to fill the list
+        @param definition: sequence definition
+        @param rawinfo: unparsed tag text; C{None} is handled as an
+                        empty text
+        @param rawparser: pattern template with one C{%s} placeholder
+                          receiving the tag name; its first group
+                          must capture the tag value
+        @param info: tags that are already parsed
+        '''
+        list.__init__(self,seq)
+        self._info = info
+        self.definition=definition
+        self.id=id
+        # The default rawinfo=None used to raise a TypeError on the
+        # concatenation below.
+        if rawinfo is None:
+            rawinfo=''
+        self._rawinfo=' ' + rawinfo
+        self._rawparser=rawparser
+
+    def getDefinition(self):
+        '''
+        Sequence definition getter
+
+        @return: the sequence definition
+        @rtype: str
+
+        '''
+        return self._definition
+
+    def setDefinition(self, value):
+        self._definition = value
+
+    def getId(self):
+        return self._id
+
+    def setId(self, value):
+        self._id = value
+
+    def getKey(self,key):
+        '''
+        Return the value of the tag C{key}.
+
+        A tag missing from the parsed tags is searched for in the raw
+        tag text. When found, it is removed from that text, evaluated
+        as a Python expression when possible (kept as a string
+        otherwise) and stored with the parsed tags.
+
+        @param key: tag name
+        @type key: C{str}
+
+        @return: the tag value
+
+        @raise KeyError: if the tag is neither parsed nor present in
+                         the raw tag text
+        '''
+        if key in self._info:
+            return self._info[key]
+
+        m = re.compile(self._rawparser % key).search(self._rawinfo)
+        if m is None:
+            raise KeyError(key)
+
+        v=m.group(1)
+        self._rawinfo=' ' + self._rawinfo[0:m.start(0)]+self._rawinfo[m.end(0):]
+        # NOTE(review): eval() executes text read from the input file,
+        # which is unsafe on untrusted data. Kept for compatibility.
+        try:
+            v = eval(v)
+        except Exception:
+            # Not a valid Python expression: keep the raw string.
+            pass
+        self._info[key]=v
+        return v
+
+    def __getitem__(self,key):
+        '''
+        Dispatch on the type of C{key}:
+
+            - C{Location}: delegate to C{key.extractSequence}
+            - C{str}: return the tag of that name
+            - C{int}: return the quality value at that position
+            - C{slice}: return a new C{QualitySequence} restricted to
+              the slice, with a C{cut} tag describing it
+
+        @raise TypeError: for any other key type
+        '''
+        if isinstance(key,Location):
+            return key.extractSequence(self)
+        elif isinstance(key, str):
+            # Was self._getKey, a method that does not exist.
+            return self.getKey(key)
+        elif isinstance(key, int):
+            return list.__getitem__(self,key)
+        elif isinstance(key, slice):
+            subseq=list.__getitem__(self,key)
+            info = dict(self._info)
+            if key.start is not None:
+                start = key.start +1
+            else:
+                start = 1
+            if key.stop is not None:
+                stop = key.stop+1
+            else:
+                stop = len(self)
+            if key.step is not None:
+                step = key.step
+            else:
+                step = 1
+
+            info['cut']='[%d,%d,%s]' % (start,stop,step)
+            return QualitySequence(self.id, subseq, self.definition,self._rawinfo,self._rawparser,**info)
+
+        raise TypeError('key must be an integer, a str or a slice')
+
+    def __setitem__(self,key,value):
+        # Item assignment sets a tag, not a quality value.
+        self._info[key]=value
+
+    def __delitem__(self,key):
+        if isinstance(key, str):
+            del self._info[key]
+        else:
+            raise TypeError(key)
+
+    def __iter__(self):
+        return list.__iter__(self)
+
+    def __contains__(self,key):
+        # Membership is tested on the parsed tags only.
+        return key in self._info
+
+    def getTags(self):
+        return self._info
+
+    def complement(self):
+        '''
+        Return a copy with the values in reverse order and the
+        C{complemented} tag toggled.
+        '''
+        cseq = self[::-1]
+        rep = QualitySequence(self.id,cseq,self.definition,self._rawinfo,self._rawparser,**self._info)
+        rep._info['complemented']=not rep._info.get('complemented',False)
+        return rep
+
+
+    definition = property(getDefinition, setDefinition, None, "Sequence Definition")
+
+    id = property(getId, setId, None, 'Sequence identifier')
+
+
+def _qualityJoinSeq(seqarray):
+    '''
+    Convert the text lines of a quality record into the list of
+    the integer values they contain.
+
+    @param seqarray: iterable of text lines holding whitespace
+                     separated integers
+
+    @return: the values of all lines, in order
+    @rtype: C{list} of C{int}
+    '''
+    return [int(token) for line in seqarray for token in line.split()]
+
+def qualityIterator(file):
+ for q in fastaIterator(file, QualitySequence, fnaTag, _qualityJoinSeq):
+ yield q
+
+
+
\ No newline at end of file
diff --git a/src/obitools/format/__init__.py b/src/obitools/format/__init__.py
new file mode 100644
index 0000000..a680505
--- /dev/null
+++ b/src/obitools/format/__init__.py
@@ -0,0 +1,28 @@
+from obitools import bioSeqGenerator
+from obitools.utils import universalOpen
+
+
+class SequenceFileIterator:
+    '''
+    Base iterator turning the entries read from an input file into
+    parsed objects.
+
+    Each entry read from the opened input is passed to
+    C{self._parse}. That method is not defined in this class.
+    '''
+
+    def __init__(self,inputfile,bioseqfactory=bioSeqGenerator):
+        '''
+        @param inputfile: input handed to C{universalOpen}
+        @param bioseqfactory: factory kept available through the
+                              C{_bioseqfactory} property
+        '''
+        # Store under the private names read by the getters below. The
+        # former code assigned to the read-only property names instead,
+        # leaving the getters without anything to return.
+        self.__file = universalOpen(inputfile)
+        self.__bioseqfactory = bioseqfactory
+
+    def get_inputfile(self):
+        return self.__file
+
+
+    def get_bioseqfactory(self):
+        return self.__bioseqfactory
+
+    def next(self):
+        # Was self.inputfile, an attribute that is never defined.
+        entry = self._inputfile.next()
+        return self._parse(entry)
+
+    def __iter__(self):
+        return self
+
+    _inputfile = property(get_inputfile, None, None, "_file's docstring")
+    _bioseqfactory = property(get_bioseqfactory, None, None, "_bioseqfactory's docstring")
+
+
\ No newline at end of file
diff --git a/src/obitools/format/_format.pyx b/src/obitools/format/_format.pyx
new file mode 100644
index 0000000..5a7c7d2
--- /dev/null
+++ b/src/obitools/format/_format.pyx
@@ -0,0 +1,19 @@
+# cython: profile=True
+
+import sys
+from obitools.fasta import formatFasta
+#from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
+
+cpdef printOutput(options,seq,output=sys.stdout):
+ if options.output is not None:
+ r=options.output(seq)
+ elif options.outputFormater is not None:
+ r=options.outputFormater(seq,upper=options.uppercase)
+ else:
+ r=formatFasta(seq)
+
+ try:
+ output.write(r)
+ output.write("\n")
+ except IOError:
+ sys.exit(0)
diff --git a/src/obitools/format/genericparser/__init__.py b/src/obitools/format/genericparser/__init__.py
new file mode 100644
index 0000000..bdb2984
--- /dev/null
+++ b/src/obitools/format/genericparser/__init__.py
@@ -0,0 +1,219 @@
+"""
+G{packagetree format}
+"""
+import re
+
+from obitools.utils import universalOpen
+
+from _genericparser import genericEntryIteratorGenerator
+
+#def genericEntryIteratorGenerator(startEntry=None,endEntry=None,
+# head=False,tail=False,
+# strip=False,join=True):
+# '''
+# Transfome a text line iterator to an entry oriented iterator.
+#
+# This iterator converted is useful to implement first stage
+# of flat file parsing.
+#
+# @param startEntry: a regular pattern matching the beginning of
+# an entry
+# @type startEntry: C{str} or None
+# @param endEntry: a regular pattern matching the end of
+# an entry
+# @type endEntry: C{str} or None
+# @param head: indicate if an header is present before
+# the first entry (as in many original genbank
+# files)
+# @type head: C{bool}
+# @param tail: indicate if some extra informations are present
+# after the last entry.
+# @type tail: C{bool}
+#
+# @return: an iterator on entries in text format
+# @rtype: an iterator on C{str}
+# '''
+#
+# def isBeginning(line):
+# return startEntry is None or startEntry.match(line) is not None
+#
+# def isEnding(line):
+# return ((endEntry is not None and endEntry.match(line) is not None) or
+# (endEntry is None and startEntry is not None and startEntry.match(line) is not None))
+#
+# def transparentIteratorEntry(file):
+# file = universalOpen(file)
+# return file
+#
+# def genericEntryIterator(file):
+# file = universalOpen(file)
+# entry = []
+# line = file.next()
+# started = head or isBeginning(line)
+#
+# try:
+# while 1:
+# while not started:
+# line = file.next()
+# started = isBeginning(line)
+#
+# if endEntry is None:
+# entry.append(line)
+# line = file.next()
+#
+# while started:
+# end = isEnding(line)
+# if end:
+# if endEntry is not None:
+# entry.append(line)
+# if join:
+# e = ''.join(entry)
+# if strip:
+# e=e.strip()
+# else:
+# e=entry
+# if strip:
+# e=[x.strip() for x in e]
+# entry=[]
+# yield e
+# started=False
+# if endEntry is not None:
+# line = file.next()
+# else:
+# entry.append(line)
+# line = file.next()
+#
+# started = isBeginning(line)
+#
+# except StopIteration:
+# if entry and (endEntry is None or tail):
+# if join:
+# e = ''.join(entry)
+# if strip:
+# e=e.strip()
+# else:
+# e=entry
+# if strip:
+# e=[x.strip() for x in e]
+# yield e
+#
+#
+#
+# if startEntry is not None:
+# startEntry = re.compile(startEntry)
+# if endEntry is not None:
+# endEntry = re.compile(endEntry)
+#
+# if startEntry is None and endEntry is None:
+# return transparentIteratorEntry
+#
+# return genericEntryIterator
+
+
+class GenericParser(object):
+
+ def __init__(self,
+ startEntry=None,
+ endEntry=None,
+ head=False,
+ tail=False,
+ strip=False,
+ **parseAction):
+ """
+ @param startEntry: a regular pattern matching the beginning of
+ an entry
+ @type startEntry: C{str} or None
+ @param endEntry: a regular pattern matching the end of
+ an entry
+ @type endEntry: C{str} or None
+ @param head: indicate if an header is present before
+ the first entry (as in many original genbank
+ files)
+ @type head: C{bool}
+ @param tail: indicate if some extra informations are present
+ after the last entry.
+ @type tail: C{bool}
+
+ @param parseAction:
+
+ """
+ self.flatiterator= genericEntryIteratorGenerator(startEntry,
+ endEntry,
+ head,
+ tail,
+ strip)
+
+ self.action={}
+
+ for k in parseAction:
+ self.addParseAction(k,*parseAction[k])
+
+ def addParseAction(self,name,dataMatcher,dataCleaner=None,cleanSub=''):
+ '''
+ Add a parse action to the generic parser. A parse action
+ allows to extract one information from an entry. A parse
+ action is defined by a name and a method to extract this
+ information from the full text entry.
+
+ A parse action can be defined following two ways.
+
+ - via regular expression patterns
+
+ - via dedicated function.
+
+ In the first case, you have to indicate at least the
+ dataMatcher regular pattern. This pattern should match exactly
+ the data part you want to retrieve. If cleaning of extra
+ characters is needed, the second pattern, dataCleaner, can be
+ used to specify these characters.
+
+ In the second case you must provide a callable object (function)
+ that extracts and cleans data from the text entry. This function
+ should return a list containing all the data retrieved, even if
+ no data or only one item is retrieved.
+
+ @summary: Add a parse action to the generic parser.
+
+ @param name: name of the data extracted
+ @type name: C{str}
+ @param dataMatcher: a regular pattern matching the data
+ or a callable object parsing the
+ entry and returning a list of matched data
+ @type dataMatcher: C{str} or C{SRE_Pattern} instance or a callable
+ object
+ @param dataCleaner: a regular pattern matching part of the data
+ to suppress.
+ @type dataCleaner: C{str} or C{SRE_Pattern} instance or C{None}
+ @param cleanSub: string used to replace dataCleaner matches.
+ Default is an empty string
+ @type cleanSub: C{str}
+
+ '''
+ if callable(dataMatcher):
+ self.action[name]=dataMatcher
+ else :
+ if isinstance(dataMatcher, str):
+ dataMatcher=re.compile(dataMatcher)
+ if isinstance(dataCleaner, str):
+ dataCleaner=re.compile(dataCleaner)
+ self.action[name]=self._buildREParser(dataMatcher,
+ dataCleaner,
+ cleanSub)
+
+ def _buildREParser(self,dataMatcher,dataCleaner,cleanSub):
+ def parser(data):
+ x = dataMatcher.findall(data)
+ if dataCleaner is not None:
+ x = [dataCleaner.sub(cleanSub,y) for y in x]
+ return x
+ return parser
+
+ def __call__(self,file):
+ for e in self.flatiterator(file):
+ pe = {'fullentry':e}
+ for k in self.action:
+ pe[k]=self.action[k](e)
+ yield pe
+
+
+
\ No newline at end of file
diff --git a/src/obitools/format/genericparser/_genericparser.pyx b/src/obitools/format/genericparser/_genericparser.pyx
new file mode 100644
index 0000000..b5062f2
--- /dev/null
+++ b/src/obitools/format/genericparser/_genericparser.pyx
@@ -0,0 +1,232 @@
+# cython: profile=True
+
+
+import re
+from obitools.utils import universalOpen
+
+cdef bint isBeginning(bytes line, object startEntry):
+ return startEntry is None or startEntry.match(line) is not None
+
+cdef bint isEnding(bytes line, object startEntry, object endEntry):
+ return ((endEntry is not None and endEntry.match(line) is not None) or
+ (endEntry is None and startEntry is not None and startEntry.match(line) is not None))
+
+
+
+def genericEntryIteratorGenerator(bytes startEntry=None,
+ bytes endEntry=None,
+ bint head=False,
+ bint tail=False,
+ bint strip=False,
+ bint join=True):
+ '''
+ Transform a text line iterator into an entry-oriented iterator.
+
+ This converted iterator is useful to implement the first stage
+ of flat file parsing.
+
+ @param startEntry: a regular pattern matching the beginning of
+ an entry
+ @type startEntry: C{str} or None
+ @param endEntry: a regular pattern matching the end of
+ an entry
+ @type endEntry: C{str} or None
+ @param head: indicate if an header is present before
+ the first entry (as in many original genbank
+ files)
+ @type head: C{bool}
+ @param tail: indicate if some extra informations are present
+ after the last entry.
+ @type tail: C{bool}
+
+ @return: an iterator on entries in text format
+ @rtype: an iterator on C{str}
+ '''
+
+ if startEntry is not None:
+ c_startEntry = re.compile(startEntry)
+ else:
+ c_startEntry = None
+
+ if endEntry is not None:
+ c_endEntry = re.compile(endEntry)
+ else:
+ c_endEntry = None
+
+
+ def transparentIteratorEntry(object f):
+ f = universalOpen(f)
+ return f
+
+ def genericEntryIterator(file):
+
+ cdef list entry = []
+ cdef bytes line
+ cdef bint started
+
+ if not hasattr(file, 'next'):
+ file = universalOpen(file)
+
+ line = file.next()
+ started = head or isBeginning(line,c_startEntry)
+
+ try:
+ while 1:
+ while not started:
+ line = file.next()
+ started = isBeginning(line,c_startEntry)
+
+ if endEntry is None:
+ entry.append(line)
+ line = file.next()
+
+ while started:
+ end = isEnding(line,c_startEntry,c_endEntry)
+ if end:
+ if endEntry is not None:
+ entry.append(line)
+ if join:
+ e = ''.join(entry)
+ if strip:
+ e=e.strip()
+ else:
+ e=entry
+ if strip:
+ e=[x.strip() for x in e]
+ entry=[]
+ yield e
+ started=False
+ if endEntry is not None:
+ line = file.next()
+ else:
+ entry.append(line)
+ line = file.next()
+
+ started = isBeginning(line,c_startEntry)
+
+ except StopIteration:
+ if entry and (endEntry is None or tail):
+ if join:
+ e = ''.join(entry)
+ if strip:
+ e=e.strip()
+ else:
+ e=entry
+ if strip:
+ e=[x.strip() for x in e]
+ yield e
+
+
+
+ if startEntry is None and endEntry is None:
+ return transparentIteratorEntry
+
+ return genericEntryIterator
+
+
+class GenericParser(object):
+
+ def __init__(self,
+ startEntry=None,
+ endEntry=None,
+ head=False,
+ tail=False,
+ strip=False,
+ **parseAction):
+ """
+ @param startEntry: a regular pattern matching the beginning of
+ an entry
+ @type startEntry: C{str} or None
+ @param endEntry: a regular pattern matching the end of
+ an entry
+ @type endEntry: C{str} or None
+ @param head: indicate if an header is present before
+ the first entry (as in many original genbank
+ files)
+ @type head: C{bool}
+ @param tail: indicate if some extra informations are present
+ after the last entry.
+ @type tail: C{bool}
+
+ @param parseAction:
+
+ """
+ self.flatiterator= genericEntryIteratorGenerator(startEntry,
+ endEntry,
+ head,
+ tail,
+ strip)
+
+ self.action={}
+
+ for k in parseAction:
+ self.addParseAction(k,*parseAction[k])
+
+ def addParseAction(self,name,dataMatcher,dataCleaner=None,cleanSub=''):
+ '''
+ Add a parse action to the generic parser. A parse action
+ allows to extract one information from an entry. A parse
+ action is defined by a name and a method to extract this
+ information from the full text entry.
+
+ A parse action can be defined following two ways.
+
+ - via regular expression patterns
+
+ - via dedicated function.
+
+ In the first case, you have to indicate at least the
+ dataMatcher regular pattern. This pattern should match exactly
+ the data part you want to retrieve. If cleaning of extra
+ characters is needed, the second pattern, dataCleaner, can be
+ used to specify these characters.
+
+ In the second case you must provide a callable object (function)
+ that extracts and cleans data from the text entry. This function
+ should return a list containing all the data retrieved, even if
+ no data or only one item is retrieved.
+
+ @summary: Add a parse action to the generic parser.
+
+ @param name: name of the data extracted
+ @type name: C{str}
+ @param dataMatcher: a regular pattern matching the data
+ or a callable object parsing the
+ entry and returning a list of matched data
+ @type dataMatcher: C{str} or C{SRE_Pattern} instance or a callable
+ object
+ @param dataCleaner: a regular pattern matching part of the data
+ to suppress.
+ @type dataCleaner: C{str} or C{SRE_Pattern} instance or C{None}
+ @param cleanSub: string used to replace dataCleaner matches.
+ Default is an empty string
+ @type cleanSub: C{str}
+
+ '''
+ if callable(dataMatcher):
+ self.action[name]=dataMatcher
+ else :
+ if isinstance(dataMatcher, str):
+ dataMatcher=re.compile(dataMatcher)
+ if isinstance(dataCleaner, str):
+ dataCleaner=re.compile(dataCleaner)
+ self.action[name]=self._buildREParser(dataMatcher,
+ dataCleaner,
+ cleanSub)
+
+ def _buildREParser(self,dataMatcher,dataCleaner,cleanSub):
+ def parser(data):
+ x = dataMatcher.findall(data)
+ if dataCleaner is not None:
+ x = [dataCleaner.sub(cleanSub,y) for y in x]
+ return x
+ return parser
+
+ def __call__(self,file):
+ for e in self.flatiterator(file):
+ pe = {'fullentry':e}
+ for k in self.action:
+ pe[k]=self.action[k](e)
+ yield pe
+
+
diff --git a/src/obitools/format/ontology/__init__.py b/src/obitools/format/ontology/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/format/ontology/go_obo.py b/src/obitools/format/ontology/go_obo.py
new file mode 100644
index 0000000..cd1d87e
--- /dev/null
+++ b/src/obitools/format/ontology/go_obo.py
@@ -0,0 +1,274 @@
+__docformat__ = 'restructuredtext'
+
+import re
+import string
+import textwrap
+
+
+from obitools.obo.go.parser import GOEntryIterator
+from obitools.obo.go.parser import GOTerm
+from obitools.obo.go.parser import GOEntry
+
+"""
+go_obo.py : gene_ontology_edit.obo file parser:
+----------------------------------------------------
+
+- OBOFile class: open a flat file and return an entry.
+
+"""
+class OBOFile(object):
+ """
+ Iterator over all entries of an OBO file
+ """
+
+ def __init__(self,_path):
+ self.file = GOEntryIterator(_path)
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ fiche = self.file.next()
+
+ if isinstance(fiche, GOTerm):
+ self.isaterm=True
+ return Term(fiche)
+ elif isinstance(fiche, GOEntry):
+ self.isaterm=False
+ return Entry(fiche)
+ else:
+ self.isaterm=False
+ return Header(fiche)
+
+
+############# everything below should be moved down into obitools/obo/go/parser.py ##########
+
+# define an XRef into a go_obo.py script in the microbi pylib
+class Xref(object):
+ """
+ Class Xref
+ Xref.db Xref database
+ Xref.id Xref identifier
+ """
+
+ def __init__(self,description):
+ data = description.split(':')
+ self.db = data[0].strip()
+ self.id = data[1].strip()
+
+# define a RelatedTerm into a go_obo.py script in the microbi pylib
+class RelatedTerm(object):
+ """
+ Class RelatedTerm
+ RelatedTerm.relation RelatedTerm relation
+ RelatedTerm.related_term RelatedTerm GO identifier
+ RelatedTerm.comment all terms have 0 or 1 comment
+ """
+
+ def __init__(self,relation,value,comment):
+ self.relation = relation
+ self.related_term = value.strip('GO:')
+ self.comment = comment
+
+
+# define into a go_obo.py script in the microbi pylib
+#class Term(object):
+# """
+# class representing an OBO term (entry).
+# """
+#
+# def __init__(self):
+# raise RuntimeError('biodb.go_obo is an abstract class')
+#
+# def __checkEntry__(self):
+# minimum=(hasattr(self,'goid') )
+# if not minimum:
+# raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_'])
+
+class Term(object):
+ """
+ Class Term
+ representing a GO term.
+ """
+
+ def __init__(self,data=None):
+ """
+ """
+ self.data=data
+ self.isaterm = True
+
+ if data:
+ self.__filtreGoid__()
+ self.__filtreName__()
+ self.__filtreComment__()
+ self.__filtreSynonyms__()
+ self.__filtreDef__()
+ self.__filtreParents__()
+ self.__filtreRelationships__()
+ self.__filtreRelation__()
+ self.__filtreObsolete__()
+ self.__filtreAltIds__()
+ self.__filtreXRefs__()
+ self.__filtreSubsets__()
+
+ # check if all required attributes were valued
+ self.__checkEntry__()
+
+
+ def __checkEntry__(self):
+ minimum=(hasattr(self,'goid') )
+ if not minimum:
+ raise AssertionError('Misconstructed GO Term instance %s' % [x for x in dir(self) if x[0]!='_'])
+
+
+ def __filtreGoid__(self):
+ """
+ Extract GO id.
+ """
+ self.goid = self.data.id.value.strip('GO:')
+
+ def __filtreName__(self):
+ """
+ Extract GO name.
+ """
+ self.name = self.data.name.value
+
+ def __filtreSynonyms__(self):
+ """
+ Extract GO synonym(s).
+ """
+ self.list_synonyms = {}
+ if self.data.synonyms:
+ for y in self.data.synonyms:
+ self.list_synonyms[y.value] = y.scope
+
+
+ def __filtreComment__(self):
+ """
+ manage None comments
+ """
+ if self.data.comment != None:
+ self.comment = self.data.comment.value
+ else:
+ self.comment = ""
+
+ def __filtreDef__(self):
+ """
+ Extract GO definition.
+ """
+ if self.data.definition != None:
+ self.definition = self.data.definition.value
+ else:
+ self.definition = ""
+
+ def __filtreParents__(self):
+ """
+ To make the is_a hierarchy
+ """
+ if self.data.is_a != None:
+ self.is_a = set([isa.value.strip('GO:') for isa in self.data.is_a])
+ else:
+ self.is_a = set()
+
+ def __filtreRelation__(self):
+ """
+ To make the part_of hierarchy
+ """
+ self.part_of = set()
+ self.regulates = set()
+ self.negatively_regulates = set()
+ self.positively_regulates = set()
+
+ if self.data.relationship != None:
+ for rel in self.data.relationship:
+ if rel.relationship == "part_of":
+ self.part_of.add(rel.value.strip('GO:'))
+ elif rel.relationship == "regulates":
+ self.regulates.add(rel.value.strip('GO:'))
+ elif rel.relationship == "negatively_regulates":
+ self.negatively_regulates.add(rel.value.strip('GO:'))
+ elif rel.relationship == "positively_regulates":
+ self.positively_regulates.add(rel.value.strip('GO:'))
+
+
+ def __filtreRelationships__(self):
+ """
+ Relation list with other GO Terms (is_a, part_of or some regulates relation)
+ """
+ self.related_term =[]
+ if self.data.relationship != None:
+ for x in self.data.relationship:
+ self.related_term.append(RelatedTerm(x.relationship,x.value,x.__doc__))
+ #self.related_term.append(RelatedTerm(x.relationship,x.value,x.comment))
+ if self.data.is_a != None:
+ for x in self.data.is_a:
+ self.related_term.append(RelatedTerm('is_a',x.value,x.__doc__))
+ #self.related_term.append(RelatedTerm('is_a',x.value,x.comment))
+
+
+
+ def __filtreObsolete__(self):
+ """
+ For an obsolete term, collect the GO identifiers listed in its
+ consider and replaced_by fields.
+ """
+ self.considers = set()
+ self.replaces = set()
+ self.is_obsolete = self.data.is_obsolete
+ if self.data.is_obsolete:
+ if self.data.consider:
+ self.considers = set([considered.value.strip('GO:') for considered in self.data.consider])
+ if self.data.replaced_by:
+ self.replaces = set([replaced.value.strip('GO:') for replaced in self.data.replaced_by])
+
+
+ def __filtreAltIds__(self):
+ """
+ alternate(s) id(s) for this term (= alias in the geneontology schema model!)
+ """
+ if self.data.alt_ids:
+ self.alt_ids = set([x.value.strip('GO:') for x in self.data.alt_ids])
+ else:
+ self.alt_ids = set()
+
+ def __filtreXRefs__(self):
+ """
+ cross references to other databases
+ """
+ self.xrefs = set()
+ if self.data.xrefs:
+ self.xrefs = set([Xref(x.value.reference) for x in self.data.xrefs])
+
+
+ def __filtreSubsets__(self):
+ """
+ subset label to make smaller sets of GO Terms
+ """
+ self.subsets = set()
+ if self.data.subsets:
+ self.subsets = set([x.value for x in self.data.subsets])
+
+
+class Entry(object):
+ """
+ a Stanza entry, like [Typedef] for example
+ """
+ def __init__(self,data=None):
+ self.data=data
+ self.isaterm=False
+ self.isanentry=True
+
+
+class Header(object):
+ """
+ class representing a GO header.
+ """
+
+ def __init__(self,data=None):
+ """
+ """
+ self.data=data
+ self.isaterm = False
+
+
+
diff --git a/src/obitools/format/options.py b/src/obitools/format/options.py
new file mode 100644
index 0000000..f7ca1ec
--- /dev/null
+++ b/src/obitools/format/options.py
@@ -0,0 +1,375 @@
+'''
+Created on 13 oct. 2009
+
+ at author: coissac
+'''
+
+from obitools.format.sequence.embl import emblIterator
+from obitools.format.sequence.genbank import genbankIterator
+from obitools.format.sequence.fnaqual import fnaFastaIterator
+from obitools.format.sequence.fasta import fastaAAIterator, fastaNucIterator, fastFastaIterator
+from obitools.format.sequence.fastq import fastFastqIlluminaIterator,fastFastqSolexaIterator
+
+from obitools.fastq import fastFastqSangerIterator
+from obitools.fnaqual.quality import qualityIterator
+from obitools.ecopcr.sequence import EcoPCRDBSequenceIterator
+from obitools.fasta import formatFasta, rawFastaIterator,\
+ formatSAPFastaGenerator
+from obitools.fastq import formatFastq
+
+from obitools.ecopcr.sequence import EcoPCRDBSequenceWriter
+
+from cPickle import dump,load,UnpicklingError
+
+#from obitools.format._format import printOutput
+
+from array import array
+from itertools import chain
+import sys
+
+import re
+from obitools.ecopcr import EcoPCRFile
+from obitools.format.sequence import skipOnErrorIterator, skipfirst, only
+from obitools import BioSequence
+from obitools.utils import FakeFile
+
+def binarySequenceIterator(lineiterator):
+
+ f = FakeFile(lineiterator)
+
+ try:
+ while(1):
+ try:
+ s = load(f)
+ yield s
+ except UnpicklingError:
+ pass
+ except EOFError:
+ raise StopIteration
+
+def addInputFormatOption(optionManager):
+
+ group = optionManager.add_option_group("Restriction to a sub-part options",
+ "Allow to limit analysis to a sub-part of the data file")
+
+ group.add_option('--skip',
+ action="store", dest="skip",
+ metavar='<N>',
+ default=None,
+ type='int',
+ help="skip the N first sequences")
+
+ group.add_option('--only',
+ action="store", dest="only",
+ metavar='<N>',
+ default=None,
+ type='int',
+ help="treat only N sequences")
+
+ group = optionManager.add_option_group("Input format options",
+ "If not specified, a test is done to determine the file format")
+
+ group.add_option('--genbank',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='genbank',
+ help="Input file is in genbank format")
+
+ group.add_option('--embl',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='embl',
+ help="Input file is in embl format")
+
+ group.add_option('--skip-on-error',
+ action="store_true", dest="skiperror",
+ default=False,
+ help="Skip sequence entries with parse error")
+
+ group.add_option('--fasta',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='fasta',
+ help="Input file is in fasta nucleic format (including obitools fasta extentions)")
+
+ group.add_option('--ecopcr',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='ecopcr',
+ help="Input file is in ecopcr format")
+
+ group.add_option('--raw-fasta',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='rawfasta',
+ help="Input file is in fasta format (but more tolerant to format variant)")
+
+# group.add_option('--fna',
+# action="store_const", dest="seqinformat",
+# default=None,
+# const='fna',
+# help="input file is in fasta nucleic format produced by 454 sequencer pipeline")
+#
+# group.add_option('--qual',
+# action="store", dest="withqualfile",
+# type='str',
+# default=None,
+# help="Specify the name of a quality file produced by 454 sequencer pipeline")
+
+ group.add_option('--sanger',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='sanger',
+ help="Input file is in sanger fastq nucleic format (standard fastq)")
+
+ group.add_option('--solexa',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='solexa',
+ help="Input file is in fastq nucleic format produced by solexa sequencer")
+
+ #===========================================================================
+ # group.add_option('--illumina',
+ # action="store_const", dest="seqinformat",
+ # default=None,
+ # const='illumina',
+ # help="input file is in fastq nucleic format produced by old solexa sequencer")
+ #===========================================================================
+
+ group.add_option('--ecopcrdb',
+ action="store_const", dest="seqinformat",
+ default=None,
+ const='ecopcrdb',
+ help="Input file is an ecopcr database")
+
+ group.add_option('--nuc',
+ action="store_const", dest="moltype",
+ default=None,
+ const='nuc',
+ help="Input file contains nucleic sequences")
+
+ group.add_option('--prot',
+ action="store_const", dest="moltype",
+ default=None,
+ const='pep',
+ help="Input file contains protein sequences")
+
+def addOutputFormatOption(optionManager):
+
+ group = optionManager.add_option_group("Output format options")
+
+
+# optionManager.add_option('-B','--bin-output',
+# action="store_const", dest="output",
+# default=None,
+# const=dump,
+# help="output sequences in binary format")
+ group.add_option('--fasta-output',
+ action="store_const", dest="output",
+ default=None,
+ const=formatFasta,
+ help="Output sequences in obitools fasta format")
+ group.add_option('--fastq-output',
+ action="store_const", dest="output",
+ default=None,
+ const=formatFastq,
+ help="Output sequences in sanger fastq format")
+# group.add_option('--sap-output',
+# action="store_const", dest="output",
+# default=None,
+# const=formatSAPFastaGenerator,
+# help="Output sequences in sap fasta format "
+# "(Sequence must have a taxid and a taxonomy has to be loaded)")
+
+ group.add_option('--ecopcrdb-output',
+ action="store", dest="ecopcroutput",
+ default=None,
+ help="Output sequences in ecopcr database format "
+ "(sequence records are not printed on standard output)")
+ group.add_option('--uppercase',
+ action='store_true',dest='uppercase',
+ default=False,
+ help="Print sequences in upper case (default is lower case)")
+
+
+
+def addInOutputOption(optionManager):
+ addInputFormatOption(optionManager)
+ addOutputFormatOption(optionManager)
+
+
+def autoEntriesIterator(options):
+ options.outputFormater=formatFasta
+ options.outputFormat="fasta"
+
+ ecopcr_pattern = re.compile('^[^ ]+ +| +[0-9]+ +| + [0-9]+ + | +')
+
+ def annotatedIterator(formatIterator):
+ options.outputFormater=formatFasta
+ options.outputFormat="fasta"
+ def iterator(lineiterator):
+ for s in formatIterator(lineiterator):
+ s.extractTaxon()
+ yield s
+
+ return iterator
+
+ def withQualIterator(qualityfile):
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+ def iterator(lineiterator):
+ for s in fnaFastaIterator(lineiterator):
+ q = qualityfile.next()
+ quality = array('d',(10.**(-x/10.) for x in q))
+ s.quality=quality
+ yield s
+
+ return iterator
+
+ def autoSequenceIterator(lineiterator):
+ options.outputFormater=formatFasta
+ options.outputFormat="fasta"
+ first = lineiterator.next()
+
+ if first[0]==">":
+# if options.withqualfile is not None:
+# qualfile=qualityIterator(options.withqualfile)
+# reader=withQualIterator(qualfile)
+# options.outputFormater=formatFastq
+# options.outputFormat="fastq"
+ if options.moltype=='nuc':
+ reader=fastaNucIterator
+ elif options.moltype=='pep':
+ reader=fastaAAIterator
+ else:
+ reader=fastFastaIterator
+ elif first[0]=='@':
+ reader=fastFastqSangerIterator
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+ elif first[0:3]=='ID ':
+ reader=emblIterator
+ elif first[0:6]=='LOCUS ':
+ reader=genbankIterator
+ elif first[0:8]=="#!Pickle":
+ reader=binarySequenceIterator
+ elif first[0]=="#" or ecopcr_pattern.search(first):
+ reader=EcoPCRFile
+ else:
+ raise AssertionError,'file is not in fasta, fasta, embl, genbank or ecoPCR format'
+
+ if reader==binarySequenceIterator:
+ input = binarySequenceIterator(lineiterator) # @ReservedAssignment
+ else:
+ input = reader(chain([first],lineiterator)) # @ReservedAssignment
+
+ return input
+
+ if options.seqinformat is None:
+ reader = autoSequenceIterator
+ else:
+ if options.seqinformat=='fasta':
+ if options.moltype=='nuc':
+ reader=fastaNucIterator
+ elif options.moltype=='pep':
+ reader=fastaAAIterator
+ else:
+ reader=fastFastaIterator
+ elif options.seqinformat=='rawfasta':
+ reader=annotatedIterator(rawFastaIterator)
+ elif options.seqinformat=='genbank':
+ reader=annotatedIterator(genbankIterator)
+ elif options.seqinformat=='embl':
+ reader=annotatedIterator(emblIterator)
+ elif options.seqinformat=='fna':
+ reader=fnaFastaIterator
+ elif options.seqinformat=='sanger':
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+ reader=fastFastqSangerIterator
+ elif options.seqinformat=='solexa':
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+ reader=fastFastqSolexaIterator
+ elif options.seqinformat=='illumina':
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+ reader=fastFastqIlluminaIterator
+ elif options.seqinformat=='ecopcr':
+ reader=EcoPCRFile
+ elif options.seqinformat=='ecopcrdb':
+ reader=EcoPCRDBSequenceIterator
+
+ if options.seqinformat=='fna' and options.withqualfile is not None:
+ qualfile=qualityIterator(options.withqualfile)
+ reader=withQualIterator(qualfile)
+ options.outputFormater=formatFastq
+ options.outputFormat="fastq"
+
+ if options.skiperror:
+ reader = skipOnErrorIterator(reader)
+
+ if hasattr(options, 'skip') and options.skip is not None:
+ print >>sys.stderr,"Skipping %d sequences" % options.skip
+ reader = skipfirst(reader,options.skip)
+
+ if hasattr(options, 'only') and options.only is not None:
+ print >>sys.stderr,"Analysing only %d sequences" % options.only
+ reader = only(reader,options.only)
+
+ return reader
+
+def sequenceWriterGenerator(options,output=sys.stdout):
+ class SequenceWriter:
+ def __init__(self,options,file=sys.stdout): # @ReservedAssignment
+ self._format=None
+ self._file=file
+ self._upper=options.uppercase
+ def put(self,seq):
+ if self._format is None:
+ self._format=formatFasta
+ if options.output is not None:
+ self._format=options.output
+ if self._format is formatSAPFastaGenerator:
+ self._format=formatSAPFastaGenerator(options)
+ elif options.outputFormater is not None:
+ self._format=options.outputFormater
+
+ if hasattr(seq,'_hasTaxid') and seq._hasTaxid:
+ seq.extractTaxon()
+
+ s = self._format(seq,upper=self._upper)
+ try:
+ self._file.write(s)
+ self._file.write("\n")
+ except IOError:
+ sys.exit(0)
+
+ class BinaryWriter:
+ def __init__(self,options,file=sys.stdout): # @ReservedAssignment
+ self._file=file
+ self._file.write("#!Pickle\n")
+ def put(self,seq):
+ try:
+ if isinstance(seq, BioSequence):
+ dump(seq,self._file,protocol=2)
+ else:
+ for s in seq:
+ dump(s,self._file,protocol=2)
+ except IOError:
+ sys.exit(0)
+
+
+ if options.ecopcroutput is not None:
+ writer=EcoPCRDBSequenceWriter(options)
+ elif options.output==dump:
+ writer=BinaryWriter(options,output)
+ else:
+ writer=SequenceWriter(options,output)
+
+ def sequenceWriter(sequence):
+ writer.put(sequence)
+
+ return sequenceWriter
+
+
\ No newline at end of file
diff --git a/src/obitools/format/sequence/__init__.py b/src/obitools/format/sequence/__init__.py
new file mode 100644
index 0000000..9c3d8eb
--- /dev/null
+++ b/src/obitools/format/sequence/__init__.py
@@ -0,0 +1,69 @@
+from obitools.fasta import fastFastaIterator
+from obitools.fastq import fastqSangerIterator
+from obitools.seqdb.embl.parser import emblIterator
+from obitools.seqdb.genbank.parser import genbankIterator
+from itertools import chain
+from obitools.utils import universalOpen
+import sys
+
def skipOnErrorIterator(seqIterator):
    """
    Wrap a sequence iterator factory so that entries raising an error
    while they are read are skipped instead of aborting the iteration.

    :param seqIterator: a function accepting the input data and returning
                        an iterable over sequences
    :type seqIterator: function

    :return: a function with the same calling convention as
             **seqIterator**, returning a generator over the entries
             which were read without error.
    :rtype: function
    """
    def internal(inputdata):
        # iter() lets the factory return any iterable, not only an
        # object with a next method
        si = iter(seqIterator(inputdata))
        while True:
            try:
                seq = next(si)
            except StopIteration:
                # Normal end of the input. Returning (instead of re-raising
                # StopIteration) is the only form valid under PEP 479.
                return
            except Exception as e:
                # Only real read errors are reported and skipped
                sys.stderr.write("Skipping unreadable entry: %s\n" % (e,))
                continue
            yield seq

    return internal
+
def skipfirst(seqIterator,n):
    """
    Wrap a sequence iterator factory so that the **n** first entries
    are read but not returned.

    :param seqIterator: a function accepting the input data and returning
                        an iterable over sequences
    :type seqIterator: function
    :param n: count of entries to skip
    :type n: int

    :return: a function with the same calling convention as
             **seqIterator**. An empty line is written on standard error
             once the input is exhausted.
    :rtype: function
    """
    def internal(inputdata):
        for rank,seq in enumerate(seqIterator(inputdata),1):
            if rank > n:
                yield seq
        sys.stderr.write("\n")

    return internal
+
+
def only(seqIterator,n):
    """
    Wrap a sequence iterator factory so that at most the **n** first
    entries are returned.

    The underlying iterator is never asked for more than **n** entries:
    reading one entry past the limit costs a useless parsing and can
    block when the input is a stream.

    :param seqIterator: a function accepting the input data and returning
                        an iterable over sequences
    :type seqIterator: function
    :param n: maximum count of entries to return; nothing is returned
              if n is lower than or equal to zero
    :type n: int

    :return: a function with the same calling convention as
             **seqIterator**. An empty line is written on standard error
             once the iteration is over.
    :rtype: function
    """
    def internal(inputdata):
        si = seqIterator(inputdata)
        if n > 0:
            for count,seq in enumerate(si,1):
                yield seq
                if count >= n:
                    # stop right after the last wanted entry, before
                    # pulling the next one from the input
                    break
        sys.stderr.write("\n")
    return internal
+
+
+
def autoSequenceIterator(file):
    """
    Open a sequence file and select the parser from its first line.

    The format is recognized from the beginning of the first line:

        - ``>``      : fasta
        - ``@``      : fastq (sanger encoding)
        - ``ID ``    : embl
        - ``LOCUS `` : genbank

    :param file: the input, as accepted by ``universalOpen``

    :return: an iterator over the sequences of the file. An empty
             iterator is returned if the input contains no line.

    :raises: :py:exc:`AssertionError` if the format is not recognized
    """
    lineiterator = universalOpen(file)
    try:
        first = next(lineiterator)
    except StopIteration:
        # Empty input: there is nothing to parse. Do not leak a bare
        # StopIteration to the caller.
        return iter(())

    # startswith is safe even if the first line is an empty string
    if first.startswith('>'):
        reader=fastFastaIterator
    elif first.startswith('@'):
        reader=fastqSangerIterator
    elif first.startswith('ID '):
        reader=emblIterator
    elif first.startswith('LOCUS '):
        reader=genbankIterator
    else:
        raise AssertionError('file is not in fasta, fastq, embl, or genbank format')

    # the line used for the detection is pushed back in front of the
    # remaining ones
    return reader(chain([first],lineiterator))
diff --git a/src/obitools/format/sequence/embl.py b/src/obitools/format/sequence/embl.py
new file mode 100644
index 0000000..f59f14a
--- /dev/null
+++ b/src/obitools/format/sequence/embl.py
@@ -0,0 +1,2 @@
+from obitools.seqdb.embl.parser import emblIterator,emblParser
+
diff --git a/src/obitools/format/sequence/fasta.py b/src/obitools/format/sequence/fasta.py
new file mode 100644
index 0000000..74a55f3
--- /dev/null
+++ b/src/obitools/format/sequence/fasta.py
@@ -0,0 +1,4 @@
+from obitools.fasta import fastFastaIterator,fastaIterator,fastaParser
+from obitools.fasta import fastaAAIterator,fastaAAParser
+from obitools.fasta import fastaNucIterator,fastaNucParser
+from obitools.fasta import formatFasta
diff --git a/src/obitools/format/sequence/fastq.py b/src/obitools/format/sequence/fastq.py
new file mode 100644
index 0000000..9addf61
--- /dev/null
+++ b/src/obitools/format/sequence/fastq.py
@@ -0,0 +1,16 @@
+'''
+Created on 15 janv. 2010
+
+@author: coissac
+'''
+
+from obitools.fastq import fastqIterator,fastqParserGenetator
+from obitools.fastq import fastqSangerIterator,fastqSolexaIterator, \
+ fastqIlluminaIterator
+from obitools.fastq import fastFastqIterator,fastFastqParserGenetator
+from obitools.fastq import fastFastqSangerIterator,fastFastqSolexaIterator, \
+ fastFastqIlluminaIterator
+from obitools.fastq import fastqAAIterator
+from obitools.fastq import formatFastq
+
+
diff --git a/src/obitools/format/sequence/fnaqual.py b/src/obitools/format/sequence/fnaqual.py
new file mode 100644
index 0000000..ab69916
--- /dev/null
+++ b/src/obitools/format/sequence/fnaqual.py
@@ -0,0 +1,8 @@
+'''
+Created on 12 oct. 2009
+
+@author: coissac
+'''
+
+from obitools.fnaqual.fasta import fnaFastaIterator
+from obitools.fnaqual.quality import qualityIterator
diff --git a/src/obitools/format/sequence/genbank.py b/src/obitools/format/sequence/genbank.py
new file mode 100644
index 0000000..8524b6f
--- /dev/null
+++ b/src/obitools/format/sequence/genbank.py
@@ -0,0 +1,4 @@
+from obitools.seqdb.genbank.parser import genpepIterator,genpepParser
+from obitools.seqdb.genbank.parser import genbankIterator,genbankParser
+
+
diff --git a/src/obitools/format/sequence/tagmatcher.py b/src/obitools/format/sequence/tagmatcher.py
new file mode 100644
index 0000000..60ad8d8
--- /dev/null
+++ b/src/obitools/format/sequence/tagmatcher.py
@@ -0,0 +1,5 @@
+from obitools.tagmatcher.parser import tagMatcherParser
+from obitools.tagmatcher.parser import TagMatcherIterator
+from obitools.tagmatcher.parser import formatTagMatcher
+
+tagMatcherIterator=TagMatcherIterator
diff --git a/src/obitools/goa/__init__.py b/src/obitools/goa/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/goa/parser.py b/src/obitools/goa/parser.py
new file mode 100644
index 0000000..8ffd1e3
--- /dev/null
+++ b/src/obitools/goa/parser.py
@@ -0,0 +1,33 @@
+from itertools import imap
+from obitools import utils
+
class GoAFileIterator(utils.ColumnFile):
    """
    Iterator over the lines of a tabulated GOA file, each line being
    returned as a dictionary indexed by column name.
    """

    # Names given to the columns, in the order of the file.
    # These strings are the keys of the returned dictionaries.
    _colname = ['database',
                'ac',
                'symbol',
                'qualifier',
                'goid',
                'origin',
                'evidence',
                'evidnce_origine',
                'namespace',
                'db_object_name',
                'gene',
                'object_type',
                'taxid',
                'date',
                'assigned_by']

    def __init__(self,stream):
        """
        :param stream: the input stream transmitted to ``utils.ColumnFile``
                       with a tabulation as separator
        """
        utils.ColumnFile.__init__(self, stream, '\t', True, (str,))

    def next(self):
        """
        Read the next line of the file.

        :return: a dictionary associating column names to values. Pairing
                 stops with the shortest of both lists.
        :rtype: dict
        """
        values = utils.ColumnFile.next(self)
        return dict(zip(GoAFileIterator._colname,values))
+
+
+
diff --git a/src/obitools/graph/__init__.py b/src/obitools/graph/__init__.py
new file mode 100644
index 0000000..2d34fd9
--- /dev/null
+++ b/src/obitools/graph/__init__.py
@@ -0,0 +1,1016 @@
+'''
+**obitool.graph** for representing graph structure in obitools
+--------------------------------------------------------------
+
+.. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org>
+
+
+This module offers classes to manipulate graphs, mainly through the
+:py:class:`obitools.graph.Graph` class.
+
+.. inheritance-diagram:: Graph DiGraph UndirectedGraph
+ :parts: 2
+
+'''
+
+import sys
+
+
+from obitools.utils import progressBar
+
+
class Indexer(dict):
    '''
    Allow to manage conversion between an arbitrary hashable python
    value and a unique integer key.

    Indices are allocated consecutively from 0, in the order keys are
    first requested.
    '''

    def __init__(self):
        dict.__init__(self)
        # next index to allocate
        self.__max=0
        # index -> key table, the reverse of the dictionary itself
        self.__reverse=[]

    def getLabel(self,index):
        '''
        Return the python value associated to an integer index.

        :param index: an index value
        :type index: int

        :raises: IndexError if the index is not used in this
                 Indexer instance
        '''
        return self.__reverse[index]

    def getIndex(self,key,strict=False):
        '''
        Return the index associated to a **key** in the indexer. Two
        modes are available :

            - strict mode :

                if the key is not known by the :py:class:`Indexer` instance
                a :py:exc:`KeyError` exception is raised.

            - non strict mode :

                in this mode if the requested **key** is absent, it is added
                to the :py:class:`Indexer` instance and the new index is
                returned

        :param key: the requested key
        :type key: a hashable python value

        :param strict: select the looking for mode
        :type strict: bool

        :return: the index corresponding to the key
        :rtype: int

        :raises: - :py:exc:`KeyError` in strict mode if key is absent
                   of the :py:class:`Indexer` instance

                 - :py:exc:`TypeError` if key is not a hashable value.
        '''
        if dict.__contains__(self,key):
            return dict.__getitem__(self,key)
        if strict:
            raise KeyError(key)

        value = self.__max
        self[key]=value
        self.__reverse.append(key)
        self.__max+=1
        return value

    def __getitem__(self,key):
        '''
        Implement the [] operator to emulate the standard dictionary
        behaviour on :py:class:`Indexer` and return the integer key
        associated to a python value.

        Actually this method calls the :py:meth:`getIndex` method in
        non strict mode, so an unknown key is registered and it only
        raises a :py:exc:`TypeError` if key is not a hashable value.

        :param key: the value to index
        :type key: a hashable python value

        :return: a unique integer value associated to the key
        :rtype: int

        :raises: :py:exc:`TypeError` if **key** is not a hashable value.
        '''
        return self.getIndex(key)

    def __eq__(self,index):
        '''
        Implement equal operator **==** for comparing two :py:class:`Indexer`
        instances. Two :py:class:`Indexer` instances are equal only if they
        are physically the same instance.

        :param index: the second Indexer
        :type index: an :py:class:`Indexer` instance

        :return: True if the two :py:class:`Indexer` instances are the same
        :rtype: bool
        '''
        return self is index

    def __ne__(self,index):
        '''
        Implement operator **!=** as the negation of **==** (it has to be
        explicit with python 2).
        '''
        return self is not index

    # Historical name of the comparison method. Python never calls it for
    # the == operator; it is kept for code calling it explicitly.
    __equal__ = __eq__
+
+
class Graph(object):
    '''
    Class used to represent directed or undirected graph.

    Each node label is converted to an integer index by an
    :py:class:`Indexer` instance. The graph itself is stored as a
    dictionary associating each node index to the set of the indices
    of its neighbours.

    .. warning::

        Only one edge can connect two nodes in a given direction.

    .. warning::

        Specifying nodes through their index speeds up your code but as no
        check is done on index value, it may result in inconsistency. So
        prefer the use of node label to specify a node.
    '''
    def __init__(self,label='G',directed=False,indexer=None,nodes=None,edges=None):
        '''
        :param label: Graph name, set to 'G' by default
        :type label: str

        :param directed: true for directed graph, set to False by default
        :type directed: boolean

        :param indexer: node label indexer. This allows to define several graphs
                        sharing the same indexer (see : :py:meth:`newEmpty`)
        :type indexer: :py:class:`Indexer`

        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value

        :param edges: set of edges to add to the graph
        :type edges: iterable value

        .. note:: **nodes** and **edges** are accepted but are not used by
                  this constructor: the graph is always created empty.
        '''
        self._directed=directed
        if indexer is None:
            indexer = Indexer()
        self._index = indexer
        # node index -> set of neighbour indices
        self._node = {}
        # node index -> dictionary of node properties
        self._node_attrs = {}
        # (index1,index2) -> dictionary of edge properties
        self._edge_attrs = {}
        self._label=label

    def newEmpty(self):
        """
        Build a new empty graph using the same :py:class:`Indexer` instance.
        This allows two graphs for sharing their vertices through their indices.
        """
        return Graph(self._label+"_compact",self._directed,self._index)

    def addNode(self,node=None,index=None,**data):
        '''
        Add a new node or update an existing one.

        :param node: the new node label or the label of an existing node
                     for updating it.
        :type node: a hashable python value

        :param index: the index of an existing node for updating it.
        :type index: int

        :param data: properties to attach to the node

        :return: the index of the node
        :rtype: int

        :raises: :py:exc:`IndexError` if index is not **None** and
                 corresponds to a not used index in this graph.
        '''
        if index is None:
            index = self._index[node]
        elif index >= len(self._index):
            raise IndexError("This index is not used in this graph...")

        if index not in self._node:
            self._node[index]=set()

        if data:
            if index in self._node_attrs:
                self._node_attrs[index].update(data)
            else:
                self._node_attrs[index]=dict(data)

        return index

    def __contains__(self,node):
        try:
            index = self._index.getIndex(node,strict=True)
        except KeyError:
            # label unknown from the indexer
            return False
        # The indexer may be shared with other graphs: knowing the label
        # is not enough.
        return index in self._node

    def getNode(self,node=None,index=None):
        """
        :param node: a node label.
        :type node: a hashable python value

        :param index: the index of an existing node.
        :type index: int

        .. note:: Index value are prevalent over node label.

        :return: the looked for node
        :rtype: :py:class:`Node`

        :raises: :py:exc:`KeyError` if the specified node label
                 is unknown from the indexer.

        .. warning:: no check on index value
        """
        if index is None:
            index = self._index.getIndex(node, True)
        return Node(index,self)

    def getBestNode(self,estimator):
        '''
        Select the node maximizing the estimator function

        :param estimator: the function to maximize
        :type estimator: a function returning a numerical value and accepting one
                         argument of type :py:class:`Node`

        :return: the best node or **None** if the graph is empty. In case
                 of tie the first node met is returned.
        :rtype: py:class:`Node`
        '''
        bestScore=0
        best=None
        for n in self:
            score = estimator(n)
            if best is None or score > bestScore:
                bestScore = score
                best=n
        return best

    def delNode(self,node=None,index=None):
        """
        Delete a node from a graph and all associated edges.

        :param node: a node label.
        :type node: a hashable python value

        :param index: the index of an existing node.
        :type index: int

        .. note:: Index value are prevalent over node label.

        :raises: :py:exc:`KeyError` if the specified node does not belong
                 to the graph.

        .. warning:: no check on index value
        """
        if index is None:
            # strict lookup: deleting must never register a new label
            # in the (possibly shared) indexer
            index = self._index.getIndex(node, True)

        # Remove the node by itself, raises KeyError if it is not in
        # the graph
        outgoing = self._node.pop(index)

        # Remove attributes of the edges starting from the node
        for n in outgoing:
            self._edge_attrs.pop((index,n),None)

        # Remove edges pointing to the node
        for n,targets in self._node.items():
            if index in targets:
                targets.remove(index)
                self._edge_attrs.pop((n,index),None)

        # Remove attributes associated to the node
        self._node_attrs.pop(index,None)

    def hasEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
        """
        Test if an edge links node 1 to node 2. For undirected graphs
        both orientations are checked.

        .. note:: Nodes can be specified using their label or their index in the graph
                  if both values are indicated the index is used.

        :param data: accepted for symmetry with :py:meth:`addEdge`, not used.

        :rtype: bool

        :raises: - :py:exc:`KeyError` if a label is unknown or if a node
                   does not belong to this graph
                 - :py:exc:`IndexError` if an index is not used by the indexer
        """
        if index1 is None:
            index1 = self._index.getIndex(node1, True)
        elif index1 >= len(self._index):
            raise IndexError("index1 = %d not in the graph" % index1)

        if index2 is None:
            index2 = self._index.getIndex(node2, True)
        elif index2 >= len(self._index):
            raise IndexError("index2 = %d not in the graph" % index2)

        rep = index2 in self._node[index1]

        if not self._directed:
            rep = rep or (index1 in self._node[index2])

        return rep

    def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
        '''
        Create a new edge in the graph between both the specified nodes.
        Missing nodes are added to the graph.

        .. note:: Nodes can be specified using their label or their index in the graph
                  if both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: a hashable python value
        :param node2: The second vertex label
        :type node2: a hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int
        :param data: properties to attach to the edge

        :return: the pair of indices of the linked nodes
        :rtype: tuple of two int

        :raises: :py:exc:`IndexError` if one of the specified indices
                 is not used by the indexer.
        '''
        index1=self.addNode(node1, index1)
        index2=self.addNode(node2, index2)

        self._node[index1].add(index2)

        if not self._directed:
            self._node[index2].add(index1)

        if data:
            if (index1,index2) not in self._edge_attrs:
                data =dict(data)
                self._edge_attrs[(index1,index2)]=data
                if not self._directed:
                    # both orientations share the same dictionary
                    self._edge_attrs[(index2,index1)]=data
            else:
                self._edge_attrs[(index1,index2)].update(data)

        return (index1,index2)

    def getEdge(self,node1=None,node2=None,index1=None,index2=None):
        '''
        Extract the :py:class:`Edge` instance linking two nodes of the graph.

        .. note:: Nodes can be specified using their label or their index in the graph
                  if both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: a hashable python value
        :param node2: The second vertex label
        :type node2: a hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int

        :raises: :py:exc:`KeyError` if one of the specified node labels
                 is unknown from the indexer.

        .. warning:: no check on index value, nor on the existence of the edge
        '''
        node1=self.getNode(node1, index1)
        node2=self.getNode(node2, index2)
        return Edge(node1,node2)

    def delEdge(self,node1=None,node2=None,index1=None,index2=None):
        """
        Delete the edge linking node 1 to node 2 and its properties.
        Nothing is done if this edge doesn't exist.

        .. note:: Nodes can be specified using their label or their index in the graph
                  if both values are indicated the index is used.

        :param node1: The first vertex label
        :type node1: a hashable python value
        :param node2: The second vertex label
        :type node2: a hashable python value
        :param index1: The first vertex index
        :type index1: int
        :param index2: The second vertex index
        :type index2: int

        .. warning:: no check on index value
        """
        try:
            # strict lookups: an unknown label cannot be part of an edge
            # and must not be registered in the indexer
            if index1 is None:
                index1 = self._index.getIndex(node1, True)
            if index2 is None:
                index2 = self._index.getIndex(node2, True)
        except KeyError:
            return

        targets = self._node.get(index1)
        if targets is None or index2 not in targets:
            return

        targets.remove(index2)
        self._edge_attrs.pop((index1,index2),None)

        if not self._directed:
            # discard and not remove: for a self loop the reverse
            # edge has just been deleted
            self._node[index2].discard(index1)
            self._edge_attrs.pop((index2,index1),None)

    def edgeIterator(self,predicate=None):
        """
        Iterate through a set of selected edges. Each edge of an
        undirected graph is returned only once.

        :param predicate: a function allowing edge selection. Default value
                          is **None** and indicate that all edges are selected.
        :type predicate:  a function returning a boolean value
                          and accepting one argument of class :py:class:`Edge`

        :return: an iterator over selected edge
        :rtype: iterator over :py:class:`Edge` instances

        .. seealso::
            function :py:func:`selectEdgeAttributeFactory` for simple predicate.

        """
        for n1 in self._node:
            for n2 in self._node[n1]:
                if self._directed or n1 <= n2:
                    e = self.getEdge(index1=n1, index2=n2)
                    if predicate is None or predicate(e):
                        yield e

    def nodeIterator(self,predicate=None):
        """
        Iterate through a set of selected vertices.

        :param predicate: a function allowing node selection. Default value
                          is **None** and indicate that all nodes are selected.
        :type predicate:  a function returning a boolean value
                          and accepting one argument of class :py:class:`Node`

        :return: an iterator over selected nodes.
        :rtype: iterator over :py:class:`Node` instances

        """
        for n in self._node:
            node = self.getNode(index=n)
            if predicate is None or predicate(node):
                yield node

    def nodeIndexIterator(self,predicate=None):
        """
        Iterate through the indexes of a set of selected vertices.

        :param predicate: a function allowing node selection. Default value
                          is **None** and indicate that all nodes are selected.
        :type predicate:  a function returning a boolean value
                          and accepting one argument of class :py:class:`Node`

        :return: an iterator over selected node indices.
        :rtype: iterator over `int`

        """
        for n in self._node:
            node = self.getNode(index=n)
            if predicate is None or predicate(node):
                yield n

    def neighbourIndexSet(self,node=None,index=None):
        """
        Return the set of the indices of the neighbours of a node.

        .. warning:: the returned set is the one used internally by the
                     graph, do not modify it.

        :raises: :py:exc:`KeyError` if the node does not belong to the graph
        """
        if index is None:
            index=self._index.getIndex(node, True)
        return self._node[index]

    def edgeCount(self):
        """
        Count the edges of the graph. The result is consistent with
        :py:meth:`edgeIterator`: an undirected edge counts for one,
        self loops included.

        :rtype: int
        """
        n = sum(len(z) for z in self._node.values())
        if not self._directed:
            # every undirected edge is stored in both orientations,
            # except self loops which are stored only once
            loops = sum(1 for i in self._node if i in self._node[i])
            n = (n + loops) // 2
        return n

    def subgraph(self,nodes,name='G'):
        """
        Build the graph induced by a set of nodes. Node and edge
        properties are copied, the indexer is shared.

        :param nodes: indices of the nodes to keep
        :type nodes: iterable on int
        :param name: label of the new graph
        :type name: str

        :rtype: :py:class:`Graph`

        :raises: :py:exc:`KeyError` if one index does not belong to the graph
        """
        sub = Graph(name,self._directed,self._index)
        if not isinstance(nodes, set):
            nodes = set(nodes)
        for n in nodes:
            sub._node[n]=nodes & self._node[n]
            if n in self._node_attrs:
                sub._node_attrs[n]=dict(self._node_attrs[n])
            for n2 in sub._node[n]:
                if (n,n2) not in self._edge_attrs:
                    continue
                if self._directed:
                    sub._edge_attrs[(n,n2)]=dict(self._edge_attrs[(n,n2)])
                elif n <= n2:
                    # one copy shared by both orientations
                    data=dict(self._edge_attrs[(n,n2)])
                    sub._edge_attrs[(n,n2)]=data
                    sub._edge_attrs[(n2,n)]=data
        return sub

    def __len__(self):
        return len(self._node)

    def __getitem__(self,key):
        return self.getNode(node=key)

    def __delitem__(self,key):
        self.delNode(node=key)

    def __iter__(self):
        return self.nodeIterator()

    def dot(self,nodePredicat=None,edgePredicat=None):
        """
        Describe the graph in the dot language.

        :param nodePredicat: a function selecting the nodes to describe
        :param edgePredicat: a function selecting the edges to describe.
                             If both predicates are specified an edge is
                             described only if it is selected and if it
                             links two selected nodes.

        :rtype: str
        """
        def combinedPredicat(edge):
            # edge.node1 and edge.node2 are already Node instances
            return (nodePredicat(edge.node1)
                    and nodePredicat(edge.node2)
                    and edgePredicat(edge))

        # A distinct name is used for the effective filter: rebinding
        # edgePredicat would make combinedPredicat call itself.
        if edgePredicat is not None and nodePredicat is not None:
            edgeFilter = combinedPredicat
        else:
            edgeFilter = edgePredicat

        if self._directed:
            kw ='digraph'
        else:
            kw='graph'

        nodes = "\n    ".join([str(x) for x in self.nodeIterator(nodePredicat)])
        edges = "\n    ".join([str(x) for x in self.edgeIterator(edgeFilter)])

        return "%s %s {\n    %s\n\n    %s\n}" % (kw,self._label,nodes,edges)

    def __str__(self):
        return self.dot()
+
class Node(object):
    """
    Class used for representing one node or vertex in a graph.

    A node is a light view on a graph: it only stores its index and a
    reference to the graph owning it. Properties and neighbours are
    read from the graph.
    """
    def __init__(self,index,graph):
        '''
        .. warning::

            :py:class:`Node` constructor is usually called through the :py:class:`Graph` methods

        :param index: Index of the node in the graph
        :type index: int
        :param graph: graph instance owning the node
        :type graph: :py:class:`obitools.graph.Graph`
        '''
        self.index = index
        self.__graph = graph

    def getGraph(self):
        '''
        return graph owning this node.

        :rtype: :py:class:`obitools.graph.Graph`
        '''
        return self.__graph

    def getLabel(self):
        '''
        return label associated to this node.
        '''
        return self.__graph._index.getLabel(self.index)

    def has_key(self,key):
        '''
        test if the node instance has a property named 'key'.

        :param key: the name of a property
        :type key: str

        :return: True if the node has a property named <key>
        :rtype: bool
        '''
        if self.index in self.__graph._node_attrs:
            return key in self.__graph._node_attrs[self.index]
        else:
            return False

    def neighbourIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        iterate through the nodes directly connected to
        this node.

        :param nodePredicat: a function accepting one node as parameter
                         and returning **True** if this node must be
                         returned by the iterator.
        :type nodePredicat: function

        :param edgePredicat: a function accepting one edge as parameter
                         and returning True if the edge linking self and
                         the current must be considered.
        :type edgePredicat: function

        :rtype: iterator on Node instances
        '''
        for n in self.neighbourIndexIterator(nodePredicat, edgePredicat):
            yield self.__graph.getNode(index=n)

    def neighbourIndexSet(self):
        '''
        Return a set of node indexes directly connected
        to this node.

        .. warning::

                do not change this set unless you know
                exactly what you do.

        :rtype: set of int
        '''
        return self.__graph._node[self.index]

    def neighbourIndexIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        iterate through the node indexes directly connected to
        this node.

        :param nodePredicat: a function accepting one node as parameter
                         and returning True if this node must be
                         returned by the iterator.
        :type nodePredicat: function

        :param edgePredicat: a function accepting one edge as parameter
                         and returning True if the edge linking self and
                         the current must be considered.
        :type edgePredicat: function

        :rtype: iterator on int
        '''
        for n in self.neighbourIndexSet():
            if nodePredicat is None or nodePredicat(self.__graph.getNode(index=n)):
                if edgePredicat is None or edgePredicat(self.__graph.getEdge(index1=self.index,index2=n)):
                    yield n

    def degree(self,nodeIndexes=None):
        '''
        return count of edges linking this node to the
        set of nodes described by their index in nodeIndexes

        :param nodeIndexes: set of node indexes.
                            if set to None, all nodes of the
                            graph are taken into account.
                            Set to None by default.
        :type nodeIndexes:  set of int

        :rtype: int
        '''
        if nodeIndexes is None:
            return len(self.__graph._node[self.index])
        else:
            return len(self.__graph._node[self.index] & nodeIndexes)

    def componentIndexSet(self,nodePredicat=None,edgePredicat=None):
        '''
        Return the set of node index in the same connected component.
        This node is always part of the result.

        :param nodePredicat: a function accepting one node as parameter
                         and returning True if this node must be
                         returned by the iterator.
        :type nodePredicat: function

        :param edgePredicat: a function accepting one edge as parameter
                         and returning True if the edge linking self and
                         the current must be considered.
        :type edgePredicat: function

        :rtype: set of int
        '''
        cc=set([self.index])
        added = set(self.neighbourIndexIterator(nodePredicat, edgePredicat))
        while added:
            cc |= added
            # neighbours of the nodes reached during the previous round
            reached = set()
            for c in added:
                reached.update(self.__graph.getNode(index=c).neighbourIndexIterator(nodePredicat, edgePredicat))
            added = reached - cc
        return cc

    def componentIterator(self,nodePredicat=None,edgePredicat=None):
        '''
        Iterate through the nodes in the same connected
        component.

        :rtype: iterator on :py:class:`Node` instance
        '''
        for c in self.componentIndexSet(nodePredicat, edgePredicat):
            # c is an index, not a label
            yield self.__graph.getNode(index=c)

    def shortestPathIterator(self,nodes=None):
        '''
        Iterate through the shortest path sourcing
        from this node. if nodes is not None, iterates
        only path linking this node to one node listed in
        nodes

        :param nodes: set of node index
        :type nodes: iterable on int

        :return: an iterator on list of int describing path, each path
                 starts with the index of this node
        :rtype: iterator on list of int
        '''
        from collections import deque

        if nodes is not None:
            nodes = set(nodes)

        # Breadth first traversal: the queue has to be consumed in FIFO
        # order, otherwise the first path reaching a node is not
        # guaranteed to be a shortest one.
        # Each item is (node index, index of its parent); -1 marks the
        # source, which has no parent.
        queue = deque([(self.index,-1)])

        gray = set([self.index])
        paths = {}

        while queue and (nodes is None or nodes):
            u,p = queue.popleft()
            paths[u]=p
            unseen = self.__graph._node[u] - gray
            gray|=unseen
            queue.extend((x,u) for x in unseen)
            if nodes is None or u in nodes:
                if nodes:
                    nodes.remove(u)
                # climb back from u to the source through the parents
                path = [u]
                while p >= 0:
                    path.append(p)
                    p = paths[p]
                path.reverse()
                yield path

    def shortestPathTo(self,node=None,index=None):
        '''
        return one of the shortest path linking this
        node to specified node.

        :param node: a node label or None
        :param index: a node index or None. the parameter index
                     has a priority on the parameter node.
        :type index: int

        :return: list of node index corresponding to the path or None
                 if no path exists.
        :rtype: list of int or None
        '''
        if index is None:
            index=self.__graph.getNode(node).index
        for p in self.shortestPathIterator([index]):
            return p
        return None

    def __getitem__(self,key):
        '''
        return the value of the <key> property of this node

        :param key: the name of a property
        :type key: str

        :raises: :py:exc:`KeyError` if the node has no such property
        '''
        return self.__graph._node_attrs.get(self.index,{})[key]

    def __setitem__(self,key,value):
        '''
        set the value of a node property. If the property doesn't
        already exist a new property is added to this node.

        :param key: the name of a property
        :type key: str
        :param value: the value of the property

        .. seealso::

            :py:meth:`Node.__getitem__`
        '''
        if self.index in self.__graph._node_attrs:
            data = self.__graph._node_attrs[self.index]
            data[key]=value
        else:
            self.__graph._node_attrs[self.index]={key:value}

    def __delitem__(self,key):
        '''
        delete a node property.

        :raises: :py:exc:`KeyError` if the node has no such property
        '''
        data = self.__graph._node_attrs[self.index]
        del data[key]

    def __len__(self):
        '''
        Count neighbour of this node

        :rtype: int

        .. seealso::

            :py:meth:`Node.degree`
        '''
        return len(self.__graph._node[self.index])

    def __iter__(self):
        '''
        iterate through neighbour of this node

        :rtype: iterator in :py:class:`Node` instances

        .. seealso::

            :py:meth:`Node.neighbourIterator`
        '''
        return self.neighbourIterator()

    def __contains__(self,key):
        return self.has_key(key)

    def __str__(self):
        '''
        describe the node in the dot language, double quotes and
        new lines being escaped in the label and in property values.
        '''
        if self.index in self.__graph._node_attrs:
            keys = " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"').replace('\n','\\n'))
                              for x in self.__graph._node_attrs[self.index].items()]
                           )
        else:
            keys=''

        return '%d  [label="%s" %s]' % (self.index,
                                        str(self.label).replace('"','\\"').replace('\n','\\n'),
                                        keys)

    def keys(self):
        '''
        return the names of the properties of this node.
        '''
        if self.index in self.__graph._node_attrs:
            k = self.__graph._node_attrs[self.index].keys()
        else:
            k=[]
        return k

    label = property(getLabel, None, None, "Label of the node")

    graph = property(getGraph, None, None, "Graph owning this node")
+
+
+
class Edge(object):
    """
    Class used for representing one edge of a graph.

    An edge is a light view on a graph: it only stores the two nodes
    it links. Properties are read from the graph owning the nodes.
    """

    def __init__(self,node1,node2):
        '''
        .. warning::

            :py:class:`Edge` constructor is usually called through the :py:class:`Graph` methods

        :param node1: First node linked by the edge
        :type node1: :py:class:`Node`
        :param node2: Second node linked by the edge
        :type node2: :py:class:`Node`
        '''
        self.node1 = node1
        self.node2 = node2

    def getGraph(self):
        """
        Return the :py:class:`Graph` instance owning this edge.
        """
        return self.node1.graph

    def has_key(self,key):
        '''
        test if the :py:class:`Edge` instance has a property named **key**.

        :param key: the name of a property
        :type key: str

        :return: True if the edge has a property named <key>
        :rtype: bool
        '''
        e = (self.node1.index,self.node2.index)
        if e in self.graph._edge_attrs:
            return key in self.graph._edge_attrs[e]
        else:
            return False

    def getDirected(self):
        """
        Return True if the graph owning this edge is directed.
        """
        return self.node1.graph._directed

    def __getitem__(self,key):
        '''
        return the value of the <key> property of this edge

        :raises: :py:exc:`KeyError` if the edge has no such property
        '''
        return self.graph._edge_attrs.get((self.node1.index,self.node2.index),{})[key]

    def __setitem__(self,key,value):
        '''
        set the value of an edge property. If the property doesn't
        already exist a new property is added to this edge.

        For undirected graphs the property dictionary is shared by both
        orientations of the edge, as :py:meth:`Graph.addEdge` does, so
        the property is visible whatever the order of the nodes.
        '''
        attrs = self.graph._edge_attrs
        e = (self.node1.index,self.node2.index)
        if e not in attrs:
            reverse = (self.node2.index,self.node1.index)
            if self.directed:
                attrs[e]={}
            elif reverse in attrs:
                # properties only known for the other orientation
                attrs[e]=attrs[reverse]
            else:
                attrs[e]=attrs[reverse]={}
        attrs[e][key]=value

    def __str__(self):
        '''
        describe the edge in the dot language, double quotes and
        new lines being escaped in property values.
        '''
        e = (self.node1.index,self.node2.index)
        if e in self.graph._edge_attrs:
            keys = "[%s]" % " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"').replace('\n','\\n'))
                                      for x in self.graph._edge_attrs[e].items()]
                                    )
        else:
            keys = ""

        if self.directed:
            link='->'
        else:
            link='--'

        return "%d %s %d %s" % (self.node1.index,link,self.node2.index,keys)

    def __contains__(self,key):
        return self.has_key(key)

    graph = property(getGraph, None, None, "Graph owning this edge")

    directed = property(getDirected, None, None, "True if the edge belongs to a directed graph")
+
+
class DiGraph(Graph):
    """
    :py:class:`DiGraph` class is a specialisation of the :py:class:`Graph`
    class dedicated to directed graph representation

    .. seealso::

        :py:class:`UndirectedGraph`

    """
    def __init__(self,label='G',indexer=None,nodes=None,edges=None):
        '''
        :param label: Graph name, set to 'G' by default
        :type label: str
        :param indexer: node label indexer
        :type indexer: Indexer instance
        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value
        :param edges: set of edges to add to the graph
        :type edges: iterable value
        '''
        # same as Graph, the directed flag being forced to True
        Graph.__init__(self,
                       label=label,
                       directed=True,
                       indexer=indexer,
                       nodes=nodes,
                       edges=edges)
+
class UndirectedGraph(Graph):
    """
    :py:class:`UndirectedGraph` class is a specialisation of the
    :py:class:`Graph` class dedicated to undirected graph representation

    .. seealso::

        :py:class:`DiGraph`

    """
    def __init__(self,label='G',indexer=None,nodes=None,edges=None):
        '''
        :param label: Graph name, set to 'G' by default
        :type label: str
        :param indexer: node label indexer
        :type indexer: Indexer instance
        :param nodes: set of nodes to add to the graph
        :type nodes: iterable value
        :param edges: set of edges to add to the graph
        :type edges: iterable value
        '''
        # same as Graph, the directed flag being forced to False
        Graph.__init__(self,
                       label=label,
                       directed=False,
                       indexer=indexer,
                       nodes=nodes,
                       edges=edges)
+
+
+
def selectEdgeAttributeFactory(attribut,value):
    """
    Build a predicate selecting the edges having a property named
    **attribut** equal to **value**. Such a predicate is usable in the
    following :py:class:`Graph` methods :

        - :py:meth:`Graph.edgeIterator`

    :param attribut: the name of the property to check
    :param value: the expected value of the property

    :return: a function accepting one edge and returning a boolean value
    """
    def selectEdge(e):
        # an edge without the property is never selected
        if attribut not in e:
            return False
        return e[attribut]==value
    return selectEdge
diff --git a/src/obitools/graph/algorithms/__init__.py b/src/obitools/graph/algorithms/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/graph/algorithms/clique.py b/src/obitools/graph/algorithms/clique.py
new file mode 100644
index 0000000..2007c1a
--- /dev/null
+++ b/src/obitools/graph/algorithms/clique.py
@@ -0,0 +1,134 @@
+import time
+import sys
+
+
+
+_maxsize=0
+_solution=0
+_notbound=0
+_sizebound=0
+_lastyield=0
+_maxclique=None
+
+def cliqueIterator(graph,minsize=1,node=None,timeout=None):
+ global _maxsize,_solution,_notbound,_sizebound,_lastyield
+ _maxsize=0
+ _solution=0
+ _notbound=0
+ _sizebound=0
+ starttime = time.time()
+
+ if node:
+ node = graph.getNode(node)
+ index = node.index
+ clique= set([index])
+ candidates= set(graph.neighbourIndexSet(index=index))
+ else:
+ clique=set()
+ candidates = set(x.index for x in graph)
+
+
+# candidates = set(x for x in candidates
+# if len(graph.neighbourIndexSet(index=x) & candidates) >= (minsize - 1))
+
+ _lastyield=time.time()
+ for c in _cliqueIterator(graph,clique,candidates,set(),minsize,start=starttime,timeout=timeout):
+ yield c
+
+
+
+
+
+def _cliqueIterator(graph,clique,candidates,notlist,minsize=0,start=None,timeout=None):
+ global _maxsize,_maxclique,_solution,_notbound,_sizebound,_lastyield
+
+ # Speed indicator
+ lclique = len(clique)
+ lcandidates = len(candidates)
+ notmin = lcandidates
+ notfix = None
+
+ for n in notlist:
+ nnc = candidates - graph.neighbourIndexSet(index=n)
+ nc = len(nnc)
+ if nc < notmin:
+ notmin=nc
+ notfix=n
+ notfixneib = nnc
+
+ if lclique > _maxsize or not _solution % 1000 :
+ if start is not None:
+ top = time.time()
+ delta = top - start
+ if delta==0:
+ delta=1e-6
+ speed = _solution / delta
+ start = top
+ else:
+ speed = 0
+ print >>sys.stderr,"\rCandidates : %-5d Maximum clique size : %-5d Solutions explored : %10d speed = %5.2f solutions/sec sizebound=%10d notbound=%10d " % (lcandidates,_maxsize,_solution,speed,_sizebound,_notbound),
+ sys.stderr.flush()
+ if lclique > _maxsize:
+ _maxsize=lclique
+
+# print >>sys.stderr,'koukou'
+
+ timer = time.time() - _lastyield
+
+ if not candidates and not notlist:
+ if lclique==_maxsize:
+ _maxclique=set(clique)
+ if lclique >= minsize:
+ yield set(clique)
+ if timeout is not None and timer > timeout and _maxclique is not None:
+ yield _maxclique
+ _maxclique=None
+
+ else:
+ while notmin and candidates and ((lclique + len(candidates)) >= minsize or (timeout is not None and timer > timeout)):
+ # count explored solution
+ _solution+=1
+
+ if notfix is None:
+ nextcandidate = candidates.pop()
+ else:
+ nextcandidate = notfixneib.pop()
+ candidates.remove(nextcandidate)
+
+ clique.add(nextcandidate)
+
+ neighbours = graph.neighbourIndexSet(index=nextcandidate)
+
+ nextcandidates = candidates & neighbours
+ nextnot = notlist & neighbours
+
+ nnc = candidates - neighbours
+ lnnc=len(nnc)
+
+ for c in _cliqueIterator(graph,
+ set(clique),
+ nextcandidates,
+ nextnot,
+ minsize,
+ start,
+ timeout=timeout):
+ yield c
+
+
+ clique.remove(nextcandidate)
+
+ notmin-=1
+
+ if lnnc < notmin:
+ notmin = lnnc
+ notfix = nextcandidate
+ notfixneib = nnc
+
+ if notmin==0:
+ _notbound+=1
+
+ notlist.add(nextcandidate)
+ else:
+ if (lclique + len(candidates)) < minsize:
+ _sizebound+=1
+
diff --git a/src/obitools/graph/algorithms/compact.py b/src/obitools/graph/algorithms/compact.py
new file mode 100644
index 0000000..8065a93
--- /dev/null
+++ b/src/obitools/graph/algorithms/compact.py
@@ -0,0 +1,8 @@
+
+def compactGraph(graph,nodeSetIterator):
+ compact = graph.newEmpty()
+ for ns in nodeSetIterator(graph):
+ nlabel = "\n".join([str(graph.getNode(index=x).label) for x in ns])
+ compact.addNode(nlabel)
+ print
+ print compact
diff --git a/src/obitools/graph/algorithms/component.py b/src/obitools/graph/algorithms/component.py
new file mode 100644
index 0000000..a17c8dd
--- /dev/null
+++ b/src/obitools/graph/algorithms/component.py
@@ -0,0 +1,82 @@
+"""
+Iterate through the connected components of a graph
+---------------------------------------------------
+
+The module :py:mod:`obitools.graph.algorithms.component` provides
+two functions to deal with the connected components of a graph
+represented as a :py:class:`obitools.graph.Graph` instance.
+
+The whole set of connected components of a graph is a partition of this graph,
+so a node cannot belong to two distinct connected components.
+
+Two nodes are in the same connected component if there exists a path through
+the graph edges linking them.
+
+TODO: There is certainly a bug with directed graphs (DiGraph)
+
+"""
+
+def componentIterator(graph,nodePredicat=None,edgePredicat=None):
+ '''
+ Build an iterator over the connected components of a graph.
+ Each connected component returned by the iterator is represented
+ as a `set` of node indices.
+
+ :param graph: the graph to partition
+ :type graph: :py:class:`obitools.graph.Graph`
+
+ :param nodePredicat: a function allowing node selection. Default value
+ is **None** and indicates that all nodes are selected.
+ :type nodePredicat: a function returning a boolean value
+ and accepting one argument of class :py:class:`Node`
+
+ :param edgePredicat: a function allowing edge selection. Default value
+ is **None** and indicates that all edges are selected.
+ :type edgePredicat: a function returning a boolean value
+ and accepting one argument of class :py:class:`Edge`
+
+ :return: an iterator over the connected component set
+ :rtype: an iterator over `set` of `int`
+
+ .. seealso::
+ the :py:meth:`obitools.graph.Graph.componentIndexSet` method
+ on which this function is based.
+ '''
+ seen = set()
+ for n in graph.nodeIterator(nodePredicat):
+ if n.index not in seen:
+ cc=n.componentIndexSet(nodePredicat, edgePredicat)
+ yield cc
+ seen |= cc
+
+def componentCount(graph,nodePredicat=None,edgePredicat=None):
+ '''
+ Count the connected components in a graph.
+
+ :param graph: the graph to partition
+ :type graph: :py:class:`obitools.graph.Graph`
+
+ :param nodePredicat: a function allowing node selection. Default value
+ is **None** and indicates that all nodes are selected.
+ :type nodePredicat: a function returning a boolean value
+ and accepting one argument of class :py:class:`Node`
+
+ :param edgePredicat: a function allowing edge selection. Default value
+ is **None** and indicates that all edges are selected.
+ :type edgePredicat: a function returning a boolean value
+ and accepting one argument of class :py:class:`Edge`
+
+ :return: the number of connected components in the graph
+ :rtype: `int`
+
+ .. seealso::
+ the :py:func:`componentIterator` function
+ on which this function is based.
+ '''
+ n=0
+ for c in componentIterator(graph,nodePredicat, edgePredicat):
+ n+=1
+ return n
+
+
+
\ No newline at end of file
diff --git a/src/obitools/graph/dag.py b/src/obitools/graph/dag.py
new file mode 100644
index 0000000..c4e8d13
--- /dev/null
+++ b/src/obitools/graph/dag.py
@@ -0,0 +1,99 @@
+from obitools.graph import DiGraph,Node
+from obitools.graph.algorithms.component import componentIterator
+
+class DAG(DiGraph):
+ def __init__(self,label='G',indexer=None,nodes=None,edges=None):
+ '''
+ Directed Graph constructor.
+
+ @param label: Graph name, set to 'G' by default
+ @type label: str
+ @param indexer: node label indexer
+ @type indexer: Indexer instance
+ @param nodes: set of nodes to add to the graph
+ @type nodes: iterable value
+ @param edges: set of edges to add to the graph
+ @type edges: iterable value
+ '''
+
+ self._parents={}
+ DiGraph.__init__(self, label, indexer, nodes, edges)
+
+ def getNode(self,node=None,index=None):
+ if index is None:
+ index = self._index.getIndex(node, True)
+ return DAGNode(index,self)
+
+ def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
+ index1=self.addNode(node1, index1)
+ index2 =self.addNode(node2, index2)
+
+ pindex = set(n.index
+ for n in self.getNode(index=index1).ancestorIterator())
+
+ assert index2 not in pindex,'Child node cannot be a parent node'
+
+ DiGraph.addEdge(self,index1=index1,index2=index2,**data)
+
+ if index2 in self._parents:
+ self._parents[index2].add(index1)
+ else:
+ self._parents[index2]=set([index1])
+
+
+ return (index1,index2)
+
+ def getRoots(self):
+ '''
+ Return the list of all roots of the DAG (i.e. nodes without parent)
+
+ @return: a list of DAGNode
+ '''
+ return [x for x in self.nodeIterator(lambda n : n.index not in self._parents)]
+
+ def getLeaves(self):
+ '''
+ Return the list of all leaves of the DAG (i.e. nodes without child)
+
+ @return: a list of DAGNode
+ '''
+ return [x for x in self.nodeIterator(lambda n : not n.neighbourIndexSet())]
+
+
+
+
+class DAGNode(Node):
+
+ def getParents(self):
+ if self.index in self.graph._parents:
+ return [DAGNode(p,self.graph) for p in self.graph._parents[self.index]]
+ else:
+ return []
+
+ def ancestorIterator(self):
+ if self.index in self.graph._parents:
+ for p in self.graph._parents[self.index]:
+ parent = DAGNode(p,self.graph)
+ yield parent
+ for pnode in parent.ancestorIterator():
+ yield pnode
+
+ def getRoot(self):
+ x=self
+ for x in self.ancestorIterator():
+ pass
+ return x
+
+ def leavesIterator(self):
+ if not self:
+ yield self
+ for n in self:
+ for nn in n.leavesIterator():
+ yield nn
+
+ def subgraphIterator(self):
+ yield self
+ for n in self:
+ for nn in n.subgraphIterator():
+ yield nn
+
diff --git a/src/obitools/graph/layout/__init__.py b/src/obitools/graph/layout/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/graph/layout/radialtree.py b/src/obitools/graph/layout/radialtree.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/graph/rootedtree.py b/src/obitools/graph/rootedtree.py
new file mode 100644
index 0000000..aaad598
--- /dev/null
+++ b/src/obitools/graph/rootedtree.py
@@ -0,0 +1,115 @@
+from obitools.graph.dag import DAG,DAGNode
+
+class RootedTree(DAG):
+
+ def addEdge(self,parent=None,node=None,indexp=None,index=None,**data):
+ indexp=self.addNode(parent, indexp)
+ index =self.addNode(node , index)
+
+ assert index not in self._parents or indexp in self._parents[index], \
+ 'Child node cannot have more than one parent node'
+
+ return DAG.addEdge(self,indexp=indexp,index=index,**data)
+
+ def getNode(self,node=None,index=None):
+ if index is None:
+ index = self._index.getIndex(node, True)
+ return RootedTreeNode(index,self)
+
+
+
+class RootedTreeNode(DAGNode):
+
+ def subTreeSize(self):
+ n=1
+ for subnode in self:
+ n+=subnode.subTreeSize()
+ return n
+
+ def subTreeLeaves(self):
+ if not self:
+ return 1
+ n=0
+ for subnode in self:
+ n+=subnode.subTreeLeaves()
+ return n
+
+
+def nodeWriter(node,deep=0,label=None,distance="distance", bootstrap="bootstrap",cartoon=None,collapse=None):
+
+ ks = node.keys()
+
+
+ if label is None:
+ name=node.label
+ elif callable(label):
+ name=label(node)
+ elif isinstance(label, str) and label in node:
+ name=node[label]
+ ks.remove(label)
+ else:
+ name=''
+
+ if distance in node:
+ dist=':%6.5f' % node[distance]
+ ks.remove(distance)
+ else:
+ dist=''
+
+ ks = ["%s=%s" % (k,node[k]) for k in ks]
+
+ if cartoon is not None and cartoon(node):
+ ks.append("!cartoon={%d,0.0}" % node.subTreeLeaves())
+
+ if collapse is not None and collapse(node):
+ ks.append('!collapse={"collapsed",0.0}')
+
+ if ks:
+ ks="[&"+",".join(ks)+"]"
+ else:
+ ks=''
+
+
+ nodeseparator = ',\n' + ' ' * (deep+1)
+
+ subnodes = nodeseparator.join([nodeWriter(x, deep+1,label,distance,bootstrap,cartoon=cartoon,collapse=collapse)
+ for x in node])
+ if subnodes:
+ subnodes='(\n' + ' ' * (deep+1) + subnodes + '\n' + ' ' * deep + ')'
+
+ return '%s"%s"%s%s' % (subnodes,name,ks,dist)
+
+
+def nexusFormat(tree,startnode=None,label=None,blocks="",cartoon=None,collapse=None):
+ head="#NEXUS\n"
+
+ tx = []
+
+ for n in tree:
+ if label is None:
+ name=n.label
+ elif callable(label):
+ name=label(n)
+ elif isinstance(label, str) and label in n:
+ name=n[label]
+ else:
+ name=''
+
+ if name:
+ tx.append('"%s"' % name)
+
+ taxa = "begin taxa;\n\tdimensions ntax=%d;\n\ttaxlabels\n\t" % len(tx)
+
+ taxa+="\n\t".join(tx)
+
+ taxa+="\n;\nend;\n\n"
+
+
+
+ if startnode is not None:
+ roots =[startnode]
+ else:
+ roots = tree.getRoots()
+ trees = nodeWriter(roots[0],0,label,cartoon=cartoon,collapse=collapse)
+ trees = "begin trees;\n\ttree tree_1 = [&R] "+ trees +";\nend;\n\n"
+ return head+taxa+trees+"\n\n"+blocks+"\n"
diff --git a/src/obitools/graph/tree.py b/src/obitools/graph/tree.py
new file mode 100644
index 0000000..940ee44
--- /dev/null
+++ b/src/obitools/graph/tree.py
@@ -0,0 +1,37 @@
+from obitools.graph import UndirectedGraph,Node
+from obitools.graph.algorithms.component import componentCount
+
+
+class Forest(UndirectedGraph):
+
+
+ def getNode(self,node=None,index=None):
+ if index is None:
+ index = self._index.getIndex(node, True)
+ return TreeNode(index,self)
+
+ def addEdge(self,node1=None,node2=None,index1=None,index2=None,**data):
+ index1=self.addNode(node1, index1)
+ index2=self.addNode(node2, index2)
+
+ cc = set(n.index for n in self.getNode(index=index2).componentIterator())
+
+ assert index1 in self._node[index2] or index1 not in cc, \
+ "No more than one path is alloed between two nodes in a tree"
+
+ UndirectedGraph.addEdge(self, index1=index1, index2=index2,**data)
+
+ return (index1,index2)
+
+ def isASingleTree(self):
+ return componentCount(self)==1
+
+class TreeNode(Node):
+
+ def componentIterator(self):
+ for c in self:
+ yield c
+ for cc in c:
+ yield cc
+
+
\ No newline at end of file
diff --git a/src/obitools/gzip.py b/src/obitools/gzip.py
new file mode 100644
index 0000000..841641a
--- /dev/null
+++ b/src/obitools/gzip.py
@@ -0,0 +1,504 @@
+"""Functions that read and write gzipped files.
+
+The user of the file doesn't have to worry about the compression,
+but random access is not allowed.
+
+This consists of a patched version of the standard Python gzip
+module, based on Andrew Kuchling's minigzip.py distributed with the zlib module
+
+"""
+
+# based on Andrew Kuchling's minigzip.py distributed with the zlib module
+
+import struct, sys, time
+import zlib
+import __builtin__
+
+__all__ = ["GzipFile","open"]
+
+FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
+
+READ, WRITE = 1, 2
+
+def U32(i):
+ """Return i as an unsigned integer, assuming it fits in 32 bits.
+
+ If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
+ """
+ if i < 0:
+ i += 1L << 32
+ return i
+
+def LOWU32(i):
+ """Return the low-order 32 bits of an int, as a non-negative int."""
+ return i & 0xFFFFFFFFL
+
+def write32(output, value):
+ output.write(struct.pack("<l", value))
+
+def write32u(output, value):
+ # The L format writes the bit pattern correctly whether signed
+ # or unsigned.
+ output.write(struct.pack("<L", value))
+
+def read32(input):
+ return struct.unpack("<l", input.read(4))[0]
+
+def unpack32(buf):
+ return struct.unpack("<l", buf)[0]
+
+def open(filename, mode="rb", compresslevel=9):
+ """Shorthand for GzipFile(filename, mode, compresslevel).
+
+ The filename argument is required; mode defaults to 'rb'
+ and compresslevel defaults to 9.
+
+ """
+ return GzipFile(filename, mode, compresslevel)
+
+class GzipFile:
+ """The GzipFile class simulates most of the methods of a file object with
+ the exception of the readinto() and truncate() methods.
+
+ """
+
+ myfileobj = None
+ max_read_chunk = 10 * 1024 * 1024 # 10Mb
+
+ def __init__(self, filename=None, mode=None,
+ compresslevel=9, fileobj=None):
+ """Constructor for the GzipFile class.
+
+ At least one of fileobj and filename must be given a
+ non-trivial value.
+
+ The new class instance is based on fileobj, which can be a regular
+ file, a StringIO object, or any other object which simulates a file.
+ It defaults to None, in which case filename is opened to provide
+ a file object.
+
+ When fileobj is not None, the filename argument is only used to be
+ included in the gzip file header, which may include the original
+ filename of the uncompressed file. It defaults to the filename of
+ fileobj, if discernible; otherwise, it defaults to the empty string,
+ and in this case the original filename is not included in the header.
+
+ The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
+ depending on whether the file will be read or written. The default
+ is the mode of fileobj if discernible; otherwise, the default is 'rb'.
+ Be aware that only the 'rb', 'ab', and 'wb' values should be used
+ for cross-platform portability.
+
+ The compresslevel argument is an integer from 1 to 9 controlling the
+ level of compression; 1 is fastest and produces the least compression,
+ and 9 is slowest and produces the most compression. The default is 9.
+
+ """
+
+ # guarantee the file is opened in binary mode on platforms
+ # that care about that sort of thing
+ if mode and 'b' not in mode:
+ mode += 'b'
+ if fileobj is None:
+ fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
+ if filename is None:
+ if hasattr(fileobj, 'name'): filename = fileobj.name
+ else: filename = ''
+ if mode is None:
+ if hasattr(fileobj, 'mode'): mode = fileobj.mode
+ else: mode = 'rb'
+
+ if mode[0:1] == 'r':
+ self.mode = READ
+ # Set flag indicating start of a new member
+ self._new_member = True
+ self.extrabuf = ""
+ self.extrasize = 0
+ self.filename = filename
+ # Starts small, scales exponentially
+ self.min_readsize = 100
+
+ elif mode[0:1] == 'w' or mode[0:1] == 'a':
+ self.mode = WRITE
+ self._init_write(filename)
+ self.compress = zlib.compressobj(compresslevel,
+ zlib.DEFLATED,
+ -zlib.MAX_WBITS,
+ zlib.DEF_MEM_LEVEL,
+ 0)
+ else:
+ raise IOError, "Mode " + mode + " not supported"
+
+ self.fileobj = fileobj
+ self.offset = 0
+ self.inputbuf = ''
+ self.last8 = ''
+
+ if self.mode == WRITE:
+ self._write_gzip_header()
+
+ def __repr__(self):
+ s = repr(self.fileobj)
+ return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
+
+ def _init_write(self, filename):
+ if filename[-3:] != '.gz':
+ filename = filename + '.gz'
+ self.filename = filename
+ self.crc = zlib.crc32("")
+ self.size = 0
+ self.writebuf = []
+ self.bufsize = 0
+
+ def _write_gzip_header(self):
+ self.fileobj.write('\037\213') # magic header
+ self.fileobj.write('\010') # compression method
+ fname = self.filename[:-3]
+ flags = 0
+ if fname:
+ flags = FNAME
+ self.fileobj.write(chr(flags))
+ write32u(self.fileobj, long(time.time()))
+ self.fileobj.write('\002')
+ self.fileobj.write('\377')
+ if fname:
+ self.fileobj.write(fname + '\000')
+
+ def _init_read(self):
+ self.crc = zlib.crc32("")
+ self.size = 0
+
+ def _read_internal(self, size):
+ if len(self.inputbuf) < size:
+ self.inputbuf += self.fileobj.read(size-len(self.inputbuf))
+ chunk = self.inputbuf[:size]
+ # need to use len(chunk) below instead of size in case it's EOF.
+ if len(chunk) < 8:
+ self.last8 = self.last8[len(chunk):] + chunk
+ else:
+ self.last8 = chunk[-8:]
+ self.inputbuf = self.inputbuf[size:]
+ return chunk
+
+ def _read_gzip_header(self):
+ magic = self._read_internal(2)
+ if len(magic) != 2:
+ raise EOFError, "Reached EOF"
+ if magic != '\037\213':
+ raise IOError, 'Not a gzipped file'
+ method = ord( self._read_internal(1) )
+ if method != 8:
+ raise IOError, 'Unknown compression method'
+ flag = ord( self._read_internal(1) )
+ # modtime = self.fileobj.read(4)
+ # extraflag = self.fileobj.read(1)
+ # os = self.fileobj.read(1)
+ self._read_internal(6)
+
+ if flag & FEXTRA:
+ # Read & discard the extra field, if present
+ xlen = ord(self._read_internal(1))
+ xlen = xlen + 256*ord(self._read_internal(1))
+ self._read_internal(xlen)
+ if flag & FNAME:
+ # Read and discard a null-terminated string containing the filename
+ while True:
+ s = self._read_internal(1)
+ if not s or s=='\000':
+ break
+ if flag & FCOMMENT:
+ # Read and discard a null-terminated string containing a comment
+ while True:
+ s = self._read_internal(1)
+ if not s or s=='\000':
+ break
+ if flag & FHCRC:
+ self._read_internal(2) # Read & discard the 16-bit header CRC
+
+
+ def write(self,data):
+ if self.mode != WRITE:
+ import errno
+ raise IOError(errno.EBADF, "write() on read-only GzipFile object")
+
+ if self.fileobj is None:
+ raise ValueError, "write() on closed GzipFile object"
+ if len(data) > 0:
+ self.size = self.size + len(data)
+ self.crc = zlib.crc32(data, self.crc)
+ self.fileobj.write( self.compress.compress(data) )
+ self.offset += len(data)
+
+ def read(self, size=-1):
+ if self.mode != READ:
+ import errno
+ raise IOError(errno.EBADF, "read() on write-only GzipFile object")
+
+ if self.extrasize <= 0 and self.fileobj is None:
+ return ''
+
+ readsize = 1024
+ if size < 0: # get the whole thing
+ try:
+ while True:
+ self._read(readsize)
+ readsize = min(self.max_read_chunk, readsize * 2)
+ except EOFError:
+ size = self.extrasize
+ else: # just get some more of it
+ try:
+ while size > self.extrasize:
+ self._read(readsize)
+ readsize = min(self.max_read_chunk, readsize * 2)
+ except EOFError:
+ if size > self.extrasize:
+ size = self.extrasize
+
+ chunk = self.extrabuf[:size]
+ self.extrabuf = self.extrabuf[size:]
+ self.extrasize = self.extrasize - size
+
+ self.offset += size
+ return chunk
+
+ def _unread(self, buf):
+ self.extrabuf = buf + self.extrabuf
+ self.extrasize = len(buf) + self.extrasize
+ self.offset -= len(buf)
+
+ def _read(self, size=1024):
+ if self.fileobj is None:
+ raise EOFError, "Reached EOF"
+
+ if self._new_member:
+ # If the _new_member flag is set, we have to
+ # jump to the next member, if there is one.
+ #
+ # _read_gzip_header will raise an EOFError exception
+ # if there are no more members to read.
+ self._init_read()
+ self._read_gzip_header()
+ self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
+ self._new_member = False
+
+ # Read a chunk of data from the file
+ buf = self._read_internal(size)
+
+ # If the EOF has been reached, flush the decompression object
+ # and mark this object as finished.
+
+ if buf == "":
+ uncompress = self.decompress.flush()
+ self._read_eof()
+ self._add_read_data( uncompress )
+ raise EOFError, 'Reached EOF'
+
+ uncompress = self.decompress.decompress(buf)
+ self._add_read_data( uncompress )
+
+ if self.decompress.unused_data != "":
+ # Ending case: we've come to the end of a member in the file,
+ # so put back unused_data and initialize last8 by reading them.
+ self.inputbuf = self.decompress.unused_data + self.inputbuf
+ self._read_internal(8)
+
+ # Check the CRC and file size, and set the flag so we read
+ # a new member on the next call
+ self._read_eof()
+ self._new_member = True
+
+ def _add_read_data(self, data):
+ self.crc = zlib.crc32(data, self.crc)
+ self.extrabuf = self.extrabuf + data
+ self.extrasize = self.extrasize + len(data)
+ self.size = self.size + len(data)
+
+ def _read_eof(self):
+ # We've read to the end of the file, so we have to rewind in order
+ # to reread the 8 bytes containing the CRC and the file size.
+ # We check that the computed CRC and size of the
+ # uncompressed data match the stored values. Note that the size
+ # stored is the true file size mod 2**32.
+ crc32 = unpack32(self.last8[:4])
+ isize = U32(unpack32(self.last8[4:])) # may exceed 2GB
+ if U32(crc32) != U32(self.crc):
+ raise IOError, "CRC check failed"
+ elif isize != LOWU32(self.size):
+ raise IOError, "Incorrect length of data produced"
+
+ def close(self):
+ if self.mode == WRITE:
+ self.fileobj.write(self.compress.flush())
+ # The native zlib crc is an unsigned 32-bit integer, but
+ # the Python wrapper implicitly casts that to a signed C
+ # long. So, on a 32-bit box self.crc may "look negative",
+ # while the same crc on a 64-bit box may "look positive".
+ # To avoid irksome warnings from the `struct` module, force
+ # it to look positive on all boxes.
+ write32u(self.fileobj, LOWU32(self.crc))
+ # self.size may exceed 2GB, or even 4GB
+ write32u(self.fileobj, LOWU32(self.size))
+ self.fileobj = None
+ elif self.mode == READ:
+ self.fileobj = None
+ if self.myfileobj:
+ self.myfileobj.close()
+ self.myfileobj = None
+
+ def __del__(self):
+ try:
+ if (self.myfileobj is None and
+ self.fileobj is None):
+ return
+ except AttributeError:
+ return
+ self.close()
+
+ def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
+ if self.mode == WRITE:
+ # Ensure the compressor's buffer is flushed
+ self.fileobj.write(self.compress.flush(zlib_mode))
+ self.fileobj.flush()
+
+ def fileno(self):
+ """Invoke the underlying file object's fileno() method.
+
+ This will raise AttributeError if the underlying file object
+ doesn't support fileno().
+ """
+ return self.fileobj.fileno()
+
+ def isatty(self):
+ return False
+
+ def tell(self):
+ return self.offset
+
+ def rewind(self):
+ '''Return the uncompressed stream file position indicator to the
+ beginning of the file'''
+ if self.mode != READ:
+ raise IOError("Can't rewind in write mode")
+ self.fileobj.seek(0)
+ self._new_member = True
+ self.extrabuf = ""
+ self.extrasize = 0
+ self.offset = 0
+
+ def seek(self, offset):
+ if self.mode == WRITE:
+ if offset < self.offset:
+ raise IOError('Negative seek in write mode')
+ count = offset - self.offset
+ for i in range(count // 1024):
+ self.write(1024 * '\0')
+ self.write((count % 1024) * '\0')
+ elif self.mode == READ:
+ if offset < self.offset:
+ # for negative seek, rewind and do positive seek
+ self.rewind()
+ count = offset - self.offset
+ for i in range(count // 1024):
+ self.read(1024)
+ self.read(count % 1024)
+
+ def readline(self, size=-1):
+ if size < 0:
+ size = sys.maxint
+ readsize = self.min_readsize
+ else:
+ readsize = size
+ bufs = []
+ while size != 0:
+ c = self.read(readsize)
+ i = c.find('\n')
+
+ # We set i=size to break out of the loop under two
+ # conditions: 1) there's no newline, and the chunk is
+ # larger than size, or 2) there is a newline, but the
+ # resulting line would be longer than 'size'.
+ if (size <= i) or (i == -1 and len(c) > size):
+ i = size - 1
+
+ if i >= 0 or c == '':
+ bufs.append(c[:i + 1]) # Add portion of last chunk
+ self._unread(c[i + 1:]) # Push back rest of chunk
+ break
+
+ # Append chunk to list, decrease 'size',
+ bufs.append(c)
+ size = size - len(c)
+ readsize = min(size, readsize * 2)
+ if readsize > self.min_readsize:
+ self.min_readsize = min(readsize, self.min_readsize * 2, 512)
+ return ''.join(bufs) # Return resulting line
+
+ def readlines(self, sizehint=0):
+ # Negative numbers result in reading all the lines
+ if sizehint <= 0:
+ sizehint = sys.maxint
+ L = []
+ while sizehint > 0:
+ line = self.readline()
+ if line == "":
+ break
+ L.append(line)
+ sizehint = sizehint - len(line)
+
+ return L
+
+ def writelines(self, L):
+ for line in L:
+ self.write(line)
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ line = self.readline()
+ if line:
+ return line
+ else:
+ raise StopIteration
+
+
+def _test():
+ # Act like gzip; with -d, act like gunzip.
+ # The input file is not deleted, however, nor are any other gzip
+ # options or features supported.
+ args = sys.argv[1:]
+ decompress = args and args[0] == "-d"
+ if decompress:
+ args = args[1:]
+ if not args:
+ args = ["-"]
+ for arg in args:
+ if decompress:
+ if arg == "-":
+ f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
+ g = sys.stdout
+ else:
+ if arg[-3:] != ".gz":
+ print "filename doesn't end in .gz:", repr(arg)
+ continue
+ f = open(arg, "rb")
+ g = __builtin__.open(arg[:-3], "wb")
+ else:
+ if arg == "-":
+ f = sys.stdin
+ g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
+ else:
+ f = __builtin__.open(arg, "rb")
+ g = open(arg + ".gz", "wb")
+ while True:
+ chunk = f.read(1024)
+ if not chunk:
+ break
+ g.write(chunk)
+ if g is not sys.stdout:
+ g.close()
+ if f is not sys.stdin:
+ f.close()
+
+if __name__ == '__main__':
+ _test()
diff --git a/src/obitools/interactive/__init__.py b/src/obitools/interactive/__init__.py
new file mode 100644
index 0000000..0911cce
--- /dev/null
+++ b/src/obitools/interactive/__init__.py
@@ -0,0 +1,30 @@
+from obitools import bioSeqGenerator as __bioSeqGenerator
+from obitools import BioSequence
+from obitools.fasta import formatFasta
+
+__anonymous_seq__=0
+
+class InteractiveBioseqProxy:
+ def __init__(self,bio):
+ assert(isinstance(bio, BioSequence))
+ self._reference=bio
+
+ def __repr__(self):
+ return formatFasta(self._reference)
+
+
+
+ def __getattr__(self,key):
+ return getattr(self._reference,key)
+
+def bioseq(seq,id=None,definition=None):
+ global __anonymous_seq__
+
+ if id is None:
+ __anonymous_seq__+=1
+ id='seq%05d' % __anonymous_seq__
+
+ if definition is None:
+ definition=""
+
+ return InteractiveBioseqProxy(__bioSeqGenerator(id,seq,definition))
diff --git a/src/obitools/location/__init__.py b/src/obitools/location/__init__.py
new file mode 100644
index 0000000..26a347c
--- /dev/null
+++ b/src/obitools/location/__init__.py
@@ -0,0 +1,547 @@
+import obitools
+import re
+import array
+
class Location(object):
    """
    Base class of every location that can be defined on a sequence.

    The predicates and accessors declared here return C{None}; concrete
    subclasses override them.
    """

    def extractSequence(self,sequence):
        '''
        Cut out the part of C{sequence} covered by this location.

        A plain C{str} gives back a plain C{str}. Any other accepted input
        gives back a new C{AASequence} or C{NucSequence} carrying the id,
        definition and tags of the source, plus a C{location} tag, a
        refreshed C{length} tag when the source had one, and the matching
        quality values when the source has a C{quality} attribute.

        @param sequence: the sequence to cut
        @type sequence: C{BioSequence} or C{str}
        '''
        assert isinstance(sequence, (obitools.BioSequence,str)), \
            "sequence must be an instance of str or BioSequence"

        if isinstance(sequence, str):
            return self._extractSequence(sequence)

        isProtein = isinstance(sequence, obitools.AASequence)
        if isProtein:
            assert not self.needNucleic(), \
                "This location can be used only with Nucleic sequences"

        raw = self._extractSequence(str(sequence))

        if isProtein:
            factory = obitools.AASequence
        else:
            factory = obitools.NucSequence

        sub = factory(sequence.id,
                      raw,
                      sequence.definition,
                      **sequence.getTags())
        sub['location'] = str(self)

        if 'length' in sequence.getTags():
            sub['length'] = len(sub)

        if hasattr(sequence, 'quality'):
            sub.quality = self._extractQuality(sequence)

        return sub

    def isDirect(self):
        return None

    def isSimple(self):
        '''
        Tell whether the location is one continuous region rather than
        several locations assembled with the C{join} operator.

        @return: C{True} for a single continuous region.
        @rtype: bool
        '''
        return None

    def isFullLength(self):
        return None

    def needNucleic(self):
        '''
        A location containing a complement operator is only meaningful on
        a nucleic sequence.

        @return: C{True} if the location contains a complement operator
        @rtype: bool
        '''
        return None

    def getGloc(self):
        '''
        Render the simplified location as C{"(T|F,pos,pos,...)"} where the
        first field tells whether the location is direct.
        '''
        loc = self.simplify()
        assert loc.isDirect() is not None,"Gloc cannot be created for multi oriented location : %s" % str(loc)
        strand = {True:'T',False:'F'}[loc.isDirect()]
        positions = ','.join([str(p) for p in loc._getglocpos()])
        return "(%s,%s)" % (strand,positions)

    def shift(self,s):
        return None

    def getBegin(self):
        return None

    def getEnd(self):
        return None

    def getFivePrime(self):
        return self.getBegin()

    def getThreePrime(self):
        return self.getEnd()

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")
    fivePrime=property(getFivePrime,None,None,"5' position of the location")
    threePrime=property(getThreePrime,None,None,"3' position of the location")

    def __abs__(self):
        '''
        Return the location itself when it is direct, otherwise its
        simplified complement.
        '''
        assert self.isDirect() is not None,"Abs operator cannot be applied on non oriented location"
        if not self.isDirect():
            return ComplementLocation(self).simplify()
        return self

    def __cmp__(self,y):
        '''
        Order locations by begin position; on a tie a direct location
        sorts before a non direct one.
        '''
        if self.begin < y.begin:
            return -1
        if self.begin > y.begin:
            return 1
        mine = self.isDirect()
        theirs = y.isDirect()
        if mine == theirs:
            return 0
        if mine and not theirs:
            return -1
        return 1
+
class SimpleLocation(Location):
    """
    A simple location describes one continuous region of a sequence,
    defined by a C{begin} and an C{end} position (1-based, inclusive).

    The C{before} / C{after} flags record the C{<} and C{>} markers of a
    partial location.
    """

    def __init__(self,begin,end):
        '''
        Build a new C{SimpleLocation} instance. Valid positions are defined
        on M{[1,N]} with N the length of the sequence.

        @param begin: start position of the location
        @type begin: int
        @param end: end position of the location
        @type end: int
        '''
        assert begin > 0 and end > 0

        self._begin = begin
        self._end = end
        self._before=False
        self._after=False

    def _checkBounds(self,length):
        # Both ends must lie inside the sequence.  The begin test is
        # inclusive: a location may start on the very last residue.
        # AssertionError is kept as the failure type because the feature
        # table iterator catches it to skip broken features.
        assert ( self._begin <= length
                 and self._end <= length), \
               "Sequence length %d is too short" % length

    def _extractSequence(self,sequence):
        self._checkBounds(len(sequence))
        return sequence[self._begin-1:self._end]

    def _extractQuality(self,sequence):
        self._checkBounds(len(sequence))
        return sequence.quality[self._begin-1:self._end]

    def isDirect(self):
        return True

    def isSimple(self):
        return True

    def isFullLength(self):
        return not (self.before or self.after)

    def simplify(self):
        '''
        A one-residue region is turned into a C{PointLocation}; any other
        region is returned unchanged.
        '''
        if self._begin == self._end:
            return PointLocation(self._begin)
        else:
            return self

    def needNucleic(self):
        return False

    def __str__(self):
        before = {True:'<',False:''}[self.before]
        after = {True:'>',False:''}[self.after]
        return "%s%d..%s%d" % (before,self._begin,after,self._end)

    def shift(self,s):
        '''
        Return the location translated by C{s} positions.

        @param s: offset to add to both ends
        @type s: int
        '''
        assert (self._begin + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        moved = SimpleLocation(self._begin + s, self._end + s)
        # a translated partial location is still partial
        moved._before = self._before
        moved._after = self._after
        return moved

    def _getglocpos(self):
        return (self.begin,self.end)

    def getGloc(self):
        positions = ','.join([str(x) for x in self._getglocpos()])
        return "(%s,%s)" % ({True:'T',False:'F'}[self.isDirect()],
                            positions)

    def getBegin(self):
        return self._begin

    def getEnd(self):
        return self._end

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")

    def getBefore(self):
        return self._before

    def getAfter(self):
        return self._after

    def setBefore(self,value):
        assert isinstance(value, bool)
        self._before=value

    def setAfter(self,value):
        assert isinstance(value, bool)
        self._after=value

    before=property(getBefore,setBefore,None)
    after=property(getAfter,setAfter,None)
+
+
+
+
class PointLocation(Location):
    """
    A point location describes a location on a sequence
    limited to a single position (1-based).
    """

    def __init__(self,position):
        assert position > 0
        self._pos=position

    def _extractSequence(self,sequence):
        # the instance only stores _pos; there is no _end attribute
        assert self._pos <= len(sequence), \
               "Sequence length %d is too short" % len(sequence)

        return sequence[self._pos-1]

    def _extractQuality(self,sequence):
        assert self._pos <= len(sequence), \
               "Sequence length %d is too short" % len(sequence)

        # A one-element slice of the quality values, as the other location
        # classes do, so composite locations can extend() their result.
        return sequence.quality[self._pos-1:self._pos]

    def isDirect(self):
        return True

    def isSimple(self):
        return True

    def isFullLength(self):
        return True

    def simplify(self):
        return self

    def needNucleic(self):
        return False

    def shift(self,s):
        '''
        Return the position translated by C{s}.

        @param s: offset to add to the position
        @type s: int
        '''
        assert (self._pos + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        return PointLocation(self._pos + s)

    def _getglocpos(self):
        return (self._pos,self._pos)

    def getBegin(self):
        return self._pos

    def getEnd(self):
        return self._pos

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")

    def __str__(self):
        return str(self._pos)
+
class CompositeLocation(Location):
    """
    A location made of an ordered series of sub-locations, rendered with
    the C{join(...)} operator.
    """
    def __init__(self,locations):
        self._locs = tuple(locations)

    def _extractSequence(self,sequence):
        # sub-sequences are concatenated in declaration order
        return ''.join([x._extractSequence(sequence)
                        for x in self._locs])

    def _extractQuality(self,sequence):
        rep=array.array('d',[])
        for x in self._locs:
            rep.extend(x._extractQuality(sequence))
        return rep

    def isDirect(self):
        '''
        @return: C{True} if every member is direct, C{False} if every
                 member is reverse, C{None} otherwise (mixed orientation,
                 undetermined member or no member at all).
        '''
        orientations = set(z.isDirect() for z in self._locs)
        if orientations == set([True]):
            return True
        if orientations == set([False]):
            return False
        return None

    def isSimple(self):
        return False

    def simplify(self):
        '''
        Simplify every member. A composite of a single member collapses to
        that member; a composite made only of complemented members is
        rewritten as the complement of the reversed composite.
        '''
        if len(self._locs)==1:
            return self._locs[0].simplify()

        simplified = tuple(x.simplify() for x in self._locs)

        # The test is done on the simplified members: those are the objects
        # whose _loc attribute is read below.
        if simplified and all(isinstance(z, ComplementLocation)
                              for z in simplified):
            return ComplementLocation(
                        self.__class__(z._loc.simplify()
                                       for z in reversed(simplified)))

        # self.__class__ keeps an order(...) location an order(...)
        return self.__class__(simplified)

    def isFullLength(self):
        return all(z.isFullLength() for z in self._locs)

    def needNucleic(self):
        # needNucleic must be called: a bound method is always true
        return any(z.needNucleic() for z in self._locs)

    def _getglocpos(self):
        positions = ()
        for z in self._locs:
            positions = positions + tuple(z._getglocpos())
        return positions

    def getBegin(self):
        return min(x.getBegin() for x in self._locs)

    def getEnd(self):
        return max(x.getEnd() for x in self._locs)

    def shift(self,s):
        '''
        Return the location with every member translated by C{s}.

        @param s: offset to add to every position
        @type s: int
        '''
        assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        return self.__class__(x.shift(s) for x in self._locs)

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")

    def __str__(self):
        return "join(%s)" % ','.join([str(x)
                                      for x in self._locs])
+
class CompositeLocationOrder(CompositeLocation):
    """
    Composite location rendered with the C{order(...)} operator instead
    of C{join(...)}.
    """

    def __str__(self):
        members = [str(x) for x in self._locs]
        return "order(%s)" % ','.join(members)
+
class ComplementLocation(Location):
    """
    The reverse complement of another location.
    """

    # complement of each (lower case) nucleotide code
    _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
           'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
           's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
           'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
           '-': '-'}

    def __init__(self,location):
        self._loc = location

    def _extractSequence(self,sequence):
        # extract the wrapped region, then reverse-complement it; the
        # result is lower case and unknown symbols become 'n'
        seq = self._loc._extractSequence(sequence)
        seq = ''.join([ComplementLocation._comp.get(x.lower(),'n') for x in seq[::-1]])
        return seq

    def _extractQuality(self,sequence):
        # Only the quality values of the wrapped region, in reverse order,
        # so they stay aligned with _extractSequence().
        return self._loc._extractQuality(sequence)[::-1]

    def isDirect(self):
        return False

    def isSimple(self):
        return self._loc.isSimple()

    def isFullLength(self):
        return self._loc.isFullLength()

    def simplify(self):
        '''
        The complement of a complement collapses to the (simplified)
        inner location; anything else is returned unchanged.
        '''
        if isinstance(self._loc, ComplementLocation):
            return self._loc._loc.simplify()
        else:
            return self

    def needNucleic(self):
        return True

    def __str__(self):
        return "complement(%s)" % self._loc

    def shift(self,s):
        '''
        Return the complement of the wrapped location translated by C{s}.

        @param s: offset to add to every position
        @type s: int
        '''
        assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        return ComplementLocation(self._loc.shift(s))

    def _getglocpos(self):
        return self._loc._getglocpos()

    def getBegin(self):
        return self._loc.getBegin()

    def getEnd(self):
        return self._loc.getEnd()

    def getFivePrime(self):
        # on the reverse strand the 5' end is the highest coordinate
        return self.getEnd()

    def getThreePrime(self):
        return self.getBegin()

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")
    fivePrime=property(getFivePrime,None,None,"5' position of the location")
    threePrime=property(getThreePrime,None,None,"3' position of the location")
+
+
+ #
+ # Internal functions used for location parsing
+ #
+
def __sublocationIterator(text):
    """
    Split C{text} on the commas that are not enclosed in parentheses and
    yield each piece.

    Raises C{AssertionError} on an empty piece or on unbalanced
    parentheses.
    """
    message = "Misformated location : %s" % text
    depth = 0
    current = []
    for char in text:
        assert depth>=0,message
        if char == ',' and depth == 0:
            # top-level separator: emit what was accumulated so far
            assert current,message
            yield ''.join(current)
            current = []
            continue
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
        current.append(char)
    assert current and depth==0,message
    yield ''.join(current)
+
+
+
+ #
+ # Internal functions used for location parsing
+ #
+
# begin[..end] with the optional '<' / '>' partial markers
__simplelocparser = re.compile(r'(?P<before><?)(?P<from>[0-9]+)(\.\.(?P<after>>?)(?P<to>[0-9]+))?')


def __locationParser(text):
    """
    Recursively turn the text of a location into C{Location} objects.

    Handles the C{join(...)}, C{order(...)} and C{complement(...)}
    operators; anything else is matched as C{begin} or C{begin..end}.
    Raises C{AssertionError} on text it cannot interpret.
    """
    text=text.strip()
    malformed = "Misformated location : %s" % text

    if text.startswith('join('):
        assert text[-1]==')',malformed
        return CompositeLocation(__locationParser(sl) for sl in __sublocationIterator(text[5:-1]))

    if text.startswith('order('):
        assert text[-1]==')',malformed
        return CompositeLocationOrder(__locationParser(sl) for sl in __sublocationIterator(text[6:-1]))

    if text.startswith('complement('):
        assert text[-1]==')',malformed
        inner = tuple(__locationParser(sl) for sl in __sublocationIterator(text[11:-1]))
        # several comma separated members are complemented as one composite
        if len(inner)>1:
            target = CompositeLocation(inner)
        else:
            target = inner[0]
        return ComplementLocation(target)

    data = __simplelocparser.match(text)
    assert data is not None,malformed
    fields = data.groupdict()
    if fields['to']:
        sl = SimpleLocation(int(fields['from']),int(fields['to']))
    else:
        sl = PointLocation(int(fields['from']))
    # NOTE(review): the nesting of the next two statements cannot be
    # recovered from the whitespace-collapsed text; they are kept at
    # function level, so a point location also receives the flags.
    sl.before=fields['before']=='<'
    sl.after=fields['after']=='>'
    return sl
+
def locationGenerator(locstring):
    '''
    Public entry point of the location parser: build the C{Location}
    described by a genbank / embl location string.

    @param locstring: location written in embl/gb syntax
    @type locstring: str

    @return: the parsed location
    @rtype: C{Location} subclass instance
    '''
    location = __locationParser(locstring)
    return location
+
+
# an accession (optionally versioned) immediately followed by ':'
_matchExternalRef = re.compile(r'[A-Za-z0-9_|]+(\.[0-9]+)?(?=:)')

def extractExternalRefs(locstring):
    '''
    Separate the external references of a location (ex: the C{D28156.1}
    of C{D28156.1:1..>1292}) from the location itself.

    @param locstring: text representation of the location.
    @type locstring: str

    @return: the set of referenced accession strings and the C{Location}
             parsed from the text once every C{accession:} prefix has
             been removed.
    @rtype: tuple(set,Location)
    '''
    refs = set()
    for hit in _matchExternalRef.finditer(locstring):
        refs.add(hit.group())

    # one alternative per accession, each followed by its ':' separator
    prefixes = ':|'.join([re.escape(r) for r in refs])+':'
    stripped = re.sub(prefixes,'',locstring)

    return refs,locationGenerator(stripped)
+
+
+
+
+
diff --git a/src/obitools/location/feature.py b/src/obitools/location/feature.py
new file mode 100644
index 0000000..89a183f
--- /dev/null
+++ b/src/obitools/location/feature.py
@@ -0,0 +1,177 @@
+from obitools.location import Location,locationGenerator
+import logging
+import re
+
+
+
+
+_featureMatcher = re.compile('^(FT| ) [^ ].+\n((FT| ) .+\n)+',re.M)
+_featureCleaner = re.compile('^FT',re.M)
+
+
def textFeatureIterator(fttable):
    '''
    Walk through the text of a genbank or embl feature table and yield
    the text of each individual feature, with the C{FT} line prefixes
    substituted by the cleaner pattern.

    @param fttable: feature table of a genbank or an embl entry
    @type fttable: C{str}

    @return: an iterator on str
    @rtype: iterator

    @see: L{ftParser}
    '''
    for hit in _featureMatcher.finditer(fttable):
        yield _featureCleaner.sub(' ',hit.group())
+
# a qualifier starts with '/' in column 22 and may continue on the
# following lines as long as they do not open a new qualifier
_qualifierMatcher = re.compile('(?<=^ {21}/).+(\n {21}[^/].+)*',re.M)
_qualifierCleanner= re.compile("^ +",re.M)

def qualifierIterator(qualifiers):
    '''
    Parse the qualifier part of a feature (embl or genbank format), as
    returned by C{ftParser}, and iterate through its (key, value) pairs.

    A qualifier without C{=} yields C{(key, None)}. Line breaks inside a
    value are replaced by a space, except for C{translation} where they
    are removed. A value written as a Python literal (quoted string,
    number) is converted to the corresponding object; any other value is
    kept as text.

    @param qualifiers: substring containing qualifiers
    @type qualifiers: str

    @return: an iterator on tuple (key,value), where keys are C{str}
    @rtype: iterator
    '''
    import ast

    for m in _qualifierMatcher.finditer(qualifiers):
        t = _qualifierCleanner.sub('',m.group())
        parts = t.split('=',1)
        if len(parts)==1:
            yield (parts[0],None)
            continue

        key,raw = parts
        if key=='translation':
            value = raw.replace('\n','')
        else:
            value = raw.replace('\n',' ')

        # SECURITY: this used to be a bare eval() of text read from the
        # flat file, which executes arbitrary expressions and resolves
        # bare words against local and builtin names.
        # ast.literal_eval only accepts literals, which is all that is
        # needed to unquote strings and convert numbers.
        try:
            value = ast.literal_eval(value)
        except (ValueError,SyntaxError,TypeError):
            pass
        yield (key,value)
+
+
_ftmatcher = re.compile(r'(?<=^ {5})\S+')
_locmatcher= re.compile('(?<=^.{21})[^/]+',re.DOTALL)
_cleanloc = re.compile('[\\s\n]+')
_qualifiersMatcher = re.compile('^ +/.+',re.M+re.DOTALL)

def ftParser(feature):
    '''
    Cut the text of one feature into its three parts.

    @param feature: text of a single feature
    @type feature: str

    @return: the feature type, the location text with all white space
             removed, and the raw qualifier text (empty string when the
             feature has no qualifier).
    @rtype: tuple(str,str,str)
    '''
    fttype = _ftmatcher.search(feature).group()
    location = _cleanloc.sub('',_locmatcher.search(feature).group())

    found = _qualifiersMatcher.search(feature)
    if found is None:
        qualifiers=""
        logging.debug("Qualifiers regex not matching on \n=====\n%s\n========" % feature)
    else:
        qualifiers=found.group()

    return fttype,location,qualifiers
+
+
class Feature(dict,Location):
    '''
    A feature of a feature table: a feature type and a location, plus the
    qualifiers stored as the items of the dictionary itself.

    Every location related method is delegated to the wrapped location.
    '''
    def __init__(self,type,location):
        self._fttype=type
        self._loc=location

    def getFttype(self):
        return self._fttype

    def extractSequence(self,sequence,withQualifier=False):
        '''
        Extract the subsequence covered by the feature.

        @param sequence: the sequence to cut
        @param withQualifier: when true, the qualifiers of the feature are
                              copied into the info of the extracted sequence
        @type withQualifier: bool
        '''
        seq = self._loc.extractSequence(sequence)
        if withQualifier:
            seq.getInfo().update(self)
        return seq

    def isDirect(self):
        return self._loc.isDirect()

    def isSimple(self):
        return self._loc.isSimple()

    def isFullLength(self):
        return self._loc.isFullLength()

    def simplify(self):
        '''
        Return a new feature with the same type and qualifiers whose
        location is simplified.
        '''
        f = Feature(self._fttype,self._loc.simplify())
        f.update(self)
        return f

    def locStr(self):
        return str(self._loc)

    def needNucleic(self):
        return self._loc.needNucleic()

    def __str__(self):
        return repr(self)

    def __repr__(self):
        return str((self.ftType,str(self._loc),dict.__repr__(self)))

    def __cmp__(self,y):
        return self._loc.__cmp__(y)

    def _getglocpos(self):
        return self._loc._getglocpos()

    ftType = property(getFttype, None, None, "Feature type name")

    def shift(self,s):
        '''
        Return a new feature with the same type and qualifiers whose
        location is translated by C{s} (the feature itself when C{s} is 0).
        '''
        assert (self.getBegin() + s) > 0,"shift to large (%d)" % s
        if s == 0:
            return self
        f = Feature(self._fttype,self._loc.shift(s))
        f.update(self)
        return f

    def getBegin(self):
        return self._loc.getBegin()

    def getEnd(self):
        return self._loc.getEnd()

    # The 5' / 3' ends depend on the strand, which only the wrapped
    # location knows: delegate instead of inheriting the begin/end
    # based defaults of Location.
    def getFivePrime(self):
        return self._loc.getFivePrime()

    def getThreePrime(self):
        return self._loc.getThreePrime()

    begin = property(getBegin,None,None,"beginning position of the location")
    end = property(getEnd,None,None,"ending position of the location")
    fivePrime = property(getFivePrime,None,None,"5' position of the location")
    threePrime = property(getThreePrime,None,None,"3' position of the location")
+
+
def featureFactory(featureDescription):
    '''
    Build a C{Feature} from the text of a single feature.

    The original text is kept in the C{raw} attribute of the feature and
    every qualifier key maps to the list of its values, in order of
    appearance.
    '''
    fttype,locText,qualText = ftParser(featureDescription)
    feature = Feature(fttype,locationGenerator(locText))
    feature.raw = featureDescription

    for key,value in qualifierIterator(qualText):
        if key not in feature:
            feature[key] = []
        feature[key].append(value)

    return feature
+
def featureIterator(featureTable,skipError=False):
    '''
    Iterate over the C{Feature} objects described by a feature table.

    @param featureTable: text of the feature table
    @type featureTable: str
    @param skipError: when true, a feature whose parsing fails with an
                      C{AssertionError} is logged and skipped instead of
                      aborting the iteration
    @type skipError: bool

    @raise AssertionError: on a parsing error when C{skipError} is false
    '''
    for tft in textFeatureIterator(featureTable):
        try:
            feature = featureFactory(tft)
        except AssertionError:
            logging.debug("Parsing error on feature :\n===============\n%s\n===============" % tft)
            if not skipError:
                # bare raise keeps the traceback of the parsing error
                raise
            logging.debug("\t===> Error skipped")
            continue

        yield feature
+
\ No newline at end of file
diff --git a/src/obitools/metabarcoding/__init__.py b/src/obitools/metabarcoding/__init__.py
new file mode 100644
index 0000000..1a88003
--- /dev/null
+++ b/src/obitools/metabarcoding/__init__.py
@@ -0,0 +1,301 @@
+from obitools.ecopcr.options import addTaxonomyFilterOptions,\
+ loadTaxonomyDatabase
+from obitools.graph import UndirectedGraph
+from obitools.align import lenlcs,isLCSReachable
+from obitools.graph.algorithms.component import componentIterator
+from obitools.utils.bioseq import uniqSequence
+from obitools.utils import progressBar
+import math
+import sys
+from obitools.graph.rootedtree import RootedTree
+
def average(x):
    '''
    Weighted mean of an iterable of (value, weight) pairs.

    :return: the tuple (weighted mean as `float`, sum of the weights)
    '''
    weighted = 0
    total = 0
    for value,weight in list(x):
        weighted += value*weight
        total += weight
    return (float(weighted)/float(total),total)
+
def minimum(x):
    '''
    Smallest value of an iterable of (value, weight) pairs.

    :return: the tuple (smallest value as `float`, sum of the weights)
    '''
    pairs = list(x)
    values = [p[0] for p in pairs]
    weights = [p[1] for p in pairs]
    return (float(min(values)),sum(weights))
+
def ecoPCRReader(entries,options):
    '''
    Sort the entries carrying a usable taxid into an ingroup and an
    outgroup, and select the taxonomic ranks frequent enough in the
    ingroup.

    An entry is kept only if it has a `taxid` key, the taxid is known to
    the taxonomy and its lineage holds a species, a genus and a family.
    It goes to the ingroup when `options.ingroup` is an ancestor of its
    taxid, to the outgroup otherwise.

    :param entries: an iterator over the entries to analyze
    :type entries: an iterable element
    :param options: the option structure return by the option manager
    :type options: object

    :return: a `dict` with the keys `ingroup` and `outgroup` (lists of
             entries) and `ranks` (set of rank ids)
    :rtype: `dict`
    '''

    taxonomy = loadTaxonomyDatabase(options)

    # NOTE(review): the lookups below go through options.taxonomy while the
    # membership / ancestor tests use the local `taxonomy`; presumably
    # loadTaxonomyDatabase() also stores its result on options -- confirm.
    norankid =options.taxonomy.findRankByName('no rank')
    speciesid=options.taxonomy.findRankByName('species')
    genusid =options.taxonomy.findRankByName('genus')
    familyid =options.taxonomy.findRankByName('family')

    #
    # to be used a sequence must have at least
    # a species a genus and a family
    #

    minrankseq = set([speciesid,genusid,familyid])

    # rank id -> number of ingroup entries whose lineage holds that rank
    usedrankid = {}

    ingroup = []
    outgroup= []

    # these three counters are updated but never read in this function
    totalentries = 0
    entrieswithtaxid = 0
    goodtaxid = 0

    for s in entries:
        totalentries+=1
        if 'taxid' in s :
            entrieswithtaxid+=1
            taxid = s['taxid']
            if taxid in taxonomy:
                goodtaxid+=1
                # every rank of the lineage except 'no rank'
                allrank = set()
                for p in options.taxonomy.parentalTreeIterator(taxid):
                    if p[1]!=norankid:
                        allrank.add(p[1])
                if len(minrankseq & allrank) == 3:
                    if taxonomy.isAncestor(options.ingroup,taxid):
                        for r in allrank:
                            usedrankid[r]=usedrankid.get(r,0) + 1
                        ingroup.append(s)
                    else:
                        outgroup.append(s)

    # usedrankid is empty whenever ingroup is empty, so the division
    # below is never evaluated with a zero denominator
    keptrank = set(r for r in usedrankid
                   if float(usedrankid[r])/float(len(ingroup)) > options.rankthresold)

    return { 'ingroup' : ingroup,   # The group of interest
             'outgroup': outgroup,  # all other taxa
             'ranks' : keptrank     # the ranks to analyze (more frequent than options.rankthresold)
           }
+
def buildSimilarityGraph(dbseq,ranks,taxonomy,dcmax=5):
    '''
    Build the graph linking the sequences that differ by at most `dcmax`.

    One node is created per sequence, indexed by its position in `dbseq`
    and annotated with the sequence, its taxid and, for each rank id of
    `ranks`, the result of `taxonomy.getTaxonAtRank`. An edge carrying
    `ds1s2` (alignment length minus LCS length) is added between two
    sequences when that value is not greater than `dcmax`.

    :param dbseq: the sequences; indexed and measured with `len`
    :param ranks: the rank ids to annotate on each node
    :param taxonomy: object providing `getTaxonAtRank`
    :param dcmax: maximum number of differences for an edge
    :type dcmax: `int`

    :return: the similarity graph
    :rtype: :py:class:`UndirectedGraph`
    '''

    ldbseq = len(dbseq)
    pos = 1
    # NOTE(review): math.log10 raises ValueError when dbseq is empty
    digit = int(math.ceil(math.log10(ldbseq)))
    header = "Alignment : %%0%dd x %%0%dd -> %%0%dd " % (digit,digit,digit)
    # NOTE(review): the loops below visit ldbseq*(ldbseq-1)/2 pairs, so
    # `pos` never reaches this total -- confirm the intended count.
    aligncount = ldbseq*(ldbseq+1)/2
    edgecount = 0
    print >>sys.stderr

    progressBar(1,aligncount,True,"Alignment : %s x %s -> %s " % ('-'*digit,'-'*digit, '0'*digit))

    sim = UndirectedGraph()

    i=0
    for s in dbseq:
        taxid = s['taxid']

        rtaxon = dict((rid,taxonomy.getTaxonAtRank(taxid,rid))
                      for rid in ranks)

        sim.addNode(i, seq=s,taxid=taxid,rtaxon=rtaxon)

        i+=1

    for is1 in xrange(ldbseq):
        s1 = dbseq[is1]
        ls1= len(s1)

        for is2 in xrange(is1+1,ldbseq):

            s2=dbseq[is2]
            ls2=len(s2)

            lm = max(ls1,ls2)
            lcsmin = lm - dcmax

            # cheap filter before the real LCS computation
            if isLCSReachable(s1,s2,lcsmin):
                llcs,lali=lenlcs(s1,s2)
                ds1s2 = lali - llcs

                if ds1s2 <= dcmax:
                    sim.addEdge(node1=is1, node2=is2,ds1s2=ds1s2,label=ds1s2)
                    edgecount+=1

        # NOTE(review): the original indentation is lost; the progress
        # update is placed once per row because `pos` advances by the
        # number of pairs of the row.  `is2` is undefined here when the
        # inner loop never ran (single sequence) -- confirm.
        progressBar(pos,aligncount,head=header % (is1,is2,edgecount))
        pos+=(ldbseq-is1-1)

    return sim
+
def buildTsr(component):
    '''
    Count, for each taxonomic rank, the taxa carried by the nodes of a
    connected component.

    :param component: the analyzed connected component
    :type component: :py:class:`UndirectedGraph`

    :return: a dictionary indexed by rankid; each value is a `dict`
             indexed by taxid holding the number of nodes with this taxid.
             Nodes without taxon (`None`) at a rank are not counted.
    :rtype: `dict` of `dict` of `int`
    '''
    taxalist = {}
    for node in component:
        rtaxon = node['rtaxon']
        for rank in rtaxon:
            taxid = rtaxon[rank]
            if taxid is None:
                continue
            counts = taxalist.setdefault(rank,{})
            counts[taxid] = counts.get(taxid,0)+1

    return taxalist
+
def edgeDistSelector(dcmax):
    '''
    Return a predicate accepting the edges whose `ds1s2` value is not
    greater than `dcmax`.
    '''
    return lambda e: e['ds1s2'] <= dcmax
+
def distanceOfConfusion(simgraph,dcmax=5,aggregate=average):
    '''
    Compute a distance of confusion for the taxa annotated on a
    similarity graph.

    The graph is split into connected components using only the edges
    with `ds1s2 <= dcmax`. A (taxid, count) pair found alone at its rank
    inside a component is recorded with the score `dcmax+1`; components
    holding several taxa at some rank are split again with `dcmax-1`,
    down to 0, where the taxa that are still mixed are recorded with the
    score 0.

    :param simgraph: the similarity graph
    :param dcmax: the largest edge distance considered
    :type dcmax: `int`
    :param aggregate: function applied to the list of (score, count)
                      pairs collected for each taxid

    :return: a `dict` indexed by taxid holding the aggregated value
    :rtype: `dict`
    '''

    alltaxa = set()

    for n in simgraph:
        alltaxa|=set(n['rtaxon'].values())

    taxacount = len(alltaxa)

    # taxid -> list of (score, count); filled by _idc()
    result = {}

    # one-element list so the nested function can update the value
    pos = [1]
    header = "Component : %-5d Identified : %-8d "
    progressBar(1,taxacount,True,header % (0,0))

    # NOTE(review): the nesting inside _idc is reconstructed from
    # whitespace-collapsed text -- confirm against the upstream source.
    def _idc(cc,dcmax):
        # split every graph of cc into its components at distance dcmax
        composante=[]
        for x in cc:
            composante.extend(simgraph.subgraph(c)
                              for c in componentIterator(x,
                                                         edgePredicat=edgeDistSelector(dcmax)))

        good = set()     # (taxid, count) pairs alone at their rank
        bad = {}         # taxid -> count, for taxa sharing a component

        complexe = []    # components to split further

        for c in composante:
            tsr = buildTsr(c)
            newbad=False
            for r in tsr:
                if len(tsr[r]) == 1:
                    taxid = tsr[r].keys()[0]
                    good.add((taxid,tsr[r][taxid]))
                else:
                    newbad=True
                    for taxid in tsr[r]:
                        bad[taxid]=bad.get(taxid,0)+tsr[r][taxid]
            if newbad:
                complexe.append(c)

        for taxid,weight in good:
            if taxid not in result:
                result[taxid]=[]
            result[taxid].append((dcmax+1,weight))

        progressBar(pos[0],taxacount,False,header % (len(composante),pos[0]))
        pos[0]=len(result)

        if dcmax > 0:
            dcmax-=1
            _idc(complexe,dcmax)

        else:
            # last level: what is still mixed gets the score 0
            for taxid in bad:
                if taxid not in result:
                    result[taxid]=[]
                result[taxid].append((0,bad[taxid]))

            progressBar(pos[0],taxacount,False,header % (len(composante),pos[0]))
            pos[0]=len(result)

    _idc([simgraph],dcmax)

    for taxid in result:
        result[taxid]=aggregate(result[taxid])
    return result
+
def propagateDc(tree,node=None,aggregate=min):
    '''
    Set the `dc` value of a node, then of all its descendants, to the
    aggregation of the `dc` values of the leaves found below it.

    :param tree: the tree; its first root is used when `node` is `None`
    :param node: the node to start from
    :param aggregate: function applied to the iterator of leaf values
    '''
    current = tree.getRoots()[0] if node is None else node
    current['dc'] = aggregate(leaf['dc'] for leaf in current.leavesIterator())
    for child in current:
        propagateDc(tree, child, aggregate)
+
def confusionTree(distances,ranks,taxonomy,aggregate=min,bsrank='species',dcmax=1):
    '''
    Build a rooted tree of the taxa listed in `distances` and annotate it.

    One node is created per taxon with its rank, scientific name, `dc`
    value, count `n` and a printable `dc_label`. Each taxon is linked to
    the first of its ancestors that is also a key of `distances`. The
    nodes having children then receive `bs`, the fraction of the nodes of
    rank `bsrank` in their subgraph whose `dc` is at least `dcmax`, and a
    printable `bs_label`.

    :param distances: `dict` indexed by taxid holding (dc, count) pairs
    :type distances: `dict`
    :param ranks: converted to a `set` but not used otherwise
    :type ranks: iterable
    :param taxonomy: object providing `getRank`, `getScientificName`
                     and `parentalTreeIterator`
    :type taxonomy:
    :param aggregate: not used by this function
    :type aggregate:
    :param bsrank: name of the rank on which `bs` is computed
    :type bsrank: `str`
    :param dcmax: threshold applied to `dc` when computing `bs`
    :type dcmax: `int`

    :return: the annotated tree
    :rtype: :py:class:`RootedTree`
    '''

    # NOTE(review): the nesting inside Bs is reconstructed from
    # whitespace-collapsed text -- confirm against the upstream source.
    def Bs(node,rank,dcmax):
        n = len(node)
        if n:
            # NOTE(review): the filter uses the enclosing `bsrank`, not the
            # `rank` parameter; both hold the same value for the only call
            # made below.  The division fails when no node of the subgraph
            # has that rank (n == 0) -- confirm whether this can happen.
            g = [int(x['dc']>=dcmax) for x in node.subgraphIterator() if x['rank']==bsrank]
            n = len(g)
            g = sum(g)
            bs= float(g)/float(n)
            node['bs']=bs
            node['bs_label']="%3.2f (%d)" % (bs,n)

            for n in node:
                Bs(n,rank,dcmax)

    tree = RootedTree()
    ranks = set(ranks)
    # taxa that have not been linked to a parent yet
    tset = set(distances)

    for taxon in distances:
        tree.addNode(taxon, rank=taxonomy.getRank(taxon),
                     name=taxonomy.getScientificName(taxon),
                     dc=float(distances[taxon][0]),
                     n=distances[taxon][1],
                     dc_label="%4.2f (%d)" % (float(distances[taxon][0]),distances[taxon][1])
                     )

    for taxon in distances:
        # walk up the lineage; the first item of the iterator is taken as
        # the starting point, the following ones as candidate parents
        piter = taxonomy.parentalTreeIterator(taxon)
        taxon = piter.next()
        for parent in piter:
            if taxon[0] in tset and parent[0] in distances:
                tset.remove(taxon[0])
                tree.addEdge(parent[0], taxon[0])
            taxon=parent

    root = tree.getRoots()[0]
    Bs(root,bsrank,dcmax)

    return tree
diff --git a/src/obitools/metabarcoding/options.py b/src/obitools/metabarcoding/options.py
new file mode 100644
index 0000000..08ff423
--- /dev/null
+++ b/src/obitools/metabarcoding/options.py
@@ -0,0 +1,34 @@
+'''
+Created on 30 oct. 2011
+
+ at author: coissac
+'''
+
+from obitools.ecopcr.options import addTaxonomyDBOptions
+
+
def addMetabarcodingOption(optionManager):
    '''
    Register the taxonomy database options, then the three options
    specific to metabarcoding: ``--dcmax``, ``--ingroup`` and
    ``--rank-thresold``.

    :param optionManager: the option manager to extend
    '''

    addTaxonomyDBOptions(optionManager)

    specs = (
        ('--dcmax',
         dict(dest="dc",
              metavar="###",
              type="int",
              default=0,
              help="Maximum confusion distance considered")),
        ('--ingroup',
         dict(dest="ingroup",
              metavar="###",
              type="int",
              default=1,
              help="ncbi taxid delimitation the in group")),
        ('--rank-thresold',
         dict(dest="rankthresold",
              metavar="#.##",
              type="float",
              default=0.5,
              help="minimum fraction of the ingroup sequences "
                   "for concidering the rank")),
    )

    for flag,settings in specs:
        optionManager.add_option(flag, action="store", **settings)
diff --git a/src/obitools/obischemas/__init__.py b/src/obitools/obischemas/__init__.py
new file mode 100644
index 0000000..6bcafde
--- /dev/null
+++ b/src/obitools/obischemas/__init__.py
@@ -0,0 +1,28 @@
+from obitools.obischemas import kb
+__connection__ = None
+
def initConnection(options):
    '''
    Open the module-wide database connection from the command line
    options.

    Only the options that are present and not C{None} are forwarded to
    C{kb.getConnection}. The C{autocommit} flag of the connection is set
    from C{options.autocommit} (false when the option is absent).

    @param options: the option structure returned by the option manager
    '''
    global __connection__

    # option attribute -> connection keyword.  The user name keyword is
    # 'user': 'username' is not a connection parameter known to
    # psycopg2 / libpq.
    mapping = (("dbname",     "database"),
               ("dbhost",     "host"),
               ("dbuser",     "user"),
               ("dbpassword", "password"))

    param = {}
    for attribute,keyword in mapping:
        value = getattr(options, attribute, None)
        if value is not None:
            param[keyword]=value

    __connection__=kb.getConnection(**param)
    __connection__.autocommit=getattr(options, "autocommit", False)
+
def getConnection(options=None):
    '''
    Return the module-wide database connection, (re)initializing it first
    when C{options} is given.

    @raise AssertionError: if no connection has been initialized
    '''
    global __connection__

    if options is not None:
        initConnection(options)

    connection = __connection__
    assert connection is not None,"database connection is not initialized"

    return connection
+
\ No newline at end of file
diff --git a/src/obitools/obischemas/kb/__init__.py b/src/obitools/obischemas/kb/__init__.py
new file mode 100644
index 0000000..7d35dcb
--- /dev/null
+++ b/src/obitools/obischemas/kb/__init__.py
@@ -0,0 +1,55 @@
+"""
+ kb package is devoted to manage access to postgresql database from python
+ script
+"""
+
+
class Connection(object):
    """
    Abstract interface of a database connection. Every method except
    C{__call__} raises C{RuntimeError} and has to be overridden.
    """

    def __init__(self):
        raise RuntimeError('pyROM.KB.Connection is an abstract class')

    def cursor(self):
        raise RuntimeError('pyROM.KB.Connection.cursor is an abstract function')

    def commit(self):
        raise RuntimeError('pyROM.KB.Connection.commit is an abstract function')

    def rollback(self):
        raise RuntimeError('pyROM.KB.Connection.rollback is an abstract function')

    def __call__(self,query):
        """Run C{query} on a fresh cursor and return its result."""
        cursor = self.cursor()
        return cursor.execute(query)
+
+
class Cursor(object):
    """
    Abstract interface of a database cursor. C{__init__} and C{execute}
    raise C{RuntimeError} and have to be overridden; calling a cursor is
    the same as calling its C{execute} method.
    """

    def __init__(self,db):
        raise RuntimeError('pyROM.KB.Cursor is an abstract class')

    def execute(self,query):
        raise RuntimeError('pyROM.KB.Cursor.execute is an abstract function')

    def __call__(self,query):
        # Dispatch at call time: a class-level alias (__call__=execute)
        # would stay bound to this abstract method in every subclass.
        return self.execute(query)
+
+
_current_connection = None # Static variable used to store connection to KB

def getConnection(*args,**kargs):
    """
    Return a connection to the database.

    When called from a database backend no argument is needed. The
    connection is created on the first call and cached; later calls
    without argument return the cached connection, while a call with
    arguments creates (and caches) a new one.

    The backend implementation is tried first; when it cannot be
    imported, the external (psycopg2 based) implementation is used and
    receives the arguments.
    """
    global _current_connection

    if _current_connection is None or args or kargs :
        # NOTE(review): this module is shipped as obitools.obischemas.kb but
        # the imports below name a top-level 'obischemas' package --
        # confirm that such a package is importable at run time.
        try:
            from obischemas.kb import backend
            _current_connection = backend.Connection()
        except ImportError:
            from obischemas.kb import extern
            _current_connection = extern.Connection(*args,**kargs)
    return _current_connection
+
+
diff --git a/src/obitools/obischemas/kb/extern.py b/src/obitools/obischemas/kb/extern.py
new file mode 100644
index 0000000..ce2ff84
--- /dev/null
+++ b/src/obitools/obischemas/kb/extern.py
@@ -0,0 +1,78 @@
+"""
+Module : KB.extern
+Author : Eric Coissac
+Date : 03/05/2004
+
+Module wrapping psycopg interface module to allow connection
+to a postgresql databases with the same interface from
+backend and external script.
+
+This module define a class usable from external script
+"""
+
+
+import psycopg2
+import sys
+from obischemas import kb
+
class Connection(kb.Connection):
    """
    Connection to a postgresql database through psycopg2, usable from an
    external script.
    """

    def __init__(self,*connectParam,**kconnectParam):
        """
        Open the connection.

        Positional arguments are taken as the fragments of a DSN string
        and joined with a space; otherwise the keyword arguments are
        passed as they are to C{psycopg2.connect}.
        """
        if connectParam:
            # assignment, not comparison; and the DSN must be a string,
            # not the tuple of positional arguments
            self.connectParam={'dsn':' '.join([str(p) for p in connectParam])}
        else:
            self.connectParam=kconnectParam
        # The connection parameters are no longer printed: they may
        # contain the database password.
        self.db = psycopg2.connect(**(self.connectParam))

    def restart(self):
        """
        Try to reopen the connection, giving up silently after 999
        failed attempts.
        """
        ok=1
        while (ok and ok < 1000):
            try:
                self.db = psycopg2.connect(**self.connectParam)
            except psycopg2.Error:
                ok+=1
            else:
                ok=0

    def cursor(self):
        """Return a new cursor inheriting the autocommit flag, if set."""
        curs = Cursor(self.db)
        if hasattr(self,'autocommit') and self.autocommit:
            curs.autocommit = self.autocommit
        return curs

    def commit(self):
        self.db.commit()

    def rollback(self):
        # db is missing when the constructor failed before connecting
        if hasattr(self,'db'):
            self.db.rollback()

    def __del__(self):
        if hasattr(self,'db'):
            self.rollback()
+
class Cursor(kb.Cursor):
    """
    Cursor on a psycopg2 connection returning its rows as dictionaries.
    """

    def __init__(self,db):
        self.db = db
        self.curs = db.cursor()

    def execute(self,query):
        """
        Run C{query}, committing afterwards when the C{autocommit}
        attribute is set and true.

        On a C{psycopg2.ProgrammingError} or C{psycopg2.IntegrityError}
        the query is written to stderr and the error is raised again.

        @return: one C{dict} per row, indexed by column name; an empty
                 list when the query produces no result set.
        """
        try:
            self.curs.execute(query)
            if hasattr(self,'autocommit') and self.autocommit:
                self.db.commit()
        except psycopg2.ProgrammingError:
            sys.stderr.write("===> %s\n" % query)
            raise
        except psycopg2.IntegrityError:
            sys.stderr.write("---> %s\n" % query)
            raise

        # description is None when the statement returned no rows set
        if self.curs.description is None:
            return []

        label = [x[0] for x in self.curs.description]
        return [dict(zip(label,y))
                for y in self.curs.fetchall()]

    # make cursor(query) run this implementation rather than the
    # abstract execute() the base class alias is bound to
    __call__=execute
diff --git a/src/obitools/obischemas/options.py b/src/obitools/obischemas/options.py
new file mode 100644
index 0000000..66f5138
--- /dev/null
+++ b/src/obitools/obischemas/options.py
@@ -0,0 +1,31 @@
def addConnectionOptions(optionManager):
    '''
    Register the options describing the connection to an OBISchema
    database: C{-d/--dbname}, C{-H/--host}, C{-U/--user},
    C{-W/--password} and the C{-A/--autocommit} flag.

    @param optionManager: the option manager to extend
    '''

    optionManager.add_option('-d','--dbname',
                             action="store", dest="dbname",
                             metavar="<DB NAME>",
                             type="string",
                             # the separating space was missing between
                             # the two concatenated literals
                             help="OBISchema database name containing "
                                  "taxonomical data")

    optionManager.add_option('-H','--host',
                             action="store", dest="dbhost",
                             metavar="<DB HOST>",
                             type="string",
                             help="host hosting OBISchema database")

    optionManager.add_option('-U','--user',
                             action="store", dest="dbuser",
                             metavar="<DB USER>",
                             type="string",
                             help="user for OBISchema database connection")

    optionManager.add_option('-W','--password',
                             action="store", dest="dbpassword",
                             metavar="<DB PASSWORD>",
                             type="string",
                             help="password for OBISchema database connection")

    optionManager.add_option('-A','--autocommit',
                             action="store_true",dest="autocommit",
                             default=False,
                             help="add commit action after each query")
\ No newline at end of file
diff --git a/src/obitools/obo/__init__.py b/src/obitools/obo/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/obo/go/__init__.py b/src/obitools/obo/go/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/obo/go/parser.py b/src/obitools/obo/go/parser.py
new file mode 100644
index 0000000..6902974
--- /dev/null
+++ b/src/obitools/obo/go/parser.py
@@ -0,0 +1,53 @@
+from obitools.obo.parser import OBOTerm
+from obitools.obo.parser import OBOEntry
+from obitools.obo.parser import stanzaIterator
+from logging import debug
+
+class GOEntry(OBOEntry):
+ '''
+ An entry of a GeneOntology .obo file. It can be a header (without a stanza name) or
+ a stanza (with a stanza name between brackets). It inherits from the class OBOEntry.
+ '''
+
+
+class GOTerm(OBOTerm):
+
+ '''
+ A stanza named 'Term'. It inherits from the class OBOTerm.
+ '''
+
+ def __init__(self,stanza):
+
+ ## use of the OBOTerm constructor.
+ OBOTerm.__init__(self, stanza)
+ ## on top of the OBOTerm checks, exactly one namespace tag is required.
+ assert 'namespace' in self and len(self['namespace'])==1, "An OBOTerm must belong to one of the cell_component, molecular_function or biological_process namespace"
+
+
+def GOEntryFactory(stanza):
+ '''
+ Dispatcher of stanza.
+
+ @param stanza: a stanza composed of several lines.
+ @type stanza: text
+
+ @return: an C{OBOTerm} | C{OBOEntry} instance
+
+ @note: The dispatcher treats differently the stanza which are OBO "Term"
+ and the others.
+ '''
+
+ stanzaType = OBOEntry.parseStanzaName(stanza)
+
+ if stanzaType=="Term":
+ return GOTerm(stanza)
+ else:
+ return OBOEntry(stanza)
+
+
+def GOEntryIterator(file):
+ entries = stanzaIterator(file)
+ for e in entries:
+ debug(e)
+ yield GOEntryFactory(e)
+
diff --git a/src/obitools/obo/parser.py b/src/obitools/obo/parser.py
new file mode 100644
index 0000000..f6f05f3
--- /dev/null
+++ b/src/obitools/obo/parser.py
@@ -0,0 +1,707 @@
+from obitools.utils import skipWhiteLineIterator,multiLineWrapper
+from obitools.utils import universalOpen
+from obitools.format.genericparser import genericEntryIteratorGenerator
+from logging import debug,warning
+
+import re
+
+
+#################################################################################
+## Stanza preparation area ##
+#################################################################################
+
+
+class FileFormatError(Exception):
+ '''
+ Exception class for OBO file format errors (no code of this module raises it).
+ '''
+ pass
+
+_oboEntryIterator = genericEntryIteratorGenerator(endEntry='^ *$',
+ strip=True)
+
+def stanzaIterator(inputfile):
+ '''
+ Iterator of stanza. The stanza are the basic units of OBO files.
+
+ @param inputfile: a stream of strings from an opened OBO file.
+ @type inputfile: a stream of strings
+
+ @return: a stream of stanza
+ @rtype: a stream of aggregated strings
+
+ @note: The iterator constructs stanza by aggregate strings from the
+ OBO file.
+ '''
+ inputfile = universalOpen(inputfile)
+ inputfile = multiLineWrapper(inputfile)
+ return _oboEntryIterator(inputfile)
+
+
+
+#################################################################################
+## Trailing Modifiers treatment area ##
+#################################################################################
+
+
+class TrailingModifier(dict):
+ '''
+ A class object which inherits from the class dict. Trailing modifiers can be found
+ at the end of TaggedValue objects when they exist.
+ '''
+
+ _match_brace = re.compile('(?<=\ {)[^\]]*(\}) *( !|$)')
+
+ def __init__(self,string):
+
+ ## search for trailing modifiers signals
+ trailing_modifiers = TrailingModifier._match_brace.search(string)
+
+ ## the trailing modifiers exist
+ if trailing_modifiers:
+ trailing_modifiers=trailing_modifiers.group(0).strip()
+ print trailing_modifiers
+ ## creates and feeds the dictionary of trailing modifiers
+ dict.__init__(self,(x.strip().split('=',1) for x in trailing_modifiers.split(',')))
+
+
+def trailingModifierFactory(string):
+ '''
+ Dispatcher of trailing modifiers.
+
+ @param string: a string from a TaggedValue object with a trailing modifiers signal.
+ @type string: string
+
+ @return: a class object
+
+ @note: The dispatcher is currently very simple. Only one case is treated by the function.
+ `the function returns a class object inherited from the class dict if the trailing modifiers
+ exist, None if they don't.
+ '''
+
+ trailing_modifiers = TrailingModifier(string)
+ if not trailing_modifiers:
+ trailing_modifiers=None
+ return trailing_modifiers
+
+
+#################################################################################
+## TaggedValue treatment area ##
+#################################################################################
+
+
+class TaggedValue(object):
+ '''
+ A couple 'tag:value' of an OBOEntry.
+ '''
+
+ _match_value = re.compile('(("(\\\\"|[^\"])*")|(\\\\"|[^\"]))*?( !| {|$)')
+ _split_comment = re.compile('^!| !')
+ _match_quotedString = re.compile('(?<=")(\\\\"|[^\"])*(?=")')
+ _match_bracket = re.compile('\[[^\]]*\]')
+
+ def __init__(self,line):
+ '''
+ Constructor of the class TaggedValue.
+
+ @param line: a line of an OBOEntry composed of a tag and a value.
+ @type line: string
+
+ @note: The constructor separates tags from right terms. 'value' is extracted
+ from right terms using a regular expression (value is at the beginning of the
+ string, between quotes or not). Then, 'comment' is extracted from the rest of the
+ string using another regular expression ('comment' is at the end of the string
+ after a '!'. By default, 'comment' is set to None). Finally, 'trailing_modifiers'
+ are extracted from the last string using another regular expression.
+ The tag, the value, the comment and the trailing_modifiers are saved.
+ '''
+
+ debug("tagValueParser : %s" % line)
+
+ ## by default :
+ trailing_modifiers = None
+ comment = None
+
+ ## the tag is saved. 'right' is composed of the value, the comment and the trailing modifiers
+ tag,rigth = line.split(':',1)
+
+ ## the value is saved
+ value = TaggedValue._match_value.search(rigth).group(0)
+ debug("Extracted value : %s" % value)
+
+ ## if there is a value AND a sign of a comment or trailing modifiers
+ if value and value[-1] in '!{':
+ lvalue = len(value)
+ ## whatever it is a comment or trailing modifiers, it is saved into 'extra'
+ extra = rigth[lvalue-1:].strip()
+ ## a comment is extracted
+ extra =TaggedValue._split_comment.split(extra,1)
+ ## and saved if it exists
+ if len(extra)==2:
+ comment=extra[1].strip()
+ ## trailing modifiers are extracted
+ extra=extra[0]
+ trailing_modifiers = trailingModifierFactory(extra)
+ ## the value is cleaned of any comment or trailing modifiers signals
+ value = value[0:-1]
+
+ if tag=='use_term':
+ tag='consider'
+ raise DeprecationWarning,"user_term is a deprecated tag, you should instead use consider"
+
+ ## recording zone
+ self.value =value.strip()
+ self.tag = tag
+ self.__doc__=comment
+ self.trailing_modifiers=trailing_modifiers
+
+ def __str__(self):
+ return str(self.value)
+
+ def __repr__(self):
+ return '''"""%s"""''' % str(self)
+
+
+class NameValue(TaggedValue):
+ '''
+ A couple 'name:value' inherited from the class TaggedValue. Used to manage name tags.
+ '''
+
+ def __init__(self,line):
+
+ ## no use of the TaggedValue constructor. The NameValue is very simple.
+ tag,rigth = line.split(':',1)
+
+ ## recording zone
+ self.value = rigth.strip()
+ self.tag = 'name'
+ self.__doc__=None
+ self.trailing_modifiers=None
+
+
+
+class DefValue(TaggedValue):
+ '''
+ A couple 'def:value' inherited from the class TaggedValue. Used to manage def tags.
+ '''
+
+ def __init__(self,line):
+ '''
+ Constructor of the class DefValue.
+
+ @param line: a line of an OBOEntry composed of a tag named 'def' and a value.
+ @type line: string
+
+ @note: The constructor calls the TaggedValue constructor. A regular expression
+ is used to extract the 'definition' from TaggedValue.value (definition is the
+ quoted part of TaggedValue.value). A regular expression is used to extract 'dbxrefs'
+ from the TaggedValue.value without the definition (dbxrefs are between brackets
+ and the definition may contain brackets too). Definition is saved as the new
+ value of the DefValue. dbxrefs are saved after a call to xrefFactory.
+ '''
+ ## NOTE(review): both .group(0) calls below fail on None when the value has no quoted text or no bracketed list.
+ ## use of the TaggedValue constructor
+ TaggedValue.__init__(self, line)
+
+ ## definition, which is quoted, is extracted from the standard value of a TaggedValue.
+ definition = TaggedValue._match_quotedString.search(self.value).group(0)
+
+ ## the standard value is cleaned of the definition.
+ cleanvalue = self.value.replace(definition,'')
+ cleanvalue = cleanvalue.replace(' ',' ')
+
+ ## dbxrefs are searched into the rest of the standard value.
+ dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0)
+
+ ## recording zone
+ self.tag = 'def'
+ ## the value of a DefValue is not the standard value but the definition.
+ self.value=definition
+ self.dbxrefs=xrefFactory(dbxrefs)
+
+
+class SynonymValue(TaggedValue):
+ '''
+ A couple 'synonym:value' inherited from the class TaggedValue. Used to manage
+ synonym tags, exact_synonym tags, broad_synonym tags and narrow_synonym tags.
+ '''
+
+ _match_scope = re.compile('(?<="")[^\[]*(?=\[|$)')
+
+ def __init__(self,line):
+ '''
+ Constructor of the class SynonymValue.
+
+ @param line: a line of an OBOEntry composed of a tag named 'synonym' or
+ 'exact_synonym' or 'broad_synonym' or 'narrow_synonym' and a value.
+ @type line: string
+
+ @note: SynonymValue is composed of a tag, a value, a scope, a list of types and
+ dbxrefs.
+ The constructor calls the TaggedValue constructor. A regular expression
+ is used to extract 'definition' from TaggedValue.value (definition is a not
+ quoted TaggedValue.value). Definition is saved as the new value of the class
+ SynonymValue.
+ A regular expression is used to extract 'attributes' from the rest of the
+ string. Attributes may contain an optional synonym scope and an optional list
+ of synonym types. The scope is extracted from attributes or set by default to
+ 'RELATED'. It is saved as the scope of the class. The types are the rest of the
+ attributes and are saved as the list of types of the class.
+ For deprecated tags 'exact_synonym', 'broad_synonym' and 'narrow_synonym', tag
+ is set to 'synonym' and scope is set respectively to 'EXACT', 'BROAD' and 'NARROW'.
+ A regular expression is used to extract 'dbxrefs' from the TaggedValue.value
+ without the definition (dbxrefs are between brackets and definition can be so).
+ dbxrefs are saved.
+ '''
+
+ ## use of the TaggedValue constructor
+ TaggedValue.__init__(self, line)
+
+ ## definition, which is quoted, is extracted from the standard value of a TaggedValue.
+ definition = TaggedValue._match_quotedString.search(self.value).group(0)
+
+ ## the standard value is cleaned of the definition.
+ cleanvalue = self.value.replace(definition,'')
+ cleanvalue = cleanvalue.replace(' ',' ')
+
+ ## 1) attributes are searched into the rest of the standard value.
+ ## 2) then they are stripped.
+ ## 3) then they are split on every ' '.
+ ## 4) finally they are ordered into a set.
+ attributes = set(SynonymValue._match_scope.search(cleanvalue).group(0).strip().split())
+
+ ## the scopes are the junction between the attributes and a set of specific terms.
+ scopes = attributes & set(['RELATED','EXACT','BROAD','NARROW'])
+
+ ## the types are the rest of the attributes.
+ types = attributes - scopes
+
+ ## this is a constraint of the OBO format
+ assert len(scopes)< 2,"Only one synonym scope allowed"
+
+ ## the scope of the SynonymValue is into scopes or set by default to RELATED
+ if scopes:
+ scope = scopes.pop()
+ else:
+ scope = 'RELATED'
+
+ ## Specific rules are defined for the following tags :
+ if self.tag == 'exact_synonym':
+ raise DeprecationWarning,'exact_synonym is a deprecated tag use instead synonym tag'
+ self.tag = 'synonym'
+ scope = 'EXACT'
+
+ if self.tag == 'broad_synonym':
+ raise DeprecationWarning,'broad_synonym is a deprecated tag use instead synonym tag'
+ self.tag = 'synonym'
+ scope = 'BROAD'
+
+ if self.tag == 'narrow_synonym':
+ raise DeprecationWarning,'narrow_synonym is a deprecated tag use instead synonym tag'
+ self.tag = 'synonym'
+ scope = 'NARROW'
+
+ if self.tag == 'systematic_synonym':
+ #raise DeprecationWarning,'narrow_synonym is a deprecated tag use instead sysnonym tag'
+ self.tag = 'synonym'
+ scope = 'SYSTEMATIC'
+
+ ## this is our own constraint. deprecated tags are not saved by this parser.
+ assert self.tag =='synonym',"%s synonym type is not managed" % self.tag
+
+ ## dbxrefs are searched into the rest of the standard value.
+ dbxrefs = TaggedValue._match_bracket.search(cleanvalue).group(0)
+
+ ## recording zone
+ ## the value of a SynonymValue is not the standard value but the definition.
+ self.value = definition
+ self.dbxrefs = xrefFactory(dbxrefs)
+ self.scope = scope
+ self.types = list(types)
+
+ def __eq__(self,b):
+ return ((self.value==b.value) and (self.dbxrefs==b.dbxrefs)
+ and (self.scope==b.scope) and (self.types==b.types)
+ and (self.__doc__==b.__doc__) and (self.tag==b.tag)
+ and (self.trailing_modifiers==b.trailing_modifiers))
+
+ def __hash__(self):
+ return (reduce(lambda x,y:x+y,(hash(z) for z in [self.__doc__,
+ self.value,
+ frozenset(self.dbxrefs),
+ self.scope,
+ frozenset(self.types),
+ self.tag,
+ self.trailing_modifiers]),0)) % (2**31)
+
+
+class XrefValue(TaggedValue):
+ '''
+ A couple 'xref:value' inherited from the class TaggedValue. Used to manage
+ xref tags.
+ '''
+
+ def __init__(self,line):
+
+ ## use of the TaggedValue constructor
+ TaggedValue.__init__(self, line)
+
+ ## use the same function as the dbxrefs
+ self.value=xrefFactory(self.value)
+
+ if self.tag in ('xref_analog','xref_unk'):
+ raise DeprecationWarning,'%s is a deprecated tag use instead sysnonym tag' % self.tag
+ self.tag='xref'
+
+ ## this is our own constraint. deprecated tags are not saved by this parser.
+ assert self.tag=='xref'
+
+
+class RelationshipValue(TaggedValue):
+ '''
+ A couple 'xref:value' inherited from the class TaggedValue. Used to manage
+ xref tags.
+ '''
+
+ def __init__(self,line):
+
+ ## use of the TaggedValue constructor
+ TaggedValue.__init__(self, line)
+
+ ## the value is split on the first ' '.
+ value = self.value.split(None,1)
+
+ ## succesful split !
+ if len(value)==2:
+ relationship=value[0]
+ term=value[1]
+ ## unsuccesful split. The relationship is set by default to IS_A
+ else:
+ relationship='is_a'
+ term=value[0]
+
+ ## recording zone
+ self.value=term
+ self.relationship=relationship
+
+
+class NamespaceValue(TaggedValue): # used by taggedValueFactory for namespace and default-namespace lines
+ def __init__(self,line): # nothing is added to the TaggedValue parsing
+ TaggedValue.__init__(self, line)
+
+class RemarkValue(TaggedValue):
+ def __init__(self,line):
+ TaggedValue.__init__(self, line)
+ label,value = self.value.split(':',1)
+ label = label.strip()
+ value = value.strip()
+ self.value=value
+ self.label=label
+
+
+def taggedValueFactory(line):
+ '''
+ A function used to dispatch lines of an OBOEntry between the class TaggedValue
+ and its inherited classes.
+
+ @param line: a line of an OBOEntry composed of a tag and a value.
+ @type line: string
+
+ @return: a class object
+ '''
+
+ if (line[0:9]=='namespace' or
+ line[0:17]=='default-namespace'):
+ return NamespaceValue(line)
+ ## DefValue is an inherited class of TaggedValue
+ elif line[0:3]=='def':
+ return DefValue(line)
+ ## SynonymValue is an inherited class of TaggedValue
+ elif ((line[0:7]=="synonym" and line[0:14]!="synonymtypedef") or
+ line[0:13]=="exact_synonym" or
+ line[0:13]=="broad_synonym" or
+ line[0:14]=="narrow_synonym"):
+ return SynonymValue(line)
+ ## XrefValue is an inherited class of TaggedValue
+ elif line[0:4]=='xref':
+ return XrefValue(line)
+ ## NameValue is an inherited class of TaggedValue
+ elif line[0:4]=='name':
+ return NameValue(line)
+ ## RelationshipValue is an inherited class of TaggedValue
+ elif (line[0:15]=='intersection_of' or
+ line[0:8] =='union_of' or
+ line[0:12]=='relationship'):
+ return RelationshipValue(line)
+ elif (line[0:6]=='remark'):
+ return RemarkValue(line)
+ ## each line is a couple : tag / value (and some more features)
+ else:
+ return TaggedValue(line)
+
+
+#################################################################################
+## Xref treatment area ##
+#################################################################################
+
+
+
+class Xref(object):
+ '''
+ A xref object of an OBOentry. It may be the 'dbxrefs' of SynonymValue and
+ DefValue objects or the 'value' of XrefValue objects.
+ '''
+
+ __splitdata__ = re.compile(' +(?=["{])')
+
+ def __init__(self,ref):
+ if ref == '' : #
+ ref = None #
+ data = '' #
+ else : # Modifs JJ sinon erreur : list index out of range
+ data = Xref.__splitdata__.split(ref,1) #
+ ref = data[0] #
+ description=None
+ trailing_modifiers = None
+ if len(data)> 1:
+ extra = data[1]
+ description = TaggedValue._match_quotedString.search(extra)
+ if description is not None:
+ description = description.group(0)
+ extra.replace(description,'')
+ trailing_modifiers=trailingModifierFactory(extra)
+ self.reference=ref
+ self.description=description
+ self.trailing_modifiers=trailing_modifiers
+
+ def __eq__(self,b):
+ return ((self.reference==b.reference) and (self.description==b.description)
+ and (self.trailing_modifiers==b.trailing_modifiers))
+
+ def __hash__(self):
+ return (reduce(lambda x,y:x+y,(hash(z) for z in [self.reference,
+ self.description,
+ self.trailing_modifiers]),0)) % (2**31)
+
+
+def xrefFactory(string):
+ '''
+ Dispatcher of xrefs.
+
+ @param string: a string (between brackets) from an inherited TaggedValue object with a dbxrefs
+ signal (actually, the signal can only be found into SynonymValue and DefValue
+ objects) or a string (without brackets) from a XrefValue object.
+ @type string: string
+
+ @return: a class object
+
+ @note: The dispatcher treats differently the strings between brackets (from SynonymValue and
+ DefValue objects) and without brackets (from XrefValue objects).
+ '''
+
+ string = string.strip()
+ if string[0]=='[':
+ return [Xref(x.strip()) for x in string[1:-1].split(',')]
+ else:
+ return Xref(string)
+
+
+#################################################################################
+## Stanza treatment area ##
+#################################################################################
+
+
+class OBOEntry(dict):
+ '''
+ An entry of an OBOFile. It can be a header (without a stanza name) or
+ a stanza (with a stanza name between brackets). It inherits from the class dict:
+ each tag is a key associated to the list of the values parsed for this tag.
+ '''
+ _match_stanza_name = re.compile('(?<=^\[)[^\]]*(?=\])')
+
+ def __init__(self,stanza):
+ ## an entry which does not begin with '[' is the header of the OBO file
+ self.isHeader = stanza[0]!='['
+ lines = stanza.split('\n')
+ ## not the header : the first line is the [stanzaName]
+ if not self.isHeader:
+ self.stanzaName = lines[0].strip()[1:-1]
+ lines=lines[1:]
+ self["stanza"] = [stanza.strip()]
+
+ ## whatever the stanza is.
+ for line in lines:
+ ## each line is a couple : tag / value, parsed by taggedValueFactory
+ taggedvalue = taggedValueFactory(line)
+ if taggedvalue.tag in self:
+ self[taggedvalue.tag].append(taggedvalue)
+ else:
+ self[taggedvalue.tag]=[taggedvalue]
+
+ ## returns the name found between the brackets beginning a stanza, None otherwise.
+ def parseStanzaName(stanza):
+ sm = OBOEntry._match_stanza_name.search(stanza)
+ if sm:
+ return sm.group(0)
+ else:
+ return None
+
+ parseStanzaName=staticmethod(parseStanzaName)
+
+
+
+class OBOTerm(OBOEntry):
+ '''
+ A stanza named 'Term'. It inherits from the class OBOEntry.
+ '''
+ def __init__(self,stanza):
+
+ ## use of the OBOEntry constructor.
+ OBOEntry.__init__(self, stanza)
+
+ assert self.stanzaName=='Term'
+ assert 'stanza' in self
+ assert 'id' in self and len(self['id'])==1,"An OBOTerm must have an id"
+ assert 'name' in self and len(self['name'])==1,"An OBOTerm must have a name"
+ assert 'namespace' not in self or len(self['namespace'])==1, "Only one namespace is allowed for an OBO term"
+
+ assert 'def' not in self or len(self['def'])==1,"Only one definition is allowed for an OBO term"
+ assert 'comment' not in self or len(self['comment'])==1,"Only one comment is allowed for an OBO term"
+
+ assert 'union_of' not in self or len(self['union_of'])>=2,"Only one union relationship is allowed for an OBO term"
+ assert 'intersection_of' not in self or len(self['intersection_of'])>=2,"Only one intersection relationship is allowed for an OBO term"
+
+ if self._isObsolete():
+ #assert 'is_a' not in self
+ assert 'relationship' not in self
+ assert 'inverse_of' not in self
+ assert 'disjoint_from' not in self
+ assert 'union_of' not in self
+ assert 'intersection_of' not in self
+
+ assert 'replaced_by' not in self or self._isObsolete()
+ assert 'consider' not in self or self._isObsolete()
+
+ def _getStanza(self):
+ return self['stanza'][0]
+
+ ## make-up functions.
+ def _getDefinition(self):
+ if 'def' in self:
+ return self['def'][0]
+ return None
+
+ def _getId(self):
+ return self['id'][0]
+
+ def _getNamespace(self):
+ return self['namespace'][0]
+
+ def _getName(self):
+ return self['name'][0]
+
+ def _getComment(self):
+ if 'comment' in self:
+ return self['comment'][0]
+ return None
+
+ def _getAltIds(self):
+ if 'alt_id' in self:
+ return list(set(self.get('alt_id',None)))
+ return None
+
+ def _getIsA(self):
+ if 'is_a' in self:
+ return list(set(self.get('is_a',None)))
+ return None
+
+ def _getSynonym(self):
+ if 'synonym' in self :
+ return list(set(self.get('synonym',None)))
+ return None
+
+ def _getSubset(self):
+ if self.get('subset',None) != None:
+ return list(set(self.get('subset',None)))
+ else:
+ return None
+
+ def _getXref(self):
+ if 'xref' in self:
+ return list(set(self.get('xref',None)))
+ return None
+
+ def _getRelationShip(self):
+ if 'relationship' in self:
+ return list(set(self.get('relationship',None)))
+ return None
+
+ def _getUnion(self):
+ return list(set(self.get('union_of',None)))
+
+ def _getIntersection(self):
+ return list(set(self.get('intersection_of',None)))
+
+ def _getDisjonction(self):
+ return list(set(self.get('disjoint_from',None)))
+
+ def _isObsolete(self):
+ return 'is_obsolete' in self and str(self['is_obsolete'][0])=='true'
+
+ def _getReplacedBy(self):
+ if 'replaced_by' in self:
+ return list(set(self.get('replaced_by',None)))
+ return None
+
+ def _getConsider(self):
+ if 'consider' in self:
+ return list(set(self.get('consider',None)))
+ return None
+
+ ## automatically make-up !
+ stanza = property(_getStanza,None,None)
+ definition = property(_getDefinition,None,None)
+ id = property(_getId,None,None)
+ namespace = property(_getNamespace,None,None)
+ name = property(_getName,None,None)
+ comment = property(_getComment,None,None)
+ alt_ids = property(_getAltIds,None,None)
+ is_a = property(_getIsA,None,None)
+ synonyms = property(_getSynonym,None,None)
+ subsets = property(_getSubset,None,None)
+ xrefs = property(_getXref,None,None)
+ relationship = property(_getRelationShip,None,None)
+ union_of = property(_getUnion,None,None)
+ intersection_of = property(_getIntersection,None,None)
+ disjoint_from = property(_getDisjonction,None,None)
+ is_obsolete = property(_isObsolete,None,None)
+ replaced_by = property(_getReplacedBy,None,None)
+ consider = property(_getConsider,None,None)
+
+
+def OBOEntryFactory(stanza):
+ '''
+ Dispatcher of stanza.
+
+ @param stanza: a stanza composed of several lines.
+ @type stanza: text
+
+ @return: an C{OBOTerm} | C{OBOEntry} instance
+
+ @note: The dispatcher treats differently the stanza which are OBO "Term"
+ and the others.
+ '''
+
+ stanzaType = OBOEntry.parseStanzaName(stanza)
+
+ if stanzaType=="Term":
+ return OBOTerm(stanza)
+ else:
+ return OBOEntry(stanza)
+
+def OBOEntryIterator(file):
+ entries = stanzaIterator(file)
+ for e in entries:
+ debug(e)
+ yield OBOEntryFactory(e)
+
+
\ No newline at end of file
diff --git a/src/obitools/options/__init__.py b/src/obitools/options/__init__.py
new file mode 100644
index 0000000..3590db3
--- /dev/null
+++ b/src/obitools/options/__init__.py
@@ -0,0 +1,101 @@
+"""
+ Module providing high level functions to manage command line options.
+"""
+import logging
+import sys
+
+from logging import debug
+
+from optparse import OptionParser
+from optparse import IndentedHelpFormatter
+
+from obitools.utils import universalOpen
+from obitools.utils import fileSize
+from obitools.utils import universalTell
+from obitools.utils import progressBar
+from obitools.format.options import addInputFormatOption, addInOutputOption,\
+ autoEntriesIterator
+import time
+
+from _options import fileWithProgressBar # @UnresolvedImport
+from _options import currentInputFileName # @UnresolvedImport
+from _options import currentInputFile # @UnresolvedImport
+from _options import currentFileSize # @UnresolvedImport
+from _options import currentFileTell # @UnresolvedImport
+from _options import allEntryIterator # @UnresolvedImport
+
+from obitools.ecopcr.sequence import EcoPCRDBSequenceIterator
+
+class ObiHelpFormatter (IndentedHelpFormatter):
+ def __init__(self,
+ indent_increment=2,
+ max_help_position=24,
+ width=None,
+ short_first=1):
+ IndentedHelpFormatter.__init__(self, indent_increment, max_help_position, width, short_first)
+
+ def format_heading(self, heading):
+ return '\n'.join(("%*s%s" % (self.current_indent, "", '*'*(len(heading)+4)),
+ "%*s* %s *" % (self.current_indent, "", heading),
+ "%*s%s\n" % (self.current_indent, "", '*'*(len(heading)+4))))
+
+
+def getOptionManager(optionDefinitions,entryIterator=None,progdoc=None,checkFormat=False):
+ '''
+ Build an option manager function that is able to parse
+ the command line options of the script.
+
+ @param optionDefinitions: list of functions describing a set of
+ options. Each function must accept as
+ unique parameter an instance of OptionParser.
+ @type optionDefinitions: list of functions.
+ @param progdoc: the usage text given to OptionParser.
+ @param entryIterator: an iterator generator function returning
+ entries from the data files.
+ @type entryIterator: an iterator generator function with only one
+ parameter of type file
+ @return: a function without parameter returning the parsed options and an iterator over the entries
+ '''
+ ## checkFormat is forced to True when a format option set is registered (see the loop below).
+ parser = OptionParser(usage=progdoc, formatter=ObiHelpFormatter())
+ parser.add_option('--DEBUG',
+ action="store_true", dest="debug",
+ default=False,
+ help="Set logging in debug mode")
+
+ parser.add_option('--without-progress-bar',
+ action="store_false", dest="progressbar",
+ default=True,
+ help="desactivate progress bar")
+
+ for f in optionDefinitions:
+ if f == addInputFormatOption or f == addInOutputOption:
+ checkFormat=True
+ f(parser)
+
+ ## the returned closure does the actual parsing, each time it is called.
+ def commandLineAnalyzer():
+ options,files = parser.parse_args()
+ if options.debug:
+ logging.root.setLevel(logging.DEBUG)
+
+ if checkFormat:
+ if not hasattr(options, "skiperror"):
+ options.skiperror=False
+ ei=autoEntriesIterator(options)
+ else:
+ ei=entryIterator
+
+ ## the chosen reader is recorded on the options object.
+ options.readerIterator=ei
+ ## NOTE(review): files[0] raises IndexError when no file is given on the command line.
+ if ei==EcoPCRDBSequenceIterator:
+ options.taxonomy=files[0]
+
+ i = allEntryIterator(files,ei,with_progress=options.progressbar,options=options)
+ return options,i
+
+ return commandLineAnalyzer
+
+
+
\ No newline at end of file
diff --git a/src/obitools/options/_bioseqfilter.pyx b/src/obitools/options/_bioseqfilter.pyx
new file mode 100644
index 0000000..9e36ae6
--- /dev/null
+++ b/src/obitools/options/_bioseqfilter.pyx
@@ -0,0 +1,82 @@
+# cython: profile=True
+
+from obitools.options.taxonomyfilter import taxonomyFilterGenerator
+
+
+def filterGenerator(options):
+ taxfilter = taxonomyFilterGenerator(options)
+ # Build and return a predicate telling whether a sequence passes the filters set in options.
+ if options.idlist is not None:
+ idset=set(x.strip() for x in open(options.idlist))
+ else:
+ idset=None
+ # idset: the stripped lines of the options.idlist file, None when no list is given.
+ # NOTE(review): the file opened above is never closed explicitly.
+ def sequenceFilter(seq):
+ cdef bint good = True
+ # after the first test, each test runs only while 'good' is still true.
+ if hasattr(options, 'sequencePattern'):
+ good = <bint>(options.sequencePattern.search(str(seq)))
+
+ if good and hasattr(options, 'identifierPattern'):
+ good = <bint>(options.identifierPattern.search(seq.id))
+
+ if good and idset is not None:
+ good = seq.id in idset
+
+ if good and hasattr(options, 'definitionPattern'):
+ good = <bint>(options.definitionPattern.search(seq.definition))
+ # every key of options.has_attribute must be present in the sequence.
+ if good :
+ good = reduce(lambda bint x, bint y:x and y,
+ (k in seq for k in options.has_attribute),
+ True)
+ # every attribute pattern must match, and its attribute must be present.
+ if good and hasattr(options, 'attributePatterns'):
+ good = (reduce(lambda bint x, bint y : x and y,
+ (<bint>(options.attributePatterns[p].search(str(seq[p])))
+ for p in options.attributePatterns
+ if p in seq),True)
+ and
+ reduce(lambda bint x, bint y : x and y,
+ (bool(p in seq)
+ for p in options.attributePatterns),True)
+ )
+
+ if good and hasattr(options, 'predicats') and options.predicats is not None:
+ if options.taxonomy is not None:
+ e = {'taxonomy' : options.taxonomy,'sequence':seq}
+ else:
+ e = {'sequence':seq}
+ # NOTE(review): the predicates are run with eval(); they must come from a trusted source.
+ good = (reduce(lambda bint x, bint y: x and y,
+ (bool(eval(p,e,seq))
+ for p in options.predicats),True)
+ )
+
+ if good and hasattr(options, 'lmin') and options.lmin is not None:
+ good = len(seq) >= options.lmin
+
+ if good and hasattr(options, 'lmax') and options.lmax is not None:
+ good = len(seq) <= options.lmax
+
+ if good:
+ good = taxfilter(seq)
+ # the answer is negated when options.invertedFilter is set.
+ if hasattr(options, 'invertedFilter') and options.invertedFilter:
+ good=not good
+
+
+ return good
+
+ return sequenceFilter
+
+def sequenceFilterIteratorGenerator(options):
+    '''Return a function yielding only the sequences accepted by filterGenerator(options).'''
+    accept = filterGenerator(options)
+    def sequenceFilterIterator(seqIterator):
+        for candidate in seqIterator:
+            if accept(candidate):
+                yield candidate
+
+    return sequenceFilterIterator
diff --git a/src/obitools/options/_options.pyx b/src/obitools/options/_options.pyx
new file mode 100644
index 0000000..c972215
--- /dev/null
+++ b/src/obitools/options/_options.pyx
@@ -0,0 +1,124 @@
+# cython: profile=True
+
+from obitools.utils._utils cimport progressBar
+
+from obitools.utils import universalOpen
+from obitools.utils import universalTell
+from obitools.utils import fileSize
+from obitools.ecopcr.sequence import EcoPCRDBSequenceIterator
+from glob import glob
+from logging import debug
+import sys
+
+cdef extern from "stdio.h":
+ ctypedef unsigned int off_t "unsigned long long"
+
+
+cdef class CurrentFileStatus:
+ cdef public bytes currentInputFileName # name of the input file (set by allEntryIterator)
+ cdef public object currentFile # the opened input file
+ cdef public off_t currentFileSize # value of fileSize() for currentFile
+ # Holder of the state of the input file being processed; every field starts unset.
+ def __init__(self):
+ self.currentInputFileName=None
+ self.currentFile = None
+ self.currentFileSize = -1 # NOTE(review): off_t is declared unsigned above - confirm -1 works as the 'unknown' marker
+
+cfs=CurrentFileStatus()
+
+cpdef bytes currentInputFileName(): # last input file name recorded in the module level cfs
+ return cfs.currentInputFileName
+
+cpdef object currentInputFile(): # last opened input file recorded in the module level cfs
+ return cfs.currentFile
+
+cpdef off_t currentFileSize(): # size recorded in cfs for the current input file
+ return cfs.currentFileSize
+
+cpdef off_t currentFileTell(): # result of universalTell() on the current input file
+ return universalTell(cfs.currentFile)
+
+def fileWithProgressBar(file, int step=100):
+ # Wrap file in a generator calling progressBar before each line is yielded ('step' is not used here).
+ cdef off_t size
+ cdef off_t pos
+
+ size = cfs.currentFileSize
+
+ def fileBar():
+ # the first progressBar call is made with its third argument set to True
+ cdef str l
+
+ pos=1
+ progressBar(pos, size, True,cfs.currentInputFileName)
+ for l in file:
+ progressBar(currentFileTell,size, False, # NOTE(review): the function itself is passed, not its result - confirm progressBar accepts it
+ cfs.currentInputFileName)
+ yield l
+ print >>sys.stderr,''
+ # a negative size returns the file unchanged, without progress bar
+ if size < 0:
+ return file
+ else:
+ f = fileBar()
+ return f
+
+
+def allEntryIterator(files,entryIterator,with_progress=False,histo_step=102,options=None):
+ # Yield the entries of every file of 'files', or of the standard input when 'files' is empty.
+ if files :
+ for f in files:
+ if (entryIterator != EcoPCRDBSequenceIterator) :
+ # when the file cannot be opened and '<name>_NNN.sdx' files exist, the ecoPCR database reader is used
+ cfs.currentInputFileName=f
+ try:
+ f = universalOpen(f,noError=True)
+ except Exception as e:
+ if glob('%s_[0-9][0-9][0-9].sdx' % f):
+ entryIterator=EcoPCRDBSequenceIterator
+ else:
+ print >>sys.stderr, e
+ sys.exit();
+ else:
+ cfs.currentFile=f
+ cfs.currentFileSize=fileSize(cfs.currentFile)
+ debug(f)
+
+ if with_progress and cfs.currentFileSize >0:
+ f=fileWithProgressBar(f,step=histo_step)
+ # without entry iterator the raw lines are yielded
+ if entryIterator is None:
+ for line in f:
+ yield line
+ else:
+ # for an ecoPCR database, options.taxonomy and options.ecodb record what was loaded
+ if entryIterator == EcoPCRDBSequenceIterator and options is not None:
+ if hasattr(options,'ecodb') and options.ecodb==f:
+ iterator = entryIterator(f,options.taxonomy)
+ else:
+ iterator = entryIterator(f)
+ options.taxonomy=iterator.taxonomy
+ options.ecodb=f
+ else:
+ iterator = entryIterator(f)
+ for entry in iterator:
+ yield entry
+
+
+ else:
+ if entryIterator is None:
+ for line in sys.stdin:
+ yield line
+ else:
+ import os, stat
+ # the standard input is read only when it is a pipe or a regular file
+ mode = os.fstat(0).st_mode
+ if stat.S_ISFIFO(mode):
+ pass
+ elif stat.S_ISREG(mode):
+ pass
+ else:
+ print>>sys.stderr, "No Entry to process"
+ sys.exit()
+ for entry in entryIterator(sys.stdin):
+ yield entry
diff --git a/src/obitools/options/bioseqcutter.py b/src/obitools/options/bioseqcutter.py
new file mode 100644
index 0000000..349a019
--- /dev/null
+++ b/src/obitools/options/bioseqcutter.py
@@ -0,0 +1,87 @@
+from logging import debug
+
+def _beginOptionCallback(options,opt,value,parser):
+    '''
+    optparse callback of the -b/--begin option.
+
+    Installs on parser.values a function named beginCutPosition
+    evaluating the python expression given on the command line
+    for one sequence.
+
+    @param options: first positional argument given by optparse to
+                    a callback, i.e. the Option instance (not the
+                    parsed values)
+    @param opt: the option string seen on the command line
+    @param value: the python expression to evaluate
+    @type value: str
+    @param parser: the OptionParser instance
+    '''
+    def beginCutPosition(seq):
+        '''
+        @return: the value of the expression minus one
+        '''
+        debug("begin = %s" % value )
+
+        # The taxonomy is stored on the parsed values. It is looked up
+        # lazily, at evaluation time, because it can be loaded after
+        # the command line has been parsed. The historical lookup on
+        # the first callback argument is kept as a fallback.
+        taxonomy = getattr(parser.values, 'taxonomy', None)
+        if taxonomy is None:
+            taxonomy = getattr(options, 'taxonomy', None)
+
+        if taxonomy is not None:
+            environ = {'taxonomy' : taxonomy,'sequence':seq}
+        else:
+            environ = {'sequence':seq}
+
+        # The sequence is used as local namespace, so its attributes
+        # can be used as variable names in the expression.
+        return eval(value,environ,seq) - 1
+
+    parser.values.beginCutPosition=beginCutPosition
+
+def _endOptionCallback(options,opt,value,parser):
+    '''
+    optparse callback of the -e/--end option.
+
+    Installs on parser.values a function named endCutPosition
+    evaluating the python expression given on the command line
+    for one sequence.
+
+    @param options: first positional argument given by optparse to
+                    a callback, i.e. the Option instance (not the
+                    parsed values)
+    @param opt: the option string seen on the command line
+    @param value: the python expression to evaluate
+    @type value: str
+    @param parser: the OptionParser instance
+    '''
+    def endCutPosition(seq):
+        '''
+        @return: the value of the expression
+        '''
+        # The taxonomy is stored on the parsed values. It is looked up
+        # lazily, at evaluation time, because it can be loaded after
+        # the command line has been parsed. The historical lookup on
+        # the first callback argument is kept as a fallback.
+        taxonomy = getattr(parser.values, 'taxonomy', None)
+        if taxonomy is None:
+            taxonomy = getattr(options, 'taxonomy', None)
+
+        if taxonomy is not None:
+            environ = {'taxonomy' : taxonomy,'sequence':seq}
+        else:
+            environ = {'sequence':seq}
+
+        # The sequence is used as local namespace, so its attributes
+        # can be used as variable names in the expression.
+        return eval(value,environ,seq)
+
+    parser.values.endCutPosition=endCutPosition
+
+
+
+
+def addSequenceCuttingOptions(optionManager):
+    '''
+    Declare the 'Cutting options' group (-b/--begin and -e/--end)
+    on the option manager.
+
+    @param optionManager: the option parser to complete
+    '''
+
+    group = optionManager.add_option_group('Cutting options')
+
+    group.add_option('-b','--begin',
+                     action="callback", callback=_beginOptionCallback,
+                     metavar="<PYTHON_EXPRESSION>",
+                     type="string",
+                     help="python expression to be evaluated in the "
+                          "sequence context. The attribute name can be "
+                          "used in the expression as variable name. "
+                          "An extra variable named 'sequence' refers "
+                          "to the sequence object itself. ")
+
+    # Same wording as --begin: the separating spaces were missing
+    # between the concatenated string fragments.
+    group.add_option('-e','--end',
+                     action="callback", callback=_endOptionCallback,
+                     metavar="<PYTHON_EXPRESSION>",
+                     type="string",
+                     help="python expression to be evaluated in the "
+                          "sequence context. The attribute name can be "
+                          "used in the expression as variable name. "
+                          "An extra variable named 'sequence' refers "
+                          "to the sequence object itself. ")
+
+
+
+
+def cutterGenerator(options):
+
+ def sequenceCutter(seq):
+
+ lseq = len(seq)
+
+ if hasattr(options, 'beginCutPosition'):
+ begin = int(options.beginCutPosition(seq))
+ else:
+ begin = 0
+
+ if hasattr(options, 'endCutPosition'):
+ end = int(options.endCutPosition(seq))
+ else:
+ end = lseq
+
+ if begin > 0 or end < lseq:
+ seq = seq[begin:end]
+ seq['subsequence']="%d..%d" % (begin+1,end)
+
+ return seq
+
+ return sequenceCutter
+
+def cutterIteratorGenerator(options):
+ _cutter = cutterGenerator(options)
+
+ def sequenceCutterIterator(seqIterator):
+ for seq in seqIterator:
+ yield _cutter(seq)
+
+ return sequenceCutterIterator
+
+
diff --git a/src/obitools/options/bioseqedittag.py b/src/obitools/options/bioseqedittag.py
new file mode 100644
index 0000000..50f0e0a
--- /dev/null
+++ b/src/obitools/options/bioseqedittag.py
@@ -0,0 +1,317 @@
+import sys
+from obitools.options.taxonomyfilter import loadTaxonomyDatabase
+import math
+
+def addSequenceEditTagOptions(optionManager):
+
+ group = optionManager.add_option_group('Sequences and attributes editing options')
+
+ group.add_option('--seq-rank',
+ action="store_true", dest='addrank',
+ default=False,
+ help="add a rank attribute to the sequence "
+ "indicating the sequence position in the input data")
+
+ group.add_option('-R','--rename-tag',
+ action="append",
+ dest='renameTags',
+ metavar="<OLD_NAME:NEW_NAME>",
+ type="string",
+ default=[],
+ help="change tag name from OLD_NAME to NEW_NAME")
+
+ group.add_option('--delete-tag',
+ action="append",
+ dest='deleteTags',
+ metavar="<TAG_NAME>",
+ type="string",
+ default=[],
+ help="delete tag TAG_NAME")
+
+ group.add_option('-S','--set-tag',
+ action="append",
+ dest='setTags',
+ metavar="<TAG_NAME:PYTHON_EXPRESSION>",
+ type="string",
+ default=[],
+ help="Add a new tag named TAG_NAME with "
+ "a value computed from PYTHON_EXPRESSION")
+
+ group.add_option('--tag-list',
+ action="store",
+ dest='taglist',
+ metavar="<FILENAME>",
+ type="string",
+ default=None,
+ help="Indicate a file containing tag and values "
+ "to modify on specified sequences")
+
+ group.add_option('--set-identifier',
+ action="store",
+ dest='setIdentifier',
+ metavar="<PYTHON_EXPRESSION>",
+ type="string",
+ default=None,
+ help="Set sequence identifier with "
+ "a value computed from PYTHON_EXPRESSION")
+
+ group.add_option('--run',
+ action="store",
+ dest='run',
+ metavar="<PYTHON_EXPRESSION>",
+ type="string",
+ default=None,
+ help="Run a python expression on each selected sequence")
+
+ group.add_option('--set-sequence',
+ action="store",
+ dest='setSequence',
+ metavar="<PYTHON_EXPRESSION>",
+ type="string",
+ default=None,
+ help="Change the sequence itself with "
+ "a value computed from PYTHON_EXPRESSION")
+
+ group.add_option('-T','--set-definition',
+ action="store",
+ dest='setDefinition',
+ metavar="<PYTHON_EXPRESSION>",
+ type="string",
+ default=None,
+ help="Set sequence definition with "
+ "a value computed from PYTHON_EXPRESSION")
+
+ group.add_option('-O','--only-valid-python',
+ action="store_true",
+ dest='onlyValid',
+ default=False,
+ help="only valid python expressions are allowed")
+
+ group.add_option('-C','--clear',
+ action="store_true",
+ dest='clear',
+ default=False,
+ help="clear all tags associated to the sequences")
+
+ group.add_option('-k','--keep',
+ action='append',
+ dest='keep',
+ default=[],
+ type="string",
+ help="only keep this tag")
+
+ group.add_option('--length',
+ action="store_true",
+ dest='length',
+ default=False,
+ help="add seqLength tag with sequence length")
+
+ group.add_option('--with-taxon-at-rank',
+ action='append',
+ dest='taxonrank',
+ default=[],
+ type="string",
+ help="add taxonomy annotation at a specified rank level")
+
+ group.add_option('-m','--mcl',
+ action="store", dest="mcl",
+ metavar="<mclfile>",
+ type="string",
+ default=None,
+ help="add cluster tag to sequences according to a mcl graph clustering partition")
+
+ group.add_option('--uniq-id',
+ action="store_true", dest="uniqids",
+ default=False,
+ help="force sequence ids to be uniq")
+
+
+def readMCLFile(file):
+    '''
+    Read a partition where each line lists the members of one cluster.
+
+    @param file: an iterable over the lines of the partition
+
+    @return: a dictionary mapping each identifier to the number of
+             the line declaring it (the first line is numbered 1)
+    @rtype: dict
+    '''
+    clusters = {}
+    for number,line in enumerate(file,1):
+        for ident in line.split():
+            clusters[ident]=number
+    return clusters
+
+def readTagFile(f):
+    '''
+    Read a tag file. Each non blank line is made of three fields
+    separated by blanks: a sequence identifier, a tag name and a
+    value (the value can itself contain blanks).
+
+    @param f: an iterable over the lines of the tag file
+
+    @return: a dictionary mapping each identifier to the list of
+             its (tag,value) pairs, in the file order
+    @rtype: dict
+
+    @raise ValueError: if a non blank line has less than three fields
+    '''
+    tags = {}
+
+    for l in f:
+        # A blank line (typically the last one of the file) carries
+        # no tag and must not break the unpacking below.
+        if not l.strip():
+            continue
+
+        ident,tag,value = l.split(None,2)
+        value=value.strip()
+        try:
+            # WARNING: the value is evaluated as a python expression,
+            # the tag file must come from a trusted source.
+            value = eval(value)
+        except Exception:
+            # Not a python expression: the raw string is kept.
+            pass
+        tags.setdefault(ident,[]).append((tag,value))
+
+    return tags
+
+
+
+
+def sequenceTaggerGenerator(options):
+ toDelete = options.deleteTags[:]
+ toRename = [x.split(':',1) for x in options.renameTags if len(x.split(':',1))==2]
+ toSet = [x.split(':',1) for x in options.setTags if len(x.split(':',1))==2]
+ newId = options.setIdentifier
+ newDef = options.setDefinition
+ newSeq = options.setSequence
+ clear = options.clear
+ keep = set(options.keep)
+ length = options.length
+ run = options.run
+ uniqids = options.uniqids
+ counter = [0]
+ loadTaxonomyDatabase(options)
+ if options.taxonomy is not None:
+ annoteRank=options.taxonrank
+ else:
+ annoteRank=[]
+
+ if options.mcl is not None:
+ parts = readMCLFile(open(options.mcl))
+ else:
+ parts = False
+
+ if options.taglist is not None:
+ tags = readTagFile(open(options.taglist))
+ else:
+ tags = False
+
+ if uniqids:
+ idlist = {}
+
+ def sequenceTagger(seq):
+
+ if counter[0]>=0:
+ counter[0]+=1
+
+ if clear or keep:
+ ks = seq.keys()
+ for k in ks:
+ if k not in keep:
+ del seq[k]
+ else:
+ for i in toDelete:
+ if i in seq:
+ del seq[i]
+ for o,n in toRename:
+ if o in seq:
+ seq[n]=seq[o]
+ del seq[o]
+
+ for rank in annoteRank:
+ if 'taxid' in seq:
+ taxid = seq['taxid']
+ if taxid is not None:
+ rtaxid = options.taxonomy.getTaxonAtRank(taxid,rank)
+ if rtaxid is not None:
+ scn = options.taxonomy.getScientificName(rtaxid)
+ else:
+ scn=None
+ seq[rank]=rtaxid
+ seq["%s_name"%rank]=scn
+
+ if parts and seq.id in parts:
+ seq['cluster']=parts[seq.id]
+
+ if tags and seq.id in tags:
+ for t,v in tags[seq.id]:
+ seq[t]=v
+
+ if options.addrank:
+ seq['seq_rank']=counter[0]
+
+ for i,v in toSet:
+ try:
+ if options.taxonomy is not None:
+ environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0], 'math':math}
+ else:
+ environ = {'sequence':seq, 'counter':counter[0], 'math':math}
+
+ val = eval(v,environ,seq)
+ except Exception,e:
+ if options.onlyValid:
+ raise e
+ val = v
+ seq[i]=val
+
+ if length:
+ seq['seq_length']=len(seq)
+
+ if newId is not None:
+ try:
+ if options.taxonomy is not None:
+ environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0], 'math':math}
+ else:
+ environ = {'sequence':seq, 'counter':counter[0], 'math':math}
+
+ val = eval(newId,environ,seq)
+ except Exception,e:
+ if options.onlyValid:
+ raise e
+ val = newId
+ seq.id=val
+ if newDef is not None:
+ try:
+ if options.taxonomy is not None:
+ environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0], 'math':math}
+ else:
+ environ = {'sequence':seq, 'counter':counter[0], 'math':math}
+
+ val = eval(newDef,environ,seq)
+ except Exception,e:
+ if options.onlyValid:
+ raise e
+ val = newDef
+ seq.definition=val
+
+ if newSeq is not None:
+ try:
+ if options.taxonomy is not None:
+ environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0], 'math':math}
+ else:
+ environ = {'sequence':seq, 'counter':counter[0], 'math':math}
+
+ val = eval(newSeq,environ,seq)
+ except Exception,e:
+ if options.onlyValid:
+ raise e
+ val = newSeq
+ if hasattr(seq, '_seq'):
+ seq._seq=str(val).lower()
+ if 'seq_length' in seq:
+ seq['seq_length']=len(seq)
+
+ if run is not None:
+ try:
+ if options.taxonomy is not None:
+ environ = {'taxonomy' : options.taxonomy,'sequence':seq, 'counter':counter[0], 'math':math}
+ else:
+ environ = {'sequence':seq, 'counter':counter[0], 'math':math}
+
+ val = eval(run,environ,seq)
+ except Exception,e:
+ if options.onlyValid:
+ raise e
+
+ if uniqids:
+ n = idlist.get(seq.id,0)
+ if (n > 0):
+ newid = seq.id
+ while (n > 0):
+ old = newid
+ newid = "%s.%d" % (old,n)
+ n = idlist.get(newid,0)
+ idlist[old]+=1
+ seq.id=newid
+ idlist[seq.id]=1
+
+
+
+ return seq
+
+ return sequenceTagger
\ No newline at end of file
diff --git a/src/obitools/options/bioseqfilter.py b/src/obitools/options/bioseqfilter.py
new file mode 100644
index 0000000..5dd4491
--- /dev/null
+++ b/src/obitools/options/bioseqfilter.py
@@ -0,0 +1,121 @@
+import re
+
+from obitools.options.taxonomyfilter import addTaxonomyFilterOptions
+
+from _bioseqfilter import filterGenerator,sequenceFilterIteratorGenerator
+
+def _sequenceOptionCallback(options,opt,value,parser):
+ parser.values.sequencePattern = re.compile(value,re.I)
+
+def _defintionOptionCallback(options,opt,value,parser):
+ parser.values.definitionPattern = re.compile(value)
+
+def _identifierOptionCallback(options,opt,value,parser):
+ parser.values.identifierPattern = re.compile(value)
+
+def _attributeOptionCallback(options,opt,value,parser):
+    '''
+    optparse callback of the -a/--attribute option.
+
+    Each occurrence of the option adds one compiled pattern to the
+    parser.values.attributePatterns dictionary.
+
+    @param options: first positional argument given by optparse to
+                    a callback, i.e. the Option instance (not the
+                    parsed values)
+    @param opt: the option string seen on the command line
+    @param value: a string formatted as attribute_name:regular_pattern
+    @type value: str
+    @param parser: the OptionParser instance
+    '''
+    # The dictionary lives on the parsed values. Testing the Option
+    # instance instead recreated it for each -a, losing all the
+    # previously declared patterns.
+    if getattr(parser.values, 'attributePatterns', None) is None:
+        parser.values.attributePatterns={}
+    attribute,pattern=value.split(':',1)
+    parser.values.attributePatterns[attribute]=re.compile(pattern)
+
+
+def _predicatOptionCallback(options,opt,value,parser):
+    '''
+    optparse callback accumulating predicate expressions in the
+    parser.values.predicats list.
+
+    @param options: first positional argument given by optparse to
+                    a callback, i.e. the Option instance (not the
+                    parsed values)
+    @param opt: the option string seen on the command line
+    @param value: the python expression of the predicate
+    @type value: str
+    @param parser: the OptionParser instance
+    '''
+    # The list must be created on the object it is appended to:
+    # the parsed values, not the Option instance.
+    if getattr(parser.values, 'predicats', None) is None:
+        parser.values.predicats=[]
+    parser.values.predicats.append(value)
+
+
+
+def addSequenceFilteringOptions(optionManager):
+    '''
+    Declare the 'Filtering options' group on the option manager,
+    then the taxonomy filtering options.
+
+    @param optionManager: the option parser to complete
+    '''
+
+    group = optionManager.add_option_group('Filtering options')
+
+    group.add_option('-s','--sequence',
+                     action="callback", callback=_sequenceOptionCallback,
+                     metavar="<REGULAR_PATTERN>",
+                     type="string",
+                     help="regular expression pattern used to select "
+                          "the sequence. The pattern is case insensitive")
+
+    group.add_option('-D','--definition',
+                     action="callback", callback=_defintionOptionCallback,
+                     type="string",
+                     metavar="<REGULAR_PATTERN>",
+                     help="regular expression pattern matched against "
+                          "the definition of the sequence. "
+                          "The pattern is case sensitive")
+
+    group.add_option('-I','--identifier',
+                     action="callback", callback=_identifierOptionCallback,
+                     type="string",
+                     metavar="<REGULAR_PATTERN>",
+                     help="regular expression pattern matched against "
+                          "the identifier of the sequence. "
+                          "The pattern is case sensitive")
+
+    group.add_option('--id-list',
+                     action="store", dest="idlist",
+                     metavar="<FILENAME>",
+                     type="string",
+                     default=None,
+                     help="file containing identifiers of sequences to select")
+
+    # A space was missing between "case sensitive." and "Several".
+    group.add_option('-a','--attribute',
+                     action="callback", callback=_attributeOptionCallback,
+                     type="string",
+                     metavar="<ATTRIBUTE_NAME>:<REGULAR_PATTERN>",
+                     help="regular expression pattern matched against "
+                          "the attributes of the sequence. "
+                          "the value of this atribute is of the form : "
+                          "attribute_name:regular_pattern. "
+                          "The pattern is case sensitive. "
+                          "Several -a option can be used on the same "
+                          "commande line.")
+
+    group.add_option('-A','--has-attribute',
+                     action="append",
+                     type="string",
+                     dest="has_attribute",
+                     default=[],
+                     metavar="<ATTRIBUTE_NAME>",
+                     help="select sequence with attribute <ATTRIBUTE_NAME> "
+                          "defined")
+
+    # The separating spaces were missing between the concatenated
+    # fragments ("name .An" and "refersto").
+    group.add_option('-p','--predicat',
+                     action="append", dest="predicats",
+                     metavar="<PYTHON_EXPRESSION>",
+                     help="python boolean expression to be evaluated in the "
+                          "sequence context. The attribute name can be "
+                          "used in the expression as variable name. "
+                          "An extra variable named 'sequence' refers "
+                          "to the sequence object itself. "
+                          "Several -p option can be used on the same "
+                          "commande line.")
+
+    group.add_option('-L','--lmax',
+                     action='store',
+                     metavar="<##>",
+                     type="int",dest="lmax",
+                     help="keep sequences shorter than lmax")
+
+    group.add_option('-l','--lmin',
+                     action='store',
+                     metavar="<##>",
+                     type="int",dest="lmin",
+                     help="keep sequences longer than lmin")
+
+    group.add_option('-v','--inverse-match',
+                     action='store_true',
+                     default=False,
+                     dest="invertedFilter",
+                     help="revert the sequence selection "
+                          "[default : %default]")
+
+    addTaxonomyFilterOptions(optionManager)
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/obitools/options/taxonomyfilter.py b/src/obitools/options/taxonomyfilter.py
new file mode 100644
index 0000000..5526c79
--- /dev/null
+++ b/src/obitools/options/taxonomyfilter.py
@@ -0,0 +1,6 @@
+from obitools.ecopcr.options import addTaxonomyDBOptions, \
+ addTaxonomyFilterOptions, \
+ loadTaxonomyDatabase, \
+ taxonomyFilterGenerator, \
+ taxonomyFilterIteratorGenerator
+
diff --git a/src/obitools/parallel/__init__.py b/src/obitools/parallel/__init__.py
new file mode 100644
index 0000000..2aa1b07
--- /dev/null
+++ b/src/obitools/parallel/__init__.py
@@ -0,0 +1,99 @@
+import threading
+
+class TaskPool(object):
+
+ def __init__(self,iterable,function,count=2):
+ self.pool = []
+ self.queue= []
+ self.plock= threading.Lock()
+ self.qlock= threading.Lock()
+ self.function=function
+ self.event=threading.Event()
+ self.iterable=iterable
+ for i in xrange(count):
+ Task(self)
+
+ def register(self,task):
+ self.plock.acquire()
+ self.pool.append(task)
+ self.plock.release()
+ self.ready(task)
+
+ def unregister(self,task):
+ task.thread.join()
+ self.plock.acquire()
+ self.pool.remove(task)
+ self.plock.release()
+
+
+ def ready(self,task):
+ self.qlock.acquire()
+ self.queue.append(task)
+ self.qlock.release()
+ self.event.set()
+
+ def __iter__(self):
+ for data in self.iterable:
+ while not self.queue:
+ self.event.wait()
+ self.event.clear()
+ self.qlock.acquire()
+ task=self.queue.pop(0)
+ self.qlock.release()
+ if hasattr(task, 'rep'):
+ yield task.rep
+ #print "send ",data
+ if isinstance(data,dict):
+ task.submit(**data)
+ else:
+ task.submit(*data)
+
+ while self.pool:
+ self.pool[0].finish()
+ while self.queue:
+ self.event.clear()
+ self.qlock.acquire()
+ task=self.queue.pop(0)
+ self.qlock.release()
+ if hasattr(task, 'rep'):
+ yield task.rep
+
+
+
+
+
+class Task(object):
+ def __init__(self,pool):
+ self.pool = pool
+ self.lock = threading.Lock()
+ self.dataOk = threading.Event()
+ self.repOk = threading.Event()
+ self.args = None
+ self.kwargs=None
+ self.stop=False
+ self.thread = threading.Thread(target=self)
+ self.thread.start()
+ self.pool.register(self)
+
+ def __call__(self):
+ self.dataOk.wait()
+ while(not self.stop):
+ self.lock.acquire()
+ self.dataOk.clear()
+ self.rep=self.pool.function(*self.args,**self.kwargs)
+ self.pool.ready(self)
+ self.lock.release()
+ self.dataOk.wait()
+
+ def submit(self,*args,**kwargs):
+ self.args=args
+ self.kwargs=kwargs
+ self.dataOk.set()
+
+ def finish(self):
+ self.lock.acquire()
+ self.stop=True
+ self.dataOk.set()
+ self.pool.unregister(self)
+
+
diff --git a/src/obitools/parallel/jobqueue.py b/src/obitools/parallel/jobqueue.py
new file mode 100644
index 0000000..9df4804
--- /dev/null
+++ b/src/obitools/parallel/jobqueue.py
@@ -0,0 +1,183 @@
+import threading
+from logging import warning,info
+from time import sleep,time
+
+from obitools.parallel import TaskPool
+
+
+class JobPool(dict):
+    '''
+    JobPool is dedicated to manage a job queue. These jobs
+    will run in a limited number of thread.
+
+    The pool is a dictionary indexing the submitted jobs
+    by their job identifier.
+    '''
+
+    def __init__(self,count,precision=0.01):
+        '''
+        Build the pool and start the thread consuming the job results.
+
+        @param count: number of thread dedicated to this JobPool
+        @type count: int
+        @param precision: delay between two check for new job (in second)
+        @type precision: float
+        '''
+        self._iterator = JobIterator(self)
+        self._taskPool = TaskPool(self._iterator,
+                                  self._runJob,
+                                  count)
+        self._precision=precision
+        self._toRun=set()
+        self._runnerThread = threading.Thread(target=self._runner)
+        self._runnerThread.start()
+        self._finalyzed=False
+
+    def _runner(self):
+        '''
+        Body of the runner thread: iterate over the task pool and
+        log each finished job.
+        '''
+        for rep in self._taskPool:
+            info('Job %d finnished' % id(rep))
+        info('All jobs in %d JobPool finished' % id(self))
+
+    def _jobIterator(self):
+        '''
+        @return: the iterator feeding the task pool with jobs
+        @rtype: JobIterator instance
+        '''
+        return self._iterator
+
+    def _runJob(self,job):
+        '''
+        Run a job and record on it its start time, result and end time.
+
+        @param job: the job to run
+        @type job: Job instance
+
+        @return: the job itself
+        '''
+        job.started= time()
+        info('Job %d started' % id(job))
+        job.result = job()
+        job.ended = time()
+        job.finished=True
+        return job
+
+    def submit(self,job,priority=1.0,userid=None):
+        '''
+        Submit a new job to the JobPool.
+
+        @param job: the new submited job
+        @type job: Job instance
+        @param priority: priority level of this job (higher is better)
+        @type priority: float
+        @param userid: a user identifier (Default is None)
+
+        @return: job identifier
+        @rtype: int
+        '''
+
+        assert not self._finalyzed,\
+               "This jobPool does not accept new job"
+        if job.submitted is not None:
+            warning('Job %d was already submitted' % id(job))
+            return id(job)
+
+        job.submitted = time()
+        job.priority = priority
+        job.userid = userid
+        i=id(job)
+        # Store the computed identifier, not the id builtin function.
+        job.id=i
+        self[i]=job
+        self._toRun.add(job)
+
+        info('Job %d submitted' % i)
+
+        return i
+
+    def finalyze(self):
+        '''
+        Indicate to the JobPool, that no new jobs will
+        be submitted.
+        '''
+        self._iterator.finalyze()
+        self._finalyzed=True
+
+    def __del__(self):
+        self.finalyze()
+
+
+
+
+class JobIterator(object):
+ def __init__(self,pool):
+ self._pool = pool
+ self._finalyze=False
+ self._nextLock=threading.Lock()
+
+
+ def __iter__(self):
+ return self
+
+ def finalyze(self):
+ '''
+ Indicate to the JobIterator, that no new jobs will
+ be submitted.
+ '''
+ self._finalyze=True
+
+
+ def next(self):
+ '''
+
+ @return: the next job to run
+ @rtype: Job instance
+ '''
+ self._nextLock.acquire()
+ while self._pool._toRun or not self._finalyze:
+ rep = None
+ maxScore=0
+ for k in self._pool._toRun:
+ s = k.runScore()
+ if s > maxScore:
+ maxScore=s
+ rep=k
+ if rep is not None:
+ self._pool._toRun.remove(rep)
+ self._nextLock.release()
+ return (rep,)
+ sleep(self._pool._precision)
+ self._nextLock.release()
+ info('No more jobs in %d JobPool' % id(self._pool))
+ raise StopIteration
+
+
+
+class Job(object):
+
+ def __init__(self,pool=None,function=None,*args,**kwargs):
+ '''
+ Create a new job
+
+ @param pool: the jobpool used to run job. Can be None to not
+ execute the job immediately.
+ @type pool: JobPool instance
+
+ @param function: the function to run for the job
+ @type function: callable object
+
+ @param args: parametters for function call
+ @param kwargs: named parametters for function call
+
+ @precondition: function cannot be None
+ '''
+ assert function is not None
+ self._args=args
+ self._kwargs = kwargs
+ self._function = function
+ self.running = False
+ self.finished= False
+ self.submitted = None
+ self.priority = None
+ self.userid = None
+
+ if pool is not None:
+ pool.submit(self)
+
+ def runScore(self):
+ '''
+ @return: the score used to ordonnance job in the queue
+ @rtype: C{float}
+ '''
+
+ return (time() - self.submitted) * self.priority
+
+ def __call__(self):
+ return self._function(*self._args,**self._kwargs)
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/obitools/phylogeny/__init__.py b/src/obitools/phylogeny/__init__.py
new file mode 100644
index 0000000..8eb1587
--- /dev/null
+++ b/src/obitools/phylogeny/__init__.py
@@ -0,0 +1,119 @@
+
+from obitools.graph.tree import Forest,TreeNode
+from obitools.graph import Edge
+
+
+
+class PhylogenicTree(Forest):
+
+ def __init__(self,label='G',indexer=None,nodes=None,edges=None):
+ Forest.__init__(self, label, indexer, nodes, edges)
+ self.root=None
+ self.comment=None
+
+ def addNode(self,node=None,index=None,**data):
+ if node is None and index is None:
+ node = '__%d' % (len(self._node)+1)
+
+ return Forest.addNode(self, node, index, **data)
+
+ def getNode(self,node=None,index=None):
+ if index is None:
+ index = self._index.getIndex(node, True)
+ return PhylogenicNode(index,self)
+
+ def getEdge(self,node1=None,node2=None,index1=None,index2=None):
+ '''
+
+ @param node1:
+ @type node1:
+ @param node2:
+ @type node2:
+ @param index1:
+ @type index1:
+ @param index2:
+ @type index2:
+ '''
+ node1=self.getNode(node1, index1)
+ node2=self.getNode(node2, index2)
+ return PhylogenicEdge(node1,node2)
+
+
+
+class PhylogenicNode(TreeNode):
+
+ def getLabel(self):
+ label = TreeNode.getLabel(self)
+ if label[0:2]=='__':
+ return None
+ else:
+ return label
+
+ def __str__(self):
+
+ if self.index in self.graph._node_attrs:
+ keys = " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"'))
+ for x in self.graph._node_attrs[self.index].iteritems()]
+ )
+ else:
+ keys=''
+
+ if self.label is None:
+ label=''
+ shape='point'
+ else:
+ label=self.label
+ shape='box'
+
+ return '%d [label="%s" shape="%s" %s]' % (self.index,str(label).replace('"','\\"'),shape,keys)
+
+ def distanceTo(self,node=None,index=None):
+ '''
+ compute branch length between the two nodes.
+ If distances are not secified for this tree, None is returned.
+
+ @param node: a node label or None
+ @param index: a node index or None. the parameter index
+ has a priority on the parameter node.
+ @type index: int
+
+ @return: the evolutive distance between the two nodes
+ @rtype: int, float or None
+ '''
+ path = self.shortestPathTo(node, index)
+
+ start = path.pop(0)
+ dist=0
+ for dest in path:
+ edge = self.graph.getEdge(index1=start,index2=dest)
+ if 'distance' in edge:
+ dist+=edge['distance']
+ else:
+ return None
+ start=dest
+
+ return dist
+
+ label = property(getLabel, None, None, "Label of the node")
+
+class PhylogenicEdge(Edge):
+
+ def __str__(self):
+ e = (self.node1.index,self.node2.index)
+ if e in self.graph._edge_attrs:
+ keys = "[%s]" % " ".join(['%s="%s"' % (x[0],str(x[1]).replace('"','\\"'))
+ for x in self.graph._edge_attrs[e].iteritems()
+ if x[0] not in ('distance','bootstrap')]
+ )
+ else:
+ keys = ""
+
+
+
+ if self.directed:
+ link='->'
+ else:
+ link='--'
+
+ return "%d %s %d %s" % (self.node1.index,link,self.node2.index,keys)
+
diff --git a/src/obitools/phylogeny/newick.py b/src/obitools/phylogeny/newick.py
new file mode 100644
index 0000000..cf0330c
--- /dev/null
+++ b/src/obitools/phylogeny/newick.py
@@ -0,0 +1,123 @@
+import re
+import sys
+
+from obitools.utils import universalOpen
+from obitools.phylogeny import PhylogenicTree
+
+def subNodeIterator(data):
+ level=0
+ start = 1
+ if data[0]=='(':
+ for i in xrange(1,len(data)):
+ c=data[i]
+ if c=='(':
+ level+=1
+ elif c==')':
+ level-=1
+ if c==',' and not level:
+ yield data[start:i]
+ start = i+1
+ yield data[start:i]
+ else:
+ yield data
+
+
+_nodeParser=re.compile('\s*(?P<subnodes>\(.*\))?(?P<name>[^ :]+)? *(?P<bootstrap>[0-9.]+)?(:(?P<distance>-?[0-9.]+))?')
+
+def nodeParser(data):
+ parsedNode = _nodeParser.match(data).groupdict(0)
+ if not parsedNode['name']:
+ parsedNode['name']=None
+
+ if not parsedNode['bootstrap']:
+ parsedNode['bootstrap']=None
+ else:
+ parsedNode['bootstrap']=float(parsedNode['bootstrap'])
+
+ if not parsedNode['distance']:
+ parsedNode['distance']=None
+ else:
+ parsedNode['distance']=float(parsedNode['distance'])
+
+ if not parsedNode['subnodes']:
+ parsedNode['subnodes']=None
+
+ return parsedNode
+
+_cleanTreeData=re.compile('\s+')
+
+def treeParser(data,tree=None,parent=None):
+    '''
+    Parse the newick description of a node and, recursively,
+    of its subnodes.
+
+    @param data: the newick description of the node
+    @type data: str
+    @param tree: the tree to complete. If None a new
+                 PhylogenicTree is created
+    @param parent: the parent of the parsed node as returned by a
+                   previous call to addNode/addEdge, or None for
+                   the root of the tree
+
+    @return: the tree
+    '''
+    if tree is None:
+        tree = PhylogenicTree()
+        data = _cleanTreeData.sub(' ',data).strip()
+
+    parsedNode = nodeParser(data)
+
+    if parent is not None:
+        son,parent = tree.addEdge(node1=parsedNode['name'],
+                                  index2=parent,
+                                  distance=parsedNode['distance'],
+                                  bootstrap=parsedNode['bootstrap'])
+    else:
+        # The label parameter of addNode is named 'node'. Given as
+        # 'node1' the name was stored as a plain node attribute and
+        # the root always received an automatic label.
+        son = tree.addNode(node=parsedNode['name'])
+        tree.root=son
+
+
+
+    if parsedNode['subnodes']:
+        for subnode in subNodeIterator(parsedNode['subnodes']):
+            treeParser(subnode,tree,son)
+
+    return tree
+
+
+_treecomment=re.compile('\[.*\]')
+
+def treeIterator(file):
+    '''
+    Iterate over the trees described in a newick file.
+
+    The whole file is read at once. The bracketed comments are
+    removed from the data and the first one is attached to each
+    tree through its comment attribute.
+
+    @param file: the file to read, as accepted by universalOpen
+
+    @return: an iterator over the parsed trees
+    '''
+    file = universalOpen(file)
+    data = file.read()
+
+    comment = _treecomment.findall(data)
+    data=_treecomment.sub('',data).strip()
+
+    if comment:
+        comment=comment[0]
+    else:
+        comment=None
+    for tree in data.split(';'):
+        # The ';' ending the last tree leaves an empty chunk
+        # which does not describe a tree.
+        if not tree.strip():
+            continue
+        t = treeParser(tree)
+        if comment:
+            t.comment=comment
+        yield t
+
+
+def nodeWriter(tree,node,deep=0):
+ name = node._name
+ if name is None:
+ name=''
+
+ distance=node._dist
+ if distance is None:
+ distance=''
+ else:
+ distance = ':%6.5f' % distance
+
+ bootstrap=node._bootstrap
+ if bootstrap is None:
+ bootstrap=''
+ else:
+ bootstrap=' %d' % int(bootstrap)
+
+ nodeseparator = ',\n' + ' ' * (deep+1)
+
+ subnodes = nodeseparator.join([nodeWriter(tree, x, deep+1)
+ for x in tree.childNodeIterator(node)])
+ if subnodes:
+ subnodes='(\n' + ' ' * (deep+1) + subnodes + '\n' + ' ' * deep + ')'
+
+ return '%s%s%s%s' % (subnodes,name,bootstrap,distance)
+
+def treeWriter(tree,startnode=None):
+ if startnode is not None:
+ root=startnode
+ else:
+ root = tree.getRoot()
+ return nodeWriter(tree,root)+';'
diff --git a/src/obitools/profile/__init__.py b/src/obitools/profile/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/profile/_profile.pxd b/src/obitools/profile/_profile.pxd
new file mode 100644
index 0000000..8d79b35
--- /dev/null
+++ b/src/obitools/profile/_profile.pxd
@@ -0,0 +1,72 @@
+
+cdef import from "stdlib.h":
+ void* malloc(int size) except NULL
+ void* realloc(void* chunk,int size) except NULL
+ void free(void* chunk)
+
+cdef import from "string.h":
+ void bzero(void *s, size_t n)
+ void memset(void* chunk,int car,int length)
+ void memcpy(void* s1, void* s2, int n)
+ int memcmp(void* s1, void* s2, int n)
+
+cdef import from "math.h":
+ double exp(double x)
+
+cdef extern from *:
+ ctypedef int* int_p "int*"
+
+
+cdef struct dnaprofile_t:
+ int length
+ int weight
+ int value
+ double pseudo
+ int_p A
+ int_p C
+ int_p G
+ int_p T
+ int_p Og
+ int_p Eg
+
+
+cdef dnaprofile_t* allocateDNAProfile(int size)
+cdef void freeDNAProfile(dnaprofile_t* profile)
+cdef void copyDNAProfile(dnaprofile_t* dest, dnaprofile_t* source)
+
+cdef class _MemIntArray:
+
+ cdef int_p start
+ cdef int size
+
+ cdef initialize(self, int_p start,int size)
+ cdef int normalize(self, int pos)
+ cpdef double frequency(self,int pos, int weight, double pseudo=?)
+
+
+cdef class DNAProfile:
+
+ cdef dnaprofile_t* profile
+ cdef _MemIntArray _baseA
+ cdef _MemIntArray _baseC
+ cdef _MemIntArray _baseG
+ cdef _MemIntArray _baseT
+ cdef _MemIntArray _Og
+ cdef _MemIntArray _Eg
+
+ cpdef bint equal(self,DNAProfile profile)
+ cpdef DNAProfile add(DNAProfile self,DNAProfile profile)
+ cpdef double lproba(DNAProfile self,DNAProfile profile) except 1.
+ cpdef double proba(DNAProfile self,DNAProfile profile) except -1.
+
+ cdef void _initLetter(self)
+ cdef void _initFromString(self, char *seq)
+
+ cpdef double fA(self,int pos)
+ cpdef double fC(self,int pos)
+ cpdef double fG(self,int pos)
+ cpdef double fT(self,int pos)
+ cpdef double fOg(self,int pos)
+ cpdef double fEg(self,int pos)
+
+ cpdef int WP(self)
\ No newline at end of file
diff --git a/src/obitools/profile/_profile.pyx b/src/obitools/profile/_profile.pyx
new file mode 100644
index 0000000..459ed57
--- /dev/null
+++ b/src/obitools/profile/_profile.pyx
@@ -0,0 +1,289 @@
+from obitools import NucSequence
+from math import log
+from _profile cimport *
+
+
+cdef dnaprofile_t* allocateDNAProfile(int size):
+
+ cdef dnaprofile_t* profile
+ cdef int sblock
+ profile = <dnaprofile_t*>malloc(sizeof(dnaprofile_t))
+ profile.length = size
+ profile.weight = 0
+ profile.pseudo = 0
+ sblock = sizeof(int)*6*size
+ profile.A = <int*>malloc(sblock)
+ bzero(<void*>profile.A, sblock)
+ profile.C = profile.A + size
+ profile.G = profile.C + size
+ profile.T = profile.G + size
+ profile.Og= profile.T + size
+ profile.Eg= profile.Og+ size
+
+ return profile
+
+
+cdef void freeDNAProfile(dnaprofile_t *profile):
+ if profile is not NULL:
+ if profile.A is not NULL:
+ free(profile.A)
+
+ free(profile)
+
+
+cdef void copyDNAProfile(dnaprofile_t* dest, dnaprofile_t *source):
+ cdef int size
+
+ assert source is not NULL and dest is not NULL
+ assert source.length==dest.length
+
+ size = source.length * 6 * sizeof(int)
+ memcpy(dest.A,source.A,size)
+ dest.weight=source.weight
+ dest.pseudo=source.pseudo
+
+
+cdef class _MemIntArray:
+
+ cdef initialize(self, int* begin,int size):
+ self.start=begin
+ self.size=size
+
+ cdef int normalize(self, int pos):
+ if pos < 0:
+ pos = self.size + pos
+
+ if pos >= self.size or pos < 0:
+ raise IndexError
+
+ return pos
+
+ def __init__(self):
+ self.start=NULL
+
+ def __getitem__(self, int pos):
+ pos = self.normalize(pos)
+ return self.start[pos]
+
+ def __setitem__(self,int pos, int value):
+ pos = self.normalize(pos)
+ self.start[pos]=value
+
+ def __len__(self):
+ return self.size
+
+ cpdef double frequency(self,int pos, int weight, double pseudo=0):
+ pos = self.normalize(pos)
+ if weight==0:
+ raise ZeroDivisionError
+ pseudo*=weight
+ return (<double>self.start[pos]+ weight * pseudo/6) / (<double>weight + pseudo)
+
+
+cdef class DNAProfile:
+
+ cdef void _initLetter(self):
+ cdef dnaprofile_t* profile = self.profile
+
+ self._baseA = _MemIntArray()
+ self._baseA.initialize(profile.A ,profile.length)
+ self._baseC = _MemIntArray()
+ self._baseC.initialize(profile.C ,profile.length)
+ self._baseG = _MemIntArray()
+ self._baseG.initialize(profile.G ,profile.length)
+ self._baseT = _MemIntArray()
+ self._baseT.initialize(profile.T ,profile.length)
+ self._Og = _MemIntArray()
+ self._Og.initialize(profile.Og,profile.length)
+ self._Eg = _MemIntArray()
+ self._Eg.initialize(profile.Eg,profile.length)
+
+
+ def __init__(self,sequence=None,size=None,pseudo=0):
+
+ if sequence is not None:
+ size = len(sequence)
+
+ self.profile = allocateDNAProfile(size)
+ self.profile.pseudo=pseudo
+ if sequence is not None:
+ if isinstance(sequence,NucSequence):
+ seq = str(sequence).lower()
+ self._initFromString(seq)
+ elif isinstance(sequence,str):
+ seq = sequence.lower()
+ self._initFromString(seq)
+ elif isinstance(sequence,DNAProfile):
+ copyDNAProfile(self.profile,(<DNAProfile>sequence).profile)
+
+ self._initLetter()
+
+
+ def __dealloc__(self):
+ freeDNAProfile(self.profile)
+
+ def __hash__(self):
+ return id(self)
+
+ def __str__(self):
+ cdef int i
+ cdef int lseq = self.profile.length
+ cdef list output=[]
+ cdef str line
+ cdef int* A= self.profile.A
+ cdef int* C= self.profile.C
+ cdef int* G= self.profile.G
+ cdef int* T= self.profile.T
+ cdef int* Og=self.profile.Og
+ cdef int* Eg=self.profile.Eg
+
+ for i in range(lseq):
+ line = "%6d %6d %6d %6d %6d %6d %6d " % (i,A[i],C[i],G[i],T[i],Og[i],Eg[i])
+ output.append(line)
+
+ line = "\n".join(output)
+ return " pos A C G T Og Eg\n"+line
+
+
+ def __len__(self):
+ return self.profile.length
+
+
+ cpdef bint equal(DNAProfile self,DNAProfile profile):
+ cdef int sblock
+ cdef bint r
+ cdef int size
+ r=False
+ if self.profile.length == profile.profile.length :
+ if self.profile.weight == profile.profile.weight :
+ size = self.profile.length
+ sblock = sizeof(int)*6*size
+ r = memcmp(<void*>self.profile.A, <void*>profile.profile.A, sblock) == 0
+ return r
+
+
+ def __richcmp__(DNAProfile self,DNAProfile profile,int op):
+ if op==2:
+ return self.equal(profile)
+ else:
+ return NotImplemented
+
+
+    cpdef DNAProfile add(DNAProfile self,DNAProfile profile):
+        '''
+        Sum two profiles of identical length.
+
+        @param profile: the profile to add to this one
+        @type profile: DNAProfile instance
+
+        @return: a new profile whose counts and weight are the sums
+                 of the operand ones, and whose pseudo count is the
+                 largest of the operand ones
+        @rtype: DNAProfile instance
+        '''
+        cdef DNAProfile newProfile
+        cdef int p
+        assert self.profile.length==profile.profile.length,'Only profiles with identical length can be added'
+        pc = max(self.profile.pseudo,profile.profile.pseudo)
+        newProfile = DNAProfile(size=self.profile.length,pseudo=pc)
+        for p in xrange(self.profile.length) :
+            # Every row is read through the C structure. The A row went
+            # through the python A property of the second operand.
+            newProfile.profile.A[p] = self.profile.A[p] + profile.profile.A[p]
+            newProfile.profile.C[p] = self.profile.C[p] + profile.profile.C[p]
+            newProfile.profile.T[p] = self.profile.T[p] + profile.profile.T[p]
+            newProfile.profile.G[p] = self.profile.G[p] + profile.profile.G[p]
+            newProfile.profile.Og[p] = self.profile.Og[p] + profile.profile.Og[p]
+            newProfile.profile.Eg[p] = self.profile.Eg[p] + profile.profile.Eg[p]
+        newProfile.profile.weight = self.profile.weight + profile.profile.weight
+        return newProfile
+
+
+ def __add__(DNAProfile self,DNAProfile profile):
+ return self.add(profile)
+
+
+ cpdef double lproba(DNAProfile self,DNAProfile profile) except 1.:
+ cdef float score
+ cdef float prob
+ cdef int pos
+ assert self.profile.length==profile.profile.length,'Only profiles with identical length can be added'
+ score = 0
+ for pos in xrange(self.profile.length) :
+ prob = self.fA(pos)*profile.fA(pos) + \
+ self.fC(pos)*profile.fC(pos) + \
+ self.fT(pos)*profile.fT(pos) + \
+ self.fG(pos)*profile.fG(pos) + \
+ self.fOg(pos)*profile.fOg(pos) + \
+ self.fEg(pos)*profile.fEg(pos)
+ #if prob != 0 :
+ score += log(prob)
+ return score
+
+
+ cpdef double proba(DNAProfile self,DNAProfile profile) except -1.:
+ return exp(self.lproba(profile))
+
+
+ cdef void _initFromString(self, char *seq):
+ cdef int i=0
+ cdef int lseq = len(seq)
+ cdef int* A= self.profile.A
+ cdef int* C= self.profile.C
+ cdef int* G= self.profile.G
+ cdef int* T= self.profile.T
+ cdef int* Og=self.profile.Og
+ cdef int* Eg=self.profile.Eg
+
+ for i in range(lseq):
+ nuc = seq[i]
+ if nuc=='a':
+ A[i]=1
+ elif nuc=='c':
+ C[i]=1
+ elif nuc=='g':
+ G[i]=1
+ elif nuc=='t':
+ T[i]=1
+ elif nuc=='-':
+ if i > 0 and seq[i-1]=='-':
+ Eg[i]=1
+ else:
+ Og[i]=1
+
+ self.profile.weight = 1
+
+ property A:
+ def __get__(self):
+ return self._baseA
+
+ property C:
+ def __get__(self):
+ return self._baseC
+
+ property G:
+ def __get__(self):
+ return self._baseG
+
+ property T:
+ def __get__(self):
+ return self._baseT
+
+ property Og:
+ def __get__(self):
+ return self._Og
+
+ property Eg:
+ def __get__(self):
+ return self._Eg
+
+
+ cpdef double fA(self,int pos):
+ return self.A.frequency(pos,self.profile.weight,self.profile.pseudo)
+
+ cpdef double fC(self,int pos):
+ return self.C.frequency(pos,self.profile.weight,self.profile.pseudo)
+
+ cpdef double fG(self,int pos):
+ return self.G.frequency(pos,self.profile.weight,self.profile.pseudo)
+
+ cpdef double fT(self,int pos):
+ return self.T.frequency(pos,self.profile.weight,self.profile.pseudo)
+
+ cpdef double fOg(self,int pos):
+ return self.Og.frequency(pos,self.profile.weight,self.profile.pseudo)
+
+ cpdef double fEg(self,int pos):
+ return self.Eg.frequency(pos,self.profile.weight,self.profile.pseudo)
+
+
+ cpdef int WP(self):
+ return self.profile.weight
+
diff --git a/src/obitools/sample.py b/src/obitools/sample.py
new file mode 100644
index 0000000..7c68e96
--- /dev/null
+++ b/src/obitools/sample.py
@@ -0,0 +1,87 @@
+'''
+Created on 31 oct. 2009
+
+@author: coissac
+'''
+from random import randrange, sample
+try:
+ from collections import Counter
+except ImportError:
+ from obitools.collections import Counter
+
+
+def lookfor(x,cumsum):
+    '''
+    Locate the bin of a cumulative sum that contains C{x}.
+
+    @param x: the value to locate; must be strictly smaller than the
+              last element of C{cumsum}
+    @param cumsum: a non-decreasing list of cumulative sums
+    @type cumsum: C{list}
+
+    @return: the smallest index C{i} such that C{x < cumsum[i]}
+    @rtype: int
+
+    @raise AssertionError: if C{x} is not smaller than C{cumsum[-1]}
+    '''
+    # Local import: the module header does not import bisect.
+    from bisect import bisect_right
+
+    assert x < cumsum[-1],"x must be smaller then cumulative sum"
+
+    # bisect_right returns the first index whose value is strictly greater
+    # than x, which is what the former hand-written binary search computed
+    # (including skipping past entries equal to x).
+    return bisect_right(cumsum,x)
+
+def weigthedSample(events,size):
+    '''
+    Draw C{size} events with replacement, each event being picked with a
+    probability proportional to its weight.
+
+    @param events: mapping from event to its integer weight
+    @type events: C{dict} like object
+    @param size: number of draws
+    @type size: int
+
+    @return: the number of times each event was drawn
+    @rtype: C{Counter}
+    '''
+    entries = list(events.keys())
+    cumul = []
+    s = 0
+    for e in entries:
+        s += events[e]
+        cumul.append(s)
+
+    c = [randrange(0,s) for x in xrange(size)]
+    c.sort()
+
+    # Event i owns the half-open range [cumul[i-1], cumul[i]). The draws are
+    # sorted, so a single forward sweep is enough. The test must be `>=`:
+    # with `>` a draw equal to cumul[i] was credited to event i, so
+    # zero-weight events could be drawn and the last event was under-sampled.
+    i = 0
+    for j,v in enumerate(c):
+        while v >= cumul[i]:
+            i += 1
+        c[j] = entries[i]
+
+    return Counter(c)
+
+def weigthedSampleWithoutReplacement(events,size):
+    '''
+    Draw C{size} events without replacement: each event can be drawn at
+    most as many times as its weight.
+
+    @param events: mapping from event to its integer weight (count)
+    @type events: C{dict} like object
+    @param size: number of draws; must not exceed the sum of the weights
+    @type size: int
+
+    @return: the number of times each event was drawn
+    @rtype: C{Counter}
+    '''
+    entries = list(events.keys())
+    cumul = []
+    s = 0
+    for e in entries:
+        s += events[e]
+        cumul.append(s)
+
+    # Distinct tickets in [0, s); xrange keeps this cheap for large s.
+    c = sample(xrange(s),size)
+    c.sort()
+
+    # Event i owns tickets [cumul[i-1], cumul[i]); `>=` is required so that
+    # ticket cumul[i] goes to the next event (it was `>`, which let an
+    # event be drawn once more than its weight).
+    i = 0
+    for j,v in enumerate(c):
+        while v >= cumul[i]:
+            i += 1
+        c[j] = entries[i]
+
+    return Counter(c)
diff --git a/src/obitools/seqdb/__init__.py b/src/obitools/seqdb/__init__.py
new file mode 100644
index 0000000..ef89f43
--- /dev/null
+++ b/src/obitools/seqdb/__init__.py
@@ -0,0 +1,88 @@
+from obitools import NucSequence,AASequence
+from obitools.format.genericparser import genericEntryIteratorGenerator
+from obitools.location.feature import featureIterator
+
+from itertools import chain
+
+# Mixin holding the annotations of a database entry: the raw header text,
+# the feature table (kept as text and parsed lazily) and the secondary
+# accession numbers. extractTaxon() uses `in self` and `self[...]`, so the
+# concrete class must also provide mapping-style access.
+class AnnotatedSequence(object):
+
+ def __init__(self,header,featureTable,secondaryAcs):
+ self._header = header
+ self._featureTableText = featureTable
+ # Parsed feature list, built on first call to getFeatureTable().
+ self._featureTable=None
+ self._secondaryAcs=secondaryAcs
+ # Cleared by extractTaxon() when no usable taxid is found, so that the
+ # feature table is not scanned again.
+ self._hasTaxid=True
+
+ def getHeader(self):
+ return self._header
+
+
+ # Parses the feature table text once and caches the resulting list.
+ # skipError is forwarded to featureIterator and only matters on the
+ # first call.
+ def getFeatureTable(self,skipError=False):
+ if self._featureTable is None:
+ self._featureTable = [x for x in featureIterator(self._featureTableText,skipError)]
+ return self._featureTable
+
+
+ def getSecondaryAcs(self):
+ return self._secondaryAcs
+
+ # Sets self['taxid'] and self['organism'] from the 'source' features when
+ # exactly one distinct value is found for each.
+ def extractTaxon(self):
+ if b'taxid' not in self and self._hasTaxid:
+
+ if self._featureTable is not None:
+ s = [f for f in self._featureTable if f.ftType=='source']
+ else:
+ # Table not parsed yet: parse only its first feature, and fall back
+ # to the full (cached) table when that one is not a 'source'.
+ s = featureIterator(self._featureTableText).next()
+ if s.ftType=='source':
+ s = [s]
+ else:
+ s = [f for f in self.featureTable if f.ftType=='source']
+
+ # Integer part of every 'taxon:<n>' db_xref found on source features.
+ t =set(int(v[6:]) for v in chain(*tuple(f['db_xref'] for f in s if 'db_xref' in f))
+ if v[0:6]=='taxon:')
+
+ self._hasTaxid=False
+
+ if len(t)==1 :
+ taxid=t.pop()
+ if taxid >=0:
+ self['taxid']=taxid
+ self._hasTaxid=True
+
+
+ t =set(chain(*tuple(f['organism'] for f in s if 'organism' in f)))
+
+ if len(t)==1:
+ self['organism']=t.pop()
+
+
+ header = property(getHeader, None, None, "Header's Docstring")
+
+ featureTable = property(getFeatureTable, None, None, "FeatureTable's Docstring")
+
+ secondaryAcs = property(getSecondaryAcs, None, None, "SecondaryAcs's Docstring")
+
+class AnnotatedNucSequence(AnnotatedSequence,NucSequence):
+ '''
+ Nucleic sequence carrying database annotations (header, feature table,
+ secondary accession numbers).
+ '''
+ def __init__(self,id,seq,de,header,featureTable,secondaryAcs,**info):
+ NucSequence.__init__(self, id, seq, de,**info)
+ AnnotatedSequence.__init__(self, header, featureTable, secondaryAcs)
+
+
+class AnnotatedAASequence(AnnotatedSequence,AASequence):
+ '''
+ Protein sequence carrying database annotations (header, feature table,
+ secondary accession numbers).
+ '''
+ def __init__(self,id,seq,de,header,featureTable,secondaryAcs,**info):
+ AASequence.__init__(self, id, seq, de,**info)
+ AnnotatedSequence.__init__(self, header, featureTable, secondaryAcs)
+
+
+
+nucEntryIterator=genericEntryIteratorGenerator(endEntry='^//')
+aaEntryIterator=genericEntryIteratorGenerator(endEntry='^//')
+
+
+
diff --git a/src/obitools/seqdb/blastdb/__init__.py b/src/obitools/seqdb/blastdb/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/seqdb/dnaparser.py b/src/obitools/seqdb/dnaparser.py
new file mode 100644
index 0000000..85b82a2
--- /dev/null
+++ b/src/obitools/seqdb/dnaparser.py
@@ -0,0 +1,16 @@
+from obitools.format.sequence import embl,fasta,genbank
+
+class UnknownFormatError(Exception):
+ pass
+
+def whichParser(seq):
+    '''
+    Select the parser matching the format of a nucleic entry.
+
+    The format is recognised from the first characters of the entry:
+    C{>} for fasta, C{ID} for EMBL and C{LOCUS} for Genbank.
+
+    @param seq: the text of one entry
+    @type seq: str
+
+    @return: the parser function to apply to C{seq}
+
+    @raise UnknownFormatError: if no format is recognised; this now
+           includes the empty entry, which used to raise C{IndexError}
+    '''
+    # Slices never raise on short input, unlike seq[0].
+    if seq[0:1]=='>':
+        return fasta.fastaNucParser
+    if seq[0:2]=='ID':
+        return embl.emblParser
+    if seq[0:5]=='LOCUS':
+        return genbank.genbankParser
+    raise UnknownFormatError("Unknown nucleic format")
+
+def nucleicParser(seq):
+    '''
+    Parse a nucleic entry with the parser that L{whichParser} selects.
+    '''
+    parser = whichParser(seq)
+    return parser(seq)
diff --git a/src/obitools/seqdb/embl/__init__.py b/src/obitools/seqdb/embl/__init__.py
new file mode 100644
index 0000000..94f9efc
--- /dev/null
+++ b/src/obitools/seqdb/embl/__init__.py
@@ -0,0 +1,13 @@
+from obitools.seqdb import AnnotatedNucSequence, AnnotatedAASequence
+from obitools.location import locationGenerator,extractExternalRefs
+
+
+
+class EmblSequence(AnnotatedNucSequence):
+ '''
+ Class used to represent a nucleic sequence issued from EMBL.
+ '''
+
+
+
+
diff --git a/src/obitools/seqdb/embl/parser.py b/src/obitools/seqdb/embl/parser.py
new file mode 100644
index 0000000..b90278f
--- /dev/null
+++ b/src/obitools/seqdb/embl/parser.py
@@ -0,0 +1,52 @@
+import re
+import sys
+
+from obitools.seqdb import embl
+from obitools.seqdb import nucEntryIterator
+
+_featureMatcher = re.compile('(^FT .*\n)+', re.M)
+_cleanFT = re.compile('^FT',re.M)
+
+_headerMatcher = re.compile('^ID.+(?=\nFH )', re.DOTALL)
+_seqMatcher = re.compile('(^ ).+(?=//\n)', re.DOTALL + re.M)
+_cleanSeq = re.compile('[ \n0-9]+')
+_acMatcher = re.compile('(?<=^AC ).+',re.M)
+_deMatcher = re.compile('(^DE .+\n)+',re.M)
+_cleanDe = re.compile('(^|\n)DE +')
+
+def __emblparser(text):
+    '''
+    Split the text of one EMBL entry into its components.
+
+    @param text: the complete text of the entry
+    @type text: str
+
+    @return: the tuple C{(ac,seq,de,header,ft,acs)}: primary accession,
+             upper-cased sequence, description, header text, feature
+             table text and list of secondary accessions
+
+    @raise AttributeError: if one of the expected sections is missing;
+           the entry is dumped on stderr first
+    '''
+    try:
+        header = _headerMatcher.search(text).group()
+
+        ft = _featureMatcher.search(text).group()
+        ft = _cleanFT.sub(' ',ft)
+
+        seq = _seqMatcher.search(text).group()
+        seq = _cleanSeq.sub('',seq).upper()
+
+        # First accession is the primary one, the others are secondary.
+        acs = _acMatcher.search(text).group()
+        acs = acs.replace(';', ' ')
+        acs = acs.split()
+        ac = acs[0]
+        acs = acs[1:]
+
+        de = _deMatcher.search(header).group()
+        de = _cleanDe.sub(' ',de).strip().strip('.')
+    except AttributeError:
+        # A matcher returned None (section not found).
+        print >>sys.stderr,'======================================================='
+        print >>sys.stderr,text
+        print >>sys.stderr,'======================================================='
+        # Bare raise keeps the original traceback, which points at the
+        # matcher that failed; `raise e` replaced it with this line.
+        raise
+
+    return (ac,seq,de,header,ft,acs)
+
+def emblParser(text):
+    '''
+    Build an C{EmblSequence} from the text of one EMBL entry.
+    '''
+    fields = __emblparser(text)
+    return embl.EmblSequence(*fields)
+
+
+def emblIterator(file):
+    '''
+    Iterate over the entries of an EMBL flat file, yielding one parsed
+    sequence per entry.
+    '''
+    for entryText in nucEntryIterator(file):
+        sequence = emblParser(entryText)
+        yield sequence
+
+
diff --git a/src/obitools/seqdb/genbank/__init__.py b/src/obitools/seqdb/genbank/__init__.py
new file mode 100644
index 0000000..fb5b622
--- /dev/null
+++ b/src/obitools/seqdb/genbank/__init__.py
@@ -0,0 +1,84 @@
+from obitools.seqdb import AnnotatedNucSequence, AnnotatedAASequence
+from obitools.location import locationGenerator,extractExternalRefs
+
+
+
+class GbSequence(AnnotatedNucSequence):
+ '''
+ Class used to represent a nucleic sequence issued from Genbank.
+ '''
+
+
+class GpepSequence(AnnotatedAASequence):
+ '''
+ Class used to represent a peptidic sequence issued from Genpep.
+ '''
+
+ def __init__(self,id,seq,de,header,featureTable,secondaryAcs,**info):
+ AnnotatedAASequence.__init__(self,id, seq, de, header, featureTable, secondaryAcs,**info)
+ # Tri-state cache: None = feature table not inspected yet,
+ # False/True = result of the lookup done by __getGeneRef.
+ self.__hasNucRef=None
+
+ # Looks once for a CDS feature carrying a 'coded_by' qualifier and, when
+ # it references exactly one accession, stores (ac,loc,tt) in __nucRef.
+ def __getGeneRef(self):
+ if self.__hasNucRef is None:
+ self.__hasNucRef=False
+ cds = [x for x in self.featureTable
+ if x.ftType=='CDS'
+ and 'coded_by' in x]
+
+ if cds:
+ # Only the first matching CDS feature is considered.
+ source = cds[0]['coded_by'][0]
+ if 'transl_table' in cds[0]:
+ tt = cds[0]['transl_table'][0]
+ else:
+ tt=None
+ ac,loc = extractExternalRefs(source)
+
+ if len(ac)==1:
+ ac = ac.pop()
+ self.__hasNucRef=True
+ self.__nucRef = (ac,loc,tt)
+
+
+
+ def geneAvailable(self):
+ '''
+ Predicate indicating if a reference to the nucleic sequence encoding
+ this protein is available in the feature table.
+
+ @return: True if gene description is available
+ @rtype: bool
+ '''
+ self.__getGeneRef()
+ return self.__hasNucRef is not None and self.__hasNucRef
+
+
+ def getCDS(self,database):
+ '''
+ Return the nucleic sequence coding for this protein if
+ data are available.
+
+ @param database: a database object where looking for the sequence
+ @type database: a C{dict} like object
+
+ @return: a NucBioseq instance corresponding to the CDS
+ @rtype: NucBioSeq
+
+ @raise AssertionError: if no gene references are available
+ @see: L{geneAvailable}
+
+ '''
+
+ assert self.geneAvailable(), \
+ "No information available to retreive gene sequence"
+
+ ac,loc,tt = self.__nucRef
+ seq = database[ac]
+ # Annotate the source entry with its taxon before cutting the gene out.
+ seq.extractTaxon()
+ gene = seq[loc]
+ if tt is not None:
+ gene['transl_table']=tt
+ return gene
+
+
+
+
diff --git a/src/obitools/seqdb/genbank/ncbi.py b/src/obitools/seqdb/genbank/ncbi.py
new file mode 100644
index 0000000..40ddf91
--- /dev/null
+++ b/src/obitools/seqdb/genbank/ncbi.py
@@ -0,0 +1,79 @@
+from urllib2 import urlopen
+import sys
+import re
+
+import cStringIO
+
+from obitools.eutils import EFetch
+from parser import genbankParser,genpepParser
+from parser import genbankIterator,genpepIterator
+
+from obitools.utils import CachedDB
+
+
+class NCBIGenbank(EFetch):
+    '''
+    Dictionary-like access to Genbank nucleotide entries through EFetch.
+    '''
+    def __init__(self):
+        EFetch.__init__(self,db='nucleotide',rettype='gbwithparts')
+
+    def __getitem__(self,ac):
+        '''
+        @param ac: one accession (C{str}) or an iterable of accessions
+        @return: a parsed sequence for a single accession, otherwise an
+                 iterator over the parsed sequences
+        '''
+        if not isinstance(ac,str):
+            query = ','.join(ac)
+            data = cStringIO.StringIO(self.get(id=query))
+            return genbankIterator(data)
+
+        return genbankParser(self.get(id=ac))
+
+
+
+
+class NCBIGenpep(EFetch):
+    '''
+    Dictionary-like access to Genpep protein entries through EFetch.
+    '''
+    def __init__(self):
+        EFetch.__init__(self,db='protein',rettype='gbwithparts')
+
+    def __getitem__(self,ac):
+        '''
+        @param ac: one accession (C{str}) or an iterable of accessions
+        @return: a parsed sequence for a single accession, otherwise an
+                 iterator over the parsed sequences
+        '''
+        if not isinstance(ac,str):
+            query = ','.join(ac)
+            data = cStringIO.StringIO(self.get(id=query))
+            return genpepIterator(data)
+
+        return genpepParser(self.get(id=ac))
+
+class NCBIAccession(EFetch):
+    '''
+    Dictionary-like resolution of identifiers to accession numbers,
+    read from the 'seqid' report returned by EFetch.
+    '''
+
+    _matchACS = re.compile(' +accession +"([^"]+)"')
+
+    def __init__(self):
+        EFetch.__init__(self,db='nucleotide',rettype='seqid')
+
+    def __getitem__(self,ac):
+        '''
+        @param ac: one identifier (C{str}) or an iterable of identifiers
+        @return: the accession for a single identifier, otherwise a
+                 generator over every accession found in the report
+        '''
+        matcher = NCBIAccession._matchACS
+        if not isinstance(ac,str):
+            text = self.get(id=','.join(ac))
+            return (found.group(1) for found in matcher.finditer(text))
+
+        text = self.get(id=ac)
+        return matcher.search(text).group(1)
+
+def Genbank(cache=None):
+    '''
+    Return a Genbank accessor, wrapped in a C{CachedDB} when a cache is
+    given.
+    '''
+    database = NCBIGenbank()
+    if cache is None:
+        return database
+    return CachedDB(cache, database)
+
+
+def Genpep(cache=None):
+    '''
+    Return a Genpep accessor, wrapped in a C{CachedDB} when a cache is
+    given.
+    '''
+    database = NCBIGenpep()
+    if cache is None:
+        return database
+    return CachedDB(cache, database)
+
+
diff --git a/src/obitools/seqdb/genbank/parser.py b/src/obitools/seqdb/genbank/parser.py
new file mode 100644
index 0000000..b52fe59
--- /dev/null
+++ b/src/obitools/seqdb/genbank/parser.py
@@ -0,0 +1,53 @@
+import re
+import sys
+
+import obitools.seqdb.genbank as gb
+from obitools.seqdb import nucEntryIterator,aaEntryIterator
+
+_featureMatcher = re.compile('^FEATURES.+\n(?=ORIGIN)',re.DOTALL + re.M)
+
+_headerMatcher = re.compile('^LOCUS.+(?=\nFEATURES)', re.DOTALL + re.M)
+_seqMatcher = re.compile('(?<=ORIGIN).+(?=//\n)', re.DOTALL + re.M)
+_cleanSeq = re.compile('[ \n0-9]+')
+_acMatcher = re.compile('(?<=^ACCESSION ).+',re.M)
+_deMatcher = re.compile('(?<=^DEFINITION ).+\n( .+\n)*',re.M)
+_cleanDe = re.compile('\n *')
+
+def __gbparser(text):
+    '''
+    Split the text of one Genbank/Genpep entry into its components.
+
+    @param text: the complete text of the entry
+    @type text: str
+
+    @return: the tuple C{(ac,seq,de,header,ft,acs)}: primary accession,
+             upper-cased sequence, description, header text, feature
+             table text and list of secondary accessions
+
+    @raise AttributeError: if one of the expected sections is missing;
+           the entry is dumped on stderr first
+    '''
+    try:
+        header = _headerMatcher.search(text).group()
+        ft = _featureMatcher.search(text).group()
+        seq = _seqMatcher.search(text).group()
+        seq = _cleanSeq.sub('',seq).upper()
+        # First accession is the primary one, the others are secondary.
+        acs = _acMatcher.search(text).group()
+        acs = acs.split()
+        ac = acs[0]
+        acs = acs[1:]
+        de = _deMatcher.search(header).group()
+        de = _cleanDe.sub(' ',de).strip().strip('.')
+    except AttributeError:
+        # A matcher returned None (section not found).
+        print >>sys.stderr,'======================================================='
+        print >>sys.stderr,text
+        print >>sys.stderr,'======================================================='
+        # Bare raise keeps the original traceback, which points at the
+        # matcher that failed; `raise e` replaced it with this line.
+        raise
+
+    return (ac,seq,de,header,ft,acs)
+
+def genbankParser(text):
+    '''
+    Build a C{GbSequence} from the text of one Genbank entry.
+    '''
+    fields = __gbparser(text)
+    return gb.GbSequence(*fields)
+
+
+def genbankIterator(file):
+    '''
+    Iterate over the entries of a Genbank flat file, yielding one parsed
+    sequence per entry.
+    '''
+    for entryText in nucEntryIterator(file):
+        sequence = genbankParser(entryText)
+        yield sequence
+
+
+def genpepParser(text):
+    '''
+    Build a C{GpepSequence} from the text of one Genpep entry.
+    '''
+    fields = __gbparser(text)
+    return gb.GpepSequence(*fields)
+
+
+def genpepIterator(file):
+    '''
+    Iterate over the entries of a Genpep flat file, yielding one parsed
+    sequence per entry.
+    '''
+    for entryText in aaEntryIterator(file):
+        sequence = genpepParser(entryText)
+        yield sequence
+
+
\ No newline at end of file
diff --git a/src/obitools/sequenceencoder/__init__.py b/src/obitools/sequenceencoder/__init__.py
new file mode 100644
index 0000000..89a8a59
--- /dev/null
+++ b/src/obitools/sequenceencoder/__init__.py
@@ -0,0 +1,73 @@
+from obitools import location
+
+class SequenceEncoder(object):
+ pass
+
+# Encoder producing the reverse complement of a nucleic sequence.
+class DNAComplementEncoder(SequenceEncoder):
+ # IUPAC complement table, lower case only; 'u' is complemented to 'a'.
+ _comp={'a': 't', 'c': 'g', 'g': 'c', 't': 'a',
+ 'r': 'y', 'y': 'r', 'k': 'm', 'm': 'k',
+ 's': 's', 'w': 'w', 'b': 'v', 'd': 'h',
+ 'h': 'd', 'v': 'b', 'n': 'n', 'u': 'a',
+ '-': '-'}
+
+ _info={'complemented':True}
+
+ # Complements every symbol of seq[position]; the default position is the
+ # whole sequence read backwards. Symbols are lower-cased first and any
+ # symbol missing from _comp becomes 'n'.
+ @staticmethod
+ def _encode(seq,position=slice(None, None, -1)):
+ cseq = [DNAComplementEncoder._comp.get(x.lower(),'n') for x in seq[position]]
+ return ''.join(cseq)
+
+ @staticmethod
+ def _check(seq):
+ assert seq.isNucleotide()
+
+ # Maps a position expressed on the complemented strand to a position on
+ # the original sequence.
+ @staticmethod
+ def _convertpos(position):
+ if isinstance(position, int):
+ return -(position+1)
+ elif isinstance(position, slice):
+ # NOTE(review): raises TypeError when start, stop or step is None.
+ # NOTE(review): the result runs from -(stop+1) to -(start+1) with a
+ # negated step, which looks empty for an ordinary forward slice --
+ # confirm the intended bounds with the callers.
+ return slice(-(position.stop+1),
+ -(position.start+1),
+ -position.step)
+ elif isinstance(position, location.Location):
+ return location.ComplementLocation(position).simplify()
+
+ raise TypeError,"position must be an int, slice or Location instance"
+
+ # Returns its argument unchanged.
+ @staticmethod
+ def complement(seq):
+ return seq
+
+# Encoder exposing the fragment [begin,end) of a sequence.
+class SeqFragmentEncoder(SequenceEncoder):
+ def __init__(self,begin,end):
+ assert begin < end and begin >=0
+ self._limits = slice(begin,end)
+ self._info = {'cut' : [begin,end,1]}
+ # NOTE(review): slice(begin,end) selects end-begin symbols, but the
+ # length recorded here is end-begin+1 -- confirm whether `end` is meant
+ # to be inclusive.
+ self._len = end - begin + 1
+
+ def _check(self,seq):
+ lseq = len(seq)
+ assert self._limits.stop <= lseq
+
+ # `position` is accepted but not used: the whole fragment is returned.
+ def _encode(self,seq,position=None):
+ return str(seq)[self._limits]
+
+ # Maps a position expressed on the fragment to a position on the
+ # original sequence.
+ def _convertpos(self,position):
+ if isinstance(position, int):
+ if position < -self._len or position >= self._len:
+ raise IndexError,position
+ if position >=0:
+ return self._limits.start + position
+ else:
+ return self._limits.stop + position + 1
+ elif isinstance(position, slice):
+ # NOTE(review): this branch and the Location branch are identical to
+ # DNAComplementEncoder._convertpos and ignore self._limits --
+ # presumably a copy/paste; confirm before relying on them.
+ return slice(-(position.stop+1),
+ -(position.start+1),
+ -position.step)
+ elif isinstance(position, location.Location):
+ return location.ComplementLocation(position).simplify()
+
+ raise TypeError,"position must be an int, slice or Location instance"
+
+
+
\ No newline at end of file
diff --git a/src/obitools/solexa/__init__.py b/src/obitools/solexa/__init__.py
new file mode 100644
index 0000000..60e35f8
--- /dev/null
+++ b/src/obitools/solexa/__init__.py
@@ -0,0 +1,45 @@
+from obitools import utils
+from obitools import NucSequence
+from obitools.dnahash import hashCodeIterator
+
+
+class SolexaSequence(NucSequence):
+    '''
+    Nucleic sequence read by a Solexa sequencer, with its quality scores.
+    '''
+    def __init__(self,id,seq,definition=None,quality=None,**info):
+        NucSequence.__init__(self, id, seq, definition,**info)
+        self._hash=None
+        self._quality=quality
+
+    def getQuality(self):
+        '''
+        Return the quality scores; a whitespace separated string is
+        converted to a list of C{int} on first access.
+        '''
+        raw = self._quality
+        if isinstance(raw, str):
+            self._quality = [int(score) for score in raw.split()]
+        return self._quality
+
+    def __hash__(self):
+        # Computed once from the sequence text, then cached.
+        if self._hash is not None:
+            return self._hash
+        codes = hashCodeIterator(str(self), len(str(self)), 16, 0)
+        self._hash = codes.next()[1].pop()
+        return self._hash
+
+class SolexaFile(utils.ColumnFile):
+    '''
+    Iterator over a ':' separated Solexa file, yielding one
+    C{SolexaSequence} per line.
+    '''
+    def __init__(self,stream):
+        columnTypes = (str,
+                       int,int,int,int,
+                       str,
+                       str)
+        utils.ColumnFile.__init__(self, stream, ':', True, columnTypes, "#")
+
+    def next(self):
+        data = utils.ColumnFile.next(self)
+        # Columns: machine, channel, tile, x, y, bases, qualities.
+        seqid = '%d_%d_%d_%d'%(data[1],data[2],data[3],data[4])
+        seq = SolexaSequence(seqid, data[5], quality=data[6])
+        for key,value in zip(('machine','channel','tile','pos_x','pos_y'),data):
+            seq[key]=value
+
+        return seq
diff --git a/src/obitools/solexaPairEnd.py b/src/obitools/solexaPairEnd.py
new file mode 100644
index 0000000..5d082ec
--- /dev/null
+++ b/src/obitools/solexaPairEnd.py
@@ -0,0 +1,103 @@
+#!/usr/local/bin/python
+'''
+Created on 30 dec. 2009
+
+@author: coissac
+'''
+
+from obitools.options import getOptionManager
+from obitools.fastq import fastqSolexaIterator, formatFastq
+from obitools.align import QSolexaReverseAssemble
+from obitools.align import QSolexaRightReverseAssemble
+from obitools.tools._solexapairend import buildConsensus
+
+from itertools import chain
+
+def addSolexaPairEndOptions(optionManager):
+    '''
+    Register the C{-r/--reverse-reads} option, stored in
+    C{options.reverse}.
+    '''
+    reverseOption = dict(dest="reverse",
+                         action="store",
+                         type="string",
+                         metavar="<FILENAME>",
+                         default=None,
+                         help="Filename containing reverse solexa reads ")
+    optionManager.add_option('-r','--reverse-reads',**reverseOption)
+
+
+def cutDirectReverse(entries):
+    '''
+    Split reads holding the direct and the reverse read one after the
+    other into C{(direct,reverse)} pairs.
+
+    The read length is half of the most frequent length observed among
+    the first ten entries.
+
+    @param entries: an iterator over the concatenated reads
+    @return: a generator of C{(direct,reverse)} tuples
+
+    @raise AssertionError: if the most frequent length is ambiguous or odd
+    '''
+    # Local import: the module header only imports chain.
+    from itertools import islice
+
+    entries = iter(entries)
+
+    # islice stops quietly on short inputs. Calling entries.next() ten times
+    # raised StopIteration inside this generator when fewer than ten reads
+    # existed, which ended it before anything was yielded.
+    first = list(islice(entries,10))
+    if not first:
+        return
+
+    clen = {}
+    for x in first:
+        clen[len(x)] = clen.get(len(x),0)+1
+    freq = max(clen.values())
+    freq = [k for k in clen if clen[k]==freq]
+    assert len(freq)==1,"To many sequence length"
+    freq = freq[0]
+    assert freq % 2 == 0, ""
+    lread = freq//2
+
+    # Replay the buffered entries, then continue with the rest of the input.
+    for s in chain(first,entries):
+        yield (s[0:lread],s[lread:])
+
+def seqPairs(direct,reverse):
+    '''
+    Pair each direct read with the next read taken from C{reverse}.
+    '''
+    for forwardRead in direct:
+        reverseRead = next(reverse)
+        yield (forwardRead,reverseRead)
+
+def checkAlignOk(ali):
+    '''
+    An alignment is rejected when its first row starts with a gap or its
+    second row ends with one.
+    '''
+    firstRow = ali[0]
+    if firstRow[0]=='-':
+        return False
+    secondRow = ali[1]
+    return not (secondRow[len(secondRow)-1]=='-')
+
+
+
+def buildAlignment(sequences):
+    '''
+    Align every C{(direct,reverse)} pair, first with the left aligner and,
+    when L{checkAlignOk} rejects the result, with the right one.
+
+    Each yielded alignment has its C{direction} attribute set to
+    C{'left'} or C{'right'}.
+    '''
+    leftAligner = QSolexaReverseAssemble()
+    rightAligner = QSolexaRightReverseAssemble()
+
+    for d,r in sequences:
+        leftAligner.seqA=d
+        leftAligner.seqB=r
+        ali=leftAligner()
+        ali.direction='left'
+        if checkAlignOk(ali):
+            yield ali
+            continue
+
+        rightAligner.seqA=d
+        rightAligner.seqB=r
+        ali=rightAligner()
+        ali.direction='right'
+        yield ali
+
+
+
+
+# Script entry point: reads the direct reads given on the command line and
+# prints one consensus read per pair in fastq format.
+if __name__ == '__main__':
+ optionParser = getOptionManager([addSolexaPairEndOptions],
+ entryIterator=fastqSolexaIterator
+ )
+
+ (options, direct) = optionParser()
+
+ # Without -r, each entry is assumed to hold both reads and is cut in two;
+ # with -r, reverse reads come from the second file, in the same order.
+ if options.reverse is None:
+ sequences=cutDirectReverse(direct)
+ else:
+ reverse = fastqSolexaIterator(options.reverse)
+ sequences=seqPairs(direct,reverse)
+
+ for ali in buildAlignment(sequences):
+ consensus = buildConsensus(ali)
+ print formatFastq(consensus)
+
+
+
diff --git a/src/obitools/statistics/__init__.py b/src/obitools/statistics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/statistics/hypergeometric.py b/src/obitools/statistics/hypergeometric.py
new file mode 100644
index 0000000..9a9b812
--- /dev/null
+++ b/src/obitools/statistics/hypergeometric.py
@@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+"""
+ Statistical computation module.
+
+ The `statistics` module provides functions computing the probabilities
+ of the hypergeometric and cumulative hypergeometric distributions,
+ as well as a correction method for
+ multiple testing.
+
+"""
+
+from decimal import *
+
+getcontext().prec = 28
+
+
+def _hyper0(N,n,r):
+    """
+    Internal function computing term 0 of the hypergeometric distribution.
+
+    The computation follows the method described in
+
+    Trong Wu, An accurate computation of the hypergeometric distribution function,
+    ACM Trans. Math. Softw. 19 (1993), no. 1, 33-43.
+
+    Parameters:
+
+    - `N` : size of the population
+    - `n` : number of marked elements
+    - `r` : size of the sample
+
+    Returns a *Decimal*: the probability of drawing 0 marked element
+    among *n* in a population of size *N* when a sample of size *r*
+    is drawn. It is 1 when *n* is 0.
+    """
+
+    #
+    # The numerator is the product over  [N - r + 1 - n ; N - n + 1[
+    # the denominator the product over   [N - r + 1     ; N + 1[
+    #
+    # With X = N - r + 1 and Y = N + 1 :
+    #
+    #    numerator   -> [X - n ; Y - n[
+    #    denominator -> [X     ; Y    [
+    #
+    # The factors shared by both cancel out, leaving n factors on each side:
+    #
+    #    numerator   -> [X - n ; X[
+    #    denominator -> [Y - n ; Y[
+
+    numerateur  = xrange(N - r + 1 - n, N - r + 1)
+    denominateur= xrange(N + 1 - n, N + 1)
+
+    # Starting from 1 handles n == 0 (empty product); reduce() without an
+    # initial value raised TypeError in that case.
+    p = Decimal(1)
+    for i,j in zip(numerateur,denominateur):
+        p *= Decimal(i)/Decimal(j)
+    return p
+
+
+def hypergeometric(x,N,n,r):
+    """
+    Compute term *x* of a hypergeometric distribution.
+
+    The computation follows the method described in
+
+    Trong Wu, An accurate computation of the hypergeometric distribution function,
+    ACM Trans. Math. Softw. 19 (1993), no. 1, 33-43.
+
+    Parameters:
+
+    - `x` : expected number of marked elements
+    - `N` : size of the population
+    - `n` : number of marked elements
+    - `r` : size of the sample
+
+    Returns a *Decimal*: the probability of drawing *x* marked elements
+    among *n* in a population of size *N* when a sample of size *r*
+    is drawn.
+
+    Raises AssertionError if *x* is not in [0, min(n, r)].
+    """
+    # The distribution is symmetric in n and r; keep r as the smaller one.
+    if n < r:
+        n,r = r,n
+    assert x>=0 and x <= r,"x out of limits"
+
+    # Same recurrence as before, p(k) = p(k-1) * (n-k+1)/k * (r-k+1)/(N-n-r+k),
+    # unrolled into a loop: the recursive form needed x stack frames and hit
+    # the interpreter recursion limit for large x.
+    p = _hyper0(N,n,r)
+    for k in xrange(1,x+1):
+        p = p * (n - k + 1)/k * (r - k + 1)/(N-n-r+k)
+    return p
+
+def chypergeometric(xmin,xmax,N,n,r):
+    """
+    Compute the cumulated probability of a hypergeometric distribution
+    between two bounds.
+
+    The computation follows the method described in
+
+    Trong Wu, An accurate computation of the hypergeometric distribution function,
+    ACM Trans. Math. Softw. 19 (1993), no. 1, 33-43.
+
+    Parameters:
+
+    - `xmin` : minimum expected number of marked elements
+    - `xmax` : maximum expected number of marked elements
+    - `N` : size of the population
+    - `n` : number of marked elements
+    - `r` : size of the sample
+
+    Returns the probability of drawing between *xmin* and *xmax* marked
+    elements among *n* in a population of size *N* when a sample of
+    size *r* is drawn.
+    """
+    if n < r:
+        n,r = r,n
+    assert xmin>=0 and xmin <= r and xmax>=0 and xmax <= r and xmin <=xmax,"x out of limits"
+
+    # First term computed directly, the following ones by recurrence.
+    term = hypergeometric(xmin,N,n,r)
+    total = term
+    x = xmin
+    while x < xmax:
+        x += 1
+        term = term * (n - x + 1)/x * (r - x + 1)/(N-n-r+x)
+        total += term
+    return total
+
+def multipleTest(globalPvalue,testList):
+    """
+    Correction for multiple testing.
+
+    Selects, among a set of tests, the largest subset such that the
+    global risk stays below a given p-value.
+
+    Parameters:
+
+    - `globalPvalue` : global risk accepted for the whole set of tests
+    - `testList` : an iterable over a set of tests.
+      Each test is a list or a tuple whose last element
+      is the p-value associated to the test
+
+    Returns a list holding the subset of tests selected from
+    `testList`
+    """
+    ordered = list(testList)
+    ordered.sort(key=lambda test: test[-1])
+    threshold = 1.0-globalPvalue
+    selected = []
+    # Running probability that none of the tests seen so far is a false
+    # positive.
+    noError = 1.0
+    for test in ordered:
+        noError *= 1.0-test[-1]
+        if noError > threshold:
+            selected.append(test)
+    return selected
+
\ No newline at end of file
diff --git a/src/obitools/statistics/noncentralhypergeo.py b/src/obitools/statistics/noncentralhypergeo.py
new file mode 100644
index 0000000..e6a96ce
--- /dev/null
+++ b/src/obitools/statistics/noncentralhypergeo.py
@@ -0,0 +1,208 @@
+from decimal import *
+from math import log
+
+#from obitools.utils import moduleInDevelopment
+
+#moduleInDevelopment(__name__)
+
+# from : http://www.programmish.com/?p=25
+
+# Logarithm of the Decimal `self` in the given base (default 10), computed
+# digit by digit to the current context precision.
+# NOTE(review): the first loop never ends for a value <= 0.
+# NOTE(review): the context precision is not restored if an exception is
+# raised between the two `getcontext().prec` updates.
+def dec_log(self, base=10):
+ cur_prec = getcontext().prec
+ # Two guard digits while computing.
+ getcontext().prec += 2
+ baseDec = Decimal(10)
+ retValue = self
+
+ if isinstance(base, Decimal):
+ baseDec = base
+ elif isinstance(base, float):
+ baseDec = Decimal("%f" % (base))
+ else:
+ baseDec = Decimal(base)
+
+ # Bring the value into [1, base[ ; the number of multiplications or
+ # divisions needed is the integer part of the logarithm.
+ integer_part = Decimal(0)
+ while retValue < 1:
+ integer_part = integer_part - 1
+ retValue = retValue * baseDec
+ while retValue >= baseDec:
+ integer_part = integer_part + 1
+ retValue = retValue / baseDec
+
+ # Each pass raises the remainder to the 10th power; the number of times
+ # it can then be divided by the base is the next decimal digit.
+ retValue = retValue ** 10
+ decimal_frac = Decimal(0)
+ partial_part = Decimal(1)
+ while cur_prec > 0:
+ partial_part = partial_part / Decimal(10)
+ digit = Decimal(0)
+ while retValue >= baseDec:
+ digit += 1
+ retValue = retValue / baseDec
+ decimal_frac = decimal_frac + digit * partial_part
+ retValue = retValue ** 10
+ cur_prec -= 1
+ getcontext().prec -= 2
+
+ return integer_part + decimal_frac
+
+class Interval(object):
+    '''
+    Product of the integers of C{[begin,end]} raised to the power
+    C{facteur}.
+    '''
+    def __init__(self,begin,end,facteur=1):
+        self._begin,self._end = begin,end
+        self._facteur = facteur
+
+    def __str__(self):
+        fields = (self._begin,self._end,self._facteur)
+        return '[%d,%d] ^ %d' % fields
+
+    def __repr__(self):
+        fields = (self._begin,self._end,self._facteur)
+        return 'Interval(%d,%d,%d)' % fields
+
+    def begin(self):
+        '''Opening bound: C{(position, +facteur, True)}.'''
+        return (self._begin,self._facteur,True)
+
+    def end(self):
+        '''Closing bound: C{(position, -facteur, False)}.'''
+        return (self._end,-self._facteur,False)
+
+
+def cmpb(i1,i2):
+    '''
+    Order two bounds by position; at the same position an opening bound
+    (third field C{True}) sorts before a closing one.
+    '''
+    return cmp(i1[0],i2[0]) or cmp(i2[2],i1[2])
+
+# Symbolic product of Interval objects, kept in a simplified form so that
+# factors shared by a numerator and a denominator cancel before any number
+# is computed.
+class Product(object):
+ def __init__(self,i=None):
+ if i is not None:
+ self.prod=[i]
+ else:
+ self.prod=[]
+ self._simplify()
+
+ # Rebuilds self.prod as a list of intervals, each carrying the summed
+ # exponent of the input intervals covering it.
+ def _simplify(self):
+ bornes=[]
+ prod =[]
+
+ if self.prod:
+
+ # One (position, exponent change, is_opening) event per bound.
+ for i in self.prod:
+ bornes.append(i.begin())
+ bornes.append(i.end())
+ bornes.sort(cmpb)
+
+
+ # Merge, in place, consecutive events sharing position and kind by
+ # adding their exponent changes; r counts the events kept.
+ j=0
+ r=len(bornes)
+ for i in xrange(1,len(bornes)):
+ if bornes[i][0]==bornes[j][0] and bornes[i][2]==bornes[j][2]:
+ bornes[j]=(bornes[j][0],bornes[j][1]+bornes[i][1],bornes[i][2])
+ r-=1
+ else:
+ j+=1
+ bornes[j]=bornes[i]
+
+ bornes=bornes[0:r]
+
+ facteur=0
+ close=1
+
+ # Sweep over the events; `facteur` is the exponent currently in force.
+ # NOTE(review): `open` shadows the builtin, and `debut` is read before
+ # any visible assignment unless the first event always has facteur==0
+ # -- the original nesting is lost here, confirm against upstream.
+ for b,level,open in bornes:
+ if not open:
+ close=0
+ else:
+ close=1
+ if facteur:
+ prod.append(Interval(debut,b-close,facteur))
+ debut=b+1-close
+ facteur+=level
+
+ self.prod=prod
+
+
+
+
+ def __mul__(self,p):
+ res = Product()
+ res.prod=list(self.prod)
+ res.prod.extend(p.prod)
+ res._simplify()
+ return res
+
+ # Division multiplies by the operand with every exponent negated.
+ def __div__(self,p):
+ np = Product()
+ np.prod = [Interval(x._begin,x._end,-x._facteur) for x in p.prod]
+ return self * np
+
+ def __str__(self):
+ return str(self.prod)
+
+ # Base 10 logarithm of the product, as a Decimal.
+ def log(self):
+ p=Decimal(0)
+ for k in self.prod:
+ p+= Decimal(k._facteur) * reduce(lambda x,y:x+dec_log(Decimal(y),Decimal(10)),xrange(k._begin,k._end+1),Decimal(0))
+ return p
+
+ # Value of the product, as a Decimal.
+ def product(self):
+ p=Decimal(1)
+ for k in self.prod:
+ p*= reduce(lambda x,y:x*Decimal(y),xrange(k._begin,k._end+1),Decimal(1)) ** Decimal(k._facteur)
+ return p
+
+ # Calling the object returns its logarithm by default, its value when
+ # log is false.
+ def __call__(self,log=True):
+ if log:
+ return self.log()
+ else:
+ return self.product()
+
+
+def fact(n):
+    '''
+    Factorial of C{n}, as the symbolic product of the integers 1 to n.
+    '''
+    oneToN = Interval(1,n)
+    return Product(oneToN)
+
+def cnp(n,p):
+    '''
+    Number of combinations of C{p} elements among C{n}, as a symbolic
+    product: n! / p! / (n-p)!
+    '''
+    arrangements = fact(n)/fact(p)
+    return arrangements/fact(n-p)
+
+def hypergeometic(x,n,M,N):
+    '''
+    Hypergeometric probability, returned as a symbolic product.
+
+    @param x: random variable (number of winning balls drawn)
+    @type x: int
+    @param n: size of the draw
+    @type n: int
+    @param M: number of winning balls
+    @type M: int
+    @param N: total number of balls in the urn
+    @type N: int
+
+    p(x)= cnp(M,x) * cnp(N-M,n-x) / cnp(N,n)
+    '''
+    winning = cnp(M,x)
+    losing = cnp(N-M,n-x)
+    return winning * losing / cnp(N,n)
+
+def nchypergeometique(x,n,M,N,r):
+ '''
+ Non-central hypergeometric probability with odds ratio C{r}.
+
+ @param x: random variable (number of winning balls drawn)
+ @type x: int
+ @param n: size of the draw
+ @type n: int
+ @param M: number of winning balls
+ @type M: int
+ @param N: total number of balls in the urn
+ @type N: int
+ @param r: odds ratio
+ @type r: float
+
+ The value returned is the inverse of the sum, for y between
+ max(0,n-N+M) and min(n,M), of
+ cnp(M,y) * cnp(N-M,n-y) / (cnp(M,x) * cnp(N-M,n-x)) * 10**((y-x)*log10(r))
+ '''
+
+ # Support of the distribution.
+ xmin = max(0,n-N+M)
+ xmax = min(n,M)
+ # NOTE(review): dec_log mixes its argument with Decimal values, so `r`
+ # presumably has to be a Decimal (or int) rather than a float -- confirm.
+ lr = dec_log(r)
+ xlr = x * lr
+ num = cnp(M,x) * cnp(N-M,n-x)
+ # Ratio of every term of the support to the term for x, kept symbolic ...
+ den = [cnp(M,y) * cnp(N-M,n-y) / num for y in xrange(xmin,xmax+1)]
+ # ... and the matching power of ten of the odds ratio factor.
+ fden = [lr * y - xlr for y in xrange(xmin,xmax+1)]
+
+ inverse=reduce(lambda x,y : x+y,
+ map(lambda i,j: i(False) * 10**j ,den,fden))
+ return 1/inverse
+
+
+
\ No newline at end of file
diff --git a/src/obitools/svg.py b/src/obitools/svg.py
new file mode 100644
index 0000000..c42e3ef
--- /dev/null
+++ b/src/obitools/svg.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+"""\
+SVG.py - Construct/display SVG scenes.
+
+The following code is a lightweight wrapper around SVG files. The metaphor
+is to construct a scene, add objects to it, and then write it to a file
+to display it.
+
+This program uses ImageMagick to display the SVG files. ImageMagick also
+does a remarkable job of converting SVG files into other formats.
+"""
+
+import os
+display_prog = 'display' # Command to execute to display images.
+
+class Scene:
+    """A collection of drawable items that can be written out as SVG.
+
+    Items are any objects with a strarray() method returning a list of
+    text lines.
+    """
+    def __init__(self,name="svg",height=400,width=400):
+        self.name = name
+        self.items = []
+        self.height = height
+        self.width = width
+
+    def add(self,item):
+        """Append an item to the scene; items are drawn in insertion order."""
+        self.items.append(item)
+
+    def strarray(self):
+        """Return the SVG document as a list of text lines."""
+        var = ["<?xml version=\"1.0\"?>\n",
+               "<svg height=\"%d\" width=\"%d\" >\n" % (self.height,self.width),
+               " <g style=\"fill-opacity:1.0; stroke:black;\n",
+               " stroke-width:1;\">\n"]
+        for item in self.items:
+            var += item.strarray()
+        var += [" </g>\n</svg>\n"]
+        return var
+
+    def write_svg(self,filename=None):
+        """Write the scene to `filename` (default: <name>.svg).
+
+        The file name used is remembered in self.svgname for display().
+        """
+        if filename:
+            self.svgname = filename
+        else:
+            self.svgname = self.name + ".svg"
+        # `with` closes the file even when writelines() raises; the former
+        # open()/close() pair leaked the handle in that case.
+        with open(self.svgname,'w') as svgfile:
+            svgfile.writelines(self.strarray())
+
+    def display(self,prog=display_prog):
+        """Open the last written file with an external viewer.
+
+        write_svg() must have been called first.
+        """
+        # NOTE(review): the command line goes through the shell unquoted;
+        # do not pass file names coming from untrusted input.
+        os.system("%s %s" % (prog,self.svgname))
+
+
+class Line:
+    """A straight segment between two (x, y) points."""
+    def __init__(self,start,end):
+        self.start,self.end = start,end
+
+    def strarray(self):
+        x1,y1 = self.start[0],self.start[1]
+        x2,y2 = self.end[0],self.end[1]
+        return [" <line x1=\"%d\" y1=\"%d\" x2=\"%d\" y2=\"%d\" />\n" % (x1,y1,x2,y2)]
+
+
+class Circle:
+    """A filled circle: (x, y) center, scalar radius, rgb fill color."""
+    def __init__(self,center,radius,color):
+        self.center = center
+        self.radius = radius
+        self.color = color   # rgb tuple, each component in range(0,256)
+
+    def strarray(self):
+        geometry = " <circle cx=\"%d\" cy=\"%d\" r=\"%d\"\n" % (self.center[0],self.center[1],self.radius)
+        style = " style=\"fill:%s;\" />\n" % colorstr(self.color)
+        return [geometry,style]
+
+class Rectangle:
+    """A filled rectangle: (x, y) origin, height, width, rgb fill color."""
+    def __init__(self,origin,height,width,color):
+        self.origin = origin
+        self.height,self.width = height,width
+        self.color = color
+
+    def strarray(self):
+        geometry = " <rect x=\"%d\" y=\"%d\" height=\"%d\"\n" % (self.origin[0],self.origin[1],self.height)
+        style = " width=\"%d\" style=\"fill:%s;\" />\n" % (self.width,colorstr(self.color))
+        return [geometry,style]
+
+class Text:
+    """A text label anchored at an (x, y) origin, with a font size."""
+    def __init__(self,origin,text,size=24):
+        self.origin = origin
+        self.text = text
+        self.size = size
+
+    def strarray(self):
+        opening = " <text x=\"%d\" y=\"%d\" font-size=\"%d\">\n" % (self.origin[0],self.origin[1],self.size)
+        content = " %s\n" % self.text
+        return [opening,content," </text>\n"]
+
+
+def colorstr(rgb):
+    """Return the 3-digit hexadecimal SVG color (#rgb) of an rgb tuple.
+
+    Each component is expected in range(0,256) and is reduced to one
+    hexadecimal digit.
+    """
+    # Floor division: `/` yields a float under true division, which the
+    # %x format rejects.
+    return "#%x%x%x" % (rgb[0]//16,rgb[1]//16,rgb[2]//16)
+
+# Demo: draws a square, four lines leaving its center and five circles,
+# writes the scene to test.svg and opens it with the external viewer.
+def test():
+ scene = Scene('test')
+ scene.add(Rectangle((100,100),200,200,(0,255,255)))
+ scene.add(Line((200,200),(200,300)))
+ scene.add(Line((200,200),(300,200)))
+ scene.add(Line((200,200),(100,200)))
+ scene.add(Line((200,200),(200,100)))
+ scene.add(Circle((200,200),30,(0,0,255)))
+ scene.add(Circle((200,300),30,(0,255,0)))
+ scene.add(Circle((300,200),30,(255,0,0)))
+ scene.add(Circle((100,200),30,(255,255,0)))
+ scene.add(Circle((200,100),30,(255,0,255)))
+ scene.add(Text((50,50),"Testing SVG"))
+ scene.write_svg()
+ scene.display()
+ return
+
+if __name__ == '__main__': test()
diff --git a/src/obitools/table/__init__.py b/src/obitools/table/__init__.py
new file mode 100644
index 0000000..41e00bd
--- /dev/null
+++ b/src/obitools/table/__init__.py
@@ -0,0 +1,633 @@
+'''
+
+'''
+
+from itertools import imap,count,chain
+
+from itertools import imap,count,chain
+
class Table(list):
    """
    Tables are list of rows of the same model.

    The list itself stores the raw row objects (the C{row} attribute
    of the objects built by C{rowFactory}); they are wrapped again by
    C{rowFactory} each time a row is read.
    """
    def __init__(self, headers=None,
                       types=None,
                       colcount=None,
                       rowFactory=None,
                       subrowFactory=None):
        '''

        @param headers: the list of column header.

                        if this parameter is C{None}, C{colcount}
                        parameter must be set.

        @type headers: C{list}, C{tuple} or an iterable object

        @param types: the list of data type associated to each column.

                      If this parameter is specified its length must be
                      equal to the C{headers} length or to C{colcount}.

        @type types: C{list}, C{tuple} or an iterable object

        @param colcount: number of column in the created table.

                         If C{headers} parameter is not C{None} this
                         parameter is ignored

        @type colcount: int

        @param rowFactory: callable C{(table, data)} used to build the
                           rows of the table, C{TableRow} by default.

        @param subrowFactory: row factory exposed to the iterators
                              through the C{subrowFactory} attribute,
                              C{TableRow} by default.
        '''

        assert headers is not None or colcount is not None,\
            'headers or colcount parametter must be not None value'

        if headers is None:
            headers = tuple('Col_%d' % x for x in range(colcount))

        # both attributes are properties: see _setHeaders and _setTypes
        self.headers = headers
        self.types = types
        self.colcount = len(self.headers)

        if rowFactory is None:
            rowFactory = TableRow
        self.rowFactory = rowFactory

        if subrowFactory is None:
            subrowFactory = TableRow
        self.subrowFactory = subrowFactory

        # tables created by slicing this one (they share its row objects)
        self.likedTo = set()

    def isCompatible(self,data):
        '''
        Check that C{data} has the same number of columns and the
        same column types than this table.

        @param data: the table or the row to check
        @type data: C{Table} or C{TableRow}

        @rtype: C{bool}
        '''
        assert isinstance(data,(Table,TableRow))
        # a TableRow has no public colcount attribute, but its length
        # is its number of columns
        if isinstance(data,Table):
            colcount = data.colcount
        else:
            colcount = len(data)
        return (self.colcount == colcount and
                (self.types is data.types or
                 self.types == data.types))

    def __setitem__ (self,key,value):
        '''
        Replace one row, or several rows when C{key} is a slice.

        @param key: position(s) of the row(s) to replace
        @type key: C{int} or C{slice}
        @param value: the new row (C{TableRow} or raw values given to
                      C{rowFactory}) or, for a slice, an iterable of
                      such rows
        @raise TypeError: if C{key} is neither an C{int} nor a C{slice}
        '''

        if isinstance(key,int):
            if not isinstance(value, TableRow):
                value = self.rowFactory(self,value)
            else:
                assert self.isCompatible(value)
            list.__setitem__(self,key,value.row)

        elif isinstance(key,slice):
            # slice.indices returns a (start,stop,step) triplet
            indices = range(*key.indices(len(self)))
            for i,d in zip(indices,value):
                self[i]=d

        else:
            raise TypeError("Key must be an int or slice value")

    def __getitem__(self,key):
        '''
        this function has different behaviors depending
        of the data type of C{key}.

        @param key: description of the table part to return
        @type key: C{int} or C{slice}

        @return: return a TableRow (if key is C{int})
                 or a subpart of the table (if key is C{slice}).
        @raise TypeError: if C{key} is neither an C{int} nor a C{slice}
        '''
        if isinstance(key,int):
            return self.rowFactory(self,
                                   list.__getitem__(self,key))

        if isinstance(key,slice):
            newtable=Table(self.headers,self.types)
            # slice.indices returns a (start,stop,step) triplet
            indices = range(*key.indices(len(self)))
            for i in indices:
                # the row objects are shared with the new table, not copied
                list.append(newtable,list.__getitem__(self,i))
            self.likedTo.add(newtable)
            return newtable

        raise TypeError

    def __getslice__(self,x,y):
        return self.__getitem__(slice(x,y))

    def __iter__(self):
        return TableIterator(self)

    def __hash__(self):
        # identity hash: allows to store tables in the likedTo sets
        return id(self)

    def __add__(self,itable):
        return concatTables(self,itable)

    def _setTypes(self,types):
        if types is not None and not isinstance(types,tuple):
            types = tuple(x for x in types)

        assert types is None or len(types)==len(self._headers)

        self._types = types

        if types is not None:
            # self[i] builds a temporary wrapper: the re-cast row has
            # to be written back in the table to be kept
            for i in range(len(self)):
                row = self[i]
                row.castRow()
                list.__setitem__(self,i,row.row)

    def _getTypes(self):
        return self._types

    types = property(_getTypes,_setTypes)

    def _getHeaders(self):
        return self._headers

    def _setHeaders(self,headers):
        if not isinstance(headers, tuple):
            headers = tuple(x for x in headers)

        # index of each column name
        self._hindex = dict((k,i) for i,k in enumerate(headers))
        self._headers=headers
        self.colcount=len(headers)

    headers=property(_getHeaders,_setHeaders)

    def append(self,value):
        '''
        Add a row at the end of the table.

        @param value: a compatible C{TableRow} or raw values given to
                      C{rowFactory}
        '''
        if not isinstance(value, TableRow):
            value = self.rowFactory(self,value)
        else:
            assert self.isCompatible(value)
        list.append(self,value.row)
+
+
+
class _Row(list):
    '''
    List of the values of one table row. The methods that would
    change its length (C{append}, C{pop}, C{extend}) are disabled.
    '''
    def __init__(self,data,size):
        '''
        @param data: the values of the row, or C{None} for a row
                     filled with C{None}
        @param size: the expected number of values
        @type size: C{int}
        '''
        if data is None:
            content = [None] * size
        else:
            content = data
        list.__init__(self,content)
        assert len(self)==size, \
            "Size of data is not correct (%d instead of %d)" % (len(self),size)

    def append(self,value):
        raise NotImplementedError("Rows cannot change of size")

    def pop(self,key=None):
        raise NotImplementedError("Rows cannot change of size")

    def extend(self,values):
        raise NotImplementedError("Rows cannot change of size")
+
+
+
+
class TableRow(object):
    '''
    View on one row of a table.

    The values are stored in the C{row} attribute (a C{_Row});
    headers, types and column count are read from the C{table}
    the row is attached to.
    '''
    def __init__(self, table,
                       data=None,
                       ):
        '''
        @param table: the object describing the row model; its
                      C{types}, C{headers}, C{_hindex} and C{colcount}
                      attributes are read
        @param data: the values of the row: a C{_Row} (used as is,
                     without copy), a C{dict} indexed by column name,
                     another iterable, or C{None} for an empty row
        '''

        self.table = table

        if isinstance(data,_Row):
            # already a storage row: share it
            self.row=data
        else:
            data = self._castRow(data)
            self.row=_Row(data,self._colcount)

    def getType(self):
        return self.table.types

    def getHeaders(self):
        return self.table.headers

    def getHIndex(self):
        return self.table._hindex

    def getColCount(self):
        return self.table.colcount

    types  = property(getType,None,None,
                      "List of types associated to this row")
    headers= property(getHeaders,None,None,
                      "List of headers associated to this row")

    _hindex= property(getHIndex,None,None)
    _colcount = property(getColCount,None,None)

    def _castValue(t,x):
        '''
        Cast a value to a specified type, with exception of
        C{None} values that are returned without cast.

        @param t: the destination type, C{None} for no cast
        @type t: C{type}
        @param x: the value to cast

        @return: the casted value or C{None}

        '''
        if x is None or t is None:
            return x
        else:
            return t(x)

    _castValue=staticmethod(_castValue)

    def _castRow(self,data):
        '''
        Convert C{data} to a list of values casted to the column types.

        @param data: a C{dict} indexed by column name (missing columns
                     are set to C{None}, unknown names are reported with
                     C{logging.info} and ignored), another iterable of
                     values, or C{None}

        @return: the list of values, or C{None} if C{data} is C{None}
        '''
        if data is None:
            # _Row builds a row filled with None from a None data
            return None

        if not isinstance(data, (list,dict)):
            data=list(data)

        if isinstance(data,dict):
            from logging import info

            lvalue = [None] * self._colcount

            for k,v in data.items():
                try:
                    hindex = self._hindex[k]
                except KeyError:
                    info('%s is not a table column' % k)
                    continue
                if self.types is not None:
                    v = TableRow._castValue(self.types[hindex], v)
                lvalue[hindex]=v

            return lvalue

        assert len(data)==self._colcount, \
            'values has not good length'
        if self.types is not None:
            data=[TableRow._castValue(t, x)
                  for t,x in zip(self.types,data)]

        return data

    def __getitem__(self,key):
        '''
        @param key: position(s) or name of the column to read
        @type key: C{int}, C{slice} or C{str}
        @raise TypeError: for any other type of key
        '''

        if isinstance(key,(int,slice)):
            return self.row[key]

        if isinstance(key,str):
            i = self._hindex[key]
            return self.row[i]

        raise TypeError("Key must be an int, slice or str value")

    def __setitem__(self,key,value):
        '''
        @param key: position(s) or name of the column to set
        @type key: C{int}, C{slice} or C{str}
        @param value: the new value, casted to the column type;
                      an iterable of values if C{key} is a slice
        @raise TypeError: for any other type of key
        '''

        if isinstance(key,str):
            # a column name is an alias of the column position
            key = self._hindex[key]

        if isinstance(key,int):
            if self.types is not None:
                value = TableRow._castValue(self.types[key], value)
            self.row[key]=value

        elif isinstance(key,slice):
            # slice.indices returns a (start,stop,step) triplet
            indices = range(*key.indices(len(self.row)))
            for i,v in zip(indices,value):
                self[i]=v
        else:
            raise TypeError("Key must be an int, slice or str value")

    def __iter__(self):
        '''
        @return: an iterator over the values of the row
        '''
        return iter(self.row)

    def append(self,value):
        raise NotImplementedError("Rows cannot change of size")

    def pop(self,key=None):
        raise NotImplementedError("Rows cannot change of size")

    def extend(self,values):
        raise NotImplementedError("Rows cannot change of size")

    def __len__(self):
        return self._colcount

    def __repr__(self):
        return repr(self.row)

    def __str__(self):
        return str(self.row)

    def castRow(self):
        '''
        Rebuild the C{row} attribute with its values casted to the
        current types of the table.
        '''
        self.row = _Row(self._castRow(self.row),len(self.row))
+
+
class iTableIterator(object):
    '''
    Base class of the iterators over table rows.

    Subclasses have to implement C{next} and the getters of the
    C{headers}, C{types}, C{rowFactory} and C{subrowFactory}
    properties.
    '''

    def _getHeaders(self):
        raise NotImplementedError

    def _getTypes(self):
        raise NotImplementedError

    def _getRowFactory(self):
        raise NotImplementedError

    def _getSubrowFactory(self):
        raise NotImplementedError

    def _getColcount(self):
        # types can be None: headers are the reference for the width
        return len(self.headers)

    def __iter__(self):
        return self

    headers = property(_getHeaders,None,None)
    types = property(_getTypes,None,None)
    rowFactory = property(_getRowFactory,None,None)
    subrowFactory = property(_getSubrowFactory,None,None)
    colcount = property(_getColcount,None,None)

    def columnIndex(self,name):
        '''
        Convert a column designation to a column position in the
        rows returned by this iterator.

        @param name: a column name or a column position (negative
                     positions are counted from the last column)
        @type name: C{str} or C{int}

        @return: the position of the column
        @rtype: C{int}

        @raise IndexError: if the position is out of range
        @raise TypeError: if C{name} is neither a C{str} nor an C{int}
        '''
        if isinstance(name,str):
            return list(self.headers).index(name)
        if isinstance(name,int):
            lh = len(self.headers)
            if 0 <= name < lh:
                return name
            if -lh <= name < 0:
                return lh + name
            raise IndexError
        raise TypeError

    def next(self):
        raise NotImplementedError

    def __next__(self):
        # name of the iterator method used by the python 3 protocol
        return self.next()
+
+
class TableIterator(iTableIterator):
    '''
    Iterator over the rows of a C{Table}.
    '''

    def __init__(self,table):
        '''
        @param table: the table to iterate on
        @type table: C{Table}
        @raise TypeError: if C{table} is not a C{Table}
        '''
        if not isinstance(table,Table):
            raise TypeError

        self._reftable=table
        self._i=0

    def _getHeaders(self):
        return self._reftable.headers

    def _getTypes(self):
        return self._reftable.types

    def _getRowFactory(self):
        return self._reftable.rowFactory

    def _getSubrowFactory(self):
        return self._reftable.subrowFactory

    def columnIndex(self,name):
        '''
        Convert a column designation to a column position.

        @param name: a column name or a column position (negative
                     positions are counted from the last column)
        @type name: C{str} or C{int}

        @return: the position of the column
        @rtype: C{int}

        @raise KeyError: if the name is not a header of the table
        @raise IndexError: if the position is out of range
        @raise TypeError: if C{name} is neither a C{str} nor an C{int}
        '''
        if isinstance(name,str):
            return self._reftable._hindex[name]
        if isinstance(name,int):
            lh = len(self._reftable._headers)
            if 0 <= name < lh:
                return name
            if -lh <= name < 0:
                return lh + name
            raise IndexError
        raise TypeError

    def rewind(self):
        '''
        Restart the iteration from the first row.
        '''
        self._i=0

    def next(self):
        '''
        @return: the next row of the table
        @raise StopIteration: after the last row
        '''
        if self._i < len(self._reftable):
            rep=self._reftable[self._i]
            self._i+=1
            return rep
        raise StopIteration

    headers = property(_getHeaders,None,None)
    types = property(_getTypes,None,None)
    rowFactory = property(_getRowFactory,None,None)
    subrowFactory = property(_getSubrowFactory,None,None)
+
+
class ProjectionIterator(iTableIterator):
    '''
    Iterator returning only some selected columns of the rows
    of another table iterator.
    '''

    def __init__(self,tableiterator,*cols):
        '''
        @param tableiterator: the rows to project
        @type tableiterator: C{Table} or C{iTableIterator}
        @param cols: the columns to keep, in the output order, each
                     one designated as accepted by the C{columnIndex}
                     method of the source iterator
        '''
        source = iter(tableiterator)
        self._reference = source

        assert isinstance(self._reference, iTableIterator)

        self._selected = tuple(source.columnIndex(c) for c in cols)
        self._headers = tuple(source.headers[i] for i in self._selected)

        if source.types is None:
            self._types = None
        else:
            self._types = tuple(source.types[i] for i in self._selected)

    def _getRowFactory(self):
        # projected rows are built by the sub row factory of the source
        return self._reference.subrowFactory

    def _getSubrowFactory(self):
        return self._reference.subrowFactory

    def _getHeaders(self):
        return self._headers

    def _getTypes(self):
        return self._types

    headers = property(_getHeaders,None,None)
    types = property(_getTypes,None,None)
    rowFactory = property(_getRowFactory,None,None)
    subrowFactory = property(_getSubrowFactory,None,None)

    def next(self):
        '''
        @return: a row, built by C{rowFactory}, holding the selected
                 values of the next row of the source iterator
        '''
        full = next(self._reference)
        return self.rowFactory(self,(full[i] for i in self._selected))
+
class SelectionIterator(iTableIterator):
    '''
    Iterator returning only the rows of another table iterator
    satisfying a set of conditions.
    '''
    def __init__(self,tableiterator,**conditions):
        '''
        @param tableiterator: the rows to filter
        @type tableiterator: C{Table} or C{iTableIterator}
        @param conditions: one predicate per tested column; the
                           keyword is the column designation given to
                           the C{columnIndex} method of the source
                           iterator, the predicate is called with the
                           value of this column
        '''
        source = iter(tableiterator)
        self._reference = source

        assert isinstance(self._reference, iTableIterator)

        self._conditions = dict((source.columnIndex(column),predicate)
                                for column,predicate in conditions.items())

    def _checkCondition(self,row):
        '''
        @return: C{True} if C{row} satisfies every condition
        @rtype: C{bool}
        '''
        # every predicate is evaluated before the results are combined
        verdicts = [bool(predicate(row[position]))
                    for position,predicate in self._conditions.items()]
        return all(verdicts)

    def _getRowFactory(self):
        return self._reference.rowFactory

    def _getSubrowFactory(self):
        return self._reference.subrowFactory

    def _getHeaders(self):
        return self._reference.headers

    def _getTypes(self):
        return self._reference.types

    def next(self):
        '''
        @return: the next row of the source iterator satisfying
                 every condition
        '''
        while True:
            row = next(self._reference)
            if self._checkCondition(row):
                return row

    headers = property(_getHeaders,None,None)
    types = property(_getTypes,None,None)
    rowFactory = property(_getRowFactory,None,None)
    subrowFactory = property(_getSubrowFactory,None,None)
+
+
class UnionIterator(iTableIterator):
    '''
    Iterator chaining the rows of several table iterators.

    Headers, types and row factories are those of the first one.
    '''
    def __init__(self,*itables):
        '''
        @param itables: the tables to chain; all of them must have
                        the same number of headers
        @type itables: C{Table} or C{iTableIterator}
        '''
        self._itables = [iter(t) for t in itables]
        first = self._itables[0]
        self._types = first.types
        self._headers = first.headers

        # every check is evaluated before the results are combined
        checks = [isinstance(z,iTableIterator)
                  and len(z.headers)==len(self._headers)
                  for z in self._itables]
        assert all(checks)

        self._iterator = chain(*self._itables)

    def _getRowFactory(self):
        return self._itables[0].rowFactory

    def _getSubrowFactory(self):
        return self._itables[0].subrowFactory

    def _getHeaders(self):
        return self._headers

    def _getTypes(self):
        return self._types

    def next(self):
        '''
        @return: the next row of the chained iterators, wrapped
                 again by C{rowFactory}
        '''
        current = next(self._iterator)
        return self.rowFactory(self,current.row)

    headers = property(_getHeaders,None,None)
    types = property(_getTypes,None,None)
    rowFactory = property(_getRowFactory,None,None)
    subrowFactory = property(_getSubrowFactory,None,None)
+
+
+
def tableFactory(tableiterator):
    '''
    Build a new table from the rows returned by a table iterator.

    @param tableiterator: the source of the rows
    @type tableiterator: C{Table} or C{iTableIterator}

    @return: a new table with the headers, the types and the row
             factories of the iterator
    @rtype: C{Table}
    '''
    tableiterator = iter(tableiterator)
    assert isinstance(tableiterator, iTableIterator)

    # factories are given by keyword: the third positional
    # parameter of Table is colcount, not rowFactory
    newtable = Table(tableiterator.headers,
                     tableiterator.types,
                     rowFactory=tableiterator.rowFactory,
                     subrowFactory=tableiterator.subrowFactory)

    for r in tableiterator:
        newtable.append(r)

    return newtable
+
def projectTable(tableiterator,*cols):
    '''
    Build a new table restricted to some columns.

    @param tableiterator: the source of the rows
    @param cols: the columns to keep, given to C{ProjectionIterator}

    @return: the table built by C{tableFactory}
    '''
    projection = ProjectionIterator(tableiterator,*cols)
    return tableFactory(projection)
+
def subTable(tableiterator,**conditions):
    '''
    Build a new table with the rows satisfying some conditions.

    @param tableiterator: the source of the rows
    @param conditions: the conditions given to C{SelectionIterator}

    @return: the table built by C{tableFactory}
    '''
    selection = SelectionIterator(tableiterator,**conditions)
    return tableFactory(selection)
+
def concatTables(*itables):
    '''
    Concatenate several tables.

    The concatenation is done using the L{UnionIterator<UnionIterator>}

    @type itables: iTableIterator or Table

    @return: a new Table
    @rtype: c{Table}

    @see: L{UnionIterator<UnionIterator>}
    '''
    union = UnionIterator(*itables)
    return tableFactory(union)
+
class TableIteratorAsDict(object):
    '''
    Adapter returning the rows of a table iterator as dictionaries
    indexed by the column headers.
    '''

    def __init__(self,tableiterator):
        '''
        @param tableiterator: the source of the rows
        @type tableiterator: C{Table} or C{iTableIterator}
        '''
        self._reference = iter(tableiterator)

        assert isinstance(self._reference, iTableIterator)

        self._headers = self._reference.headers
        types = self._reference.types
        if types is None:
            self._types = None
        else:
            # types are exposed indexed by column header
            self._types = dict(zip(self._headers,types))

    def __iter__(self):
        return self

    def next(self):
        '''
        @return: the next row as a C{dict} mapping headers to values
        @rtype: C{dict}
        '''
        return dict(zip(self._headers,next(self._reference)))

    def _getHeaders(self):
        return self._headers

    def _getTypes(self):
        return self._types

    headers = property(_getHeaders,None,None)
    types = property(_getTypes,None,None)
+
\ No newline at end of file
diff --git a/src/obitools/table/csv.py b/src/obitools/table/csv.py
new file mode 100644
index 0000000..1d9a73d
--- /dev/null
+++ b/src/obitools/table/csv.py
@@ -0,0 +1,52 @@
+"""
+obitools.table.csv module provides an iterator adapter
+allowing to parse csv (comma separatted value) file
+"""
+
+import re
+
def csvIterator(lineIterator,sep=','):
    '''
    Allows easy parsing of a csv file. This function
    convert an iterator on line over a csv text file
    in an iterator on data list. Each list corresponds
    to all values present in one line.

    Values can be enclosed in double quotes; in a quoted value
    a double quote is written as two double quotes. White spaces
    surrounding a value are ignored. A blank line gives an
    empty list.

    @param lineIterator: iterator on text lines
    @type lineIterator: iterator
    @param sep: string of one letter used as separator
                blank character or " is not allowed as
                separator
    @type sep: string
    @return: an iterator on data list
    @rtype: iterator
    '''
    assert len(sep)==1 and not sep.isspace() and sep!='"'
    # the separator is escaped: characters like | ^ or ] have a
    # special meaning in a regular expression
    esc = re.escape(sep)
    valueMatcher=re.compile(r'\s*((")(([^"]|"")*)"|([^%s]*?))\s*(%s|$)' % (esc,esc))
    def iterator():
        for l in lineIterator:
            yield _csvParse(l,valueMatcher)
    return iterator()


def _csvParse(line,valueMatcher):
    '''
    Split one text line in a list of values.

    @param line: the line to parse
    @type line: string
    @param valueMatcher: the compiled regular expression built by
                         L{csvIterator}
    @return: the values of the line
    @rtype: C{list} of C{str}
    '''
    data=[]
    # each match describes one value: the whole value, the opening
    # quote if any, the quoted content, two unused groups, and the
    # terminator (the separator, or '' for the last value)
    for value,quote,quoted,_,_,terminator in valueMatcher.findall(line):
        if not data and not value and not terminator:
            # blank line. An empty first value followed by a
            # separator is a real value and must be kept
            break
        if quote=='"':
            data.append(quoted.replace('""','"'))
        else:
            data.append(value)
        if terminator=='':
            break
    return data
+
+
+
+
+
\ No newline at end of file
diff --git a/src/obitools/tagmatcher/__init__.py b/src/obitools/tagmatcher/__init__.py
new file mode 100644
index 0000000..880ead0
--- /dev/null
+++ b/src/obitools/tagmatcher/__init__.py
@@ -0,0 +1,35 @@
+from obitools import NucSequence
+from obitools.location import locationGenerator,extractExternalRefs
+
+
+
class TagMatcherSequence(NucSequence):
    '''
    Class used to represent a nucleic sequence mapped
    on a genome by the tagMatcher software.
    '''

    def __init__(self,seq,cd,locs,dm,rm):
        '''
        @param seq: the sequence; it is given as both the first and
                    the second parameter of C{NucSequence.__init__}
        @param cd: stored under the C{conditions} key
        @param locs: stored under the C{locations} key
        @param dm: stored under the C{dm} key
        @param rm: stored under the C{rm} key; the C{tm} key is
                   set to C{dm+rm}
        '''
        NucSequence.__init__(self, seq, seq)
        self['locations']=locs
        self['conditions']=cd
        self._setCounts(dm,rm)

    def _setCounts(self,dm,rm):
        # store both counts and their sum
        self['dm']=dm
        self['rm']=rm
        self['tm']=dm+rm

    def eminEmaxFilter(self,emin=None,emax=None):
        '''
        Keep only the locations with an error count in a given
        range and update the C{dm}, C{rm} and C{tm} counts.

        @param emin: minimum value of the C{error} key of a kept
                     location, C{None} for no lower limit
        @param emax: maximum value of the C{error} key of a kept
                     location, C{None} for no upper limit

        @return: the sequence itself
        '''
        def inRange(location):
            return ((emin is None or location['error'] >=emin)
                    and (emax is None or location['error'] <=emax))

        kept = [location for location in self['locations']
                if inRange(location)]
        self['locations']=kept

        # locations are counted according to their isDirect() answer
        direct = sum(1 for location in kept if location.isDirect())
        self._setCounts(direct,len(kept)-direct)
        return self
diff --git a/src/obitools/tagmatcher/options.py b/src/obitools/tagmatcher/options.py
new file mode 100644
index 0000000..45673ce
--- /dev/null
+++ b/src/obitools/tagmatcher/options.py
@@ -0,0 +1,14 @@
def addTagMatcherErrorOptions(optionManager):
    '''
    Declare the -E/--emax and -e/--emin integer options.

    @param optionManager: the object receiving the options through
                          its C{add_option} method
    '''
    # (flags, destination, default value, help text) of each option
    specs = (
        (('-E','--emax'), "emax", None,
         "keep match with no more than emax errors"),
        (('-e','--emin'), "emin", 0,
         "keep match with at least emin errors"),
    )

    for flags,dest,default,helptext in specs:
        optionManager.add_option(*flags,
                                 action='store',
                                 metavar="<##>",
                                 type="int",dest=dest,
                                 default=default,
                                 help=helptext)
diff --git a/src/obitools/tagmatcher/parser.py b/src/obitools/tagmatcher/parser.py
new file mode 100644
index 0000000..a843e66
--- /dev/null
+++ b/src/obitools/tagmatcher/parser.py
@@ -0,0 +1,89 @@
+import re
+import sys
+
+from obitools import tagmatcher
+from obitools.seqdb import nucEntryIterator
+from obitools.location.feature import Feature
+from obitools.location import locationGenerator
+
# Regular expressions extracting the fields of a tagMatcher entry.
# Each one relies on the two letter code starting the line of the field.

# TG line: the sequence
_seqMatcher = re.compile(r'(?<=TG )[acgtrymkwsbdhvnACGTRYMKWSBDHVN]+')
# CD lines: a name and a count separated by ' : '
_cdMatcher  = re.compile(r'(?<=CD ) *([^:]+?) +: +([0-9]+)')
# LO lines: a sequence, two words and a number between parentheses
_loMatcher  = re.compile(r'(?<=LO ) *([ACGTRYMKWSBDHVN]+) +([^ ]+) +([^ ]+) +\(([0-9]+)\)')
# DM and RM lines: a number
_dmMatcher  = re.compile(r'(?<=DM )[0-9]+')
_rmMatcher  = re.compile(r'(?<=RM )[0-9]+')
+
+
def __tagmatcherparser(text):
    '''
    Extract the fields of the text of one tagMatcher entry.

    @param text: the text of the entry
    @type text: C{str}

    @return: a tuple (sequence, dictionary of the counts of the CD
             lines, list of the location features of the LO lines,
             DM count, RM count)
    @rtype: C{tuple}

    @raise AttributeError: raised when a TG, DM or RM field is not
                           found; the text of the entry is written
                           on the standard error before.
    '''
    try:
        seq = _seqMatcher.search(text).group()
        cd = dict((name,int(count))
                  for name,count in _cdMatcher.findall(text))
        locs = []

        for (match,ac,loc,err) in _loMatcher.findall(text):
            feat = Feature('location', locationGenerator(loc))
            feat['error']=int(err)
            feat['match']=match
            feat['contig']=ac
            locs.append(feat)

        dm = int(_dmMatcher.search(text).group())
        rm = int(_rmMatcher.search(text).group())

    except AttributeError:
        ruler = '======================================================='
        sys.stderr.write('%s\n%s\n%s\n' % (ruler,text,ruler))
        # bare raise: keeps the traceback of the original error
        raise

    return (seq,cd,locs,dm,rm)
+
def tagMatcherParser(text):
    '''
    Build a C{TagMatcherSequence} from the text of one entry.

    @param text: the text of the entry
    @rtype: C{tagmatcher.TagMatcherSequence}
    '''
    seq,cd,locs,dm,rm = __tagmatcherparser(text)
    return tagmatcher.TagMatcherSequence(seq,cd,locs,dm,rm)
+
+
class TagMatcherIterator(object):
    '''
    Iterator over the entries of a tagMatcher file.

    The first entry returned by C{nucEntryIterator} is kept in the
    C{header} attribute; the names found in its 'condition' lines
    are stored in the C{conditions} attribute.
    '''
    _cdheadparser = re.compile('condition [0-9]+ : (.+)')

    def __init__(self,file):
        '''
        @param file: the source given to C{nucEntryIterator}
        '''
        entries = nucEntryIterator(file)
        self._ni = entries
        self.header = next(entries)
        self.conditions = TagMatcherIterator._cdheadparser.findall(self.header)

    def next(self):
        '''
        @return: the next entry parsed by C{tagMatcherParser}
        '''
        entry = next(self._ni)
        return tagMatcherParser(entry)

    def __iter__(self):
        return self
+
def formatTagMatcher(tmseq,reader=None):
    '''
    Format a sequence as the text of a tagMatcher entry.

    @param tmseq: the sequence to format. If a C{TagMatcherIterator}
                  is given its header is returned.
    @type tmseq: C{tagmatcher.TagMatcherSequence}
                 or C{TagMatcherIterator}
    @param reader: if not C{None}, the CD lines are written in the
                   order of its C{conditions} attribute
    @return: the text of the entry, ended by a '//' line
    @rtype: C{str}
    '''
    if isinstance(tmseq, TagMatcherIterator):
        return tmseq.header

    assert isinstance(tmseq,tagmatcher.TagMatcherSequence),'Only TagMatcherSequence can be used'

    lo = '\n'.join(['LO %s %s %s (%d)' % (l['match'],l['contig'],l.locStr(),l['error'])
                    for l in tmseq['locations']])

    # condition names come from the reader when there is one
    if reader is None:
        names = tmseq['conditions']
    else:
        names = reader.conditions
    cd = '\n'.join(['CD %s : %d' % (x,tmseq['conditions'][x])
                    for x in names])

    # empty CD and LO parts are not written
    e = ['TG %s' % str(tmseq)]
    e.extend(part for part in (cd,lo) if part)

    e.append('TM %d' % tmseq['tm'])
    e.append('DM %d' % tmseq['dm'])
    e.append('RM %d' % tmseq['rm'])
    e.append('//')

    return '\n'.join(e)
+
+
+
diff --git a/src/obitools/thermo/__init__.py b/src/obitools/thermo/__init__.py
new file mode 100644
index 0000000..492dbb9
--- /dev/null
+++ b/src/obitools/thermo/__init__.py
@@ -0,0 +1,597 @@
+from math import log
+from array import array
+from copy import deepcopy
+
+# Integer code of every accepted symbol: A, C, G, T (either case) map to
+# 1..4 and the gap character to 0.
+bpencoder={'-':0,
+           'A':1,'a':1,
+           'C':2,'c':2,
+           'G':3,'g':3,
+           'T':4,'t':4,
+           }
+
+# Same symbols mapped to the code of the paired base (A<->T, C<->G), so that
+# bpencoder[b] + rvencoder[b] == 5 for every base b.
+rvencoder={'-':0,
+           'A':4,'a':4,
+           'C':3,'c':3,
+           'G':2,'g':2,
+           'T':1,'t':1,
+           }
+
+# presumably the gas constant in cal/(K.mol) -- TODO confirm units
+R = 1.987
+
+# Values accepted by initParams() for its salt-method argument.
+SALT_METHOD_SANTALUCIA = 1
+SALT_METHOD_OWCZARZY = 2
+
+# Arguments used to build the module-level default parameter set.
+DEF_CONC_PRIMERS = 8.e-7
+DEF_CONC_SEQUENCES = 0.
+DEF_SALT = 0.05
+
+# Values stored in the tables for pairings that must never be selected;
+# forbidden_entropy is overwritten by initParams().
+forbidden_entropy = 0.
+forbidden_enthalpy = 1.e18
+
+
+def _zeroTable(size=6):
+    """Return a new size x size x size x size nested list filled with 0."""
+    return [[[[0. for _l in range(size)]
+              for _k in range(size)]
+             for _j in range(size)]
+            for _i in range(size)]
+
+
+# Pristine 6x6x6x6 templates (indices 0..5); initParams() deep-copies them
+# before filling in the actual values.
+__dH = _zeroTable()
+__dS = _zeroTable()
+
+
+def initParams(c1, c2, kp, sm,nparm=None):
+    """Build a nearest-neighbour thermodynamic parameter set.
+
+    Parameters:
+        c1, c2 -- concentrations of the two strands, stored as 'Ct1'/'Ct2'
+        kp     -- salt concentration, stored as 'kplus' (its log is taken)
+        sm     -- salt method, SALT_METHOD_SANTALUCIA or SALT_METHOD_OWCZARZY
+        nparm  -- optional dictionary to fill in place; a fresh dictionary
+                  is created when it is omitted
+
+    Returns nparm, holding the keys 'Ct1', 'Ct2', 'kplus', 'rlogc', 'kfac',
+    'saltMethod', 'dH' and 'dS'.  'dH' and 'dS' are 6x6x6x6 nested lists
+    indexed [x0][x1][y0][y1] with the codes of bpencoder (0 is a gap,
+    1..4 are A, C, G, T) plus 5 for the '$' terminal marker.
+
+    Side effects: the module globals dH, dS and forbidden_entropy are
+    rebound on every call.
+
+    Raises ValueError (from math.log) when the concentration factor or kp
+    is not strictly positive.
+    """
+    global forbidden_entropy
+    global dH,dS
+
+    # A shared {} default would be the very dictionary bound to defaultParm,
+    # so every later call without nparm would silently rewrite the defaults.
+    if nparm is None:
+        nparm = {}
+
+    dH=deepcopy(__dH)
+    dS=deepcopy(__dS)
+
+    nparm['Ct1'] = c1
+    nparm['Ct2'] = c2
+    nparm['kplus'] = kp
+    maxCT = 1
+
+    if nparm['Ct2'] > nparm['Ct1']:
+        maxCT = 2
+
+    # Divide by 2. (not 2) so that integer concentrations are not floored.
+    if nparm['Ct1'] == nparm['Ct2']:
+        ctFactor = nparm['Ct1']/2.
+    elif maxCT == 1:
+        ctFactor = nparm['Ct1']-nparm['Ct2']/2.
+    else:
+        ctFactor = nparm['Ct2']-nparm['Ct1']/2.
+
+    nparm['rlogc'] = R * log(ctFactor)
+    forbidden_entropy = nparm['rlogc']
+    nparm['kfac'] = 0.368 * log(nparm['kplus'])
+    nparm['saltMethod'] = sm
+
+
+    # Forbidden states.  Each index tuple gets BOTH tables set, which rules
+    # out the dH/dS index mismatch the hand-written pairs contained
+    # (dS[0][5][x][0] was never set).
+    for x in range(1,5):
+        for y in range(1,5):
+            for (i,j,k,l) in (
+                    # Set all X-/Y-, -X/Y- and X-/-Y so, that TM will be VERY small!
+                    (0,x,y,0), (x,0,0,y), (x,0,y,0),
+                    # forbid X-/Y$ and X$/Y- etc., i.e. terminal must not be paired with gap!
+                    (x,5,y,0), (x,0,y,5), (5,x,0,y), (0,x,5,y),
+                    # forbid X$/-Y etc.
+                    (x,5,0,y), (x,0,5,y), (5,x,y,0), (0,x,y,5)):
+                dH[i][j][k][l]=forbidden_enthalpy
+                dS[i][j][k][l]=forbidden_entropy
+
+        for (i,j,k,l) in (
+                # also, forbid x-/-- and --/x-, i.e. no two inner gaps paired
+                (x,0,0,0), (0,0,x,0),
+                # x-/-$
+                (x,0,0,5), (5,0,0,x), (0,5,x,0), (0,x,5,0)):
+            dH[i][j][k][l]=forbidden_enthalpy
+            dS[i][j][k][l]=forbidden_entropy
+
+    # forbid --/--
+    for (i,j,k,l) in ((0,0,0,0), (5,0,0,0), (0,0,5,0), (0,5,5,0)):
+        dH[i][j][k][l]=forbidden_enthalpy
+        dS[i][j][k][l]=forbidden_entropy
+
+    # Interior loops (double Mismatches)
+    iloop_entropy=-0.97
+    iloop_enthalpy=0.0
+
+    for x in range(1,5):
+        for y in range(1,5):
+            for a in range(1,5):
+                for b in range(1,5):
+                    # AT and CG pair, and as A=1, C=2, G=3, T=4 this means
+                    # we have Watson-Crick pairs if (x+a==5) and (y+b)==5.
+                    if not ((x+a==5) or (y+b==5)):
+                        # No watson-crick-pair, i.e. double mismatch!
+                        # set enthalpy/entropy to loop expansion!
+                        dH[x][y][a][b] = iloop_enthalpy
+                        dS[x][y][a][b] = iloop_entropy
+
+
+    # xy/-- and --/xy (Bulge Loops of size > 1)
+    bloop_entropy=-1.3
+    bloop_enthalpy=0.0
+
+    for x in range(1,5):
+        for y in range(1,5):
+            dH[x][y][0][0] = bloop_enthalpy
+            dS[x][y][0][0] = bloop_entropy
+            dH[0][0][x][y] = bloop_enthalpy
+            dS[0][0][x][y] = bloop_entropy
+
+
+    # x-/ya and xa/y- as well as -x/ay and ax/-y
+    # bulge opening and closing parameters with
+    # adjacent matches / mismatches
+    # obulge_mism and cbulge_mism chosen so high to avoid
+    #     AAAAAAAAA
+    #     T--G----T
+    # being better than
+    #     AAAAAAAAA
+    #     TG------T
+    obulge_match_H =-2.66e3
+    obulge_match_S =-14.22
+    cbulge_match_H =-2.66e3
+    cbulge_match_S =-14.22
+    obulge_mism_H = 0.0
+    obulge_mism_S = -6.45
+    cbulge_mism_H = 0.0
+    cbulge_mism_S =-6.45
+
+    for x in range(1,5):
+        for y in range(1,5):
+            for a in range(1,5):
+                if x+y==5: # other base pair matches!
+                    dH[x][0][y][a]=obulge_match_H # bulge opening
+                    dS[x][0][y][a]=obulge_match_S
+                    dH[x][a][y][0]=obulge_match_H
+                    dS[x][a][y][0]=obulge_match_S
+                    dH[0][x][a][y]=cbulge_match_H # bulge closing
+                    dS[0][x][a][y]=cbulge_match_S
+                    dH[a][x][0][y]=cbulge_match_H
+                    dS[a][x][0][y]=cbulge_match_S
+                else:
+                    # mismatch in other base pair!
+                    dH[x][0][y][a]=obulge_mism_H # bulge opening
+                    dS[x][0][y][a]=obulge_mism_S
+                    dH[x][a][y][0]=obulge_mism_H
+                    dS[x][a][y][0]=obulge_mism_S
+                    dH[0][x][a][y]=cbulge_mism_H # bulge closing
+                    dS[0][x][a][y]=cbulge_mism_S
+                    dH[a][x][0][y]=cbulge_mism_H
+                    dS[a][x][0][y]=cbulge_mism_S
+
+
+
+    # Watson-Crick pairs (note that only ten are unique, as obviously
+    # 5'-AG-3'/3'-TC-5' = 5'-CT-3'/3'-GA-5' etc.
+    dH[1][1][4][4]=-7.6e3; dS[1][1][4][4]=-21.3 # AA/TT 04
+    dH[1][2][4][3]=-8.4e3; dS[1][2][4][3]=-22.4 # AC/TG adapted GT/CA
+    dH[1][3][4][2]=-7.8e3; dS[1][3][4][2]=-21.0 # AG/TC adapted CT/GA
+    dH[1][4][4][1]=-7.2e3; dS[1][4][4][1]=-20.4 # AT/TA 04
+    dH[2][1][3][4]=-8.5e3; dS[2][1][3][4]=-22.7 # CA/GT 04
+    dH[2][2][3][3]=-8.0e3; dS[2][2][3][3]=-19.9 # CC/GG adapted GG/CC
+    dH[2][3][3][2]=-10.6e3; dS[2][3][3][2]=-27.2 # CG/GC 04
+    dH[2][4][3][1]=-7.8e3; dS[2][4][3][1]=-21.0 # CT/GA 04
+    dH[3][1][2][4]=-8.2e3; dS[3][1][2][4]=-22.2 # GA/CT 04
+    dH[3][2][2][3]=-9.8e3; dS[3][2][2][3]=-24.4 # GC/CG 04
+    dH[3][3][2][2]=-8.0e3; dS[3][3][2][2]=-19.9 # GG/CC 04
+    dH[3][4][2][1]=-8.4e3; dS[3][4][2][1]=-22.4 # GT/CA 04
+    dH[4][1][1][4]=-7.2e3; dS[4][1][1][4]=-21.3 # TA/AT 04
+    dH[4][2][1][3]=-8.2e3; dS[4][2][1][3]=-22.2 # TC/AG adapted GA/CT
+    dH[4][3][1][2]=-8.5e3; dS[4][3][1][2]=-22.7 # TG/AC adapted CA/GT
+    dH[4][4][1][1]=-7.6e3; dS[4][4][1][1]=-21.3 # TT/AA adapted AA/TT
+
+    # A-C Mismatches (Values for pH 7.0)
+    dH[1][1][2][4]=7.6e3; dS[1][1][2][4]=20.2 # AA/CT
+    dH[1][1][4][2]=2.3e3; dS[1][1][4][2]=4.6 # AA/TC
+    dH[1][2][2][3]=-0.7e3; dS[1][2][2][3]=-3.8 # AC/CG
+    dH[1][2][4][1]=5.3e3; dS[1][2][4][1]=14.6 # AC/TA
+    dH[1][3][2][2]=0.6e3; dS[1][3][2][2]=-0.6 # AG/CC
+    dH[1][4][2][1]=5.3e3; dS[1][4][2][1]=14.6 # AT/CA
+    dH[2][1][1][4]=3.4e3; dS[2][1][1][4]=8.0 # CA/AT
+    dH[2][1][3][2]=1.9e3; dS[2][1][3][2]=3.7 # CA/GC
+    dH[2][2][1][3]=5.2e3; dS[2][2][1][3]=14.2 # CC/AG
+    dH[2][2][3][1]=0.6e3; dS[2][2][3][1]=-0.6 # CC/GA
+    dH[2][3][1][2]=1.9e3; dS[2][3][1][2]=3.7 # CG/AC
+    dH[2][4][1][1]=2.3e3; dS[2][4][1][1]=4.6 # CT/AA
+    dH[3][1][2][2]=5.2e3; dS[3][1][2][2]=14.2 # GA/CC
+    dH[3][2][2][1]=-0.7e3; dS[3][2][2][1]=-3.8 # GC/CA
+    dH[4][1][1][2]=3.4e3; dS[4][1][1][2]=8.0 # TA/AC
+    dH[4][2][1][1]=7.6e3; dS[4][2][1][1]=20.2 # TC/AA
+
+    # C-T Mismatches
+    dH[1][2][4][4]=0.7e3; dS[1][2][4][4]=0.2 # AC/TT
+    dH[1][4][4][2]=-1.2e3; dS[1][4][4][2]=-6.2 # AT/TC
+    dH[2][1][4][4]=1.0e3; dS[2][1][4][4]=0.7 # CA/TT
+    dH[2][2][3][4]=-0.8e3; dS[2][2][3][4]=-4.5 # CC/GT
+    dH[2][2][4][3]=5.2e3; dS[2][2][4][3]=13.5 # CC/TG
+    dH[2][3][4][2]=-1.5e3; dS[2][3][4][2]=-6.1 # CG/TC
+    dH[2][4][3][2]=-1.5e3; dS[2][4][3][2]=-6.1 # CT/GC
+    dH[2][4][4][1]=-1.2e3; dS[2][4][4][1]=-6.2 # CT/TA
+    dH[3][2][2][4]=2.3e3; dS[3][2][2][4]=5.4 # GC/CT
+    dH[3][4][2][2]=5.2e3; dS[3][4][2][2]=13.5 # GT/CC
+    dH[4][1][2][4]=1.2e3; dS[4][1][2][4]=0.7 # TA/CT
+    dH[4][2][2][3]=2.3e3; dS[4][2][2][3]=5.4 # TC/CG
+    dH[4][2][1][4]=1.2e3; dS[4][2][1][4]=0.7 # TC/AT
+    dH[4][3][2][2]=-0.8e3; dS[4][3][2][2]=-4.5 # TG/CC
+    dH[4][4][2][1]=0.7e3; dS[4][4][2][1]=0.2 # TT/CA
+    dH[4][4][1][2]=1.0e3; dS[4][4][1][2]=0.7 # TT/AC
+
+    # G-A Mismatches
+    dH[1][1][3][4]=3.0e3; dS[1][1][3][4]=7.4 # AA/GT
+    dH[1][1][4][3]=-0.6e3; dS[1][1][4][3]=-2.3 # AA/TG
+    dH[1][2][3][3]=0.5e3; dS[1][2][3][3]=3.2 # AC/GG
+    dH[1][3][3][2]=-4.0e3; dS[1][3][3][2]=-13.2 # AG/GC
+    dH[1][3][4][1]=-0.7e3; dS[1][3][4][1]=-2.3 # AG/TA
+    dH[1][4][3][1]=-0.7e3; dS[1][4][3][1]=-2.3 # AT/GA
+    dH[2][1][3][3]=-0.7e3; dS[2][1][3][3]=-2.3 # CA/GG
+    dH[2][3][3][1]=-4.0e3; dS[2][3][3][1]=-13.2 # CG/GA
+    dH[3][1][1][4]=0.7e3; dS[3][1][1][4]=0.7 # GA/AT
+    dH[3][1][2][3]=-0.6e3; dS[3][1][2][3]=-1.0 # GA/CG
+    dH[3][2][1][3]=-0.6e3; dS[3][2][1][3]=-1.0 # GC/AG
+    dH[3][3][1][2]=-0.7e3; dS[3][3][1][2]=-2.3 # GG/AC
+    dH[3][3][2][1]=0.5e3; dS[3][3][2][1]=3.2 # GG/CA
+    dH[3][4][1][1]=-0.6e3; dS[3][4][1][1]=-2.3 # GT/AA
+    dH[4][1][1][3]=0.7e3; dS[4][1][1][3]=0.7 # TA/AG
+    dH[4][3][1][1]=3.0e3; dS[4][3][1][1]=7.4 # TG/AA
+
+    # G-T Mismatches
+    dH[1][3][4][4]=1.0e3; dS[1][3][4][4]=0.9 # AG/TT
+    dH[1][4][4][3]=-2.5e3; dS[1][4][4][3]=-8.3 # AT/TG
+    dH[2][3][3][4]=-4.1e3; dS[2][3][3][4]=-11.7 # CG/GT
+    dH[2][4][3][3]=-2.8e3; dS[2][4][3][3]=-8.0 # CT/GG
+    dH[3][1][4][4]=-1.3e3; dS[3][1][4][4]=-5.3 # GA/TT
+    dH[3][2][4][3]=-4.4e3; dS[3][2][4][3]=-12.3 # GC/TG
+    dH[3][3][2][4]=3.3e3; dS[3][3][2][4]=10.4 # GG/CT
+    dH[3][3][4][2]=-2.8e3; dS[3][3][4][2]=-8.0 # GG/TC
+#    dH[3][3][4][4]=5.8e3; dS[3][3][4][4]=16.3 # GG/TT
+    dH[3][4][2][3]=-4.4e3; dS[3][4][2][3]=-12.3 # GT/CG
+    dH[3][4][4][1]=-2.5e3; dS[3][4][4][1]=-8.3 # GT/TA
+#    dH[3][4][4][3]=4.1e3; dS[3][4][4][3]=9.5 # GT/TG
+    dH[4][1][3][4]=-0.1e3; dS[4][1][3][4]=-1.7 # TA/GT
+    dH[4][2][3][3]=3.3e3; dS[4][2][3][3]=10.4 # TC/GG
+    dH[4][3][1][4]=-0.1e3; dS[4][3][1][4]=-1.7 # TG/AT
+    dH[4][3][3][2]=-4.1e3; dS[4][3][3][2]=-11.7 # TG/GC
+#    dH[4][3][3][4]=-1.4e3; dS[4][3][3][4]=-6.2 # TG/GT
+    dH[4][4][1][3]=-1.3e3; dS[4][4][1][3]=-5.3 # TT/AG
+    dH[4][4][3][1]=1.0e3; dS[4][4][3][1]=0.9 # TT/GA
+#    dH[4][4][3][3]=5.8e3; dS[4][4][3][3]=16.3 # TT/GG
+
+    # A-A Mismatches
+    dH[1][1][1][4]=4.7e3; dS[1][1][1][4]=12.9 # AA/AT
+    dH[1][1][4][1]=1.2e3; dS[1][1][4][1]=1.7 # AA/TA
+    dH[1][2][1][3]=-2.9e3; dS[1][2][1][3]=-9.8 # AC/AG
+    dH[1][3][1][2]=-0.9e3; dS[1][3][1][2]=-4.2 # AG/AC
+    dH[1][4][1][1]=1.2e3; dS[1][4][1][1]=1.7 # AT/AA
+    dH[2][1][3][1]=-0.9e3; dS[2][1][3][1]=-4.2 # CA/GA
+    dH[3][1][2][1]=-2.9e3; dS[3][1][2][1]=-9.8 # GA/CA
+    dH[4][1][1][1]=4.7e3; dS[4][1][1][1]=12.9 # TA/AA
+
+    # C-C Mismatches
+    dH[1][2][4][2]=0.0e3; dS[1][2][4][2]=-4.4 # AC/TC
+    dH[2][1][2][4]=6.1e3; dS[2][1][2][4]=16.4 # CA/CT
+    dH[2][2][2][3]=3.6e3; dS[2][2][2][3]=8.9 # CC/CG
+    dH[2][2][3][2]=-1.5e3; dS[2][2][3][2]=-7.2 # CC/GC
+    dH[2][3][2][2]=-1.5e3; dS[2][3][2][2]=-7.2 # CG/CC
+    dH[2][4][2][1]=0.0e3; dS[2][4][2][1]=-4.4 # CT/CA
+    dH[3][2][2][2]=3.6e3; dS[3][2][2][2]=8.9 # GC/CC
+    dH[4][2][1][2]=6.1e3; dS[4][2][1][2]=16.4 # TC/AC
+
+    # G-G Mismatches
+    dH[1][3][4][3]=-3.1e3; dS[1][3][4][3]=-9.5 # AG/TG
+    dH[2][3][3][3]=-4.9e3; dS[2][3][3][3]=-15.3 # CG/GG
+    dH[3][1][3][4]=1.6e3; dS[3][1][3][4]=3.6 # GA/GT
+    dH[3][2][3][3]=-6.0e3; dS[3][2][3][3]=-15.8 # GC/GG
+    dH[3][3][2][3]=-6.0e3; dS[3][3][2][3]=-15.8 # GG/CG
+    dH[3][3][3][2]=-4.9e3; dS[3][3][3][2]=-15.3 # GG/GC
+    dH[3][4][3][1]=-3.1e3; dS[3][4][3][1]=-9.5 # GT/GA
+    dH[4][3][1][3]=1.6e3; dS[4][3][1][3]=3.6 # TG/AG
+
+    # T-T Mismatches
+    dH[1][4][4][4]=-2.7e3; dS[1][4][4][4]=-10.8 # AT/TT
+    dH[2][4][3][4]=-5.0e3; dS[2][4][3][4]=-15.8 # CT/GT
+    dH[3][4][2][4]=-2.2e3; dS[3][4][2][4]=-8.4 # GT/CT
+    dH[4][1][4][4]=0.2e3; dS[4][1][4][4]=-1.5 # TA/TT
+    dH[4][2][4][3]=-2.2e3; dS[4][2][4][3]=-8.4 # TC/TG
+    dH[4][3][4][2]=-5.0e3; dS[4][3][4][2]=-15.8 # TG/TC
+    dH[4][4][1][4]=0.2e3; dS[4][4][1][4]=-1.5 # TT/AT
+    dH[4][4][4][1]=-2.7e3; dS[4][4][4][1]=-10.8 # TT/TA
+
+    # Dangling Ends
+    dH[5][1][1][4]=-0.7e3; dS[5][1][1][4]=-0.8 # $A/AT
+    dH[5][1][2][4]=4.4e3; dS[5][1][2][4]=14.9 # $A/CT
+    dH[5][1][3][4]=-1.6e3; dS[5][1][3][4]=-3.6 # $A/GT
+    dH[5][1][4][4]=2.9e3; dS[5][1][4][4]=10.4 # $A/TT
+    dH[5][2][1][3]=-2.1e3; dS[5][2][1][3]=-3.9 # $C/AG
+    dH[5][2][2][3]=-0.2e3; dS[5][2][2][3]=-0.1 # $C/CG
+    dH[5][2][3][3]=-3.9e3; dS[5][2][3][3]=-11.2 # $C/GG
+    dH[5][2][4][3]=-4.4e3; dS[5][2][4][3]=-13.1 # $C/TG
+    dH[5][3][1][2]=-5.9e3; dS[5][3][1][2]=-16.5 # $G/AC
+    dH[5][3][2][2]=-2.6e3; dS[5][3][2][2]=-7.4 # $G/CC
+    dH[5][3][3][2]=-3.2e3; dS[5][3][3][2]=-10.4 # $G/GC
+    dH[5][3][4][2]=-5.2e3; dS[5][3][4][2]=-15.0 # $G/TC
+    dH[5][4][1][1]=-0.5e3; dS[5][4][1][1]=-1.1 # $T/AA
+    dH[5][4][2][1]=4.7e3; dS[5][4][2][1]=14.2 # $T/CA
+    dH[5][4][3][1]=-4.1e3; dS[5][4][3][1]=-13.1 # $T/GA
+    dH[5][4][4][1]=-3.8e3; dS[5][4][4][1]=-12.6 # $T/TA
+    dH[1][5][4][1]=-2.9e3; dS[1][5][4][1]=-7.6 # A$/TA
+    dH[1][5][4][2]=-4.1e3; dS[1][5][4][2]=-13.0 # A$/TC
+    dH[1][5][4][3]=-4.2e3; dS[1][5][4][3]=-15.0 # A$/TG
+    dH[1][5][4][4]=-0.2e3; dS[1][5][4][4]=-0.5 # A$/TT
+    dH[1][1][5][4]=0.2e3; dS[1][1][5][4]=2.3 # AA/$T
+    dH[1][1][4][5]=-0.5e3; dS[1][1][4][5]=-1.1 # AA/T$
+    dH[1][2][5][3]=-6.3e3; dS[1][2][5][3]=-17.1 # AC/$G
+    dH[1][2][4][5]=4.7e3; dS[1][2][4][5]=14.2 # AC/T$
+    dH[1][3][5][2]=-3.7e3; dS[1][3][5][2]=-10.0 # AG/$C
+    dH[1][3][4][5]=-4.1e3; dS[1][3][4][5]=-13.1 # AG/T$
+    dH[1][4][5][1]=-2.9e3; dS[1][4][5][1]=-7.6 # AT/$A
+    dH[1][4][4][5]=-3.8e3; dS[1][4][4][5]=-12.6 # AT/T$
+    dH[2][5][3][1]=-3.7e3; dS[2][5][3][1]=-10.0 # C$/GA
+    dH[2][5][3][2]=-4.0e3; dS[2][5][3][2]=-11.9 # C$/GC
+    dH[2][5][3][3]=-3.9e3; dS[2][5][3][3]=-10.9 # C$/GG
+    dH[2][5][3][4]=-4.9e3; dS[2][5][3][4]=-13.8 # C$/GT
+    dH[2][1][5][4]=0.6e3; dS[2][1][5][4]=3.3 # CA/$T
+    dH[2][1][3][5]=-5.9e3; dS[2][1][3][5]=-16.5 # CA/G$
+    dH[2][2][5][3]=-4.4e3; dS[2][2][5][3]=-12.6 # CC/$G
+    dH[2][2][3][5]=-2.6e3; dS[2][2][3][5]=-7.4 # CC/G$
+    dH[2][3][5][2]=-4.0e3; dS[2][3][5][2]=-11.9 # CG/$C
+    dH[2][3][3][5]=-3.2e3; dS[2][3][3][5]=-10.4 # CG/G$
+    dH[2][4][5][1]=-4.1e3; dS[2][4][5][1]=-13.0 # CT/$A
+    dH[2][4][3][5]=-5.2e3; dS[2][4][3][5]=-15.0 # CT/G$
+    dH[3][5][2][1]=-6.3e3; dS[3][5][2][1]=-17.1 # G$/CA
+    dH[3][5][2][2]=-4.4e3; dS[3][5][2][2]=-12.6 # G$/CC
+    dH[3][5][2][3]=-5.1e3; dS[3][5][2][3]=-14.0 # G$/CG
+    dH[3][5][2][4]=-4.0e3; dS[3][5][2][4]=-10.9 # G$/CT
+    dH[3][1][5][4]=-1.1e3; dS[3][1][5][4]=-1.6 # GA/$T
+    dH[3][1][2][5]=-2.1e3; dS[3][1][2][5]=-3.9 # GA/C$
+    dH[3][2][5][3]=-5.1e3; dS[3][2][5][3]=-14.0 # GC/$G
+    dH[3][2][2][5]=-0.2e3; dS[3][2][2][5]=-0.1 # GC/C$
+    dH[3][3][5][2]=-3.9e3; dS[3][3][5][2]=-10.9 # GG/$C
+    dH[3][3][2][5]=-3.9e3; dS[3][3][2][5]=-11.2 # GG/C$
+    dH[3][4][5][1]=-4.2e3; dS[3][4][5][1]=-15.0 # GT/$A
+    dH[3][4][2][5]=-4.4e3; dS[3][4][2][5]=-13.1 # GT/C$
+    dH[4][5][1][1]=0.2e3; dS[4][5][1][1]=2.3 # T$/AA
+    dH[4][5][1][2]=0.6e3; dS[4][5][1][2]=3.3 # T$/AC
+    dH[4][5][1][3]=-1.1e3; dS[4][5][1][3]=-1.6 # T$/AG
+    dH[4][5][1][4]=-6.9e3; dS[4][5][1][4]=-20.0 # T$/AT
+    dH[4][1][5][4]=-6.9e3; dS[4][1][5][4]=-20.0 # TA/$T
+    dH[4][1][1][5]=-0.7e3; dS[4][1][1][5]=-0.7 # TA/A$
+    dH[4][2][5][3]=-4.0e3; dS[4][2][5][3]=-10.9 # TC/$G
+    dH[4][2][1][5]=4.4e3; dS[4][2][1][5]=14.9 # TC/A$
+    dH[4][3][5][2]=-4.9e3; dS[4][3][5][2]=-13.8 # TG/$C
+    dH[4][3][1][5]=-1.6e3; dS[4][3][1][5]=-3.6 # TG/A$
+    dH[4][4][5][1]=-0.2e3; dS[4][4][5][1]=-0.5 # TT/$A
+    dH[4][4][1][5]=2.9e3; dS[4][4][1][5]=10.4 # TT/A$
+
+
+    nparm['dH']=dH
+    nparm['dS']=dS
+
+    return nparm
+
+
+# Parameter set built once at import time from the DEF_* constants with the
+# SantaLucia salt method; it is the default nparm of the functions below.
+defaultParm=initParams(DEF_CONC_PRIMERS,DEF_CONC_SEQUENCES,DEF_SALT, SALT_METHOD_SANTALUCIA)
+
+def seqencoder(seq):
+    """Return the list of ``bpencoder`` codes of the symbols of *seq*.
+
+    Raises KeyError for a symbol that ``bpencoder`` does not know.
+    """
+    codes = []
+    for symbol in seq:
+        codes.append(bpencoder[symbol])
+    return codes
+
+def getInitialEntropy(nparm=defaultParm):
+    """Return -5.9 plus the 'rlogc' term stored in *nparm*."""
+    rlogc = nparm['rlogc']
+    return -5.9+rlogc
+
+def getEnthalpy(x0, x1, y0, y1,nparm=defaultParm):
+    """Return the 'dH' table entry of *nparm* at [x0][x1][y0][y1]."""
+    table = nparm['dH']
+    return table[x0][x1][y0][y1]
+
+def GetEntropy(x0, x1, y0, y1,nparm=defaultParm):
+    """Return the salt-corrected 'dS' entry at [x0][x1][y0][y1].
+
+    With SALT_METHOD_SANTALUCIA, half of nparm['kfac'] is added when x0 is
+    not 5 and x1 is within 1..4, and again when y1 is not 5 and y0 is
+    within 1..4.  With SALT_METHOD_OWCZARZY, the matching 'dH' entry scaled
+    by a function of log(nparm['kplus']) and nparm['gcContent'] is added.
+    """
+    enthalpies=nparm['dH']
+    entropies=nparm['dS']
+    answer = entropies[x0][x1][y0][y1]
+    method = nparm['saltMethod']
+
+    if method == SALT_METHOD_SANTALUCIA:
+        if x0!=5 and 1 <= x1 <= 4:
+            answer += 0.5*nparm['kfac']
+
+        if y1!=5 and 1 <= y0 <= 4:
+            answer += 0.5*nparm['kfac']
+
+    if method == SALT_METHOD_OWCZARZY:
+        logk = log(nparm['kplus'])
+        stepdH = enthalpies[x0][x1][y0][y1]
+        answer += stepdH*((4.29 * nparm['gcContent']-3.95)* 1e-5 * logk + 0.0000094*logk**2)
+
+    return answer
+
+def CalcTM(entropy,enthalpy):
+    """Return enthalpy/entropy, or 0 when no usable value exists.
+
+    0 is returned when *enthalpy* reaches ``forbidden_enthalpy``, when
+    *entropy* is not negative, or when the ratio itself is negative.
+    """
+    if enthalpy >= forbidden_enthalpy:
+        return 0
+
+    if not entropy < 0:
+        return 0
+
+    tm = enthalpy/entropy
+    if tm < 0:
+        return 0
+
+    return tm
+
+
+
+
+def countGCContent(seq):
+    """Count the items of *seq* that are found in the string 'cgGC'."""
+    return sum(1 for k in seq if k in 'cgGC')
+
+
+#def cleanSeq (inseq,outseq,length):
+#
+# seqlen = len(inseq)
+# if (len != 0)
+# seqlen = length;
+#
+# j=0
+# for i in xrange(seqlen):
+# {
+# switch (inseq[i])
+# {
+# case 'a':
+# case '\0':
+# case 'A':
+# outseq[j++] = 'A'; break;
+# case 'c':
+# case '\1':
+# case 'C':
+# outseq[j++] = 'C'; break;
+# case 'g':
+# case '\2':
+# case 'G':
+# outseq[j++] = 'G'; break;
+# case 't':
+# case '\3':
+# case 'T':
+# outseq[j++] = 'T'; break;
+# }
+# }
+# outseq[j] = '\0';
+#}
+
+def calcSelfTM(seq,nparm=defaultParm):
+    """Return the value calcTMTwoSeq() gives for *seq* paired with itself.
+
+    The original body repeated the loop of calcTMTwoSeq() line for line with
+    both strands read from *seq* (and carried an unused 'dS' lookup), so the
+    computation is delegated instead of duplicated.  The result is
+    CalcTM(...) minus 273.15.
+
+    Raises KeyError for a symbol missing from bpencoder/rvencoder.
+    """
+    return calcTMTwoSeq(seq,seq,nparm)
+
+
+def calcTMTwoSeq(seq1,seq2,nparm=defaultParm):
+    """Accumulate dH and dS over the steps of *seq1* against *seq2*.
+
+    For every pair of adjacent positions of *seq1*, the top strand is
+    encoded with ``bpencoder`` from *seq1* and the bottom strand with
+    ``rvencoder`` from the same positions of *seq2*.  The sums (the entropy
+    starting at -5.9 + nparm['rlogc']) are passed to CalcTM() and 273.15 is
+    subtracted from its result.
+
+    The loop length comes from *seq1* only, so a shorter *seq2* raises
+    IndexError.
+    """
+    enthalpies=nparm['dH']
+    totaldH = 0
+    totaldS = -5.9+nparm['rlogc']
+
+    for left in xrange(len(seq1)-1):
+        right = left+1
+        bottom0 = rvencoder[seq2[left]]
+        bottom1 = rvencoder[seq2[right]]
+        top0 = bpencoder[seq1[left]]
+        top1 = bpencoder[seq1[right]]
+
+        totaldH += enthalpies[top0][top1][bottom0][bottom1]
+        totaldS += GetEntropy(top0, top1, bottom0, bottom1, nparm)
+
+    return CalcTM(totaldS,totaldH)-273.15
+
+
diff --git a/src/obitools/tools/__init__.py b/src/obitools/tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/obitools/tools/_solexapairend.pyx b/src/obitools/tools/_solexapairend.pyx
new file mode 100644
index 0000000..c145df9
--- /dev/null
+++ b/src/obitools/tools/_solexapairend.pyx
@@ -0,0 +1,187 @@
+# cython: profile=False
+
+from cpython cimport array
+
+from obitools.tools.solexapairend import iterOnAligment
+from obitools import NucSequence
+
+
+# Iterator over the columns delivered by iterOnAligment(ali): it returns one
+# (nucleotide, score) pair per consensus position and counts, in the __seq*
+# attributes, how each column was resolved.
+# NOTE(review): indentation was lost in this archived copy; the nesting
+# described in the comments below is inferred from the statements -- confirm
+# against the upstream file.
+cdef class IterOnConsensus:
+
+ cdef object _ali
+ cdef int __seqASingle
+ cdef int __seqBSingle
+ cdef int __seqABMatch
+ cdef int __seqAMismatch
+ cdef int __seqBMismatch
+ cdef int __seqAInsertion
+ cdef int __seqBInsertion
+ cdef int __seqADeletion
+ cdef int __seqBDeletion
+ cdef object __ioa
+ cdef bint __firstSeqB
+
+ # Keep the alignment, zero every counter and open the column iterator.
+ # __firstSeqB starts False and is set to True by __next__ when the symbol
+ # of the second sequence is not a gap.
+ def __cinit__(self,ali):
+ self._ali=ali
+ self.__seqASingle=0
+ self.__seqBSingle=0
+ self.__seqABMatch=0
+ self.__seqAMismatch=0
+ self.__seqBMismatch=0
+ self.__seqAInsertion=0
+ self.__seqBInsertion=0
+ self.__seqADeletion=0
+ self.__seqBDeletion=0
+
+ self.__ioa = iterOnAligment(self._ali)
+ self.__firstSeqB=False
+
+ # Plain accessors for the counters; they back the read-only properties
+ # declared at the end of the class.
+ def get_seqASingle(self):
+ return self.__seqASingle
+
+
+ def get_seqBSingle(self):
+ return self.__seqBSingle
+
+
+ def get_seqABMatch(self):
+ return self.__seqABMatch
+
+
+ def get_seqAMismatch(self):
+ return self.__seqAMismatch
+
+
+ def get_seqBMismatch(self):
+ return self.__seqBMismatch
+
+
+ def get_seqAInsertion(self):
+ return self.__seqAInsertion
+
+
+ def get_seqBInsertion(self):
+ return self.__seqBInsertion
+
+
+ def get_seqADeletion(self):
+ return self.__seqADeletion
+
+
+ def get_seqBDeletion(self):
+ return self.__seqBDeletion
+
+ # Pull columns from the underlying iterator until one produces a symbol to
+ # return.  Columns that only update a deletion counter do not return and
+ # the loop moves on to the next column.  The loop has no exit of its own:
+ # it ends when self.__ioa.next() raises (presumably StopIteration at the
+ # end of the alignment -- confirm).
+ def __next__(self):
+ cdef bytes snuc0
+ cdef bytes snuc1
+ cdef char* nuc0
+ cdef char* nuc1
+ cdef char* dash='-'
+ cdef double score0
+ cdef double score1
+ cdef double h0
+ cdef double h1
+
+ while(1):
+ snuc0,score0,snuc1,score1 = self.__ioa.next()
+ # The bytes objects stay referenced by snuc0/snuc1 while the char*
+ # views are in use.
+ nuc0=snuc0
+ nuc1=snuc1
+ # Same first character in both sequences: when it is not a gap the
+ # column is counted as a match and scored score0*score1.
+ if nuc0[0]==nuc1[0]:
+ if nuc1[0]!=dash[0]:
+ self.__firstSeqB=True
+ self.__seqABMatch+=1
+ self.__seqBSingle=0
+ return (nuc0,score0*score1)
+ else:
+ # Different symbols: each sequence's score is weighted by
+ # (1 - other score/3).
+ h0 = score0 * (1-score1/3)
+ h1 = score1 * (1-score0/3)
+ # When h0 < h1 the symbol of sequence A is considered, otherwise
+ # the symbol of sequence B.
+ if h0 < h1:
+
+ if nuc0[0]!=dash[0]:
+ self.__seqBSingle=0
+ if nuc1[0]==dash[0]:
+ if self.__firstSeqB:
+ self.__seqAInsertion+=1
+ else:
+ self.__seqASingle+=1
+ else:
+ self.__firstSeqB=True
+ self.__seqAMismatch+=1
+ return (nuc0,h0)
+ else:
+ self.__seqADeletion+=1
+ else:
+ if nuc1[0]!=dash[0]:
+ self.__firstSeqB=True
+ if nuc0[0]==dash[0]:
+ self.__seqBInsertion+=1
+ self.__seqBSingle+=1
+ else:
+ self.__seqBMismatch+=1
+ self.__seqBSingle=0
+ return (nuc1,h1)
+ else:
+ self.__seqBSingle=0
+ self.__seqBDeletion+=1
+
+
+ def __iter__(self):
+ return self
+
+ # Read-only properties exposing the counters (no setter, no deleter).
+ seqASingle = property(get_seqASingle, None, None, "direct's docstring")
+ seqBSingle = property(get_seqBSingle, None, None, "reverse's docstring")
+ seqABMatch = property(get_seqABMatch, None, None, "idem's docstring")
+ seqAMismatch = property(get_seqAMismatch, None, None, "mismatchdirect's docstring")
+ seqBMismatch = property(get_seqBMismatch, None, None, "mismatchreverse's docstring")
+ seqAInsertion = property(get_seqAInsertion, None, None, "insertdirect's docstring")
+ seqBInsertion = property(get_seqBInsertion, None, None, "insertreverse's docstring")
+ seqADeletion = property(get_seqADeletion, None, None, "deletedirect's docstring")
+ seqBDeletion = property(get_seqBDeletion, None, None, "deletereverse's docstring")
+
+
+# Build the consensus NucSequence of an alignment: the symbols and scores
+# returned by IterOnConsensus become the sequence and its quality array, and
+# the iterator's counters are stored as tags on the result.
+# NOTE(review): indentation was lost in this archived copy; the nesting of
+# the two hasattr() tests is inferred -- confirm against the upstream file.
+def buildConsensus(ali):
+ # Fixed-size C buffers: at most 999 symbols plus the terminating 0.
+ cdef double quality[1000]
+ cdef char aseq[1000]
+ cdef int i=0
+ cdef int j=0
+ cdef char* cnuc
+ cdef bytes nuc
+ cdef double score
+ cdef bytes sseq
+
+ # Longer alignments are rejected before the buffers are written to.
+ if len(ali[0])>999:
+ raise AssertionError,"To long alignemnt"
+
+ ic=IterOnConsensus(ali)
+
+ # Only the first character of every returned symbol is kept.
+ for nuc,score in ic:
+ cnuc=nuc
+ quality[i]=score
+ aseq[i]=cnuc[0]
+ i+=1
+
+ # Terminate the C string before it is converted to a bytes object.
+ aseq[i]=0
+
+ sseq=aseq
+ # The consensus takes the id of the first aligned sequence plus '_CONS'
+ # and receives the tags returned by its getTags().
+ seq=NucSequence(ali[0].wrapped.id+'_CONS',sseq,**ali[0].wrapped.getTags())
+ seq.quality=array.array('d',[quality[j] for j in range(i)])
+
+ if hasattr(ali, "direction"):
+ seq['direction']=ali.direction
+ if hasattr(ali, "counter"):
+ seq['alignement_id']=ali.counter
+ seq['seq_a_single']=ic.seqASingle
+ seq['seq_b_single']=ic.seqBSingle
+ seq['seq_ab_match']=ic.seqABMatch
+ seq['seq_a_mismatch']=ic.seqAMismatch
+ seq['seq_b_mismatch']=ic.seqBMismatch
+ seq['seq_a_insertion']=ic.seqAInsertion
+ # seqBSingle is subtracted from the B insertion count before storing it.
+ seq['seq_b_insertion']=ic.seqBInsertion-ic.seqBSingle
+ seq['seq_a_deletion']=ic.seqADeletion
+ seq['seq_b_deletion']=ic.seqBDeletion
+ seq['score']=ali.score
+ seq['ali_length']=len(seq)-ic.seqASingle-ic.seqBSingle
+ # score_norm is only set when ali_length is positive (avoids dividing by 0).
+ if seq['ali_length']>0:
+ seq['score_norm']=float(ali.score)/seq['ali_length']
+ seq['mode']='alignment'
+ return seq
diff --git a/src/obitools/tools/solexapairend.py b/src/obitools/tools/solexapairend.py
new file mode 100644
index 0000000..609f533
--- /dev/null
+++ b/src/obitools/tools/solexapairend.py
@@ -0,0 +1,51 @@
+'''
+Created on 17 mai 2010
+
+ at author: coissac
+'''
+
+from obitools.alignment import columnIterator
+
+
+def iterOnAligment(ali):
+    """Yield ``(nuc0, score0, nuc1, score1)`` for every alignment column.
+
+    Columns come from ``columnIterator(ali)``.  For a symbol that is not
+    '-', the score is the quality value at the current position of that
+    sequence: positions of the first sequence are read upwards from 0 in
+    ``ali[0].wrapped.quality``, positions of the second one downwards from
+    ``len(ali[1].wrapped)-1`` in ``ali[1].wrapped.wrapped.quality``.
+
+    For a gap the score depends on where the gap lies:
+      * before the first symbol of the sequence: 0. (first) / 1. (second)
+      * after its last symbol:                   1. (first) / 0. (second)
+      * in between: half the sum of the two neighbouring quality values.
+    """
+    pos0=0
+    pos1=len(ali[1].wrapped)-1
+    started0=ended0=False
+    started1=ended1=False
+
+    for nuc0,nuc1 in columnIterator(ali):
+        if nuc0!='-':
+            started0=True
+            score0 = ali[0].wrapped.quality[pos0]
+            pos0+=1
+            ended0 = pos0==len(ali[0].wrapped)
+        elif not started0:
+            score0 = 0.
+        elif ended0:
+            score0 = 1.
+        else:
+            score0 = ( ali[0].wrapped.quality[pos0-1]
+                      +ali[0].wrapped.quality[pos0]
+                     )/2
+
+        if nuc1!='-':
+            started1=True
+            score1 = ali[1].wrapped.wrapped.quality[pos1]
+            pos1-=1
+            ended1 = pos1<0
+        elif not started1:
+            score1 = 1.
+        elif ended1:
+            score1 = 0.
+        else:
+            score1 = ( ali[1].wrapped.wrapped.quality[pos1]
+                      +ali[1].wrapped.wrapped.quality[pos1+1]
+                     )/2
+
+        yield (nuc0,score0,nuc1,score1)
diff --git a/src/obitools/tree/__init__.py b/src/obitools/tree/__init__.py
new file mode 100644
index 0000000..facb5ff
--- /dev/null
+++ b/src/obitools/tree/__init__.py
@@ -0,0 +1,116 @@
+import re
+
+
class Tree(set):
    '''
    A rooted tree stored as the plain set of its nodes.

    Topology and branch lengths are carried by the nodes themselves
    (C{_parent} and C{_dist} attributes of L{TreeNode}).
    '''

    def registerNode(self,node):
        '''
        Add a node to the tree.

        @param node: the node to add
        @type node: L{TreeNode}
        '''
        assert isinstance(node, TreeNode)
        self.add(node)

    def childNodeIterator(self,node):
        '''
        @return: an iterator over the nodes whose parent is C{node}
        '''
        assert isinstance(node, TreeNode)
        return (x for x in self if x._parent==node)

    def subTreeSize(self,node):
        '''
        @return: the number of nodes of the subtree rooted at C{node},
                 C{node} included
        @rtype: int
        '''
        n=1
        for subnode in self.childNodeIterator(node):
            n+=self.subTreeSize(subnode)
        return n

    def getRoot(self):
        '''
        @return: the only node without parent
        '''
        roots = [x for x in self if x._parent is None]
        assert len(roots)==1,'Tree cannot have several root node'
        return roots[0]

    def ancestorNodeIterator(self,node):
        '''
        Iterate from C{node} (included) up to the root (included).
        '''
        assert isinstance(node, TreeNode)
        while node._parent is not None:
            yield node
            node=node._parent
        yield node

    def terminalNodeIterator(self):
        '''
        @return: an iterator over the nodes flagged as terminal
        '''
        return (x for x in self if x._isterminal)

    def commonAncestor(self,node1,node2):
        '''
        @return: the first node met when climbing from C{node2} that also
                 belongs to the ancestors of C{node1}
        '''
        anc1 = set(x for x in self.ancestorNodeIterator(node1))
        rep = [x for x in self.ancestorNodeIterator(node2)
               if x in anc1]
        assert len(rep)>=1
        return rep[0]

    def getDist(self,node1,node2):
        '''
        @return: the sum of the branch lengths separating C{node1} and
                 C{node2} through their common ancestor
        '''
        ca = self.commonAncestor(node1, node2)
        dist = 0
        while node1 != ca:
            dist+=node1._dist
            node1=node1._parent
        while node2 != ca:
            dist+=node2._dist
            node2=node2._parent
        return dist

    def farestNodes(self):
        '''
        Look for the pair of terminal nodes separated by the largest
        distance.

        @return: a C{(node1,node2,distance)} tuple; both nodes are C{None}
                 when no pair is separated by a positive distance
        '''
        dmax=0
        n1=None
        n2=None
        for node1 in self.terminalNodeIterator():
            for node2 in self.terminalNodeIterator():
                d = self.getDist(node1, node2)
                if d > dmax:
                    dmax = d
                    n1=node1
                    n2=node2
        # Return the recorded best pair, not the last pair visited.
        return n1,n2,dmax

    def setRoot(self,node,dist):
        '''
        Re-root the tree on a new node inserted on the branch located
        above C{node}, at distance C{dist} of C{node}.

        The parent links met on the way from C{node} to the former root
        are reversed, then the former root is removed and its remaining
        children are attached to its new parent.

        @param node: a non-root node of the tree
        @param dist: distance between C{node} and the new root; must be
                     smaller than the length of the branch above C{node}
        '''
        # NOTE(review): parents are assigned directly, not through
        # linkToParent, so the inserted root keeps _isterminal=True.
        # Confirm whether terminalNodeIterator should report it.
        assert node in self
        assert node._parent and node._dist > dist

        newroot = TreeNode(self)
        parent = node._parent
        node._parent = newroot
        compdist = node._dist - dist
        node._dist=dist
        node = parent

        # Walk up to the former root, reversing each parent link; every
        # reversed node inherits the length of the branch it used to hang
        # from its former child.
        while node:
            parent = node._parent
            if parent:
                dist = node._dist

            node._parent = newroot
            node._dist = compdist

            newroot = node
            node = parent

            if node:
                compdist=dist

        # Here newroot is the former root: bypass it, then drop it.
        for child in self.childNodeIterator(newroot):
            child._parent = newroot._parent
            child._dist += newroot._dist

        self.remove(newroot)


class TreeNode(object):
    '''
    A node of a L{Tree}. The node registers itself in the tree it is
    created for.
    '''
    def __init__(self,tree,name=None,dist=None,bootstrap=None,**info):
        '''
        @param tree: the tree owning the node
        @type tree: L{Tree}
        @param name: node label
        @param dist: length of the branch leading to the parent node
        @param bootstrap: support value of the node
        @param info: free extra annotations stored in C{_info}
        '''
        self._parent=None
        self._name=name
        self._dist=dist
        self._bootstrap=bootstrap
        self._info=info
        tree.registerNode(self)
        self._isterminal=True


    def linkToParent(self,parent):
        '''
        Attach the node to C{parent} and flag C{parent} as internal.

        @param parent: the parent node, or C{None} for a root
        '''
        assert isinstance(parent, TreeNode) or parent is None
        self._parent=parent
        if parent is not None:
            parent._isterminal=False
+
+
+
+
diff --git a/src/obitools/tree/dot.py b/src/obitools/tree/dot.py
new file mode 100644
index 0000000..a21c4a1
--- /dev/null
+++ b/src/obitools/tree/dot.py
@@ -0,0 +1,18 @@
+
+from obitools.utils import universalOpen
+from obitools.tree import Tree,TreeNode
+
def nodeWriter(tree,node,nodes):
    '''
    Build the dot edge linking a node to its parent.

    @param tree: the tree owning the node (not used)
    @param node: the node to describe
    @param nodes: dictionary mapping each node to its integer identifier
    @return: the edge description, or an empty string for a parentless node
    @rtype: str
    '''
    if not node._parent:
        return ""
    return '%d -> %d ' % (nodes[node],nodes[node._parent])
+
+
def treeWriter(tree):
    '''
    Describe a tree as a dot (graphviz) directed graph, each edge going
    from a node to its parent.

    @param tree: the tree to write
    @type tree: obitools.tree.Tree
    @return: the dot source code
    @rtype: str
    '''
    # enumerate() replaces the Python 2 only dict(map(None, ..., xrange()))
    nodes=dict((node,rank) for rank,node in enumerate(tree))
    code = "\n".join(nodeWriter(tree,node,nodes) for node in tree)
    return 'digraph tree { node [shape=point]\n%s\n};' % code
\ No newline at end of file
diff --git a/src/obitools/tree/layout.py b/src/obitools/tree/layout.py
new file mode 100644
index 0000000..a39ba77
--- /dev/null
+++ b/src/obitools/tree/layout.py
@@ -0,0 +1,103 @@
+
class NodeLayout(dict):
    '''
    Display attributes of a single tree node, stored as a plain
    attribute name -> value dictionary.
    '''
+
class TreeLayout(dict):
    '''
    Description of a phylogenetic tree layout.

    Maps each node to the L{NodeLayout} holding its display attributes.
    '''
    def addNode(self,node):
        '''
        Register a node with an empty set of attributes.
        '''
        self[node]=NodeLayout()

    def setAttribute(self,node,key,value):
        '''
        Set attribute C{key} of C{node} to C{value}.
        '''
        self[node][key]=value

    def hasAttribute(self,node,key):
        '''
        @return: C{True} if attribute C{key} is defined for C{node}
        '''
        return key in self[node]

    def getAttribute(self,node,key,default=None):
        '''
        @return: the value of attribute C{key} of C{node}, or C{default}
                 when it is not defined
        '''
        return self[node].get(key,default)

    def _resolve(value,node):
        '''
        Evaluate a "constant or callable" argument for a node.
        '''
        if callable(value):
            return value(node)
        return value

    _resolve = staticmethod(_resolve)

    def setNodesColor(self,color,predicate=True):
        '''
        Set the color of the selected nodes.

        @param color: a color, or a function computing the color of a node
        @param predicate: a boolean applied to every node, or a function
                          telling for each node if it must be colored
        '''
        for node in self:
            if self._resolve(predicate,node):
                # store the resolved color, not the color function itself
                self.setAttribute(node, 'color', self._resolve(color,node))

    def setCircular(self,iscircularpredicat):
        '''
        Set the shape of every node to C{'circle'} or C{'square'}.

        @param iscircularpredicat: a boolean applied to every node, or a
                                   function returning true for the nodes
                                   to draw as circles
        '''
        for node in self:
            if self._resolve(iscircularpredicat,node):
                self.setAttribute(node, 'shape', 'circle')
            else:
                self.setAttribute(node, 'shape', 'square')

    def setRadius(self,radius,predicate=True):
        '''
        Set the radius of the selected nodes.

        @param radius: a radius, or a function computing the radius of a
                       node
        @param predicate: a boolean applied to every node, or a function
                          telling for each node if it must be modified
        '''
        for node in self:
            if self._resolve(predicate,node):
                self.setAttribute(node, 'radius', self._resolve(radius,node))
+
def predicatGeneratorIsInfoEqual(info,value):
    '''
    Build a node predicate testing an annotation value.

    @param info: name of the annotation looked up in C{node._info}
    @param value: expected value of the annotation
    @return: a function returning true for the nodes whose C{_info}
             holds C{info} with a value equal to C{value}
    '''
    def isInfoEqual(node):
        annotations = node._info
        if annotations is None:
            return False
        return info in annotations and annotations[info]==value

    return isInfoEqual
+
def isTerminalNode(node):
    '''
    @return: the C{_isterminal} flag of the node
    '''
    terminal = node._isterminal
    return terminal
+
def constantColorGenerator(color):
    '''
    Build a color function returning the same color for every node.

    @param color: the color to return
    @return: a function taking a node and returning C{color}
    '''
    def colorMaker(node):
        return color

    return colorMaker
+
def notPredicatGenerator(predicate):
    '''
    Build the negation of a predicate.

    @param predicate: a one argument function
    @return: a function returning C{not predicate(x)}
    '''
    def notpred(x):
        return not predicate(x)
    return notpred
+
+
+
+
+
\ No newline at end of file
diff --git a/src/obitools/tree/newick.py b/src/obitools/tree/newick.py
new file mode 100644
index 0000000..c69d0d3
--- /dev/null
+++ b/src/obitools/tree/newick.py
@@ -0,0 +1,117 @@
+import re
+import sys
+
+from obitools.utils import universalOpen
+from obitools.tree import Tree,TreeNode
+
def subNodeIterator(data):
    '''
    Split a parenthesized Newick node list on its top level commas.

    @param data: the description of a list of sub nodes, starting with
                 C{'('}; any other string is yielded unchanged
    @type data: str
    @return: an iterator over the descriptions of the sub nodes
    '''
    if data[0]!='(':
        yield data
        return

    depth=0
    begin=1
    for pos in range(1,len(data)):
        char=data[pos]
        if char=='(':
            depth+=1
        elif char==')':
            depth-=1
        if char==',' and not depth:
            yield data[begin:pos]
            begin=pos+1
    # pos is now the index of the last character, which is left out
    yield data[begin:pos]
+
+
# Raw literal: same pattern as before, without invalid string escapes.
_nodeParser=re.compile(r'\s*(?P<subnodes>\(.*\))?(?P<name>[^ :]+)? *(?P<bootstrap>[0-9.]+)?(:(?P<distance>-?[0-9.]+))?')

def nodeParser(data):
    '''
    Parse the description of a single Newick node.

    @param data: the node description
    @type data: str
    @return: a dictionary with the keys C{subnodes} and C{name} (C{str}
             or C{None}), C{bootstrap} and C{distance} (C{float} or
             C{None})
    @rtype: dict
    '''
    parsedNode = _nodeParser.match(data).groupdict(0)

    for field in ('name','subnodes'):
        if not parsedNode[field]:
            parsedNode[field]=None

    for field in ('bootstrap','distance'):
        if not parsedNode[field]:
            parsedNode[field]=None
        else:
            parsedNode[field]=float(parsedNode[field])

    return parsedNode
+
_cleanTreeData=re.compile(r'\s+')

def treeParser(data,tree=None,parent=None):
    '''
    Recursively build a tree from its Newick description.

    @param data: Newick description of the node and of its subtree
    @type data: str
    @param tree: the tree receiving the nodes; created when C{None}
    @param parent: the parent of the parsed node, C{None} for the root
    @return: the tree
    '''
    if tree is None:
        tree = Tree()
    data = _cleanTreeData.sub(' ',data).strip()

    fields = nodeParser(data)
    node = TreeNode(tree,
                    fields['name'],
                    fields['distance'],
                    fields['bootstrap'])
    node.linkToParent(parent)

    children = fields['subnodes']
    if children:
        for chunk in subNodeIterator(children):
            treeParser(chunk,tree,node)
    return tree
+
_treecomment=re.compile(r'\[.*\]')

def treeIterator(file):
    '''
    Iterate over the trees of a Newick file.

    The text matched by the first C{[...]} comment of the file, if any,
    is stored in the C{comment} attribute of every tree.

    @param file: a file name or a file like object
    @return: an iterator over the parsed trees
    '''
    file = universalOpen(file)
    data = file.read()

    comment = _treecomment.findall(data)
    data=_treecomment.sub('',data).strip()

    if comment:
        comment=comment[0]
    else:
        comment=None
    for tree in data.split(';'):
        # The text following the last ';' is empty: parsing it would
        # yield a spurious tree reduced to a single unnamed node.
        if not tree.strip():
            continue
        t = treeParser(tree)
        if comment:
            t.comment=comment
        yield t
+
def nodeWriter(tree,node,deep=0):
    '''
    Write a node and its subtree in Newick format.

    @param tree: the tree owning the node
    @param node: the node to write
    @param deep: nesting level of the node, used to indent the output
    @type deep: int
    @return: the Newick description of the subtree, without final C{';'}
    @rtype: str
    '''
    label = '' if node._name is None else node._name
    length = '' if node._dist is None else ':%6.5f' % node._dist
    support = '' if node._bootstrap is None else ' %d' % int(node._bootstrap)

    indent = ' ' * (deep+1)
    children = [nodeWriter(tree, child, deep+1)
                for child in tree.childNodeIterator(node)]
    block = (',\n' + indent).join(children)
    if block:
        block = '(\n' + indent + block + '\n' + ' ' * deep + ')'

    return '%s%s%s%s' % (block,label,support,length)
+
def treeWriter(tree,startnode=None):
    '''
    Write a tree, or one of its subtrees, in Newick format.

    @param tree: the tree to write
    @param startnode: root of the subtree to write; the root of the tree
                      when C{None}
    @return: the Newick description ended by C{';'}
    @rtype: str
    '''
    root = tree.getRoot() if startnode is None else startnode
    return nodeWriter(tree,root)+';'
diff --git a/src/obitools/tree/svg.py b/src/obitools/tree/svg.py
new file mode 100644
index 0000000..ff51a8c
--- /dev/null
+++ b/src/obitools/tree/svg.py
@@ -0,0 +1,70 @@
+import math
+
+from obitools.svg import Scene,Circle,Line,Rectangle,Text
+from obitools.tree import Tree
+
def displayTreeLayout(layout,width=400,height=400,radius=3,scale=1.0):
    '''
    Convert a tree layout object in an svg file.

    @param layout: the tree layout object
    @type layout: obitools.tree.layout.TreeLayout
    @param width: svg document width
    @type width: int
    @param height: svg document height
    @type height: int
    @param radius: default radius of node in svg unit (default 3)
    @type radius: int
    @param scale: scale factor applied to the svg coordinates (default 1.0)
    @type scale: float

    @return: str containing svg code
    '''
    xs = [layout.getAttribute(n,'x') for n in layout]
    ys = [layout.getAttribute(n,'y') for n in layout]
    xmin, xmax = min(xs), max(xs)
    ymin, ymax = min(ys), max(ys)

    dx = xmax - xmin
    dy = ymax - ymin

    # When every node shares the same x (or y), e.g. for a single node
    # layout, the span is null and all the offsets are 0: any factor
    # suits and the division must be skipped.
    xscale = width * 0.95 / dx * scale if dx else 0.
    yscale = height * 0.95 / dy * scale if dy else 0.

    def X(x):
        return (x - xmin ) * xscale + width * 0.025

    def Y(y):
        return (y - ymin ) * yscale + height * 0.025

    scene = Scene('unrooted', height, width)

    # branches first, so that the nodes are drawn over them
    for n in layout:
        if n._parent is not None:
            parent = n._parent
            xf = layout.getAttribute(n,'x')
            yf = layout.getAttribute(n,'y')
            xp = layout.getAttribute(parent,'x')
            yp = layout.getAttribute(parent,'y')
            scene.add(Line((X(xf),Y(yf)),(X(xp),Y(yp))))

    for n in layout:
        xf = layout.getAttribute(n,'x')
        yf = layout.getAttribute(n,'y')
        cf = layout.getAttribute(n,'color')
        sf = layout.getAttribute(n,'shape')
        rf = layout.getAttribute(n,'radius') if layout.hasAttribute(n,'radius') else radius

        if sf=='circle':
            scene.add(Circle((X(xf),Y(yf)),rf,cf))
        else:
            scene.add(Rectangle((X(xf)-rf,Y(yf)-rf),2*rf,2*rf,cf))

    return ''.join(scene.strarray())
+
+
+
\ No newline at end of file
diff --git a/src/obitools/tree/unrooted.py b/src/obitools/tree/unrooted.py
new file mode 100644
index 0000000..9a9f3e6
--- /dev/null
+++ b/src/obitools/tree/unrooted.py
@@ -0,0 +1,33 @@
+from obitools.tree.layout import TreeLayout
+import math
+
def subtreeLayout(tree,node,layout,start,end,x,y,default):
    '''
    Recursively place a node and its subtree in an unrooted layout.

    The angular sector C{[start,end]} given to the node is shared between
    its children proportionally to the size of their subtrees; each child
    is placed in the middle direction of its own sector, at a distance
    equal to its branch length.

    @param tree: the tree to lay out
    @param node: root of the subtree to place
    @param layout: the layout receiving the node attributes
    @param start: first angle of the sector, in radians
    @param end: last angle of the sector, in radians
    @param x: abscissa of C{node}
    @param y: ordinate of C{node}
    @param default: branch length used when a child has no positive one
    @return: the layout
    '''
    size = tree.subTreeSize(node)
    unit = (end-start)/(size+1)

    layout.addNode(node)
    for attribute,value in (('x',x),
                            ('y',y),
                            ('color',(255,0,0)),
                            ('shape','circle')):
        layout.setAttribute(node,attribute,value)

    for child in tree.childNodeIterator(node):
        weight = tree.subTreeSize(child)
        stop = start + weight * unit
        middle = start + weight * unit /2
        length = child._dist
        if length is None or length <=0:
            length=default
        childx=math.cos(middle) * length + x
        childy=math.sin(middle) * length + y
        subtreeLayout(tree, child, layout, start, stop, childx, childy, default)
        start=stop

    return layout
+
def treeLayout(tree):
    '''
    Compute an unrooted layout of a tree, the root being placed at the
    origin and the whole circle shared between its subtrees.

    Branches without a positive length are drawn with a hundredth of the
    smallest positive length of the tree.

    @param tree: the tree to lay out
    @type tree: obitools.tree.Tree
    @return: the layout
    @rtype: obitools.tree.layout.TreeLayout
    '''
    layout = TreeLayout()
    root = tree.getRoot()
    lengths = [n._dist for n in tree if n._dist is not None and n._dist > 0]
    if lengths:
        default = min(lengths) / 100
    else:
        # No branch carries a positive length: min() would fail on the
        # empty list, so every branch is drawn with a unit length.
        default = 1.0
    return subtreeLayout(tree,root,layout,0,2*math.pi,0,0,default)
+
\ No newline at end of file
diff --git a/src/obitools/unit/__init__.py b/src/obitools/unit/__init__.py
new file mode 100644
index 0000000..d02c812
--- /dev/null
+++ b/src/obitools/unit/__init__.py
@@ -0,0 +1,8 @@
+import unittest
+
+from obitools import tests_group as obitools_tests_group
+
+tests_group=obitools_tests_group
+
+
+
diff --git a/src/obitools/unit/obitools/__init__.py b/src/obitools/unit/obitools/__init__.py
new file mode 100644
index 0000000..3c9fc13
--- /dev/null
+++ b/src/obitools/unit/obitools/__init__.py
@@ -0,0 +1,91 @@
+import unittest
+
+import obitools
+
+from utils import tests_group as utils_tests_group
+
class BioseqTest(unittest.TestCase):
    '''
    Generic checks shared by all the sequence classes.

    Concrete test cases subclass it and define the C{bioseqClass} and
    C{sequenceString} class attributes.
    '''

    sequenceId = 'id1'
    sequenceDefinition = 'sequence definition'
    sequenceQualifier = {'extra':3}

    def setUp(self):
        # The base class has no sequence class to test: skip it instead
        # of failing when a test runner collects it directly.
        if not hasattr(self, 'bioseqClass'):
            self.skipTest('abstract test case')
        self.bioseq = self.bioseqClass(self.sequenceId,
                                       self.sequenceString,
                                       self.sequenceDefinition,
                                       **self.sequenceQualifier)

    def tearDown(self):
        pass

    def testIdAttribute(self):
        '''
        test if id attribute exists
        '''
        self.assertTrue(hasattr(self.bioseq, 'id'), 'id missing attribute')

    def testIdValue(self):
        '''
        test if id attribute value is 'id1'
        '''
        self.assertEqual(self.bioseq.id, 'id1',
                         'identifier is created with good value')

    def testDefinitionAttribute(self):
        '''
        test if definition attribute exists
        '''
        self.assertTrue(hasattr(self.bioseq, 'definition'), 'definition missing attribute')

    def testSequenceIsLowerCase(self):
        '''
        test if sequence is stored as lower case letter
        '''
        self.assertEqual(str(self.bioseq),
                         str(self.bioseq).lower(),
                         "Sequence is not stored as lower case string")

    def testSequenceQualifier(self):
        '''
        test if the extra qualifier is present and its value is three.
        '''
        self.assertEqual(self.bioseq['extra'],
                         3,
                         "Sequence qualifier cannot be successfully retrieve")

    def testCreateSequenceQualifier(self):
        '''
        test if a new qualifier can be stored and read back.
        '''
        self.bioseq['testqualifier']='ok'
        self.assertEqual(self.bioseq['testqualifier'],
                         'ok',
                         "Sequence qualifier cannot be successfully created")
+
+
+
class NucBioseqTest(BioseqTest):
    '''
    Run the generic sequence checks on obitools.NucSequence
    '''

    sequenceString = 'AACGT' * 5
    bioseqClass = obitools.NucSequence
+
+
class AABioseqTest(BioseqTest):
    '''
    Run the generic sequence checks on obitools.AASequence
    '''

    sequenceString = 'MLKCVT' * 5
    bioseqClass = obitools.AASequence
+
+
+
+
+tests_group = utils_tests_group + [NucBioseqTest,AABioseqTest]
\ No newline at end of file
diff --git a/src/obitools/utils/__init__.py b/src/obitools/utils/__init__.py
new file mode 100644
index 0000000..425ceb9
--- /dev/null
+++ b/src/obitools/utils/__init__.py
@@ -0,0 +1,319 @@
+import sys
+
+import time
+import re
+import shelve
+
+from threading import Lock
+from logging import warning
+import urllib2
+
+from obitools.gzip import GzipFile
+from obitools.zipfile import ZipFile
+import os.path
+
+from _utils import FakeFile # @UnresolvedImport
+from _utils import progressBar # @UnresolvedImport
+import zlib
+
+try:
+ from collections import Counter
+except ImportError:
+ from obitools.collections import Counter
+
+
class FileFormatError(Exception):
    '''
    Raised when the content of a file does not follow the expected format.
    '''
+
+
def uncompressFile(fileobj):
    '''
    Iterate over the lines of a gzip compressed stream.

    @param fileobj: an object whose C{read} method returns the compressed
                    data
    @return: an iterator over the uncompressed lines, each one keeping
             its end of line character when it has one
    '''
    d = zlib.decompressobj(16+zlib.MAX_WBITS)
    READ_BLOCK_SIZE = 1024*8

    buf = b""
    while True:
        data = fileobj.read(READ_BLOCK_SIZE)
        if not data: break

        buf = buf + d.decompress(data)
        lines = buf.split(b'\n')
        # the last item is an incomplete line: keep it for the next block
        buf=lines.pop()

        for line in lines:
            yield line+b"\n"

    # Deliver what is still pending: data held by the decompressor and a
    # last line without end of line character.
    buf = buf + d.flush()
    if buf:
        lines = buf.split(b'\n')
        tail = lines.pop()
        for line in lines:
            yield line+b"\n"
        if tail:
            yield tail
+
+
def universalOpen(file,noError=False):
    '''
    Open a file, gzipped or not.

    If file is a C{str} instance, it is considered as a file name or as
    an URL. The C{.gz} and C{.zip} suffixes are tested to open it as a
    compressed file; a zip archive must contain a single file.

    If file is another kind of object, it is assumed that this object
    follows the C{file} interface and it is returned as is.

    @param file: the file to open
    @type file: C{str} or a file like object
    @param noError: when false (default), a failure is reported on the
                    standard error and the program exits; when true the
                    exception is propagated to the caller
    @type noError: bool

    @return: an iterator on text lines.
    '''
    if not isinstance(file,str):
        return file

    try:
        if urllib2.urlparse.urlparse(file)[0]=='':
            rep = open(file)
        else:
            rep = urllib2.urlopen(file,timeout=15)

        if file[-3:] == '.gz':
            rep = uncompressFile(fileobj=rep)
        if file[-4:] == '.zip':
            archive = ZipFile(file=rep)
            members = archive.infolist()
            assert len(members)==1,'Only zipped file containning a single file can be open'
            rep = archive.open(members[0].filename)
    except Exception as e:
        if noError:
            # bare raise keeps the original traceback
            raise
        sys.stderr.write('%s\n' % (e,))
        # a failure must not be reported as a success to the shell
        sys.exit(1)
    return rep
+
def universalTell(file):
    '''
    Return the current position in a file.

    @param file: the file to check
    @type file: a C{file} like instance

    @return: the value returned by the C{tell} method of the file, or
             C{None} if the object has no such method
    @rtype: C{int} or C{None}
    '''
    if not hasattr(file, "tell"):
        return None
    return file.tell()
+
def fileSize(file):
    '''
    Return the size of a file, leaving its current position unchanged.

    @param file: the file to check
    @type file: a C{file} like instance

    @return: the position of the end of the file, or 0 if the object has
             no C{tell} method
    @rtype: C{int}
    '''
    if not hasattr(file, "tell"):
        return 0

    here = file.tell()
    file.seek(0,2)
    size = file.tell()
    file.seek(here,0)
    return size
+
+
def endLessIterator(endedlist):
    '''
    Iterate over a sequence, then repeat its last element forever.

    @param endedlist: an indexable sequence
    @return: a never ending iterator
    '''
    for item in endedlist:
        yield item
    while True:
        yield endedlist[-1]
+
+
def multiLineWrapper(lineiterator):
    '''
    Join the lines continued with a backslash.

    A line whose character preceding the end of line is a backslash is
    merged with the following one, the backslash and the end of line
    being removed.

    @param lineiterator: a stream of text lines
    @type lineiterator: an iterable of C{str}

    @return: a stream of logical lines
    @rtype: an iterator on C{str}

    @raise FileFormatError: if the last line of the stream is a continued
                            line
    '''
    # any iterable is accepted, the continuation lines being read from
    # the same iterator as the loop
    lineiterator = iter(lineiterator)

    for line in lineiterator:
        rep = [line]
        while len(line)>=2 and line[-2]=='\\':
            rep[-1]=rep[-1][0:-2]
            try:
                line = next(lineiterator)
            except StopIteration:
                raise FileFormatError
            rep.append(line)
        yield ''.join(rep)
+
+
def skipWhiteLineIterator(lineiterator):
    '''
    Filter out the blank lines of a stream.

    @param lineiterator: a stream of text lines
    @type lineiterator: an iterable of C{str}

    @return: the lines containing something else than white spaces,
             left unchanged
    @rtype: an iterator on C{str}
    '''
    for line in lineiterator:
        # blank lines are dropped silently: nothing must be written on
        # the standard output from a filter
        if line.strip():
            yield line
+
+
class ColumnFile(object):
    '''
    Iterator over the rows of a column formatted text file.

    Each row is returned as a list of values, or as a dictionary when
    column names are provided.
    '''

    def __init__(self,stream,sep=None,strip=True,
                 types=None,skip=None,head=None,
                 extra=None,
                 extraformat=r'([a-zA-Z]\w*) *= *([^;]+);'):
        '''
        @param stream: the file to read, opened through C{universalOpen}
        @param sep: column separator given to C{str.split}
        @param strip: if true the white spaces surrounding each value are
                      removed
        @param types: conversion functions applied to the columns; the
                      last one is used for all the remaining columns and
                      C{bool} is replaced by L{str2bool}
        @param skip: prefix of the lines to ignore
        @param head: column names; when set, rows are dictionaries
        @param extra: separator starting the extra data located at the
                      end of a line
        @param extraformat: regular expression extracting the
                            C{(key,value)} pairs of the extra data
        '''
        self._stream = universalOpen(stream)
        self._delimiter=sep
        self._strip=strip
        self._extra=extra
        self._extraformat = re.compile(extraformat)

        if types:
            self._types=[ColumnFile.str2bool if t is bool else t
                         for t in types]
        else:
            self._types=None

        self._skip = skip
        if skip is not None:
            self._lskip= len(skip)
        else:
            self._lskip= 0
        self._head=head

    def str2bool(x):
        '''
        Convert a text to a boolean according to its first character.
        '''
        # eval only ever receives a single upper cased character, looked
        # up in the small dictionary below.
        return bool(eval(x.strip()[0].upper(),{'T':True,'V':True,'F':False}))

    str2bool = staticmethod(str2bool)


    def __iter__(self):
        return self

    def next(self):
        '''
        @return: the next row of the file
        @rtype: C{list}, or C{dict} if column names were provided
        '''

        def cast(txt,type):
            # a value that cannot be converted is replaced by None
            try:
                return type(txt)
            except Exception:
                return None

        ligne = next(self._stream)
        if self._skip is not None:
            while ligne[0:self._lskip]==self._skip:
                ligne = next(self._stream)
        if self._extra is not None:
            try:
                (ligne,extra) = ligne.rsplit(self._extra,1)
                extra = dict(self._extraformat.findall(extra))
            except ValueError:
                extra=None
        else:
            extra = None
        data = ligne.split(self._delimiter)
        if self._strip or self._types:
            data = [x.strip() for x in data]
        if self._types:
            # columns beyond the declared types use the last type
            last = len(self._types)-1
            data = [cast(txt,self._types[min(i,last)])
                    for i,txt in enumerate(data)]
        if self._head is not None:
            # the shortest of names and values is completed with None
            names = list(self._head)
            width = max(len(names),len(data))
            names.extend([None] * (width-len(names)))
            data.extend([None] * (width-len(data)))
            data=dict(zip(names,data))
            if extra is not None:
                data['__extra__']=extra
        else:
            if extra is not None:
                data.append(extra)
        return data

    # Python 3 name of the iterator protocol method
    __next__ = next

    def tell(self):
        '''
        @return: the current position in the underlying stream
        '''
        return universalTell(self._stream)
+
+
class CachedDB(object):
    '''
    Read access to a sequence database through a persistent local cache.

    The entries already requested are stored in a C{shelve} file and
    served from it afterwards.
    '''

    def __init__(self,cachefile,masterdb):
        '''
        @param cachefile: name of the shelve file used as cache
        @param masterdb: the database queried for the missing entries
        '''
        self._cache = shelve.open(cachefile,'c')
        self._db = masterdb
        self._lock=Lock()

    def _cacheSeq(self,seq):
        '''
        Store an entry in the cache under its C{id} and return it.
        '''
        # 'with' releases the lock even if the storage fails
        with self._lock:
            self._cache[seq.id]=seq
        return seq

    def __getitem__(self,ac):
        '''
        @param ac: an identifier, or an iterable of identifiers
        @return: the corresponding entry, or an iterator over the
                 entries in the order of the identifiers
        '''
        if isinstance(ac,str):
            with self._lock:
                cached = ac in self._cache
                if cached:
                    data = self._cache[ac]
            if not cached:
                # the lock is not held while the master database is
                # queried; _cacheSeq takes it again to store the entry
                data = self._db[ac]
                self._cacheSeq(data)
            return data

        with self._lock:
            acs = [[x,self._cache.get(x,None)] for x in ac]
        newacs = [x for x,cached in acs if cached is None]
        if newacs:
            newseqs = self._db[newacs]
        else:
            newseqs = iter([])
        for r in acs:
            if r[1] is None:
                r[1]=self._cacheSeq(next(newseqs))
        return (x[1] for x in acs)
+
+
def moduleInDevelopment(name):
    '''
    Log a warning telling that a module is still under development.

    @param name: name of the module
    @type name: str
    '''
    # logging.warning emits the message; Warning(...) only built an
    # exception instance that was immediately discarded.
    warning('This module %s is under development : use it with caution' % name)
+
+
def deprecatedScript(newscript):
    '''
    Write on the standard error a banner telling that the running script
    is deprecated.

    @param newscript: name of the command replacing the script
    @type newscript: str
    '''
    current = sys.argv[0]
    blank = " "
    rule = "#########################################################"
    edge = "# #"
    banner = [blank,
              blank,
              blank,
              rule,
              edge,
              " W A R N I N G :",
              " %s is a deprecated script " % os.path.split(current)[1],
              " it will disappear in the next obitools version",
              blank,
              " The new corresponding command is %s " % newscript,
              edge,
              rule,
              blank,
              blank,
              blank]
    for text in banner:
        sys.stderr.write(text + "\n")
diff --git a/src/obitools/utils/_utils.pxd b/src/obitools/utils/_utils.pxd
new file mode 100644
index 0000000..f86fec0
--- /dev/null
+++ b/src/obitools/utils/_utils.pxd
@@ -0,0 +1,42 @@
# C declarations taken from stdio.h: fprintf and the stderr stream, plus
# the off_t integer type whose C name is given as "unsigned long long".
+cdef extern from "stdio.h":
+ struct FILE
+ int fprintf(FILE *stream, char *format, ...)
+ FILE* stderr
+ ctypedef unsigned int off_t "unsigned long long"
+
+
+
# C declarations taken from time.h: the fields of struct tm that are read,
# the CLOCKS_PER_SEC constant, and the gmtime_r, time and clock functions.
+cdef extern from "time.h":
+ struct tm :
+ int tm_yday
+ int tm_hour
+ int tm_min
+ int tm_sec
+
+ enum: CLOCKS_PER_SEC
+
+ ctypedef int time_t
+ ctypedef int clock_t
+
+
+ tm *gmtime_r(time_t *clock, tm *result)
+ time_t time(time_t *tloc)
+ clock_t clock()
+
# Declaration of the FakeFile extension type.
#   _li       : a Python object
#   __buffer  : a list
#   __bufsize : an int
# read and readline are cpdef methods returning str.
+cdef class FakeFile:
+
+ cdef object _li
+ cdef list __buffer
+ cdef int __bufsize
+
+ cpdef str read(self,int size=?)
+ cpdef str readline(self)
+
# Declaration of the cpdef function progressBar; "=?" marks the arguments
# having a default value, the values themselves being given where the
# function is implemented.
+cpdef object progressBar(object pos,
+ off_t maxi,
+ bint reset=?,
+ bytes head=?,
+ list delta=?,
+ list step=?)
+
+
\ No newline at end of file
diff --git a/src/obitools/utils/_utils.pyx b/src/obitools/utils/_utils.pyx
new file mode 100644
index 0000000..3b662bf
--- /dev/null
+++ b/src/obitools/utils/_utils.pyx
@@ -0,0 +1,160 @@
+# cython: profile=True
+
+from _utils cimport *
+
+import sys
+
cdef class FakeFile:
    '''
    Minimal file like object reading its data from an iterator of
    strings.
    '''

    def __init__(self,li):
        '''
        @param li: the iterator providing the strings
        '''
        self._li = li
        self.__buffer = []
        self.__bufsize=0

    cpdef str read(self,int size=-1):
        '''
        Read at most C{size} characters, or everything when C{size} is
        negative.

        @raise EOFError: if nothing is buffered and the iterator is
                         exhausted
        '''
        cdef int csize=self.__bufsize
        cdef str line
        cdef str buffer

        try:
            # pull strings until enough characters are available
            while size < 0 or csize < size:
                line = next(self._li)
                csize+=len(line)
                self.__buffer.append(line)
        except StopIteration:
            if csize==0:
                raise EOFError

        buffer = ''.join(self.__buffer)

        if size < 0:
            self.__buffer=[]
            self.__bufsize=0
        else:
            # what exceeds the request is kept for the next call
            self.__buffer=[buffer[size:]]
            self.__bufsize=len(self.__buffer[0])
            buffer=buffer[0:size]

        return buffer

    cpdef str readline(self):
        '''
        Return the first buffered string if there is one (the buffer is
        then emptied), otherwise the next string of the iterator.

        @raise EOFError: if the iterator is exhausted
        '''
        cdef str line

        try:
            if not self.__buffer:
                line=next(self._li)
            else:
                line = self.__buffer[0]
                self.__buffer=[]
                self.__bufsize=0
        except StopIteration:
            raise EOFError

        return line
+
# Draw a progress bar with an estimate of the remaining time on stderr.
#
#   pos   : current position, or a callable returning it
#   maxi  : position corresponding to 100 %; values <= 0 are replaced by 1
#   reset : if true the state kept between the calls is cleared first
#   head  : text written before the bar
#   delta : state kept between calls in the default list itself:
#           delta[0] is the clock() value stored when the state was
#           created, delta[1] the one of the last refresh
#   step  : state kept between calls: [freq, cycle, arrow]
#
# NOTE(review): delta and step are mutable default arguments, used here on
# purpose as storage shared by all the calls.
+cpdef object progressBar(object pos,
+ off_t maxi,
+ bint reset=False,
+ bytes head=b'',
+ list delta=[],
+ list step=[1,0,0]):
+
+ cdef off_t ipos
+ cdef double percent
+ cdef int days,hour,minu,sec
+ cdef bytes bar
+ cdef off_t fraction
+ cdef int freq,cycle,arrow
+ cdef tm remain
+
+ cdef clock_t d
+ cdef clock_t elapsed
+ cdef clock_t newtime
+ cdef clock_t more
+
+ # 0123456789
+ cdef char* wheel= '|/-\\'
+ cdef char* spaces=' ' \
+ ' ' \
+ ' ' \
+ ' ' \
+ ' '
+
+ cdef char* diese ='##########' \
+ '##########' \
+ '##########' \
+ '##########' \
+ '##########'
+
+ if reset:
+ del delta[:]
+ step[:]=[1,0,0]
+ if not delta:
+ delta.append(clock())
+ delta.append(clock())
+
+ if ( maxi<=0):
+ maxi=1
+
+ freq,cycle,arrow = step
+
+ cycle+=1
+
# The bar is redrawn only when cycle is a multiple of freq. freq is doubled
# when less than 0.2 s of clock() time separates two refreshes and halved
# when more than 0.4 s does.
+ if cycle % freq == 0:
+ cycle=1
+ newtime = clock()
+ d = newtime-delta[1]
+
+ if d < 0.2 * CLOCKS_PER_SEC :
+ freq*=2
+ elif d > 0.4 * CLOCKS_PER_SEC and freq>1:
+ freq/=2
+
+ delta[1]=newtime
+ elapsed = newtime-delta[0]
+
+ if callable(pos):
+ ipos=pos()
+ else:
+ ipos=pos
+
# NOTE(review): percent is 0 when the position is 0, and the next line
# divides by it; confirm the callers never refresh at position 0.
+ percent = <double>ipos/<double>maxi
+ more = <time_t>((<double>elapsed / percent * (1. - percent))/CLOCKS_PER_SEC)
+ <void>gmtime_r(&more, &remain)
+ days = remain.tm_yday
+ hour = remain.tm_hour
+ minu = remain.tm_min
+ sec = remain.tm_sec
+
# fraction: number of '#' drawn, clamped to 0..50
+ fraction=<int>(percent * 50.)
+ if fraction < 0:
+ fraction=0
+ if fraction > 50:
+ fraction=50
+ arrow=(arrow+1) % 4
+
+ if days:
+ <void>fprintf(stderr,b'\r%s %5.1f %% |%.*s%c%.*s] remain : %d days %02d:%02d:%02d',
+ <char*>head,
+ percent*100,
+ fraction,diese,
+ wheel[arrow],
+ 50-fraction,spaces,
+ days,hour,minu,sec)
+ else:
+ <void>fprintf(stderr,b'\r%s %5.1f %% |%.*s%c%.*s] remain : %02d:%02d:%02d',
+ <char*>head,
+ percent*100.,
+ fraction,diese,
+ wheel[arrow],
+ 50-fraction,spaces,
+ hour,minu,sec)
+
+
+ else:
+ cycle+=1
+
# the updated state is written back into the shared default list
+ step[0:3] = freq,cycle,arrow
+
\ No newline at end of file
diff --git a/src/obitools/utils/bioseq.py b/src/obitools/utils/bioseq.py
new file mode 100644
index 0000000..2031ab1
--- /dev/null
+++ b/src/obitools/utils/bioseq.py
@@ -0,0 +1,234 @@
def mergeTaxonomyClassification(uniqSeq,taxonomy):
    '''
    Annotate merged sequences with the last common taxon of the taxids
    they gather.

    For each sequence having a non empty C{merged_taxid} attribute, the
    C{taxid}, C{species}, C{genus}, C{family}, their names, C{rank} and
    C{scientific_name} attributes are set. A missing species, genus or
    family is coded by the taxid -1 and the name C{"###"}.

    @param uniqSeq: the sequences to annotate, modified in place
    @param taxonomy: the taxonomy used to classify the sequences
    '''
    def describe(taxid):
        # (taxid, scientific name) pair of an optional taxon
        if taxid is None:
            return -1,"###"
        return taxid,taxonomy.getScientificName(taxid)

    for seq in uniqSeq:
        if not seq['merged_taxid']:
            continue

        seq['taxid']=taxonomy.lastCommonTaxon(*seq['merged_taxid'].keys())
        species = taxonomy.getSpecies(seq['taxid'])
        genus = taxonomy.getGenus(seq['taxid'])
        family = taxonomy.getFamily(seq['taxid'])

        species,species_name = describe(species)
        genus,genus_name = describe(genus)
        family,family_name = describe(family)

        seq['species']=species
        seq['genus']=genus
        seq['family']=family

        seq['species_name']=species_name
        seq['genus_name']=genus_name
        seq['family_name']=family_name

        seq['rank']=taxonomy.getRank(seq['taxid'])
        seq['scientific_name']=taxonomy.getScientificName(seq['taxid'])
+
def uniqSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categories=None):
    '''
    Merge the strictly identical sequences of a set.

    The first occurrence of each sequence is kept and modified in place:
    its C{count} attribute sums the counts of the merged sequences and,
    for each merged key, a C{merged_<key>} dictionary counts the values
    observed. The attributes having different values among the merged
    sequences are deleted.

    @param seqIterator: the sequences to merge
    @param taxonomy: if set, C{taxid} is added to the merged keys and the
                     result is annotated by
                     L{mergeTaxonomyClassification}
    @param mergedKey: names of the attributes to summarize
    @param mergeIds: if true the ids of the merged sequences are stored
                     in the C{merged} attribute, and their taxids in the
                     C{taxid_dist} attribute
    @param categories: names of attributes that must also be identical
                       for two sequences to be merged
    @return: the list of the unique sequences
    @rtype: list
    '''
    uniques={}
    uniqSeq=[]

    if categories is None:
        categories=[]

    if mergedKey is not None:
        mergedKey=set(mergedKey)
    else:
        mergedKey=set()

    if taxonomy is not None:
        mergedKey.add('taxid')

    # attributes maintained by the merge itself, never to be deleted
    protected = ('count','merged','taxid_dist')

    for seq in seqIterator:
        s = tuple(seq[x] for x in categories) + (str(seq),)
        if s in uniques:
            s = uniques[s]
            if 'count' in seq:
                s['count']+=seq['count']
            else:
                s['count']+=1
                seq['count']=1
            for key in mergedKey:
                if key=='taxid' and mergeIds:
                    if 'taxid_dist' in seq:
                        s["taxid_dist"].update(seq["taxid_dist"])
                    if 'taxid' in seq:
                        s["taxid_dist"][seq.id]=seq['taxid']

                mkey = "merged_%s" % key
                if key in seq:
                    # the incoming sequence carries the plain attribute
                    s[mkey][seq[key]]=s[mkey].get(seq[key],0)+seq['count']
                else:
                    # the incoming sequence was already merged once
                    if mkey in seq:
                        for skey in seq[mkey]:
                            s[mkey][skey]=s[mkey].get(skey,0)+seq[mkey][skey]

            for key in seq.iterkeys():
                # drop the attributes that are not shared by all the
                # merged sequences
                if (key not in protected and key[0:7]!='merged_'
                    and key in s and s[key]!=seq[key]):
                    del(s[key])

            if mergeIds:
                s['merged'].append(seq.id)
        else:
            uniques[s]=seq
            # set before the loop below, which reads seq['count']
            if 'count' not in seq:
                seq['count']=1
            for key in mergedKey:
                if key=='taxid' and mergeIds:
                    if 'taxid_dist' not in seq:
                        seq["taxid_dist"]={}
                    if 'taxid' in seq:
                        seq["taxid_dist"][seq.id]=seq['taxid']
                mkey = "merged_%s" % key
                if mkey not in seq:
                    seq[mkey]={}
                if key in seq:
                    seq[mkey][seq[key]]=seq[mkey].get(seq[key],0)+seq['count']
                    del(seq[key])

            if mergeIds:
                seq['merged']=[seq.id]
            uniqSeq.append(seq)

    if taxonomy is not None:
        mergeTaxonomyClassification(uniqSeq, taxonomy)

    return uniqSeq
+
def uniqPrefixSequence(seqIterator,taxonomy=None,mergedKey=None,mergeIds=False,categories=None):
    '''
    Merge each sequence being a prefix of a longer one into this longer
    sequence.

    The sequences are sorted, then visited from the last one to the first
    one; a sequence equal to the beginning of the current kept sequence
    is merged into it, otherwise it becomes the current kept sequence.

    @param seqIterator: the sequences to merge
    @param taxonomy: if set, C{taxid} is added to the merged keys and the
                     result is annotated by
                     L{mergeTaxonomyClassification}
    @param mergedKey: names of the attributes to summarize in
                      C{merged_<key>} dictionaries
    @param mergeIds: if true the ids of the merged sequences are stored
                     in the C{merged} attribute
    @param categories: not used by this function
    @return: the list of the kept sequences
    @rtype: list
    '''
    if categories is None:
        categories=[]

    if mergedKey is not None:
        mergedKey=set(mergedKey)
    else:
        mergedKey=set()

    if taxonomy is not None:
        mergedKey.add('taxid')

    sequences=list(seqIterator)

    if not sequences:
        return []

    sequences.sort(key=str)

    old=sequences.pop()
    uniqSeq=[old]
    if 'count' not in old:
        old['count']=1
    for key in mergedKey:
        mkey = "merged_%s" % key
        if mkey not in old:
            old[mkey]={}
        if key in old:
            old[mkey][old[key]]=old[mkey].get(old[key],0)+1
    if mergeIds:
        old['merged']=[old.id]

    while(sequences):
        seq=sequences.pop()
        lseq=len(seq)
        pold = str(old)[0:lseq]
        if pold==str(seq):

            if 'count' in seq:
                old['count']+=seq['count']
            else:
                old['count']+=1

            for key in mergedKey:
                mkey = "merged_%s" % key
                if key in seq:
                    old[mkey][seq[key]]=old[mkey].get(seq[key],0)+1
                if mkey in seq:
                    # add the counters of an already merged sequence to
                    # those of the kept one
                    for skey in seq[mkey]:
                        old[mkey][skey]=old[mkey].get(skey,0)+seq[mkey][skey]

            for key in seq.iterkeys():
                # Drop the attributes not shared by the merged sequences,
                # except those maintained by the merge itself.
                if (key!='count' and key!='merged' and key[0:7]!='merged_'
                    and key in old and old[key]!=seq[key]):
                    del(old[key])

            if mergeIds:
                old['merged'].append(seq.id)
        else:
            old=seq

            for key in mergedKey:
                mkey = "merged_%s" % key
                if mkey not in seq:
                    seq[mkey]={}
                if key in seq:
                    seq[mkey][seq[key]]=seq[mkey].get(seq[key],0)+1
                    del(seq[key])

            if 'count' not in seq:
                seq['count']=1
            if mergeIds:
                seq['merged']=[seq.id]
            uniqSeq.append(seq)

    if taxonomy is not None:
        mergeTaxonomyClassification(uniqSeq, taxonomy)

    return uniqSeq
+
+
+
+
+def _cmpOnKeyGenerator(key,reverse=False):
+ def compare(x,y):
+ try:
+ c1 = x[key]
+ except KeyError:
+ c1=None
+
+ try:
+ c2 = y[key]
+ except KeyError:
+ c2=None
+
+ if reverse:
+ s=c1
+ c1=c2
+ c2=s
+ return cmp(c1,c2)
+
+ return compare
+
def sortSequence(seqIterator,key,reverse=False):
    '''
    Sort sequences on the value of one of their attributes.

    The sequences without the attribute are placed first (last when
    C{reverse} is true). The sort is stable.

    @param seqIterator: the sequences to sort
    @param key: name of the attribute used as sort key
    @param reverse: if true the sequences are sorted in decreasing order
    @return: the sorted list of sequences
    @rtype: list
    '''
    def rank(seq):
        try:
            value = seq[key]
        except KeyError:
            value = None
        # missing values sort before every actual value
        if value is None:
            return (0,)
        return (1,value)

    # a key function replaces the positional cmp argument of list.sort,
    # which Python 3 no longer accepts
    seqs = list(seqIterator)
    seqs.sort(key=rank,reverse=reverse)
    return seqs
+
\ No newline at end of file
diff --git a/src/obitools/utils/crc64.py b/src/obitools/utils/crc64.py
new file mode 100644
index 0000000..537391e
--- /dev/null
+++ b/src/obitools/utils/crc64.py
@@ -0,0 +1,53 @@
+#
+# Code obtained from :
+# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/259177/index_txt
+#
+
+# Initialisation
+# 32 first bits of generator polynomial for CRC64
+# the 32 lower bits are assumed to be zero
+
POLY64REVh = 0xd8000000
CRCTableh = [0] * 256
CRCTablel = [0] * 256
isInitialized = False

def CRC64(aString):
    '''
    Compute the CRC64 of a string.

    The lookup tables are filled during the first call.

    @param aString: the string to digest
    @type aString: str
    @return: the high and low 32 bit halves of the CRC
    @rtype: tuple of two integers
    '''
    global isInitialized

    if isInitialized is not True:
        isInitialized = True
        for index in range(256):
            low = index
            high = 0
            for _ in range(8):
                carry = low & 1
                # shift the 64 bit value (high,low) one bit to the right
                low >>= 1
                if high & 1:
                    low |= 1 << 31
                high >>= 1
                if carry:
                    high ^= POLY64REVh
            CRCTableh[index] = high
            CRCTablel[index] = low

    crcl = 0
    crch = 0
    for item in aString:
        spill = (crch & 0xFF) << 24
        shifted_h = crch >> 8
        shifted_l = (crcl >> 8) | spill
        slot = (crcl ^ ord(item)) & 0xFF

        crch = shifted_h ^ CRCTableh[slot]
        crcl = shifted_l ^ CRCTablel[slot]
    return (crch, crcl)

def CRC64digest(aString):
    '''
    @return: the CRC64 of a string as 16 upper case hexadecimal digits
    @rtype: str
    '''
    high, low = CRC64(aString)
    return "%08X%08X" % (high, low)

if __name__ == '__main__':
    assert CRC64("IHATEMATH") == (3822890454, 2600578513)
    assert CRC64digest("IHATEMATH") == "E3DCADD69B01ADD1"
    print('CRC64: dumb test successful')
+
diff --git a/src/obitools/utils/iterator.py b/src/obitools/utils/iterator.py
new file mode 100644
index 0000000..f53537f
--- /dev/null
+++ b/src/obitools/utils/iterator.py
@@ -0,0 +1,8 @@
+from itertools import chain
+
def uniqueChain(*args):
    '''
    Chain several iterables, yielding each distinct item only once, at
    its first occurrence.

    @param args: the iterables to chain; their items must be hashable
    @return: an iterator over the distinct items
    '''
    seen = set()
    for iterable in args:
        for item in iterable:
            if item in seen:
                continue
            seen.add(item)
            yield item
\ No newline at end of file
diff --git a/src/obitools/version.py b/src/obitools/version.py
new file mode 100644
index 0000000..863c966
--- /dev/null
+++ b/src/obitools/version.py
@@ -0,0 +1,5 @@
# components of the package version
major = 1
minor = 2
serial = '11'

# printable form of the version
version = "%2d.%02d %s" % (major, minor, serial)
diff --git a/src/obitools/word/__init__.py b/src/obitools/word/__init__.py
new file mode 100644
index 0000000..2719b3b
--- /dev/null
+++ b/src/obitools/word/__init__.py
@@ -0,0 +1,135 @@
+from itertools import imap
+from _binary import *
+
+def wordCount(liste):
+    """
+    Count the occurrences of each element of an iterable.
+
+    @param liste: an iterable over hashable elements
+    @return: a dictionary mapping each element to its occurrence count
+    @rtype: dict
+    """
+    occurrences = {}
+    for element in liste:
+        if element in occurrences:
+            occurrences[element] += 1
+        else:
+            occurrences[element] = 1
+    return occurrences
+
+
+def wordIterator(sequence,lword,step=1,endIncluded=False,circular=False):
+    """
+    Iterate over the encoded words of a sequence.
+
+    @param sequence: the sequence to scan (converted with str())
+    @param lword: the word length
+    @param step: the distance between two consecutive word starts
+    @param endIncluded: if True, word starts run up to the last position,
+                        so the last words are shorter than lword
+    @param circular: if True, the first lword letters are appended to the
+                     sequence so that words wrap around its end
+
+    @return: an iterator over the values returned by encodeWord
+    """
+    assert not (endIncluded and circular), \
+        "endIncluded and circular cannot both be set to True at the same time"
+
+    L = len(sequence)
+    sequence = str(sequence)
+    if circular:
+        sequence += sequence[0:lword]
+        pmax = L
+    elif endIncluded:
+        pmax = L
+    else:
+        # only positions where a complete word fits
+        pmax = L - lword + 1
+
+    for x in xrange(0, pmax, step):
+        yield encodeWord(sequence[x:x+lword])
+
+# Scan the words of `sequence` (obtained from wordIterator) and yield
+# (sequence.id, pos, lfilter, filter) tuples, where `filter` is an integer
+# with bit w set for every word code w added to it, `lfilter` counts the
+# words added and `winfilter` the distinct ones. A tuple is yielded each
+# time `winfilter` reaches `maxword`, and once more after the last word.
+# NOTE(review): the original indentation was lost in this copy; the
+# description assumes the statements from `yield` down to `buffer=[]`
+# all belong to the `if winfilter>=maxword:` branch - confirm upstream.
+def filterIterator(sequence,step=32,maxword=50,wordsize=4,circular=False):
+
+ assert step < 64
+
+ wi = wordIterator(sequence, wordsize, circular=circular)
+
+ lfilter=0
+ winfilter=0
+ filter=0
+ buffer=[]
+ pos=0
+
+ for w in wi:
+ # one bit per possible word code
+ code = 1 << w
+ buffer.append(code)
+ # keep at most `step` of the most recent codes
+ if len(buffer)>step:
+ buffer.pop(0)
+ lfilter+=1
+ if not filter & code:
+ filter |=code
+ winfilter+=1
+ if winfilter>=maxword:
+ yield (sequence.id,pos,lfilter,filter)
+ pos = pos + lfilter - len(buffer)
+ filter=0
+ lfilter=0
+ winfilter=0
+ # seed the next filter with the codes kept in the buffer
+ for code in buffer:
+ lfilter+=1
+ if not filter & code:
+ filter |=code
+ winfilter+=1
+ buffer=[]
+
+ yield (sequence.id,pos,lfilter,filter)
+
+def filterDict(sequence,step=32,maxword=50,wordsize=4,circular=False,filters=None):
+    """
+    Store the filters produced by filterIterator for `sequence` in a
+    dictionary indexed by filter value.
+
+    The result maps filter -> {sequence id -> [(pos, lfilter), ...]}.
+
+    @param filters: an existing dictionary to complete; a new one is
+                    created when None
+    @return: the completed dictionary (`filters` itself when provided)
+    @rtype: dict
+    """
+    if filters is None:
+        filters = {}
+
+    for seqid, pos, lfilter, wfilter in filterIterator(sequence, step, maxword,
+                                                       wordsize, circular):
+        # The lookup and the store must use the same key (the filter value);
+        # looking up by lfilter discarded the entries already recorded.
+        perSequence = filters.setdefault(wfilter, {})
+        perSequence.setdefault(seqid, []).append((pos, lfilter))
+
+    return filters
+
+def primerWordMin(lprimer,error=2,lword=4):
+    """
+    Compute a word count from a primer length, a number of errors and a
+    word length.
+
+    @param lprimer: the primer length
+    @type lprimer: int
+    @param error: the number of errors
+    @type error: int
+    @param lword: the word length
+    @type lword: int
+
+    @return: the computed count, 0 when the per-part length is below lword
+    @rtype: int
+    """
+    parts = error + 1
+    match = lprimer - error
+    # explicit floor division: the result must stay an integer whatever
+    # the division semantics in force
+    mparts = (match // parts) + 1
+
+    if mparts < lword:
+        return 0
+
+    remains = mparts * parts - match
+
+    return (mparts - lword + 1) * parts - remains
+
+def wordSelector(words,accept=None,reject=None):
+    '''
+    Filter over a DNA word iterator.
+
+    A word is kept when every predicate of `accept` is true for it and
+    no predicate of `reject` is.
+
+    @param words: an iterable object over DNA words
+    @type words: an iterator
+    @param accept: a list of predicates. Each predicate is a function
+                   accepting one word and returning a boolean value.
+    @type accept: list
+    @param reject: a list of predicates. Each predicate is a function
+                   accepting one word and returning a boolean value.
+    @type reject: list
+
+    @return: an iterator over the conserved DNA words
+    @rtype: iterator
+    '''
+    if accept is None:
+        accept = []
+    if reject is None:
+        reject = []
+    for w in words:
+        accepted = all(p(w) for p in accept)
+        rejected = any(p(w) for p in reject)
+        if accepted and not rejected:
+            yield w
+
diff --git a/src/obitools/word/_binary.pyx b/src/obitools/word/_binary.pyx
new file mode 100644
index 0000000..17c2b50
--- /dev/null
+++ b/src/obitools/word/_binary.pyx
@@ -0,0 +1,269 @@
+'''
+Created on 2 juil. 2009
+
+@author: coissac
+'''
+
+
+# Number of 2-bit codes that fit in one unsigned long int.
+maxword = sizeof(unsigned long int) * 8 /2
+
+cdef import from "math.h":
+ double ceil(double x)
+ double log(double x)
+
+cdef int binarywordsize(unsigned long int x):
+    # Number of bits needed to write x in binary (0 for x == 0).
+    # Counted with shifts: ceil(log2(x)) is one bit short for exact
+    # powers of two and is undefined for 0.
+    cdef int nbits = 0
+    while x:
+        nbits += 1
+        x >>= 1
+    return nbits
+
+cpdef str bin2str(unsigned long int x):
+    '''
+    Return the binary representation of x, most significant bit first,
+    on the number of digits given by binarywordsize(x).
+    '''
+    cdef str rep=''
+    cdef unsigned long int i
+    # shift an unsigned long, not an int, so that high bits can be tested
+    cdef unsigned long int one = 1
+    cdef int ws = binarywordsize(x)
+
+    for i in range(ws):
+        if x & (one << i):
+            rep = '1' + rep
+        else:
+            rep = '0' + rep
+    return rep
+
+
+
+cdef class WordPattern :
+    '''
+    Four integer masks, one per nucleotide (a, c, g and t).
+    '''
+    cdef public unsigned long int a
+    cdef public unsigned long int c
+    cdef public unsigned long int g
+    cdef public unsigned long int t
+
+    def __init__(self, unsigned long int a,
+                       unsigned long int c,
+                       unsigned long int g,
+                       unsigned long int t):
+        self.t=t
+        self.g=g
+        self.c=c
+        self.a=a
+
+    def __str__(self):
+        # each mask is displayed in binary
+        masks = (bin2str(self.a),
+                 bin2str(self.c),
+                 bin2str(self.g),
+                 bin2str(self.t))
+        return b"(a:%s,c:%s,g:%s,t:%s)" % masks
+
+cdef unsigned int bitCount(unsigned long int x):
+    # Count the bits set in x: each x & (x-1) clears the lowest set bit.
+    cdef unsigned int nset = 0
+    while x != 0:
+        x = x & (x - 1)
+        nset = nset + 1
+    return nset
+
+def allDNAWordIterator(size):
+    '''
+    Iterate through the codes of all the DNA words of size `size`,
+    i.e. the integers from 0 to 4**size - 1.
+
+    @param size: size of the DNA word
+    @type size: int
+
+    @return: an iterable over DNA word codes (int)
+    @rtype: xrange
+    '''
+    return xrange(4**size)
+
+cpdef int wordDist(unsigned long int w1,unsigned long int w2):
+    '''
+    Count the 2-bit positions at which two encoded words differ.
+
+    @param w1: the first encoded word
+    @type w1: int
+    @param w2: the second encoded word
+    @type w2: int
+
+    @return: the count of differences between the two words
+    @rtype: int
+    '''
+    cdef unsigned long int differing
+
+    # bits that are set in exactly one of the two words
+    differing = w1 ^ w2
+    # fold each 2-bit pair onto its low bit
+    differing = (differing | (differing >> 1)) & 0x55555555
+    return bitCount(differing)
+
+cpdef int homoMax(unsigned long int word,unsigned int size):
+    '''
+    Compare `word` with copies of itself shifted by 1, 2, ... letters and
+    return the number of shifts done when no position is any longer
+    identical at every shift tried.
+    NOTE(review): presumably the length of the longest homopolymer of the
+    word - confirm against the callers.
+
+    @param word: the encoded word
+    @param size: the word length
+    '''
+    cdef unsigned long int mask
+    cdef unsigned long int good
+    cdef unsigned long int maxi
+    cdef unsigned long int shift
+    # typed local: the former untyped `id` shadowed the builtin and made
+    # the loop run on Python objects
+    cdef unsigned long int same
+
+    mask = (1 << (size << 1))-1
+    good = 0x55555555
+    maxi=0
+    shift = word
+    while good:
+        maxi+=1
+        shift>>=2
+        mask>>=2
+        # bits equal in the word and in its shifted copy
+        same = (word & shift) | (~word & ~shift)
+        good&= same & (same>>1) & mask
+    return maxi
+
+cpdef int countA(unsigned long int word,unsigned int size):
+    '''
+    Count the positions of the word coded 00 (letter a).
+    '''
+    cdef unsigned long int inverted = ~word
+    cdef unsigned long int mask = (1 << (size << 1))-1
+    cdef unsigned long int hits
+
+    # a pair is 00 when both of its inverted bits are set
+    hits = inverted & (inverted>>1) & 0x55555555 & mask
+    return bitCount(hits)
+
+cpdef int countT(unsigned long int word,unsigned int size):
+    '''
+    Count the positions of the word coded 11 (letter t).
+    `size` is not used.
+    '''
+    cdef unsigned long int hits = word & (word>>1)
+
+    return bitCount(hits & 0x55555555)
+
+cpdef int countAT(unsigned long int word,unsigned int size):
+    '''
+    Count the positions of the word whose two bits are equal
+    (00 or 11, letters a and t).
+    '''
+    cdef unsigned long int mask = (1 << (size << 1))-1
+    cdef unsigned long int shifted = word >> 1
+    cdef unsigned long int equalbits
+
+    equalbits = (word & shifted) | (~word & ~shifted)
+    return bitCount(equalbits & 0x55555555 & mask)
+
+cpdef int countC(unsigned long int word,unsigned int size):
+    '''
+    Count the positions of the word coded 01 (letter c).
+    '''
+    cdef unsigned long int mask = (1 << (size << 1))-1
+    cdef unsigned long int lowset = word & 0x55555555
+    cdef unsigned long int highclear = ~word & 0xAAAAAAAA
+    cdef unsigned long int hits = lowset | highclear
+
+    hits = hits & (hits >> 1) & 0x55555555 & mask
+    return bitCount(hits)
+
+cpdef int countG(unsigned long int word,unsigned int size):
+    '''
+    Count the positions of the word coded 10 (letter g).
+    '''
+    cdef unsigned long int mask = (1 << (size << 1))-1
+    cdef unsigned long int highset = word & 0xAAAAAAAA
+    cdef unsigned long int lowclear = ~word & 0x55555555
+    cdef unsigned long int hits = highset | lowclear
+
+    hits = hits & (hits >> 1) & 0x55555555 & mask
+    return bitCount(hits)
+
+cpdef int countCG(unsigned long int word,unsigned int size):
+    '''
+    Count the positions of the word whose two bits differ
+    (01 or 10, letters c and g).
+    '''
+    cdef unsigned long int mask = (1 << (size << 1))-1
+    cdef unsigned long int shifted = word >> 1
+    cdef unsigned long int differing
+
+    differing = (word & ~shifted) | (~word & shifted)
+    return bitCount(differing & 0x55555555 & mask)
+
+
+cpdef str decodeWord(unsigned long int word,unsigned int size):
+    '''
+    Convert an encoded word back to its string of a, c, g and t.
+
+    @param word: the encoded word
+    @param size: the word length; 0 gives the empty string
+    '''
+    cdef unsigned int k
+    cdef list letters = []
+
+    # Letters are read from the most significant pair down. Looping on the
+    # letter index avoids computing size*2-2, which wraps around for
+    # size == 0 because size is unsigned.
+    for k in range(size):
+        letters.append('acgt'[(word >> (2 * (size - 1 - k))) & 3])
+    return ''.join(letters)
+
+cpdef int encodeWord(word) except -1:
+    '''
+    Encode a DNA word on 2 bits per letter (a=0, c=1, g=2, t=3), the
+    first letter in the most significant position. Case is ignored.
+
+    Raises RuntimeError on any other letter.
+    '''
+    assert len(word)<=32,"Word length should be less or equal to 32"
+    codes = {'a': 0, 'c': 1, 'g': 2, 't': 3}
+    w=0
+    word=word.lower()
+    for l in word:
+        code = codes.get(l)
+        if code is None:
+            raise RuntimeError("word should only contain a, c, g or t (%s)" % word)
+        w = (w << 2) | code
+    return w
+
+def encodePattern(pattern):
+    '''
+    Build a WordPattern from a pattern string.
+
+    For each letter of the pattern, the four masks are shifted by two
+    bits and the low bit is set in the mask of every nucleotide whose
+    letter list contains that letter. Case is ignored.
+    '''
+    a = c = g = t = 0
+
+    for letter in pattern.lower():
+        a = (a << 2) | (1 if letter in 'armwdhvn' else 0)
+        c = (c << 2) | (1 if letter in 'cymsbhvn' else 0)
+        g = (g << 2) | (1 if letter in 'grksbdvn' else 0)
+        t = (t << 2) | (1 if letter in 'tykwbdhn' else 0)
+
+    return WordPattern(a,c,g,t)
+
+cpdef bint matchPattern(unsigned long int word,pattern):
+    '''
+    Test an encoded word against a pattern holding a, c, g and t masks:
+    True when every position set in one of the masks carries a letter
+    allowed by the pattern at that position.
+    '''
+    expected = pattern.a|pattern.c|pattern.g|pattern.t
+
+    # for each letter, a value whose two bits are set where the word has it
+    isA = ~word
+    isC = (word & 0x55555555 | ~word & 0xAAAAAAAA)
+    isG = (word & 0xAAAAAAAA | ~word & 0x55555555)
+    isT = word
+
+    found = isA & (isA >> 1) & pattern.a
+    found|= isC & (isC >> 1) & pattern.c
+    found|= isG & (isG >> 1) & pattern.g
+    found|= isT & (isT >> 1) & pattern.t
+    return found == expected
+
+cdef class ErrorPositionIterator:
+    '''
+    Iterate over the integers having exactly `errorcount` bits set among
+    the `wordsize` lowest bits, i.e. over every way to place
+    `errorcount` errors in a word of `wordsize` positions.
+
+    NOTE(review): the iteration method is named `next` as in the original
+    code; check that the extension type is iterable with the Cython
+    version in use (extension types normally define `__next__`).
+    '''
+
+    cdef int _wsize
+    cdef int _errors
+    cdef unsigned long int _mask
+    cdef int _errorpos[32]
+    cdef bint _end
+
+    def __init__(self,wordsize,errorcount):
+        # one slot of _errorpos is reserved for the sentinel stored below
+        if errorcount < 0 or errorcount > 31 or errorcount > wordsize:
+            raise ValueError("errorcount must be between 0 and min(wordsize,31)")
+        self._wsize=wordsize
+        self._errors=errorcount
+        self._mask=0
+        for i in range(errorcount):
+            self._errorpos[i]=i
+        # sentinel: upper bound for the last error position
+        self._errorpos[errorcount]=wordsize
+        self._end=False
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        cdef unsigned long int rep
+        cdef unsigned long int one = 1
+        cdef int i
+        if self._end:
+            raise StopIteration
+
+        # mask of the current error positions
+        rep = 0
+        for i in range(self._errors):
+            rep |= one << self._errorpos[i]
+
+        # Move to the next combination: advance the lowest position that
+        # has room below its successor, after resetting every position
+        # under it to its minimal value.
+        i=0
+        while i < self._errors:
+            if self._errorpos[i]<self._errorpos[i+1]-1:
+                self._errorpos[i]+=1
+                break
+            self._errorpos[i]=i
+            i+=1
+
+        # no position could move: the mask returned now is the last one
+        if i==self._errors:
+            self._end=True
+
+        return rep
+
\ No newline at end of file
diff --git a/src/obitools/word/_readindex.cfiles b/src/obitools/word/_readindex.cfiles
new file mode 100644
index 0000000..3f1b3df
--- /dev/null
+++ b/src/obitools/word/_readindex.cfiles
@@ -0,0 +1 @@
+_readindex.h
\ No newline at end of file
diff --git a/src/obitools/word/_readindex.ext.1.c b/src/obitools/word/_readindex.ext.1.c
new file mode 100644
index 0000000..0b8b1be
--- /dev/null
+++ b/src/obitools/word/_readindex.ext.1.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * The contents of this file constitute Original Code as defined in and
+ * are subject to the Apple Public Source License Version 1.1 (the
+ * "License"). You may not use this file except in compliance with the
+ * License. Please obtain a copy of the License at
+ * http://www.apple.com/publicsource and read it before using this file.
+ *
+ * This Original Code and all software distributed under the License are
+ * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+/*
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Ronnie Kon at Mindcraft Inc., Kevin Lew and Elmer Yglesias.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#include <sys/types.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stddef.h>
+
+/*
+ * Swap two areas of size number of bytes. Although qsort(3) permits random
+ * blocks of memory to be sorted, sorting pointers is almost certainly the
+ * common case (and, were it not, could easily be made so). Regardless, it
+ * isn't worth optimizing; the SWAP's get sped up by the cache, and pointer
+ * arithmetic gets lost in the time required for comparison function calls.
+ *
+ * Both a and b are advanced by size bytes as a side effect. The body runs
+ * at least once before count is tested, so size must not be zero.
+ */
+#define SWAP(a, b, count, size, tmp) { \
+ count = size; \
+ do { \
+ tmp = *a; \
+ *a++ = *b; \
+ *b++ = tmp; \
+ } while (--count); \
+}
+
+/*
+ * Copy one block of size size to another: size bytes are copied from b
+ * to a through the scratch pointers tmp1 and tmp2, so a and b themselves
+ * are left unchanged. As for SWAP, size must not be zero.
+ */
+#define COPY(a, b, count, size, tmp1, tmp2) { \
+ count = size; \
+ tmp1 = a; \
+ tmp2 = b; \
+ do { \
+ *tmp1++ = *tmp2++; \
+ } while (--count); \
+}
+
+/*
+ * Build the list into a heap, where a heap is defined such that for
+ * the records K1 ... KN, Kj/2 >= Kj for 1 <= j/2 <= j <= N.
+ *
+ * There are two cases. If j == nmemb, select largest of Ki and Kj. If
+ * j < nmemb, select largest of Ki, Kj and Kj+1.
+ *
+ * The macro uses the variables base and compar of the expansion site.
+ */
+#define CREATE(initval, nmemb, par_i, child_i, par, child, size, count, tmp) { \
+ for (par_i = initval; (child_i = par_i * 2) <= nmemb; \
+ par_i = child_i) { \
+ child = base + child_i * size; \
+ if (child_i < nmemb && compar(child, child + size) < 0) { \
+ child += size; \
+ ++child_i; \
+ } \
+ par = base + par_i * size; \
+ if (compar(child, par) <= 0) \
+ break; \
+ SWAP(par, child, count, size, tmp); \
+ } \
+}
+
+/*
+ * Select the top of the heap and 'heapify'. Since by far the most expensive
+ * action is the call to the compar function, a considerable optimization
+ * in the average case can be achieved due to the fact that k, the displaced
+ * element, is usually quite small, so it would be preferable to first
+ * heapify, always maintaining the invariant that the larger child is copied
+ * over its parent's record.
+ *
+ * Then, starting from the *bottom* of the heap, finding k's correct place,
+ * again maintaining the invariant. As a result of the invariant no element
+ * is 'lost' when k is assigned its correct place in the heap.
+ *
+ * The time savings from this optimization are on the order of 15-20% for the
+ * average case. See Knuth, Vol. 3, page 158, problem 18.
+ *
+ * The macro uses the variables base and compar of the expansion site.
+ *
+ * XXX Don't break the #define SELECT line, below. Reiser cpp gets upset.
+ */
+#define SELECT(par_i, child_i, nmemb, par, child, size, k, count, tmp1, tmp2) { \
+ for (par_i = 1; (child_i = par_i * 2) <= nmemb; par_i = child_i) { \
+ child = base + child_i * size; \
+ if (child_i < nmemb && compar(child, child + size) < 0) { \
+ child += size; \
+ ++child_i; \
+ } \
+ par = base + par_i * size; \
+ COPY(par, child, count, size, tmp1, tmp2); \
+ } \
+ for (;;) { \
+ child_i = par_i; \
+ par_i = child_i / 2; \
+ child = base + child_i * size; \
+ par = base + par_i * size; \
+ if (child_i == 1 || compar(k, par) < 0) { \
+ COPY(child, k, count, size, tmp1, tmp2); \
+ break; \
+ } \
+ COPY(child, par, count, size, tmp1, tmp2); \
+ } \
+}
+
+/*
+ * Heapsort -- Knuth, Vol. 3, page 145. Runs in O (N lg N), both average
+ * and worst. While heapsort is faster than the worst case of quicksort,
+ * the BSD quicksort does median selection so that the chance of finding
+ * a data set that will trigger the worst case is nonexistent. Heapsort's
+ * only advantage over quicksort is that it requires little additional memory.
+ *
+ * Sorts in place the nmemb elements of size bytes starting at vbase,
+ * ordered by compar. Returns 0 on success, -1 on error: errno is set to
+ * EINVAL when size is zero, and a failed malloc of the size-byte scratch
+ * element is also reported with -1.
+ *
+ * The definition uses an ISO C prototype (no dependency on the BSD __P
+ * macro) and size_t indices, so that element counts are not truncated to
+ * int.
+ */
+int
+heapsort(void *vbase, size_t nmemb, size_t size,
+    int (*compar)(const void *, const void *))
+{
+    size_t cnt, i, j, l;
+    char tmp, *tmp1, *tmp2;
+    char *base, *k, *p, *t;
+
+    if (nmemb <= 1)
+        return (0);
+
+    if (!size) {
+        errno = EINVAL;
+        return (-1);
+    }
+
+    if ((k = malloc(size)) == NULL)
+        return (-1);
+
+    /*
+     * Items are numbered from 1 to nmemb, so offset from size bytes
+     * below the starting address.
+     */
+    base = (char *)vbase - size;
+
+    for (l = nmemb / 2 + 1; --l;)
+        CREATE(l, nmemb, i, j, t, p, size, cnt, tmp);
+
+    /*
+     * For each element of the heap, save the largest element into its
+     * final slot, save the displaced element (k), then recreate the
+     * heap.
+     */
+    while (nmemb > 1) {
+        COPY(k, base + nmemb * size, cnt, size, tmp1, tmp2);
+        COPY(base + nmemb * size, base + size, cnt, size, tmp1, tmp2);
+        --nmemb;
+        SELECT(i, j, nmemb, t, p, size, k, cnt, tmp1, tmp2);
+    }
+    free(k);
+    return (0);
+}
diff --git a/src/obitools/word/_readindex.h b/src/obitools/word/_readindex.h
new file mode 100644
index 0000000..438a7a0
--- /dev/null
+++ b/src/obitools/word/_readindex.h
@@ -0,0 +1,94 @@
+/*********************************************************************************/
+/*********************************************************************************/
+/*********************************************************************************/
+
+/*
+macros.h:
+Binary constant generator macro
+By Tom Torfs - donated to the public domain
+*/
+
+/* All macro's evaluate to compile-time constants */
+
+/* *** helper macros *** */
+
+/* turn a numeric literal into a hex constant
+(avoids problems with leading zeroes)
+8-bit constants max value 0x11111111, always fits in unsigned long
+*/
+#define HEX__(n) 0x##n##LU
+
+/* 8-bit conversion function: each hex digit of x stands for one binary
+digit, the lowest hex digit being bit 0 */
+#define B8__(x) ((x&0x0000000FLU)?1:0) \
++((x&0x000000F0LU)?2:0) \
++((x&0x00000F00LU)?4:0) \
++((x&0x0000F000LU)?8:0) \
++((x&0x000F0000LU)?16:0) \
++((x&0x00F00000LU)?32:0) \
++((x&0x0F000000LU)?64:0) \
++((x&0xF0000000LU)?128:0)
+
+/* *** user macros *** */
+
+/* for up to 8-bit binary constants, e.g. B8(00000011) is 3 */
+#define B8(d) ((unsigned char)B8__(HEX__(d)))
+
+/* for up to 16-bit binary constants, MSB first:
+the first byte is shifted 8 bits left, the second one fills the low byte */
+#define B16(dmsb,dlsb) (((unsigned short)B8(dmsb)<<8) \
++ B8(dlsb))
+
+/* for up to 32-bit binary constants, MSB first:
+the four bytes are shifted by 24, 16, 8 and 0 bits */
+#define B32(dmsb,db2,db3,dlsb) (((unsigned long)B8(dmsb)<<24) \
++ ((unsigned long)B8(db2)<<16) \
++ ((unsigned long)B8(db3)<<8) \
++ B8(dlsb))
+
+/*********************************************************************************/
+/*********************************************************************************/
+/*********************************************************************************/
+
+/*
+typedef struct obinuc {
+ unsigned int seqused : 1; // this sequence is already used
+ unsigned int endofread : 1; // this word is already used
+ unsigned int zero : 1; // this is a non standard nucleotide
+ unsigned int direction : 1; // 0 -> use direct word 1 -> use reverse word
+ unsigned int reverse : 2; // reverse nucleotide 0 : A 1 : C 2 : G 3 : T
+ unsigned int forward : 2; // forward nucleotide 0 : A 1 : C 2 : G 3 : T
+} *pobinuc, obinuc;
+*/
+
+/*
+ * A nucleotide is stored in a single char. The macros below use the
+ * following layout: bit 7 seqused, bit 6 endofread, bit 5 zero,
+ * bit 4 direction, bits 3-2 reverse, bits 1-0 forward.
+ */
+typedef char obinuc,*pobinuc;
+
+
+/* SET_xxx / UNSET_xxx return x with the flag set / cleared */
+#define SET_SEQUSED(x) ((char)((x) | B8(10000000)))
+#define SET_ENDOFREAD(x) ((char)((x) | B8(01000000)))
+#define SET_ZERO(x) ((char)((x) | B8(00100000)))
+#define SET_DIRECTION(x) ((char)((x) | B8(00010000)))
+
+#define UNSET_SEQUSED(x) ((char)((x) & B8(01111111)))
+#define UNSET_ENDOFREAD(x) ((char)((x) & B8(10111111)))
+#define UNSET_ZERO(x) ((char)((x) & B8(11011111)))
+#define UNSET_DIRECTION(x) ((char)((x) & B8(11101111)))
+
+
+/* return x with its 2-bit reverse / forward code replaced by val */
+#define SET_REVERSE(x,val) ((char)(((x) & B8(11110011)) | (((val) & B8(00000011)) << 2)))
+#define SET_FORWARD(x,val) ((char)(((x) & B8(11111100)) | (((val) & B8(00000011)))))
+
+/* extract one field of x: 0 or 1 for the flags, 0 to 3 for the codes */
+#define GET_SEQUSED(x) ((char)(((x) >> 7) & 1))
+#define GET_ENDOFREAD(x) ((char)(((x) >> 6) & 1))
+#define GET_ZERO(x) ((char)(((x) >> 5) & 1))
+#define GET_DIRECTION(x) ((char)(((x) >> 4) & 1))
+#define GET_REVERSE(x) ((char)(((x) >> 2) & B8(00000011)))
+#define GET_FORWARD(x) ((char)(x) & B8(00000011))
+/* letter of the forward code of x */
+#define DECODE_NUC(x) (*("acgt" + GET_FORWARD(x)))
+/* letter of the forward (d == 0) or reverse (d != 0) code of x; the zero
+flag adds 4 to the index, which selects 'n' when the code bits are 0.
+The argument d is parenthesised so that any expression can be passed. */
+#define DECODE_NUC_FR(x,d) (*("acgtn" + ((((d)==0) ? GET_FORWARD(x):GET_REVERSE(x)) | (GET_ZERO(x) << 2))))
+
+/*
+ * Encoded nucleotides. A, C, G and T hold the letter code in the forward
+ * field (bits 1-0) and the code 3 - forward in the reverse field
+ * (bits 3-2); N only has the zero flag set.
+ */
+#define A B8(00001100)
+#define C B8(00001001)
+#define G B8(00000110)
+#define T B8(00000011)
+#define N B8(00100000)
+
+
+
diff --git a/src/obitools/word/_readindex.pyx b/src/obitools/word/_readindex.pyx
new file mode 100644
index 0000000..48afcf1
--- /dev/null
+++ b/src/obitools/word/_readindex.pyx
@@ -0,0 +1,805 @@
+from libc.stdlib cimport free
+from libc.stdlib cimport malloc,realloc
+from libc.stdio cimport fopen,fclose,fread,fwrite,FILE
+from libc.string cimport strlen
+from cpython.bytes cimport PyBytes_FromString
+from cpython.bytes cimport PyBytes_FromStringAndSize
+
+import sys
+
+from threading import Lock
+from cPickle import dumps,loads
+
+from obitools._obitools import NucSequence
+from turtle import Tbuffer
+
+cdef extern from "_readindex.h":
+ ctypedef char obinuc
+ ctypedef obinuc* pobinuc
+
+ obinuc SET_SEQUSED(char x)
+ obinuc SET_ENDOFREAD(char x)
+ obinuc SET_ZERO(char x)
+ obinuc SET_DIRECTION(char x)
+
+ obinuc UNSET_SEQUSED(char x)
+ obinuc UNSET_ENDOFREAD(char x)
+ obinuc UNSET_ZERO(char x)
+ obinuc UNSET_DIRECTION(char x)
+
+
+ obinuc SET_REVERSE(char x, unsigned int val)
+ obinuc SET_FORWARD(char x, unsigned int val)
+
+ unsigned int GET_SEQUSED(obinuc x)
+ unsigned int GET_ENDOFREAD(obinuc x)
+ unsigned int GET_ZERO(obinuc x)
+ unsigned int GET_DIRECTION(obinuc x)
+ unsigned int GET_REVERSE(obinuc x)
+ unsigned int GET_FORWARD(obinuc x)
+
+ char DECODE_NUC(obinuc x)
+ char DECODE_NUC_FR(obinuc x, unsigned int d)
+
+ enum:
+ A
+ enum:
+ C
+ enum:
+ G
+ enum:
+ T
+ enum:
+ N
+
+cdef extern from *:
+ ctypedef void* pconstvoid "const void*"
+
+cdef extern from "stdlib.h":
+ void heapsort(void *base, size_t nel, size_t width, int (*compar)(pconstvoid, pconstvoid))
+ void qsort(void *base, size_t nel, size_t width, int (*compar)(pconstvoid, pconstvoid))
+ void* bsearch(pconstvoid key, pconstvoid base, size_t nel, size_t width, int (*compar) (pconstvoid, pconstvoid))
+
+cdef obinuc encodeobinuc(char nuc):
+    # Convert a letter to its obinuc code; bit 5 of the letter is cleared
+    # first, so lower and upper case letters give the same code. Any
+    # letter other than A, C, G or T gives N.
+    cdef char letter = nuc & 0b11011111
+
+    if letter=='A':
+        return A
+    if letter=='C':
+        return C
+    if letter=='G':
+        return G
+    if letter=='T':
+        return T
+    return N
+
+cdef int hashword(pobinuc word, int lkey, int lword):
+    # Pack the first min(lkey, lword) letters of a word on 2 bits each.
+    # When the direction flag of the first nucleotide is set, the word is
+    # read backward from its last nucleotide using the reverse codes.
+    cdef int key=0
+    cdef int k
+    cdef bint forward = GET_DIRECTION(word[0])==0
+    cdef int stride = 1 if forward else -1
+    cdef pobinuc cursor = word
+
+    if lkey > lword:
+        lkey=lword
+
+    if not forward:
+        cursor+=lword-1
+
+    for k in range(lkey):
+        if forward:
+            key = (key << 2) | GET_FORWARD(cursor[0])
+        else:
+            key = (key << 2) | GET_REVERSE(cursor[0])
+        cursor+=stride
+
+    return key
+
+
+
+cdef void encode_direction(pobinuc word, int lword):
+    # Set or clear the direction flag of the first nucleotide of a word.
+    # The forward codes read from the start are compared with the reverse
+    # codes read from the end; the flag is set when, at the first
+    # difference, the forward code is the greater one.
+    cdef pobinuc bnuc = word
+    cdef pobinuc enuc = word + lword - 1
+
+    while (bnuc < enuc and GET_FORWARD(bnuc[0])==GET_REVERSE(enuc[0])):
+        bnuc+=1
+        enuc-=1
+
+    if GET_FORWARD(bnuc[0]) > GET_REVERSE(enuc[0]):
+        word[0]=SET_DIRECTION(word[0])
+    else:
+        word[0]=UNSET_DIRECTION(word[0])
+
+
+cdef bytes decodeword(pobinuc w, int lword):
+    # Convert lword nucleotides starting at w to a bytes string, following
+    # the direction flag of the first one: forward letters from the start
+    # when it is 0, reverse letters from the end when it is 1.
+    # The local buffer holds 1024 chars: lword must not exceed 1023.
+    cdef char[1024] cword
+    cdef int reverse = GET_DIRECTION(w[0])
+    cdef int stride = 1
+    cdef int pos = 0
+    cdef pobinuc cursor = w
+
+    if reverse==1:
+        stride=-1
+        cursor+= lword - 1
+
+    while pos < lword:
+        cword[pos]=DECODE_NUC_FR(cursor[0],reverse)
+        cursor+=stride
+        pos+=1
+
+    cword[lword]=0
+    return PyBytes_FromStringAndSize(cword,lword)
+
+cpdef minword(bytes word):
+    '''
+    Encode a word, select its reading direction with encode_direction and
+    return the word decoded in that direction.
+
+    @param word: the word, at most 1023 letters long
+    @type word: bytes
+
+    @return: the decoded word (the empty string for an empty word)
+    @rtype: bytes
+
+    @raise ValueError: if the word is longer than 1023 letters
+    '''
+    cdef obinuc[1024] nword
+    cdef char* cword=word
+    cdef int lword=len(word)
+    cdef int i
+
+    # the words are processed in fixed 1024-entry buffers
+    if lword > 1023:
+        raise ValueError("word cannot be longer than 1023 letters")
+    # nothing to encode, and encode_direction needs at least one letter
+    if lword == 0:
+        return b''
+
+    for i in range(lword):
+        nword[i]=encodeobinuc(cword[i])
+
+    encode_direction(nword,lword)
+
+    return decodeword(nword,lword)
+
+
+# Length of the words compared by cmpwords. The sort callback cannot
+# receive it as an argument, so it is passed through this global, which
+# must only be changed while holding cmpwordlengthLock.
+cdef object cmpwordlengthLock=Lock()
+cdef int cmpwordlength=0
+
+cdef int cmpwords(pconstvoid pw1, pconstvoid pw2):
+    # Comparison callback for sorting an array of word pointers.
+    # Each word is read on cmpwordlength nucleotides in the direction
+    # given by the direction flag of its first nucleotide (forward codes
+    # from the start, or reverse codes from the end).
+    # Returns -1, 0 or 1.
+    cdef pobinuc w1=(<pobinuc*>pw1)[0]
+    cdef pobinuc w2=(<pobinuc*>pw2)[0]
+    cdef int dir1=1
+    cdef int dir2=1
+    cdef int d1=GET_DIRECTION(w1[0])
+    cdef int d2=GET_DIRECTION(w2[0])
+    cdef int i
+    cdef int n1
+    cdef int n2
+
+    global cmpwordlength
+
+    if d1==1:
+        dir1=-1
+        w1+=cmpwordlength-1
+
+    if d2==1:
+        dir2=-1
+        w2+=cmpwordlength-1
+
+    # Each position is read just before being compared, so no nucleotide
+    # outside of the two words is ever accessed.
+    for i in range(cmpwordlength):
+        n1 = GET_FORWARD(w1[0]) if d1==0 else GET_REVERSE(w1[0])
+        n2 = GET_FORWARD(w2[0]) if d2==0 else GET_REVERSE(w2[0])
+        if n1 < n2:
+            return -1
+        if n1 > n2:
+            return 1
+        w1+=dir1
+        w2+=dir2
+
+    return 0
+
+
+cdef class ReadIndex:
+
+ # value returned by len()
+ cdef int _size
+ # read length given to the constructor
+ cdef int _readsize
+ cdef int _chuncksize
+ # (_readsize + 1) * 2, see __init__
+ cdef int _seqsize
+ # size in bytes of _buffer
+ cdef long long _buffer_size
+ # encoded nucleotides
+ cdef pobinuc _buffer
+ # offset in _buffer of the end of the indexed data
+ cdef long long _endofreads
+ cdef list _ids
+ # pointers into _buffer to the start of each indexed word
+ cdef pobinuc* _wordlist
+ cdef long long _wordlist_size
+ cdef int _wordlength
+ # first position in _wordlist of each word prefix key, -1 if none
+ cdef int[4096] _index
+ # number of letters used to compute the keys of _index
+ cdef int _lindex
+ # address of the module level cmpwordlength variable
+ cdef int* _globalwordlength
+
+ def __init__(self, int readsize=-1, int chuncksize=1000000):
+     """
+     Create an empty index.
+
+     @param readsize: the read length, which must be lower than 1024
+     @param chuncksize: stored in the _chuncksize attribute
+     """
+     cdef int slot = 0
+     global cmpwordlength
+
+     assert readsize < 1024,"You cannot use reads longer than 1023 base pair"
+
+     # sizes
+     self._readsize=readsize
+     self._chuncksize=chuncksize
+     self._seqsize=(readsize+1)*2
+
+     # sequences: nothing stored yet
+     self._buffer=NULL
+     self._buffer_size=0
+     self._endofreads=0
+     self._size=0
+     self._ids=[]
+
+     # word index: not built yet
+     self._wordlist=NULL
+     self._wordlist_size=0
+     self._wordlength=0
+     self._lindex=0
+     while slot < 4096:
+         self._index[slot]=-1
+         slot+=1
+
+     self._globalwordlength=&cmpwordlength
+
+ def __dealloc__(self):
+     # Called when the extension type instance is deallocated: release
+     # the C buffers. The pointers are reset so that they are never freed
+     # twice.
+     if self._buffer != NULL:
+         free(self._buffer)
+         self._buffer = NULL
+     if self._wordlist != NULL:
+         free(self._wordlist)
+         self._wordlist = NULL
+
+ def __del__(self):
+     # Kept for the callers of the original method; safe to call several
+     # times and together with __dealloc__.
+     if self._buffer != NULL:
+         free(self._buffer)
+         self._buffer = NULL
+     if self._wordlist != NULL:
+         free(self._wordlist)
+         self._wordlist = NULL
+
+
+ def __len__(self):
+     """Return the value of the _size attribute."""
+     cdef int count = self._size
+     return count
+
+ def save(self,bytes filename, bint verbose=False):
+     """
+     Write the index to a binary file.
+
+     The file contains, in this order: the header fields (_size,
+     _readsize, _seqsize, _buffer_size, the address of _buffer,
+     _endofreads, _wordlist_size, _wordlength, _lindex), the sequence
+     buffer, the word pointers, then the size of the pickled identifier
+     list followed by the pickled list itself. The address of the buffer
+     is saved because the word pointers are written as raw addresses.
+
+     @param filename: name of the file to write
+     @param verbose: if True, progress messages are printed on stderr
+
+     @raise AssertionError: if the file cannot be opened or written
+     """
+     cdef char* cfile=filename
+     cdef FILE *f = fopen(cfile,'wb')
+     cdef size_t transfered
+     cdef size_t ltbuffer
+     cdef bytes tbuffer
+     cdef char* tcbuffer
+
+     assert f!=NULL,"cannot open file %s" % filename
+
+     # the file is closed whatever happens below
+     try:
+         if verbose:
+             print >>sys.stderr,"Writing header..."
+
+         transfered = fwrite(&(self._size),sizeof(int),1,f)
+         assert transfered==1,"Error during size writing"
+
+         transfered = fwrite(&(self._readsize),sizeof(int),1,f)
+         assert transfered==1,"Error during readsize writing"
+
+         transfered = fwrite(&(self._seqsize),sizeof(int),1,f)
+         assert transfered==1,"Error during seqsize writing"
+
+         transfered = fwrite(&(self._buffer_size),sizeof(long long),1,f)
+         assert transfered==1,"Error during buffer size writing"
+
+         transfered = fwrite(&(self._buffer),sizeof(pobinuc),1,f)
+         assert transfered==1,"Error during buffer address writing"
+
+         if verbose:
+             print >>sys.stderr," end of reads is %d" % self._endofreads
+         transfered = fwrite(&(self._endofreads),sizeof(long long),1,f)
+         assert transfered==1,"Error during endofread writing"
+
+         transfered = fwrite(&(self._wordlist_size),sizeof(long long),1,f)
+         assert transfered==1,"Error during wordlist size writing"
+
+         transfered = fwrite(&(self._wordlength),sizeof(int),1,f)
+         assert transfered==1,"Error during word length writing"
+
+         transfered = fwrite(&(self._lindex),sizeof(int),1,f)
+         assert transfered==1,"Error during lindex writing"
+
+         if verbose:
+             print >>sys.stderr,"Writing sequences..."
+
+         transfered = fwrite(self._buffer,1,self._buffer_size,f)
+         assert transfered==<size_t>self._buffer_size,"Error during sequences writing"
+
+         if verbose:
+             print >>sys.stderr,"Writing %d words index..." % self._wordlist_size
+
+         transfered = fwrite(self._wordlist,sizeof(pobinuc),self._wordlist_size,f)
+         assert transfered==<size_t>self._wordlist_size,"Error during word index writing"
+
+         if verbose:
+             print >>sys.stderr,"Writing sequence identifiers..."
+
+         tbuffer=dumps(self._ids)
+         tcbuffer=tbuffer
+         # the length of the bytes object, not strlen: pickled data is
+         # not a C string
+         ltbuffer=len(tbuffer)
+         if verbose:
+             print >>sys.stderr," identifier size = %d" % ltbuffer
+         transfered = fwrite(&ltbuffer,sizeof(size_t),1,f)
+         assert transfered==1,"Error during identifier size writing"
+         transfered = fwrite(tcbuffer,1,ltbuffer,f)
+         assert transfered==ltbuffer,"Error during identifiers writing"
+
+         if verbose:
+             print >>sys.stderr,"Save done"
+     finally:
+         fclose(f)
+
+     if verbose:
+         print >>sys.stderr,"File closed"
+
+ def load(self,bytes filename, bint verbose=False):
+     """
+     Read an index from a file written by save(), replacing the current
+     content of the object.
+
+     The word pointers are stored in the file as raw addresses: they are
+     shifted by the difference between the new buffer address and the
+     address saved in the header. The prefix table is then rebuilt.
+
+     NOTE: the sequence identifiers are unpickled from the file; only
+     load index files coming from a trusted source.
+
+     @param filename: name of the file to read
+     @param verbose: if True, progress messages are printed on stderr
+
+     @raise AssertionError: if the file cannot be opened or is truncated
+     @raise MemoryError: if a buffer cannot be allocated
+     """
+     cdef char* cfile=filename
+     cdef FILE *f = fopen(cfile,'rb')
+     cdef pobinuc oldbuf
+     cdef size_t transfered
+     cdef size_t ltbuffer
+     cdef char* tcbuffer
+     cdef long long i
+     cdef int k
+
+     assert f!=NULL,"cannot open file %s" % filename
+
+     # the file is closed exactly once, whatever happens below
+     try:
+         if verbose:
+             print >>sys.stderr,"Reading header..."
+
+         transfered = fread(&(self._size),sizeof(int),1,f)
+         assert transfered==1,"Error during size reading"
+         if verbose:
+             print >>sys.stderr," index contains %d sequence pairs" % self._size
+
+         transfered = fread(&(self._readsize),sizeof(int),1,f)
+         assert transfered==1,"Error during read size reading"
+         if verbose:
+             print >>sys.stderr," read size is %d pb" % self._readsize
+
+         transfered = fread(&(self._seqsize),sizeof(int),1,f)
+         assert transfered==1,"Error during seqsize reading"
+         if verbose:
+             print >>sys.stderr," sequence size is %d bytes" % self._seqsize
+
+         transfered = fread(&(self._buffer_size),sizeof(long long),1,f)
+         assert transfered==1,"Error during buffer size reading"
+         if verbose:
+             print >>sys.stderr," buffer size is %d bytes" % self._buffer_size
+
+         transfered = fread(&(oldbuf),sizeof(pobinuc),1,f)
+         assert transfered==1,"Error during buffer address reading"
+         transfered = fread(&(self._endofreads),sizeof(long long),1,f)
+         assert transfered==1,"Error during endofread reading"
+         if verbose:
+             print >>sys.stderr," end of reads is %d" % self._endofreads
+
+         transfered = fread(&(self._wordlist_size),sizeof(long long),1,f)
+         assert transfered==1,"Error during word list size reading"
+         if verbose:
+             print >>sys.stderr," index contains %d words" % self._wordlist_size
+
+         transfered = fread(&(self._wordlength),sizeof(int),1,f)
+         assert transfered==1,"Error during word length reading"
+
+         transfered = fread(&(self._lindex),sizeof(int),1,f)
+         assert transfered==1,"Error during lindex reading"
+
+         if verbose:
+             print >>sys.stderr,"Reading sequences..."
+
+         if (self._buffer!=NULL):
+             free(self._buffer)
+             self._buffer=NULL
+
+         self._buffer=<pobinuc>malloc(self._buffer_size)
+         if self._buffer==NULL and self._buffer_size > 0:
+             raise MemoryError("cannot allocate the sequence buffer")
+
+         transfered = fread(self._buffer,1,self._buffer_size,f)
+         assert transfered==<size_t>self._buffer_size,"Error during sequences reading"
+
+         if verbose:
+             print >>sys.stderr,"Reading %d words index..." % self._wordlist_size
+
+         if (self._wordlist!=NULL):
+             free(self._wordlist)
+             self._wordlist=NULL
+
+         self._wordlist = <pobinuc *>malloc(self._wordlist_size * sizeof(pobinuc))
+         if self._wordlist==NULL and self._wordlist_size > 0:
+             raise MemoryError("cannot allocate the word index")
+
+         transfered = fread(self._wordlist,sizeof(pobinuc),self._wordlist_size,f)
+         assert transfered==<size_t>self._wordlist_size,"Error during word index reading"
+
+         if verbose:
+             print >>sys.stderr,"Patching word index..."
+
+         # relocate the saved addresses into the newly allocated buffer
+         for i in range(self._wordlist_size):
+             self._wordlist[i]+= (self._buffer - oldbuf)
+
+         self._ids=[]
+
+         if verbose:
+             print >>sys.stderr,"Reading sequence ids..."
+
+         transfered = fread(&ltbuffer,sizeof(size_t),1,f)
+         assert transfered==1,"Error during identifier size reading"
+         if verbose:
+             print >>sys.stderr," identifier size = %d" % ltbuffer
+         tcbuffer = <char*>malloc(ltbuffer)
+         if tcbuffer==NULL:
+             raise MemoryError("cannot allocate the identifier buffer")
+         try:
+             transfered = fread(tcbuffer,1,ltbuffer,f)
+             assert transfered==ltbuffer,"Error during identifiers reading"
+             self._ids=loads(PyBytes_FromStringAndSize(tcbuffer,ltbuffer))
+         finally:
+             free(tcbuffer)
+     finally:
+         fclose(f)
+
+     self._lindex=6 if self._wordlength >=6 else self._wordlength
+
+     if verbose:
+         print >>sys.stderr,"Hashing word prefix..."
+
+     for i in range(4096):
+         self._index[i]=-1
+
+     # record the first position of each prefix key in the word list
+     for i in range(self._wordlist_size):
+         k = hashword(self._wordlist[i],self._lindex,self._wordlength)
+         if self._index[k]==-1:
+             self._index[k]=i
+
+
+ # Build the sorted list of the words of length lword found in the
+ # buffer, then the table giving the first position of each word prefix.
+ #   lword:   the word length, at most _readsize
+ #   verbose: if True, progress messages are printed on stderr
+ # NOTE(review): the original indentation was lost in this copy; the
+ # comments assume that only the three statements following
+ # `if error==0:` belong to that test - confirm upstream.
+ def indexWords(self,int lword,bint verbose=False):
+ # number of nucleotides of the current window having the zero flag set
+ cdef int error=0
+ # first nucleotide of the current window
+ cdef pobinuc sword=self._buffer
+ # first nucleotide after the current window
+ cdef pobinuc eword=sword
+ cdef pobinuc endbuff = self._buffer + self._endofreads
+ cdef int i=0
+ cdef int k=0
+ # upper bound of the number of words, used to size the pointer array
+ cdef int maxwords = (self._readsize - lword + 1) * self._size * 2
+
+ assert sword != NULL,"Cannot index empty ReadIndex"
+ assert lword <= self._readsize,"words cannot be longer than reads"
+
+ if verbose:
+ print >>sys.stderr,"Indexing words from %d sequences..." % len(self)
+
+ if self._wordlist!=NULL:
+ free(self._wordlist)
+
+ self._wordlist = <pobinuc*>malloc(maxwords * sizeof(pobinuc*))
+
+ # initialise the window on the first lword nucleotides
+ for i in range(lword):
+ error+=GET_ZERO(eword[0])
+ eword+=1
+
+ i=0
+
+
+ # slide the window one nucleotide at a time; a window is recorded,
+ # and its direction flag computed, when none of its nucleotides has
+ # the zero flag set
+ while (eword < endbuff):
+ if error==0:
+ self._wordlist[i]=sword
+ encode_direction(sword,lword)
+ i+=1
+
+ error-=GET_ZERO(sword[0])
+ error+=GET_ZERO(eword[0])
+
+ sword+=1
+ eword+=1
+
+
+ # shrink the array to the number of words actually recorded
+ self._wordlist = <pobinuc*>realloc(self._wordlist, i * sizeof(pobinuc*))
+ self._wordlist_size=i
+ self._wordlength=lword
+
+ if verbose:
+ print >>sys.stderr,"Sorting %d words..." % i
+
+ # cmpwords reads the word length from the global variable that
+ # _globalwordlength points to (see __init__), hence the lock
+ cmpwordlengthLock.acquire()
+ self._globalwordlength[0]=lword
+ heapsort(self._wordlist,i,sizeof(pobinuc),cmpwords)
+ cmpwordlengthLock.release()
+
+ # keys are computed on at most 6 letters
+ self._lindex=6 if lword >=6 else lword
+
+ if verbose:
+ print >>sys.stderr,"Hashing word prefix..."
+
+ for i in range(4096):
+ self._index[i]=-1
+
+ # record the first position of each prefix key in the sorted list
+ for i in range(self._wordlist_size):
+ k = hashword(self._wordlist[i],self._lindex,lword)
+ if self._index[k]==-1:
+ self._index[k]=i
+ #print k,i
+
+
+ def itermarkedpairs(self):
+ """Yield the read pair of every record whose two reads both have GET_SEQUSED == 1."""
+ cdef size_t i
+ cdef pobinuc start1
+ cdef pobinuc start2
+
+ for i in range(self._size):
+ # A record spans _seqsize cells: first read at its start, second read at its midpoint.
+ start1=self._buffer+ i * self._seqsize
+ start2=start1 + self._seqsize / 2
+ if GET_SEQUSED(start1[0])==1 and GET_SEQUSED(start2[0])==1:
+ yield self.getSeqPairAt(start1,False)
+
+ def itermarkedsingleton(self):
+ """Yield the flagged read of every record where exactly one of the two reads has GET_SEQUSED == 1."""
+ cdef size_t i
+ cdef pobinuc start1
+ cdef pobinuc start2
+
+ for i in range(self._size):
+ # A record spans _seqsize cells: first read at its start, second read at its midpoint.
+ start1=self._buffer+ i * self._seqsize
+ start2=start1 + self._seqsize / 2
+ # Exclusive or: one read flagged, but not both.
+ if (GET_SEQUSED(start1[0])==1 or GET_SEQUSED(start2[0])==1) \
+ and not (GET_SEQUSED(start1[0])==1 and GET_SEQUSED(start2[0])==1):
+ if GET_SEQUSED(start1[0])==1:
+ yield self.getSeqAt(start1,False)
+ else:
+ yield self.getSeqAt(start2,False)
+
+ def iterreads(self,bytes word):
+     """Yield every not-yet-used read that contains `word`.
+
+     `word` must have exactly self._wordlength characters (asserted). The word
+     is encoded, its prefix hash selects a slice of the sorted word list, a
+     binary search locates one matching entry, and all neighbouring entries
+     that compare equal are visited. Each matching read is fetched with
+     getSeqAt(..., True), which flags it as used and returns None for reads
+     that were already flagged.
+
+     cmpwordlengthLock is released around every yield so that the consumer
+     can use other locked operations while the generator is suspended.
+     """
+     cdef obinuc nword[1024]
+     cdef pobinuc pnword=nword
+     cdef pobinuc* ppnword=&pnword
+     cdef pobinuc* found
+     cdef char* cword=word
+     cdef int i
+     cdef int lword=self._wordlength
+     cdef int k
+     cdef int nk=1 << (2*self._lindex)
+     cdef long long wstart
+     cdef long long wend
+     cdef long long wpoint
+
+     assert len(word) == lword
+
+     for i in range(lword):
+         nword[i]=encodeobinuc(cword[i])
+
+     encode_direction(nword,lword)
+     k=hashword(nword,self._lindex,lword)
+
+     wstart=self._index[k]
+
+     # No indexed word shares this prefix. A plain return ends the generator;
+     # raising StopIteration here is turned into RuntimeError by PEP 479.
+     if wstart==-1:
+         return
+
+     # The slice ends where the next populated hash bucket starts.
+     k+=1
+
+     while (k < nk and self._index[k]==-1):
+         k+=1
+
+     if k==nk:
+         wend=self._wordlist_size
+     else:
+         wend=self._index[k]
+
+     # cmpwords reads the word length from self._globalwordlength, which must
+     # only be written while cmpwordlengthLock is held.
+     cmpwordlengthLock.acquire()
+     self._globalwordlength[0]=lword
+
+     found = <pobinuc*>bsearch(ppnword,self._wordlist+wstart,wend-wstart,sizeof(pobinuc),cmpwords)
+
+     if found==NULL:
+         cmpwordlengthLock.release()
+         return
+
+     wpoint = found - self._wordlist
+
+     # Scan downwards from the hit, index 0 included.
+     wstart = wpoint
+     while (wpoint >= 0 and cmpwords(ppnword,self._wordlist+wpoint)==0):
+         s=self.getSeqAt(self._wordlist[wpoint],True)
+         if s is not None:
+             cmpwordlengthLock.release()
+             yield s
+             cmpwordlengthLock.acquire()
+             self._globalwordlength[0]=lword
+         wpoint-=1
+
+     # Scan upwards starting just above the hit. The original restarted from
+     # the position where the downward scan stopped, which is a mismatch
+     # unless the scan reached index 0, so matches above the hit were skipped.
+     wpoint = wstart+1
+     while (wpoint < self._wordlist_size and cmpwords(ppnword,self._wordlist+wpoint)==0):
+         s=self.getSeqAt(self._wordlist[wpoint],True)
+         if s is not None:
+             cmpwordlengthLock.release()
+             yield s
+             cmpwordlengthLock.acquire()
+             self._globalwordlength[0]=lword
+         wpoint+=1
+
+     cmpwordlengthLock.release()
+
+
+
+ def iterwords(self):
+ """Yield each indexed word, decoded with decodeword, in word-list order.
+
+ Asserts that a word list exists, i.e. that the words were indexed first.
+ """
+ cdef int i
+
+ assert self._wordlist != NULL,'You must index words'
+
+ for i in range(self._wordlist_size):
+ yield decodeword(self._wordlist[i],self._wordlength)
+
+
+
+
+ def add(self,sequence):
+ """Append a pair of reads to the buffer.
+
+ sequence -- indexable pair; sequence[0] and sequence[1] are converted
+ with bytes() and sequence[0].id (minus its last two characters)
+ is stored as the identifier of the pair.
+
+ The first call fixes self._readsize from len(sequence[0]); later calls
+ assert that both reads fit in that size.
+ """
+ cdef bytes bseq
+ cdef char* seq
+
+ if self._readsize<0:
+ # First pair: a record holds two reads of _readsize cells plus one end cell each.
+ self._readsize=len(sequence[0])
+ self._seqsize=(self._readsize+1)*2
+
+ assert self._readsize < 1024,"You cannot use reads longer than 1023 base pair"
+ # NOTE(review): len(sequence[1]) is not checked on this first call.
+
+ else:
+ assert len(sequence[0]) <= self._readsize and len(sequence[1]) <= self._readsize
+
+ if self._buffer==NULL:
+ # NOTE(review): sizes are passed to malloc as bytes but used as cell counts,
+ # which assumes one byte per obinuc cell - confirm.
+ self._buffer = <pobinuc>malloc(self._seqsize*self._chuncksize)
+ self._buffer_size=self._seqsize*self._chuncksize
+ self._endofreads=0
+
+ # Grow the buffer by one chunk when the next record would not fit.
+ if self._endofreads + self._seqsize >= self._buffer_size:
+ self._buffer_size+=self._seqsize*self._chuncksize
+ self._buffer = <pobinuc> realloc(<void*>self._buffer,self._buffer_size)
+
+ self._ids.append(sequence[0].id[0:-2])
+
+ bseq = bytes(sequence[0])
+ seq = bseq
+ l=0
+
+ # Encode the first read up to its terminating NUL byte.
+ while seq[0]!=0:
+ self._buffer[self._endofreads]=encodeobinuc(seq[0])
+ self._endofreads+=1
+ seq+=1
+ l+=1
+
+ # Pad with end-of-read cells so the read occupies exactly _readsize+1 cells.
+ while l<=self._readsize:
+ self._buffer[self._endofreads]=SET_ENDOFREAD(N)
+ self._endofreads+=1
+ l+=1
+
+
+ bseq = bytes(sequence[1])
+ seq = bseq
+ l=0
+
+ # Same encoding and padding for the second read.
+ while seq[0]!=0:
+ self._buffer[self._endofreads]=encodeobinuc(seq[0])
+ self._endofreads+=1
+ seq+=1
+ l+=1
+
+ while l<=self._readsize:
+ self._buffer[self._endofreads]=SET_ENDOFREAD(N)
+ self._endofreads+=1
+ l+=1
+
+ self._size+=1
+
+ cdef object getSeqAt(self,pobinuc word,bint lock=False):
+ # Return, as a NucSequence, the read that contains the buffer position `word`.
+ #
+ # word -- pointer into self._buffer
+ # lock -- when true, return None if the read already has GET_SEQUSED == 1,
+ # otherwise flag the read with SET_SEQUSED before decoding it
+ #
+ # The sequence id is the stored pair id followed by "/1" or "/2".
+ cdef long long delta
+ cdef pobinuc start1
+ cdef pobinuc start2
+ cdef char[1024] cseqf
+ cdef char[1024] cseqr
+ cdef char* pseq
+ cdef bytes bseqf
+ cdef bytes bseqr
+ cdef bytes n=b"/1"
+
+ # Record number = offset of `word` from the buffer start divided by the record size.
+ # NOTE(review): the offset is computed on void pointers, which presumably
+ # assumes one byte per cell - confirm.
+ delta = <void*>word - <void*>self._buffer
+ delta/= self._seqsize
+
+ start1=self._buffer+ delta * self._seqsize
+ start2=start1 + self._seqsize / 2
+
+ # `word` lies in the second half of the record: it belongs to the second read.
+ if word >= start2:
+ start1=start2
+ n=b"/2"
+
+ if lock:
+ if GET_SEQUSED(start1[0])==1:
+ return None
+ else:
+ start1[0]=SET_SEQUSED(start1[0])
+
+ pseq = cseqf
+
+ # Decode cells until the end-of-read marker.
+ while (GET_ENDOFREAD(start1[0])==0):
+ pseq[0]=DECODE_NUC(start1[0])
+ start1+=1
+ pseq+=1
+
+ pseq[0]=0
+
+ bseqf = PyBytes_FromString(cseqf)
+
+ return NucSequence(self._ids[delta]+n,bseqf)
+
+ cdef object getSeqPairAt(self,pobinuc word,bint lock=False):
+ # Return both reads of the record that contains the buffer position `word`
+ # as a tuple of two NucSequence objects with ids suffixed '/1' and '/2'.
+ #
+ # lock -- when true, return (None, None) if the first read already has
+ # GET_SEQUSED == 1, otherwise flag both reads with SET_SEQUSED.
+ # Only the first read's flag is tested.
+ cdef long long delta
+ cdef pobinuc start1
+ cdef pobinuc start2
+ cdef char[1024] cseqf
+ cdef char[1024] cseqr
+ cdef char* pseq
+ cdef bytes bseqf
+ cdef bytes bseqr
+
+ # Record number = offset of `word` from the buffer start divided by the record size.
+ delta = <void*>word - <void*>self._buffer
+ delta/= self._seqsize
+
+ start1=self._buffer+ delta * self._seqsize
+ start2=start1 + self._seqsize / 2
+
+ if lock:
+ if GET_SEQUSED(start1[0])==1:
+ return None,None
+ else:
+ start1[0]=SET_SEQUSED(start1[0])
+ start2[0]=SET_SEQUSED(start2[0])
+
+ pseq = cseqf
+
+ # Decode the first read up to its end-of-read marker.
+ while (GET_ENDOFREAD(start1[0])==0):
+ pseq[0]=DECODE_NUC(start1[0])
+ start1+=1
+ pseq+=1
+
+ pseq[0]=0
+
+ bseqf = PyBytes_FromString(cseqf)
+
+ pseq = cseqr
+
+ # Decode the second read the same way.
+ while (GET_ENDOFREAD(start2[0])==0):
+ pseq[0]=DECODE_NUC(start2[0])
+ start2+=1
+ pseq+=1
+
+ pseq[0]=0
+
+ bseqr = PyBytes_FromString(cseqr)
+
+ return NucSequence(self._ids[delta]+'/1',bseqf),NucSequence(self._ids[delta]+'/2',bseqr)
+
+ def __getitem__(self,int index):
+ """Return the first read of record `index`; negative indexes count from the end.
+
+ Raises IndexError when `index` is outside [-self._size, self._size).
+ """
+
+ if index >= self._size:
+ raise IndexError(index)
+
+ if index < 0:
+ index+=self._size
+
+ if index < 0:
+ raise IndexError(index)
+
+ # The pointer targets the start of the record, so getSeqAt decodes its first read.
+ return self.getSeqAt(self._buffer + index * self._seqsize)
+
+
+
+
diff --git a/src/obitools/word/options.py b/src/obitools/word/options.py
new file mode 100644
index 0000000..f67a757
--- /dev/null
+++ b/src/obitools/word/options.py
@@ -0,0 +1,117 @@
+from obitools.word import wordSelector
+from obitools.word import allDNAWordIterator,encodeWord
+from obitools.word import predicate
+
+
+
+
+def _acceptedOptionCallback(options,opt,value,parser):
+ if not hasattr(parser.values, 'acceptedOligo'):
+ parser.values.acceptedOligo=[]
+ parser.values.acceptedOligo.append(predicate.predicateMatchPattern(value,))
+
+def _rejectedOptionCallback(options,opt,value,parser):
+ if not hasattr(parser.values, 'rejectedOligo'):
+ parser.values.rejectedOligo=[]
+ parser.values.rejectedOligo.append(predicate.predicateMatchPattern(value))
+
+
+
+def addOligoOptions(optionManager):
+
+ optionManager.add_option('-L','--oligo-list',
+ action="store", dest="oligoList",
+ metavar="<filename>",
+ type="str",
+ help="filename containing a list of oligonucleotide")
+
+
+ optionManager.add_option('-s','--oligo-size',
+ action="store", dest="oligoSize",
+ metavar="<###>",
+ type="int",
+ help="Size of oligonucleotide to generate")
+
+ optionManager.add_option('-f','--family-size',
+ action="store", dest="familySize",
+ metavar="<###>",
+ type="int",
+ help="Size of oligonucleotide family to generate")
+
+ optionManager.add_option('-d','--distance',
+ action="store", dest="oligoDist",
+ metavar="<###>",
+ type="int",
+ default=1,
+ help="minimal distance between two oligonucleotides")
+
+ optionManager.add_option('-g','--gc-max',
+ action="store", dest="gcMax",
+ metavar="<###>",
+ type="int",
+ default=0,
+ help="maximum count of G or C nucleotide acceptable in a word")
+
+ optionManager.add_option('-a','--accepted',
+ action="append",dest="acceptedPattern",
+ metavar="<regular pattern>",
+ default=[],
+ type="str",
+ help="pattern of accepted oligonucleotide")
+
+ optionManager.add_option('-r','--rejected',
+ action="append",dest="rejectedPattern",
+ metavar="<regular pattern>",
+ default=[],
+ type="str",
+ help="pattern of rejected oligonucleotide")
+
+ optionManager.add_option('-p','--homopolymer',
+ action="store", dest="homopolymere",
+ metavar="<###>",
+ type="int",
+ default=0,
+ help="reject oligo with homopolymer longer than.")
+
+ optionManager.add_option('-P','--homopolymer-min',
+ action="store", dest="homopolymere_min",
+ metavar="<###>",
+ type="int",
+ default=0,
+ help="accept only oligo with homopolymer longer or equal to.")
+
+def dnaWordIterator(options):
+
+ assert options.oligoSize is not None or options.oligoList is not None,"option -s or --oligo-size must be specified"
+ assert options.familySize is not None,"option -f or --family-size must be specified"
+ assert options.oligoDist is not None,"option -d or --distance must be specified"
+
+ if options.oligoList is not None:
+ options.oligoSize=len(open(options.oligoList).next().strip())
+ words = (encodeWord(x.strip().lower()) for x in open(options.oligoList))
+ else:
+ words = allDNAWordIterator(options.oligoSize)
+ #seed = 'a' * options.oligoSize
+ options.acceptedOligo=[]
+ for p in options.acceptedPattern:
+ assert len(p)==options.oligoSize,"Accept pattern with bad lenth : %s" % p
+ options.acceptedOligo.append(predicate.predicateMatchPattern(p, options.oligoSize))
+
+ options.rejectedOligo=[]
+ for p in options.rejectedPattern:
+ assert len(p)==options.oligoSize,"Reject pattern with bad lenth : %s" % p
+ options.rejectedOligo.append(predicate.predicateMatchPattern(p, options.oligoSize))
+
+
+ #options.acceptedOligo.append(predicat.distMinGenerator(seed, options.oligoDist))
+
+ if options.homopolymere:
+ options.rejectedOligo.append(predicate.predicateHomoPolymerLarger(options.homopolymere, options.oligoSize))
+
+ if options.homopolymere_min:
+ options.acceptedOligo.append(predicate.predicateHomoPolymerLarger(options.homopolymere_min-1, options.oligoSize))
+
+ if options.gcMax:
+ options.rejectedOligo.append(predicate.predicateGCUpperBond(options.gcMax, options.oligoSize))
+
+ return wordSelector(words, options.acceptedOligo, options.rejectedOligo)
diff --git a/src/obitools/word/predicate.py b/src/obitools/word/predicate.py
new file mode 100644
index 0000000..082b80f
--- /dev/null
+++ b/src/obitools/word/predicate.py
@@ -0,0 +1,41 @@
+#@PydevCodeAnalysisIgnore
+'''
+Created on 14 oct. 2009
+
+ @author: coissac
+'''
+
+from _binary import wordDist, \
+ homoMax, \
+ countCG, \
+ matchPattern, \
+ encodePattern
+
+def predicateWordDistMin(word,dmin,size):
+ def predicate(w):
+ return wordDist(word, w) >= dmin
+ return predicate
+
+def predicateHomoPolymerLarger(count,size):
+ def predicate(w):
+ return homoMax(w, size) > count
+ return predicate
+
+def predicateHomoPolymerSmaller(count,size):
+ def predicate(w):
+ return homoMax(w, size) < count
+ return predicate
+
+def predicateGCUpperBond(count,size):
+ def predicate(w):
+ return countCG(w, size) > count
+ return predicate
+
+def predicateMatchPattern(pattern,size):
+ pattern=encodePattern(pattern)
+ def predicate(w):
+ return matchPattern(w, pattern)
+ return predicate
+
+
+
diff --git a/src/obitools/zipfile.py b/src/obitools/zipfile.py
new file mode 100644
index 0000000..41e4bcb
--- /dev/null
+++ b/src/obitools/zipfile.py
@@ -0,0 +1,1282 @@
+"""
+Read and write ZIP files.
+"""
+import struct, os, time, sys, shutil
+import binascii, cStringIO
+
+try:
+ import zlib # We may need its compression method
+ crc32 = zlib.crc32
+except ImportError:
+ zlib = None
+ crc32 = binascii.crc32
+
+__all__ = ["BadZipfile", "error", "ZIP_STORED", "ZIP_DEFLATED", "is_zipfile",
+ "ZipInfo", "ZipFile", "PyZipFile", "LargeZipFile" ]
+
+class BadZipfile(Exception):
+ pass
+
+
+ class LargeZipFile(Exception):
+ """
+ Raised while writing a zipfile when the archive would require the ZIP64
+ extensions and those extensions are disabled.
+ """
+
+error = BadZipfile # The exception raised by this module
+
+ZIP64_LIMIT= (1 << 31) - 1
+
+# constants for Zip file compression methods
+ZIP_STORED = 0
+ZIP_DEFLATED = 8
+# Other ZIP compression methods not supported
+
+# Here are some struct module formats for reading headers
+structEndArchive = "<4s4H2LH" # 9 items, end of archive, 22 bytes
+stringEndArchive = "PK\005\006" # magic number for end of archive record
+structCentralDir = "<4s4B4HLLL5HLL"# 19 items, central directory, 46 bytes
+stringCentralDir = "PK\001\002" # magic number for central directory
+structFileHeader = "<4s2B4HLLL2H" # 12 items, file header record, 30 bytes
+stringFileHeader = "PK\003\004" # magic number for file header
+structEndArchive64Locator = "<4sLQL" # 4 items, locate Zip64 header, 20 bytes
+stringEndArchive64Locator = "PK\x06\x07" # magic token for locator header
+structEndArchive64 = "<4sQHHLLQQQQ" # 10 items, end of archive (Zip64), 56 bytes
+stringEndArchive64 = "PK\x06\x06" # magic token for Zip64 header
+
+
+# indexes of entries in the central directory structure
+_CD_SIGNATURE = 0
+_CD_CREATE_VERSION = 1
+_CD_CREATE_SYSTEM = 2
+_CD_EXTRACT_VERSION = 3
+_CD_EXTRACT_SYSTEM = 4 # is this meaningful?
+_CD_FLAG_BITS = 5
+_CD_COMPRESS_TYPE = 6
+_CD_TIME = 7
+_CD_DATE = 8
+_CD_CRC = 9
+_CD_COMPRESSED_SIZE = 10
+_CD_UNCOMPRESSED_SIZE = 11
+_CD_FILENAME_LENGTH = 12
+_CD_EXTRA_FIELD_LENGTH = 13
+_CD_COMMENT_LENGTH = 14
+_CD_DISK_NUMBER_START = 15
+_CD_INTERNAL_FILE_ATTRIBUTES = 16
+_CD_EXTERNAL_FILE_ATTRIBUTES = 17
+_CD_LOCAL_HEADER_OFFSET = 18
+
+# indexes of entries in the local file header structure
+_FH_SIGNATURE = 0
+_FH_EXTRACT_VERSION = 1
+_FH_EXTRACT_SYSTEM = 2 # is this meaningful?
+_FH_GENERAL_PURPOSE_FLAG_BITS = 3
+_FH_COMPRESSION_METHOD = 4
+_FH_LAST_MOD_TIME = 5
+_FH_LAST_MOD_DATE = 6
+_FH_CRC = 7
+_FH_COMPRESSED_SIZE = 8
+_FH_UNCOMPRESSED_SIZE = 9
+_FH_FILENAME_LENGTH = 10
+_FH_EXTRA_FIELD_LENGTH = 11
+
+def is_zipfile(filename):
+ """Quickly see if file is a ZIP file by checking the magic number."""
+ try:
+ fpin = open(filename, "rb")
+ endrec = _EndRecData(fpin)
+ fpin.close()
+ if endrec:
+ return True # file has correct magic number
+ except IOError:
+ pass
+ return False
+
+ def _EndRecData64(fpin, offset, endrec):
+ """
+ Read the ZIP64 end-of-archive records and use that to update endrec
+
+ fpin   -- open archive file
+ offset -- position of the classic end-of-archive record, relative to the
+ end of the file (the seeks below use whence=2)
+ endrec -- list built by _EndRecData; updated in place and returned
+
+ endrec is returned unchanged when the ZIP64 locator or record signature
+ is not found. Raises BadZipfile for multi-disk archives.
+ """
+ # The ZIP64 locator sits immediately before the classic end record.
+ locatorSize = struct.calcsize(structEndArchive64Locator)
+ fpin.seek(offset - locatorSize, 2)
+ data = fpin.read(locatorSize)
+ sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data)
+ if sig != stringEndArchive64Locator:
+ return endrec
+
+ if diskno != 0 or disks != 1:
+ raise BadZipfile("zipfiles that span multiple disks are not supported")
+
+ # Assume no 'zip64 extensible data'
+ # so the ZIP64 end record sits immediately before the locator.
+ endArchiveSize = struct.calcsize(structEndArchive64)
+ fpin.seek(offset - locatorSize - endArchiveSize, 2)
+ data = fpin.read(endArchiveSize)
+ sig, sz, create_version, read_version, disk_num, disk_dir, \
+ dircount, dircount2, dirsize, diroffset = \
+ struct.unpack(structEndArchive64, data)
+ if sig != stringEndArchive64:
+ return endrec
+
+ # Update the original endrec using data from the ZIP64 record
+ endrec[1] = disk_num
+ endrec[2] = disk_dir
+ endrec[3] = dircount
+ endrec[4] = dircount2
+ endrec[5] = dirsize
+ endrec[6] = diroffset
+ return endrec
+
+
+ def _EndRecData(fpin):
+ """Return data from the "End of Central Directory" record, or None.
+
+ The data is a list of the eight items unpacked from the ZIP "End of
+ central dir" record (format structEndArchive), followed by the archive
+ comment (index 8) and the file seek offset of this record (index 9)."""
+ fpin.seek(-22, 2) # Assume no archive comment.
+ filesize = fpin.tell() + 22 # Get file size
+ data = fpin.read()
+ # Fast path: the record is the last 22 bytes and its comment length is zero.
+ if data[0:4] == stringEndArchive and data[-2:] == "\000\000":
+ endrec = struct.unpack(structEndArchive, data)
+ endrec = list(endrec)
+ endrec.append("") # Append the archive comment
+ endrec.append(filesize - 22) # Append the record start offset
+ # With ten items, endrec[-4] is endrec[6], the central directory offset;
+ # 0xffffffff there means the real value lives in the ZIP64 record.
+ if endrec[-4] == 0xffffffff:
+ return _EndRecData64(fpin, -22, endrec)
+ return endrec
+ # Search the last END_BLOCK bytes of the file for the record signature.
+ # The comment is appended to the ZIP file and has a 16 bit length.
+ # So the comment may be up to 64K long. We limit the search for the
+ # signature to a few Kbytes at the end of the file for efficiency.
+ # also, the signature must not appear in the comment.
+ END_BLOCK = min(filesize, 1024 * 4)
+ fpin.seek(filesize - END_BLOCK, 0)
+ data = fpin.read()
+ start = data.rfind(stringEndArchive)
+ if start >= 0: # Correct signature string was found
+ endrec = struct.unpack(structEndArchive, data[start:start+22])
+ endrec = list(endrec)
+ comment = data[start+22:]
+ # endrec[7] is the comment length stored in the record.
+ if endrec[7] == len(comment): # Comment length checks out
+ # Append the archive comment and start offset
+ endrec.append(comment)
+ endrec.append(filesize - END_BLOCK + start)
+ if endrec[-4] == 0xffffffff:
+ return _EndRecData64(fpin, - END_BLOCK + start, endrec)
+ return endrec
+ return # Error, return None
+
+
+ class ZipInfo (object):
+ """Class with attributes describing each file in the ZIP archive."""
+
+ # __slots__ fixes the set of instance attributes (no per-instance __dict__).
+ __slots__ = (
+ 'orig_filename',
+ 'filename',
+ 'date_time',
+ 'compress_type',
+ 'comment',
+ 'extra',
+ 'create_system',
+ 'create_version',
+ 'extract_version',
+ 'reserved',
+ 'flag_bits',
+ 'volume',
+ 'internal_attr',
+ 'external_attr',
+ 'header_offset',
+ 'CRC',
+ 'compress_size',
+ 'file_size',
+ '_raw_time',
+ )
+
+ def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
+ """Create an entry for `filename` with standard default attributes.
+
+ filename  -- name of the member; truncated at the first null byte and
+ normalized to forward slashes
+ date_time -- (year, month, day, hour, min, sec) tuple
+ """
+ self.orig_filename = filename # Original file name in archive
+
+ # Terminate the file name at the first null byte. Null bytes in file
+ # names are used as tricks by viruses in archives.
+ null_byte = filename.find(chr(0))
+ if null_byte >= 0:
+ filename = filename[0:null_byte]
+ # This is used to ensure paths in generated ZIP files always use
+ # forward slashes as the directory separator, as required by the
+ # ZIP format specification.
+ if os.sep != "/" and os.sep in filename:
+ filename = filename.replace(os.sep, "/")
+
+ self.filename = filename # Normalized file name
+ self.date_time = date_time # year, month, day, hour, min, sec
+ # Standard values:
+ self.compress_type = ZIP_STORED # Type of compression for the file
+ self.comment = "" # Comment for each file
+ self.extra = "" # ZIP extra data
+ if sys.platform == 'win32':
+ self.create_system = 0 # System which created ZIP archive
+ else:
+ # Assume everything else is unix-y
+ self.create_system = 3 # System which created ZIP archive
+ self.create_version = 20 # Version which created ZIP archive
+ self.extract_version = 20 # Version needed to extract archive
+ self.reserved = 0 # Must be zero
+ self.flag_bits = 0 # ZIP flag bits
+ self.volume = 0 # Volume number of file header
+ self.internal_attr = 0 # Internal attributes
+ self.external_attr = 0 # External file attributes
+ # Other attributes are set by class ZipFile:
+ # header_offset Byte offset to the file header
+ # CRC CRC-32 of the uncompressed file
+ # compress_size Size of the compressed file
+ # file_size Size of the uncompressed file
+
+ def FileHeader(self):
+ """Return the per-file header as a string."""
+ dt = self.date_time
+ # Pack the date as (year-1980, month, day) and the time as
+ # (hour, minute, second/2) bit fields.
+ dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
+ dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
+ if self.flag_bits & 0x08:
+ # Set these to zero because we write them after the file data
+ CRC = compress_size = file_size = 0
+ else:
+ CRC = self.CRC
+ compress_size = self.compress_size
+ file_size = self.file_size
+
+ extra = self.extra
+
+ if file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT:
+ # File is larger than what fits into a 4 byte integer,
+ # fall back to the ZIP64 extension
+ fmt = '<HHQQ'
+ extra = extra + struct.pack(fmt,
+ 1, struct.calcsize(fmt)-4, file_size, compress_size)
+ file_size = 0xffffffff # -1
+ compress_size = 0xffffffff # -1
+ self.extract_version = max(45, self.extract_version)
+ # NOTE(review): create_version is derived from extract_version here,
+ # not from its own previous value - confirm this is intended.
+ self.create_version = max(45, self.extract_version)
+
+ header = struct.pack(structFileHeader, stringFileHeader,
+ self.extract_version, self.reserved, self.flag_bits,
+ self.compress_type, dostime, dosdate, CRC,
+ compress_size, file_size,
+ len(self.filename), len(extra))
+ return header + self.filename + extra
+
+ def _decodeExtra(self):
+ """Walk the records of self.extra and apply the ZIP64 one (type 1).
+
+ Sizes and header offset that hold a placeholder value are replaced by
+ the 64-bit values of the record. Raises RuntimeError when the record
+ length is not one of the handled values.
+ """
+ # Try to decode the extra field.
+ extra = self.extra
+ unpack = struct.unpack
+ while extra:
+ # Each record starts with a 2-byte type and a 2-byte payload length.
+ tp, ln = unpack('<HH', extra[:4])
+ if tp == 1:
+ if ln >= 24:
+ counts = unpack('<QQQ', extra[4:28])
+ elif ln == 16:
+ counts = unpack('<QQ', extra[4:20])
+ elif ln == 8:
+ counts = unpack('<Q', extra[4:12])
+ elif ln == 0:
+ counts = ()
+ else:
+ raise RuntimeError, "Corrupt extra field %s"%(ln,)
+
+ idx = 0
+
+ # ZIP64 extension (large files and/or large archives)
+ # XXX Is this correct? won't this exclude 2**32-1 byte files?
+ if self.file_size in (0xffffffffffffffffL, 0xffffffffL):
+ self.file_size = counts[idx]
+ idx += 1
+
+ if self.compress_size == -1 or self.compress_size == 0xFFFFFFFFL:
+ self.compress_size = counts[idx]
+ idx += 1
+
+ if self.header_offset == -1 or self.header_offset == 0xffffffffL:
+ # NOTE(review): `old` is assigned but never read.
+ old = self.header_offset
+ self.header_offset = counts[idx]
+ idx+=1
+
+ # Skip this record: 4 header bytes plus its payload.
+ extra = extra[ln+4:]
+
+
+ class _ZipDecrypter:
+ """
+ Class to handle decryption of files stored within a ZIP archive.
+
+ ZIP supports a password-based form of encryption. Even though known
+ plaintext attacks have been found against it, it is still useful
+ to be able to get data out of such a file.
+
+ Usage ::
+ zd = _ZipDecrypter(mypwd)
+ plain_char = zd(cypher_char)
+ plain_text = map(zd, cypher_text)
+ """
+
+ # Called once in the class body (below) to build `crctable`; it takes no
+ # `self` because it runs as a plain function at class-creation time.
+ def _GenerateCRCTable():
+ """Generate a CRC-32 table.
+
+ ZIP encryption uses the CRC32 one-byte primitive for scrambling some
+ internal keys. We noticed that a direct implementation is faster than
+ relying on binascii.crc32().
+ """
+ poly = 0xedb88320
+ table = [0] * 256
+ for i in range(256):
+ crc = i
+ for j in range(8):
+ if crc & 1:
+ crc = ((crc >> 1) & 0x7FFFFFFF) ^ poly
+ else:
+ crc = ((crc >> 1) & 0x7FFFFFFF)
+ table[i] = crc
+ return table
+ crctable = _GenerateCRCTable()
+
+ def _crc32(self, ch, crc):
+ """Compute the CRC32 primitive on one byte."""
+ return ((crc >> 8) & 0xffffff) ^ self.crctable[(crc ^ ord(ch)) & 0xff]
+
+ def __init__(self, pwd):
+ """Initialise the three keys and mix in every character of `pwd`."""
+ self.key0 = 305419896
+ self.key1 = 591751049
+ self.key2 = 878082192
+ for p in pwd:
+ self._UpdateKeys(p)
+
+ def _UpdateKeys(self, c):
+ """Advance the three keys with the character `c`."""
+ self.key0 = self._crc32(c, self.key0)
+ # The `& 4294967295` masks keep key1 within 32 bits.
+ self.key1 = (self.key1 + (self.key0 & 255)) & 4294967295
+ self.key1 = (self.key1 * 134775813 + 1) & 4294967295
+ self.key2 = self._crc32(chr((self.key1 >> 24) & 255), self.key2)
+
+ def __call__(self, c):
+ """Decrypt a single character."""
+ c = ord(c)
+ k = self.key2 | 2
+ c = c ^ (((k * (k^1)) >> 8) & 255)
+ c = chr(c)
+ # The keys are updated with the decrypted character.
+ self._UpdateKeys(c)
+ return c
+
+ class ZipExtFile:
+ """File-like object for reading an archive member.
+ Is returned by ZipFile.open().
+ """
+
+ def __init__(self, fileobj, zipinfo, decrypt=None):
+ """Wrap `fileobj` to read the member described by `zipinfo`.
+
+ decrypt -- optional callable applied to every raw character read
+ """
+ self.fileobj = fileobj
+ self.decrypter = decrypt
+ self.bytes_read = 0L
+ # rawbuffer: data read from the file but not yet processed;
+ # readbuffer: processed data not yet returned by read();
+ # linebuffer: data held back by readline().
+ self.rawbuffer = ''
+ self.readbuffer = ''
+ self.linebuffer = ''
+ self.eof = False
+ self.univ_newlines = False
+ self.nlSeps = ("\n", )
+ self.lastdiscard = ''
+
+ self.compress_type = zipinfo.compress_type
+ self.compress_size = zipinfo.compress_size
+
+ self.closed = False
+ self.mode = "r"
+ self.name = zipinfo.filename
+
+ # read from compressed files in 64k blocks
+ self.compreadsize = 64*1024
+ if self.compress_type == ZIP_DEFLATED:
+ self.dc = zlib.decompressobj(-15)
+
+ def set_univ_newlines(self, univ_newlines):
+ """Enable or disable recognition of \\r\\n and \\r as line separators."""
+ self.univ_newlines = univ_newlines
+
+ # pick line separator char(s) based on universal newlines flag
+ self.nlSeps = ("\n", )
+ if self.univ_newlines:
+ self.nlSeps = ("\r\n", "\r", "\n")
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ """Return the next line; raise StopIteration at end of data."""
+ nextline = self.readline()
+ if not nextline:
+ raise StopIteration()
+
+ return nextline
+
+ def close(self):
+ """Mark the object as closed; the underlying file object is not touched here."""
+ self.closed = True
+
+ def _checkfornewline(self):
+ """Return (position, separator length) of a line separator in
+ self.linebuffer, or (-1, -1) when none is found."""
+ nl, nllen = -1, -1
+ if self.linebuffer:
+ # ugly check for cases where half of an \r\n pair was
+ # read on the last pass, and the \r was discarded. In this
+ # case we just throw away the \n at the start of the buffer.
+ if (self.lastdiscard, self.linebuffer[0]) == ('\r','\n'):
+ self.linebuffer = self.linebuffer[1:]
+
+ # Separators are tried in the order of self.nlSeps; the first one
+ # found anywhere in the buffer wins.
+ for sep in self.nlSeps:
+ nl = self.linebuffer.find(sep)
+ if nl >= 0:
+ nllen = len(sep)
+ return nl, nllen
+
+ return nl, nllen
+
+ def readline(self, size = -1):
+ """Read a line with approx. size. If size is negative,
+ read a whole line.
+ """
+ if size < 0:
+ size = sys.maxint
+ elif size == 0:
+ return ''
+
+ # check for a newline already in buffer
+ nl, nllen = self._checkfornewline()
+
+ if nl >= 0:
+ # the next line was already in the buffer
+ nl = min(nl, size)
+ else:
+ # no line break in buffer - try to read more
+ size -= len(self.linebuffer)
+ while nl < 0 and size > 0:
+ buf = self.read(min(size, 100))
+ if not buf:
+ break
+ self.linebuffer += buf
+ size -= len(buf)
+
+ # check for a newline in buffer
+ nl, nllen = self._checkfornewline()
+
+ # we either ran out of bytes in the file, or
+ # met the specified size limit without finding a newline,
+ # so return current buffer
+ if nl < 0:
+ s = self.linebuffer
+ self.linebuffer = ''
+ return s
+
+ buf = self.linebuffer[:nl]
+ # Remember the separator just consumed so that a "\n" arriving later can
+ # be recognised as the second half of "\r\n".
+ self.lastdiscard = self.linebuffer[nl:nl + nllen]
+ self.linebuffer = self.linebuffer[nl + nllen:]
+
+ # line is always returned with \n as newline char (except possibly
+ # for a final incomplete line in the file, which is handled above).
+ return buf + "\n"
+
+ def readlines(self, sizehint = -1):
+ """Return a list with all (following) lines. The sizehint parameter
+ is ignored in this implementation.
+ """
+ result = []
+ while True:
+ line = self.readline()
+ if not line: break
+ result.append(line)
+ return result
+
+ def read(self, size = None):
+ """Return up to `size` characters of member data, or everything that
+ is currently available when `size` is None."""
+ # act like file() obj and return empty string if size is 0
+ if size == 0:
+ return ''
+
+ # determine read size
+ bytesToRead = self.compress_size - self.bytes_read
+
+ # adjust read size for encrypted files since the first 12 bytes
+ # are for the encryption/password information
+ if self.decrypter is not None:
+ bytesToRead -= 12
+
+ if size is not None and size >= 0:
+ if self.compress_type == ZIP_STORED:
+ lr = len(self.readbuffer)
+ bytesToRead = min(bytesToRead, size - lr)
+ elif self.compress_type == ZIP_DEFLATED:
+ if len(self.readbuffer) > size:
+ # the user has requested fewer bytes than we've already
+ # pulled through the decompressor; don't read any more
+ bytesToRead = 0
+ else:
+ # user will use up the buffer, so read some more
+ lr = len(self.rawbuffer)
+ bytesToRead = min(bytesToRead, self.compreadsize - lr)
+
+ # avoid reading past end of file contents
+ if bytesToRead + self.bytes_read > self.compress_size:
+ bytesToRead = self.compress_size - self.bytes_read
+
+ # try to read from file (if necessary)
+ if bytesToRead > 0:
+ bytes = self.fileobj.read(bytesToRead)
+ self.bytes_read += len(bytes)
+ self.rawbuffer += bytes
+
+ # handle contents of raw buffer
+ if self.rawbuffer:
+ newdata = self.rawbuffer
+ self.rawbuffer = ''
+
+ # decrypt new data if we were given an object to handle that
+ if newdata and self.decrypter is not None:
+ newdata = ''.join(map(self.decrypter, newdata))
+
+ # decompress newly read data if necessary
+ if newdata and self.compress_type == ZIP_DEFLATED:
+ newdata = self.dc.decompress(newdata)
+ self.rawbuffer = self.dc.unconsumed_tail
+ # NOTE(review): nothing in this class sets self.eof to True, so this
+ # flush branch looks unreachable from the code shown here - confirm.
+ if self.eof and len(self.rawbuffer) == 0:
+ # we're out of raw bytes (both from the file and
+ # the local buffer); flush just to make sure the
+ # decompressor is done
+ newdata += self.dc.flush()
+ # prevent decompressor from being used again
+ self.dc = None
+
+ self.readbuffer += newdata
+
+
+ # return what the user asked for
+ if size is None or len(self.readbuffer) <= size:
+ bytes = self.readbuffer
+ self.readbuffer = ''
+ else:
+ bytes = self.readbuffer[:size]
+ self.readbuffer = self.readbuffer[size:]
+
+ return bytes
+
+
+class ZipFile:
+ """ Class with methods to open, read, write, close, list zip files.
+
+ z = ZipFile(file, mode="r", compression=ZIP_STORED, allowZip64=True)
+
+ @var file: Either the path to the file, or a file-like object.
+ If it is a path, the file will be opened and closed by ZipFile.
+ @var mode: The mode can be either read "r", write "w" or append "a".
+ @var compression: ZIP_STORED (no compression) or ZIP_DEFLATED (requires zlib).
+ @var allowZip64: if True ZipFile will create files with ZIP64 extensions when
+ needed, otherwise it will raise an exception when this would
+ be necessary.
+
+ """
+
+ fp = None # Set here since __del__ checks it
+
+ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=False):
+ """Open the ZIP file with mode read "r", write "w" or append "a".
+
+ Raises RuntimeError for an unknown mode or compression method, or when
+ ZIP_DEFLATED is requested and zlib is not available.
+ """
+ if mode not in ("r", "w", "a"):
+ raise RuntimeError('ZipFile() requires mode "r", "w", or "a"')
+
+ if compression == ZIP_STORED:
+ pass
+ elif compression == ZIP_DEFLATED:
+ if not zlib:
+ raise RuntimeError,\
+ "Compression requires the (missing) zlib module"
+ else:
+ raise RuntimeError, "That compression method is not supported"
+
+ self._allowZip64 = allowZip64
+ self._didModify = False
+ self.debug = 0 # Level of printing: 0 through 3
+ self.NameToInfo = {} # Find file info given name
+ self.filelist = [] # List of ZipInfo instances for archive
+ self.compression = compression # Method of compression
+ self.mode = key = mode.replace('b', '')[0]
+ self.pwd = None
+
+ # Check if we were passed a file-like object
+ if isinstance(file, basestring):
+ self._filePassed = 0
+ self.filename = file
+ modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'}
+ try:
+ self.fp = open(file, modeDict[mode])
+ except IOError:
+ # Appending to a file that cannot be opened: fall back to
+ # creating it in write mode.
+ if mode == 'a':
+ mode = key = 'w'
+ self.fp = open(file, modeDict[mode])
+ else:
+ raise
+ else:
+ self._filePassed = 1
+ self.fp = file
+ self.filename = getattr(file, 'name', None)
+
+ if key == 'r':
+ self._GetContents()
+ elif key == 'w':
+ pass
+ elif key == 'a':
+ try: # See if file is a zip file
+ self._RealGetContents()
+ # seek to start of directory and overwrite
+ self.fp.seek(self.start_dir, 0)
+ except BadZipfile: # file is not a zip file, just append
+ self.fp.seek(0, 2)
+ else:
+ # NOTE(review): mode was validated at the top of this method, so this
+ # branch appears unreachable.
+ if not self._filePassed:
+ self.fp.close()
+ self.fp = None
+ raise RuntimeError, 'Mode must be "r", "w" or "a"'
+
+ def _GetContents(self):
+ """Read the directory, making sure we close the file if the format
+ is bad."""
+ try:
+ self._RealGetContents()
+ except BadZipfile:
+ # Only close the file when this object opened it itself; a file object
+ # passed in by the caller is left open. The exception is re-raised.
+ if not self._filePassed:
+ self.fp.close()
+ self.fp = None
+ raise
+
+ def _RealGetContents(self):
+ """Read in the table of contents for the ZIP file.
+
+ Fills self.filelist and self.NameToInfo with one ZipInfo per central
+ directory entry and sets self.comment and self.start_dir.
+ Raises BadZipfile when no end record is found or when a central
+ directory entry has a bad signature.
+ """
+ fp = self.fp
+ endrec = _EndRecData(fp)
+ if not endrec:
+ raise BadZipfile, "File is not a zip file"
+ if self.debug > 1:
+ print endrec
+ size_cd = endrec[5] # bytes in central directory
+ offset_cd = endrec[6] # offset of central directory
+ self.comment = endrec[8] # archive comment
+ # endrec[9] is the offset of the "End of Central Dir" record
+ if endrec[9] > ZIP64_LIMIT:
+ # 56 and 20 are the sizes of the ZIP64 end record and of its locator,
+ # which sit between the central directory and the classic end record.
+ x = endrec[9] - size_cd - 56 - 20
+ else:
+ x = endrec[9] - size_cd
+ # "concat" is zero, unless zip was concatenated to another file
+ concat = x - offset_cd
+ if self.debug > 2:
+ print "given, inferred, offset", offset_cd, x, concat
+ # self.start_dir: Position of start of central directory
+ self.start_dir = offset_cd + concat
+ fp.seek(self.start_dir, 0)
+ data = fp.read(size_cd)
+ # Parse the whole central directory from an in-memory copy.
+ fp = cStringIO.StringIO(data)
+ total = 0
+ while total < size_cd:
+ # Fixed-size part of a central directory entry.
+ centdir = fp.read(46)
+ total = total + 46
+ if centdir[0:4] != stringCentralDir:
+ raise BadZipfile, "Bad magic number for central directory"
+ centdir = struct.unpack(structCentralDir, centdir)
+ if self.debug > 2:
+ print centdir
+ filename = fp.read(centdir[_CD_FILENAME_LENGTH])
+ # Create ZipInfo instance to store file information
+ x = ZipInfo(filename)
+ x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
+ x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
+ total = (total + centdir[_CD_FILENAME_LENGTH]
+ + centdir[_CD_EXTRA_FIELD_LENGTH]
+ + centdir[_CD_COMMENT_LENGTH])
+ x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
+ (x.create_version, x.create_system, x.extract_version, x.reserved,
+ x.flag_bits, x.compress_type, t, d,
+ x.CRC, x.compress_size, x.file_size) = centdir[1:12]
+ x.volume, x.internal_attr, x.external_attr = centdir[15:18]
+ # Convert date/time code to (year, month, day, hour, min, sec)
+ x._raw_time = t
+ x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
+ t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
+
+ # Replace placeholder sizes/offset with the ZIP64 values, if any.
+ x._decodeExtra()
+ # Shift the header offset by the size of any data prepended to the archive.
+ x.header_offset = x.header_offset + concat
+ self.filelist.append(x)
+ self.NameToInfo[x.filename] = x
+ if self.debug > 2:
+ print "total", total
+
+
+    def namelist(self):
+        """Return the member names of the archive as a new list, in the
+        order of self.filelist."""
+        return [entry.filename for entry in self.filelist]
+
+    def infolist(self):
+        """Return the ZipInfo instances describing the archive members.
+
+        The list returned is the archive's own internal list, not a copy."""
+        entries = self.filelist
+        return entries
+
+    def printdir(self):
+        """Write a listing of the archive to standard output: one header
+        line, then name, modification time and size of every member."""
+        header = ("File Name", "Modified ", "Size")
+        print "%-46s %19s %12s" % header
+        for entry in self.filelist:
+            stamp = "%d-%02d-%02d %02d:%02d:%02d" % entry.date_time[:6]
+            row = (entry.filename, stamp, entry.file_size)
+            print "%-46s %s %12d" % row
+
+    def testzip(self):
+        """Read every member in turn to check its CRC.
+
+        Returns the name of the first member whose read raises BadZipfile,
+        or None when every member reads cleanly."""
+        for entry in self.filelist:
+            member = entry.filename
+            try:
+                self.read(member)       # Check CRC-32
+            except BadZipfile:
+                return member
+        return None
+
+
+    def getinfo(self, name):
+        """Look up and return the ZipInfo registered under 'name'.
+
+        Raises KeyError when the archive has no member with that name."""
+        entry = self.NameToInfo.get(name)
+        if entry is not None:
+            return entry
+        raise KeyError(
+            'There is no item named %r in the archive' % name)
+
+    def setpassword(self, pwd):
+        """Store 'pwd' as the archive-wide default password; open() falls
+        back to it for encrypted members when no password is passed."""
+        self.pwd = pwd
+
+    def read(self, name, pwd=None):
+        """Return the complete contents of member 'name' as a string,
+        using 'pwd' for an encrypted member."""
+        member_file = self.open(name, "r", pwd)
+        return member_file.read()
+
+    def open(self, name, mode="r", pwd=None):
+        """Return a read-only file-like object for the member 'name'.
+
+        'mode' must be "r", "U" or "rU"; a "U" in the mode switches on
+        universal newlines on the returned object.  'pwd' is the password
+        for an encrypted member and falls back to the default set with
+        setpassword().
+
+        Raises RuntimeError for an invalid mode, an already closed archive,
+        or a missing or wrong password; KeyError (via getinfo) for an
+        unknown member; BadZipfile when the local file header does not match
+        the central directory.
+
+        When the archive was opened from a filename, a fresh file is opened
+        for the member; it is closed again if anything below fails, so an
+        error no longer leaks the descriptor.
+        """
+        if mode not in ("r", "U", "rU"):
+            raise RuntimeError('open() requires mode "r", "U", or "rU"')
+        if not self.fp:
+            raise RuntimeError(
+                "Attempt to read ZIP archive that was already closed")
+
+        # Only open a new file for instances where we were not
+        # given a file object in the constructor
+        opened_here = not self._filePassed
+        if opened_here:
+            zef_file = open(self.filename, 'rb')
+        else:
+            zef_file = self.fp
+
+        try:
+            # Get info object for name
+            zinfo = self.getinfo(name)
+
+            zef_file.seek(zinfo.header_offset, 0)
+
+            # Skip the file header:
+            fheader = zef_file.read(30)
+            if fheader[0:4] != stringFileHeader:
+                raise BadZipfile("Bad magic number for file header")
+
+            fheader = struct.unpack(structFileHeader, fheader)
+            fname = zef_file.read(fheader[_FH_FILENAME_LENGTH])
+            if fheader[_FH_EXTRA_FIELD_LENGTH]:
+                zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH])
+
+            if fname != zinfo.orig_filename:
+                raise BadZipfile(
+                    'File name in directory "%s" and header "%s" differ.' % (
+                        zinfo.orig_filename, fname))
+
+            # check for encrypted flag & handle password
+            is_encrypted = zinfo.flag_bits & 0x1
+            zd = None
+            if is_encrypted:
+                if not pwd:
+                    pwd = self.pwd
+                if not pwd:
+                    raise RuntimeError("File %s is encrypted, "
+                                       "password required for extraction"
+                                       % name)
+
+                zd = _ZipDecrypter(pwd)
+                # The first 12 bytes in the cypher stream is an encryption
+                # header used to strengthen the algorithm. The first 11
+                # bytes are completely random, while the 12th contains the
+                # MSB of the CRC, or the MSB of the file time depending on
+                # the header type and is used to check the correctness of
+                # the password.
+                bytes = zef_file.read(12)
+                h = map(zd, bytes[0:12])
+                if zinfo.flag_bits & 0x8:
+                    # compare against the file type from extended local
+                    # headers
+                    check_byte = (zinfo._raw_time >> 8) & 0xff
+                else:
+                    # compare against the CRC otherwise
+                    check_byte = (zinfo.CRC >> 24) & 0xff
+                if ord(h[11]) != check_byte:
+                    raise RuntimeError("Bad password for file", name)
+
+            # build and return a ZipExtFile
+            if zd is None:
+                zef = ZipExtFile(zef_file, zinfo)
+            else:
+                zef = ZipExtFile(zef_file, zinfo, zd)
+
+            # set universal newlines on ZipExtFile if necessary
+            if "U" in mode:
+                zef.set_univ_newlines(True)
+            return zef
+        except BaseException:
+            # Never close a caller-supplied file object; only the one
+            # opened above belongs to this call.
+            if opened_here:
+                zef_file.close()
+            raise
+
+    def extract(self, member, path=None, pwd=None):
+        """Extract one member under its full archive name and return the
+        result of _extract_member().
+
+        `member' may be a member name or a ZipInfo object.  `path' is the
+        directory to extract into; when it is None the current working
+        directory is used.  `pwd' is passed on for encrypted members.
+        """
+        info = member if isinstance(member, ZipInfo) else self.getinfo(member)
+        destination = os.getcwd() if path is None else path
+        return self._extract_member(info, destination, pwd)
+
+    def extractall(self, path=None, members=None, pwd=None):
+        """Extract several members by calling extract() on each of them.
+
+        `members' defaults to everything returned by namelist(); `path' and
+        `pwd' are handed to extract() unchanged, so a `path' of None means
+        the current working directory.
+        """
+        selected = self.namelist() if members is None else members
+        for item in selected:
+            self.extract(item, path, pwd)
+
+    def _extract_member(self, member, targetpath, pwd):
+        """Extract the ZipInfo object 'member' to a physical
+        file on the path targetpath.
+
+        Returns the normalized path that was created.  Missing parent
+        directories are created.  A member whose name ends in "/" is a
+        directory entry: the directory is created and no data is written.
+
+        Member names come from the archive and are not trusted: leading
+        separators, drive letters, "." and ".." components are dropped, so
+        the result always lies below 'targetpath'.
+        """
+        # Strip a trailing path separator, unless it represents the root
+        # (reducing "/" to "" would silently redirect the extraction to the
+        # current directory).
+        if (targetpath[-1:] in (os.path.sep, os.path.altsep)
+                and len(os.path.splitdrive(targetpath)[1]) > 1):
+            targetpath = targetpath[:-1]
+
+        # build the destination pathname, replacing
+        # forward slashes to platform specific separators.
+        arcname = member.filename.replace('/', os.path.sep)
+        if os.path.altsep:
+            arcname = arcname.replace(os.path.altsep, os.path.sep)
+        # interpret an absolute name as relative; drop drive letters,
+        # redundant separators, "." and ".." components.
+        arcname = os.path.splitdrive(arcname)[1]
+        parts = [part for part in arcname.split(os.path.sep)
+                 if part not in ('', os.path.curdir, os.path.pardir)]
+        targetpath = os.path.normpath(os.path.join(targetpath, *parts))
+
+        # Create all upper directories if necessary.
+        upperdirs = os.path.dirname(targetpath)
+        if upperdirs and not os.path.exists(upperdirs):
+            os.makedirs(upperdirs)
+
+        # Directory entry: make the directory instead of writing an empty
+        # file that would block every member stored below it.
+        if member.filename.endswith('/'):
+            if not os.path.isdir(targetpath):
+                os.mkdir(targetpath)
+            return targetpath
+
+        source = self.open(member.filename, pwd=pwd)
+        try:
+            target = open(targetpath, "wb")
+            try:
+                shutil.copyfileobj(source, target)
+            finally:
+                target.close()
+        finally:
+            source.close()
+
+        return targetpath
+
+    def _writecheck(self, zinfo):
+        """Validate 'zinfo' and the archive state before a member is written.
+
+        Raises RuntimeError for a read-only or closed archive and for an
+        unusable compression method, and LargeZipFile when the member size
+        or its header offset exceeds ZIP64_LIMIT while ZIP64 is not
+        allowed.  A duplicate name only produces a message, and only when
+        self.debug is set.
+        """
+        if zinfo.filename in self.NameToInfo and self.debug:
+            # Warning for duplicate names
+            print "Duplicate name:", zinfo.filename
+        if self.mode not in ("w", "a"):
+            raise RuntimeError('write() requires mode "w" or "a"')
+        if not self.fp:
+            raise RuntimeError(
+                "Attempt to write ZIP archive that was already closed")
+        method = zinfo.compress_type
+        if method == ZIP_DEFLATED and not zlib:
+            raise RuntimeError(
+                "Compression requires the (missing) zlib module")
+        if method not in (ZIP_STORED, ZIP_DEFLATED):
+            raise RuntimeError(
+                "That compression method is not supported")
+        if zinfo.file_size > ZIP64_LIMIT and not self._allowZip64:
+            raise LargeZipFile("Filesize would require ZIP64 extensions")
+        if zinfo.header_offset > ZIP64_LIMIT and not self._allowZip64:
+            raise LargeZipFile("Zipfile size would require ZIP64 extensions")
+
+    def write(self, filename, arcname=None, compress_type=None):
+        """Put the bytes from filename into the archive under the name
+        arcname.
+
+        'arcname' defaults to 'filename'; its drive and leading separators
+        are removed.  'compress_type' defaults to the archive's compression
+        method.  Raises RuntimeError when the archive is already closed,
+        plus whatever _writecheck() raises.
+
+        The source file is closed even when reading, compressing or writing
+        fails part-way.
+        """
+        if not self.fp:
+            raise RuntimeError(
+                "Attempt to write to ZIP archive that was already closed")
+
+        st = os.stat(filename)
+        mtime = time.localtime(st.st_mtime)
+        date_time = mtime[0:6]
+        # Create ZipInfo instance to store file information
+        if arcname is None:
+            arcname = filename
+        arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
+        while arcname[0] in (os.sep, os.altsep):
+            arcname = arcname[1:]
+        zinfo = ZipInfo(arcname, date_time)
+        zinfo.external_attr = (st[0] & 0xFFFF) << 16L      # Unix attributes
+        if compress_type is None:
+            zinfo.compress_type = self.compression
+        else:
+            zinfo.compress_type = compress_type
+
+        zinfo.file_size = st.st_size
+        zinfo.flag_bits = 0x00
+        zinfo.header_offset = self.fp.tell()    # Start of header bytes
+
+        self._writecheck(zinfo)
+        self._didModify = True
+        fp = open(filename, "rb")
+        try:
+            # Must overwrite CRC and sizes with correct data later
+            zinfo.CRC = CRC = 0
+            zinfo.compress_size = compress_size = 0
+            zinfo.file_size = file_size = 0
+            self.fp.write(zinfo.FileHeader())
+            if zinfo.compress_type == ZIP_DEFLATED:
+                cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
+                                        zlib.DEFLATED, -15)
+            else:
+                cmpr = None
+            # Stream the file in 8 KiB chunks, updating size and CRC as
+            # we go.
+            while 1:
+                buf = fp.read(1024 * 8)
+                if not buf:
+                    break
+                file_size = file_size + len(buf)
+                CRC = crc32(buf, CRC) & 0xffffffff
+                if cmpr:
+                    buf = cmpr.compress(buf)
+                    compress_size = compress_size + len(buf)
+                self.fp.write(buf)
+        finally:
+            fp.close()
+        if cmpr:
+            buf = cmpr.flush()
+            compress_size = compress_size + len(buf)
+            self.fp.write(buf)
+            zinfo.compress_size = compress_size
+        else:
+            zinfo.compress_size = file_size
+        zinfo.CRC = CRC
+        zinfo.file_size = file_size
+        # Seek backwards and write CRC and file sizes
+        position = self.fp.tell()       # Preserve current position in file
+        self.fp.seek(zinfo.header_offset + 14, 0)
+        self.fp.write(struct.pack("<LLL", zinfo.CRC, zinfo.compress_size,
+                                  zinfo.file_size))
+        self.fp.seek(position, 0)
+        self.filelist.append(zinfo)
+        self.NameToInfo[zinfo.filename] = zinfo
+
+    def writestr(self, zinfo_or_arcname, bytes):
+        """Write a file into the archive.  The contents is the string
+        'bytes'.  'zinfo_or_arcname' is either a ZipInfo instance or
+        the name of the file in the archive.
+
+        When only a name is given, a ZipInfo stamped with the current local
+        time and the archive's compression method is created.  Raises
+        RuntimeError when the archive is already closed, plus whatever
+        _writecheck() raises.
+        """
+        if not isinstance(zinfo_or_arcname, ZipInfo):
+            zinfo = ZipInfo(filename=zinfo_or_arcname,
+                            date_time=time.localtime(time.time())[:6])
+            zinfo.compress_type = self.compression
+        else:
+            zinfo = zinfo_or_arcname
+
+        if not self.fp:
+            raise RuntimeError(
+                "Attempt to write to ZIP archive that was already closed")
+
+        zinfo.file_size = len(bytes)            # Uncompressed size
+        # Nothing is written before the header below, so the offset is
+        # recorded once, ahead of the ZIP64 check in _writecheck().
+        zinfo.header_offset = self.fp.tell()    # Start of header bytes
+        self._writecheck(zinfo)
+        self._didModify = True
+        zinfo.CRC = crc32(bytes) & 0xffffffff   # CRC-32 checksum
+        if zinfo.compress_type == ZIP_DEFLATED:
+            co = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
+                                  zlib.DEFLATED, -15)
+            bytes = co.compress(bytes) + co.flush()
+            zinfo.compress_size = len(bytes)    # Compressed size
+        else:
+            zinfo.compress_size = zinfo.file_size
+        self.fp.write(zinfo.FileHeader())
+        self.fp.write(bytes)
+        self.fp.flush()
+        if zinfo.flag_bits & 0x08:
+            # Write CRC and file sizes after the file data
+            # The CRC was masked to an unsigned 32-bit value above, so it
+            # is packed as "L" (as write() does); a signed "l" cannot hold
+            # values >= 2**31.
+            self.fp.write(struct.pack("<LLL", zinfo.CRC, zinfo.compress_size,
+                                      zinfo.file_size))
+        self.filelist.append(zinfo)
+        self.NameToInfo[zinfo.filename] = zinfo
+
+    def __del__(self):
+        """Finalizer: run close() for callers that never did so
+        themselves."""
+        self.close()
+
+    def close(self):
+        """Close the file, and for mode "w" and "a" write the ending
+        records.
+
+        Does nothing when the archive is already closed.  The central
+        directory and end records are written only when the archive was
+        opened for writing and has actually been modified.  The underlying
+        file is closed only if it was opened from a filename; self.fp is
+        reset to None in either case.
+        """
+        if self.fp is None:
+            return
+
+        if self.mode in ("w", "a") and self._didModify: # write ending records
+            count = 0
+            pos1 = self.fp.tell()       # start of the central directory
+            for zinfo in self.filelist:         # write central directory
+                count = count + 1
+                dt = zinfo.date_time
+                # Pack the timestamp into the 16-bit DOS date/time fields
+                # (seconds are stored divided by two).
+                dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2]
+                dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2)
+                # Values too large for their 32-bit fields are collected
+                # here and emitted in a ZIP64 extra field; the 32-bit field
+                # itself then holds 0xffffffff.
+                extra = []
+                if zinfo.file_size > ZIP64_LIMIT \
+                        or zinfo.compress_size > ZIP64_LIMIT:
+                    extra.append(zinfo.file_size)
+                    extra.append(zinfo.compress_size)
+                    file_size = 0xffffffff #-1
+                    compress_size = 0xffffffff #-1
+                else:
+                    file_size = zinfo.file_size
+                    compress_size = zinfo.compress_size
+
+                if zinfo.header_offset > ZIP64_LIMIT:
+                    extra.append(zinfo.header_offset)
+                    header_offset = 0xffffffffL  # -1 32 bit
+                else:
+                    header_offset = zinfo.header_offset
+
+                extra_data = zinfo.extra
+                if extra:
+                    # Append a ZIP64 field to the extra's
+                    # (header id 1, payload of one 8-byte value per entry)
+                    extra_data = struct.pack(
+                            '<HH' + 'Q'*len(extra),
+                            1, 8*len(extra), *extra) + extra_data
+
+                    # Entries carrying a ZIP64 field advertise at least
+                    # version 45.
+                    extract_version = max(45, zinfo.extract_version)
+                    create_version = max(45, zinfo.create_version)
+                else:
+                    extract_version = zinfo.extract_version
+                    create_version = zinfo.create_version
+
+                try:
+                    centdir = struct.pack(structCentralDir,
+                     stringCentralDir, create_version,
+                     zinfo.create_system, extract_version, zinfo.reserved,
+                     zinfo.flag_bits, zinfo.compress_type, dostime, dosdate,
+                     zinfo.CRC, compress_size, file_size,
+                     len(zinfo.filename), len(extra_data), len(zinfo.comment),
+                     0, zinfo.internal_attr, zinfo.external_attr,
+                     header_offset)
+                except DeprecationWarning:
+                    # Debugging aid: dump the offending values to stderr
+                    # before re-raising.
+                    # NOTE(review): only reachable when warnings are turned
+                    # into exceptions -- confirm this is still wanted.
+                    print >>sys.stderr, (structCentralDir,
+                     stringCentralDir, create_version,
+                     zinfo.create_system, extract_version, zinfo.reserved,
+                     zinfo.flag_bits, zinfo.compress_type, dostime, dosdate,
+                     zinfo.CRC, compress_size, file_size,
+                     len(zinfo.filename), len(extra_data), len(zinfo.comment),
+                     0, zinfo.internal_attr, zinfo.external_attr,
+                     header_offset)
+                    raise
+                self.fp.write(centdir)
+                self.fp.write(zinfo.filename)
+                self.fp.write(extra_data)
+                self.fp.write(zinfo.comment)
+
+            pos2 = self.fp.tell()       # end of the central directory
+            # Write end-of-zip-archive record
+            if pos1 > ZIP64_LIMIT:
+                # Need to write the ZIP64 end-of-archive records
+                zip64endrec = struct.pack(
+                        structEndArchive64, stringEndArchive64,
+                        44, 45, 45, 0, 0, count, count, pos2 - pos1, pos1)
+                self.fp.write(zip64endrec)
+
+                zip64locrec = struct.pack(
+                        structEndArchive64Locator,
+                        stringEndArchive64Locator, 0, pos2, 1)
+                self.fp.write(zip64locrec)
+
+                # The classic end record follows, with the directory offset
+                # set to 0xffffffff because the real one is in the ZIP64
+                # record above.
+                endrec = struct.pack(structEndArchive, stringEndArchive,
+                            0, 0, count, count, pos2 - pos1, 0xffffffffL, 0)
+                self.fp.write(endrec)
+
+            else:
+                endrec = struct.pack(structEndArchive, stringEndArchive,
+                         0, 0, count, count, pos2 - pos1, pos1, 0)
+                self.fp.write(endrec)
+            self.fp.flush()
+        # A file object handed to the constructor stays open for its owner.
+        if not self._filePassed:
+            self.fp.close()
+        self.fp = None
+
+
+class PyZipFile(ZipFile):
+    """Class to create ZIP archives with Python library files and packages."""
+
+    def writepy(self, pathname, basename = ""):
+        """Add all files from "pathname" to the ZIP archive.
+
+        If pathname is a package directory, search the directory and
+        all package subdirectories recursively for all *.py and enter
+        the modules into the archive.  If pathname is a plain
+        directory, listdir *.py and enter all modules.  Else, pathname
+        must be a Python *.py file and the module will be put into the
+        archive.  Added modules are always module.pyo or module.pyc.
+        This method will compile the module.py into module.pyc if
+        necessary.
+
+        "basename" is the archive directory prefix the modules are stored
+        under; it grows by one component for every package level.
+        Raises RuntimeError when pathname is a file not ending in ".py".
+        """
+        dir, name = os.path.split(pathname)
+        if os.path.isdir(pathname):
+            initname = os.path.join(pathname, "__init__.py")
+            if os.path.isfile(initname):
+                # This is a package directory, add it
+                if basename:
+                    basename = "%s/%s" % (basename, name)
+                else:
+                    basename = name
+                if self.debug:
+                    print "Adding package in", pathname, "as", basename
+                # The package's __init__ goes in first ...
+                fname, arcname = self._get_codename(initname[0:-3], basename)
+                if self.debug:
+                    print "Adding", arcname
+                self.write(fname, arcname)
+                # ... and is removed from the listing so that the loop
+                # below does not add it a second time.
+                dirlist = os.listdir(pathname)
+                dirlist.remove("__init__.py")
+                # Add all *.py files and package subdirectories
+                for filename in dirlist:
+                    path = os.path.join(pathname, filename)
+                    root, ext = os.path.splitext(filename)
+                    if os.path.isdir(path):
+                        if os.path.isfile(os.path.join(path, "__init__.py")):
+                            # This is a package directory, add it
+                            self.writepy(path, basename)  # Recursive call
+                    elif ext == ".py":
+                        fname, arcname = self._get_codename(path[0:-3],
+                                         basename)
+                        if self.debug:
+                            print "Adding", arcname
+                        self.write(fname, arcname)
+            else:
+                # This is NOT a package directory, add its files at top level
+                if self.debug:
+                    print "Adding files from directory", pathname
+                for filename in os.listdir(pathname):
+                    path = os.path.join(pathname, filename)
+                    root, ext = os.path.splitext(filename)
+                    if ext == ".py":
+                        fname, arcname = self._get_codename(path[0:-3],
+                                         basename)
+                        if self.debug:
+                            print "Adding", arcname
+                        self.write(fname, arcname)
+        else:
+            if pathname[-3:] != ".py":
+                raise RuntimeError, \
+                      'Files added with writepy() must end with ".py"'
+            fname, arcname = self._get_codename(pathname[0:-3], basename)
+            if self.debug:
+                print "Adding file", arcname
+            self.write(fname, arcname)
+
+    def _get_codename(self, pathname, basename):
+        """Return (filename, archivename) for the path.
+
+        Given a module name path, return the correct file path and
+        archive name, compiling if necessary.  For example, given
+        /python/lib/string, return (/python/lib/string.pyc, string).
+
+        An up-to-date .pyo is preferred; otherwise the .pyc is used and is
+        (re)compiled first when it is missing or older than the .py.
+        """
+        file_py  = pathname + ".py"
+        file_pyc = pathname + ".pyc"
+        file_pyo = pathname + ".pyo"
+        if os.path.isfile(file_pyo) and \
+                os.stat(file_pyo).st_mtime >= os.stat(file_py).st_mtime:
+            fname = file_pyo    # Use .pyo file
+        elif not os.path.isfile(file_pyc) or \
+             os.stat(file_pyc).st_mtime < os.stat(file_py).st_mtime:
+            import py_compile
+            if self.debug:
+                print "Compiling", file_py
+            try:
+                py_compile.compile(file_py, file_pyc, None, True)
+            except py_compile.PyCompileError,err:
+                # The compile error is only printed; the .pyc name is
+                # returned regardless.
+                print err.msg
+            fname = file_pyc
+        else:
+            fname = file_pyc
+        archivename = os.path.split(fname)[1]
+        if basename:
+            archivename = "%s/%s" % (basename, archivename)
+        return (fname, archivename)
+
+
+def main(args = None):
+    """Command-line entry point: list (-l), test (-t), extract (-e) or
+    create (-c) a zip archive.
+
+    'args' is the argument list without the program name and defaults to
+    sys.argv[1:].  On a usage error the usage text is printed and the
+    process exits with status 1.
+    """
+    import textwrap
+    USAGE=textwrap.dedent("""\
+        Usage:
+        zipfile.py -l zipfile.zip # Show listing of a zipfile
+        zipfile.py -t zipfile.zip # Test if a zipfile is valid
+        zipfile.py -e zipfile.zip target # Extract zipfile into target dir
+        zipfile.py -c zipfile.zip src ... # Create zipfile from sources
+        """)
+    if args is None:
+        args = sys.argv[1:]
+
+    if not args or args[0] not in ('-l', '-c', '-e', '-t'):
+        print USAGE
+        sys.exit(1)
+
+    if args[0] == '-l':
+        if len(args) != 2:
+            print USAGE
+            sys.exit(1)
+        zf = ZipFile(args[1], 'r')
+        zf.printdir()
+        zf.close()
+
+    elif args[0] == '-t':
+        if len(args) != 2:
+            print USAGE
+            sys.exit(1)
+        zf = ZipFile(args[1], 'r')
+        try:
+            # testzip() returns the name of the first corrupt member, or
+            # None; report it instead of throwing the result away.
+            badfile = zf.testzip()
+        finally:
+            zf.close()
+        if badfile:
+            print "The following enclosed file is corrupted: %r" % badfile
+        print "Done testing"
+
+    elif args[0] == '-e':
+        if len(args) != 3:
+            print USAGE
+            sys.exit(1)
+
+        zf = ZipFile(args[1], 'r')
+        out = args[2]
+        for path in zf.namelist():
+            if path.startswith('./'):
+                tgt = os.path.join(out, path[2:])
+            else:
+                tgt = os.path.join(out, path)
+
+            tgtdir = os.path.dirname(tgt)
+            # tgtdir is '' when the target has no directory part;
+            # os.makedirs('') would fail.
+            if tgtdir and not os.path.exists(tgtdir):
+                os.makedirs(tgtdir)
+            if path.endswith('/'):
+                # Directory entry: created above, nothing to write.
+                continue
+            fp = open(tgt, 'wb')
+            try:
+                fp.write(zf.read(path))
+            finally:
+                fp.close()
+        zf.close()
+
+    elif args[0] == '-c':
+        if len(args) < 3:
+            print USAGE
+            sys.exit(1)
+
+        def addToZip(zf, path, zippath):
+            # Add a file, or walk a directory recursively; anything else
+            # is ignored.
+            if os.path.isfile(path):
+                zf.write(path, zippath, ZIP_DEFLATED)
+            elif os.path.isdir(path):
+                for nm in os.listdir(path):
+                    addToZip(zf,
+                            os.path.join(path, nm), os.path.join(zippath, nm))
+            # else: ignore
+
+        zf = ZipFile(args[1], 'w', allowZip64=True)
+        for src in args[2:]:
+            addToZip(zf, src, os.path.basename(src))
+
+        zf.close()
+
+if __name__ == "__main__":
+    main()
diff --git a/src/obiuniq.py b/src/obiuniq.py
new file mode 100755
index 0000000..63c41c7
--- /dev/null
+++ b/src/obiuniq.py
@@ -0,0 +1,107 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`obiuniq`: groups and dereplicates sequences
+====================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+The :py:mod:`obiuniq` command is in some ways analogous to the standard Unix ``uniq -c`` command.
+
+Instead of working text line by text line as the standard Unix tool, the processing is done on
+sequence records.
+
+A sequence record is a complex object composed of an identifier, a set of
+attributes (``key=value``), a definition, and the sequence itself.
+
+The :py:mod:`obiuniq` command groups together sequence records. Then, for each group, a sequence
+record is printed.
+
+A group is defined by the sequence and optionally by the values of a set of attributes
+specified with the ``-c`` option.
+
+As the identifier, the set of attributes (``key=value``) and the definition of the sequence
+records that are grouped together may be different, two options (``-m`` and ``-i``)
+allow refining how these parts of the records are reported.
+
+ - By default, only attributes with identical values
+ within a group of sequence records are kept.
+
+ - A ``count`` attribute is set to the total number of sequence records for each group.
+
+ - For each attribute specified by the ``-m`` option, a new attribute whose key is prefixed
+ by ``merged_`` is created. These new attributes contain the number of times each value
+ occurs within the group of sequence records.
+
+:py:mod:`obiuniq` and taxonomic information
+-------------------------------------------
+
+When a taxonomy is loaded (``-d`` or ``-t`` options), the ``merged_taxid``
+attribute is created and records the number of times each *taxid* has been found in the
+group (it may be empty if no sequence record has a *taxid* attribute in the group).
+In addition, a set of taxonomy-related attributes are generated for each group having at
+least one sequence record with a *taxid* attribute. The *taxid* attribute of the sequence
+group is set to the last common ancestor of the *taxids* of the group. All other taxonomy-related
+attributes created (``species``, ``genus``, ``family``, ``species_name``, ``genus_name``,
+``family_name``, ``rank``, ``scientific_name``) give information on the last common ancestor.
+
+'''
+
+
+from obitools.format.options import addInputFormatOption
+from obitools.fasta import formatFasta
+from obitools.utils.bioseq import uniqSequence,uniqPrefixSequence
+from obitools.options import getOptionManager
+from obitools.options.taxonomyfilter import addTaxonomyDBOptions
+from obitools.options.taxonomyfilter import loadTaxonomyDatabase
+
+def addUniqOptions(optionManager):
+    """Register the obiuniq-specific command-line options on
+    'optionManager', inside an 'Obiuniq specific options' group.
+
+    Options added: -m/--merge and -c/--category-attribute (repeatable,
+    collected into lists), -i/--merge-ids and -p/--prefix (flags).
+    """
+    group = optionManager.add_option_group('Obiuniq specific options')
+    group.add_option('-m','--merge',
+                     action="append", dest="merge",
+                     metavar="<TAG NAME>",
+                     type="string",
+                     default=[],
+                     help="Attributes to merge")
+
+    group.add_option('-i','--merge-ids',
+                     action="store_true", dest="mergeids",
+                     default=False,
+                     help="Add the merged key with all ids of merged sequences")
+
+    group.add_option('-c','--category-attribute',
+                     action="append", dest="categories",
+                     metavar="<Attribute Name>",
+                     default=[],
+                     help="Add one attribute to the list of attributes "
+                          "used to group sequences before dereplication "
+                          "(option can be used several times)")
+
+    # Adjacent string literals are concatenated as-is, so every fragment
+    # but the last must end with a space ("... are not the prefix ...").
+    group.add_option('-p','--prefix',
+                     action="store_true", dest="prefix",
+                     default=False,
+                     help="Dereplication is done based on prefix matching: "
+                          "(i) The shortest sequence of each group is a prefix "
+                          "of any sequence of its group (ii) Two shortest "
+                          "sequences of any couple of groups are not the "
+                          "prefix of the other one")
+
+
+if __name__=='__main__':
+
+    # Build the command-line parser from the obiuniq, taxonomy and input
+    # format option sets; the module docstring is handed over as progdoc.
+    parser = getOptionManager(
+        [addUniqOptions, addTaxonomyDBOptions, addInputFormatOption],
+        progdoc=__doc__)
+    options, entries = parser()
+
+    taxonomy = loadTaxonomyDatabase(options)
+
+    # Use the prefix-based routine when -p/--prefix was given.
+    dereplicate = uniqPrefixSequence if options.prefix else uniqSequence
+
+    # Write every resulting sequence record to stdout in fasta format.
+    for seq in dereplicate(entries, taxonomy, options.merge,
+                           options.mergeids, options.categories):
+        print formatFasta(seq)
diff --git a/src/oligotag.py b/src/oligotag.py
new file mode 100755
index 0000000..16a56bb
--- /dev/null
+++ b/src/oligotag.py
@@ -0,0 +1,106 @@
+#!/usr/local/bin/python
+'''
+:py:mod:`oligotag`: Designs a set of oligonucleotides with specified properties
+===============================================================================
+
+.. codeauthor:: Eric Coissac <eric.coissac at metabarcoding.org>
+
+
+:py:mod:`oligotag` designs a set of oligonucleotides that can be used for tagging a set
+of samples during PCR reactions, by adding the oligonucleotides on the 5' end of the primers.
+Many options allow designing a set of oligonucleotides according to specified properties.
+
+'''
+
+import sys
+
+from obitools.options import getOptionManager
+
+from obitools import word
+
+from obitools.word.options import addOligoOptions
+from obitools.word.options import dnaWordIterator
+
+from obitools.word import wordDist,decodeWord
+
+from obitools.graph.algorithms.clique import cliqueIterator
+from obitools.graph import Graph
+
+
+def addOligoTagOptions(optionManager):
+    """Register the oligotag-specific option -T/--timeout (an integer,
+    None when not given) on 'optionManager'."""
+    timeout_settings = dict(action="store",
+                            dest="timeout",
+                            metavar="<seconde>",
+                            type="int",
+                            default=None,
+                            help="timeout to identify a clique of good size")
+    optionManager.add_option('-T', '--timeout', **timeout_settings)
+
+
+
+def edgeIterator(words,distmin=1,error=None):
+    """Yield every unordered pair of 'words' whose wordDist() is at least
+    'distmin'.
+
+    Pairs below the threshold are skipped; when 'error' is not None they
+    are printed to it together with their distance.
+    """
+    pool = list(words)
+    count = len(pool)
+
+    for i in xrange(count):
+        first = pool[i]
+        for j in xrange(i+1, count):
+            second = pool[j]
+            distance = wordDist(first, second)
+            if distance >= distmin:
+                yield first, second
+            elif error is not None:
+                print >>error, first, second, distance
+
+
+def readData(edges):
+    """Return a new Graph holding one edge for each (x, y) pair produced
+    by 'edges'."""
+    result = Graph()
+    for first, second in edges:
+        result.addEdge(first, second)
+    return result
+
+
+if __name__=='__main__':
+
+    optionParser = getOptionManager([addOligoOptions,addOligoTagOptions],
+                                    )
+    (options, entries) = optionParser()
+
+    # No report stream is configured here, so pairs rejected by
+    # edgeIterator() are not logged.
+    error = None
+
+    goodOligo = dnaWordIterator(options)
+
+    print >>sys.stderr,"Build good words graph..."
+
+    graph= readData(edgeIterator(goodOligo,options.oligoDist,error))
+
+    print >>sys.stderr,"Initial graph size : %d edge count : %d" % (len(graph),graph.edgeCount())
+    print >>sys.stderr
+
+    ci = cliqueIterator(graph, options.familySize,timeout=options.timeout)
+
+    # Only the first clique is needed.  The try body is kept to the single
+    # call that signals "no clique", so a StopIteration raised while
+    # printing cannot be mistaken for it.
+    try:
+        result = ci.next()
+    except StopIteration:
+        print >>sys.stderr
+        print >>sys.stderr,"-------------------------------------------"
+        print >>sys.stderr
+        print >>sys.stderr,"No solutions for this parameter set"
+        print >>sys.stderr
+        print >>sys.stderr,"-------------------------------------------"
+        print >>sys.stderr
+    else:
+        print >>sys.stderr
+
+        # The loop variable is not called 'word': that name is the
+        # obitools.word module imported at the top of the script.
+        for node in result:
+            print decodeWord(graph.getNode(index=node).getLabel(),options.oligoSize)
+
+
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/obitools.git
More information about the debian-med-commit
mailing list