[med-svn] [falconkit] 01/05: Imported Upstream version 0.4.0

Afif Elghraoui afif-guest at moszumanska.debian.org
Fri Dec 18 08:24:42 UTC 2015


This is an automated email from the git hooks/post-receive script.

afif-guest pushed a commit to branch master
in repository falconkit.

commit 112e8f90ac0e2b206b3c32f940d22bc93688f24d
Author: Afif Elghraoui <afif at ghraoui.name>
Date:   Fri Dec 11 19:05:59 2015 -0800

    Imported Upstream version 0.4.0
---
 .gitignore                                 |    9 +
 .gitmodules                                |    0
 .travis.yml                                |   26 +
 LICENSE                                    |   36 +
 MANIFEST.in                                |    1 +
 README.md                                  |   34 +
 doc/README                                 |    3 +
 doc/falcon_icon.svg                        |   24 +
 doc/falcon_icon2.png                       |  Bin 0 -> 58715 bytes
 examples/build_env.sh                      |   22 +
 examples/build_env2.sh                     |   22 +
 examples/ecoli_asm_graph_exploration.ipynb |  716 ++++++++++++++
 examples/fc_run_LG.cfg                     |   36 +
 examples/fc_run_arab.cfg                   |   36 +
 examples/fc_run_dmel.cfg                   |   36 +
 examples/fc_run_ecoli.cfg                  |   35 +
 examples/fc_run_ecoli_2.cfg                |   35 +
 examples/run_ecoli_test.sh                 |   11 +
 setup.py                                   |   37 +
 src/c/DW_banded.c                          |  319 ++++++
 src/c/Makefile                             |   20 +
 src/c/Makefile.osx                         |   16 +
 src/c/common.h                             |  178 ++++
 src/c/ext_falcon.c                         |   13 +
 src/c/falcon.c                             |  843 ++++++++++++++++
 src/c/kmer_lookup.c                        |  589 ++++++++++++
 src/py/FastaReader.py                      |  260 +++++
 src/py/__init__.py                         |   39 +
 src/py/falcon_kit.py                       |  199 ++++
 src/py/fc_asm_graph.py                     |  212 ++++
 src/py/mains/__init__.py                   |    0
 src/py/mains/actg_coordinate.py            |   27 +
 src/py/mains/consensus.py                  |  240 +++++
 src/py/mains/contig_annotate.py            |   29 +
 src/py/mains/ctg_link_analysis.py          |   80 ++
 src/py/mains/dedup_a_tigs.py               |   22 +
 src/py/mains/graph_to_contig.py            |  297 ++++++
 src/py/mains/graph_to_utgs.py              |  160 +++
 src/py/mains/ovlp_filter.py                |  275 ++++++
 src/py/mains/ovlp_stats.py                 |  117 +++
 src/py/mains/ovlp_to_graph.py              | 1441 ++++++++++++++++++++++++++++
 src/py/mains/run.py                        |  566 +++++++++++
 src/py/mains/tasks.py                      |   31 +
 src/py/multiproc.py                        |   24 +
 src/py/run_support.py                      |  419 ++++++++
 src/py/util/__init__.py                    |    0
 src/py/util/io.py                          |  162 ++++
 src/py_scripts/fc_actg_coordinate.py       |    5 +
 src/py_scripts/fc_consensus.py             |    5 +
 src/py_scripts/fc_contig_annotate.py       |    5 +
 src/py_scripts/fc_ctg_link_analysis.py     |    5 +
 src/py_scripts/fc_dedup_a_tigs.py          |    5 +
 src/py_scripts/fc_graph_to_contig.py       |    5 +
 src/py_scripts/fc_graph_to_utgs.py         |    5 +
 src/py_scripts/fc_ovlp_filter.py           |    5 +
 src/py_scripts/fc_ovlp_stats.py            |    5 +
 src/py_scripts/fc_ovlp_to_graph.py         |    5 +
 src/py_scripts/fc_run.cfg                  |   38 +
 src/py_scripts_v0.1/falcon_asm.py          | 1154 ++++++++++++++++++++++
 src/py_scripts_v0.1/falcon_asm_s.py        | 1220 +++++++++++++++++++++++
 src/py_scripts_v0.1/falcon_dedup.py        |  119 +++
 src/py_scripts_v0.1/falcon_fixasm.py       |  213 ++++
 src/py_scripts_v0.1/falcon_overlap.py      |  328 +++++++
 src/py_scripts_v0.1/falcon_overlap2.py     |  337 +++++++
 src/py_scripts_v0.1/falcon_qrm.py          |  370 +++++++
 src/py_scripts_v0.1/falcon_qrm_0.py        |  378 ++++++++
 src/py_scripts_v0.1/falcon_sense.py        |  248 +++++
 src/py_scripts_v0.1/falcon_ucns_data.py    |  120 +++
 src/py_scripts_v0.1/falcon_utgcns.py       |  124 +++
 src/py_scripts_v0.1/get_ovl.sh             |    7 +
 src/py_scripts_v0.1/get_rdata.py           |  207 ++++
 src/py_scripts_v0.1/overlapper.py          |  216 +++++
 src/py_scripts_v0.1/ovlp_filter.sh         |    6 +
 src/py_scripts_v0.1/redis_graph.py         |   79 ++
 src/py_scripts_v0.1/remove_dup_ctg.py      |   75 ++
 src/utils/fetch_preads.py                  |   70 ++
 test/test_actg_coordinate.py               |    9 +
 test/test_consensus.py                     |    7 +
 test/test_contig_annotate.py               |    9 +
 test/test_ctg_link_analysis.py             |    9 +
 test/test_graph_to_contig.py               |    9 +
 test/test_graph_to_utgs.py                 |    9 +
 test/test_ovlp_filter.py                   |   30 +
 test/test_ovlp_stats.py                    |   29 +
 test/test_ovlp_to_graph.py                 |    7 +
 test/test_run.py                           |    9 +
 test/test_run_LG.py                        |    9 +
 test_data/t1.fa                            |    2 +
 test_data/t1.fofn                          |    1 +
 test_data/t2.fa                            |    2 +
 test_data/t2.fofn                          |    1 +
 travis.sh                                  |   18 +
 92 files changed, 13216 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f8c2ba4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+/build/
+/dist/
+falcon_kit.egg-info/
+*.pyc
+*.pyo
+*.swp
+*.so
+*.dylib
+*.dll
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..e69de29
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..dd67f33
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,26 @@
+# Build matrix / environment variable are explained on:
+# http://about.travis-ci.org/docs/user/build-configuration/
+# This file can be validated on:
+# http://lint.travis-ci.org/
+
+#before_install: sudo apt-get install -y cmake
+# cmake is pre-installed in Travis for both linux and osx
+
+#before_install:
+#  - sudo apt-get update -qq
+#  - sudo apt-get install -qq valgrind
+#sudo: required
+os:
+  - linux
+language: python
+compiler:
+  - clang  # hmm. distutils uses 'gcc' anyway
+#  - gcc
+script: ./travis.sh
+#env:
+#  matrix:
+#    - SHARED_LIB=ON  STATIC_LIB=ON CMAKE_PKG=ON  BUILD_TYPE=release VERBOSE_MAKE=false
+#    - SHARED_LIB=OFF STATIC_LIB=ON CMAKE_PKG=OFF BUILD_TYPE=debug   VERBOSE_MAKE=true VERBOSE
+notifications:
+  email: false
+sudo: false
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..94e4fd5
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,36 @@
+#################################################################################$$
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..5b3144c
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include src/c/*
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0bb7bc3
--- /dev/null
+++ b/README.md
@@ -0,0 +1,34 @@
+Falcon
+===========
+
+Falcon: a set of tools for fast alignment of long reads, for consensus and assembly
+
+The Falcon tool kit is a collection of simple code that I use for studying
+efficient assembly algorithms for haploid and diploid genomes. It has some back-end
+code implemented in C for speed and a simple front-end written in Python for
+convenience.
+
+
+DOCUMENTATION
+-------------
+
+The default branch is now "master", which contains the latest code.
+
+The current latest integrated release is v0.3.0. Check the [v0.3.0 Integration Installation Guide](https://github.com/PacificBiosciences/FALCON-integrate/wiki/Installation-for-v0.3.0) for installation instructions.
+
+For the pre-Jun 2015 v0.2.2 version, please check [v0.2.2 release github repository](https://github.com/PacificBiosciences/FALCON/tree/v0.2.2). We will no longer address issues that are specific to that branch unless they also impact the current master branch.
+
+- [wiki pages](https://github.com/PacificBiosciences/FALCON/wiki)
+- [Developer Installation Guide](https://github.com/PacificBiosciences/FALCON/wiki/Setup:-Installation-and-Environment)
+- [v0.3.0 Integration Installation Guide](https://github.com/PacificBiosciences/FALCON-integrate/wiki/Installation-for-v0.3.0)
+- [Documentation is here.](https://github.com/PacificBiosciences/FALCON/wiki/Manual)
+- [FAQs](https://github.com/PacificBiosciences/FALCON/wiki/FAQs)
+- [v0.2.2 release github repository](https://github.com/PacificBiosciences/FALCON/tree/v0.2.2)
+
+ABOUT THE LICENSE
+------------------
+
+Standard PacBio ["Open Source License"](LICENSE).
+
+July 9th, 2015
+
diff --git a/doc/README b/doc/README
new file mode 100644
index 0000000..d835a13
--- /dev/null
+++ b/doc/README
@@ -0,0 +1,3 @@
+The images here are used by
+
+  https://github.com/PacificBiosciences/FALCON/wiki/Manual
diff --git a/doc/falcon_icon.svg b/doc/falcon_icon.svg
new file mode 100644
index 0000000..8dffb35
--- /dev/null
+++ b/doc/falcon_icon.svg
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0" y="0" width="258.666" height="245.667" viewBox="0, 0, 258.666, 245.667">
+  <defs>
+    <linearGradient id="Gradient_1" gradientUnits="userSpaceOnUse" x1="225.627" y1="404.121" x2="225.627" y2="192.454">
+      <stop offset="0.02" stop-color="#1D3884"/>
+      <stop offset="1" stop-color="#FFFFFF"/>
+    </linearGradient>
+    <linearGradient id="Gradient_2" gradientUnits="userSpaceOnUse" x1="139.88" y1="377.62" x2="313.76" y2="226.54">
+      <stop offset="0.026" stop-color="#D90000"/>
+      <stop offset="1" stop-color="#FFFFFF" stop-opacity="0.798"/>
+    </linearGradient>
+  </defs>
+  <g id="Layer_1" transform="translate(-96.294, -175.454)">
+    <g>
+      <path d="M151.651,192.454 L299.603,192.454 C320.787,192.454 337.96,208.634 337.96,228.592 L337.96,367.982 C337.96,387.941 320.787,404.121 299.603,404.121 L151.651,404.121 C130.467,404.121 113.294,387.941 113.294,367.982 L113.294,228.592 C113.294,208.634 130.467,192.454 151.651,192.454 z" fill="url(#Gradient_1)"/>
+      <path d="M151.651,192.454 L299.603,192.454 C320.787,192.454 337.96,208.634 337.96,228.592 L337.96,367.982 C337.96,387.941 320.787,404.121 299.603,404.121 L151.651,404.121 C130.467,404.121 113.294,387.941 113.294,367.982 L113.294,228.592 C113.294,208.634 130.467,192.454 151.651,192.454 z" fill-opacity="0" stroke="#FFFFFF" stroke-width="9"/>
+    </g>
+    <g>
+      <path d="M144.25,337.917 C144.25,337.917 163.5,314.5 210.833,301.833 C232.522,296.029 309.5,281.167 309.5,281.167 C309.5,281.167 284.667,318.5 272.167,331.833 C259.667,345.167 253,328 218.5,337.5 C184,347 171.833,370.167 171.833,370.167 C171.833,370.167 151.833,373.833 133.833,382.833 C115.833,391.833 109.04,410.241 107.833,408.833 C105.629,406.262 98.75,380.75 110.75,364.75 C122.75,348.75 129,349.583 134.25,332.25 C139.5,314.917 135.5,290.5 177.5,258.5 C219.5,226.5 347.583,195.25  [...]
+      <path d="M144.25,337.917 C144.25,337.917 163.5,314.5 210.833,301.833 C232.522,296.029 309.5,281.167 309.5,281.167 C309.5,281.167 284.667,318.5 272.167,331.833 C259.667,345.167 253,328 218.5,337.5 C184,347 171.833,370.167 171.833,370.167 C171.833,370.167 151.833,373.833 133.833,382.833 C115.833,391.833 109.04,410.241 107.833,408.833 C105.629,406.262 98.75,380.75 110.75,364.75 C122.75,348.75 129,349.583 134.25,332.25 C139.5,314.917 135.5,290.5 177.5,258.5 C219.5,226.5 347.583,195.25  [...]
+    </g>
+  </g>
+</svg>
diff --git a/doc/falcon_icon2.png b/doc/falcon_icon2.png
new file mode 100644
index 0000000..38b3abd
Binary files /dev/null and b/doc/falcon_icon2.png differ
diff --git a/examples/build_env.sh b/examples/build_env.sh
new file mode 100644
index 0000000..f2663a3
--- /dev/null
+++ b/examples/build_env.sh
@@ -0,0 +1,22 @@
+virtualenv --no-site-packages  --always-copy   $PWD/fc_env
+. $PWD/fc_env/bin/activate
+git clone https://github.com/pb-jchin/pypeFLOW
+cd pypeFLOW
+python setup.py install
+
+cd ..
+git clone https://github.com/PacificBiosciences/FALCON.git
+cd FALCON
+python setup.py install
+
+cd ..
+git clone https://github.com/pb-jchin/DAZZ_DB.git
+cd DAZZ_DB/
+make
+cp DBrm DBshow DBsplit DBstats fasta2DB ../fc_env/bin/
+
+cd ..
+git clone https://github.com/pb-jchin/DALIGNER.git
+cd DALIGNER
+make
+cp daligner daligner_p DB2Falcon HPCdaligner LA4Falcon LAmerge LAsort  ../fc_env/bin
diff --git a/examples/build_env2.sh b/examples/build_env2.sh
new file mode 100644
index 0000000..a5e404c
--- /dev/null
+++ b/examples/build_env2.sh
@@ -0,0 +1,22 @@
+virtualenv --no-site-packages  --always-copy   $PWD/fc_env
+. $PWD/fc_env/bin/activate
+
+cd FALCON
+git submodule init
+git submodule update
+
+cd pypeFLOW
+python setup.py install
+cd ..
+
+python setup.py install
+
+cd DAZZ_DB/
+make
+cp DBrm DBshow DBsplit DBstats fasta2DB ../../fc_env/bin/
+cd ..
+
+cd DALIGNER
+make
+cp daligner daligner_p DB2Falcon HPCdaligner LA4Falcon LAmerge LAsort  ../../fc_env/bin
+cd ../..
diff --git a/examples/ecoli_asm_graph_exploration.ipynb b/examples/ecoli_asm_graph_exploration.ipynb
new file mode 100644
index 0000000..1a7160f
--- /dev/null
+++ b/examples/ecoli_asm_graph_exploration.ipynb
@@ -0,0 +1,716 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:585e8268126c8365dec7b581d0673bd9a5a1e83a88800e51c0f9028479318042"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "heading",
+     "level": 1,
+     "metadata": {},
+     "source": [
+      "FALCON Assembly Graph Processing and Visulization Example\n"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "This notebook shows how to fetch the graph data from the assembler and show how to examine the bubbles in the assembly graph."
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "You will need to have `matplotlib`, `networkx` and `pygraphviz` as Python libraries. Of course, you will need to have `graphviz` command line tool install in your system too. "
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Some setup to change the working directory to where the data is."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "%cd /mnt/projects/bifxanalysis/jchin_asm_jobs/DA_develop_test/Ecoli/2-asm-falcon/\n",
+      "%matplotlib inline\n",
+      "%pylab inline"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "/mnt/projects/bifxanalysis/jchin_asm_jobs/DA_develop_test/Ecoli/2-asm-falcon\n",
+        "Populating the interactive namespace from numpy and matplotlib\n"
+       ]
+      }
+     ],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Some boilerplate code for loading the necessary module and classes"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import networkx as nx\n",
+      "from IPython.display import display, HTML, SVG, Image"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Load the `AsmGraph` class which parse the graph data files and load them into python objects. "
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from falcon_kit.fc_asm_graph import AsmGraph"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 3
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Load data into `G_asm`. "
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "G_asm = AsmGraph(\"sg_edges_list\", \"utg_data\", \"ctg_paths\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 4
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Show all contig identifiers. The `000000R` is the dual contig of `000000F`."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print G_asm.ctg_data.keys()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "['000000F', '000000R']\n"
+       ]
+      }
+     ],
+     "prompt_number": 5
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Print out the content of the contig `000000F`. The output is a tuple of `contig identifier`, `start_utg`, `end_node`, `number of base`, `number of overlapped base`, `unitigs`."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print G_asm.ctg_data[\"000000F\"]\n",
+      "print\n",
+      "print \"number of unitigs in the contig:\", len(G_asm.ctg_data[\"000000F\"][-1])"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "('ctg_linear', '000011445:B~NA~000003425:E', '000011445:B', 4642293, 28970832, (['000011445:B', 'NA', '000003425:E'], ['000003425:E', '000009719:E', '000006955:E'], ['000006955:E', 'NA', '000017147:B'], ['000017147:B', '000012473:B', '000010757:B'], ['000010757:B', 'NA', '000015636:E'], ['000015636:E', '000004093:E', '000015696:B'], ['000015696:B', 'NA', '000016941:B'], ['000016941:B', '000003353:B', '000008783:B'], ['000008783:B', 'NA', '000006338:B'], ['000006338:B', '00000493 [...]
+        "\n",
+        "number of unitigs in the contig: 16\n"
+       ]
+      }
+     ],
+     "prompt_number": 6
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "If the starting node is the same of the ending node, then the contig may be circular too. (The contig type only applys to \"simple unitig\". Any contig contains more than one single unitig is classfied as `ctg_linear`. This convention may change in the futrue."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "utgs = G_asm.ctg_data[\"000000F\"][-1]\n",
+      "if utgs[0][0] == utgs[-1][-1]:\n",
+      "    print \"the contig is a cirular\"\n",
+      "else:\n",
+      "    print \"thec contig is not circular\""
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "the contig is a cirular\n"
+       ]
+      }
+     ],
+     "prompt_number": 7
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Check the number of node in the string subgraph for this contig. We can get the full string subgraph using the method `get_sg_for_ctg()`.  This is full string graph. Each node is 5' or 3' ends of a read."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "sg = G_asm.get_sg_for_ctg(\"000000F\")\n",
+      "print \"number of nodes:\", len(sg.nodes())\n",
+      "print \"number of edges:\", len(sg.edges())"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "number of nodes: 1361\n",
+        "number of edges: 1369\n"
+       ]
+      }
+     ],
+     "prompt_number": 8
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Call `neato` layout code from Graphviz to layout the contig string graph. We can see a circle with a couple of small bubbles."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "position=nx.graphviz_layout(sg, prog='neato') "
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 9
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "figure(figsize=(10,10))\n",
+      "ax=subplot(1,1,1)\n",
+      "minx = 10\n",
+      "miny = 10\n",
+      "count = 0\n",
+      "for e in sg.edges():\n",
+      "    yy, xx = zip(position[e[0]], position[e[1]]) \n",
+      "\n",
+      "    xx = -array(xx)\n",
+      "    yy = -array(yy)\n",
+      "    ax.plot( xx, yy, \".-b\" ) \n",
+      "\n",
+      "    if min(xx) < minx:\n",
+      "        minx = min(xx)\n",
+      "    if min(yy) < miny:\n",
+      "        miny = min(yy)\n",
+      "\n",
+      "    #print x,y\n",
+      "xlim(minx*1.1,-minx*0.1)\n",
+      "ylim(miny*1.1,-miny*0.1)\n",
+      "ax.get_xaxis().set_visible(False)\n",
+      "ax.get_yaxis().set_visible(False)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAjwAAAI8CAYAAAD1D3GaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3U13HNWZB/CnLQdhiGNJSIkA2y1b2NZq1iwI38KLfMKc\nIxazmy8gLziD59iZ1WCDsYCJjV+wgYRX2z2LSo+6ql+klrq7qu79/c7hRFUtoIIl9V/P89x7O71e\nLwAAUnaq7gcAAJg3gQcASJ7AAwAkT+ABAJIn8AAAyRN4AIDknZ70YqfTsWYdAGiNXq/XGXV/YuD5\n1984+6cBAJixTmdk1okILS0AIAMCDwCQPIEHAEiewAMAJE/gAQCSJ/AAAMkTeACA5Ak8AEDyBB4A\nIHkCDwCQPIEHAEiewAMAJE/gAQCSJ/AAAMkTeACA5Ak8AEDyBB4AIHkCDwCQPIEHAEiewAMAJE/g\nAQCSJ/AAA [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a28ac310>"
+       ]
+      }
+     ],
+     "prompt_number": 10
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Create the assembly graph which each edge is a simple or a compound unitig rather than a raw string graph edge. "
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "utgs = G_asm.ctg_data[\"000000F\"][-1]\n",
+      "utg_g = nx.DiGraph()\n",
+      "for s,v,t in utgs:\n",
+      "    utg_g.add_edge(s,t)\n",
+      "position=nx.graphviz_layout(utg_g, prog='neato') \n",
+      "figure(figsize=(10,10))\n",
+      "ax=subplot(1,1,1)\n",
+      "minx = 10\n",
+      "miny = 10\n",
+      "count = 0\n",
+      "for e in utg_g.edges():\n",
+      "    yy, xx = zip(position[e[0]], position[e[1]]) \n",
+      "\n",
+      "    xx = -array(xx)\n",
+      "    yy = -array(yy)\n",
+      "\n",
+      "    ax.plot( xx, yy, \".-b\" ) \n",
+      "\n",
+      "    if min(xx) < minx:\n",
+      "        minx = min(xx)\n",
+      "    if min(yy) < miny:\n",
+      "        miny = min(yy)\n",
+      "\n",
+      "    #print x,y\n",
+      "xlim(minx*1.1,-minx*0.1)\n",
+      "ylim(miny*1.1,-miny*0.1)\n",
+      "ax.get_xaxis().set_visible(False)\n",
+      "ax.get_yaxis().set_visible(False)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAjwAAAI8CAYAAAD1D3GaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xd4lWW2/vF7Q2ihoyAKCIoUCzI4YKGLAoJKFRRQSqLY\nxgLzG71mPE7RGR05E3DErkmoUlQQREGUJqAOAwoWBkVFiiDSQTrJ/v2xJieWEDbJ3vvZ+3m/n+vi\nQoSZs847Ibmz3udZKxQOhwUAAOCzEq4LAAAAiDUCDwAA8B6BBwAAeI/AAwAAvEfgAQAA3iPwAAAA\n76UU9puhUIg76wAAIGmEw+FQQf++0MDz3/9g9KsBAACIslCowKwjiVdaAAAgAAg8AADAewQeAADg\nPQIPAADwHoEHAAB4j8ADAAC8R+ABAADeI/AAAADvEXgAAID3CDwAAMB7BB4AAOA9Ag8AAPAegQcA\nAHiPwAMAA [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc19bc98550>"
+       ]
+      }
+     ],
+     "prompt_number": 11
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Load sequences for each string graph edges so we can generate sequence given a path in the graph."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "G_asm.load_sg_seq(\"preads4falcon.fasta\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 12
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "This shows how to get sequences for each simple or compound unitigs with a contig."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "compound_path_seqs = {}\n",
+      "print \"s\", \"v\", \"t\", \"type\", \"seq len\", \"path len\"\n",
+      "for s,v,t in G_asm.ctg_data[\"000000F\"][-1]:\n",
+      "    utg_data = G_asm.utg_data[(s,t,v)]\n",
+      "    type_, length, score, path_or_edges = utg_data\n",
+      "    \n",
+      "    if type_ == \"simple\":\n",
+      "        path = path_or_edges.split(\"~\")\n",
+      "        seq = G_asm.get_seq_from_path( path )\n",
+      "        print s, v, t, utg_data[0], len(seq)\n",
+      "    else:\n",
+      "        c_graph = nx.DiGraph()\n",
+      "        simple_utgs = [ e.split(\"~\") for e in path_or_edges.split(\"|\")]\n",
+      "        \n",
+      "        for ss, tt, vv in simple_utgs:\n",
+      "            type_, length, score, sub_path = G_asm.utg_data[ (ss,vv,tt) ]\n",
+      "            sub_path = sub_path.split(\"~\")\n",
+      "            v1 = sub_path[0]\n",
+      "            for v2 in sub_path[1:]:\n",
+      "                c_graph.add_edge( v1, v2, score = 10000000 - G_asm.sg_edges[ (v1, v2) ][1]  )\n",
+      "                v1 = v2\n",
+      "        seqs = []\n",
+      "        while 1:\n",
+      "            try:\n",
+      "                shortest_path = nx.shortest_path( c_graph, s, t, weight = \"score\" )\n",
+      "            except nx.exception.NetworkXNoPath:\n",
+      "                break\n",
+      "            seq = G_asm.get_seq_from_path( shortest_path )\n",
+      "            seqs.append( (seq, shortest_path) )\n",
+      "            \n",
+      "            n0 = shortest_path[0]\n",
+      "            for n1 in shortest_path[1:]:\n",
+      "                c_graph.remove_edge(n0, n1)\n",
+      "                n0 = n1\n",
+      "        \n",
+      "        compound_path_seqs[(s,v,t)] = seqs\n",
+      "        for seq, subpath in seqs:            \n",
+      "            print s, v, t, utg_data[0], len(seq), len(subpath)\n"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "s v t type seq len path len\n",
+        "000011445:B NA 000003425:E compound 2736 3\n",
+        "000011445:B NA 000003425:E compound 2681 3\n",
+        "000003425:E 000009719:E 000006955:E simple 84196\n",
+        "000006955:E NA 000017147:B compound 26315 7\n",
+        "000006955:E NA 000017147:B compound 26357 7\n",
+        "000017147:B 000012473:B 000010757:B simple 601191\n",
+        "000010757:B NA 000015636:E compound 16169 3\n",
+        "000010757:B NA 000015636:E compound 16194 3\n",
+        "000015636:E 000004093:E 000015696:B simple 1469288\n",
+        "000015696:B NA 000016941:B compound 16458 3\n",
+        "000015696:B NA 000016941:B compound 16438 8\n",
+        "000016941:B 000003353:B 000008783:B simple 88381\n",
+        "000008783:B NA 000006338:B compound 26074 3\n",
+        "000008783:B NA 000006338:B compound 26058 10\n",
+        "000006338:B 000004932:B 000010623:B simple 206164\n",
+        "000010623:B NA 000014991:B compound 30158 3\n",
+        "000010623:B NA 000014991:B compound 30148 8\n",
+        "000014991:B 000013790:E 000002926:B simple 392373\n",
+        "000002926:B NA 000011761:B compound 25736 3\n",
+        "000002926:B NA 000011761:B compound 25814 12\n",
+        "000011761:B 000003659:E 000014184:E simple 184084\n",
+        "000014184:E NA 000012028:E compound 14792 3\n",
+        "000014184:E NA 000012028:E compound 14895 4\n",
+        "000012028:E 000013461:E 000011445:B simple 1447372\n"
+       ]
+      }
+     ],
+     "prompt_number": 13
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      " Boilerplate code for using an aligner within `falcon_kit` for dot plots and alignment."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from falcon_kit import kup, falcon, DWA\n",
+      "rcmap = dict(zip(\"ACGTacgtNn-\",\"TGCATGCANN-\"))\n",
+      "def rc(seq):\n",
+      "    return \"\".join([rcmap[c] for c in seq[::-1]])\n",
+      "\n",
+      "def get_aln_data(t_seq, q_seq):\n",
+      "    aln_data = []\n",
+      "    K = 8\n",
+      "    seq0 = t_seq\n",
+      "    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )\n",
+      "    sa_ptr = kup.allocate_seq( len(seq0) )\n",
+      "    sda_ptr = kup.allocate_seq_addr( len(seq0) )\n",
+      "    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)\n",
+      "    q_id = \"dummy\"\n",
+      "    \n",
+      "    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr, lk_ptr)\n",
+      "    kmer_match = kmer_match_ptr[0]\n",
+      "    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 12)\n",
+      "    aln_range = aln_range_ptr[0]\n",
+      "    x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count)] )\n",
+      "    kup.free_kmer_match(kmer_match_ptr)\n",
+      "    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2\n",
+      "    \n",
+      "    if e1 - s1 > 100:\n",
+      "\n",
+      "        alignment = DWA.align(q_seq[s1:e1], e1-s1,\n",
+      "                              seq0[s2:e2], e2-s2,\n",
+      "                              100,1)\n",
+      "\n",
+      "        if alignment[0].aln_str_size > 100:\n",
+      "            aln_data.append( ( q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist ) )\n",
+      "            aln_str1 = alignment[0].q_aln_str\n",
+      "            aln_str0 = alignment[0].t_aln_str\n",
+      "\n",
+      "        DWA.free_alignment(alignment)\n",
+      "\n",
+      "    kup.free_kmer_lookup(lk_ptr)\n",
+      "    kup.free_seq_array(sa_ptr)\n",
+      "    kup.free_seq_addr_array(sda_ptr)\n",
+      "    return aln_data, x, y"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 14
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Plot the subgraph of each compound path and do alignment between different subpaths within the compound path.  One can see 7 out of 8 compound path might caused by errors where the identity between the alternative paths are high. However, the path starts at `000006955:E` and ends at `000017147:B` shows a 2kb inversion. This appears to be realy bioligcal polymorphism in a population of E. coli cells."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print \"s\", \"v\", \"t\", \"aln_identity\", \"aln_coverage\"\n",
+      "\n",
+      "for s,v,t in compound_path_seqs:\n",
+      "    \n",
+      "    sg = G_asm.get_sg_for_utg((s,t,v))\n",
+      "    position=nx.graphviz_layout(sg, prog='dot') \n",
+      "    figure(figsize=(10,4))\n",
+      "    ax=subplot(1,2,1)\n",
+      "    minx = 10\n",
+      "    miny = 10\n",
+      "    count = 0\n",
+      "    for e in sg.edges():\n",
+      "        yy, xx = zip(position[e[0]], position[e[1]]) \n",
+      "        col = \"k\"\n",
+      "        xx = -array(xx)\n",
+      "        yy = -array(yy)\n",
+      "        ax.plot( xx, yy, \".-\"+col ) \n",
+      "        if min(xx) < minx:\n",
+      "            minx = min(xx)\n",
+      "        if min(yy) < miny:\n",
+      "            miny = min(yy)\n",
+      "\n",
+      "        #print x,y\n",
+      "    xlim(minx*1.1,-minx*0.1)\n",
+      "    ylim(miny*1.1,-miny*0.1)\n",
+      "    ax.get_xaxis().set_visible(False)\n",
+      "    ax.get_yaxis().set_visible(False)\n",
+      "    \n",
+      "    seqs = compound_path_seqs[(s,v,t)]\n",
+      "    seq0 = seqs[0][0]\n",
+      "    ax=subplot(1,2,2)\n",
+      "    for seq, path in seqs[1:]:\n",
+      "        aln_data, x, y = get_aln_data(seq0, seq)\n",
+      "        print s,v,t, 1.0-1.0*aln_data[-1][-1] / aln_data[-1][-2], 1.0*(aln_data[-1][3]-aln_data[-1][2])/aln_data[-1][4]\n",
+      "        \n",
+      "        plot(x, y,'.b', markersize=0.2)\n",
+      "        seq = rc(seq)\n",
+      "        aln_data, rx, ry = get_aln_data(seq0, seq)\n",
+      "        rx = np.array(rx)\n",
+      "        rx = len(seq) - rx\n",
+      "        plot(rx, ry,'.r', markersize=0.2)\n",
+      "        text(0, 0, \"%s %s %s\" % (s, v, t))\n",
+      "        \n",
+      "        \n",
+      "        \n"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "s v t aln_identity aln_coverage\n",
+        "000011445:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000003425:E 0.97798972854 0.995151063036\n",
+        "000010757:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000015636:E 0.99697456162 0.999382487341\n",
+        "000014184:E"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000012028:E 0.98953722334 0.999261497147\n",
+        "000006955:E"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000017147:B 0.9952 0.923473839967\n",
+        "000015696:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000016941:B 0.995871782419 0.999391653486\n",
+        "000002926:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000011761:B 0.990612686394 0.999612613311\n",
+        "000008783:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000006338:B 0.99254387642 0.999616240694\n",
+        "000010623:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000014991:B 0.992332352844 0.999601963646\n"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAkgAAAEACAYAAABI/YkzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xd4VFX+x/H3CUWqIlUgCEhZpKTQIooSRFAEUVAElGZD\nkea6Ls0fK3aBVdRVgaUXRVQUQaQJRKQIJCZBybLomiAgEPpKIguE8/tjZmIIoYXJ3Cmf1/PMw82d\nOzPfuQl3PnPuOecaay0iIiIi8ocwpwsQERER8TcKSCIiIiK5KCCJiIiI5KKAJCIiIpKLApKIiIhI\nLgpIIiIiIrkoIImIzxhjihljNhpjkowxKcaYV93ryxpjVhhjthtjlhtjyuR4zAhjzI/GmG3GmHY5\n1jcxxnzvvu8tJ96PiAQvBSQR8Rlr7XGgtbU2CogAWhtjWgLDgRXW2rrASvfPGGPqA92A+sAdwHvG\nGON+ugnAI [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a3fb6750>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXm4FMXVuN8CRaVdEPWaCC6YoEFlc0OUyDVR1E8+F9xA\nWUVBENQYNAK/KCrcCBIRDReDgLK4gEsEUQFFrwgim3A1ELcILrhcxSXaxo/F+v0xPXN7enqp7ulZ\n7tx6n8fHudVVp05VN1NnzjlVLaSUaDQajUaj0WjcaVBoBTQajUaj0WiKGW0saTQajUaj0figjSWN\nRqPRaDQaH7SxpNFoNBqNRuODNpY0Go1Go9FofNDGkkaj0Wg0Go0P2ljSaDSxI4SYLoT4Qgjxlq3s\nRCHEKiHEOiHEaiHECbZrw4UQ7wkh3hZCdLGVHyeEeMu6NtFWvpsQYo5V/roQ4tD8jU6j0dQ3tLGk\n0WhywYPAW [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc19bb58850>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXmYFcXVuN9iFa8EFxxUcOGLS8SoUYMY12FVIjKyBBQX\nFBQDimAinzgmMqiACPLhLkRRiKLgikEEQZ2IMYpLfqgggigqKIyiBrmyU78/7jI9fXup3m7fO1Pv\n8/h4p7qW09VN1+lzTp0WUko0Go1Go9FoNNbUi1sAjUaj0Wg0mkJGK0sajUaj0Wg0DmhlSaPRaDQa\njcYBrSxpNBqNRqPROKCVJY1Go9FoNBoHtLKk0Wg0Go1G44BWljQaTegIIaYJITYIIT40lQ8VQnws\nhPhICDHeUH6jEGKVEGKFEKKLofwkIcSH6WN3GcobCyFmpcvfEkIcmp8z02g0dRGtLGk0mih4BDjH\nWCCEaA90B [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a2a7f050>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsvX2cXkV5N/4dEgLsPJQK2WMRYrI1UAg/g5QikaKGgihP\nKolCeRNKSagVBALaKCZEWUnCawXyRN5UJLyIVBC0jWABjTSi8vKAtbzGms0PqHA2BMXMUiQyzx9n\nZs41c2bmnHuzm3v33vl+xL3vc+blmjkn9/U93+s6M0xKiYSEhISEhISEBD+2abcBCQkJCQkJCQkj\nGYksJSQkJCQkJCREkMhSQkJCQkJCQkIEiSwlJCQkJCQkJESQyFJCQkJCQkJCQgSJLCUkJCQkJCQk\nRJDIUkJCwpCCMbY9Y+xnjLHHGWNPMsYuVMd3Zozdyxh7ljH2b4yxPyZ1PscYW8sYe5oxdjg5vj9j\n7Bfq3JXk+ [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a2adab90>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnX28FVW9/9/LCJH54VN6NEUevFmKkVdNEVM4ZoJ0EZLL\nw6EiFIoQU7yFJXCTowWJcE20pEuJiS/lHLxCGAmS6BEx5EHNFLS0wJCUo5APDClP6/fHzOwze/Y8\nrJk9e88++6z36+XLfWbWw3fWbGZ99vf7XWuElBKNRqPRaDQajT+HZG2ARqPRaDQaTSWjxZJGo9Fo\nNBpNCFosaTQajUaj0YSgxZJGo9FoNBpNCFosaTQajUaj0YSgxZJGo9FoNBpNCFosaTSa1BFCzBdC\n7BBCvOg6dq4QYr0Q4nkhxAYhxDmuc5OFEK8KIV4RQvRzHT9bCPGifW6O6/ihQohG+/gzQoiu5bs6\njUbT1tBiS [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc19bf9d190>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsvXmcHUW5///uAIFMiWyZw04yegHFSxBzES8iBMImMSTI\nBcKenBG8ol9A9ApBUJCfqKC43S+4ZUgEgiBwWZQdjahfkF2QyHbvJDcGoSeAYmrCkqR/f3RXd1V1\nVfeZkGQmZ+rzemHO6a7lqep2ns/5PE9VRUmSEBAQEBAQEBAQ4MaIwTYgICAgICAgIGAoI5ClgICA\ngICAgIAKBLIUEBAQEBAQEFCBQJYCAgICAgICAioQyFJAQEBAQEBAQAUCWQoICAgICAgIqEAgSwEB\nAasVURRtFEXRH6IoejyKovlRFH0tu755FEV3R1H0bBRFd0VRtKlWZ2YURc9FUfR0FEUHadfHR1H0\nZHbvu9r1D [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc19bbd2d90>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsvXmcXUWZPv5UiGC6WATSV1mT/DDMDGtIWEZHsBloBFGD\nCCSKECYBBsK+ClGWyKYsIsg2CAyEXUHZhQQ0aBggLEEMiyPf6WRCWE4HUEhdcAKp3x/nVJ236lTV\nObfT3bf7dj2fcXLvObW8VefQ73Of960qJqVERERERERERESEG8OabUBERERERERExEBGJEsRERER\nEREREQFEshQREREREREREUAkSxERERERERERAUSyFBERERERERERQCRLEREREREREREBRLIUERHR\nq2CMfZox9jRj7AXG2MuMsQuy6+sxxuYwxv6bMTabMfYZUud0xthfGGOvMsb2INcnMMb+lN27jFxf\ngzF2Z3b9K [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a01bafd0>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsvXu8JUV57/0tLsqs4ibs1YoOzN6vDkGIvioxoJJkUBFN\njgocDqhBkdscJXh7o0bBzxGNlyiexLvJiDdUVKJGPaDGWxA4hssQiApBjdl7HBDptQdBqTUgYL9/\n1KWfqq5ee4MzbNhTv88HZ63u6uqnqtv9/Nbveeop1TQNBQUFBQUFBQUFeWy31AYUFBQUFBQUFNyf\nUchSQUFBQUFBQcEEFLJUUFBQUFBQUDABhSwVFBQUFBQUFExAIUsFBQUFBQUFBRNQyFJBQUFBQUFB\nwQQUslRQULBFoZTaSSl1mVLqaqXUtUqpt7vjZyqlrldKXeX+e5a45vVKqZ8opa5TSj1DHD9QKfUD\nd+494viDl [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a0311e90>"
+       ]
+      }
+     ],
+     "prompt_number": 15
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Author: Jason Chin, Dec. 5, 2014"
+     ]
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file
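
For readers who want the gist of the notebook above without opening IPython, here is a minimal stand-alone sketch (an editorial condensation, not one of the files added by this commit). Like the notebook, it assumes falcon_kit is installed and that it runs inside a `2-asm-falcon` output directory containing `sg_edges_list`, `utg_data` and `ctg_paths`; Python 2 syntax matches the rest of the code base.

    # Condensed from the example notebook above.
    from falcon_kit.fc_asm_graph import AsmGraph

    # Parse the assembly graph files produced by the assembler.
    G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")

    for ctg_id in sorted(G_asm.ctg_data):
        utgs = G_asm.ctg_data[ctg_id][-1]      # list of (start, via, end) unitig triples
        circular = utgs[0][0] == utgs[-1][-1]  # same start and end node => candidate circular contig
        print ctg_id, "unitigs:", len(utgs), "circular:", circular
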
diff --git a/examples/fc_run_LG.cfg b/examples/fc_run_LG.cfg
new file mode 100755
index 0000000..0615ce9
--- /dev/null
+++ b/examples/fc_run_LG.cfg
@@ -0,0 +1,36 @@
+[General]
+# list of files of the initial bas.h5 files
+input_fofn = input.fofn
+#input_fofn = preads.fofn
+
+input_type = raw
+#input_type = preads
+
+# The length cutoff used for seed reads used for initial mapping
+length_cutoff = 10000
+
+# The length cutoff used for seed reads used for pre-assembly
+length_cutoff_pr = 7000
+
+
+jobqueue = bigmem
+sge_option_da = -pe smp 4 -q %(jobqueue)s
+sge_option_la = -pe smp 16 -q %(jobqueue)s
+sge_option_pda = -pe smp 8 -q %(jobqueue)s
+sge_option_pla = -pe smp 16 -q %(jobqueue)s
+sge_option_fc = -pe smp 24 -q %(jobqueue)s
+sge_option_cns = -pe smp 16 -q %(jobqueue)s
+
+pa_concurrent_jobs = 96
+cns_concurrent_jobs = 96
+ovlp_concurrent_jobs = 96
+
+pa_HPCdaligner_option =  -v -dal128 -t16 -e.70 -l1000 -s1000
+ovlp_HPCdaligner_option = -v -dal128 -t32 -h60 -e.96 -l500 -s1000
+
+pa_DBsplit_option = -x500 -s400
+ovlp_DBsplit_option = -x500 -s400
+
+falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 2 --max_n_read 200 --n_core 16
+
+overlap_filtering_setting = --max_diff 60 --max_cov 60 --min_cov 2 --n_core 24
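
A note on the `%(jobqueue)s` references in this and the following config files: that is the interpolation syntax of Python's standard ConfigParser, so a single `jobqueue` value expands into every `sge_option_*` line of the `[General]` section. A minimal sketch of the mechanism (assuming, as the syntax suggests, that fc_run.py reads these files with the stock ConfigParser; Python 2 style to match the code base):

    import ConfigParser
    import StringIO

    cfg_text = (
        "[General]\n"
        "jobqueue = bigmem\n"
        "sge_option_da = -pe smp 4 -q %(jobqueue)s\n"
    )

    cfg = ConfigParser.SafeConfigParser()
    cfg.readfp(StringIO.StringIO(cfg_text))
    # get() interpolates %(jobqueue)s from the same section:
    print cfg.get("General", "sge_option_da")   # -> -pe smp 4 -q bigmem
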
diff --git a/examples/fc_run_arab.cfg b/examples/fc_run_arab.cfg
new file mode 100644
index 0000000..8f5bc80
--- /dev/null
+++ b/examples/fc_run_arab.cfg
@@ -0,0 +1,36 @@
+[General]
+# list of files of the initial bas.h5 files
+input_fofn = input.fofn
+#input_fofn = preads.fofn
+
+input_type = raw
+#input_type = preads
+
+# The length cutoff used for seed reads used for initial mapping
+length_cutoff = 15000
+
+# The length cutoff used for seed reads used for pre-assembly
+length_cutoff_pr = 15000
+
+
+jobqueue = your_queue
+sge_option_da = -pe smp 8 -q %(jobqueue)s
+sge_option_la = -pe smp 2 -q %(jobqueue)s
+# 6 seems too small... 8 might be better for Dmel
+sge_option_pda = -pe smp 8 -q %(jobqueue)s
+sge_option_pla = -pe smp 2 -q %(jobqueue)s
+sge_option_fc = -pe smp 24 -q %(jobqueue)s
+sge_option_cns = -pe smp 8 -q %(jobqueue)s
+
+pa_concurrent_jobs = 32
+ovlp_concurrent_jobs = 32
+
+pa_HPCdaligner_option =  -v -dal128 -t16 -e.70 -l1000 -s1000
+ovlp_HPCdaligner_option = -v -dal128 -t32 -h60 -e.96 -l500 -s1000
+
+pa_DBsplit_option = -x500 -s400
+ovlp_DBsplit_option = -x500 -s400
+
+falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 2 --max_n_read 200 --n_core 6
+
+overlap_filtering_setting = --max_diff 100 --max_cov 100 --min_cov 1 --bestn 10 --n_core 24
diff --git a/examples/fc_run_dmel.cfg b/examples/fc_run_dmel.cfg
new file mode 100644
index 0000000..2623812
--- /dev/null
+++ b/examples/fc_run_dmel.cfg
@@ -0,0 +1,36 @@
+[General]
+# list of files of the initial bas.h5 files
+input_fofn = input.fofn
+#input_fofn = preads.fofn
+
+input_type = raw
+#input_type = preads
+
+# The length cutoff used for seed reads used for initial mapping
+length_cutoff = 12000
+
+# The length cutoff used for seed reads used for pre-assembly
+length_cutoff_pr = 12000
+
+
+jobqueue = your_queue
+sge_option_da = -pe smp 8 -q %(jobqueue)s
+sge_option_la = -pe smp 2 -q %(jobqueue)s
+# 6 seems too small... 8 might be better for Dmel
+sge_option_pda = -pe smp 8 -q %(jobqueue)s
+sge_option_pla = -pe smp 2 -q %(jobqueue)s
+sge_option_fc = -pe smp 24 -q %(jobqueue)s
+sge_option_cns = -pe smp 8 -q %(jobqueue)s
+
+pa_concurrent_jobs = 32
+ovlp_concurrent_jobs = 32
+
+pa_HPCdaligner_option =  -v -dal128 -t16 -e.70 -l1000 -s1000
+ovlp_HPCdaligner_option = -v -dal128 -t32 -h60 -e.96 -l500 -s1000
+
+pa_DBsplit_option = -x500 -s400
+ovlp_DBsplit_option = -x500 -s400
+
+falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 2 --max_n_read 200 --n_core 6
+
+overlap_filtering_setting = --max_diff 30 --max_cov 60 --min_cov 5 --n_core 24
diff --git a/examples/fc_run_ecoli.cfg b/examples/fc_run_ecoli.cfg
new file mode 100644
index 0000000..dddc6d6
--- /dev/null
+++ b/examples/fc_run_ecoli.cfg
@@ -0,0 +1,35 @@
+[General]
+# list of files of the initial bas.h5 files
+input_fofn = input.fofn
+#input_fofn = preads.fofn
+
+input_type = raw
+#input_type = preads
+
+# The length cutoff used for seed reads used for initial mapping
+length_cutoff = 12000
+
+# The length cutoff used for seed reads used for pre-assembly
+length_cutoff_pr = 12000
+
+
+jobqueue = your_queue
+sge_option_da = -pe smp 8 -q %(jobqueue)s
+sge_option_la = -pe smp 2 -q %(jobqueue)s
+sge_option_pda = -pe smp 8 -q %(jobqueue)s
+sge_option_pla = -pe smp 2 -q %(jobqueue)s
+sge_option_fc = -pe smp 24 -q %(jobqueue)s
+sge_option_cns = -pe smp 8 -q %(jobqueue)s
+
+pa_concurrent_jobs = 32
+ovlp_concurrent_jobs = 32
+
+pa_HPCdaligner_option =  -v -dal4 -t16 -e.70 -l1000 -s1000
+ovlp_HPCdaligner_option = -v -dal4 -t32 -h60 -e.96 -l500 -s1000
+
+pa_DBsplit_option = -x500 -s50
+ovlp_DBsplit_option = -x500 -s50
+
+falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 2 --max_n_read 200 --n_core 6
+
+overlap_filtering_setting = --max_diff 100 --max_cov 100 --min_cov 20 --bestn 10 --n_core 24
diff --git a/examples/fc_run_ecoli_2.cfg b/examples/fc_run_ecoli_2.cfg
new file mode 100644
index 0000000..78cf397
--- /dev/null
+++ b/examples/fc_run_ecoli_2.cfg
@@ -0,0 +1,35 @@
+[General]
+# list of files of the initial bas.h5 files
+input_fofn = input.fofn
+#input_fofn = preads.fofn
+
+input_type = raw
+#input_type = preads
+
+# The length cutoff used for seed reads used for initial mapping
+length_cutoff = 12000
+
+# The length cutoff used for seed reads used for pre-assembly
+length_cutoff_pr = 12000
+
+
+jobqueue = your_queue
+sge_option_da = -pe smp 8 -q %(jobqueue)s
+sge_option_la = -pe smp 2 -q %(jobqueue)s
+sge_option_pda = -pe smp 8 -q %(jobqueue)s
+sge_option_pla = -pe smp 2 -q %(jobqueue)s
+sge_option_fc = -pe smp 24 -q %(jobqueue)s
+sge_option_cns = -pe smp 8 -q %(jobqueue)s
+
+pa_concurrent_jobs = 32
+ovlp_concurrent_jobs = 32
+
+pa_HPCdaligner_option =  -v -dal24 -t16 -e.70 -l1000 -s1000
+ovlp_HPCdaligner_option = -v -dal24 -t32 -h60 -e.96 -l500 -s1000
+
+pa_DBsplit_option = -x500 -s200
+ovlp_DBsplit_option = -x500 -s200
+
+falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 2 --max_n_read 200 --n_core 6
+
+overlap_filtering_setting = --max_diff 100 --max_cov 100 --min_cov 20 --bestn 10 --n_core 24
diff --git a/examples/run_ecoli_test.sh b/examples/run_ecoli_test.sh
new file mode 100644
index 0000000..17c723d
--- /dev/null
+++ b/examples/run_ecoli_test.sh
@@ -0,0 +1,11 @@
+mkdir ecoli_test/
+cd ecoli_test/
+mkdir data
+cd data
+wget https://www.dropbox.com/s/tb78i5i3nrvm6rg/m140913_050931_42139_c100713652400000001823152404301535_s1_p0.1.subreads.fasta
+wget https://www.dropbox.com/s/v6wwpn40gedj470/m140913_050931_42139_c100713652400000001823152404301535_s1_p0.2.subreads.fasta
+wget https://www.dropbox.com/s/j61j2cvdxn4dx4g/m140913_050931_42139_c100713652400000001823152404301535_s1_p0.3.subreads.fasta
+cd ..
+find $PWD/data -name "*.fasta" > input.fofn
+cp ../FALCON/examples/fc_run_ecoli_2.cfg  .
+fc_run.py fc_run_ecoli_2.cfg
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..301e6de
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
+from setuptools import setup, Extension
+
+import glob
+
+#install_requires=[ "pbcore >= 0.6.3", "networkx >= 1.7" ]
+install_requires=[ "networkx >= 1.7" ]
+
+scripts = glob.glob("src/py_scripts/*.py")
+
+setup(name='falcon_kit',
+      version='0.4.0',
+      description='a small toolkit for DNA sequence alignment, overlapping, and assembly',
+      author='Jason Chin',
+      author_email='jchin at pacificbiosciences.com',
+      packages=['falcon_kit',
+          'falcon_kit.mains',
+          'falcon_kit.util',
+          ],
+      package_dir={'falcon_kit':'src/py/'},
+      ext_modules=[
+                   Extension('falcon_kit.ext_falcon', ['src/c/ext_falcon.c', 'src/c/DW_banded.c', 'src/c/kmer_lookup.c', 'src/c/falcon.c'],
+                    extra_link_args=["-fPIC",  "-O3"]),
+                   ],
+      entry_points = {'console_scripts': [
+          'falcon-task=falcon_kit.mains.tasks:main',
+          ],
+      },
+      extras_require = {
+          'falcon-task':  ['falcon_kit'],
+      },
+      scripts = scripts,
+      zip_safe = False,
+      setup_requires=install_requires,
+      install_requires=install_requires
+     )
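
As a quick sanity check of the layout declared above (the `package_dir` entry installs `src/py/` as the `falcon_kit` package and the `Extension` entry builds `falcon_kit.ext_falcon`), a small post-install smoke test might look like this. It is an editorial sketch, not a file in this commit; the imports are the ones exercised by the example notebook earlier in the diff.

    # Hypothetical post-install smoke test (not part of the upstream tree).
    import falcon_kit                              # package backed by src/py/
    from falcon_kit import kup, falcon, DWA        # wrappers used by the example notebook
    from falcon_kit.fc_asm_graph import AsmGraph   # loader from src/py/fc_asm_graph.py
    print "falcon_kit loaded from", falcon_kit.__file__
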
diff --git a/src/c/DW_banded.c b/src/c/DW_banded.c
new file mode 100755
index 0000000..5c7fd54
--- /dev/null
+++ b/src/c/DW_banded.c
@@ -0,0 +1,319 @@
+
+/*
+ * =====================================================================================
+ *
+ *       Filename:  DW_banded.c
+ *
+ *    Description:  A banded version for the O(ND) greedy sequence alignment algorithm 
+ *
+ *        Version:  0.1
+ *        Created:  07/20/2013 17:00:00
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Jason Chin, 
+ *        Company:  
+ *
+ * =====================================================================================
+
+ #################################################################################$$
+ # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+ #
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted (subject to the limitations in the
+ # disclaimer below) provided that the following conditions are met:
+ #
+ #  * Redistributions of source code must retain the above copyright
+ #  notice, this list of conditions and the following disclaimer.
+ #
+ #  * Redistributions in binary form must reproduce the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #  * Neither the name of Pacific Biosciences nor the names of its
+ #  contributors may be used to endorse or promote products derived
+ #  from this software without specific prior written permission.
+ #
+ # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+ # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ # SUCH DAMAGE.
+ #################################################################################$$
+ 
+
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <stdbool.h>
+#include "common.h"
+
+int compare_d_path(const void * a, const void * b)
+{
+    const d_path_data2 * arg1 = a;
+    const d_path_data2 * arg2 = b;
+    if (arg1->d - arg2->d == 0) {
+        return  arg1->k - arg2->k;
+    } else {
+        return arg1->d - arg2->d;
+    }
+}
+
+
+void d_path_sort( d_path_data2 * base, unsigned long max_idx) {
+    qsort(base, max_idx, sizeof(d_path_data2), compare_d_path);
+}
+
+d_path_data2 * get_dpath_idx( seq_coor_t d, seq_coor_t k, unsigned long max_idx, d_path_data2 * base) {
+    d_path_data2 d_tmp;
+    d_path_data2 *rtn;
+    d_tmp.d = d;
+    d_tmp.k = k;
+    rtn = (d_path_data2 *)  bsearch( &d_tmp, base, max_idx, sizeof(d_path_data2), compare_d_path);
+    //printf("dp %ld %ld %ld %ld %ld %ld %ld\n", (rtn)->d, (rtn)->k, (rtn)->x1, (rtn)->y1, (rtn)->x2, (rtn)->y2, (rtn)->pre_k);
+    
+    return rtn;
+
+}
+
+void print_d_path(  d_path_data2 * base, unsigned long max_idx) {
+    unsigned long idx;
+    for (idx = 0; idx < max_idx; idx++){
+        printf("dp %ld %d %d %d %d %d %d %d\n",idx, (base+idx)->d, (base+idx)->k, (base+idx)->x1, (base+idx)->y1, (base+idx)->x2, (base+idx)->y2, (base+idx)->pre_k);
+    }
+}
+
+
+alignment * align(char * query_seq, seq_coor_t q_len,
+                  char * target_seq, seq_coor_t t_len,
+                  seq_coor_t band_tolerance,
+                  int get_aln_str) {
+    seq_coor_t * V;
+    seq_coor_t * U;  // array of matched bases for each "k"
+    seq_coor_t k_offset;
+    seq_coor_t d;
+    seq_coor_t k, k2;
+    seq_coor_t best_m;  // the best "matches" for each d
+    seq_coor_t min_k, new_min_k;
+    seq_coor_t max_k, new_max_k;
+    seq_coor_t pre_k;
+    seq_coor_t x, y;
+    seq_coor_t cd;
+    seq_coor_t ck;
+    seq_coor_t cx, cy, nx, ny;
+    seq_coor_t max_d;
+    seq_coor_t band_size;
+    unsigned long d_path_idx = 0;
+    unsigned long max_idx = 0;
+
+    d_path_data2 * d_path;
+    d_path_data2 * d_path_aux;
+    path_point * aln_path;
+    seq_coor_t aln_path_idx;
+    alignment * align_rtn;
+    seq_coor_t aln_pos;
+    seq_coor_t i;
+    bool aligned = false;
+
+    //printf("debug: %ld %ld\n", q_len, t_len);
+    //printf("%s\n", query_seq);
+   
+    max_d = (int) (0.3*(q_len + t_len));
+
+    band_size = band_tolerance * 2;
+
+    V = calloc( max_d * 2 + 1, sizeof(seq_coor_t) );
+    U = calloc( max_d * 2 + 1, sizeof(seq_coor_t) );
+    
+    k_offset = max_d;
+    
+    // We should probably use a hashmap to store the backtracking information to save memory allocation time.
+    // This O(MN) block allocation scheme is convenient for now, but it is slower for very long sequences.
+    d_path = calloc( max_d * (band_size + 1 ) * 2 + 1, sizeof(d_path_data2) );
+    
+    aln_path = calloc( q_len + t_len + 1, sizeof(path_point) );
+
+    align_rtn = calloc( 1, sizeof(alignment));
+    align_rtn->t_aln_str = calloc( q_len + t_len + 1, sizeof(char));
+    align_rtn->q_aln_str = calloc( q_len + t_len + 1, sizeof(char));
+    align_rtn->aln_str_size = 0;
+    align_rtn->aln_q_s = 0;
+    align_rtn->aln_q_e = 0;
+    align_rtn->aln_t_s = 0;
+    align_rtn->aln_t_e = 0;
+
+    //printf("max_d: %lu, band_size: %lu\n", max_d, band_size);
+    best_m = -1;
+    min_k = 0;
+    max_k = 0;
+    d_path_idx = 0; 
+    max_idx = 0;
+    for (d = 0; d < max_d; d ++ ) {
+        if (max_k - min_k > band_size) {
+            break;
+        }
+ 
+        for (k = min_k; k <= max_k;  k += 2) {
+
+            if ( (k == min_k) || ((k != max_k) && (V[ k - 1 + k_offset ] < V[ k + 1 + k_offset])) ) {
+                pre_k = k + 1;
+                x = V[ k + 1 + k_offset];
+            } else {
+                pre_k = k - 1;
+                x = V[ k - 1 + k_offset] + 1;
+            }
+            y = x - k;
+            d_path[d_path_idx].d = d;
+            d_path[d_path_idx].k = k;
+            d_path[d_path_idx].x1 = x;
+            d_path[d_path_idx].y1 = y;
+
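+            // greedily extend the run of matching bases ("snake") along the current diagonal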
+            while ( x < q_len && y < t_len && query_seq[x] == target_seq[y] ){
+                x++;
+                y++;
+            }
+
+            d_path[d_path_idx].x2 = x;
+            d_path[d_path_idx].y2 = y;
+            d_path[d_path_idx].pre_k = pre_k;
+            d_path_idx ++;
+
+            V[ k + k_offset ] = x;
+            U[ k + k_offset ] = x + y;
+            
+            if ( x + y > best_m) {
+                best_m = x + y;
+            }
+
+            if ( x >= q_len || y >= t_len) {
+                aligned = true;
+                max_idx = d_path_idx;
+                break;
+            }
+        }
+        
+        // For banding
+        new_min_k = max_k;
+        new_max_k = min_k;
+
+        for (k2 = min_k; k2 <= max_k;  k2 += 2) {
+            if (U[ k2 + k_offset] >= best_m - band_tolerance ) {
+                if ( k2 < new_min_k ) {
+                    new_min_k = k2;
+                }
+                if ( k2 > new_max_k ) {
+                    new_max_k = k2;
+                }
+            }
+        }
+        
+        max_k = new_max_k + 1;
+        min_k = new_min_k - 1;
+        
+        // For no banding
+        // max_k ++;
+        // min_k --;
+
+        // For debugging
+        // printf("min_max_k,d, %ld %ld %ld\n", min_k, max_k, d);
+        
+        if (aligned == true) {
+            align_rtn->aln_q_e = x;
+            align_rtn->aln_t_e = y;
+            align_rtn->dist = d;
+            align_rtn->aln_str_size = (x + y + d) / 2;
+            align_rtn->aln_q_s = 0;
+            align_rtn->aln_t_s = 0;
+
+            d_path_sort(d_path, max_idx);
+            //print_d_path(d_path, max_idx);
+
+            if (get_aln_str > 0) {
+                cd = d;
+                ck = k;
+                aln_path_idx = 0;
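+                // backtrack from the final (d, k) cell through the recorded
+                // pre_k links to recover the alignment path end points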
+                while (cd >= 0 && aln_path_idx < q_len + t_len + 1) {    
+                    d_path_aux = (d_path_data2 *) get_dpath_idx( cd, ck, max_idx, d_path);
+                    aln_path[aln_path_idx].x = d_path_aux -> x2;
+                    aln_path[aln_path_idx].y = d_path_aux -> y2;
+                    aln_path_idx ++;
+                    aln_path[aln_path_idx].x = d_path_aux -> x1;
+                    aln_path[aln_path_idx].y = d_path_aux -> y1;
+                    aln_path_idx ++;
+                    ck = d_path_aux -> pre_k;
+                    cd -= 1;
+                }
+                aln_path_idx --;
+                cx = aln_path[aln_path_idx].x;
+                cy = aln_path[aln_path_idx].y;
+                align_rtn->aln_q_s = cx;
+                align_rtn->aln_t_s = cy;
+                aln_pos = 0;
+                while ( aln_path_idx > 0 ) {
+                    aln_path_idx --;
+                    nx = aln_path[aln_path_idx].x;
+                    ny = aln_path[aln_path_idx].y;
+                    if (cx == nx && cy == ny){
+                        continue;
+                    }
+                    if (nx == cx && ny != cy){ //advance in y
+                        for (i = 0; i <  ny - cy; i++) {
+                            align_rtn->q_aln_str[aln_pos + i] = '-';
+                        }
+                        for (i = 0; i <  ny - cy; i++) {
+                            align_rtn->t_aln_str[aln_pos + i] = target_seq[cy + i];
+                        }
+                        aln_pos += ny - cy;
+                    } else if (nx != cx && ny == cy){ //advance in x
+                        for (i = 0; i <  nx - cx; i++) {
+                            align_rtn->q_aln_str[aln_pos + i] = query_seq[cx + i];
+                        }
+                        for (i = 0; i <  nx - cx; i++) {
+                            align_rtn->t_aln_str[aln_pos + i] = '-';
+                        }
+                        aln_pos += nx - cx;
+                    } else {
+                        for (i = 0; i <  nx - cx; i++) {
+                            align_rtn->q_aln_str[aln_pos + i] = query_seq[cx + i];
+                        }
+                        for (i = 0; i <  ny - cy; i++) {
+                            align_rtn->t_aln_str[aln_pos + i] = target_seq[cy + i];
+                        }
+                        aln_pos += ny - cy;
+                    }
+                    cx = nx;
+                    cy = ny;
+                }
+                align_rtn->aln_str_size = aln_pos;
+            }
+            break;
+        }
+    }
+
+    free(V);
+    free(U);
+    free(d_path);
+    free(aln_path);
+    return align_rtn;
+}
+
+
+void free_alignment(alignment * aln) {
+    free(aln->q_aln_str);
+    free(aln->t_aln_str);
+    free(aln);
+}
diff --git a/src/c/Makefile b/src/c/Makefile
new file mode 100755
index 0000000..3f13a2f
--- /dev/null
+++ b/src/c/Makefile
@@ -0,0 +1,20 @@
+DW_align.so: DW_banded.c common.h
+	gcc DW_banded.c -O3 -shared -fPIC -o DW_align.so
+
+kmer_lookup.so: kmer_lookup.c common.h
+	gcc kmer_lookup.c -O3 -shared -fPIC -o kmer_lookup.so
+
+#falcon: DW_banded.c common.h kmer_lookup.c falcon.c 
+#	gcc DW_banded.c kmer_lookup.c falcon.c -O4 -o falcon -fPIC 
+
+falcon.so: falcon.c common.h DW_banded.c kmer_lookup.c
+	gcc DW_banded.c kmer_lookup.c falcon.c -O3 -shared -fPIC -o falcon.so 
+
+#falcon2.so: falcon.c common.h DW_banded_2.c kmer_lookup.c
+#	gcc DW_banded_2.c kmer_lookup.c falcon.c -O3 -shared -fPIC -o falcon2.so 
+
+clean:
+	rm  *.so
+
+all: DW_align.so kmer_lookup.so falcon.so
+
diff --git a/src/c/Makefile.osx b/src/c/Makefile.osx
new file mode 100755
index 0000000..99fcce7
--- /dev/null
+++ b/src/c/Makefile.osx
@@ -0,0 +1,16 @@
+DW_align.so: DW_banded.c common.h
+	gcc DW_banded.c -O3 -shared -fPIC -o DW_align.so -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/include/ -L/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/lib
+
+kmer_lookup.so: kmer_lookup.c common.h
+	gcc kmer_lookup.c -O3 -shared -fPIC -o kmer_lookup.so -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/include/ -L/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/lib
+
+falcon: DW_banded.c common.h kmer_lookup.c falcon.c 
+	gcc DW_banded.c kmer_lookup.c falcon.c -O4 -o falcon -fPIC -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/include/ -L/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/lib
+
+falcon.so: falcon.c common.h DW_banded.c kmer_lookup.c
+	gcc DW_banded.c kmer_lookup.c falcon.c -O3 -shared -fPIC -o falcon.so -I/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/include/ -L/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk/usr/lib
+
+
+
+all: DW_align.so kmer_lookup.so falcon.so falcon
+
diff --git a/src/c/common.h b/src/c/common.h
new file mode 100755
index 0000000..95c38f6
--- /dev/null
+++ b/src/c/common.h
@@ -0,0 +1,178 @@
+
+/*
+ * =====================================================================================
+ *
+ *       Filename:  common.h
+ *
+ *    Description:  Common declarations for the code base
+ *
+ *        Version:  0.1
+ *        Created:  07/16/2013 07:46:23 AM
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Jason Chin, 
+ *        Company:  
+ *
+ * =====================================================================================
+
+ #################################################################################$$
+ # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+ #
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted (subject to the limitations in the
+ # disclaimer below) provided that the following conditions are met:
+ #
+ #  * Redistributions of source code must retain the above copyright
+ #  notice, this list of conditions and the following disclaimer.
+ #
+ #  * Redistributions in binary form must reproduce the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #  * Neither the name of Pacific Biosciences nor the names of its
+ #  contributors may be used to endorse or promote products derived
+ #  from this software without specific prior written permission.
+ #
+ # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+ # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ # SUCH DAMAGE.
+ #################################################################################$$
+ */
+
+typedef int seq_coor_t; 
+
+typedef struct {    
+    seq_coor_t aln_str_size ;
+    seq_coor_t dist ;
+    seq_coor_t aln_q_s;
+    seq_coor_t aln_q_e;
+    seq_coor_t aln_t_s;
+    seq_coor_t aln_t_e;
+    char * q_aln_str;
+    char * t_aln_str;
+
+} alignment;
+
+
+typedef struct {
+    seq_coor_t pre_k;
+    seq_coor_t x1;
+    seq_coor_t y1;
+    seq_coor_t x2;
+    seq_coor_t y2;
+} d_path_data;
+
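+// one record per (d, k) step of the banded alignment search: x1/y1 are the
+// coordinates before the match extension, x2/y2 the coordinates after it, and
+// pre_k is the diagonal the step came from (used for backtracking)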
+typedef struct {
+    seq_coor_t d;
+    seq_coor_t k;
+    seq_coor_t pre_k;
+    seq_coor_t x1;
+    seq_coor_t y1;
+    seq_coor_t x2;
+    seq_coor_t y2;
+} d_path_data2;
+
+typedef struct {
+    seq_coor_t x;
+    seq_coor_t y;
+} path_point;
+
+typedef struct {    
+    seq_coor_t start;
+    seq_coor_t last;
+    seq_coor_t count;
+} kmer_lookup;
+
+typedef unsigned char base;
+typedef base * seq_array;
+typedef seq_coor_t seq_addr;
+typedef seq_addr * seq_addr_array;
+
+
+typedef struct {
+    seq_coor_t count;
+    seq_coor_t * query_pos;
+    seq_coor_t * target_pos;
+} kmer_match;
+
+
+typedef struct {
+    seq_coor_t s1;
+    seq_coor_t e1;
+    seq_coor_t s2;
+    seq_coor_t e2;
+    long int score;
+} aln_range;
+
+
+typedef struct {
+    char * sequence;
+    int * eqv;
+} consensus_data;
+
+kmer_lookup * allocate_kmer_lookup (seq_coor_t);
+void init_kmer_lookup ( kmer_lookup *,  seq_coor_t );
+void free_kmer_lookup(kmer_lookup *);
+
+seq_array allocate_seq(seq_coor_t);
+void init_seq_array( seq_array, seq_coor_t);
+void free_seq_array(seq_array);
+
+seq_addr_array allocate_seq_addr(seq_coor_t size); 
+
+void free_seq_addr_array(seq_addr_array);
+
+
+aln_range *  find_best_aln_range(kmer_match *, 
+                              seq_coor_t, 
+                              seq_coor_t, 
+                              seq_coor_t); 
+
+void free_aln_range( aln_range *);
+
+kmer_match * find_kmer_pos_for_seq( char *, 
+                                    seq_coor_t, 
+                                    unsigned int K, 
+                                    seq_addr_array, 
+                                    kmer_lookup * );
+
+void free_kmer_match( kmer_match * ptr);
+void free_kmer_lookup(kmer_lookup * );
+
+
+
+void add_sequence ( seq_coor_t, 
+                    unsigned int, 
+                    char *, 
+                    seq_coor_t,
+                    seq_addr_array, 
+                    seq_array, 
+                    kmer_lookup *); 
+
+void mask_k_mer(seq_coor_t, kmer_lookup *, seq_coor_t);
+
+alignment * align(char *, seq_coor_t,
+                  char *, seq_coor_t,
+                  seq_coor_t,
+                  int); 
+
+void free_alignment(alignment *);
+
+
+void free_consensus_data(consensus_data *);
+
diff --git a/src/c/ext_falcon.c b/src/c/ext_falcon.c
new file mode 100644
index 0000000..6439cb4
--- /dev/null
+++ b/src/c/ext_falcon.c
@@ -0,0 +1,13 @@
+#include "Python.h"
+static PyMethodDef SpamMethods[] = {
+    {NULL, NULL, 0, NULL}        /* Sentinel */
+};
+PyMODINIT_FUNC
+initext_falcon(void)
+{
+    PyObject *m;
+
+    m = Py_InitModule("falcon_kit.ext_falcon", SpamMethods);
+    if (m == NULL)
+        return;
+}
diff --git a/src/c/falcon.c b/src/c/falcon.c
new file mode 100755
index 0000000..c1ab45b
--- /dev/null
+++ b/src/c/falcon.c
@@ -0,0 +1,843 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  falcon.c
+ *
+ *    Description:  
+ *
+ *        Version:  0.1
+ *        Created:  07/20/2013 17:00:00
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Jason Chin, 
+ *        Company:  
+ *
+ * =====================================================================================
+
+ #################################################################################$$
+ # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+ #
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted (subject to the limitations in the
+ # disclaimer below) provided that the following conditions are met:
+ #
+ #  * Redistributions of source code must retain the above copyright
+ #  notice, this list of conditions and the following disclaimer.
+ #
+ #  * Redistributions in binary form must reproduce the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #  * Neither the name of Pacific Biosciences nor the names of its
+ #  contributors may be used to endorse or promote products derived
+ #  from this software without specific prior written permission.
+ #
+ # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+ # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ # SUCH DAMAGE.
+ #################################################################################$$
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include "common.h"
+
+typedef struct {
+    seq_coor_t t_pos;
+    uint8_t delta;
+    char q_base;
+    seq_coor_t p_t_pos;   // the tag position of the previous base
+    uint8_t p_delta; // the tag delta of the previous base
+    char p_q_base;        // the previous base
+    unsigned q_id;
+} align_tag_t;
+
+typedef struct {
+    seq_coor_t len;
+    align_tag_t * align_tags;
+} align_tags_t;
+
+
+typedef struct {
+    uint16_t size;
+    uint16_t n_link;
+    seq_coor_t * p_t_pos;   // the tag position of the previous base
+    uint8_t * p_delta; // the tag delta of the previous base
+    char * p_q_base;        // the previous base
+    uint16_t * link_count;
+    uint16_t count;
+    seq_coor_t best_p_t_pos;
+    uint8_t best_p_delta;
+    uint8_t best_p_q_base; // encoded base
+    double score;
+} align_tag_col_t;
+
+typedef struct {
+    align_tag_col_t * base;
+} msa_base_group_t;
+
+typedef struct {
+    uint8_t size;
+    uint8_t max_delta;
+    msa_base_group_t * delta;
+} msa_delta_group_t;
+
+typedef msa_delta_group_t * msa_pos_t;
+
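+// Convert a pairwise alignment into a list of per-base tags. Each tag records
+// the template position (t_pos), the insertion offset relative to that position
+// (delta), the query base, and the (t_pos, delta, base) of the previous query
+// base, so consecutive bases form links that are later scored in the MSA.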
+align_tags_t * get_align_tags( char * aln_q_seq, 
+                               char * aln_t_seq, 
+                               seq_coor_t aln_seq_len,
+                               aln_range * range,
+                               unsigned q_id,
+                               seq_coor_t t_offset) {
+    char p_q_base;
+    align_tags_t * tags;
+    seq_coor_t i, j, jj, k, p_j, p_jj;
+
+    tags = calloc( 1, sizeof(align_tags_t) );
+    tags->len = aln_seq_len; 
+    tags->align_tags = calloc( aln_seq_len + 1, sizeof(align_tag_t) );
+    i = range->s1 - 1;
+    j = range->s2 - 1;
+    jj = 0;
+    p_j = -1;
+    p_jj = 0;
+    p_q_base = '.';
+
+    for (k = 0; k < aln_seq_len; k++) {
+        if (aln_q_seq[k] != '-') {
+            i ++;
+            jj ++;
+        } 
+        if (aln_t_seq[k] != '-') {
+            j ++;
+            jj = 0;
+        }
+        //printf("t %d %d %d %c %c\n", q_id, j, jj, aln_t_seq[k], aln_q_seq[k]);
+       
+       
+        if ( j + t_offset >= 0 && jj < UINT8_MAX && p_jj < UINT8_MAX) {
+            (tags->align_tags[k]).t_pos = j + t_offset;
+            (tags->align_tags[k]).delta = jj;
+            (tags->align_tags[k]).p_t_pos = p_j + t_offset;
+            (tags->align_tags[k]).p_delta = p_jj;
+            (tags->align_tags[k]).p_q_base = p_q_base;
+            (tags->align_tags[k]).q_base = aln_q_seq[k];
+            (tags->align_tags[k]).q_id = q_id;
+            
+            p_j = j;
+            p_jj = jj;
+            p_q_base = aln_q_seq[k];
+        }
+    }
+    // sentinel at the end
+    //k = aln_seq_len;
+    tags->len = k; 
+    (tags->align_tags[k]).t_pos = UINT_MAX;
+    (tags->align_tags[k]).delta = UINT8_MAX;
+    (tags->align_tags[k]).q_base = '.';
+    (tags->align_tags[k]).q_id = UINT_MAX;
+    return tags;
+}
+
+void free_align_tags( align_tags_t * tags) {
+    free( tags->align_tags );
+    free( tags );
+}
+
+
+void allocate_aln_col( align_tag_col_t * col) {
+    col->p_t_pos = ( seq_coor_t * ) calloc(col->size, sizeof( seq_coor_t ));
+    col->p_delta = ( uint8_t * ) calloc(col->size, sizeof( uint8_t ));
+    col->p_q_base = ( char * )calloc(col->size, sizeof( char ));
+    col->link_count = ( uint16_t * ) calloc(col->size, sizeof( uint16_t ));
+}
+
+void realloc_aln_col( align_tag_col_t * col ) {
+    col->p_t_pos = (seq_coor_t *) realloc( col->p_t_pos, (col->size) * sizeof( seq_coor_t ));
+    col->p_delta = ( uint8_t *)  realloc( col->p_delta, (col->size) * sizeof( uint8_t ));
+    col->p_q_base = (char *) realloc( col->p_q_base, (col->size) * sizeof( char ));
+    col->link_count = ( uint16_t *) realloc( col->link_count, (col->size) * sizeof( uint16_t ));
+}
+
+void free_aln_col( align_tag_col_t * col) {
+    free(col->p_t_pos);
+    free(col->p_delta);
+    free(col->p_q_base);
+    free(col->link_count);
+}
+
+
+void allocate_delta_group( msa_delta_group_t * g) {
+    int i,j;
+    g->max_delta = 0;
+    g->delta = (msa_base_group_t *) calloc( g->size, sizeof(msa_base_group_t));
+    for (i = 0; i< g->size; i++) {
+        g->delta[i].base = ( align_tag_col_t * ) calloc( 5, sizeof(align_tag_col_t ) );
+        for (j = 0; j < 5; j++ ) {
+             g->delta[i].base[j].size = 8;
+             allocate_aln_col(&(g->delta[i].base[j]));
+        }
+    }
+}
+
+void realloc_delta_group( msa_delta_group_t * g, uint16_t new_size ) {
+    int i, j, bs, es;
+    bs = g->size;
+    es = new_size;
+    g->delta = (msa_base_group_t *) realloc(g->delta, new_size * sizeof(msa_base_group_t));
+    for (i=bs; i < es; i++) {
+        g->delta[i].base = ( align_tag_col_t *) calloc( 5, sizeof(align_tag_col_t ) );
+        for (j = 0; j < 5; j++ ) {
+             g->delta[i].base[j].size = 8;
+             allocate_aln_col(&(g->delta[i].base[j]));
+        }
+    }
+    g->size = new_size;
+}
+
+void free_delta_group( msa_delta_group_t * g) {
+    // free every column allocated for this delta group
+    int i, j;
+    for (i = 0; i < g->size; i++) {
+        for (j = 0; j < 5; j++) {
+            free_aln_col( &(g->delta[i].base[j]) );
+        }
+        free(g->delta[i].base);
+    }
+    free(g->delta);
+}
+
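+// Record one observation in an MSA column: if the (p_t_pos, p_delta, p_q_base)
+// link already exists, its count is incremented; otherwise a new link entry is
+// appended, growing the column's arrays when necessary.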
+void update_col( align_tag_col_t * col, seq_coor_t p_t_pos, uint8_t p_delta, char p_q_base) {
+    int updated = 0;
+    int kk;
+    col->count += 1;
+    for (kk = 0; kk < col->n_link; kk++) {
+        if ( p_t_pos == col->p_t_pos[kk] &&
+             p_delta == col->p_delta[kk] &&
+             p_q_base == col->p_q_base[kk] ) {
+            col->link_count[kk] ++;
+            updated = 1;
+            break;
+        }
+    }
+    if (updated == 0) {
+        if (col->n_link + 1 > col->size) {
+            if (col->size < (UINT16_MAX >> 1)-1) {
+                col->size *= 2;
+            } else {
+                col->size += 256;
+            }
+            assert( col->size < UINT16_MAX-1 );
+            realloc_aln_col(col);
+        }
+        kk = col->n_link;
+
+        col->p_t_pos[kk] = p_t_pos;
+        col->p_delta[kk] = p_delta;
+        col->p_q_base[kk] = p_q_base;
+        col->link_count[kk] = 1;
+        col->n_link++;
+    }
+}
+
+
+msa_pos_t * get_msa_working_sapce(unsigned int max_t_len) {
+    msa_pos_t * msa_array;
+    unsigned int i;
+    msa_array = calloc(max_t_len, sizeof(msa_pos_t *));
+    for (i = 0; i < max_t_len; i++) {
+        msa_array[i] = calloc(1, sizeof(msa_delta_group_t));
+        msa_array[i]->size = 8;
+        allocate_delta_group(msa_array[i]);
+    }
+    return msa_array;
+}
+
+void clean_msa_working_space( msa_pos_t * msa_array, unsigned int max_t_len) {
+    unsigned int i,j,k;
+    align_tag_col_t * col;
+    for (i = 0; i < max_t_len; i++) {
+        for (j =0; j < msa_array[i]->max_delta + 1; j++) {
+            for (k = 0; k < 5; k++ ) {
+                col = msa_array[i]->delta[j].base + k;
+                /*
+                for (c =0; c < col->size; c++) {
+                    col->p_t_pos[c] = 0;
+                    col->p_delta[c] = 0;
+                    col->p_q_base[c] = 0;
+                    col->link_count[c] =0;
+                }
+                */
+                col->n_link = 0;
+                col->count = 0;
+                col->best_p_t_pos = 0;
+                col->best_p_delta = 0;
+                col->best_p_q_base = 0;
+                col->score = 0;
+            }
+        }
+        msa_array[i]->max_delta = 0;
+    }
+}
+
+#define STATIC_ALLOCATE
+//#undef STATIC_ALLOCATE
+
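+// Build a consensus sequence from the alignment tags of all reads against one
+// template: the tags are binned into an MSA indexed by (template position,
+// insertion delta, base), each column link is scored against coverage, and the
+// best-scoring chain of links is backtracked to emit the consensus bases.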
+consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs, 
+                                          unsigned n_tag_seqs, 
+                                          unsigned t_len, 
+                                          unsigned min_cov ) {
+
+    seq_coor_t i, j, t_pos;
+    unsigned int * coverage;
+    unsigned int * local_nbase;
+
+    consensus_data * consensus;
+    //char * consensus;
+    align_tag_t * c_tag;
+    static msa_pos_t * msa_array = NULL;
+
+    coverage = calloc( t_len, sizeof(unsigned int) );
+    local_nbase = calloc( t_len, sizeof(unsigned int) );
+
+#ifndef STATIC_ALLOCATE
+
+    msa_array = calloc(t_len, sizeof(msa_pos_t *));
+
+    for (i = 0; i < t_len; i++) {
+        msa_array[i] = calloc(1, sizeof(msa_delta_group_t));
+        msa_array[i]->size = 8;
+        allocate_delta_group(msa_array[i]);
+    }
+
+#endif    
+
+#ifdef STATIC_ALLOCATE
+
+    if ( msa_array == NULL) {
+        msa_array = get_msa_working_sapce( 100000 );
+    } 
+
+    assert(t_len < 100000);
+
+#endif    
+
+    
+    // loop through every alignment
+    //printf("XX %d\n", n_tag_seqs);
+    for (i = 0; i < n_tag_seqs; i++) {
+
+        // for each alignment position, insert the alignment tag to msa_array
+        for (j = 0; j < tag_seqs[i]->len; j++) {
+            c_tag = tag_seqs[i]->align_tags + j;
+            unsigned int delta;
+            delta = c_tag->delta;
+            if (delta == 0) {
+                t_pos = c_tag->t_pos;
+                coverage[ t_pos ] ++;
+            }
+            // Assume t_pos was set on earlier iteration.
+            if (delta > msa_array[t_pos]->max_delta) {
+                msa_array[t_pos]->max_delta = delta;
+                if (msa_array[t_pos]->max_delta + 4 > msa_array[t_pos]->size ) {
+                    realloc_delta_group(msa_array[t_pos], msa_array[t_pos]->max_delta + 8);
+                }
+            }
+            
+            unsigned int base;
+            switch (c_tag->q_base) {
+                case 'A': base = 0; break;
+                case 'C': base = 1; break;
+                case 'G': base = 2; break;
+                case 'T': base = 3; break;
+                case '-': base = 4; break;
+            }
+            // Note: On bad input, base may be uninitialized.
+            update_col( &(msa_array[t_pos]->delta[delta].base[base]), c_tag->p_t_pos, c_tag->p_delta, c_tag->p_q_base);
+            local_nbase[ t_pos ] ++;
+        }
+    }
+
+    // propagate scores through the alignment links and set up backtracking information
+    align_tag_col_t * g_best_aln_col = 0;
+    unsigned int g_best_ck = 0;
+    seq_coor_t g_best_t_pos = 0;
+    {
+        int kk; 
+        int ck;
+        // char base;
+        int best_i;
+        int best_j;
+        int best_b;
+        int best_ck = -1;
+        double score;
+        double best_score;
+        double g_best_score;
+        // char best_mark;
+
+        align_tag_col_t * aln_col;
+        
+        g_best_score = -1;
+
+        for (i = 0; i < t_len; i++) {  //loop through every template base
+            //printf("max delta: %d %d\n", i, msa_array[i]->max_delta);
+            for (j = 0; j <= msa_array[i]->max_delta; j++) { // loop through every delta position
+                for (kk = 0; kk < 5; kk++) {  // loop through the different bases at the same delta position
+                    /*
+                    switch (kk) {
+                        case 0: base = 'A'; break;
+                        case 1: base = 'C'; break;
+                        case 2: base = 'G'; break;
+                        case 3: base = 'T'; break;
+                        case 4: base = '-'; break;
+                    }
+                    */
+                    aln_col = msa_array[i]->delta[j].base + kk;
+                    if (aln_col->count >= 0) {
+                        best_score = -1;
+                        best_i = -1;
+                        best_j = -1;
+                        best_b = -1;
+
+                        for (ck = 0; ck < aln_col->n_link; ck++) { // loop through the different links to the previous column
+                            int pi;
+                            int pj;
+                            int pkk;
+                            pi = aln_col->p_t_pos[ck];
+                            pj = aln_col->p_delta[ck];
+                            switch (aln_col->p_q_base[ck]) {
+                                case 'A': pkk = 0; break;
+                                case 'C': pkk = 1; break;
+                                case 'G': pkk = 2; break;
+                                case 'T': pkk = 3; break;
+                                case '-': pkk = 4; break;
+                                default: pkk = 4;
+                            }
+
+                            if (aln_col->p_t_pos[ck] == -1) {
+                                score =  (double) aln_col->link_count[ck] - (double) coverage[i] * 0.5;
+                            } else {
+                                score = msa_array[pi]->delta[pj].base[pkk].score + 
+                                        (double) aln_col->link_count[ck] - (double) coverage[i] * 0.5;
+                            }
+                            // best_mark = ' ';
+                            if (score > best_score) {
+                                best_score = score;
+                                aln_col->best_p_t_pos = best_i = pi;
+                                aln_col->best_p_delta = best_j = pj;
+                                aln_col->best_p_q_base = best_b = pkk;
+                                best_ck = ck;
+                                // best_mark = '*';
+                            }
+                            /*
+                            printf("X %d %d %d %c %d %d %d %c %d %lf %c\n", coverage[i], i, j, base, aln_col->count, 
+                                                                  aln_col->p_t_pos[ck], 
+                                                                  aln_col->p_delta[ck], 
+                                                                  aln_col->p_q_base[ck], 
+                                                                  aln_col->link_count[ck],
+                                                                  score, best_mark);
+                            */
+                        }
+                        aln_col->score = best_score;
+                        if (best_score > g_best_score) {
+                            g_best_score = best_score;
+                            g_best_aln_col = aln_col;
+                            g_best_ck = best_ck;
+                            g_best_t_pos = i;
+                            //printf("GB %d %d %d %d\n", i, j, ck, g_best_aln_col);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    assert(g_best_score != -1);
+
+    // reconstruct the sequences
+    unsigned int index;
+    char bb;
+    int ck;
+    char * cns_str;
+    int * eqv;
+    double score0;
+    
+    consensus = calloc( 1, sizeof(consensus_data) );
+    consensus->sequence = calloc( t_len * 2 + 1, sizeof(char) );
+    consensus->eqv = calloc( t_len * 2 + 1, sizeof(unsigned int) );
+    cns_str = consensus->sequence;
+    eqv =  consensus->eqv;
+
+    index = 0;
+    ck = g_best_ck;
+    i = g_best_t_pos;
+
+    while (1) {
+        if (coverage[i] > min_cov) {
+            switch (ck) {
+                case 0: bb = 'A'; break;
+                case 1: bb = 'C'; break;
+                case 2: bb = 'G'; break;
+                case 3: bb = 'T'; break;
+                case 4: bb = '-'; break;
+            }
+        } else {
+            switch (ck) {
+                case 0: bb = 'a'; break;
+                case 1: bb = 'c'; break;
+                case 2: bb = 'g'; break;
+                case 3: bb = 't'; break;
+                case 4: bb = '-'; break;
+            }
+        }
+        // Note: On bad input, bb will keep its previous value, possibly uninitialized.
+
+        score0 = g_best_aln_col->score;
+        i = g_best_aln_col->best_p_t_pos;
+        if (i == -1 || index >= t_len * 2) break;
+        j = g_best_aln_col->best_p_delta;
+        ck = g_best_aln_col->best_p_q_base;
+        g_best_aln_col = msa_array[i]->delta[j].base + ck;
+
+        if (bb != '-') {
+            cns_str[index] = bb;
+            eqv[index] = (int) score0 - (int) g_best_aln_col->score;
+            //printf("C %d %d %c %lf %d %d\n", i, index, bb, g_best_aln_col->score, coverage[i], eqv[index] );
+            index ++;
+        }
+    }
+    
+    // reverse the sequence
+    for (i = 0; i < index/2; i++) {
+        cns_str[i] = cns_str[i] ^ cns_str[index-i-1];
+        cns_str[index-i-1] = cns_str[i] ^ cns_str[index-i-1];
+        cns_str[i] = cns_str[i] ^ cns_str[index-i-1];
+        eqv[i] = eqv[i] ^ eqv[index-i-1];
+        eqv[index-i-1] = eqv[i] ^ eqv[index-i-1];
+        eqv[i] = eqv[i] ^ eqv[index-i-1];
+    }
+
+    cns_str[index] = 0;
+    //printf("%s\n", cns_str);
+#ifndef STATIC_ALLOCATE
+    for (i = 0; i < t_len; i++) {
+        free_delta_group(msa_array[i]);
+        free(msa_array[i]);
+    }
+    
+    free(msa_array);
+#endif
+
+#ifdef STATIC_ALLOCATE
+    clean_msa_working_space(msa_array, t_len+1);
+#endif
+    
+    free(coverage);
+    free(local_nbase);
+    return consensus;
+}
+
+//const unsigned int K = 8;
+
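+// Consensus for one seed read: build a k-mer lookup of input_seq[0], locate
+// k-mer matches for every other read, pick the best alignment range, run the
+// banded alignment, and feed the surviving alignment tags to
+// get_cns_from_align_tags().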
+consensus_data * generate_consensus( char ** input_seq, 
+                           unsigned int n_seq, 
+                           unsigned min_cov, 
+                           unsigned K,
+                           unsigned long local_match_count_window,
+                           unsigned long local_match_count_threshold,
+                           double min_idt) {
+    // local_match_count_window and local_match_count_threshold are obsolete; the parameters are kept for interface compatibility for now
+
+    unsigned int j;
+    unsigned int seq_count;
+    unsigned int aligned_seq_count;
+    kmer_lookup * lk_ptr;
+    seq_array sa_ptr;
+    seq_addr_array sda_ptr;
+    kmer_match * kmer_match_ptr;
+    aln_range * arange;
+    alignment * aln;
+    align_tags_t ** tags_list;
+    //char * consensus;
+    consensus_data * consensus;
+    double max_diff;
+    max_diff = 1.0 - min_idt;
+
+    seq_count = n_seq;
+    //printf("XX n_seq %d\n", n_seq);
+    //for (j=0; j < seq_count; j++) {
+    //    printf("seq_len: %u %u\n", j, strlen(input_seq[j]));
+    //};
+    fflush(stdout);
+
+    tags_list = calloc( seq_count, sizeof(align_tags_t *) );
+    lk_ptr = allocate_kmer_lookup( 1 << (K * 2) );
+    sa_ptr = allocate_seq( (seq_coor_t) strlen( input_seq[0]) );
+    sda_ptr = allocate_seq_addr( (seq_coor_t) strlen( input_seq[0]) );
+    add_sequence( 0, K, input_seq[0], strlen(input_seq[0]), sda_ptr, sa_ptr, lk_ptr);
+    //mask_k_mer(1 << (K * 2), lk_ptr, 16);
+
+    aligned_seq_count = 0;
+    for (j=1; j < seq_count; j++) {
+
+        //printf("seq_len: %ld %u\n", j, strlen(input_seq[j]));
+
+        kmer_match_ptr = find_kmer_pos_for_seq(input_seq[j], strlen(input_seq[j]), K, sda_ptr, lk_ptr);
+#define INDEL_ALLOWENCE_0 6
+
+        arange = find_best_aln_range(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5);  // narrow band to avoid aligning through big indels
+
+        //printf("1:%ld %ld %ld %ld\n", arange_->s1, arange_->e1, arange_->s2, arange_->e2);
+
+        //arange = find_best_aln_range2(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5);  // narrow band to avoid aligning through big indels
+
+        //printf("2:%ld %ld %ld %ld\n\n", arange->s1, arange->e1, arange->s2, arange->e2);
+        
+#define INDEL_ALLOWENCE_1 0.10
+        if (arange->e1 - arange->s1 < 100 || arange->e2 - arange->s2 < 100 ||
+            abs( (arange->e1 - arange->s1 ) - (arange->e2 - arange->s2) ) > 
+                   (int) (0.5 * INDEL_ALLOWENCE_1 * (arange->e1 - arange->s1 + arange->e2 - arange->s2))) {
+            free_kmer_match( kmer_match_ptr);
+            free_aln_range(arange);
+            continue;
+        }
+        //printf("%ld %s\n", strlen(input_seq[j]), input_seq[j]);
+        //printf("%ld %s\n\n", strlen(input_seq[0]), input_seq[0]);
+        
+        
+#define INDEL_ALLOWENCE_2 150
+
+        aln = align(input_seq[j]+arange->s1, arange->e1 - arange->s1 ,
+                    input_seq[0]+arange->s2, arange->e2 - arange->s2 , 
+                    INDEL_ALLOWENCE_2, 1);
+        if (aln->aln_str_size > 500 && ((double) aln->dist / (double) aln->aln_str_size) < max_diff) {
+            tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str, 
+                                                           aln->t_aln_str, 
+                                                           aln->aln_str_size, 
+                                                           arange, j, 
+                                                           0); 
+            aligned_seq_count ++;
+        }
+        /***
+        for (k = 0; k < tags_list[j]->len; k++) {
+            printf("%ld %d %c\n", tags_list[j]->align_tags[k].t_pos,
+                                   tags_list[j]->align_tags[k].delta,
+                                   tags_list[j]->align_tags[k].q_base);
+        }
+        ***/
+        free_aln_range(arange);
+        free_alignment(aln);
+        free_kmer_match( kmer_match_ptr);
+    }
+
+    if (aligned_seq_count > 0) {
+        consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, strlen(input_seq[0]), min_cov );
+    } else {
+        // allocate an empty consensus sequence
+        consensus = calloc( 1, sizeof(consensus_data) );
+        consensus->sequence = calloc( 1, sizeof(char) );
+        consensus->eqv = calloc( 1, sizeof(unsigned int) );
+    }
+    //free(consensus);
+    free_seq_addr_array(sda_ptr);
+    free_seq_array(sa_ptr);
+    free_kmer_lookup(lk_ptr);
+    for (j=0; j < aligned_seq_count; j++) {
+        free_align_tags(tags_list[j]);
+    }
+    free(tags_list);
+    return consensus;
+}
+
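+// Consensus for a unitig backbone: each read is aligned against the backbone at
+// the region implied by its given offset (clipping reads that hang off either
+// end), and the resulting tags are combined as in generate_consensus().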
+consensus_data * generate_utg_consensus( char ** input_seq, 
+                           seq_coor_t *offset,
+                           unsigned int n_seq, 
+                           unsigned min_cov, 
+                           unsigned K,
+                           double min_idt) {
+
+    unsigned int j;
+    unsigned int seq_count;
+    unsigned int aligned_seq_count;
+    aln_range * arange;
+    alignment * aln;
+    align_tags_t ** tags_list;
+    //char * consensus;
+    consensus_data * consensus;
+    double max_diff;
+    seq_coor_t utg_len;
+    seq_coor_t r_len;
+    max_diff = 1.0 - min_idt;
+    
+
+    seq_count = n_seq;
+    /***
+    for (j=0; j < seq_count; j++) {
+        printf("seq_len: %u %u\n", j, strlen(input_seq[j]));
+    };
+    fflush(stdout);
+    ***/
+    tags_list = calloc( seq_count+1, sizeof(align_tags_t *) );
+    utg_len =  strlen(input_seq[0]);
+    aligned_seq_count = 0;
+    arange = calloc( 1, sizeof(aln_range) );
+
+    arange->s1 = 0;
+    arange->e1 = strlen(input_seq[0]);
+    arange->s2 = 0;
+    arange->e2 = strlen(input_seq[0]); 
+    tags_list[aligned_seq_count] = get_align_tags( input_seq[0], input_seq[0], 
+                                                   strlen(input_seq[0]), arange, 0, 0); 
+    aligned_seq_count += 1;
+    for (j=1; j < seq_count; j++) {
+        arange->s1 = 0;
+        arange->e1 = strlen(input_seq[j])-1;
+        arange->s2 = 0;
+        arange->e2 = strlen(input_seq[j])-1; 
+
+        r_len = strlen(input_seq[j]);
+        //printf("seq_len: %u %u\n", j, r_len);
+        if ( offset[j] < 0) {
+            if ((r_len + offset[j]) < 128) {
+                continue;
+            }
+            if ( r_len + offset[j] < utg_len ) {
+
+                //printf("1: %ld %u %u\n", offset[j], r_len, utg_len);
+                aln = align(input_seq[j] - offset[j], r_len + offset[j] ,
+                            input_seq[0], r_len + offset[j] , 
+                            500, 1);
+            } else {
+                //printf("2: %ld %u %u\n", offset[j], r_len, utg_len);
+                aln = align(input_seq[j] - offset[j], utg_len ,
+                            input_seq[0], utg_len , 
+                            500, 1);
+            }
+            offset[j] = 0;
+
+        } else {
+            if ( offset[j] > utg_len - 128) {
+                continue;
+            }
+            if ( offset[j] + r_len > utg_len ) {
+                //printf("3: %ld %u %u\n", offset[j], r_len, utg_len);
+                aln = align(input_seq[j], utg_len - offset[j] ,
+                            input_seq[0]+offset[j], utg_len - offset[j], 
+                            500, 1);
+            } else {
+                //printf("4: %ld %u %u\n", offset[j], r_len, utg_len);
+                aln = align(input_seq[j], r_len ,
+                            input_seq[0]+offset[j], r_len , 
+                            500, 1);
+            }
+        }
+        if (aln->aln_str_size > 500 && ((double) aln->dist / (double) aln->aln_str_size) < max_diff) {
+            tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str, aln->t_aln_str, 
+                                                           aln->aln_str_size, arange, j, 
+                                                           offset[j]); 
+            aligned_seq_count ++;
+        }
+        free_alignment(aln);
+    }
+    free_aln_range(arange);
+    if (aligned_seq_count > 0) {
+        consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, utg_len, 0 );
+    } else {
+        // allocate an empty consensus sequence
+        consensus = calloc( 1, sizeof(consensus_data) );
+        consensus->sequence = calloc( 1, sizeof(char) );
+        consensus->eqv = calloc( 1, sizeof(unsigned int) );
+    }
+    //free(consensus);
+    for (j=0; j < aligned_seq_count; j++) {
+        free_align_tags(tags_list[j]);
+    }
+    free(tags_list);
+    return consensus;
+}
+
+
+void free_consensus_data( consensus_data * consensus ){
+    free(consensus->sequence);
+    free(consensus->eqv);
+    free(consensus);
+}
+
+/***
+void main() {
+    unsigned int j;
+    char small_buffer[1024];
+    char big_buffer[65536];
+    char ** input_seq;
+    char ** seq_id;
+    int seq_count;
+    char * consensus;
+
+    input_seq = calloc( 501, sizeof(char *));
+    seq_id = calloc( 501, sizeof(char *));
+    
+    while(1) {
+        seq_count = 0;
+        while (1) {
+
+            scanf("%s", small_buffer);
+            seq_id[seq_count] = calloc( strlen(small_buffer) + 1, sizeof(char));
+            strcpy(seq_id[seq_count], small_buffer);
+
+            scanf("%s", big_buffer);
+            input_seq[seq_count] = calloc( strlen(big_buffer) + 1 , sizeof(char));
+            strcpy(input_seq[seq_count], big_buffer);
+
+            if (strcmp(seq_id[seq_count], "+") == 0) {
+                break;
+            }
+            if (strcmp(seq_id[seq_count], "-") == 0) {
+                break;
+            }
+            //printf("%s\n", seq_id[seq_count]);
+            seq_count += 1;
+            if (seq_count > 500) break;
+        }
+        //printf("sc: %d\n", seq_count);
+        if (seq_count < 10 && strcmp(seq_id[seq_count], "-") != 0 ) continue;
+        if (seq_count < 10 && strcmp(seq_id[seq_count], "-") == 0 ) break;
+
+            consensus = generate_consensus(input_seq, seq_count, 8, 8);
+        if (strlen(consensus) > 500) {
+            printf(">%s\n%s\n", seq_id[0], consensus);
+        }
+        fflush(stdout);
+        free(consensus);
+        for (j=0; j < seq_count; j++) {
+            free(seq_id[j]);
+            free(input_seq[j]);
+        };
+
+    }
+    for (j=0; j < seq_count; j++) {
+        free(seq_id[j]);
+        free(input_seq[j]);
+    };
+    free(seq_id);
+    free(input_seq);
+}
+***/
diff --git a/src/c/kmer_lookup.c b/src/c/kmer_lookup.c
new file mode 100755
index 0000000..e19e200
--- /dev/null
+++ b/src/c/kmer_lookup.c
@@ -0,0 +1,589 @@
+/*
+ * =====================================================================================
+ *
+ *       Filename:  kmer_lookup.c
+ *
+ *    Description:  
+ *
+ *        Version:  0.1
+ *        Created:  07/20/2013 17:00:00
+ *       Revision:  none
+ *       Compiler:  gcc
+ *
+ *         Author:  Jason Chin, 
+ *        Company:  
+ *
+ * =====================================================================================
+
+ #################################################################################$$
+ # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+ #
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted (subject to the limitations in the
+ # disclaimer below) provided that the following conditions are met:
+ #
+ #  * Redistributions of source code must retain the above copyright
+ #  notice, this list of conditions and the following disclaimer.
+ #
+ #  * Redistributions in binary form must reproduce the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #  * Neither the name of Pacific Biosciences nor the names of its
+ #  contributors may be used to endorse or promote products derived
+ #  from this software without specific prior written permission.
+ #
+ # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+ # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+ # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ # SUCH DAMAGE.
+ #################################################################################$$
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include "common.h"
+
+
+const unsigned int KMERMATCHINC = 10000;
+
+int compare_seq_coor(const void * a, const void * b) {
+    const seq_coor_t * arg1 = a;
+    const seq_coor_t * arg2 = b;
+    return  (* arg1) - (* arg2);
+}
+
+
+kmer_lookup * allocate_kmer_lookup ( seq_coor_t size ) {
+    kmer_lookup * kl;
+
+    //printf("%lu is allocated for kmer lookup\n", size);
+    kl = (kmer_lookup *)  malloc( size * sizeof(kmer_lookup) );
+    init_kmer_lookup( kl, size);
+    return kl;
+}
+
+void init_kmer_lookup ( kmer_lookup * kl,  seq_coor_t size ) {
+    seq_coor_t i;
+    //printf("%lu is allocated for kmer lookup\n", size);
+    for (i=0; i<size; i++) {
+        kl[i].start = INT_MAX;
+        kl[i].last = INT_MAX;
+        kl[i].count = 0;
+    }
+}
+
+
+void free_kmer_lookup( kmer_lookup *  ptr) {
+    free(ptr);
+}
+
+seq_array allocate_seq(seq_coor_t size) {
+    seq_array sa;
+    sa  = (seq_array) malloc( size * sizeof(base) ); 
+    init_seq_array( sa, size);
+    return sa;
+}
+
+void init_seq_array( seq_array sa, seq_coor_t size) {
+    seq_coor_t i;
+    for (i=0; i<size; i++) {
+        sa[i] = 0xff;
+    }
+}
+
+void free_seq_array( seq_array sa) {
+    free(sa);
+}
+
+seq_addr_array allocate_seq_addr(seq_coor_t size) {
+    return (seq_addr_array) calloc( size, sizeof(seq_addr));
+}
+
+void free_seq_addr_array(seq_addr_array sda) {
+    free(sda);
+}
+
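+// Pack the first K 2-bit encoded bases of sa into a single integer, used as the
+// index into the k-mer lookup table.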
+seq_coor_t get_kmer_bitvector(seq_array sa, unsigned int K) {
+    unsigned int i;
+    seq_coor_t kmer_bv = 0;
+    seq_coor_t kmer_mask;
+
+    kmer_mask = 0;
+    for (i = 0; i < K; i++) {
+        kmer_mask <<= 2;
+        kmer_mask |= 0x00000003;
+    }
+
+    for (i = 0; i < K; i++) {
+        kmer_bv <<= 2;
+        kmer_bv |= (unsigned int) sa[i];
+    }
+
+    return kmer_bv;
+}
+
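+// Encode a sequence into the 2-bit array sa and index all of its k-mers:
+// lk[kmer] keeps the first and last occurrence plus a count, and sda forms a
+// linked list from each occurrence to the next occurrence of the same k-mer.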
+void add_sequence ( seq_coor_t start, 
+                    unsigned int K, 
+                    char * seq, 
+                    seq_coor_t seq_len,
+                    seq_addr_array sda, 
+                    seq_array sa, 
+                    kmer_lookup * lk ) {
+
+    seq_coor_t i;
+    seq_coor_t kmer_bv;
+    seq_coor_t kmer_mask;
+
+    kmer_mask = 0;
+    for (i = 0; i < K; i++) {
+        kmer_mask <<= 2;
+        kmer_mask |= 0x00000003;
+    }
+
+    for (i = 0; i < seq_len; i++) {
+        switch ( seq[i] ) {
+            case 'A':
+                sa[ start + i ] = 0;
+                break;
+            case 'C':
+                sa[ start + i ] = 1;
+                break;
+            case 'G':
+                sa[ start + i ] = 2;
+                break;
+            case 'T':
+                sa[ start + i ] = 3;
+        }
+    }
+    kmer_bv = get_kmer_bitvector( sa + start, K);
+    for (i = 0; i < seq_len - K;  i++) {
+        //printf("%lu %lu\n", i, kmer_bv);
+        //printf("lk before init: %lu %lu %lu\n", kmer_bv, lk[kmer_bv].start, lk[kmer_bv].last);
+        if (lk[kmer_bv].start == INT_MAX) {
+            lk[kmer_bv].start = start + i;
+            lk[kmer_bv].last = start + i;
+            lk[kmer_bv].count += 1;
+            //printf("lk init: %lu %lu %lu\n", kmer_bv, lk[kmer_bv].start, lk[kmer_bv].last);
+        } else {
+            sda[ lk[kmer_bv].last ] = start + i;
+            lk[kmer_bv].count += 1;
+            lk[kmer_bv].last = start + i;
+            //printf("lk change: %lu %lu %lu\n", kmer_bv, lk[kmer_bv].start, lk[kmer_bv].last);
+        }
+        kmer_bv <<= 2;
+        kmer_bv |= sa[ start + i + K];
+        kmer_bv &= kmer_mask;
+    }
+}
+
+
+void mask_k_mer(seq_coor_t size, kmer_lookup * kl, seq_coor_t threshold) {
+    seq_coor_t i;
+    for (i=0; i<size; i++) {
+        if (kl[i].count > threshold) {
+            kl[i].start = INT_MAX;
+            kl[i].last = INT_MAX;
+            //kl[i].count = 0;
+        }
+    }
+}
+
+
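+// Scan the query every K/2 bases and, for each k-mer found in the lookup table,
+// walk the sda linked list to collect every (query position, target position)
+// match pair; masked or high-frequency k-mers (start == INT_MAX) are skipped.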
+kmer_match * find_kmer_pos_for_seq( char * seq, seq_coor_t seq_len, unsigned int K,
+                    seq_addr_array sda, 
+                    kmer_lookup * lk) {
+    seq_coor_t i;
+    seq_coor_t kmer_bv;
+    seq_coor_t kmer_mask;
+    seq_coor_t kmer_pos;
+    seq_coor_t next_kmer_pos;
+    unsigned int half_K;
+    seq_coor_t kmer_match_rtn_allocation_size = KMERMATCHINC;
+    kmer_match * kmer_match_rtn;
+    base * sa;
+
+    kmer_match_rtn = (kmer_match *) malloc( sizeof(kmer_match) );
+    kmer_match_rtn->count = 0;
+    kmer_match_rtn->query_pos = (seq_coor_t *) calloc( kmer_match_rtn_allocation_size, sizeof( seq_coor_t ) );
+    kmer_match_rtn->target_pos = (seq_coor_t *) calloc( kmer_match_rtn_allocation_size, sizeof( seq_coor_t ) );
+
+    sa = calloc( seq_len, sizeof(base) );
+
+    kmer_mask = 0;
+    for (i = 0; i < K; i++) {
+        kmer_mask <<= 2;
+        kmer_mask |= 0x00000003;
+    }
+
+    for (i = 0; i < seq_len; i++) {
+        switch ( seq[i] ) {
+            case 'A':
+                sa[ i ] = 0;
+                break;
+            case 'C':
+                sa[ i ] = 1;
+                break;
+            case 'G':
+                sa[ i ] = 2;
+                break;
+            case 'T':
+                sa[ i ] = 3;
+        }
+    }
+
+
+    kmer_bv = get_kmer_bitvector(sa, K);
+    half_K = K >> 1;
+    for (i = 0; i < seq_len - K;  i += half_K) {
+        kmer_bv = get_kmer_bitvector(sa + i, K);
+        if (lk[kmer_bv].start == INT_MAX) {  //for high count k-mers
+            continue;
+        }
+        kmer_pos = lk[ kmer_bv ].start;
+        next_kmer_pos = sda[ kmer_pos ];
+        kmer_match_rtn->query_pos[ kmer_match_rtn->count ] = i;
+        kmer_match_rtn->target_pos[ kmer_match_rtn->count ] = kmer_pos;
+        kmer_match_rtn->count += 1;
+        if (kmer_match_rtn->count > kmer_match_rtn_allocation_size - 1000) {
+            kmer_match_rtn_allocation_size += KMERMATCHINC;
+            kmer_match_rtn->query_pos = (seq_coor_t *) realloc( kmer_match_rtn->query_pos, 
+                                                                   kmer_match_rtn_allocation_size  * sizeof(seq_coor_t) );
+            kmer_match_rtn->target_pos = (seq_coor_t *) realloc( kmer_match_rtn->target_pos, 
+                                                                    kmer_match_rtn_allocation_size  * sizeof(seq_coor_t) );
+        }
+        while ( next_kmer_pos > kmer_pos ){
+            kmer_pos = next_kmer_pos;
+            next_kmer_pos = sda[ kmer_pos ];
+            kmer_match_rtn->query_pos[ kmer_match_rtn->count ] = i;
+            kmer_match_rtn->target_pos[ kmer_match_rtn->count ] = kmer_pos;
+            kmer_match_rtn->count += 1;
+            if (kmer_match_rtn->count > kmer_match_rtn_allocation_size - 1000) {
+                kmer_match_rtn_allocation_size += KMERMATCHINC;
+                kmer_match_rtn->query_pos = (seq_coor_t *) realloc( kmer_match_rtn->query_pos, 
+                                                                       kmer_match_rtn_allocation_size  * sizeof(seq_coor_t) );
+                kmer_match_rtn->target_pos = (seq_coor_t *) realloc( kmer_match_rtn->target_pos, 
+                                                                        kmer_match_rtn_allocation_size  * sizeof(seq_coor_t) );
+            }
+        }
+    }
+    free(sa);
+    return kmer_match_rtn;
+}
+
+void free_kmer_match( kmer_match * ptr) {
+    free(ptr->query_pos);
+    free(ptr->target_pos);
+    free(ptr);
+}
+
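+// Pick the best alignment range from the raw k-mer matches: matches are binned
+// by diagonal d = query_pos - target_pos, the densest bin is chosen, and a
+// simple run-scoring pass over the surviving matches sets the start/end
+// coordinates of the range.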
+aln_range* find_best_aln_range(kmer_match * km_ptr, 
+                              seq_coor_t K, 
+                              seq_coor_t bin_size, 
+                              seq_coor_t count_th) {
+    seq_coor_t i;
+    seq_coor_t j;
+    seq_coor_t q_min, q_max, t_min, t_max;
+    seq_coor_t * d_count;
+    seq_coor_t * q_coor;
+    seq_coor_t * t_coor;
+    aln_range * arange;
+
+    long int d, d_min, d_max;
+    long int cur_score;
+    long int max_score;
+    long int max_k_mer_count;
+    long int max_k_mer_bin;
+    seq_coor_t cur_start;
+
+    arange = calloc(1 , sizeof(aln_range));
+
+    q_min = INT_MAX;
+    q_max = 0;
+    t_min = INT_MAX;
+    t_max = 0;
+
+    d_min = INT_MAX;
+    d_max = LONG_MIN;
+
+    for (i = 0; i <  km_ptr->count; i++ ) {
+        if ( km_ptr -> query_pos[i] < q_min) {
+            q_min =  km_ptr->query_pos[i];
+        }
+        if ( km_ptr -> query_pos[i] > q_max) {
+            q_max =  km_ptr->query_pos[i];
+        }
+        if ( km_ptr -> target_pos[i] < t_min) {
+            t_min =  km_ptr->target_pos[i];
+        }
+        if ( km_ptr -> target_pos[i] > t_max) {
+            t_max =  km_ptr->target_pos[i];
+        }
+        d = (long int) km_ptr->query_pos[i] - (long int) km_ptr->target_pos[i];
+        if ( d < d_min ) {
+            d_min = d;
+        }
+        if ( d > d_max ) {
+            d_max = d;
+        }
+    }
+
+    //printf("%lu %ld %ld\n" , km_ptr->count, d_min, d_max);
+    d_count = calloc( (d_max - d_min)/bin_size + 1, sizeof(seq_coor_t) );
+    q_coor = calloc( km_ptr->count, sizeof(seq_coor_t) );
+    t_coor = calloc( km_ptr->count, sizeof(seq_coor_t) );
+
+    for (i = 0; i <  km_ptr->count; i++ ) {
+        d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]);
+        d_count[ (d - d_min)/ (long int) bin_size ] += 1;
+        q_coor[i] = INT_MAX;
+        t_coor[i] = INT_MAX;
+    }
+
+    j = 0;
+    max_k_mer_count = 0;
+    max_k_mer_bin = INT_MAX;
+    for (i = 0; i <  km_ptr->count; i++ ) {
+        d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]);
+        if ( d_count[ (d - d_min)/ (long int) bin_size ] > max_k_mer_count) {
+            max_k_mer_count =  d_count[ (d - d_min)/ (long int) bin_size ];
+            max_k_mer_bin = (d - d_min)/ (long int) bin_size;
+        }
+    }
+    //printf("k_mer: %lu %lu\n" , max_k_mer_count, max_k_mer_bin);
+    
+    if ( max_k_mer_bin != INT_MAX && max_k_mer_count > count_th ) {
+        for (i = 0; i <  km_ptr->count; i++ ) {
+            d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]);
+            if ( abs( ( (d - d_min)/ (long int) bin_size ) - max_k_mer_bin ) > 5 ) {
+                continue;
+            }
+            if (d_count[ (d - d_min)/ (long int) bin_size ] > count_th) {
+                q_coor[j] = km_ptr->query_pos[i];  
+                t_coor[j] = km_ptr->target_pos[i];
+                //printf("d_count: %lu %lu\n" ,i, d_count[(d - d_min)/ (long int) bin_size]);
+                //printf("coor: %lu %lu\n" , q_coor[j], t_coor[j]);
+                j ++;
+            }
+        }
+    }
+
+    if (j > 1) {
+        arange->s1 = q_coor[0];
+        arange->e1 = q_coor[0];
+        arange->s2 = t_coor[0];
+        arange->e2 = t_coor[0];
+        arange->score = 0;
+
+        max_score = 0;
+        cur_score = 0;
+        cur_start = 0;
+
+        for (i = 1; i < j; i++) {
+            cur_score += 32 - (q_coor[i] - q_coor[i-1]);
+            //printf("deltaD, %lu %ld\n", q_coor[i] - q_coor[i-1], cur_score);
+            if (cur_score < 0) {
+                cur_score = 0;
+                cur_start = i;
+            } else if (cur_score > max_score) {
+                arange->s1 = q_coor[cur_start];
+                arange->s2 = t_coor[cur_start];
+                arange->e1 = q_coor[i];
+                arange->e2 = t_coor[i];
+                max_score = cur_score;
+                arange->score = max_score;
+                //printf("%lu %lu %lu %lu\n", arange.s1, arange.e1, arange.s2, arange.e2);
+            }
+        }
+
+    } else {
+        arange->s1 = 0;
+        arange->e1 = 0;
+        arange->s2 = 0;
+        arange->e2 = 0;
+        arange->score = 0;
+    }
+
+    // printf("free\n");
+
+    free(d_count);
+    free(q_coor);
+    free(t_coor);
+    return arange;
+}
+
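+// Alternative range finder: sort the match diagonals, locate the densest
+// diagonal window (width ~5% of the combined sequence lengths), then chain
+// the matches inside that window with a simple gap-penalized scoring pass
+// and back-trace the best chain to obtain the alignment range.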
+aln_range* find_best_aln_range2(kmer_match * km_ptr, 
+                                seq_coor_t K, 
+                                seq_coor_t bin_width, 
+                                seq_coor_t count_th) {
+
+    seq_coor_t * d_coor;
+    seq_coor_t * hit_score;
+    seq_coor_t * hit_count;
+    seq_coor_t * last_hit;
+    seq_coor_t max_q, max_t;
+    seq_coor_t s, e, max_s, max_e, max_span, d_s, d_e, delta, d_len;
+    seq_coor_t px, py, cx, cy;
+    seq_coor_t max_hit_idx;
+    seq_coor_t max_hit_score, max_hit_count;
+    seq_coor_t i, j;
+    seq_coor_t candidate_idx, max_d, d;
+
+    aln_range * arange;
+
+    arange = calloc(1 , sizeof(aln_range));
+
+    d_coor = calloc( km_ptr->count, sizeof(seq_coor_t) );
+
+    max_q = -1;
+    max_t = -1;
+
+    for (i = 0; i <  km_ptr->count; i++ ) {
+        d_coor[i] = km_ptr->query_pos[i] - km_ptr->target_pos[i];
+        max_q = max_q > km_ptr->query_pos[i] ? max_q : km_ptr->query_pos[i];
+        max_t = max_t > km_ptr->target_pos[i] ? max_t : km_ptr->target_pos[i];
+
+    }
+
+    qsort(d_coor, km_ptr->count, sizeof(seq_coor_t), compare_seq_coor);
+
+
+    s = 0;
+    e = 0;
+    max_s = -1;
+    max_e = -1;
+    max_span = -1;
+    delta = (long int) ( 0.05 * ( max_q + max_t ) );
+    d_len =  km_ptr->count;
+    d_s = -1;
+    d_e = -1;
+    while (1) {
+        d_s = d_coor[s];
+        d_e = d_coor[e];
+        while (d_e < d_s + delta && e < d_len-1) {
+            e += 1;
+            d_e = d_coor[e];
+        }
+        if ( max_span == -1 || e - s > max_span ) {
+            max_span = e - s;
+            max_s = s;
+            max_e = e;
+        }
+        s += 1;
+        if (s == d_len || e == d_len) {
+            break;
+        }
+    }
+
+    if (max_s == -1 || max_e == -1 || max_e - max_s < 32) {
+        arange->s1 = 0;
+        arange->e1 = 0;
+        arange->s2 = 0;
+        arange->e2 = 0;
+        arange->score = 0;
+        free(d_coor);
+        return arange;
+    }
+
+    last_hit = calloc( km_ptr->count, sizeof(seq_coor_t) );
+    hit_score = calloc( km_ptr->count, sizeof(seq_coor_t) );
+    hit_count = calloc( km_ptr->count, sizeof(seq_coor_t) );
+
+    for (i = 0; i <  km_ptr->count; i++ ) {
+        last_hit[i] = -1;
+        hit_score[i] = 0;
+        hit_count[i] = 0;
+    }
+    max_hit_idx = -1;
+    max_hit_score = 0;
+    max_hit_count = 0;
+    for (i = 0; i < km_ptr->count; i ++)  {
+        cx = km_ptr->query_pos[i];
+        cy = km_ptr->target_pos[i];
+        d = cx - cy; 
+        if ( d < d_coor[max_s] || d > d_coor[max_e] ) continue;
+
+        j = i - 1;
+        candidate_idx = -1;
+        max_d = 65535;
+        while (1) {
+            if ( j < 0 ) break;
+            px = km_ptr->query_pos[j];
+            py = km_ptr->target_pos[j];
+            d = px - py;
+            if ( d < d_coor[max_s] || d > d_coor[max_e] ) {
+                j--;
+                continue;
+            }
+            if (cx - px > 320) break; // this constant controls how large an alignment gap will be considered
+            if (cy > py && cx - px + cy - py < max_d && cy - py <= 320 ) {
+                max_d = cx - px + cy - py;
+                candidate_idx = j;
+            }
+            j--;
+        }
+        if (candidate_idx != -1) {
+            last_hit[i] = candidate_idx;
+            hit_score[i] = hit_score[candidate_idx] + (64 - max_d);
+            hit_count[i] = hit_count[candidate_idx] + 1;
+            if (hit_score[i] < 0) {
+                hit_score[i] = 0;
+                hit_count[i] = 0;
+            }
+        } else {
+            hit_score[i] = 0;
+            hit_count[i] = 0;
+        }
+        if (hit_score[i] > max_hit_score) {
+            max_hit_score = hit_score[i];
+            max_hit_count = hit_count[i];
+            max_hit_idx = i;
+        }
+
+    }
+    if (max_hit_idx == -1) {
+        arange->s1 = 0;
+        arange->e1 = 0;
+        arange->s2 = 0;
+        arange->e2 = 0;
+        arange->score = 0;
+        free(d_coor);
+        free(last_hit);
+        free(hit_score);
+        free(hit_count);
+        return arange;
+    }
+
+    arange->score = max_hit_count + 1;
+    arange->e1 = km_ptr->query_pos[max_hit_idx];
+    arange->e2 = km_ptr->target_pos[max_hit_idx];
+    i = max_hit_idx;
+    while (last_hit[i] != -1) {
+        i = last_hit[i];
+    }
+    arange->s1 = km_ptr->query_pos[i];
+    arange->s2 = km_ptr->target_pos[i];
+
+    free(d_coor);
+    free(last_hit);
+    free(hit_score);
+    free(hit_count);
+    return arange;
+}
+
+void free_aln_range( aln_range * arange) {
+    free(arange);
+}
diff --git a/src/py/FastaReader.py b/src/py/FastaReader.py
new file mode 100644
index 0000000..65085bd
--- /dev/null
+++ b/src/py/FastaReader.py
@@ -0,0 +1,260 @@
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from os.path import abspath, expanduser
+from cStringIO import StringIO
+import gzip
+import md5
+import re
+
+def splitFastaHeader( name ):
+    """
+    Split a FASTA/FASTQ header into its id and metadata components
+    """
+    nameParts = re.split('\s', name, maxsplit=1)
+    id_ = nameParts[0]
+    if len(nameParts) > 1:
+        metadata = nameParts[1].strip()
+    else:
+        metadata = None
+    return (id_, metadata)
+
+def splitFileContents(f, delimiter, BLOCKSIZE=8192):
+    """
+    Same semantics as f.read().split(delimiter), but with memory usage
+    determined by largest chunk rather than entire file size
+    """
+    remainder = StringIO()
+    while True:
+        block = f.read(BLOCKSIZE)
+        if not block:
+            break
+        parts = block.split(delimiter)
+        remainder.write(parts[0])
+        for part in parts[1:]:
+            yield remainder.getvalue()
+            remainder = StringIO()
+            remainder.write(part)
+    yield remainder.getvalue()
+
+def isFileLikeObject(o):
+    return hasattr(o, "read") and hasattr(o, "write")
+
+def getFileHandle(filenameOrFile, mode="r"):
+    """
+    Given a filename not ending in ".gz", open the file with the
+    appropriate mode.
+    Given a filename ending in ".gz", return a filehandle to the
+    unzipped stream.
+    Given a file object, return it unless the mode is incorrect--in
+    that case, raise an exception.
+    """
+    assert mode in ("r", "w")
+
+    if isinstance(filenameOrFile, basestring):
+        filename = abspath(expanduser(filenameOrFile))
+        if filename.endswith(".gz"):
+            return gzip.open(filename, mode)
+        else:
+            return open(filename, mode)
+    elif isFileLikeObject(filenameOrFile):
+        return filenameOrFile
+    else:
+        raise Exception("Invalid type to getFileHandle")
+
+
+class ReaderBase(object):
+    def __init__(self, f):
+        """
+        Prepare for iteration through the records in the file
+        """
+        self.file = getFileHandle(f, "r")
+
+    def close(self):
+        """
+        Close the underlying file
+        """
+        self.file.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+
+class FastaRecord(object):
+    """
+    A FastaRecord object models a named sequence in a FASTA file.
+    """
+    DELIMITER = ">"
+    COLUMNS   = 60
+
+    def __init__(self, name, sequence):
+        try:
+            assert "\n" not in name
+            assert "\n" not in sequence
+            assert self.DELIMITER not in sequence
+            self._name = name
+            self._sequence = sequence
+            self._md5 = md5.md5(self.sequence).hexdigest()
+            self._id, self._metadata = splitFastaHeader(name)
+        except AssertionError:
+            raise ValueError("Invalid FASTA record data")
+
+    @property
+    def name(self):
+        """
+        The name of the sequence in the FASTA file, equal to the entire
+        FASTA header following the '>' character
+        """
+        return self._name
+
+    @property
+    def id(self):
+        """
+        The id of the sequence in the FASTA file, equal to the FASTA header
+        up to the first whitespace.
+        """
+        return self._id
+
+    @property
+    def metadata(self):
+        """
+        The metadata associated with the sequence in the FASTA file, equal to
+        the contents of the FASTA header following the first whitespace
+        """
+        return self._metadata
+
+    @property
+    def sequence(self):
+        """
+        The sequence for the record as present in the FASTA file.
+        (Newlines are removed but otherwise no sequence normalization
+        is performed).
+        """
+        return self._sequence
+
+    @property
+    def length(self):
+        """
+        Get the length of the FASTA sequence
+        """
+        return len(self._sequence)
+
+    @property
+    def md5(self):
+        """
+        The MD5 checksum (hex digest) of `sequence`
+        """
+        return self._md5
+
+    @classmethod
+    def fromString(cls, s):
+        """
+        Interprets a string as a FASTA record.  Does not make any
+        assumptions about wrapping of the sequence string.
+        """
+        try:
+            lines = s.splitlines()
+            assert len(lines) > 1
+            assert lines[0][0] == cls.DELIMITER
+            name = lines[0][1:]
+            sequence = "".join(lines[1:])
+            return FastaRecord(name, sequence)
+        except AssertionError:
+            raise ValueError("String not recognized as a valid FASTA record")
+
+    def reverseComplement(self, preserveHeader=False):
+        """
+        Return a new FastaRecord with the reverse-complemented DNA sequence.
+        Optionally, preserve the original header.
+        """
+        # The pbcore "sequences" helper is not available in this module; compute
+        # the reverse complement locally (unrecognized characters are kept as-is).
+        complement = dict(zip("ACGTacgtNn-", "TGCAtgcaNn-"))
+        rcSequence = "".join(complement.get(b, b) for b in reversed(self.sequence))
+        if preserveHeader:
+            return FastaRecord(self.name, rcSequence)
+        else:
+            rcName = '{0} [revcomp]'.format(self.name.strip())
+            return FastaRecord(rcName, rcSequence)
+
+    def __eq__(self, other):
+        if isinstance(other, self.__class__):
+            return (self.name     == other.name and
+                    self.sequence == other.sequence)
+        else:
+            return False
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __str__(self):
+        """
+        Output a string representation of this FASTA record, observing
+        standard conventions about sequence wrapping.
+        """
+        # "wrap" is not defined in this module; wrap the sequence inline at COLUMNS
+        return (">%s\n" % self.name) + \
+            "\n".join(self.sequence[i:i + self.COLUMNS]
+                      for i in xrange(0, len(self.sequence), self.COLUMNS))
+
+
+class FastaReader(ReaderBase):
+    """
+    Streaming reader for FASTA files, useable as a one-shot iterator
+    over FastaRecord objects.  Agnostic about line wrapping.
+    Example:
+    .. doctest::
+        TODO: Get data.
+        > from pbcore import data
+        > filename = data.getTinyFasta()
+        > r = FastaReader(filename)
+        > for record in r:
+        ...     print record.name, len(record.sequence), record.md5
+        ref000001|EGFR_Exon_2 183 e3912e9ceacd6538ede8c1b2adda7423
+        ref000002|EGFR_Exon_3 203 4bf218da37175a91869033024ac8f9e9
+        ref000003|EGFR_Exon_4 215 245bc7a046aad0788c22b071ed210f4d
+        ref000004|EGFR_Exon_5 157 c368b8191164a9d6ab76fd328e2803ca
+        >>> r.close()
+    """
+    DELIMITER = ">"
+
+    def __iter__(self):
+        try:
+            parts = splitFileContents(self.file, ">")
+            assert "" == next(parts)
+            for part in parts:
+                yield FastaRecord.fromString(">" + part)
+        except AssertionError:
+            raise ValueError("Invalid FASTA file")
+
diff --git a/src/py/__init__.py b/src/py/__init__.py
new file mode 100644
index 0000000..2e1685f
--- /dev/null
+++ b/src/py/__init__.py
@@ -0,0 +1,39 @@
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from .falcon_kit import *
diff --git a/src/py/falcon_kit.py b/src/py/falcon_kit.py
new file mode 100644
index 0000000..32a85ac
--- /dev/null
+++ b/src/py/falcon_kit.py
@@ -0,0 +1,199 @@
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+__all__ = [
+    'kup', 'DWA', 'falcon',
+    'KmerLookup', 'KmerMatch', 'AlnRange', 'ConsensusData',
+    'Alignment', 'get_alignment',
+    ]
+
+from ctypes import *
+from . import ext_falcon
+#module_path = os.path.split(__file__)[0]
+
+
+seq_coor_t = c_int
+base_t = c_uint8
+
+class KmerLookup(Structure):
+    _fields_ = [("start", seq_coor_t),
+                ("last", seq_coor_t),
+                ("count", seq_coor_t)]
+
+class KmerMatch(Structure):
+    _fields_ = [ ("count", seq_coor_t),
+                ("query_pos", POINTER(seq_coor_t)),
+                ("target_pos", POINTER(seq_coor_t)) ]
+
+class AlnRange(Structure):
+    _fields_ = [ ("s1", seq_coor_t),
+                 ("e1", seq_coor_t),
+                 ("s2", seq_coor_t),
+                 ("e2", seq_coor_t),
+                 ("score", c_long) ]
+
+class ConsensusData(Structure):
+    _fields_ = [ ("sequence", c_char_p),
+                 ("eff_cov", POINTER(c_uint)) ]
+
+
+falcon_dll = CDLL(ext_falcon.__file__)
+
+kup = falcon_dll
+
+kup.allocate_kmer_lookup.argtypes =  [seq_coor_t] 
+kup.allocate_kmer_lookup.restype = POINTER(KmerLookup)
+kup.init_kmer_lookup.argtypes = [POINTER(KmerLookup), seq_coor_t]
+kup.free_kmer_lookup.argtypes = [POINTER(KmerLookup)]
+
+kup.allocate_seq.argtypes = [seq_coor_t]
+kup.allocate_seq.restype = POINTER(base_t)
+kup.init_seq_array.argtypes = [POINTER(base_t), seq_coor_t]
+kup.free_seq_array.argtypes = [POINTER(base_t)]
+
+kup.allocate_seq_addr.argtypes = [seq_coor_t]
+kup.allocate_seq_addr.restype = POINTER(seq_coor_t)
+kup.free_seq_addr_array.argtypes = [POINTER(seq_coor_t)]
+
+kup.add_sequence.argtypes = [ seq_coor_t, c_uint, POINTER(c_char), seq_coor_t, POINTER(seq_coor_t), 
+                              POINTER(c_uint8), POINTER(KmerLookup) ]
+kup.mask_k_mer.argtypes =[ c_long, POINTER(KmerLookup), c_long ]
+kup.find_kmer_pos_for_seq.argtypes = [ POINTER(c_char), seq_coor_t, c_uint, POINTER(seq_coor_t), 
+                                       POINTER(KmerLookup)]
+kup.find_kmer_pos_for_seq.restype = POINTER(KmerMatch)
+kup.free_kmer_match.argtypes = [ POINTER(KmerMatch) ]
+
+
+kup.find_best_aln_range.argtypes = [POINTER(KmerMatch), seq_coor_t, seq_coor_t, seq_coor_t]
+kup.find_best_aln_range.restype = POINTER(AlnRange)
+kup.find_best_aln_range2.argtypes = [POINTER(KmerMatch), seq_coor_t, seq_coor_t, seq_coor_t]
+kup.find_best_aln_range2.restype = POINTER(AlnRange)
+kup.free_aln_range.argtypes = [POINTER(AlnRange)]
+
+
+class Alignment(Structure):
+    """
+    typedef struct {    
+        seq_coor_t aln_str_size ;
+        seq_coor_t dist ;
+        seq_coor_t aln_q_s;
+        seq_coor_t aln_q_e;
+        seq_coor_t aln_t_s;
+        seq_coor_t aln_t_e;
+        char * q_aln_str;
+        char * t_aln_str;
+    } alignment;
+    """
+    _fields_ = [ ("aln_str_size", seq_coor_t),
+                 ("dist", seq_coor_t),
+                 ("aln_q_s", seq_coor_t),
+                 ("aln_q_e", seq_coor_t),
+                 ("aln_t_s", seq_coor_t),
+                 ("aln_t_e", seq_coor_t),
+                 ("q_aln_str", c_char_p),
+                 ("t_aln_str", c_char_p)]
+
+
+DWA = falcon_dll
+
+DWA.align.argtypes = [ POINTER(c_char), c_long, POINTER(c_char), c_long, c_long, c_int ] 
+DWA.align.restype = POINTER(Alignment)
+DWA.free_alignment.argtypes = [POINTER(Alignment)]
+
+
+
+falcon = falcon_dll
+
+falcon.generate_consensus.argtypes = [POINTER(c_char_p), c_uint, c_uint, c_uint, c_uint, c_uint, c_double  ]
+falcon.generate_consensus.restype = POINTER(ConsensusData)
+falcon.free_consensus_data.argtypes = [ POINTER(ConsensusData) ]
+
+
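+# Quick k-mer based alignment of seq1 against seq0: build a k-mer lookup for
+# seq0, find the best matching range, then refine it with the banded aligner.
+# Returns (s1, e1, s2, e2, aln_size, aln_dist), or None when no sufficiently
+# long (> 500 bp) alignment is found.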
+def get_alignment(seq1, seq0):
+    K = 8
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*10, 50)
+    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
+    kup.free_kmer_match(kmer_match_ptr)
+    aln_range = aln_range_ptr[0]
+    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
+    kup.free_aln_range(aln_range_ptr)
+
+    if e1 - s1 > 500:
+        #s1 = 0 if s1 < 14 else s1 - 14
+        #s2 = 0 if s2 < 14 else s2 - 14
+        e1 = len(seq1) if e1 >= len(seq1)-2*K else e1 + K*2
+        e2 = len(seq0) if e2 >= len(seq0)-2*K else e2 + K*2
+        
+        alignment = DWA.align(seq1[s1:e1], e1-s1,
+                              seq0[s2:e2], e2-s2,
+                              100,
+                              0)
+        #print seq1[s1:e1]
+        #print seq0[s2:e2]
+        #if alignment[0].aln_str_size > 500:
+
+        #aln_str1 = alignment[0].q_aln_str
+        #aln_str0 = alignment[0].t_aln_str
+        aln_size = alignment[0].aln_str_size
+        aln_dist = alignment[0].dist
+        aln_q_s = alignment[0].aln_q_s
+        aln_q_e = alignment[0].aln_q_e
+        aln_t_s = alignment[0].aln_t_s
+        aln_t_e = alignment[0].aln_t_e
+        
+        #print "X,",alignment[0].aln_q_s, alignment[0].aln_q_e
+        #print "Y,",alignment[0].aln_t_s, alignment[0].aln_t_e
+        
+        #print aln_str1
+        #print aln_str0
+    
+        DWA.free_alignment(alignment)
+
+    kup.free_seq_addr_array(sda_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+    if e1 - s1 > 500 and aln_size > 500:
+        return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist
+    else:
+        return None
diff --git a/src/py/fc_asm_graph.py b/src/py/fc_asm_graph.py
new file mode 100644
index 0000000..8f7d235
--- /dev/null
+++ b/src/py/fc_asm_graph.py
@@ -0,0 +1,212 @@
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+import networkx as nx
+from FastaReader import FastaReader
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+
+def reverse_end( node_id ):
+    node_id, end = node_id.split(":")
+    new_end = "B" if end == "E" else "E"
+    return node_id + ":" + new_end
+
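+# AsmGraph loads the string-graph edges ("sg_edges_list"), the unitig data
+# ("utg_data") and the contig paths ("ctg_paths") produced by the assembly
+# steps, and builds node-to-unitig and node-to-contig lookup tables.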
+class AsmGraph(object):
+
+    def __init__(self, sg_file, utg_file, ctg_file):
+        self.sg_edges = {}
+        self.sg_edge_seqs = {}
+        self.utg_data = {}
+        self.ctg_data ={}
+        self.utg_to_ctg = {}
+        self.node_to_ctg = {}
+        self.node_to_utg = {}
+
+        self.load_sg_data(sg_file)
+        self.load_utg_data(utg_file)
+        self.load_ctg_data(ctg_file)
+
+        self.build_node_map()
+
+    def load_sg_data(self, sg_file):
+
+        with open(sg_file) as f:
+            for l in f:
+                l = l.strip().split()
+                v, w = l[0:2]
+                seq_id, b, e = l[2:5]
+                b, e = int(b), int(e)
+                score, idt = l[5:7]
+                score, idt = int(score), float(idt)
+                type_ = l[7]
+                self.sg_edges[ (v, w) ] = ( (seq_id, b, e), score, idt, type_)
+
+    def load_sg_seq(self, fasta_fn):
+
+        all_read_ids = set() # read ids in the graph
+
+        for v, w in self.sg_edges:
+            type_ = self.sg_edges[ (v, w) ][-1]
+            if type_ != "G":
+                continue
+            v = v.split(":")[0]
+            w = w.split(":")[0]
+            all_read_ids.add(v)
+            all_read_ids.add(w)
+
+        seqs = {}
+        # load all p-read name into memory
+        f = FastaReader(fasta_fn)
+        for r in f:
+            if r.name not in all_read_ids:
+                continue
+            seqs[r.name] = r.sequence.upper()
+
+
+        for v, w in self.sg_edges:
+            seq_id, s, t = self.sg_edges[ (v, w) ][0]
+            type_ = self.sg_edges[ (v, w) ][-1]
+
+            if type_ != "G":
+                continue
+
+            if s < t:
+                e_seq = seqs[ seq_id ][ s:t ]
+            else:
+                e_seq = "".join([ RCMAP[c] for c in seqs[ seq_id ][ s:t:-1 ] ])
+            self.sg_edge_seqs[ (v, w) ] = e_seq
+
+    def get_seq_from_path(self, path):
+        if len(self.sg_edge_seqs) == 0:
+            return ""
+        v = path[0]
+        seqs = []
+        for w in path[1:]:
+            seqs.append( self.sg_edge_seqs[ (v, w) ] )
+            v = w
+        return "".join(seqs)
+
+
+    def load_utg_data(self, utg_file):
+
+        with open(utg_file) as f:
+            for l in f:
+                l = l.strip().split()
+                s, v, t = l[0:3]
+                type_, length, score = l[3:6]
+                length, score = int(length), int(score)
+                path_or_edges = l[6]
+                self.utg_data[ (s,t,v) ] = ( type_, length, score, path_or_edges)
+
+
+    def load_ctg_data(self, ctg_file):
+
+        with open(ctg_file) as f:
+            for l in f:
+                l = l.strip().split()
+                ctg_id, ctg_type = l[0:2]
+                start_edge = l[2]
+                end_node = l[3]
+                length = int(l[4])
+                score = int(l[5])
+                path = tuple( ( e.split("~") for e in l[6].split("|") ) )
+                self.ctg_data[ ctg_id ] = ( ctg_type, start_edge, end_node,  length, score, path )
+                for u in path:
+                    s, v, t = u
+                    #print s, v, t
+                    type_, length, score, path_or_edges =  self.utg_data[ (s,t,v) ]
+                    if type_ != "compound":
+                        self.utg_to_ctg[ (s, t, v) ] = ctg_id
+                    else:
+                        for svt in path_or_edges.split("|"):
+                            s, v, t = svt.split("~")
+                            self.utg_to_ctg[ (s, t, v) ] = ctg_id
+
+
+    def get_sg_for_utg(self, utg_id):
+        sg = nx.DiGraph()
+        type_, length, score, path_or_edges =  self.utg_data[ utg_id ]
+        if type_ == "compound":
+            for svt in path_or_edges.split("|"):
+                s, v, t = svt.split("~")
+                type_, length, score, one_path =  self.utg_data[ (s, t, v) ]
+                one_path = one_path.split("~")
+                sg.add_path(one_path)
+        else:
+            one_path = path_or_edges.split("~")
+            sg.add_path(one_path)
+        return sg
+
+
+    def get_sg_for_ctg(self, ctg_id):
+        sg = nx.DiGraph()
+        utgs = []
+        path = self.ctg_data[ctg_id][-1]
+        for s, v, t in path:
+            type_, length, score, path_or_edges =  self.utg_data[ (s, t, v) ]
+            utgs.append( (type_, path_or_edges) )
+
+        for t, utg in utgs:
+            if t == "simple":
+                one_path = utg.split("~")
+                sg.add_path(one_path)
+            elif t == "compound":
+                for svt in utg.split("|"):
+                    s, v, t = svt.split("~")
+                    type_, length, score, one_path =  self.utg_data[ (s, t, v) ]
+                    one_path = one_path.split("~")
+                    sg.add_path(one_path)
+
+        return sg
+
+
+    def build_node_map(self):
+
+        for ctg_id in self.ctg_data:
+            sg = self.get_sg_for_ctg( ctg_id )
+            for n in sg.nodes():
+                self.node_to_ctg.setdefault(n, set())
+                self.node_to_ctg[n].add(ctg_id)
+
+
+        for u_id in self.utg_data:
+            if self.utg_data[u_id][0] == "compound":
+                continue
+            sg = self.get_sg_for_utg( u_id )
+            for n in sg.nodes():
+                self.node_to_utg.setdefault(n, set())
+                self.node_to_utg[n].add( u_id )
diff --git a/src/py/mains/__init__.py b/src/py/mains/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/py/mains/actg_coordinate.py b/src/py/mains/actg_coordinate.py
new file mode 100644
index 0000000..032e3f4
--- /dev/null
+++ b/src/py/mains/actg_coordinate.py
@@ -0,0 +1,27 @@
+from falcon_kit.FastaReader import FastaReader
+
+
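+# Walk the primary-contig tiling path to assign a coordinate to every node,
+# then report, for each associated contig in a_ctg.fa, the coordinates of its
+# begin and end nodes on the corresponding primary contig.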
+def main(*argv):
+  p_ctg_coor_map = {}
+  with open("p_ctg_tiling_path") as f:
+    for row in f:
+        row = row.strip().split()
+        ctg_id, v, w, edge_rid, b, e  = row[:6]
+        if ctg_id not in p_ctg_coor_map:
+            coor = 0   # p_ctg_tiling_path is assumed to be sorted by contig and then by the order of the edges in the tiling path
+            p_ctg_coor_map[ctg_id] = {}
+            p_ctg_coor_map[ctg_id][v] = 0
+            coor += abs(int(b) - int(e))
+            p_ctg_coor_map[ctg_id][w] = coor
+            continue
+        else:
+            coor += abs(int(b) - int(e))
+            p_ctg_coor_map[ctg_id][w] = coor 
+
+
+  a_ctg_fasta = FastaReader("a_ctg.fa")
+  for r in a_ctg_fasta:
+    rid = r.name.split()
+    rid, v, w = rid[:3]
+    pid = rid.split("-")[0]
+    print rid, p_ctg_coor_map[pid][v], p_ctg_coor_map[pid][w]
diff --git a/src/py/mains/consensus.py b/src/py/mains/consensus.py
new file mode 100644
index 0000000..af3e624
--- /dev/null
+++ b/src/py/mains/consensus.py
@@ -0,0 +1,240 @@
+from ctypes import (POINTER, c_char_p, c_uint, c_uint, c_uint, c_uint, c_uint, c_double, string_at)
+from falcon_kit.multiproc import Pool
+from falcon_kit import falcon
+import argparse
+import os
+import re
+import sys
+import falcon_kit
+
+
+falcon.generate_consensus.argtypes = [ POINTER(c_char_p), c_uint, c_uint, c_uint, c_uint, c_uint, c_double ]
+falcon.generate_consensus.restype = POINTER(falcon_kit.ConsensusData)
+falcon.free_consensus_data.argtypes = [ POINTER(falcon_kit.ConsensusData) ]
+
+
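+# Sparse k-mer matching of seq1 against seq0 (no banded alignment here).
+# Returns (s1, e1, s0, e0, aln_size, aln_score, "aln") when a long enough
+# match away from the read edges is found, otherwise a zeroed tuple with
+# status "none".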
+def get_alignment(seq1, seq0, edge_tolerance = 1000):
+
+    kup = falcon_kit.kup
+    K = 8 
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+
+    kup.mask_k_mer(1 << (K * 2), lk_ptr, 16)
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range2(kmer_match_ptr, K, K*50, 25)
+    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
+    aln_range = aln_range_ptr[0]
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s0, e0, km_score = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2, aln_range.score  
+    e1 += K + K/2
+    e0 += K + K/2
+    kup.free_aln_range(aln_range)
+    len_1 = len(seq1)
+    len_0 = len(seq0)
+    if e1 > len_1: 
+        e1 = len_1
+    if e0 > len_0:
+        e0 = len_0
+
+    aln_size = 1
+    if e1 - s1 > 500:
+
+        aln_size = max( e1-s1, e0-s0 )
+        aln_score = int(km_score * 48)
+        aln_q_s = s1
+        aln_q_e = e1
+        aln_t_s = s0
+        aln_t_e = e0
+        
+    kup.free_seq_addr_array(sda_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+
+    if s1 > edge_tolerance and s0 > edge_tolerance:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+    if len_1 - e1 > edge_tolerance and len_0 - e0 > edge_tolerance:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+
+    if e1 - s1 > 500 and aln_size > 500:
+        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_score, "aln"
+    else:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+def get_consensus_without_trim( c_input ):
+    seqs, seed_id, config = c_input
+    min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
+    if len(seqs) > max_n_read:
+        seqs = seqs[:max_n_read]
+    seqs_ptr = (c_char_p * len(seqs))()
+    seqs_ptr[:] = seqs
+    consensus_data_ptr = falcon.generate_consensus( seqs_ptr, len(seqs), min_cov, K, 
+                                                    local_match_count_window, local_match_count_threshold, min_idt )
+
+    consensus = string_at(consensus_data_ptr[0].sequence)[:]
+    eff_cov = consensus_data_ptr[0].eff_cov[:len(consensus)]
+    falcon.free_consensus_data( consensus_data_ptr )
+    del seqs_ptr
+    return consensus, seed_id
+
+def get_consensus_with_trim( c_input ):
+    seqs, seed_id, config = c_input
+    min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
+    trim_seqs = []
+    seed = seqs[0]
+    for seq in seqs[1:]:
+        aln_data = get_alignment(seq, seed, edge_tolerance)
+        s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
+        if c_status == "none":
+            continue
+        if aln_score > 1000 and e1 - s1 > 500:
+            e1 -= trim_size
+            s1 += trim_size
+            trim_seqs.append( (e1-s1, seq[s1:e1]) )
+    trim_seqs.sort(key = lambda x:-x[0]) #use longest alignment first
+    trim_seqs = [x[1] for x in trim_seqs]
+        
+    if len(trim_seqs) > max_n_read:
+        trim_seqs = trim_seqs[:max_n_read]
+
+    trim_seqs = [seed] + trim_seqs
+
+
+    seqs_ptr = (c_char_p * len(trim_seqs))()
+    seqs_ptr[:] = trim_seqs
+    consensus_data_ptr = falcon.generate_consensus( seqs_ptr, len(trim_seqs), min_cov, K, 
+                                               local_match_count_window, local_match_count_threshold, min_idt )
+    consensus = string_at(consensus_data_ptr[0].sequence)[:]
+    eff_cov = consensus_data_ptr[0].eff_cov[:len(consensus)]
+    falcon.free_consensus_data( consensus_data_ptr )
+    del seqs_ptr
+    return consensus, seed_id
+
+
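+# Read alignment groups from stdin: each data line is "<read_id> <sequence>".
+# A "+" line closes the current group and yields it when it has at least
+# min_cov_aln reads (seed first, remaining reads sorted longest first); a "*"
+# line discards the current group, and a "-" line ends the input.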
+def get_seq_data(config, min_cov_aln, min_len_aln):
+    max_len = 100000
+    min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
+    seqs = []
+    seed_id = None
+    seqs_data = []
+    read_ids = set()
+    with sys.stdin as f:
+        for l in f:
+            l = l.strip().split()
+            if len(l) != 2:
+                continue
+
+            read_id = l[0]
+            seq = l[1]
+            if len(seq) > max_len:
+                seq = seq[:max_len-1]
+
+            if read_id not in ("+", "-", "*"):
+                if len(seq) >= min_len_aln:
+                    if len(seqs) == 0:
+                        seqs.append(seq) #the "seed"
+                        seed_id = l[0]
+                    if read_id not in read_ids: #avoid using the same read twice; the seed is used again here by design
+                        seqs.append(seq)
+                        read_ids.add(read_id)
+            elif l[0] == "+":
+                if len(seqs) >= min_cov_aln:
+                    seqs = seqs[:1] + sorted(seqs[1:], key=lambda x: -len(x))
+                    yield (seqs[:max_n_read], seed_id, config) 
+                #seqs_data.append( (seqs, seed_id) ) 
+                seqs = []
+                read_ids = set()
+                seed_id = None
+            elif l[0] == "*":
+                seqs = []
+                read_ids = set()
+                seed_id = None
+            elif l[0] == "-":
+                #yield (seqs, seed_id)
+                #seqs_data.append( (seqs, seed_id) )
+                break
+def format_seq(seq, col):
+    return "\n".join( [ seq[i:(i+col)] for i in xrange(0, len(seq), col) ] )
+
+def main(*argv):
+    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
+    parser.add_argument('--n_core', type=int, default=24,
+                        help='number of processes used for generating consensus; '
+                        '0 for main process only (default=%(default)s)')
+    parser.add_argument('--local_match_count_window', type=int, default=12,
+                        help='local match window size (obsolete, no effect)')
+    parser.add_argument('--local_match_count_threshold', type=int, default=6,
+                        help='local match count threshold (obsolete, no effect)')
+    parser.add_argument('--min_cov', type=int, default=6,
+                        help='minimum coverage; the consensus is broken wherever coverage drops below this value')
+    parser.add_argument('--min_cov_aln', type=int, default=10,
+                        help='minimum coverage of alignment data; an alignment with fewer reads will be completely ignored')
+    parser.add_argument('--min_len_aln', type=int, default=100,
+                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
+    parser.add_argument('--max_n_read', type=int, default=500,
+                        help='maximum number of reads used in generating the consensus')
+    parser.add_argument('--trim', action="store_true", default=False,
+                        help='trim the input sequences with k-mer sparse dynamic programming to find the mapped range')
+    parser.add_argument('--output_full', action="store_true", default=False,
+                        help='output uncorrected regions too')
+    parser.add_argument('--output_multi', action="store_true", default=False,
+                        help='output multiple corrected regions; implies --output_dformat unless --output_simple_fasta_header is given')
+    parser.add_argument('--output_dformat', action="store_true", default=True,
+                        help='output daligner compatible header, only work with --output_multi; DEPRECATED and ignored, as this is the default now')
+    parser.add_argument('--output_simple_fasta_header', action='store_true', default=False,
+                        help='Turn off --output_dformat. This was only needed for older (pre-spring-2015) DALIGNER.')
+    parser.add_argument('--min_idt', type=float, default=0.70,
+                        help='minimum identity of the alignments used for correction')
+    parser.add_argument('--edge_tolerance', type=int, default=1000,
+                        help='for trimming, ignore a read whose unaligned edge length is > edge_tolerance')
+    parser.add_argument('--trim_size', type=int, default=50,
+                        help='the size for trimming both ends of the initial sparse aligned region')
+    good_region = re.compile("[ACGT]+")
+    args = parser.parse_args(argv[1:])
+    exe_pool = Pool(args.n_core)
+    if args.trim:
+        get_consensus = get_consensus_with_trim
+    else:
+        get_consensus = get_consensus_without_trim
+
+    K = 8
+    config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold,\
+             args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size
+    # TODO: pass config object, not tuple, so we can add fields
+    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_cov_aln, args.min_len_aln)):
+        cns, seed_id = res
+        if len(cns) < 500:
+            continue
+
+
+        if args.output_full:
+            print ">"+seed_id+"_f"
+            print cns
+        else:
+            cns = good_region.findall(cns)
+            if len(cns) == 0:
+                continue
+            if args.output_multi:
+                seq_i = 0
+                for cns_seq in cns:
+                    if len(cns_seq) < 500:
+                        continue
+                    if not args.output_simple_fasta_header:
+                        if seq_i >= 10:
+                            break
+                        print ">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq))
+                        print format_seq(cns_seq, 80)
+                    else:
+                        print ">"+seed_id+"_%d" % seq_i
+                        print cns_seq
+                    seq_i += 1
+            else:
+                cns.sort(key = lambda x: len(x))
+                print ">"+seed_id
+                print cns[-1]
+
diff --git a/src/py/mains/contig_annotate.py b/src/py/mains/contig_annotate.py
new file mode 100644
index 0000000..e617347
--- /dev/null
+++ b/src/py/mains/contig_annotate.py
@@ -0,0 +1,29 @@
+import networkx as nx
+from falcon_kit.fc_asm_graph import AsmGraph
+
+
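+# Walk both the primary and associated tiling paths, printing for each node
+# its coordinate along the contig and the set of contigs that contain it
+# (taken from the assembly graph's node-to-contig map).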
+def main(*argv):
+  G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
+
+
+  p_ctg_coor_map = {}
+  for fn in ("p_ctg_tiling_path", "a_ctg_tiling_path"):
+    f = open(fn)
+    for row in f:
+        row = row.strip().split()
+        ctg_id, v, w, edge_rid, b, e  = row[:6]
+        if ctg_id not in p_ctg_coor_map:
+            coor = 0   # the tiling path files are assumed to be sorted by contig and then by the order of the edges in the tiling path
+            p_ctg_coor_map[ctg_id] = {}
+            p_ctg_coor_map[ctg_id][v] = 0
+            coor += abs(int(b) - int(e))
+            p_ctg_coor_map[ctg_id][w] = coor
+            G_asm.node_to_ctg[w]
+            print ctg_id, v, 0, " ".join(list(G_asm.node_to_ctg[v]))
+            print ctg_id, w, coor, " ".join(list(G_asm.node_to_ctg[w]))
+            continue
+        else:
+            coor += abs(int(b) - int(e))
+            p_ctg_coor_map[ctg_id][w] = coor 
+            print ctg_id, w, coor, " ".join(list(G_asm.node_to_ctg[w]))
+    f.close()
diff --git a/src/py/mains/ctg_link_analysis.py b/src/py/mains/ctg_link_analysis.py
new file mode 100644
index 0000000..ad6d71c
--- /dev/null
+++ b/src/py/mains/ctg_link_analysis.py
@@ -0,0 +1,80 @@
+from falcon_kit import fc_asm_graph 
+
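+# For every string-graph edge whose two nodes belong to different contigs,
+# record the contig pair (and likewise the unitig pair), then print one
+# summary line per linked contig pair with lengths and supporting edges.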
+def main(*argv):
+  AsmGraph = fc_asm_graph.AsmGraph
+
+  G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
+
+  sg_edges = G_asm.sg_edges
+  node_to_ctg = G_asm.node_to_ctg
+  node_to_utg = G_asm.node_to_utg
+
+  ctg_data = G_asm.ctg_data
+  utg_data = G_asm.utg_data
+
+  ctg_pair_links = {}
+  for v, w in sg_edges.keys():
+    if v in node_to_ctg and w in node_to_ctg:
+        for ctg1 in list(node_to_ctg[v]):
+            for ctg2 in list(node_to_ctg[w]):
+                if ctg1 == ctg2:
+                    continue
+                ctg_pair_links.setdefault((ctg1, ctg2), set())
+                ctg_pair_links[ (ctg1, ctg2) ].add( (v,w) )    
+
+                    
+  utg_pair_links = {}
+  for v, w in sg_edges.keys():
+    if v in node_to_utg and w in node_to_utg:
+        for u1 in list(node_to_utg[v]):
+            for u2 in list(node_to_utg[w]):
+                if u1 == u2:
+                    continue
+                utg_pair_links.setdefault((u1, u2), set())
+                utg_pair_links[(u1,u2)].add( (v, w) )
+
+
+  for ctg1, ctg2 in ctg_pair_links:
+    links = ctg_pair_links[ ( ctg1, ctg2 ) ]
+    count = len(links)
+    if count > 0:
+        path1 = ctg_data[ctg1][-1][-5:]
+        path2 = ctg_data[ctg2][-1][:5]
+        utg1 = []
+        utg2 = []
+        for s1, v1, t1 in path1:
+            u1 = (s1, t1, v1)
+            type_, length, score, path_or_edges =  utg_data[ u1 ]
+            if type_ == "compound":
+                for u in path_or_edges.split("|"):
+                    ss, vv, tt = u.split("~")
+                    utg1.append( (ss, tt, vv) )
+            else:
+               utg1.append(u1)
+        for s2, v2, t2 in path2:
+            u2 = (s2, t2, v2)
+            type_, length, score, path_or_edges =  utg_data[ u2 ]
+            if type_ == "compound":
+                for u in path_or_edges.split("|"):
+                    ss, vv, tt = u.split("~")
+                    utg2.append( (ss, tt, vv) )
+            else:
+               utg2.append(u2) 
+        #print path1
+        #print path2
+        #print len(utg1), len(utg2)
+        for u1 in utg1:
+            for u2 in utg2:
+                u1 = tuple(u1)
+                u2 = tuple(u2)
+                c = utg_pair_links.get( (u1, u2), set() )
+                if len(c) == 0:
+                    continue
+                s1,t1,v1 = u1
+                s2,t2,v2 = u2
+                len_1 = ctg_data[ ctg1 ][ 3 ]
+                len_2 = ctg_data[ ctg2 ][ 3 ]
+                print ctg1, ctg2, len_1, len_2, len(utg1), len(utg2), len(links), "~".join( (s1,v1,t1) ),  "~".join( (s2,v2,t2) ), len(c)
+        
+
+
diff --git a/src/py/mains/dedup_a_tigs.py b/src/py/mains/dedup_a_tigs.py
new file mode 100644
index 0000000..c642487
--- /dev/null
+++ b/src/py/mains/dedup_a_tigs.py
@@ -0,0 +1,22 @@
+from falcon_kit.FastaReader import FastaReader
+import argparse
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='remove duplicate a-tigs; assumes the a_ctg_all.fa file is in the working directory')
+    parser.add_argument('--max_idt', type=int, help="keep a-tig if the identity (in %) to the primary contig is <= max_idt", default = 96)
+    parser.add_argument('--max_aln_cov', type=int, help="keep a-tig if the alignment coverage (in %) on the a-tig is <= max_aln_cov", default = 97)
+    parser.add_argument('--min_len_diff', type=int, help="keep a-tig if the length difference is > min_len_diff", default = 500)
+    args = parser.parse_args(argv[1:])
+    return args
+
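+# Keep an a-tig only if it differs enough from its primary contig: it is
+# dropped when identity > max_idt, alignment coverage > max_aln_cov, and the
+# length difference is smaller than min_len_diff.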
+def main(*argv):
+    args = parse_args(argv)
+    reads = FastaReader("a_ctg_all.fa")
+    with open("a_ctg.fa","w") as f:
+        for r in reads:
+            tig_id, v, w, len_, ovl, ne, delta_l, idt, cov = r.name.split()
+            if 100*float(idt) > args.max_idt and 100*float(cov) > args.max_aln_cov and\
+               abs(int(delta_l)) < args.min_len_diff:
+                   continue
+            print >>f, ">"+r.name
+            print >>f, r.sequence
diff --git a/src/py/mains/graph_to_contig.py b/src/py/mains/graph_to_contig.py
new file mode 100644
index 0000000..293619d
--- /dev/null
+++ b/src/py/mains/graph_to_contig.py
@@ -0,0 +1,297 @@
+import networkx as nx
+#from pbcore.io import FastaReader
+from falcon_kit.FastaReader import FastaReader
+from falcon_kit import kup, falcon, DWA
+
+read_fasta = "preads4falcon.fasta"
+edge_data_file = "sg_edges_list"
+utg_data_file = "utg_data"
+ctg_data_file = "ctg_paths"
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+def rc(seq):
+    return "".join([RCMAP[c] for c in seq[::-1]])
+
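+# Align q_seq against t_seq: build a k-mer lookup for t_seq, pick the best
+# matching range, then run the banded aligner on that range.  Returns the
+# alignment records plus the raw k-mer match coordinates (x, y).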
+def get_aln_data(t_seq, q_seq):
+    aln_data = []
+    x = []
+    y = []
+    K = 8
+    seq0 = t_seq
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+    q_id = "dummy"
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+
+    if kmer_match.count != 0:
+        aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 12)
+        aln_range = aln_range_ptr[0]
+        x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count)] )
+
+        s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
+
+        if e1 - s1 > 100:
+
+            alignment = DWA.align(q_seq[s1:e1], e1-s1,
+                                  seq0[s2:e2], e2-s2,
+                                  1500,1)
+
+            if alignment[0].aln_str_size > 100:
+                aln_data.append( ( q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist ) )
+                aln_str1 = alignment[0].q_aln_str
+                aln_str0 = alignment[0].t_aln_str
+
+            DWA.free_alignment(alignment)
+
+        kup.free_aln_range(aln_range_ptr)
+
+    kup.free_kmer_match(kmer_match_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_seq_addr_array(sda_ptr)
+    return aln_data, x, y
+
+def reverse_end( node_id ):
+    node_id, end = node_id.split(":")
+    new_end = "B" if end == "E" else "E"
+    return node_id + ":" + new_end
+
+def main(*argv):
+    reads_in_layout = set()
+    with open(edge_data_file) as f:
+        for l in f:
+            l = l.strip().split()
+            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62"""
+            v, w, rid, s, t, aln_score, idt, type_ = l
+            if type_ != "G":
+                continue
+            r1 = v.split(":")[0]
+            reads_in_layout.add(r1)
+            r2 = w.split(":")[0]
+            reads_in_layout.add(r2)
+
+    seqs = {}
+    # load all p-read name into memory
+    f = FastaReader(read_fasta)
+    for r in f:
+        if r.name not in reads_in_layout:
+            continue
+        seqs[r.name] = r.sequence.upper()
+
+    edge_data = {}
+    with open(edge_data_file) as f:
+        for l in f:
+            l = l.strip().split()
+            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62"""
+            v, w, rid, s, t, aln_score, idt, type_ = l
+
+            if type_ != "G":
+                continue
+            r1 = v.split(":")[0]
+            reads_in_layout.add(r1)
+            r2 = w.split(":")[0]
+            reads_in_layout.add(r2)
+
+            s = int(s)
+            t = int(t)
+            aln_score = int(aln_score)
+            idt = float(idt)
+
+            if s < t:
+                e_seq = seqs[ rid ][ s:t ]
+            else:
+                e_seq = "".join([ RCMAP[c] for c in seqs[ rid ][ s:t:-1 ] ])
+            edge_data[ (v, w) ] = ( rid, s, t, aln_score, idt, e_seq )
+
+    utg_data = {}
+    with open(utg_data_file) as f:
+        for l in f:
+            l = l.strip().split()
+            s, v, t, type_, length, score, path_or_edges = l
+            if type_ not in ["compound", "simple", "contained"]:
+                continue
+            length = int(length)
+            score = int(score)
+            if type_ in ("simple", "contained"):
+                path_or_edges = path_or_edges.split("~")
+            else:
+                path_or_edges = [ tuple(e.split("~")) for e in path_or_edges.split("|") ]
+            utg_data[ (s,v,t) ] = type_, length, score, path_or_edges
+
+    p_ctg_out = open("p_ctg.fa","w")
+    a_ctg_out = open("a_ctg_all.fa","w")
+    a_ctg_base_out = open("a_ctg_base.fa","w")
+    p_ctg_t_out = open("p_ctg_tiling_path","w")
+    a_ctg_t_out = open("a_ctg_tiling_path","w")
+    a_ctg_base_t_out = open("a_ctg_base_tiling_path","w")
+    layout_ctg = set()
+
+    with open(ctg_data_file) as f:
+        for l in f:
+            l = l.strip().split()
+            ctg_id, c_type_, i_utig, t0, length, score, utgs = l
+            ctg_id = ctg_id
+            s0 = i_utig.split("~")[0]
+
+            if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
+                continue
+            else:
+                layout_ctg.add( (s0, t0) )
+
+            ctg_label = i_utig+"~"+t0
+            length = int(length)
+            utgs = utgs.split("|")
+            one_path = []
+            total_score = 0
+            total_length =0
+
+            #a_ctg_data = []
+            a_ctg_group = {}
+
+            for utg in utgs:
+                s,v,t  = utg.split("~")
+                type_, length, score, path_or_edges = utg_data[ (s,v,t) ]
+                total_score += score
+                total_length += length
+                if type_ == "simple":
+                    if len(one_path) != 0:
+                        one_path.extend ( path_or_edges[1:] )
+                    else:
+                        one_path.extend ( path_or_edges )
+                if type_ == "compound":
+
+                    c_graph = nx.DiGraph()
+
+                    all_alt_path = []
+                    for ss, vv, tt in path_or_edges:
+                        type_, length, score, sub_path = utg_data[ (ss,vv,tt) ]
+
+                        v1 = sub_path[0]
+                        for v2 in sub_path[1:]:
+                            c_graph.add_edge( v1, v2, e_score = edge_data[ (v1, v2) ][3]  )
+                            v1 = v2
+
+                    shortest_path = nx.shortest_path( c_graph, s, t, "e_score" )
+                    score = nx.shortest_path_length( c_graph, s, t, "e_score" )
+                    all_alt_path.append( (score, shortest_path) )
+
+                    #a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
+                    while 1:
+                        n0 = shortest_path[0]
+                        for n1 in shortest_path[1:]:
+                            c_graph.remove_edge(n0, n1)
+                            n0 = n1
+                        try:
+                            shortest_path = nx.shortest_path( c_graph, s, t, "e_score" )
+                            score = nx.shortest_path_length( c_graph, s, t, "e_score" )
+                            #a_ctg_data.append( (s, t, shortest_path) )
+                            all_alt_path.append( (score, shortest_path) )
+
+                        except nx.exception.NetworkXNoPath:
+                            break
+                        #if len(shortest_path) < 2:
+                        #    break
+                    all_alt_path.sort()
+                    all_alt_path.reverse()
+                    shortest_path = all_alt_path[0][1]
+                    if len(one_path) != 0:
+                        one_path.extend ( shortest_path[1:] )
+                    else:
+                        one_path.extend ( shortest_path )
+
+                    a_ctg_group[ (s, t) ] = all_alt_path
+
+            if len(one_path) == 0:
+                continue
+
+            one_path_edges = zip(one_path[:-1], one_path[1:])
+
+            sub_seqs = []
+            for vv, ww in one_path_edges:
+                rid, s, t, aln_score, idt, e_seq = edge_data[ (vv, ww) ]
+                sub_seqs.append( e_seq )
+                print >> p_ctg_t_out, "%s %s %s %s %d %d %d %0.2f" % (ctg_id, vv, ww, rid, s, t, aln_score, idt)
+            print >> p_ctg_out, ">%s %s %s %d %d" % (ctg_id, ctg_label, c_type_, total_length, total_score)
+            print >> p_ctg_out, "".join(sub_seqs)
+
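+            # emit alternate contigs: for each compound region (v, w) the first
+            # path is the "base" sequence already used in the primary contig;
+            # every additional path is aligned back to that base (when both are
+            # long enough) to estimate identity and coverage, then written out
+            # with its own sub id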
+            a_id = 1
+            for v, w in a_ctg_group:
+                #get the base sequence used in the primary contig
+                #count = len( [x for x in a_ctg_group[ (v, w) ] if len(x[1]) > 3] )
+                #if count < 2:
+                #    continue
+                atig_output = []
+
+                score, atig_path = a_ctg_group[ (v, w) ][0]
+                atig_path_edges = zip(atig_path[:-1], atig_path[1:])
+                sub_seqs = []
+                total_length = 0
+                total_score = 0
+                for vv, ww in atig_path_edges:
+                    rid, s, t, aln_score, idt, e_seq = edge_data[ (vv, ww) ]
+                    sub_seqs.append( e_seq )
+                    total_length += abs(s-t)
+                    total_score += aln_score
+
+                base_seq = "".join(sub_seqs)
+                atig_output.append( (v, w, atig_path, total_length, total_score, base_seq, atig_path_edges, 0, 1, 1) )
+
+                for score, atig_path in a_ctg_group[ (v, w) ][1:]:
+                    atig_path_edges = zip(atig_path[:-1], atig_path[1:])
+                    sub_seqs = []
+                    total_length = 0
+                    total_score = 0
+                    for vv, ww in atig_path_edges:
+                        rid, s, t, aln_score, idt, e_seq = edge_data[ (vv, ww) ]
+                        sub_seqs.append( e_seq )
+                        total_length += abs(s-t)
+                        total_score += aln_score
+
+                    seq = "".join(sub_seqs)
+
+                    delta_len = len(seq) - len(base_seq)
+                    idt = 0.0
+                    cov = 0.0
+                    if len(base_seq) > 2000 and len(seq) > 2000:
+                        aln_data, x, y = get_aln_data(base_seq, seq)
+                        if len( aln_data ) != 0:
+                            idt =  1.0-1.0*aln_data[-1][-1] / aln_data[-1][-2]
+                            cov = 1.0*(aln_data[-1][3]-aln_data[-1][2])/aln_data[-1][4]
+
+                    atig_output.append( (v, w, atig_path, total_length, total_score, seq, atig_path_edges, delta_len, idt, cov) )
+
+                if len(atig_output) == 1:
+                    continue
+
+                sub_id = 0
+                for data in atig_output:
+                    v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, delta_len, a_idt, cov = data
+                    for vv, ww in atig_path_edges:
+                        rid, s, t, aln_score, idt, e_seq = edge_data[ (vv, ww) ]
+                        if sub_id != 0:
+                            print >> a_ctg_t_out, "%s-%03d-%02d %s %s %s %d %d %d %0.2f" % (ctg_id, a_id, sub_id, vv, ww, rid, s, t, aln_score, idt)
+                        else:
+                            print >> a_ctg_base_t_out, "%s-%03d-%02d %s %s %s %d %d %d %0.2f" % (ctg_id, a_id, sub_id, vv, ww, rid, s, t, aln_score, idt)
+
+                    if sub_id != 0:
+                        print >> a_ctg_out, ">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" % (ctg_id, a_id, sub_id, v0, w0, total_length, total_score, len(atig_path_edges), delta_len, a_idt, cov )
+                        print >> a_ctg_out, seq
+                    else:
+                        print >> a_ctg_base_out, ">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" % (ctg_id, a_id, sub_id, v0, w0, total_length, total_score, len(atig_path_edges), delta_len, a_idt, cov )
+                        print >> a_ctg_base_out, seq
+
+                    sub_id += 1
+
+                a_id += 1
+
+    a_ctg_out.close()
+    a_ctg_base_out.close()
+    p_ctg_out.close()
+    a_ctg_t_out.close()
+    a_ctg_base_t_out.close()
+    p_ctg_t_out.close()
diff --git a/src/py/mains/graph_to_utgs.py b/src/py/mains/graph_to_utgs.py
new file mode 100644
index 0000000..d14e58f
--- /dev/null
+++ b/src/py/mains/graph_to_utgs.py
@@ -0,0 +1,160 @@
+from falcon_kit import kup, falcon, DWA
+from falcon_kit.fc_asm_graph import AsmGraph
+import networkx as nx
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+def rc(seq):
+    return "".join([RCMAP[c] for c in seq[::-1]])
+
+def get_aln_data(t_seq, q_seq):
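+    # align q_seq against t_seq: build a k-mer (K=8) lookup for the target,
+    # find the best matching range with find_best_aln_range, then run the
+    # banded alignment (DWA.align) over that range; returns the alignment
+    # summary records plus the matched k-mer query/target positions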
+    aln_data = []
+    K = 8
+    seq0 = t_seq
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+    q_id = "dummy"
+    
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 12)
+    aln_range = aln_range_ptr[0]
+    x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count)] )
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
+    
+    if e1 - s1 > 100:
+
+        alignment = DWA.align(q_seq[s1:e1], e1-s1,
+                              seq0[s2:e2], e2-s2,
+                              1500,1)
+
+        if alignment[0].aln_str_size > 100:
+            aln_data.append( ( q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist ) )
+            aln_str1 = alignment[0].q_aln_str
+            aln_str0 = alignment[0].t_aln_str
+
+        DWA.free_alignment(alignment)
+
+    kup.free_kmer_lookup(lk_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_seq_addr_array(sda_ptr)
+    return aln_data, x, y
+
+def main(*argv):
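+  # load the assembly graph and the pread sequences, then write unitig
+  # sequences to utgs.fa: simple unitigs are emitted directly; compound
+  # unitigs are decomposed into edge-disjoint paths by repeated shortest-path
+  # searches, and alternate paths are written only when they differ enough
+  # from the base path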
+  G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
+  G_asm.load_sg_seq("preads4falcon.fasta")
+
+  utg_out = open("utgs.fa","w")
+
+
+  for utg in G_asm.utg_data:
+    s,t,v  = utg
+    type_, length, score, path_or_edges = G_asm.utg_data[ (s,t,v) ]
+    if type_ == "simple":
+        path_or_edges = path_or_edges.split("~")
+        seq = G_asm.get_seq_from_path( path_or_edges )
+        print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score ) 
+        print >> utg_out, seq
+
+    if type_ == "compound":
+
+        c_graph = nx.DiGraph()
+
+        all_alt_path = []
+        path_or_edges = [ c.split("~") for c in path_or_edges.split("|")]
+        for ss, vv, tt in path_or_edges:
+            type_, length, score, sub_path = G_asm.utg_data[ (ss,tt,vv) ]
+             
+            sub_path = sub_path.split("~")
+            v1 = sub_path[0]
+            for v2 in sub_path[1:]:
+                c_graph.add_edge( v1, v2, e_score = G_asm.sg_edges[ (v1, v2) ][1]  )
+                v1 = v2
+        
+        shortest_path = nx.shortest_path( c_graph, s, t, "e_score" )
+        score = nx.shortest_path_length( c_graph, s, t, "e_score" )
+        all_alt_path.append( (score, shortest_path) )
+        
+
+        #a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
+        while 1:
+            if s == t:
+                break
+            n0 = shortest_path[0]
+            for n1 in shortest_path[1:]:
+                c_graph.remove_edge(n0, n1)
+                n0 = n1
+            try:
+                shortest_path = nx.shortest_path( c_graph, s, t, "e_score" )
+                score = nx.shortest_path_length( c_graph, s, t, "e_score" )
+                #a_ctg_data.append( (s, t, shortest_path) )
+                all_alt_path.append( (score, shortest_path) )
+
+            except nx.exception.NetworkXNoPath:
+                break
+            #if len(shortest_path) < 2:
+            #    break
+
+        all_alt_path.sort()
+        all_alt_path.reverse()
+        shortest_path = all_alt_path[0][1]
+
+        
+        score, atig_path = all_alt_path[0]
+
+        atig_output = []
+
+        atig_path_edges = zip(atig_path[:-1], atig_path[1:])
+        sub_seqs = []
+        total_length = 0
+        total_score = 0
+        for vv, ww in atig_path_edges:
+            r, aln_score, idt, typs_  = G_asm.sg_edges[ (vv, ww) ]
+            e_seq  = G_asm.sg_edge_seqs[ (vv, ww) ]
+            rid, ss, tt = r
+            sub_seqs.append( e_seq )
+            total_length += abs(ss-tt)
+            total_score += aln_score
+
+        base_seq = "".join(sub_seqs)
+        atig_output.append( (s, t, atig_path, total_length, total_score, base_seq, atig_path_edges, 1, 1) )
+
+
+        duplicated = True
+        for score, atig_path in all_alt_path[1:]:
+            atig_path_edges = zip(atig_path[:-1], atig_path[1:])
+            sub_seqs = []
+            total_length = 0
+            total_score = 0
+            for vv, ww in atig_path_edges:
+                r, aln_score, idt, type_ = G_asm.sg_edges[ (vv, ww) ]
+                e_seq  = G_asm.sg_edge_seqs[ (vv, ww) ]
+                rid, ss, tt = r
+                sub_seqs.append( e_seq )
+                total_length += abs(ss-tt)
+                total_score += aln_score
+
+            seq = "".join(sub_seqs)
+
+            aln_data, x, y = get_aln_data(base_seq, seq)
+            if len( aln_data ) != 0:
+                idt =  1.0-1.0*aln_data[-1][-1] / aln_data[-1][-2]
+                cov = 1.0*(aln_data[-1][3]-aln_data[-1][2])/aln_data[-1][4]
+                if idt < 0.96 or cov < 0.98:
+                    duplicated = False
+                    atig_output.append( (s, t, atig_path, total_length, total_score, seq, atig_path_edges, idt, cov) )
+            else:
+                duplicated = False
+                atig_output.append( (s, t, atig_path, total_length, total_score, seq, atig_path_edges, 0, 0) )
+
+        #if len(atig_output) == 1:
+        #    continue
+
+        sub_id = 0
+        for data in atig_output:
+            v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, a_idt, cov = data
+            print >> utg_out, ">%s~%s~%s-%d %d %d" % (v0, "NA", w0, sub_id,  total_length, total_score ) 
+            print >> utg_out, seq
+            sub_id += 1
diff --git a/src/py/mains/ovlp_filter.py b/src/py/mains/ovlp_filter.py
new file mode 100644
index 0000000..f432023
--- /dev/null
+++ b/src/py/mains/ovlp_filter.py
@@ -0,0 +1,275 @@
+from falcon_kit.multiproc import Pool
+import falcon_kit.util.io as io
+import argparse
+
+Reader = io.CapturedProcessReaderContext
+
+
+def run_filter_stage1(db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len):
+    cmd = "LA4Falcon -mo %s %s" % (db_fn, fn)
+    reader = Reader(cmd)
+    with reader:
+        return fn, filter_stage1(reader.readlines, max_diff, max_ovlp, min_ovlp, min_len)
+def filter_stage1(readlines, max_diff, max_ovlp, min_ovlp, min_len):
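+        # stage 1: stream the overlap records and count, per read, the overlaps
+        # that reach its 5' end (q_s == 0) and its 3' end (q_e == q_l); return
+        # the ids of reads whose end counts are unbalanced (> max_diff),
+        # too high (> max_ovlp) or too low (< min_ovlp)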
+        ignore_rtn = []
+        current_q_id = None
+        contained = False
+        ave_idt = 0.0
+        all_over_len = 0.0
+        overlap_data = {"5p":0, "3p":0}
+        q_id = None
+        for l in readlines():
+            l = l.strip().split()
+            q_id, t_id = l[:2]
+
+            if q_id != None and q_id != current_q_id:
+
+                left_count = overlap_data["5p"]
+                right_count = overlap_data["3p"]
+            
+                if abs(left_count - right_count) > max_diff:
+                    ignore_rtn.append( current_q_id )
+                elif left_count > max_ovlp or right_count > max_ovlp:
+                    ignore_rtn.append( current_q_id )
+                elif left_count < min_ovlp or right_count < min_ovlp: 
+                    ignore_rtn.append( current_q_id )
+
+                overlap_data = {"5p":0, "3p":0}
+                current_q_id = q_id
+                contained = False
+                ave_idt = 0.0
+                all_over_len = 0.0
+
+            overlap_len = -int(l[2])
+            idt = float(l[3])
+            q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+            t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+            if idt < 90:
+                continue
+
+            if q_l < min_len or t_l < min_len:
+                continue
+
+            if not contained:
+                if l[-1] in ("contains", "overlap"):
+                    ave_idt += idt * overlap_len
+                    all_over_len += overlap_len
+                if q_s == 0:
+                    overlap_data["5p"] += 1
+                elif q_e == q_l:
+                    overlap_data["3p"] += 1
+
+        if q_id !=  None:
+            left_count = overlap_data["5p"]
+            right_count = overlap_data["3p"]
+            if abs(left_count - right_count) > max_diff:
+                ignore_rtn.append( current_q_id )
+            elif left_count > max_ovlp or right_count > max_ovlp:
+                ignore_rtn.append( current_q_id )
+            elif left_count < min_ovlp or right_count < min_ovlp: 
+                ignore_rtn.append( current_q_id )
+            
+        return ignore_rtn
+
+def run_filter_stage2(db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set):
+    cmd = "LA4Falcon -mo %s %s" % (db_fn, fn)
+    reader = Reader(cmd)
+    with reader:
+        return fn, filter_stage2(reader.readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set)
+def filter_stage2(readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set):
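+        # stage 2: collect the ids of reads flagged "contained"/"contains" in
+        # the overlap records, skipping reads already in the ignore set and
+        # records below the identity/length thresholds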
+        contained_id = set()
+        for l in readlines():
+            l = l.strip().split()
+            q_id, t_id = l[:2]
+
+            q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+            t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+            idt = float(l[3])
+            if idt < 90:
+                continue
+
+            if q_l < min_len or t_l < min_len:
+                continue
+
+            if q_id in ignore_set:
+                continue
+            if t_id in ignore_set:
+                continue
+            if l[-1] == "contained":
+                contained_id.add(q_id)
+            if l[-1] == "contains":
+                contained_id.add(t_id)
+        return contained_id 
+
+def run_filter_stage3(db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn):
+    cmd = "LA4Falcon -mo %s %s" % (db_fn, fn)
+    reader = Reader(cmd)
+    with reader:
+        return fn, filter_stage3(reader.readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn)
+def filter_stage3(readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn):
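+        # stage 3: for reads not in the ignore or contained sets, gather the
+        # overlaps touching each end, sort them by overlap length (longest
+        # first), and keep roughly the best bestn per end -- more are kept
+        # while the unaligned portion of the target read stays within 1000 bp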
+        ovlp_output = []
+        overlap_data = {"5p":[], "3p":[]}
+        current_q_id = None
+        for l in readlines():
+            l = l.strip().split()
+            q_id, t_id = l[:2]
+
+
+            if current_q_id == None:
+                current_q_id = q_id
+                overlap_data = {"5p":[], "3p":[]}
+
+            elif q_id != current_q_id:
+
+                left = overlap_data["5p"]
+                right = overlap_data["3p"]
+                left.sort()
+                right.sort()
+
+                for i in xrange(len(left)):
+                    score, m_range, ovlp = left[i]
+                    ovlp_output.append(ovlp)
+                    #print " ".join(ovlp), read_end_data[current_q_id] 
+                    if i >= bestn and m_range > 1000:
+                        break
+                
+                for i in xrange(len(right)):
+                    score, m_range, ovlp = right[i]
+                    ovlp_output.append(ovlp)
+                    #print " ".join(ovlp), read_end_data[current_q_id]
+                    if i >= bestn and m_range > 1000:
+                        break
+
+                overlap_data = {"5p":[], "3p":[]}
+                current_q_id = q_id
+
+            if q_id in contained_set:
+                continue
+            if t_id in contained_set:
+                continue
+            if q_id in ignore_set:
+                continue
+            if t_id in ignore_set:
+                continue
+
+            overlap_len = -int(l[2])
+            idt = float(l[3])
+            q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+            t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+            if idt < 90:
+                continue
+            if q_l < min_len or t_l < min_len:
+                continue
+
+            if q_s == 0:
+                overlap_data["5p"].append( (-overlap_len,  t_l - (t_e - t_s),  l) )
+            elif q_e == q_l:
+                overlap_data["3p"].append( (-overlap_len, t_l - (t_e - t_s), l) )
+
+        left = overlap_data["5p"]
+        right = overlap_data["3p"]
+        left.sort()
+        right.sort()
+
+
+        for i in xrange(len(left)):
+            score, m_range, ovlp = left[i]
+            ovlp_output.append(ovlp)
+            #print " ".join(ovlp), read_end_data[current_q_id] 
+            if i >= bestn and m_range > 1000:
+                break
+
+        for i in xrange(len(right)):
+            score, m_range, ovlp = right[i]
+            ovlp_output.append(ovlp)
+            #print " ".join(ovlp), read_end_data[current_q_id]
+            if i >= bestn and m_range > 1000:
+                break
+
+        return ovlp_output
+
+def run_ovlp_filter(exe_pool, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
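+    # drive the three filtering stages over the process pool, one LA4Falcon
+    # job per .las file: stage 1 builds the ignore set, stage 2 the contained
+    # set, and stage 3 prints the surviving overlap records to stdout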
+    io.LOG('preparing filter_stage1')
+    io.logstats()
+    inputs = []
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append( (run_filter_stage1, db_fn, fn, max_diff, max_cov, min_cov, min_len) )
+    
+    ignore_all = []
+    for res in exe_pool.imap(io.run_func, inputs):  
+        ignore_all.extend( res[1] )
+
+    io.LOG('preparing filter_stage2')
+    io.logstats()
+    inputs = []
+    ignore_all = set(ignore_all)
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append( (run_filter_stage2, db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all) )
+    contained = set()
+    for res in exe_pool.imap(io.run_func, inputs):  
+        contained.update(res[1])
+        #print res[0], len(res[1]), len(contained)
+
+    #print "all", len(contained)
+    io.LOG('preparing filter_stage3')
+    io.logstats()
+    inputs = []
+    ignore_all = set(ignore_all)
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append( (run_filter_stage3, db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all, contained, bestn) )
+    for res in exe_pool.imap(io.run_func, inputs):  
+        for l in res[1]:
+            print " ".join(l)
+    io.logstats()
+
+def try_run_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
+    io.LOG('starting ovlp_filter')
+    file_list = io.validated_fns(fofn)
+    io.LOG('fofn %r: %r' %(fofn, file_list))
+    n_core = min(n_core, len(file_list))
+    exe_pool = Pool(n_core)
+    try:
+        run_ovlp_filter(exe_pool, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn)
+        io.LOG('finished ovlp_filter')
+    except KeyboardInterrupt:
+        io.LOG('terminating ovlp_filter workers...')
+        exe_pool.terminate()
+
+def ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn, debug, silent, stream):
+    if debug:
+        n_core = 0
+        silent = False
+    if silent:
+        io.LOG = io.write_nothing
+    if stream:
+        global Reader
+        Reader = io.StreamedProcessReaderContext
+    try_run_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn)
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='a simple multi-process LAS overlap data filter')
+    parser.add_argument('--n_core', type=int, default=4,
+                        help='number of processes used for filtering; '
+                        '0 for main process only (default=%(default)s)')
+    parser.add_argument('--fofn', type=str, help='file containing the paths of all LAS files to be processed in parallel')
+    parser.add_argument('--db', type=str, dest='db_fn', help='read db file path')
+    parser.add_argument('--max_diff', type=int, help="max difference between 5' and 3' coverage")
+    parser.add_argument('--max_cov', type=int, help="max coverage allowed at the 5' or 3' end")
+    parser.add_argument('--min_cov', type=int, help="min coverage required at the 5' or 3' end")
+    parser.add_argument('--min_len', type=int, default=2500, help="min length of the reads (default=%(default)s)")
+    parser.add_argument('--bestn', type=int, default=10, help="output at least best n overlaps on 5' or 3' ends if possible (default=%(default)s)")
+    parser.add_argument('--stream', action='store_true', help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
+    parser.add_argument('--debug', '-g', action='store_true', help="single-threaded, plus other aids to debugging")
+    parser.add_argument('--silent', action='store_true', help="suppress cmd reporting on stderr")
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(*argv):
+    args = parse_args(argv)
+    ovlp_filter(**vars(args))
diff --git a/src/py/mains/ovlp_stats.py b/src/py/mains/ovlp_stats.py
new file mode 100644
index 0000000..0443c93
--- /dev/null
+++ b/src/py/mains/ovlp_stats.py
@@ -0,0 +1,117 @@
+from falcon_kit.multiproc import Pool
+import falcon_kit.util.io as io
+import argparse
+import subprocess as sp
+import shlex
+
+Reader = io.CapturedProcessReaderContext
+
+
+def filter_stats(readlines, min_len):
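+        # count, for every read, the overlaps that reach its 5' end (q_s == 0)
+        # and its 3' end (q_e == q_l), ignoring short reads and records with
+        # identity < 90; returns (read id, read length, 5' count, 3' count) tuples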
+        current_q_id = None
+        contained = False
+        ave_idt = 0.0
+        all_over_len = 0.0
+        overlap_data = {"5p":0, "3p":0}
+        q_id = None
+        rtn_data = []
+        q_l = 0
+        for l in readlines():
+            l = l.strip().split()
+            q_id, t_id = l[:2]
+
+            if q_id != current_q_id:
+                left_count = overlap_data["5p"]
+                right_count = overlap_data["3p"]
+                if (current_q_id != None and
+                        (left_count > 0 or right_count > 0)):
+                    rtn_data.append( (current_q_id, q_l, left_count, right_count  ) )
+                overlap_data = {"5p":0, "3p":0}
+                current_q_id = q_id
+                contained = False
+                ave_idt = 0.0
+                all_over_len = 0.0
+
+            overlap_len = -int(l[2])
+            idt = float(l[3])
+            q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+            t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+            if q_l < min_len or t_l < min_len:
+                continue
+
+            if idt < 90:
+                continue
+
+            if not contained:
+                if l[-1] in ("contains", "overlap"):
+                    ave_idt += idt * overlap_len
+                    all_over_len += overlap_len
+                if q_s == 0:
+                    overlap_data["5p"] += 1
+                elif q_e == q_l:
+                    overlap_data["3p"] += 1
+
+        if q_id != None:
+            left_count = overlap_data["5p"]
+            right_count = overlap_data["3p"]
+            if (left_count > 0 or right_count > 0):
+                rtn_data.append( (q_id, q_l, left_count, right_count  ) )
+
+        return rtn_data
+
+
+def run_filter_stats(fn, min_len):
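+    # note: the pread database path is hard-coded relative to the working
+    # directory, so this assumes the current working directory is a sibling
+    # of 1-preads_ovl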
+    cmd = "LA4Falcon -mo ../1-preads_ovl/preads.db %s" % fn
+    reader = Reader(cmd)
+    with reader:
+        return fn, filter_stats(reader.readlines, min_len)
+
+def run_ovlp_stats(exe_pool, file_list, min_len):
+    inputs = []
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append( (run_filter_stats, fn, min_len ) )
+    for res in exe_pool.imap(io.run_func, inputs):
+        for l in res[1]:
+            print " ".join([str(c) for c in l])
+
+def try_run_ovlp_stats(n_core, fofn, min_len):
+    io.LOG('starting ovlp_stats')
+    file_list = io.validated_fns(fofn)
+    io.LOG('fofn %r: %r' %(fofn, file_list))
+    n_core = min(n_core, len(file_list))
+    exe_pool = Pool(n_core)
+    try:
+        run_ovlp_stats(exe_pool, file_list, min_len)
+        io.LOG('finished ovlp_stats')
+    except KeyboardInterrupt:
+        io.LOG('terminating ovlp_stats workers...')
+        exe_pool.terminate()
+
+def ovlp_stats(fofn, min_len, n_core, stream, debug, silent):
+    if debug:
+        n_core = 0
+        silent = False
+    if silent:
+        io.LOG = io.write_nothing
+    if stream:
+        global Reader
+        Reader = io.StreamedProcessReaderContext
+    try_run_ovlp_stats(n_core, fofn, min_len)
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='a simple multi-process LAS overlap statistics tool')
+    parser.add_argument('--n_core', type=int, default=4,
+                        help='number of processes used for generating the statistics; '
+                        '0 for main process only (default=%(default)s)')
+    parser.add_argument('--fofn', type=str, help='file containing the paths of all LAS files to be processed in parallel')
+    parser.add_argument('--min_len', type=int, default=2500, help="min length of the reads (default=%(default)s)")
+    parser.add_argument('--stream', action='store_true', help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
+    parser.add_argument('--debug', '-g', action='store_true', help="single-threaded, plus other aids to debugging")
+    parser.add_argument('--silent', action='store_true', help="suppress cmd reporting on stderr")
+    return parser.parse_args(argv[1:])
+
+def main(*argv):
+    args = parse_args(argv)
+    ovlp_stats(**vars(args))
diff --git a/src/py/mains/ovlp_to_graph.py b/src/py/mains/ovlp_to_graph.py
new file mode 100644
index 0000000..050e615
--- /dev/null
+++ b/src/py/mains/ovlp_to_graph.py
@@ -0,0 +1,1441 @@
+#from pbcore.io import FastaReader
+import networkx as nx
+import os
+import shlex
+import sys
+import subprocess
+
+DEBUG_LOG_LEVEL = 0
+
+class SGNode(object):
+    """
+    class representing a node in the string graph
+    """
+    def __init__(self, node_name):
+        self.name = node_name
+        self.out_edges = []
+        self.in_edges = []
+    def add_out_edge(self, out_edge):
+        self.out_edges.append(out_edge)
+    def add_in_edge(self, in_edge):
+        self.in_edges.append(in_edge)
+
+class SGEdge(object):
+    """
+    class representing an edge in the string graph
+    """
+    def __init__(self, in_node, out_node):
+        self.in_node = in_node
+        self.out_node = out_node
+        self.attr = {}
+    def set_attribute(self, attr, value):
+        self.attr[attr] = value
+
+def reverse_end( node_id ):
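+    # node names are "<read id>:B" or "<read id>:E"; flipping the end is used
+    # to build the reverse-complement counterpart of an edge or path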
+    node_id, end = node_id.split(":")
+    new_end = "B" if end == "E" else "E"
+    return node_id + ":" + new_end
+
+class StringGraph(object):
+    """
+    class representing the string graph
+    """
+    def __init__(self):
+        self.nodes = {}
+        self.edges = {}
+        self.n_mark = {}
+        self.e_reduce = {}
+        self.repeat_overlap = {}
+        self.best_out = {}
+        self.best_in = {}
+        
+    def add_node(self, node_name):
+        """ 
+        add a node into the graph by given a node name
+        """
+        if node_name not in self.nodes:
+            self.nodes[node_name] = SGNode(node_name)
+    
+    def add_edge(self, in_node_name, out_node_name, **attributes):
+        """ 
+        add an edge into the graph by given a pair of nodes
+        """
+        if (in_node_name, out_node_name) not in self.edges:
+        
+            self.add_node(in_node_name)
+            self.add_node(out_node_name)
+            in_node = self.nodes[in_node_name]
+            out_node = self.nodes[out_node_name]    
+            
+            edge = SGEdge(in_node, out_node)
+            self.edges[ (in_node_name, out_node_name) ] = edge
+            in_node.add_out_edge(edge)
+            out_node.add_in_edge(edge)
+        edge =  self.edges[ (in_node_name, out_node_name) ]
+        for k, v in attributes.items():
+            edge.attr[k] = v
+
+    def init_reduce_dict(self):
+        for e in self.edges:
+            self.e_reduce[e] = False
+
+    def mark_chimer_edges(self):
+
+        for n_name in self.nodes:
+            n = self.nodes[n_name]
+            
+            out_nodes = set( [ e.out_node for e in n.out_edges ] )
+            in_nodes = [e.in_node for e in n.in_edges ] 
+            is_chimer = True
+            for in_node in in_nodes:
+                for v in [e.out_node for e in in_node.out_edges]:
+                    if v in out_nodes:
+                        is_chimer = False
+                        break
+
+            if is_chimer == True:
+                for e in n.out_edges:
+                    v, w =  e.in_node.name, e.out_node.name
+                    self.e_reduce[ (v, w) ] = True
+                for e in n.in_edges:
+                    v, w =  e.in_node.name, e.out_node.name
+                    self.e_reduce[ (v, w) ] = True
+
+
+            # TODO: remove the node from the graph rather than just marking its edges as "reduced"?
+
+
+    def mark_spur_edge(self):
+
+        removed_edges = set()
+        for  v in self.nodes:
+            if len(self.nodes[v].out_edges) > 1:
+                for out_edge in self.nodes[v].out_edges:
+                    w = out_edge.out_node.name
+                    
+                    if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
+                        self.e_reduce[(v, w)] = True
+                        removed_edges.add( (v, w) )
+                        v2, w2 = reverse_end(w), reverse_end(v)
+                        self.e_reduce[(v2, w2)] = True
+                        removed_edges.add( (v2, w2) )
+
+            if len(self.nodes[v].in_edges) > 1:
+                for in_edge in self.nodes[v].in_edges:
+                    w = in_edge.in_node.name
+                    if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
+                        self.e_reduce[(w, v)] = True
+                        removed_edges.add( (w, v) )
+                        v2, w2 = reverse_end(w), reverse_end(v)
+                        self.e_reduce[(w2, v2)] = True
+                        removed_edges.add( (w2, v2) )
+        return removed_edges
+
+    def mark_tr_edges(self):
+        """
+        transitive reduction
+        """
+        n_mark = self.n_mark
+        e_reduce = self.e_reduce
+        FUZZ = 500
+        for n in self.nodes:
+            n_mark[n] = "vacant"
+    
+        for n_name, node in self.nodes.items():
+
+            out_edges = node.out_edges
+            if len(out_edges) == 0:
+                continue
+            
+            out_edges.sort(key=lambda x: x.attr["length"])
+            
+            for e in out_edges:
+                w = e.out_node
+                n_mark[ w.name ] = "inplay"
+            
+            max_len = out_edges[-1].attr["length"]
+                
+            max_len += FUZZ
+            
+            for e in out_edges:
+                e_len = e.attr["length"]
+                w = e.out_node
+                if n_mark[w.name] == "inplay":
+                    w.out_edges.sort( key=lambda x: x.attr["length"] )
+                    for e2 in w.out_edges:
+                        if e2.attr["length"] + e_len < max_len:
+                            x = e2.out_node
+                            if n_mark[x.name] == "inplay":
+                                n_mark[x.name] = "eliminated"
+            
+            for e in out_edges:
+                e_len = e.attr["length"]
+                w = e.out_node
+                w.out_edges.sort( key=lambda x: x.attr["length"] )
+                if len(w.out_edges) > 0:
+                    x = w.out_edges[0].out_node
+                    if n_mark[x.name] == "inplay":
+                        n_mark[x.name] = "eliminated"
+                for e2 in w.out_edges:
+                    if e2.attr["length"] < FUZZ:
+                        x = e2.out_node
+                        if n_mark[x.name] == "inplay":
+                            n_mark[x.name] = "eliminated"
+                            
+            for out_edge in out_edges:
+                v = out_edge.in_node
+                w = out_edge.out_node
+                if n_mark[w.name] == "eliminated":
+                    e_reduce[ (v.name, w.name) ] = True
+                    v_name, w_name = reverse_end(w.name), reverse_end(v.name)
+                    e_reduce[(v_name, w_name)] = True
+                n_mark[w.name] = "vacant"
+                
+
+    def mark_best_overlap(self):
+        """
+        find the best overlapped edges
+        """
+
+        best_edges = set()
+        removed_edges = set()
+
+        for v in self.nodes:
+
+            out_edges = self.nodes[v].out_edges
+            if len(out_edges) > 0:
+                out_edges.sort(key=lambda e: -e.attr["score"])
+                for e in out_edges:
+                    if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
+                        best_edges.add( (e.in_node.name, e.out_node.name) )
+                        self.best_out[v] = e.out_node.name
+                        break
+
+            in_edges = self.nodes[v].in_edges
+            if len(in_edges) > 0:
+                in_edges.sort(key=lambda e: -e.attr["score"])
+                for e in in_edges:
+                    if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
+                        best_edges.add( (e.in_node.name, e.out_node.name) )
+                        self.best_in[v] = e.in_node.name
+                        break
+
+        if DEBUG_LOG_LEVEL > 1:
+            print "X", len(best_edges)
+
+        for e_n, e in self.edges.items():
+            v = e_n[0]
+            w = e_n[1]
+            if self.e_reduce[ (v, w) ] != True:
+                if (v, w) not in best_edges:
+                    self.e_reduce[(v, w)] = True
+                    removed_edges.add( (v, w) )
+                    v2, w2 = reverse_end(w), reverse_end(v)
+                    self.e_reduce[(v2, w2)] = True
+                    removed_edges.add( (v2, w2) )
+                
+        return removed_edges
+
+    def resolve_repeat_edges(self):
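+        # for each node with exactly one active in-edge and one active out-edge,
+        # look at the other edges leaving its in-neighbor (and entering its
+        # out-neighbor): such an edge is reduced when its far end has other
+        # active connections, is not itself a simple node, and shares no
+        # neighbors with this node -- i.e. it likely jumps across a repeat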
+
+
+        edges_to_reduce = []
+        nodes_to_test = set()
+        for v_n, v in self.nodes.items():
+            
+            out_nodes = []
+            for e in v.out_edges:
+                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+                    out_nodes.append( e.out_node.name )
+
+            in_nodes = []
+            for e in v.in_edges:
+                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+                    in_nodes.append( e.in_node.name )
+
+            if len(out_nodes) == 1 and len(in_nodes)  == 1:
+                nodes_to_test.add(v_n)
+        
+        for v_n in list( nodes_to_test ):
+            
+            v = self.nodes[v_n]
+
+            out_nodes = []
+            for e in v.out_edges:
+                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+                    out_nodes.append( e.out_node.name )
+
+            in_nodes = []
+            for e in v.in_edges:
+                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+                    in_nodes.append( e.in_node.name )
+
+            in_node_name = in_nodes[0] 
+
+            for out_edge in self.nodes[in_node_name].out_edges:
+                vv = out_edge.in_node.name
+                ww = out_edge.out_node.name
+
+                ww_out = self.nodes[ww].out_edges
+                v_out = self.nodes[v_n].out_edges
+                ww_out_nodes = set( [ n.out_node.name for n in ww_out] )
+                v_out_nodes = set(  [ n.out_node.name for n in v_out] )
+                o_overlap = len( ww_out_nodes & v_out_nodes )
+
+                ww_in_count = 0
+                for e in self.nodes[ww].in_edges:
+                    if self.e_reduce[ ( e.in_node.name, e.out_node.name ) ] == False:
+                        ww_in_count += 1
+
+                if ww != v_n and\
+                   self.e_reduce[ (vv, ww) ] == False and\
+                   ww_in_count > 1 and\
+                   ww not in nodes_to_test and\
+                   o_overlap == 0:
+                    edges_to_reduce.append( (vv, ww) )
+
+            out_node_name = out_nodes[0]
+
+            for in_edge in self.nodes[out_node_name].in_edges:
+                vv = in_edge.in_node.name
+                ww = in_edge.out_node.name
+
+                vv_in = self.nodes[vv].in_edges
+                v_in = self.nodes[v_n].in_edges
+                vv_in_nodes = set( [ n.in_node.name for n in vv_in] )
+                v_in_nodes = set(  [ n.in_node.name for n in v_in] )
+                i_overlap = len( vv_in_nodes & v_in_nodes )
+
+                vv_out_count = 0
+                for e in self.nodes[vv].out_edges:
+                    if self.e_reduce[ ( e.in_node.name, e.out_node.name )] == False:
+                        vv_out_count += 1
+
+                if vv != v_n and\
+                   self.e_reduce[ (vv, ww) ] == False and\
+                   vv_out_count > 1 and\
+                   vv not in nodes_to_test and\
+                   i_overlap == 0:
+                    edges_to_reduce.append( (vv, ww) )
+
+        removed_edges = set()
+        for e in edges_to_reduce:
+            self.e_reduce[e] = True
+            removed_edges.add(e)
+
+        return removed_edges
+
+    def get_out_edges_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].out_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        return rtn
+        
+        
+    def get_in_edges_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].in_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        return rtn
+
+    def get_best_out_edge_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].out_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        rtn.sort(key=lambda e: e.attr["score"])
+
+        return rtn[-1]
+
+    def get_best_in_edge_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].in_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        rtn.sort(key=lambda e: e.attr["score"])
+        return rtn[-1]
+        
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+def generate_seq_from_path(sg, seqs, path):
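+    # concatenate the edge label sequences along a path of string-graph nodes,
+    # reverse-complementing any segment whose label coordinates run backwards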
+    subseqs = []
+    r_id, end = path[0].split(":")
+    
+    count = 0
+    for i in range( len( path ) -1 ):
+        w_n, v_n = path[i:i+2]
+        edge = sg.edges[ (w_n, v_n ) ]
+        read_id, coor = edge.attr["label"].split(":")
+        b,e = coor.split("-")
+        b = int(b)
+        e = int(e)
+        if b < e:
+            subseqs.append( seqs[read_id][b:e] )
+        else:
+            subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
+
+    return "".join(subseqs)
+
+
+def reverse_edge( e ):
+    e1, e2 = e
+    return reverse_end(e2), reverse_end(e1)
+
+def reverse_path( p ):
+    p = p[::-1]
+    return [reverse_end(n) for n in p]
+
+    
+def find_bundle(ug, u_edge_data, start_node, depth_cutoff, width_cutoff, length_cutoff):
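+    # grow a "bundle" of unitig edges outward from start_node inside a local
+    # ego graph, tracking per-node path length and score; the search succeeds
+    # when the tip set converges to a single end node and fails when it gets
+    # too wide, too deep, too long, or hits a loop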
+
+    tips = set()
+    bundle_edges = set()
+    bundle_nodes = set()
+
+    local_graph = nx.ego_graph(ug, start_node, depth_cutoff, undirected=False)
+    length_to_node = {start_node:0}
+    score_to_node = {start_node:0}
+
+    v = start_node
+    end_node = start_node
+
+    if DEBUG_LOG_LEVEL > 1: 
+        print
+        print 
+        print "start", start_node
+
+    bundle_nodes.add(v)
+    for vv, ww, kk in local_graph.out_edges(v, keys = True):
+        max_score = 0
+        max_length = 0
+
+        if (vv, ww, kk) not in bundle_edges and\
+                reverse_end(ww) not in bundle_nodes:
+
+            bundle_edges.add( (vv, ww, kk) )
+            tips.add(ww)
+
+    for v in list(tips):
+        bundle_nodes.add(v)
+
+    depth = 1
+    width = 1.0
+    converage = False
+
+
+    while 1:
+        if DEBUG_LOG_LEVEL > 1:
+            print "# of tips", len(tips)
+
+        if len(tips) > 4:
+            converage = False
+            break
+
+        if len(tips) == 1:
+            end_node = tips.pop()
+
+            if DEBUG_LOG_LEVEL > 1:
+                print "end", end_node
+
+            if end_node not in length_to_node:
+                v = end_node
+                max_score_edge = None
+                max_score = 0
+                for uu, vv, kk in local_graph.in_edges(v, keys=True):
+                    if uu not in length_to_node:
+                        continue
+
+                    score = u_edge_data[ (uu, vv, kk) ][1]
+
+                    if score > max_score:
+
+                        max_score = score
+                        max_score_edge = (uu, vv, kk)
+
+                length_to_node[v] = length_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][0]
+                score_to_node[v] = score_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][1]
+                
+
+            converage = True
+            break
+        
+
+        depth += 1
+        width = 1.0 * len(bundle_edges) / depth
+
+        if depth > 10 and width > width_cutoff:
+            converage = False
+            break
+
+        if depth > depth_cutoff:
+            converage = False
+            break
+        
+        tips_list = list(tips)
+
+        tip_updated = False
+        loop_detect = False
+        length_limit_reached = False
+
+        for v in tips_list:
+            if DEBUG_LOG_LEVEL > 1:
+                print "process", v
+
+            if len(local_graph.out_edges(v, keys=True)) == 0: # dead end route
+                print "no out edge", v
+                continue
+
+            max_score_edge = None
+            max_score = 0
+
+            extend_tip = True
+
+            for uu, vv, kk in local_graph.in_edges(v, keys=True):
+                if DEBUG_LOG_LEVEL > 1: 
+                    print "in_edges", uu, vv, kk
+                    print uu, "in length_to_node",  uu in length_to_node
+
+                if uu not in length_to_node:
+                    extend_tip = False
+                    break
+
+                score = u_edge_data[ (uu, vv, kk) ][1]
+
+                if score > max_score:
+
+                    max_score = score
+                    max_score_edge = (uu, vv, kk)
+            
+            if extend_tip:
+            
+                length_to_node[v] = length_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][0]
+                score_to_node[v] = score_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][1]
+
+                if length_to_node[v] > length_cutoff:
+                    length_limit_reached = True
+                    converage = False
+                    break
+
+                v_updated = False
+                for vv, ww, kk in local_graph.out_edges(v, keys=True):
+
+                    if DEBUG_LOG_LEVEL > 1:
+                        print "test", vv, ww, kk
+
+                    if ww in length_to_node:
+                        loop_detect = True
+                        if DEBUG_LOG_LEVEL > 1:
+                            print "loop_detect", ww
+                        break
+
+                    if (vv, ww, kk) not in bundle_edges and\
+                            reverse_end(ww) not in bundle_nodes:
+
+                        if DEBUG_LOG_LEVEL > 1:
+                            print "add", ww
+
+                        tips.add(ww)
+                        bundle_edges.add( (vv, ww, kk) )
+                        tip_updated = True
+                        v_updated = True
+
+                if v_updated:
+
+                    if DEBUG_LOG_LEVEL > 1:
+                        print "remove", v
+
+                    tips.remove(v)
+
+                    if len(tips) == 1:
+                        break
+
+            if loop_detect:
+                converage = False
+                break
+
+        if length_limit_reached:
+            converage = False
+            break
+
+        if loop_detect:
+            converage = False
+            break
+
+        if not tip_updated:
+            converage = False
+            break
+
+        for v in list(tips):
+            bundle_nodes.add(v)
+
+        
+
+    data = start_node, end_node, bundle_edges, length_to_node[end_node], score_to_node[end_node], depth
+    
+    data_r = None
+
+    if DEBUG_LOG_LEVEL > 1:
+        print converage, data, data_r
+    return converage, data, data_r
+
+def generate_string_graph(args):
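+    # read the overlap file, drop self, contained, low-identity and short-read
+    # overlaps, add a pair of string-graph edges for each remaining proper
+    # overlap, then mark transitively reducible edges, spur edges, and either
+    # repeat-crossing or non-best-overlap edges (depending on args.lfc),
+    # writing the annotated edge list to "sg_edges_list"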
+
+    overlap_file = args.overlap_file
+
+    contained_reads = set()
+    chimer_ids = set()
+
+    filter_reads = False
+    
+    seqs = set()
+
+    G=nx.Graph()
+    edges =set()
+    overlap_data = []
+    contained_reads = set()
+    overlap_count = {}
+
+
+    # loop through the overlap data, loading the records into a python list;
+    # contained reads are identified along the way
+
+    with open(overlap_file) as f:
+        for l in f:
+            l = l.strip().split()
+
+            # work around for some ill-formed data records
+            #if len(l) != 13:
+            #    continue
+            
+            f_id, g_id, score, identity = l[:4]
+
+            if f_id == g_id:  # don't need self-self overlapping
+                continue
+            
+            if filter_reads:
+
+                if g_id not in seqs: 
+                    continue
+
+                if f_id not in seqs:
+                    continue
+
+            score = int(score)
+            identity = float(identity)
+            contained = l[12]
+            if contained == "contained":
+                contained_reads.add(f_id)
+                continue
+            if contained == "contains":
+                contained_reads.add(g_id)
+                continue
+            if contained == "none":
+                continue
+
+            if identity < args.min_idt: # only keep records with identity >= min_idt as overlaps
+                continue
+            f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
+            g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
+
+            # only use reads longer than min_len for assembly
+            if f_len < args.min_len: continue
+            if g_len < args.min_len: continue
+            
+            """
+            # double check for proper overlap
+            # this is not necessary when using DALIGNER as the overlapper;
+            # it may be useful if other overlappers give fuzzier alignment boundaries
+            if f_start > 24 and f_len - f_end > 24:  # allow 24 base tolerance on both sides of the overlapping
+                continue
+            
+            if g_start > 24 and g_len - g_end > 24:
+                continue
+            
+            if g_strain == 0:
+                if f_start < 24 and g_len - g_end > 24:
+                    continue
+                if g_start < 24 and f_len - f_end > 24:
+                    continue
+            else:
+                if f_start < 24 and g_start > 24:
+                    continue
+                if g_start < 24 and f_start > 24:
+                    continue
+            """
+
+            overlap_data.append( (f_id, g_id, score, identity,
+                                  f_strain, f_start, f_end, f_len,
+                                  g_strain, g_start, g_end, g_len) )
+
+            overlap_count[f_id] = overlap_count.get(f_id,0)+1
+            overlap_count[g_id] = overlap_count.get(g_id,0)+1
+            
+    overlap_set = set()
+    sg = StringGraph()
+    for od in overlap_data:
+        f_id, g_id, score, identity = od[:4]
+        if f_id in contained_reads:
+            continue
+        if g_id in contained_reads:
+            continue
+        f_s, f_b, f_e, f_l = od[4:8]
+        g_s, g_b, g_e, g_l = od[8:12]
+        overlap_pair = [f_id, g_id]
+        overlap_pair.sort()
+        overlap_pair = tuple( overlap_pair )
+        if overlap_pair in overlap_set:  # don't allow duplicated records
+            continue
+        else:
+            overlap_set.add(overlap_pair)
+
+        
+        if g_s == 1: # reversed alignment; swap the begin and end coordinates
+            g_b, g_e = g_e, g_b
+        
+        # build the string graph edges for each overlap
+        if f_b > 1:
+            if g_b < g_e:
+                """
+                     f.B         f.E
+                  f  ----------->
+                  g         ------------->
+                            g.B           g.E
+                """
+                if f_b == 0 or g_e - g_l == 0:
+                    continue
+                sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0), 
+                                                           length = abs(f_b-0),
+                                                           score = -score, 
+                                                           identity = identity )
+                sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = (g_id, g_e, g_l), 
+                                                           length = abs(g_e-g_l),
+                                                           score = -score,
+                                                           identity = identity)
+            else:
+                """
+                     f.B         f.E
+                  f  ----------->
+                  g         <-------------
+                            g.E           g.B           
+                """
+                if f_b == 0 or g_e == 0:
+                    continue
+                sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0), 
+                                                           length = abs(f_b -0),
+                                                           score = -score,
+                                                           identity = identity)
+                sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = (g_id, g_e, 0), 
+                                                           length = abs(g_e- 0),
+                                                           score = -score,
+                                                           identity = identity)
+        else:
+            if g_b < g_e:
+                """
+                                    f.B         f.E
+                  f                 ----------->
+                  g         ------------->
+                            g.B           g.E
+                """
+                if g_b == 0 or f_e - f_l == 0:
+                    continue
+                sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = (g_id, g_b, 0), 
+                                                           length = abs(g_b - 0),
+                                                           score = -score,
+                                                           identity = identity)
+                sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l), 
+                                                           length = abs(f_e-f_l),
+                                                           score = -score,
+                                                           identity = identity)
+            else:
+                """
+                                    f.B         f.E
+                  f                 ----------->
+                  g         <-------------
+                            g.E           g.B           
+                """
+                if g_b - g_l == 0 or f_e - f_l ==0:
+                    continue
+                sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = (g_id, g_b, g_l), 
+                                                           length = abs(g_b - g_l),
+                                                           score = -score,
+                                                           identity = identity)
+                sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l), 
+                                                           length = abs(f_e - f_l),
+                                                           score = -score,
+                                                           identity = identity)
+
+
+    sg.init_reduce_dict()
+
+    #if not args.disable_chimer_prediction:
+    #    sg.mark_chimer_edges()
+    #sg.mark_spur_edge()
+    
+
+    sg.mark_tr_edges() # mark transitively reducible edges
+
+    if DEBUG_LOG_LEVEL > 1:
+        print sum( [1 for c in sg.e_reduce.values() if c == True] )
+        print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+
+    removed_edges = set()
+    if args.lfc == True:
+        removed_edges = sg.resolve_repeat_edges()  
+    else:
+        removed_edges = sg.mark_best_overlap() # mark those edges that are best overlap edges
+
+    spur_edges = sg.mark_spur_edge()
+
+    if DEBUG_LOG_LEVEL > 1:
+        print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+    out_f = open("sg_edges_list", "w")
+    nxsg = nx.DiGraph()
+    edge_data = {}
+    for v, w in sg.edges:
+        e = sg.edges[ (v, w) ]
+        rid, sp, tp = e.attr["label"]
+        score = e.attr["score"]
+        identity = e.attr["identity"]
+        length = abs(sp-tp)
+
+
+        if  sg.e_reduce[(v, w)] != True:
+            type_ = "G"
+            label = "%s:%d-%d" % (rid, sp, tp)
+            nxsg.add_edge(v, w, label = label, length = length, score = score)
+            edge_data[ (v, w) ] = (rid, sp, tp, length, score, identity, type_)
+            if w in sg.best_in:
+                nxsg.node[w]["best_in"] = v
+        elif (v, w) in removed_edges:
+            type_ = "R"
+        elif (v, w) in spur_edges:
+            type_ = "S"
+        elif sg.e_reduce[(v, w)] == True:
+            type_ = "TR"
+
+        print >>out_f, v, w, rid, sp, tp, score, identity, type_
+
+
+        
+    out_f.close()
+    nxsg_r = nxsg.reverse()    
+
+    return nxsg, nxsg_r, edge_data
+
+
+
+def construct_compound_paths(ug, u_edge_data):
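+    # classify unitig-graph nodes by degree, try to grow a bundle from every
+    # branching node, then keep only bundles that do not share edges with one
+    # another and whose reverse-complement bundle also survives; these become
+    # the "compound" unitigs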
+
+    source_nodes = set()
+    sink_nodes = set()
+    simple_nodes = set()
+    branch_nodes = set()
+
+    all_nodes = ug.nodes()
+    for n in all_nodes:
+        in_degree = len( ug.in_edges(n) )
+        out_degree = len( ug.out_edges(n) )
+        if in_degree == 0:
+            source_nodes.add(n)
+        if out_degree == 0:
+            sink_nodes.add(n)
+        if in_degree == 1 and out_degree == 1:
+            simple_nodes.add(n)
+        if in_degree > 1 or out_degree > 1:
+            branch_nodes.add(n)
+
+    #print "#", len(all_nodes),len(source_nodes), len(sink_nodes), len(simple_nodes), len(branch_nodes)
+    compound_paths_0 = []
+    for p in list(branch_nodes):
+        if ug.out_degree(p) > 1:
+            coverage, data, data_r =  find_bundle(ug, u_edge_data, p, 48, 16, 500000)
+            if coverage == True:
+                start_node, end_node, bundle_edges, length, score, depth = data
+                compound_paths_0.append(  (start_node, "NA", end_node, 1.0*len(bundle_edges)/depth, length, score, bundle_edges ) )
+
+    compound_paths_0.sort( key=lambda x: -len(x[6]) )
+
+
+    edge_to_cpath = {}
+    compound_paths_1 = {}
+    for s, v, t, width, length, score, bundle_edges in compound_paths_0:
+        if DEBUG_LOG_LEVEL > 1:
+            print "constructing utg, test ", s,v, t
+        
+        overlapped = False
+        for vv, ww, kk in list(bundle_edges):
+            if (vv, ww, kk) in edge_to_cpath:
+                if DEBUG_LOG_LEVEL > 1:
+                    print "remove overlapped utg", (s, v, t), (vv, ww, kk)
+                overlapped = True
+                break
+            rvv = reverse_end(vv)
+            rww = reverse_end(ww)
+            rkk = reverse_end(kk)
+            if (rww, rvv, rkk) in edge_to_cpath:
+                if DEBUG_LOG_LEVEL > 1:
+                    print "remove overlapped r utg", (s, v, t),  (rww, rvv, rkk)
+                overlapped = True
+                break
+            
+
+        if not overlapped:
+            if DEBUG_LOG_LEVEL > 1:
+                print "constructing", s,v, t
+
+            bundle_edges_r = []
+            rs = reverse_end(t)
+            rt = reverse_end(s)
+
+            for vv, ww, kk in list(bundle_edges):
+                edge_to_cpath.setdefault( (vv, ww, kk), set() )
+                edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
+                rvv = reverse_end(ww)
+                rww = reverse_end(vv)
+                rkk = reverse_end(kk)
+                edge_to_cpath.setdefault( (rvv, rww, rkk), set() )
+                edge_to_cpath[ (rvv, rww, rkk) ].add( (rs, rt, v) ) #assert v == "NA"
+                bundle_edges_r.append(  (rvv, rww, rkk) )
+            
+            compound_paths_1[ ( s, v, t) ] = width, length, score, bundle_edges
+            compound_paths_1[ ( rs, v, rt) ] = width, length, score, bundle_edges_r
+
+             
+    compound_paths_2 = {}
+    edge_to_cpath = {}
+    for s, v, t in compound_paths_1:
+        rs = reverse_end(t)
+        rt = reverse_end(s)
+        if (rs, "NA", rt) not in compound_paths_1:
+            if DEBUG_LOG_LEVEL > 1:
+                print "non_compliment bundle", s, v, t, len(compound_paths_1[( s, v, t)][-1])
+            continue
+        width, length, score, bundle_edges = compound_paths_1[ (s, v, t) ]
+        compound_paths_2[ (s, v, t) ] = width, length, score, bundle_edges
+        for vv, ww, kk in list(bundle_edges):
+            edge_to_cpath.setdefault( (vv, ww, kk), set() )
+            edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
+
+
+    compound_paths_3 = {}
+    for k, val in compound_paths_2.items():
+        
+        start_node, NA, end_node = k
+        rs = reverse_end(end_node)
+        rt = reverse_end(start_node)
+        assert (rs, "NA", rt) in compound_paths_2
+        
+        contained = False
+        for vv, ww, kk in ug.out_edges(start_node, keys=True):
+            if len(edge_to_cpath.get( (vv, ww, kk), [] )) > 1: 
+                contained = True
+
+        if not contained:
+            compound_paths_3[k] = val
+            if DEBUG_LOG_LEVEL > 1:
+                print "compound", k 
+
+    compound_paths = {}
+    for s, v, t in compound_paths_3:
+        rs = reverse_end(t)
+        rt = reverse_end(s)
+        if (rs, "NA", rt) not in compound_paths_3:
+            continue
+        compound_paths[ (s, v, t) ] = compound_paths_3[ (s, v, t) ]
+
+    return compound_paths
+
+def main(*argv):
+    import argparse
+    
+    parser = argparse.ArgumentParser(description='an example string graph assembler designed for handling diploid genomes')
+    parser.add_argument('overlap_file', help='a file that contains the overlap information.')
+
+    parser.add_argument('--min_len', type=int, default=4000, 
+                        help='minimum length of the reads to be considered for assembling')
+    parser.add_argument('--min_idt', type=float, default=96,
+                        help='minimum alignment identity of the reads to be considered for assembling')
+    parser.add_argument('--lfc', action="store_true", default=False,
+                        help='use local flow constraint method rather than best overlap method to resolve knots in string graph')
+
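+    # Illustrative invocation (the argument values here are examples only):
+    #   main('prog', 'preads.ovl', '--min_len', '8000', '--min_idt', '96')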
+    args = parser.parse_args(argv[1:])
+
+
+    # transitivity reduction, remove spurs, remove putative edges caused by repeats
+    sg, sg_r, edge_data = generate_string_graph(args)
+
+
+    simple_paths = {}
+    dual_path = {}
+
+
+    sg2 = nx.DiGraph()
+
+    for v, w in edge_data:
+
+        assert (reverse_end(w), reverse_end(v)) in edge_data
+        
+        #if (v, w) in masked_edges:
+        #    continue
+
+        rid, sp, tp, length, score, identity, type_ = edge_data[ (v, w) ]
+        if type_ != "G":
+            continue
+
+        label = "%s:%d-%d" % (rid, sp, tp)
+        sg2.add_edge( v, w, label = label, length = length, score = score)
+
+        
+    # utg construction phase 1, identify all simple paths
+    s_nodes = set()
+    t_nodes = set()
+    simple_nodes = set()
+
+    all_nodes = sg2.nodes()
+    for n in all_nodes:
+        in_degree = len( sg2.in_edges(n) )
+        out_degree = len( sg2.out_edges(n) )
+        if in_degree == 1 and out_degree == 1:
+            simple_nodes.add(n)
+        else:
+            if out_degree != 0:
+                s_nodes.add(n)
+            if in_degree != 0:
+                t_nodes.add(n)
+
+
+    free_edges = set(sg2.edges())
+
+    if DEBUG_LOG_LEVEL > 1: 
+        for s in list(simple_nodes):
+            print "simple_node", s
+        for s in list(s_nodes):
+            print "s_node", s
+        for s in list(t_nodes):
+            print "t_node", s
+
+        for v,w in free_edges:
+            if (reverse_end(w), reverse_end(v) ) not in free_edges:
+                print "bug", v,w
+                print reverse_end(w), reverse_end(v)
+
+    while len(free_edges) != 0:
+        if len(s_nodes) != 0:
+            n = s_nodes.pop()
+            if DEBUG_LOG_LEVEL > 1:
+                print "initial utg 1", n
+        else:
+            e = free_edges.pop()
+            free_edges.add(e)
+            n = e[0]
+            if DEBUG_LOG_LEVEL > 1:
+                print "initial utg 2", n
+
+        path = []
+        path_length =0
+        path_score = 0 
+        for v, w in sg2.out_edges(n):
+            if (v, w) not in free_edges:
+                continue
+            rv = reverse_end(v)
+            rw = reverse_end(w)
+
+            path_length = 0
+            path_score = 0
+            v0 = v
+            w0 = w
+            path = [v, w]
+            path_edges = set()
+            path_edges.add( (v, w) )
+            path_length += edge_data[ (v, w) ][3]
+            path_score += edge_data[ (v, w) ][4]
+            free_edges.remove( (v, w) )
+
+            r_path_length = 0
+            r_path_score = 0
+            rv0 = rv
+            rw0 = rw
+            r_path = [rv, rw] # need to reverse again
+            r_path_edges = set()
+            r_path_edges.add( (rw, rv) )
+            r_path_length += edge_data[ (rw, rv) ][3]
+            r_path_score += edge_data[ (rw, rv) ][4]
+            free_edges.remove( (rw, rv) )
+
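+            # Extend the simple path while the head node stays simple (in-degree == out-degree == 1),
+            # consuming each forward edge and its reverse-complement twin in lock-step.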
+            while w in simple_nodes:
+                w, w_ = sg2.out_edges(w)[0]
+                if (w, w_) not in free_edges:
+                    break
+                rw_, rw = reverse_end(w_), reverse_end(w)
+
+                if ( rw_, rw ) in path_edges:
+                    break
+
+                path.append(w_)
+                path_edges.add( (w, w_) )
+                path_length += edge_data[ (w, w_) ][3]
+                path_score += edge_data[ (w, w_) ][4]
+                free_edges.remove( (w, w_) )
+                
+                r_path.append(rw_)
+                r_path_edges.add( (rw_, rw) )
+                r_path_length += edge_data[ (rw_, rw) ][3]
+                r_path_score += edge_data[ (rw_, rw) ][4]
+                free_edges.remove( (rw_, rw) )
+                
+
+                w = w_
+
+            simple_paths[ (v0, w0, path[-1]) ] = path_length, path_score, path
+            r_path.reverse()
+            assert r_path[0] == reverse_end(path[-1])
+            simple_paths[ (r_path[0], rw0, rv0) ] = r_path_length, r_path_score, r_path
+
+            if DEBUG_LOG_LEVEL > 1:
+                print  path_length, path_score, path
+
+            dual_path[ (r_path[0], rw0, rv0) ] = (v0, w0, path[-1])
+            dual_path[ (v0, w0, path[-1]) ] = (r_path[0], rw0, rv0)
+
+
+
+    ug = nx.MultiDiGraph()
+    u_edge_data = {}
+    circular_path = set()
+
+    for s, v, t in simple_paths:
+        length, score, path = simple_paths[ (s, v, t) ]
+        u_edge_data[ (s, t, v) ] = (length, score, path, "simple")
+        if s != t:
+            ug.add_edge(s, t, key = v, type_ = "simple", via = v, length = length, score = score)
+        else:
+            circular_path.add( (s, t, v) )
+
+
+    if DEBUG_LOG_LEVEL > 1:
+        with open("utg_data0","w") as f:
+            for s, t, v in u_edge_data:
+                rs = reverse_end(t)
+                rt = reverse_end(s)
+                rv = reverse_end(v)
+                assert (rs, rt, rv) in u_edge_data
+                length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+                
+                if type_ == "compound":
+                    path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
+                else:
+                    path_or_edges = "~".join( path_or_edges )
+                print >>f, s, v, t, type_, length, score, path_or_edges
+
+    # identify spurs in the utg graph
+    # Currently, we use ad-hoc logic to filter out shorter utgs, but we can
+    # add proper alignment comparison later to remove redundant utgs 
+
+    utg_spurs = set()
+    all_nodes = ug.nodes()
+
+    ug2 = ug.copy()
+    spur_edges = set()
+    edges_to_remove = set()
+
+    for n in s_nodes:
+        if ug.in_degree(n) != 0:
+            continue
+        for s, t, v in ug.out_edges(n, keys=True):
+            length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+            if length > 50000 and len(edges) > 3:
+                continue
+            in_degree = len( set( e[0] for e in ug.in_edges(t))  ) # ignore multi-edges
+            out_degree = len( set( e[1] for e in ug.out_edges(t)) )
+            if in_degree > 1 and out_degree > 0:
+                spur_edges.add( (s, t, v) )
+                edges_to_remove.add( (s, t, v) )
+                u_edge_data[ (s, t, v) ] = length, score, edges, "spur:2"
+                rs = reverse_end(t)
+                rt = reverse_end(s)
+                rv = reverse_end(v)
+                edges_to_remove.add( (rs, rt, rv) )
+                length, score, edges, type_ = u_edge_data[ (rs, rt, rv) ]
+                u_edge_data[ (rs, rt, rv) ] = length, score, edges, "spur:2"
+
+    for n in t_nodes:
+        if ug.out_degree(n) != 0:
+            continue
+        for s, t, v in ug.in_edges(n, keys=True):
+            length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+            if length > 50000 and len(edges) > 3:
+                continue
+            in_degree = len( set( e[0] for e in ug.in_edges(s))  ) # ignore multi-edges
+            out_degree = len( set( e[1] for e in ug.out_edges(s)) )
+            if in_degree > 0 and out_degree > 1:
+                spur_edges.add( (s, t, v) )
+                edges_to_remove.add( (s, t, v) )
+                u_edge_data[ (s, t, v) ] = length, score, edges, "spur:2"
+                rs = reverse_end(t)
+                rt = reverse_end(s)
+                rv = reverse_end(v)
+                edges_to_remove.add( (rs, rt, rv) )
+                length, score, edges, type_ = u_edge_data[ (rs, rt, rv) ]
+                u_edge_data[ (rs, rt, rv) ] = length, score, edges, "spur:2"
+
+    for s, t, v in list(edges_to_remove):
+        ug2.remove_edge( s, t, key= v)
+
+    #phase 2, finding all "consistent" compound paths
+    compound_paths = construct_compound_paths(ug2, u_edge_data)
+    compound_path_file = open("c_path","w")
+
+    ug2_edges = set(ug2.edges(keys = True))
+    edges_to_remove  = set()
+    for s, v, t in compound_paths:
+        width, length, score, bundle_edges =  compound_paths[ (s, v, t) ] 
+        print >> compound_path_file, s,v,t, width, length, score, "|".join( [e[0]+"~"+e[2]+"~"+e[1] for e in bundle_edges] )
+        for ss, tt, vv in bundle_edges:
+            if (ss, tt, vv) in ug2_edges:
+                edges_to_remove.add( (ss, tt, vv) )
+
+    
+    for s, t, v in edges_to_remove:
+        ug2.remove_edge( s, t ,v )
+        length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+        if type_ != "spur":
+            u_edge_data[ (s, t, v) ] = length, score, edges, "contained"
+
+
+    for s, v, t in compound_paths:
+        width, length, score, bundle_edges =  compound_paths[ (s, v, t) ] 
+        u_edge_data[ (s, t, v) ] = (length, score, bundle_edges, "compound")
+        ug2.add_edge( s, t, key = v, via = v, type_="compound", length = length, score = score)
+
+        assert v == "NA"
+        rs = reverse_end(t)
+        rt = reverse_end(s)
+        assert (rs, v, rt) in compound_paths
+        dual_path[ (s, v, t) ] = (rs, v, rt)
+        dual_path[ (rs, v, rt) ] = (s, v, t)
+
+    compound_path_file.close()
+
+
+    # remove short utgs using a local flow-consistency rule
+    """
+      Short UTGs like this can be removed; this kind of utg is likely an artifact of repeats 
+      >____           _____>
+           \__UTG_>__/
+      <____/         \_____<
+    """
+    ug_edge_to_remove = set() 
+    for s, t, v in ug.edges(keys=True):
+        if ug2.in_degree(s) == 1 and ug2.out_degree(s) == 2 and \
+           ug2.in_degree(t) == 2 and ug2.out_degree(t) == 1:
+            length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+            if length < 60000: 
+                rs = reverse_end(t)
+                rt = reverse_end(s)
+                rv = reverse_end(v)
+                ug_edge_to_remove.add( (s, t, v) )
+                ug_edge_to_remove.add( (rs, rt, rv) )
+    for s, t, v in list(ug_edge_to_remove):
+        ug2.remove_edge(s, t, key=v)
+        length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+        u_edge_data[ (s, t, v) ] = length, score, edges, "repeat_bridge"
+
+    ug = ug2
+
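+    # utg_data columns: s v t type length score path_or_edges; compound unitigs serialize
+    # their bundle edges as "s~v~t|...", simple unitigs as a "~"-joined node path.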
+    with open("utg_data","w") as f:
+        for s, t, v in u_edge_data:
+            length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+            
+            if v == "NA":
+                path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
+            else:
+                path_or_edges = "~".join( path_or_edges )
+            print >>f, s, v, t, type_, length, score, path_or_edges
+
+    # contig construction from utgs
+
+    s_nodes = set()
+    t_nodes = set()
+    simple_nodes = set()
+    simple_out = set()
+    simple_in = set()
+
+    all_nodes = ug.nodes()
+    for n in all_nodes:
+        in_degree = len( ug.in_edges(n) )
+        out_degree = len( ug.out_edges(n) )
+        if in_degree == 1 and out_degree == 1:
+            simple_nodes.add(n)
+        else:
+            if out_degree != 0:
+                s_nodes.add(n)
+            if in_degree != 0:
+                t_nodes.add(n)
+        if out_degree == 1:
+            simple_out.add(n)
+        if in_degree == 1:
+            simple_in.add(n)
+
+    all_nodes = set(all_nodes)
+    c_path = []
+    
+    free_edges = set()
+    for s, t, v in ug.edges(keys=True):
+        free_edges.add( (s, t, v) )
+
+    while len(free_edges) != 0:
+
+        if len(s_nodes) != 0:
+            n = s_nodes.pop()
+        else:
+            e = free_edges.pop()
+            n = e[0]
+        
+        for s, t, v in ug.out_edges(n, keys=True):
+            path_start = n
+            path_end = None
+            path_key = None
+            path = []
+            path_length = 0
+            path_score = 0
+            path_nodes = set()
+            path_nodes.add(s)
+            if DEBUG_LOG_LEVEL > 1:
+                print "check 1", s, t, v
+            path_key = t
+            t0 = s
+            while t in simple_out:
+                if t in path_nodes:
+                    break
+                rt = reverse_end(t)
+                if rt in path_nodes:
+                    break
+
+                length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
+
+               
+                """
+                If the next node has more than one in-edge, we extend the contig only when
+                the current path enters through the best overlap; otherwise, we terminate the
+                contig extension. This can help reduce some mis-assemblies while still
+                constructing long contigs when there is an opportunity (assuming the best
+                overlap has the highest likelihood of being correct).
+                """
+                if len(ug.in_edges(t, keys=True)) > 1:
+                    best_in_node = sg.node[t]["best_in"] 
+                    
+                    if type_ == "simple" and best_in_node != path_or_edges[-2]:
+                        break
+                    if type_ == "compound":
+                        t_in_nodes = set()
+                        for ss, vv, tt in path_or_edges:
+                            if tt != t:
+                                continue
+                            length, score, path_or_edges, type_ = u_edge_data[ (ss,vv,tt) ]
+                            if path_or_edges[-1] == tt:
+                                t_in_nodes.add(path_or_edges[-2])
+                        if best_in_node not in t_in_nodes:
+                            break
+                # ----------------
+
+
+                path.append( (t0, t, v) )
+                path_nodes.add(t)
+                path_length += length
+                path_score += score
+                assert len( ug.out_edges( t, keys=True ) ) == 1 # t is "simple_out" node
+                t0, t, v = ug.out_edges( t, keys=True )[0] 
+
+            path.append( (t0, t, v) )
+            length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
+            path_length += length
+            path_score += score
+            path_nodes.add(t)
+            path_end = t
+
+            c_path.append( (path_start, path_key, path_end, path_length, path_score, path, len(path)) ) 
+            if DEBUG_LOG_LEVEL > 1:
+                print "c_path", path_start, path_key, path_end, path_length, path_score, len(path)
+            for e in path:
+                if e in free_edges:
+                    free_edges.remove( e )
+ 
+    if DEBUG_LOG_LEVEL > 1:
+        print "left over edges:", len(free_edges)
+
+
+
+    free_edges = set()
+    for s, t, v in ug.edges(keys=True):
+        free_edges.add( (s, t, v) )
+
+
+    ctg_id = 0
+
+    ctg_paths = open("ctg_paths","w")
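+    # ctg_paths columns: ctg_id type start_edge(s~v~t) end_node length score edge_path("|"-joined)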
+
+    c_path.sort( key=lambda x: -x[3] )
+
+    
+    for path_start, path_key, path_end, p_len, p_score, path, n_edges in c_path:
+        length = 0
+        score = 0
+        length_r = 0
+        score_r = 0
+
+        non_overlapped_path = []
+        non_overlapped_path_r = []
+        for s, t, v in path:
+            if v != "NA": 
+                rs, rt, rv = reverse_end(t), reverse_end(s), reverse_end(v)
+            else:
+                rs, rt, rv = reverse_end(t), reverse_end(s), "NA"
+            if (s, t, v) in free_edges and (rs, rt, rv) in free_edges:
+                non_overlapped_path.append( (s,t,v) )
+                non_overlapped_path_r.append( (rs, rt, rv)  )
+                length += u_edge_data[ (s, t, v) ][0]
+                score += u_edge_data[ (s, t, v) ][1]
+                length_r += u_edge_data[ (rs, rt, rv) ][0]
+                score_r += u_edge_data[ (rs, rt, rv) ][1]
+            else:
+                break
+
+        if len(non_overlapped_path) == 0:
+            continue
+        s0, t0, v0 = non_overlapped_path[0]
+        end_node = non_overlapped_path[-1][1]
+
+        print >> ctg_paths, "%06dF" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length, score, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path ] )
+        non_overlapped_path_r.reverse()
+        s0, t0, v0 = non_overlapped_path_r[0]
+        end_node = non_overlapped_path_r[-1][1]
+        print >> ctg_paths, "%06dR" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length_r, score_r, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path_r ] )
+        ctg_id += 1
+        for e in non_overlapped_path:
+            if e in free_edges:
+                free_edges.remove(e)
+        for e in non_overlapped_path_r:
+            if e in free_edges:
+                free_edges.remove(e)
+
+
+
+    for s, t, v in list(circular_path):
+        length, score, path, type_ = u_edge_data[ (s, t, v) ]
+        print >> ctg_paths, "%6d" % ctg_id, "ctg_circular", s+"~"+v+"~"+t, t, length, score, s+"~"+v+"~"+t
+        ctg_id += 1
+
+    ctg_paths.close()
+
diff --git a/src/py/mains/run.py b/src/py/mains/run.py
new file mode 100644
index 0000000..44e9334
--- /dev/null
+++ b/src/py/mains/run.py
@@ -0,0 +1,566 @@
+from .. import run_support as support
+from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn
+from pypeflow.task import PypeTask, PypeThreadTaskBase, PypeTaskBase
+from pypeflow.controller import PypeWorkflow, PypeThreadWorkflow
+from falcon_kit.FastaReader import FastaReader
+import glob
+import sys
+import os
+import re
+import time
+import hashlib
+
+
+wait_time = 5
+
+def run_script(job_data, job_type = "SGE" ):
+    """For now, we actually modify the script before running it.
+    This assumes a simple bash script.
+    We will have a better solution eventually.
+    """
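+    # job_data is expected to look roughly like this (hypothetical values), as produced by
+    # support.make_job_data() plus the caller-supplied "sge_option":
+    #   {"job_name": "prepare_rdb.sh-<task>-<uid>", "cwd": "/abs/work_dir",
+    #    "script_fn": "/abs/work_dir/prepare_rdb.sh", "sge_option": "-pe smp 8"}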
+    script_fn = job_data["script_fn"]
+    support.update_env_in_script(script_fn,
+        ['PATH', 'PYTHONPATH', 'LD_LIBRARY_PATH'])
+    if job_type == "SGE":
+        job_name = job_data["job_name"]
+        cwd = job_data["cwd"]
+        sge_option = job_data["sge_option"]
+        sge_cmd="qsub -N {job_name} {sge_option} -o {cwd}/sge_log -j y\
+                 -S /bin/bash {script}".format(job_name=job_name,  
+                                               cwd=os.getcwd(), 
+                                               sge_option=sge_option, 
+                                               script=script_fn)
+
+        fc_run_logger.info( "submitting %s for SGE, start job: %s " % (script_fn, job_name) )
+        cmd = sge_cmd
+        rc = os.system(cmd)
+    elif job_type == "SLURM":
+        job_name = job_data["job_name"]
+        cwd = job_data["cwd"]
+        sge_option = job_data["sge_option"]
+        fc_run_logger.info( "submitting %s to SLURM, start job: %s " % (script_fn, job_name) )
+        sge_cmd="sbatch -J {job_name} {sge_option} {script}".format(job_name=job_name, cwd=os.getcwd(),sge_option=sge_option, script=script_fn)
+        cmd = sge_cmd
+        rc = os.system(cmd)
+    elif job_type == "local":
+        job_name = job_data["job_name"]
+        fc_run_logger.info( "executing %r locally, start job: %r " % (script_fn, job_name) )
+        log_fn = '{0}.log'.format(script_fn)
+        cmd = "bash {0} 1> {1} 2>&1".format(script_fn, log_fn)
+        rc = os.system(cmd)
+        if rc:
+            out = open(log_fn).read()
+            fc_run_logger.warning('Contents of %r:\n%s' %(log_fn, out))
+    if rc:
+        msg = "Cmd %r (job %r) returned %d." % (cmd, job_name, rc)
+        fc_run_logger.info(msg)
+        # For non-qsub, this might still help with debugging. But technically
+        # we should not raise here, as a failure should be noticed later.
+        # When we are confident that script failures are handled well,
+        # we can make this optional.
+        raise Exception(msg)
+    else:
+        msg = "Cmd %r (job %r) returned %d" % (cmd, job_name, rc)
+        fc_run_logger.debug(msg)
+
+def wait_for_file(filename, task, job_name = ""):
+    """We could be in the thread or sub-process which spawned a qsub job,
+    so we must check for the shutdown_event.
+    """
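+    # Sentinel protocol used by the generated scripts: a bash EXIT trap touches
+    # "<filename>.exit" on any exit, while "<filename>" itself is touched only on success;
+    # a present .exit without <filename> therefore signals failure.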
+    while 1:
+        time.sleep(wait_time)
+        # We prefer all jobs to rely on `*done.exit`, but not all do yet, so we check that first.
+        exit_fn = filename + '.exit'
+        if os.path.exists(exit_fn):
+            fc_run_logger.info( "%r generated. job: %r exited." % (exit_fn, job_name) )
+            os.unlink(exit_fn) # to allow a restart later, if not done
+            if not os.path.exists(filename):
+                fc_run_logger.warning( "%r is missing. job: %r failed!" % (filename, job_name) )
+            break
+        if os.path.exists(filename) and not os.path.exists(exit_fn):
+            # (rechecked exit_fn to avoid race condition)
+            fc_run_logger.info( "%r generated. job: %r finished." % (filename, job_name) )
+            break
+        if task.shutdown_event is not None and task.shutdown_event.is_set():
+            fc_run_logger.warning( "shutdown_event received (Keyboard Interrupt maybe?), %r not finished."
+                % (job_name) )
+            if support.job_type == "SGE":
+                fc_run_logger.info( "deleting the job by `qdel` now..." )
+                os.system("qdel %s" % job_name) # Failure is ok.
+            break
+
+def task_make_fofn_abs_raw(self):
+    return support.make_fofn_abs(self.i_fofn.path, self.o_fofn.path)
+
+def task_make_fofn_abs_preads(self):
+    return support.make_fofn_abs(self.i_fofn.path, self.o_fofn.path)
+
+def task_build_rdb(self):
+    input_fofn_fn = fn(self.input_fofn)
+    job_done = fn(self.rdb_build_done)
+    work_dir = self.parameters["work_dir"]
+    config = self.parameters["config"]
+    sge_option_da = config["sge_option_da"]
+
+    script_fn = os.path.join( work_dir, "prepare_rdb.sh" )
+    args = {
+        'input_fofn_fn': input_fofn_fn,
+        'work_dir': work_dir,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+        'run_jobs_fn': fn(self.run_jobs),
+    }
+    support.build_rdb(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = sge_option_da
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+def task_build_pdb(self):  # essentially the same as build_rdb(), but the subtle differences are tricky to consolidate into one function
+    input_fofn_fn = fn(self.pread_fofn)
+    job_done = fn(self.pdb_build_done)
+    work_dir = self.parameters["work_dir"]
+    config = self.parameters["config"]
+    sge_option_pda = config["sge_option_pda"]
+
+    script_fn = os.path.join( work_dir, "prepare_pdb.sh" )
+    args = {
+        'input_fofn_fn': input_fofn_fn,
+        'work_dir': work_dir,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+        'run_jobs_fn': fn(self.run_jobs),
+    }
+    support.build_pdb(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = sge_option_pda
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+def task_run_falcon_asm(self):
+    wd = self.parameters["wd"]
+    #p_merge_done = self.p_merge_done
+    db_file = fn(self.db_file)
+    job_done = fn(self.falcon_asm_done)
+    config = self.parameters["config"]
+    pread_dir = self.parameters["pread_dir"]
+    script_dir = os.path.join( wd )
+    script_fn =  os.path.join( script_dir ,"run_falcon_asm.sh" )
+    args = {
+        'pread_dir': pread_dir,
+        'db_file': db_file,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+    }
+    support.run_falcon_asm(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = config["sge_option_fc"]
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+def task_run_daligner(self):
+    job_done = fn(self.job_done)
+    daligner_cmd = self.parameters["daligner_cmd"]
+    job_uid = self.parameters["job_uid"]
+    cwd = self.parameters["cwd"]
+    db_prefix = self.parameters["db_prefix"]
+    nblock = self.parameters["nblock"]
+    config = self.parameters["config"]
+    sge_option_da = config["sge_option_da"]
+    script_dir = os.path.join( cwd )
+    script_fn =  os.path.join( script_dir , "rj_%s.sh" % (job_uid))
+    args = {
+        'daligner_cmd': daligner_cmd,
+        'db_prefix': db_prefix,
+        'nblock': nblock,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+    }
+    support.run_daligner(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = sge_option_da
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+def task_run_las_merge(self):
+    p_script_fn = self.parameters["merge_script"]
+    job_id = self.parameters["job_id"]
+    cwd = self.parameters["cwd"]
+    job_done = fn(self.job_done)
+    config = self.parameters["config"]
+    sge_option_la = config["sge_option_la"]
+
+    script_dir = os.path.join( cwd )
+    script_fn =  os.path.join( script_dir , "rp_%05d.sh" % (job_id))
+    args = {
+        'p_script_fn': p_script_fn,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+    }
+    support.run_las_merge(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = sge_option_la
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+def task_run_consensus(self):
+    out_file_fn = fn(self.out_file)
+    job_id = self.parameters["job_id"]
+    cwd = self.parameters["cwd"]
+    config = self.parameters["config"]
+    prefix = self.parameters["prefix"]
+    sge_option_cns = config["sge_option_cns"]
+    script_dir = os.path.join( cwd )
+    job_done = os.path.join( cwd, "c_%05d_done" % job_id )
+    script_fn =  os.path.join( script_dir , "c_%05d.sh" % (job_id))
+    args = {
+        'job_id': job_id,
+        'out_file_fn': out_file_fn,
+        'prefix': prefix,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+    }
+    support.run_consensus(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = sge_option_cns
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+
+def create_daligner_tasks(run_jobs_fn, wd, db_prefix, db_file, rdb_build_done, config, pread_aln = False):
+    job_id = 0
+    tasks = []
+    tasks_out = {}
+
+    nblock = 1
+    new_db = True
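+    # The .db text stub written by fasta2DB/DBsplit contains a line of the form "blocks = <n>";
+    # read it to learn how many blocks the database was split into (nblock stays 1 for a new DB).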
+    if os.path.exists( fn(db_file) ):
+        with open( fn(db_file) ) as f:
+            for l in f:
+                l = l.strip().split()
+                if l[0] == "blocks" and l[1] == "=":
+                    nblock = int(l[2])
+                    new_db = False
+                    break
+
+    for pid in xrange(1, nblock + 1):
+        support.make_dirs("%s/m_%05d" % (wd, pid))
+
+    with open(run_jobs_fn) as f :
+        for l in f :
+            l = l.strip()
+            job_uid = hashlib.md5(l).hexdigest()
+            job_uid = job_uid[:8]
+            l = l.split()
+            if l[0] == "daligner":
+                support.make_dirs(os.path.join( wd, "./job_%s" % job_uid))
+                call = "cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (wd, job_uid, db_prefix, db_prefix, db_prefix)
+                rc = os.system(call)
+                if rc:
+                    raise Exception("Failure in system call: %r -> %d" %(call, rc))
+                job_done = makePypeLocalFile(os.path.abspath( "%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)  ))
+                if pread_aln == True:
+                    l[0] = "daligner_p"
+                parameters =  {"daligner_cmd": " ".join(l),
+                               "cwd": os.path.join(wd, "job_%s" % job_uid),
+                               "job_uid": job_uid,
+                               "config": config,
+                               "nblock": nblock,
+                               "db_prefix": db_prefix}
+                make_daligner_task = PypeTask( inputs = {"rdb_build_done": rdb_build_done},
+                                               outputs = {"job_done": job_done},
+                                               parameters = parameters,
+                                               TaskType = PypeThreadTaskBase,
+                                               URL = "task://localhost/d_%s_%s" % (job_uid, db_prefix) )
+                daligner_task = make_daligner_task( task_run_daligner )
+                tasks.append( daligner_task )
+                tasks_out[ "ajob_%s" % job_uid ] = job_done
+                job_id += 1
+    return tasks, tasks_out
+
+def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config):
+    merge_tasks = []
+    consensus_tasks = []
+    merge_out = {}
+    consensus_out ={}
+    mjob_data = {}
+
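+    # Group the LAsort/LAmerge/mv commands from the HPCdaligner-generated run_jobs.sh by
+    # block id (parsed from the dot-separated .las target name) so each block gets one merge job.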
+    with open(run_jobs_fn) as f :
+        for l in f:
+            l = l.strip().split()
+            if l[0] not in ( "LAsort", "LAmerge", "mv" ):
+                continue
+            if l[0] == "LAsort":
+                p_id = int( l[2].split(".")[1] )
+                mjob_data.setdefault( p_id, [] )
+                mjob_data[p_id].append(  " ".join(l) )
+            if l[0] == "LAmerge":
+                l2 = l[2].split(".")
+                if l2[1][0] == "L":
+                    p_id = int(  l[2].split(".")[2] )
+                    mjob_data.setdefault( p_id, [] )
+                    mjob_data[p_id].append(  " ".join(l) )
+                else:
+                    p_id = int( l[2].split(".")[1] )
+                    mjob_data.setdefault( p_id, [] )
+                    mjob_data[p_id].append(  " ".join(l) )
+            if l[0] == "mv":
+                l2 = l[1].split(".")
+                if l2[1][0] == "L":
+                    p_id = int(  l[1].split(".")[2] )
+                    mjob_data.setdefault( p_id, [] )
+                    mjob_data[p_id].append(  " ".join(l) )
+                else:
+                    p_id = int( l[1].split(".")[1] )
+                    mjob_data.setdefault( p_id, [] )
+                    mjob_data[p_id].append(  " ".join(l) )
+
+    for p_id in mjob_data:
+        s_data = mjob_data[p_id]
+
+        support.make_dirs("%s/m_%05d" % (wd, p_id))
+        support.make_dirs("%s/preads" % (wd) )
+        support.make_dirs("%s/las_files" % (wd) )
+
+        merge_script_file = os.path.abspath( "%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id) )
+        with open(merge_script_file, "w") as merge_script:
+            #print >> merge_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (db_prefix, p_id, db_prefix)
+            for l in s_data:
+                print >> merge_script, l
+            print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (p_id, db_prefix, p_id) 
+            print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (p_id, db_prefix, p_id) 
+            
+        job_done = makePypeLocalFile(os.path.abspath( "%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)  ))
+        parameters =  {"merge_script": merge_script_file, 
+                       "cwd": os.path.join(wd, "m_%05d" % p_id),
+                       "job_id": p_id,
+                       "config": config}
+
+        make_merge_task = PypeTask( inputs = {"input_dep": input_dep},
+                                       outputs = {"job_done": job_done},
+                                       parameters = parameters,
+                                       TaskType = PypeThreadTaskBase,
+                                       URL = "task://localhost/m_%05d_%s" % (p_id, db_prefix) )
+        merge_task = make_merge_task ( task_run_las_merge)
+
+        merge_out["mjob_%d" % p_id] = job_done
+        merge_tasks.append(merge_task)
+
+
+        out_file = makePypeLocalFile(os.path.abspath( "%s/preads/out.%05d.fasta" % (wd, p_id)  ))
+        out_done = makePypeLocalFile(os.path.abspath( "%s/preads/c_%05d_done" % (wd, p_id)  ))
+        parameters =  {"cwd": os.path.join(wd, "preads" ),
+                       "job_id": p_id, 
+                       "prefix": db_prefix,
+                       "config": config}
+        make_c_task = PypeTask( inputs = {"job_done": job_done},
+                                outputs = {"out_file": out_file, "out_done": out_done },
+                                parameters = parameters,
+                                TaskType = PypeThreadTaskBase,
+                                URL = "task://localhost/ct_%05d" % p_id )
+        
+        c_task = make_c_task( task_run_consensus)
+        consensus_tasks.append(c_task)
+        consensus_out["cjob_%d" % p_id] = out_done 
+
+    return merge_tasks, merge_out, consensus_tasks, consensus_out
+
+
+
+def main1(prog_name, input_config_fn, logger_config_fn=None):
+    global fc_run_logger
+    fc_run_logger = support.setup_logger(logger_config_fn)
+
+    fc_run_logger.info( "fc_run started with configuration %s", input_config_fn ) 
+    config = support.get_config(support.parse_config(input_config_fn))
+    rawread_dir = os.path.abspath("./0-rawreads")
+    pread_dir = os.path.abspath("./1-preads_ovl")
+    falcon_asm_dir  = os.path.abspath("./2-asm-falcon")
+    script_dir = os.path.abspath("./scripts")
+    sge_log_dir = os.path.abspath("./sge_log")
+
+    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
+        support.make_dirs(d)
+
+    concurrent_jobs = config["pa_concurrent_jobs"]
+    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
+    wf = PypeThreadWorkflow()
+
+    input_fofn_plf = makePypeLocalFile(os.path.basename(config["input_fofn_fn"]))
+    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
+    make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf},
+                                  outputs = {"o_fofn": rawread_fofn_plf},
+                                  parameters = {},
+                                  TaskType = PypeThreadTaskBase)
+    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)
+
+    wf.addTasks([fofn_abs_task])
+    wf.refreshTargets([fofn_abs_task])
+
+    if config["input_type"] == "raw":
+        #### import sequences into daligner DB
+        sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") )
+        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) 
+        run_jobs = makePypeLocalFile( os.path.join( rawread_dir, "run_jobs.sh") ) 
+        parameters = {"work_dir": rawread_dir,
+                      "config": config}
+
+        make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf},
+                                      outputs = {"rdb_build_done": rdb_build_done,
+                                                 "run_jobs": run_jobs}, 
+                                      parameters = parameters,
+                                      TaskType = PypeThreadTaskBase)
+        build_rdb_task = make_build_rdb_task(task_build_rdb)
+
+        wf.addTasks([build_rdb_task])
+        wf.refreshTargets([rdb_build_done]) 
+
+        db_file = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" ))
+        #### run daligner
+        daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), rawread_dir, "raw_reads", db_file, rdb_build_done, config) 
+
+        wf.addTasks(daligner_tasks)
+        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs
+
+        r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") )
+
+        @PypeTask( inputs = daligner_out, 
+                   outputs =  {"da_done":r_da_done},
+                   TaskType = PypeThreadTaskBase,
+                   URL = "task://localhost/rda_check" )
+        def check_r_da_task(self):
+            os.system("touch %s" % fn(self.da_done))
+        
+        wf.addTask(check_r_da_task)
+        wf.refreshTargets(updateFreq = wait_time) # a larger updateFreq is better for more jobs; refreshTargets must be called here to run the jobs before the concurrency limit is changed
+        
+        concurrent_jobs = config["cns_concurrent_jobs"]
+        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
+        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
+        wf.addTasks( merge_tasks )
+        if config["target"] == "overlapping":
+            wf.refreshTargets(updateFreq = wait_time) # a larger updateFreq is better for more jobs; refreshTargets must be called here to run the jobs before the concurrency limit is changed
+            sys.exit(0)
+        wf.addTasks( consensus_tasks )
+
+        r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") )
+        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )
+
+        @PypeTask( inputs = consensus_out, 
+                   outputs =  {"cns_done":r_cns_done, "pread_fofn": pread_fofn},
+                   TaskType = PypeThreadTaskBase,
+                   URL = "task://localhost/cns_check" )
+        def check_r_cns_task(self):
+            with open(fn(self.pread_fofn),  "w") as f:
+                fn_list =  glob.glob("%s/preads/out*.fasta" % rawread_dir)
+                fn_list.sort()
+                for fa_fn in fn_list:
+                    print >>f, fa_fn
+            os.system("touch %s" % fn(self.cns_done))
+
+        wf.addTask(check_r_cns_task)
+        wf.refreshTargets(updateFreq = wait_time) # a larger updateFreq is better for more jobs
+
+    if config["target"] == "pre-assembly":
+        sys.exit(0)
+
+    # build pread database
+    if config["input_type"] == "preads":
+        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
+        make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf},
+                                     outputs = {"o_fofn": pread_fofn},
+                                     parameters = {},
+                                     TaskType = PypeThreadTaskBase)
+        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
+        wf.addTasks([fofn_abs_task])
+        wf.refreshTargets([fofn_abs_task])
+
+    pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") ) 
+    parameters = {"work_dir": pread_dir,
+                  "config": config}
+
+    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
+    make_build_pdb_task  = PypeTask(inputs = { "pread_fofn": pread_fofn },
+                                    outputs = { "pdb_build_done": pdb_build_done,
+                                                "run_jobs": run_jobs},
+                                    parameters = parameters,
+                                    TaskType = PypeThreadTaskBase,
+                                    URL = "task://localhost/build_pdb")
+    build_pdb_task = make_build_pdb_task(task_build_pdb)
+
+    wf.addTasks([build_pdb_task])
+    wf.refreshTargets([pdb_build_done]) 
+
+
+
+    db_file = makePypeLocalFile(os.path.join( pread_dir, "%s.db" % "preads" ))
+    #### run daligner
+    concurrent_jobs = config["ovlp_concurrent_jobs"]
+    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
+    config["sge_option_da"] = config["sge_option_pda"]
+    config["sge_option_la"] = config["sge_option_pla"]
+    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), pread_dir, "preads", db_file, pdb_build_done, config, pread_aln= True) 
+    wf.addTasks(daligner_tasks)
+    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs
+
+    p_da_done = makePypeLocalFile( os.path.join( pread_dir, "da_done") )
+
+    @PypeTask( inputs = daligner_out, 
+               outputs =  {"da_done":p_da_done},
+               TaskType = PypeThreadTaskBase,
+               URL = "task://localhost/pda_check" )
+    def check_p_da_task(self):
+        os.system("touch %s" % fn(self.da_done))
+    
+    wf.addTask(check_p_da_task)
+
+    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(fn(run_jobs), pread_dir, "preads", p_da_done, config)
+    wf.addTasks( merge_tasks )
+    #wf.refreshTargets(updateFreq = 30) #all
+
+    p_merge_done = makePypeLocalFile( os.path.join( pread_dir, "p_merge_done") )
+
+    @PypeTask( inputs = merge_out, 
+               outputs =  {"p_merge_done":p_merge_done},
+               TaskType = PypeThreadTaskBase,
+               URL = "task://localhost/pmerge_check" )
+    def check_p_merge_check_task(self):
+        os.system("touch %s" % fn(self.p_merge_done))
+    
+    wf.addTask(check_p_merge_check_task)
+    wf.refreshTargets(updateFreq = wait_time) #all
+
+    
+    falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") )
+    make_run_falcon_asm = PypeTask(
+               inputs = {"p_merge_done": p_merge_done, "db_file":db_file},
+               outputs =  {"falcon_asm_done":falcon_asm_done},
+               parameters = {"wd": falcon_asm_dir,
+                             "config": config,
+                             "pread_dir": pread_dir},
+               TaskType = PypeThreadTaskBase,
+               URL = "task://localhost/falcon" )
+    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
+    wf.refreshTargets(updateFreq = wait_time) #all
+
+
+def main(*argv):
+    if len(argv) < 2:
+        sys.stderr.write( """
+you need to specify a configuration file
+usage: fc_run.py fc_run.cfg [logging.cfg]
+""")
+        sys.exit(2)
+    main1(*argv)
diff --git a/src/py/mains/tasks.py b/src/py/mains/tasks.py
new file mode 100644
index 0000000..7783a21
--- /dev/null
+++ b/src/py/mains/tasks.py
@@ -0,0 +1,31 @@
+"""Executable tasks.
+
+To be called by pbsmrtpipe.
+
+pypeFLOW uses its own adaptors instead.
+"""
+from .. import run_support as support
+import sys
+
+
+def help():
+    print("""
+Usage:
+    falcon-task [task] <[task-args]>
+
+tasks:
+    make-fofn-abs
+""")
+    sys.exit(2)
+
+def main_make_fofn_abs(i_fofn_fn, o_fofn_fn):
+    support.make_fofn_abs(i_fofn_fn, o_fofn_fn)
+
+def main(argv=sys.argv):
+    if len(argv) < 2 or argv[1].startswith('-'):
+        help()
+    task = argv[1]
+    tasks = {
+        'make-fofn-abs': main_make_fofn_abs,
+    }
+    return tasks[task](*argv[2:])
diff --git a/src/py/multiproc.py b/src/py/multiproc.py
new file mode 100644
index 0000000..5cbf08b
--- /dev/null
+++ b/src/py/multiproc.py
@@ -0,0 +1,24 @@
+"""Job pools for multiprocessing.
+"""
+import multiprocessing
+import itertools
+
+class FakePool(object):
+    """Fake version of multiprocessing.Pool
+    """
+    def map(self, func, iterable, chunksize=None):
+        return map(func, iterable)
+    def imap(self, func, iterable, chunksize=None):
+        return itertools.imap(func, iterable)
+    def __init__(self, *args, **kwds):
+        pass
+
+def Pool(processes, *args, **kwds):
+    """Pool factory.
+    If 'not processes', return our FakePool;
+    otherwise, a multiprocessing.Pool.
+    """
+    if processes:
+        return multiprocessing.Pool(processes, *args, **kwds)
+    else:
+        return FakePool()
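+
+# Usage sketch (illustrative):
+#   pool = Pool(n_core)            # FakePool when n_core is 0/None, multiprocessing.Pool otherwise
+#   results = pool.imap(func, items)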
diff --git a/src/py/run_support.py b/src/py/run_support.py
new file mode 100644
index 0000000..a7c3fa7
--- /dev/null
+++ b/src/py/run_support.py
@@ -0,0 +1,419 @@
+import ConfigParser
+import logging
+import logging.config
+import os
+import StringIO
+import sys
+import tempfile
+import time
+import uuid
+
+job_type = None
+fc_run_logger = None
+
+def _prepend_env_paths(content, names):
+    """
+    E.g.
+      names = ['PATH', 'PYTHONPATH']
+      content =
+        echo hi
+      =>
+        export PATH=current:path:${PATH}
+        export PYTHONPATH=current:path:${PYTHONPATH}
+        echo hi
+    """
+    export_env_vars = ['export %(k)s=%(v)s:${%(k)s}' %dict(
+        k=name, v=os.environ.get(name, '')) for name in names]
+    return '\n'.join(export_env_vars + [content])
+
+def update_env_in_script(fn, names):
+    """Modify fn using _prepend_env_paths().
+    """
+    with open(fn) as ifs:
+        content = ifs.read()
+    content = _prepend_env_paths(content, names)
+    with open(fn, 'w') as ofs:
+        ofs.write(content)
+
+def use_tmpdir_for_files(basenames, src_dir, link_dir):
+    """Generate script to copy db files to tmpdir (for speed).
+    - Choose tmp_dir, based on src_dir name.
+    - rsync basenames into tmp_dir  # after 'flock', per file
+    - symlink from link_dir into tmp_dir.
+    Return list of script lines, sans linefeed.
+    """
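+    # Illustrative result for basenames=['raw_reads.db'] (hypothetical paths):
+    #   mkdir -p /tmp/falcon/_abs_src_dir
+    #   rm -f raw_reads.db
+    #   flock -w 600 /tmp/falcon/_abs_src_dir/raw_reads.db.lock -c 'rsync -av /abs/src_dir/raw_reads.db /tmp/falcon/_abs_src_dir/raw_reads.db'
+    #   ln -sf /tmp/falcon/_abs_src_dir/raw_reads.db raw_reads.db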
+    script = list()
+    unique = os.path.abspath(src_dir).replace('/', '_')
+    root = tempfile.gettempdir()
+    tmp_dir = os.path.join(root, 'falcon', unique)
+    script.append('mkdir -p %s' %tmp_dir)
+    for basename in basenames:
+        src = os.path.join(src_dir, basename)
+        dst = os.path.join(tmp_dir, basename)
+        rm_cmd = 'rm -f %s' %basename
+        # Wait on lock for up to 10 minutes, in case of very large files.
+        rsync_cmd = "flock -w 600 %s.lock -c 'rsync -av %s %s'" %(dst, src, dst)
+        ln_cmd = 'ln -sf %s %s' %(dst, basename)
+        script.extend([rm_cmd, rsync_cmd, ln_cmd])
+    return script
+
+def make_job_data(url, script_fn):
+    """Choose defaults.
+    Run in same directory as script_fn.
+    Base job_name on script_fn.
+    """
+    wd = os.path.dirname(script_fn)
+    job_name = '{0}-{1}-{2}'.format(
+            os.path.basename(script_fn),
+            url.split("/")[-1],
+            str(uuid.uuid4())[:8],
+            )
+    job_data = {"job_name": job_name,
+                "cwd": wd,
+                "script_fn": script_fn }
+    return job_data
+
+def parse_config(config_fn):
+    config = ConfigParser.ConfigParser()
+    config.read(config_fn)
+    return config
+
+def get_config(config):
+    global job_type  # TODO: Stop using global for wait_for_file().
+    job_type = "SGE"
+    if config.has_option('General', 'job_type'):
+        job_type = config.get('General', 'job_type')
+
+    pa_concurrent_jobs = 8
+    if config.has_option('General', 'pa_concurrent_jobs'):
+        pa_concurrent_jobs = config.getint('General', 'pa_concurrent_jobs')
+
+    cns_concurrent_jobs = 8
+    if config.has_option('General', 'cns_concurrent_jobs'):
+        cns_concurrent_jobs = config.getint('General', 'cns_concurrent_jobs')
+
+    ovlp_concurrent_jobs = 8
+    if config.has_option('General', 'ovlp_concurrent_jobs'):
+        ovlp_concurrent_jobs = config.getint('General', 'ovlp_concurrent_jobs')
+
+    #appending = False
+    #if config.has_option('General', 'appending'):
+    #    appending = config.get('General', 'appending')
+    #    if appending == "True":
+    #        appending = True
+
+    openending = False
+    if config.has_option('General', 'openending'):
+        openending = config.get('General', 'openending')
+        if openending == "True":
+            openending = True
+
+    input_type = "raw"
+    if config.has_option('General', 'input_type'):
+        input_type = config.get('General', 'input_type')
+
+    overlap_filtering_setting =  """--max_diff 1000 --max_cov 1000 --min_cov 2"""
+    if config.has_option('General', 'overlap_filtering_setting'):
+        overlap_filtering_setting = config.get('General', 'overlap_filtering_setting')
+
+    pa_HPCdaligner_option = """-v -dal4 -t16 -e.70 -l1000 -s100"""
+    if config.has_option('General', 'pa_HPCdaligner_option'):
+        pa_HPCdaligner_option = config.get('General', 'pa_HPCdaligner_option')
+
+    ovlp_HPCdaligner_option = """ -v -dal24 -t32 -h60 -e.96 -l500 -s1000"""
+    if config.has_option('General', 'ovlp_HPCdaligner_option'):
+        ovlp_HPCdaligner_option = config.get('General', 'ovlp_HPCdaligner_option')
+
+    pa_DBsplit_option = """ -x500 -s200"""
+    if config.has_option('General', 'pa_DBsplit_option'):
+        pa_DBsplit_option = config.get('General', 'pa_DBsplit_option')
+
+    ovlp_DBsplit_option = """ -x500 -s200"""
+    if config.has_option('General', 'ovlp_DBsplit_option'):
+        ovlp_DBsplit_option = config.get('General', 'ovlp_DBsplit_option')
+
+    falcon_sense_option = """ --output_multi --min_idt 0.70 --min_cov 2 --local_match_count_threshold 0 --max_n_read 1800 --n_core 6"""
+    if config.has_option('General', 'falcon_sense_option'):
+        falcon_sense_option = config.get('General', 'falcon_sense_option')
+
+    falcon_sense_skip_contained = "False"
+    if config.has_option('General', 'falcon_sense_skip_contained'):
+        falcon_sense_skip_contained = config.get('General', 'falcon_sense_skip_contained')
+        if falcon_sense_skip_contained in ["True", "true", "1"]:
+            falcon_sense_skip_contained = True
+        else:
+            falcon_sense_skip_contained = False
+
+    length_cutoff = config.getint('General', 'length_cutoff')
+    input_fofn_fn = config.get('General', 'input_fofn')
+
+    length_cutoff_pr = config.getint('General', 'length_cutoff_pr')
+
+    bestn = 12
+    if config.has_option('General', 'bestn'):
+        bestn = config.getint('General', 'bestn')
+
+    if config.has_option('General', 'target'):
+        target = config.get('General', 'target')
+        if target not in ["overlapping", "pre-assembly", "assembly"]:
+            print """ Target has to be "overlapping", "pre-assembly" or "assembly" in this version. You have an unknown target %s in the configuration file.  """ % target
+            raise SystemExit(1)
+    else:
+        print """ No target specified, assuming "assembly" as target """
+        target = "assembly"
+
+    if config.has_option('General', 'use_tmpdir'):
+        use_tmpdir = config.getboolean('General','use_tmpdir')
+    else:
+        use_tmpdir = False
+
+    hgap_config = {"input_fofn_fn" : input_fofn_fn,
+                   "target" : target,
+                   "job_type" : job_type,
+                   "input_type": input_type,
+                   "openending": openending,
+                   "pa_concurrent_jobs" : pa_concurrent_jobs,
+                   "ovlp_concurrent_jobs" : ovlp_concurrent_jobs,
+                   "cns_concurrent_jobs" : cns_concurrent_jobs,
+                   "overlap_filtering_setting": overlap_filtering_setting,
+                   "length_cutoff" : length_cutoff,
+                   "length_cutoff_pr" : length_cutoff_pr,
+                   "sge_option_da": config.get('General', 'sge_option_da'),
+                   "sge_option_la": config.get('General', 'sge_option_la'),
+                   "sge_option_pda": config.get('General', 'sge_option_pda'),
+                   "sge_option_pla": config.get('General', 'sge_option_pla'),
+                   "sge_option_fc": config.get('General', 'sge_option_fc'),
+                   "sge_option_cns": config.get('General', 'sge_option_cns'),
+                   "pa_HPCdaligner_option": pa_HPCdaligner_option,
+                   "ovlp_HPCdaligner_option": ovlp_HPCdaligner_option,
+                   "pa_DBsplit_option": pa_DBsplit_option,
+                   "ovlp_DBsplit_option": ovlp_DBsplit_option,
+                   "falcon_sense_option": falcon_sense_option,
+                   "falcon_sense_skip_contained": falcon_sense_skip_contained,
+                   "use_tmpdir": use_tmpdir,
+                   }
+
+    hgap_config["install_prefix"] = sys.prefix
+
+    return hgap_config
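+
+# A minimal (illustrative) configuration satisfying the options read above; any option not
+# listed here falls back to the defaults in get_config(), and the values shown are examples:
+#   [General]
+#   job_type = local
+#   input_type = raw
+#   input_fofn = input.fofn
+#   length_cutoff = 12000
+#   length_cutoff_pr = 12000
+#   sge_option_da =
+#   sge_option_la =
+#   sge_option_pda =
+#   sge_option_pla =
+#   sge_option_fc =
+#   sge_option_cns =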
+
+default_logging_config = """
+[loggers]
+keys=root,pypeflow,fc_run
+
+[handlers]
+keys=stream,file_pypeflow,file_fc
+
+[formatters]
+keys=form01
+
+[logger_root]
+level=NOTSET
+handlers=stream
+
+[logger_pypeflow]
+level=NOTSET
+handlers=stream
+qualname=pypeflow
+propagate=1
+
+[logger_fc_run]
+level=NOTSET
+handlers=stream
+qualname=fc_run
+propagate=1
+
+[handler_stream]
+class=StreamHandler
+level=INFO
+formatter=form01
+args=(sys.stderr,)
+
+[handler_file_pypeflow]
+class=FileHandler
+level=DEBUG
+formatter=form01
+args=('pypeflow.log',)
+
+[handler_file_fc]
+class=FileHandler
+level=DEBUG
+formatter=form01
+args=('fc_run.log',)
+
+[formatter_form01]
+format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
+"""
+
+def setup_logger(logging_config_fn):
+    """See https://docs.python.org/2/library/logging.config.html
+    """
+    logging.Formatter.converter = time.gmtime # cannot be done in .ini
+
+    if logging_config_fn:
+        logger_fileobj = open(logging_config_fn)
+    else:
+        logger_fileobj = StringIO.StringIO(default_logging_config)
+    defaults = {
+    }
+    logging.config.fileConfig(logger_fileobj, defaults=defaults, disable_existing_loggers=False)
+
+    return logging.getLogger("fc_run")
+
+def make_fofn_abs(i_fofn_fn, o_fofn_fn):
+    """Copy i_fofn to o_fofn, but with relative filenames expanded for CWD.
+    """
+    assert os.path.abspath(o_fofn_fn) != os.path.abspath(i_fofn_fn)
+    with open(i_fofn_fn) as ifs, open(o_fofn_fn, 'w') as ofs:
+        for line in ifs:
+            ifn = line.strip()
+            if not ifn: continue
+            abs_ifn = os.path.abspath(ifn)
+            ofs.write('%s\n' %abs_ifn)
+    #return o_fofn_fn
+
+def make_dirs(d):
+    if not os.path.isdir(d):
+        os.makedirs(d)
+
+def build_rdb(input_fofn_fn, work_dir, config, job_done, script_fn, run_jobs_fn):
+    length_cutoff = config["length_cutoff"]
+    pa_HPCdaligner_option = config["pa_HPCdaligner_option"]
+    pa_DBsplit_option = config["pa_DBsplit_option"]
+    openending = config["openending"]
+
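+    # If a raw_reads.db already exists, pick up its block count so that the
+    # HPCdaligner command below starts from the last existing block instead of
+    # rebuilding everything from block 1.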
+    last_block = 1
+    new_db = True
+    if os.path.exists( os.path.join(work_dir, "raw_reads.db") ):
+        with open(  os.path.join(work_dir, "raw_reads.db") ) as f:
+            for l in f:
+                l = l.strip().split()
+                if l[0] == "blocks" and l[1] == "=":
+                    last_block = int(l[2])
+                    new_db = False
+                    break
+
+    with open(script_fn,"w") as script_file:
+        script_file.write("set -vex\n")
+        script_file.write("trap 'touch {job_done}.exit' EXIT\n".format(job_done = job_done))
+        script_file.write("cd {work_dir}\n".format(work_dir = work_dir))
+        script_file.write("hostname\n")
+        script_file.write("date\n")
+        #script_file.write("for f in `cat {input_fofn_fn}`; do fasta2DB raw_reads $f; done\n".format(input_fofn_fn = input_fofn_fn))
+        script_file.write("fasta2DB -v raw_reads -f{input_fofn_fn}\n".format(input_fofn_fn = input_fofn_fn))
+        if new_db  == True:
+            script_file.write("DBsplit %s raw_reads\n" % pa_DBsplit_option)
+        if openending == True:
+            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3-1}')\n""")
+        else:
+            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3}')\n""")
+        script_file.write("HPCdaligner %s -H%d raw_reads %d-$LB > %s\n" %(
+            pa_HPCdaligner_option, length_cutoff, last_block, run_jobs_fn))
+        script_file.write("touch {job_done}\n".format(job_done = job_done))
+
+def build_pdb(input_fofn_fn, work_dir, config, job_done, script_fn, run_jobs_fn):
+    length_cutoff = config["length_cutoff_pr"]
+    ovlp_HPCdaligner_option = config["ovlp_HPCdaligner_option"]
+    ovlp_DBsplit_option = config["ovlp_DBsplit_option"]
+
+    with open(script_fn,"w") as script_file:
+        script_file.write("set -vex\n")
+        script_file.write("trap 'touch {job_done}.exit' EXIT\n".format(job_done = job_done))
+        script_file.write("cd {work_dir}\n".format(work_dir = work_dir))
+        script_file.write("hostname\n")
+        script_file.write("date\n")
+        script_file.write("fasta2DB -v preads -f{input_fofn_fn}\n".format(input_fofn_fn = input_fofn_fn))
+        script_file.write("DBsplit -x%d %s preads\n" % (length_cutoff, ovlp_DBsplit_option))
+        script_file.write("HPCdaligner %s -H%d preads > %s\n" %(
+            ovlp_HPCdaligner_option, length_cutoff, run_jobs_fn))
+        script_file.write("touch {job_done}\n".format(job_done = job_done))
+
+def run_falcon_asm(pread_dir, db_file, config, job_done, script_fn):
+    wd = os.path.dirname(script_fn)
+    overlap_filtering_setting = config["overlap_filtering_setting"]
+    length_cutoff_pr = config["length_cutoff_pr"]
+
+    script = []
+    script.append( "set -vex" )
+    script.append( "trap 'touch %s.exit' EXIT" % job_done )
+    script.append( "cd %s" % pread_dir )
+    # Write preads4falcon.fasta, in 1-preads_ovl:
+    script.append( "DB2Falcon -U preads")
+    script.append( "cd %s" % wd )
+    script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir )
+    script.append( """fc_ovlp_filter.py --db %s --fofn las.fofn %s --min_len %d > preads.ovl""" %\
+            (db_file, overlap_filtering_setting, length_cutoff_pr) )
+    script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir)
+    script.append( """fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log""" % length_cutoff_pr) # TODO: drop this logfile
+    # Write 'p_ctg.fa' and 'a_ctg.fa':
+    script.append( """fc_graph_to_contig.py""" )
+    script.append( """fc_dedup_a_tigs.py""" )
+    script.append( """touch %s""" % job_done)
+
+    with open(script_fn, "w") as script_file:
+        script_file.write("\n".join(script) + '\n')
+
+def run_daligner(daligner_cmd, db_prefix, nblock, config, job_done, script_fn):
+    cwd = os.path.dirname(script_fn)
+
+    script = []
+    script.append( "set -vex" )
+    script.append( "trap 'touch {job_done}.exit' EXIT".format(job_done = job_done) )
+    script.append( "cd %s" % cwd )
+    script.append( "hostname" )
+    script.append( "date" )
+    if config['use_tmpdir']:
+        basenames = [pattern.format(db_prefix) for pattern in ('.{}.idx', '.{}.bps', '{}.db')]
+        dst_dir = os.path.abspath(cwd)
+        src_dir = os.path.abspath(os.path.dirname(cwd)) # by convention
+        script.extend(use_tmpdir_for_files(basenames, src_dir, dst_dir))
+    script.append( "time "+ daligner_cmd )
+
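+    # Symlink each target block's .las output into the matching merge-task
+    # directory (../m_XXXXX) so the downstream merge step can find it.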
+    for p_id in xrange( 1, nblock+1 ):
+        script.append( """ for f in `find $PWD -wholename "*%s.%d.%s.*.*.las"`; do ln -sf $f ../m_%05d; done """  % (db_prefix, p_id, db_prefix, p_id) )
+
+    script.append( "touch {job_done}".format(job_done = job_done) )
+
+    with open(script_fn,"w") as script_file:
+        script_file.write("\n".join(script) + '\n')
+
+def run_las_merge(p_script_fn, job_done, config, script_fn):
+    cwd = os.path.dirname(script_fn)
+    script = []
+    script.append( "set -vex" )
+    script.append( "trap 'touch {job_done}.exit' EXIT".format(job_done = job_done) )
+    script.append( "cd %s" % cwd )
+    script.append( "hostname" )
+    script.append( "date" )
+    script.append( "time bash %s" % p_script_fn )
+    script.append( "touch {job_done}".format(job_done = job_done) )
+
+    with open(script_fn,"w") as script_file:
+        script_file.write("\n".join(script) + '\n')
+
+def run_consensus(job_id, out_file_fn, prefix, config, job_done, script_fn):
+    cwd = os.path.dirname(script_fn)
+    falcon_sense_option = config["falcon_sense_option"]
+    length_cutoff = config["length_cutoff"]
+
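+    # First write a small sub-script (cp_NNNNN.sh) that pipes LA4Falcon output
+    # into fc_consensus.py; the wrapper script below just runs it under "time".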
+    c_script_fn = os.path.join(cwd, "cp_%05d.sh" % job_id)
+    with open(c_script_fn, "w") as c_script:
+        print >> c_script, "set -vex"
+        print >> c_script, "trap 'touch {job_done}.exit' EXIT".format(job_done = job_done)
+        print >> c_script, "cd .."
+        if config["falcon_sense_skip_contained"] == True:
+            print >> c_script, """LA4Falcon -H%d -fso %s las_files/%s.%d.las | """ % (length_cutoff, prefix, prefix, job_id),
+        else:
+            print >> c_script, """LA4Falcon -H%d -fo %s las_files/%s.%d.las | """ % (length_cutoff, prefix, prefix, job_id),
+        print >> c_script, """fc_consensus.py %s > %s""" % (falcon_sense_option, out_file_fn)
+        print >> c_script, "touch {job_done}".format(job_done = job_done)
+
+    script = []
+    script.append( "set -vex" )
+    script.append( "cd %s" % cwd )
+    script.append( "hostname" )
+    script.append( "date" )
+    script.append( "time bash %s" %os.path.basename(c_script_fn) )
+
+    with open(script_fn,"w") as script_file:
+        script_file.write("\n".join(script) + '\n')
diff --git a/src/py/util/__init__.py b/src/py/util/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/py/util/io.py b/src/py/util/io.py
new file mode 100644
index 0000000..2054604
--- /dev/null
+++ b/src/py/util/io.py
@@ -0,0 +1,162 @@
+"""I/O utilities
+Not specific to FALCON.
+"""
+import os
+import resource
+import shlex
+import subprocess as sp
+import sys
+import traceback
+
+def write_nothing(*args):
+    """
+    To use,
+      LOG = write_nothing
+    """
+
+def write_with_pid(*args):
+    msg = '[%d]%s\n' %(os.getpid(), ' '.join(args))
+    sys.stderr.write(msg)
+
+LOG = write_with_pid
+
+def logstats():
+    """This is useful 'atexit'.
+    """
+    LOG('maxrss:%9d' %(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
+
+def reprarg(arg):
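+    """repr(arg), except that containers with more than 9 elements are
+    summarized as 'type(N elem)' to keep log lines short.
+    """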
+    if (isinstance(arg, set) or isinstance(arg, list)
+            or isinstance(arg, tuple) or isinstance(arg, dict)):
+        if len(arg) > 9:
+            return '%s(%d elem)' %(type(arg).__name__, len(arg))
+    return repr(arg)
+
+def run_func(args):
+    """Wrap multiprocessing.Pool calls.
+    Usage:
+        pool.imap(run_func, [(func, arg0, arg1, ...), ...])
+    """
+    func = args[0]
+    try:
+        func_name = func.__name__
+    except:
+        func_name = repr(func) # but since it must be pickle-able, this should never happen.
+    args = args[1:]
+    try:
+        LOG('starting %s(%s)' %(func_name, ', '.join(reprarg(a) for a in args)))
+        logstats()
+        ret = func(*args)
+        logstats()
+        LOG('finished %s(%s)' %(func_name, ', '.join(reprarg(a) for a in args)))
+        return ret
+    except Exception:
+        raise Exception(traceback.format_exc())
+    except: # KeyboardInterrupt, SystemExit
+        LOG('interrupted %s(%s)' %(func_name, ', '.join(reprarg(a) for a in args)))
+        return
+
+def syscall(cmd):
+    """Return stdout, fully captured.
+    Wait for subproc to finish.
+    Warn if empty.
+    Raise on non-zero exit-code.
+    """
+    LOG('$ %s >' %cmd)
+    output = sp.check_output(shlex.split(cmd))
+    if not output:
+        msg = '%r failed to produce any output.' %cmd
+        LOG('WARNING: %s' %msg)
+    return output
+
+def slurplines(cmd):
+    return syscall(cmd).splitlines()
+
+def streamlines(cmd):
+    """Stream stdout from cmd.
+    Let stderr fall through.
+    The returned reader will stop yielding when the subproc exits.
+    Note: We do not detect a failure in the underlying process.
+    """
+    LOG('$ %s |' %cmd)
+    proc = sp.Popen(shlex.split(cmd), stdout=sp.PIPE)
+    return proc.stdout
+
+class DataReaderContext(object):
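+    """Context manager with the same readlines() interface as
+    ProcessReaderContext, but backed by an in-memory string instead of a
+    subprocess.
+    """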
+    def readlines(self):
+        output = self.data.strip()
+        for line in output.splitlines():
+            yield line
+    def __enter__(self):
+        pass
+    def __exit__(self, *args):
+        self.returncode = 0
+    def __init__(self, data):
+        self.data = data
+class ProcessReaderContext(object):
+    """Prefer this to slurplines() or streamlines().
+    """
+    def __enter__(self):
+        self.proc = sp.Popen(shlex.split(self.cmd), stdout=sp.PIPE)
+    def __exit__(self, etype, evalue, etb):
+        if etype is None:
+            self.proc.wait()
+        else:
+            # Exception was raised in "with-block".
+            # We cannot wait on proc b/c it might never finish!
+            pass
+        self.returncode = self.proc.returncode
+        if self.returncode:
+            msg = "%r <- %r" %(self.returncode, self.cmd)
+            raise Exception(msg)
+        del self.proc
+    def __init__(self, cmd):
+        self.cmd = cmd
+class CapturedProcessReaderContext(ProcessReaderContext):
+    def readlines(self):
+        """Usage:
+
+            cmd = 'ls -l'
+            reader = CapturedProcessReaderContext(cmd)
+            with reader:
+                for line in reader.readlines():
+                    print line
+
+        Any exception within the 'with-block' is propagated.
+        Otherwise, after all lines are read, if 'cmd' failed, Exception is raised.
+        """
+        output, _ = self.proc.communicate()
+        for line in output.splitlines():
+            yield line
+class StreamedProcessReaderContext(ProcessReaderContext):
+    def readlines(self):
+        """Usage:
+
+            cmd = 'ls -l'
+            reader = StreamedProcessReaderContext(cmd)
+            with reader:
+                for line in reader.readlines():
+                    print line
+
+        Any exception within the 'with-block' is propagated.
+        Otherwise, after all lines are read, if 'cmd' failed, Exception is raised.
+        """
+        for line in self.proc.stdout:
+            yield line
+
+def filesize(fn):
+    """In bytes.
+    Raise if fn does not exist.
+    """
+    statinfo = os.stat(fn)
+    return statinfo.st_size
+
+def validated_fns(fofn):
+    """Return list of filenames from fofn.
+    Assert none are empty or non-existent.
+    """
+    fns = open(fofn).read().strip().split("\n")
+    for fn in fns:
+        assert fn
+        assert os.path.isfile(fn)
+        assert filesize(fn)
+    return fns
diff --git a/src/py_scripts/fc_actg_coordinate.py b/src/py_scripts/fc_actg_coordinate.py
new file mode 100644
index 0000000..4fb5579
--- /dev/null
+++ b/src/py_scripts/fc_actg_coordinate.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.actg_coordinate import main
+import sys
+
+if __name__ == "__main__":
+    main(*sys.argv)
diff --git a/src/py_scripts/fc_consensus.py b/src/py_scripts/fc_consensus.py
new file mode 100644
index 0000000..0621991
--- /dev/null
+++ b/src/py_scripts/fc_consensus.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.consensus import main
+import sys
+
+if __name__ == "__main__":
+    main(*sys.argv)
diff --git a/src/py_scripts/fc_contig_annotate.py b/src/py_scripts/fc_contig_annotate.py
new file mode 100644
index 0000000..90fbb0c
--- /dev/null
+++ b/src/py_scripts/fc_contig_annotate.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.contig_annotate import main
+import sys
+
+if __name__ == "__main__":
+    main(*sys.argv)
diff --git a/src/py_scripts/fc_ctg_link_analysis.py b/src/py_scripts/fc_ctg_link_analysis.py
new file mode 100644
index 0000000..208d576
--- /dev/null
+++ b/src/py_scripts/fc_ctg_link_analysis.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.ctg_link_analysis import main
+import sys
+
+if __name__ == "__main__":
+    main(*sys.argv)
diff --git a/src/py_scripts/fc_dedup_a_tigs.py b/src/py_scripts/fc_dedup_a_tigs.py
new file mode 100644
index 0000000..301a735
--- /dev/null
+++ b/src/py_scripts/fc_dedup_a_tigs.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.dedup_a_tigs import main
+import sys
+
+if __name__ == "__main__":
+    main(*sys.argv)
diff --git a/src/py_scripts/fc_graph_to_contig.py b/src/py_scripts/fc_graph_to_contig.py
new file mode 100644
index 0000000..cbe9afa
--- /dev/null
+++ b/src/py_scripts/fc_graph_to_contig.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.graph_to_contig import main
+import sys
+
+if __name__ == "__main__":
+    main(*sys.argv)
diff --git a/src/py_scripts/fc_graph_to_utgs.py b/src/py_scripts/fc_graph_to_utgs.py
new file mode 100644
index 0000000..0352358
--- /dev/null
+++ b/src/py_scripts/fc_graph_to_utgs.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.graph_to_utgs import main
+import sys
+
+if __name__ == "__main__":
+    main(*sys.argv)
diff --git a/src/py_scripts/fc_ovlp_filter.py b/src/py_scripts/fc_ovlp_filter.py
new file mode 100644
index 0000000..3cf053a
--- /dev/null
+++ b/src/py_scripts/fc_ovlp_filter.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.ovlp_filter import main
+import sys
+
+if __name__ == "__main__":
+    main(*sys.argv)
diff --git a/src/py_scripts/fc_ovlp_stats.py b/src/py_scripts/fc_ovlp_stats.py
new file mode 100644
index 0000000..e52e367
--- /dev/null
+++ b/src/py_scripts/fc_ovlp_stats.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.ovlp_stats import main
+import sys
+
+if __name__ == "__main__":
+    main(*sys.argv)
diff --git a/src/py_scripts/fc_ovlp_to_graph.py b/src/py_scripts/fc_ovlp_to_graph.py
new file mode 100644
index 0000000..dad890a
--- /dev/null
+++ b/src/py_scripts/fc_ovlp_to_graph.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.ovlp_to_graph import main
+import sys
+
+if __name__ == "__main__":
+    main(*sys.argv)
diff --git a/src/py_scripts/fc_run.cfg b/src/py_scripts/fc_run.cfg
new file mode 100644
index 0000000..8ed18e0
--- /dev/null
+++ b/src/py_scripts/fc_run.cfg
@@ -0,0 +1,38 @@
+[General]
+# file of files: each line is the path to one fasta file of the initial reads
+input_fofn = input.fofn
+#input_fofn = preads.fofn
+
+input_type = raw
+#input_type = preads
+
+# The length cutoff used for seed reads used for initial mapping
+length_cutoff = 10000
+
+# The length cutoff used for seed reads used for pre-assembly
+length_cutoff_pr = 10000
+
+# target = pre-assembly
+# target = overlapping
+target = assembly
+
+sge_option_da = -pe smp 8 -q bigmem
+sge_option_la = -pe smp 2 -q bigmem
+sge_option_pda = -pe smp 8 -q bigmem 
+sge_option_pla = -pe smp 2 -q bigmem
+sge_option_fc = -pe smp 24 -q bigmem
+sge_option_cns = -pe smp 8 -q bigmem
+
+pa_concurrent_jobs = 32
+cns_concurrent_jobs = 32
+ovlp_concurrent_jobs = 32
+
+pa_HPCdaligner_option =  -v -dal128 -t16 -e.70 -l1000 -s1000 
+ovlp_HPCdaligner_option = -v -dal128 -t32 -h60 -e.96 -l500 -s1000 
+
+pa_DBsplit_option = -x500 -s400
+ovlp_DBsplit_option = -x500 -s400
+
+falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 2 --max_n_read 200 --n_core 6
+
+overlap_filtering_setting = --max_diff 100 --max_cov 100 --min_cov 1  --bestn 10
diff --git a/src/py_scripts_v0.1/falcon_asm.py b/src/py_scripts_v0.1/falcon_asm.py
new file mode 100755
index 0000000..1534b44
--- /dev/null
+++ b/src/py_scripts_v0.1/falcon_asm.py
@@ -0,0 +1,1154 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from pbcore.io import FastaReader
+import networkx as nx
+import os
+import shlex
+import sys
+import subprocess
+
+DEBUG_LOG_LEVEL = 0
+
+class SGNode(object):
+    """
+    class representing a node in the string graph
+    """
+    def __init__(self, node_name):
+        self.name = node_name
+        self.out_edges = []
+        self.in_edges = []
+    def add_out_edge(self, out_edge):
+        self.out_edges.append(out_edge)
+    def add_in_edge(self, in_edge):
+        self.in_edges.append(in_edge)
+
+class SGEdge(object):
+    """
+    class representing an edge in the string graph
+    """
+    def __init__(self, in_node, out_node):
+        self.in_node = in_node
+        self.out_node = out_node
+        self.attr = {}
+    def set_attribute(self, attr, value):
+        self.attr[attr] = value
+
+def reverse_end( node_id ):
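+    """Flip the end label of a node id, e.g. "000000001:B" <-> "000000001:E"."""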
+    node_id, end = node_id.split(":")
+    new_end = "B" if end == "E" else "E"
+    return node_id + ":" + new_end
+
+class StringGraph(object):
+    """
+    class representing the string graph
+    """
+    def __init__(self):
+        self.nodes = {}
+        self.edges = {}
+        self.n_mark = {}
+        self.e_reduce = {}
+        self.repeat_overlap = {}
+        
+    def add_node(self, node_name):
+        """ 
+        add a node to the graph, given a node name
+        """
+        if node_name not in self.nodes:
+            self.nodes[node_name] = SGNode(node_name)
+    
+    def add_edge(self, in_node_name, out_node_name, **attributes):
+        """ 
+        add an edge to the graph, given a pair of node names
+        """
+        if (in_node_name, out_node_name) not in self.edges:
+        
+            self.add_node(in_node_name)
+            self.add_node(out_node_name)
+            in_node = self.nodes[in_node_name]
+            out_node = self.nodes[out_node_name]    
+            
+            edge = SGEdge(in_node, out_node)
+            self.edges[ (in_node_name, out_node_name) ] = edge
+            in_node.add_out_edge(edge)
+            out_node.add_in_edge(edge)
+        edge =  self.edges[ (in_node_name, out_node_name) ]
+        for k, v in attributes.items():
+            edge.attr[k] = v
+
+    def init_reduce_dict(self):
+        for e in self.edges:
+            self.e_reduce[e] = False
+
+    def mark_chimer_edge(self):
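+        """Mark an edge (v, w) as reduced when v and w share no other overlapping
+        neighbors; such isolated overlaps are most likely caused by chimeric reads.
+        """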
+
+        for e_n, e in self.edges.items():
+            v = e_n[0]
+            w = e_n[1]
+            overlap_count = 0
+            for w_out_e in self.nodes[w].out_edges:
+                w_out_n = w_out_e.out_node.name
+                if (v, w_out_n) in self.edges:
+                    overlap_count += 1
+            for v_in_e in self.nodes[v].in_edges:
+                v_in_n = v_in_e.in_node.name
+                if (v_in_n, w) in self.edges:
+                    overlap_count += 1
+            if self.e_reduce[ (v, w) ] != True:
+                if overlap_count == 0:
+                    self.e_reduce[(v, w)] = True
+                    #print "XXX: chimer edge %s %s removed" % (v, w)
+                    v, w = reverse_end(w), reverse_end(v)
+                    self.e_reduce[(v, w)] = True
+                    #print "XXX: chimer edge %s %s removed" % (v, w)
+
+
+
+    def mark_spur_edge(self):
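+        """Mark spur edges as reduced: at a node with multiple out-edges (or
+        in-edges), drop the branches that immediately dead-end.
+        """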
+
+        for  v in self.nodes:
+            if len(self.nodes[v].out_edges) > 1:
+                for out_edge in self.nodes[v].out_edges:
+                    w = out_edge.out_node.name
+                    
+                    if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
+                        #print "XXX: spur edge %s %s removed" % (v, w)
+                        self.e_reduce[(v, w)] = True
+                        v2, w2 = reverse_end(w), reverse_end(v)
+                        #print "XXX: spur edge %s %s removed" % (v2, w2)
+                        self.e_reduce[(v2, w2)] = True
+
+            if len(self.nodes[v].in_edges) > 1:
+                for in_edge in self.nodes[v].in_edges:
+                    w = in_edge.in_node.name
+                    if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
+                        #print "XXX: spur edge %s %s removed" % (w, v)
+                        self.e_reduce[(w, v)] = True
+                        v2, w2 = reverse_end(w), reverse_end(v)
+                        #print "XXX: spur edge %s %s removed" % (w2, v2)
+                        self.e_reduce[(w2, v2)] = True
+
+
+    def mark_tr_edges(self):
+        """
+        transitive reduction: mark edges implied by a pair of shorter overlapping
+        edges (within a FUZZ-length tolerance) as reduced
+        """
+        n_mark = self.n_mark
+        e_reduce = self.e_reduce
+        FUZZ = 500
+        for n in self.nodes:
+            n_mark[n] = "vacant"
+    
+        for n_name, node in self.nodes.items():
+
+            out_edges = node.out_edges
+            if len(out_edges) == 0:
+                continue
+            
+            out_edges.sort(key=lambda x: x.attr["length"])
+            
+            for e in out_edges:
+                w = e.out_node
+                n_mark[ w.name ] = "inplay"
+            
+            max_len = out_edges[-1].attr["length"]
+                
+            max_len += FUZZ
+            
+            for e in out_edges:
+                e_len = e.attr["length"]
+                w = e.out_node
+                if n_mark[w.name] == "inplay":
+                    w.out_edges.sort( key=lambda x: x.attr["length"] )
+                    for e2 in w.out_edges:
+                        if e2.attr["length"] + e_len < max_len:
+                            x = e2.out_node
+                            if n_mark[x.name] == "inplay":
+                                n_mark[x.name] = "eliminated"
+            
+            for e in out_edges:
+                e_len = e.attr["length"]
+                w = e.out_node
+                w.out_edges.sort( key=lambda x: x.attr["length"] )
+                if len(w.out_edges) > 0:
+                    x = w.out_edges[0].out_node
+                    if n_mark[x.name] == "inplay":
+                        n_mark[x.name] = "eliminated"
+                for e2 in w.out_edges:
+                    if e2.attr["length"] < FUZZ:
+                        x = e2.out_node
+                        if n_mark[x.name] == "inplay":
+                            n_mark[x.name] = "eliminated"
+                            
+            for out_edge in out_edges:
+                v = out_edge.in_node
+                w = out_edge.out_node
+                if n_mark[w.name] == "eliminated":
+                    e_reduce[ (v.name, w.name) ] = True
+                    #print "XXX: tr edge %s %s removed" % (v.name, w.name)
+                    v_name, w_name = reverse_end(w.name), reverse_end(v.name)
+                    e_reduce[(v_name, w_name)] = True
+                    #print "XXX: tr edge %s %s removed" % (v_name, w_name)
+                n_mark[w.name] = "vacant"
+                
+
+    def mark_best_overlap(self):
+        """
+        keep only the best-scoring in-edge and out-edge of each node;
+        all other edges are marked as reduced
+        """
+
+        best_edges = set()
+
+        for v in self.nodes:
+
+            out_edges = self.nodes[v].out_edges
+            if len(out_edges) > 0:
+                out_edges.sort(key=lambda e: e.attr["score"])
+                e = out_edges[-1]
+                best_edges.add( (e.in_node.name, e.out_node.name) )
+
+            in_edges = self.nodes[v].in_edges
+            if len(in_edges) > 0:
+                in_edges.sort(key=lambda e: e.attr["score"])
+                e = in_edges[-1]
+                best_edges.add( (e.in_node.name, e.out_node.name) )
+
+        if DEBUG_LOG_LEVEL > 1:
+            print "X", len(best_edges)
+
+        for e_n, e in self.edges.items():
+            v = e_n[0]
+            w = e_n[1]
+            if self.e_reduce[ (v, w) ] != True:
+                if (v, w) not in best_edges:
+                    self.e_reduce[(v, w)] = True
+                    #print "XXX: in best edge %s %s removed" % (v, w)
+                    v2, w2 = reverse_end(w), reverse_end(v)
+                    #print "XXX: in best edge %s %s removed" % (v2, w2)
+                    self.e_reduce[(v2, w2)] = True
+                
+    def get_out_edges_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].out_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        return rtn
+        
+        
+    def get_in_edges_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].in_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        return rtn
+
+    def get_best_out_edge_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].out_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        rtn.sort(key=lambda e: e.attr["score"])
+
+        return rtn[-1]
+
+    def get_best_in_edge_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].in_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        rtn.sort(key=lambda e: e.attr["score"])
+        return rtn[-1]
+        
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+def generate_seq_from_path(sg, seqs, path):
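+    """Concatenate the sequence segments along the edges of `path`, taking the
+    reverse complement of a segment whose label coordinates run from a larger to
+    a smaller position.
+    """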
+    subseqs = []
+    r_id, end = path[0].split(":")
+    
+    count = 0
+    for i in range( len( path ) -1 ):
+        w_n, v_n = path[i:i+2]
+        edge = sg.edges[ (w_n, v_n ) ]
+        read_id, coor = edge.attr["label"].split(":")
+        b,e = coor.split("-")
+        b = int(b)
+        e = int(e)
+        if b < e:
+            subseqs.append( seqs[read_id][b:e] )
+        else:
+            subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
+
+    return "".join(subseqs)
+
+
+def reverse_path( path ):
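+    """Return the reverse-complement path: the node order is reversed and each
+    node's B/E end is flipped.
+    """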
+    new_path = []
+    for n in list(path[::-1]):
+        rid, end = n.split(":")
+        new_end = "B" if end == "E" else "E"
+        new_path.append( rid+":"+new_end)
+    return new_path
+
+
+def generate_unitig(sg, seqs, out_fn, connected_nodes = None):
+
+    """
+    given a string graph (sg) and the sequences (seqs), write the unitig fasta file to out_fn;
+    the function returns a dictionary of unitig edges, (start_node, end_node) -> list of (path, sequence)
+
+    some extra files generated:
+        unit_edges.dat : an easy-to-parse file of the unitig data
+        unit_edge_paths : a file containing the path of every unitig
+        uni_graph.gexf: the unitig graph in GEXF format for visualization
+    """
+
+    G = SGToNXG(sg)
+    if connected_nodes != None:
+        connected_nodes = set(sg.nodes)
+    out_fasta = open(out_fn, "w")
+    nodes_for_tig = set()
+    sg_edges = set()
+    for v, w in sg.edges:
+        if sg.e_reduce[(v, w)] != True:
+            sg_edges.add( (v, w) )
+    count = 0
+    edges_in_tigs = set()
+
+    uni_edges = {}
+    path_f = open("unit_edge_paths","w")
+    uni_edge_f = open("unit_edges.dat", "w")
+    while len(sg_edges) > 0:
+        v, w = sg_edges.pop()
+
+        #nodes_for_tig.remove(n)
+        upstream_nodes = []
+        
+        c_node = v
+        p_in_edges = sg.get_in_edges_for_node(c_node)
+        p_out_edges = sg.get_out_edges_for_node(c_node)
+        while len(p_in_edges) == 1 and len(p_out_edges) == 1:
+            p_node = p_in_edges[0].in_node
+            upstream_nodes.append(p_node.name)
+            if (p_node.name, c_node) not in  sg_edges:
+                break
+            p_in_edges = sg.get_in_edges_for_node(p_node.name)
+            p_out_edges = sg.get_out_edges_for_node(p_node.name)
+            c_node = p_node.name
+
+        upstream_nodes.reverse()  
+            
+        downstream_nodes = []
+        c_node = w 
+        n_out_edges = sg.get_out_edges_for_node(c_node)
+        n_in_edges = sg.get_in_edges_for_node(c_node)
+        while len(n_out_edges) == 1 and len(n_in_edges) == 1:
+            n_node = n_out_edges[0].out_node
+            downstream_nodes.append(n_node.name)
+            if (c_node, n_node.name) not in  sg_edges:
+                break
+            n_out_edges = sg.get_out_edges_for_node(n_node.name)
+            n_in_edges = sg.get_in_edges_for_node(n_node.name)
+            c_node = n_node.name 
+        
+        whole_path = upstream_nodes + [v, w] + downstream_nodes
+        count += 1
+        subseq = generate_seq_from_path(sg, seqs, whole_path) 
+        uni_edges.setdefault( (whole_path[0], whole_path[-1]), [] )
+        uni_edges[(whole_path[0], whole_path[-1])].append(  ( whole_path, subseq ) )
+        print >> uni_edge_f, whole_path[0], whole_path[-1], "-".join(whole_path), subseq
+        print >>path_f, ">%05dc-%s-%s-%d %s" % (count, whole_path[0], whole_path[-1], len(whole_path), " ".join(whole_path))
+        print >>out_fasta, ">%05dc-%s-%s-%d" % (count, whole_path[0], whole_path[-1], len(whole_path))
+        print >>out_fasta, subseq
+        for i in range( len( whole_path ) -1 ):
+            w_n, v_n = whole_path[i:i+2]
+            try:
+                sg_edges.remove( (w_n, v_n) )
+            except KeyError: #if an edge is already deleted, ignore it
+                pass
+
+        r_whole_path = reverse_path( whole_path )
+        count += 1
+        subseq = generate_seq_from_path(sg, seqs, r_whole_path) 
+        uni_edges.setdefault( (r_whole_path[0], r_whole_path[-1]), [] )
+        uni_edges[(r_whole_path[0], r_whole_path[-1])].append(  ( r_whole_path, subseq ) )
+        print >> uni_edge_f, r_whole_path[0], r_whole_path[-1], "-".join(r_whole_path), subseq
+        print >>path_f, ">%05dc-%s-%s-%d %s" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path), " ".join(r_whole_path))
+        print >>out_fasta, ">%05dc-%s-%s-%d" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path))
+        print >>out_fasta, subseq
+        for i in range( len( r_whole_path ) -1 ):
+            w_n, v_n = r_whole_path[i:i+2]
+            try:
+                sg_edges.remove( (w_n, v_n) )
+            except KeyError: #if an edge is already deleted, ignore it
+                pass
+
+
+    path_f.close()
+    uni_edge_f.close()
+    #uni_graph = nx.DiGraph()
+    #for n1, n2 in uni_edges.keys():
+    #    uni_graph.add_edge(n1, n2, count = len( uni_edges[ (n1,n2) ] ))
+    #nx.write_gexf(uni_graph, "uni_graph.gexf")
+
+    out_fasta.close()
+    return uni_edges
+
+def neighbor_bound(G, v, w, radius):
+    """
+    test if the node v and the node w are connected within a radius in graph G
+    """
+    g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
+    g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
+    if len(set(g1.edges()) & set(g2.edges())) > 0:
+        return True
+    else:
+        return False
+
+
+def is_branch_node(G, n):
+    """
+    test whether the node n is a "branch node" which the paths from any of two of 
+    its offsprings do not intersect within a given radius
+    """
+    out_edges = G.out_edges([n])
+    n2 = [ e[1] for e in out_edges ]
+    is_branch = False
+    for i in range(len(n2)):
+        for j in range(i+1, len(n2)):
+            v = n2[i]
+            w = n2[j]
+            if neighbor_bound(G, v, w, 10) == False:
+                is_branch = True
+                break
+        if is_branch == True:
+            break
+    return is_branch
+
+
+def get_bundle( path, u_graph ):
+
+    """ 
+    find a sub-graph containing the nodes between the start and the end of the path
+    inputs:
+        path : a path in the unitig graph
+        u_graph : a unitig graph
+    returns:
+        bundle_graph: the whole bundle graph 
+        bundle_paths: the paths in the bundle graph 
+        sub_graph2_edges: all edges of the bundle graph
+    
+    """
+
+    p_start, p_end = path[0], path[-1]
+    p_nodes = set(path)
+    p_edges = set(zip(path[:-1], path[1:]))
+
+    u_graph_r = u_graph.reverse()
+    down_path = nx.ego_graph(u_graph, p_start, radius=len(p_nodes), undirected=False)
+    up_path = nx.ego_graph(u_graph_r, p_end, radius=len(p_nodes), undirected=False)
+    subgraph_nodes = set(down_path) & set(up_path)
+    
+
+    sub_graph = nx.DiGraph()
+    for v, w in u_graph.edges_iter():
+        if v in subgraph_nodes and w in subgraph_nodes:            
+            if (v, w) in p_edges:
+                sub_graph.add_edge(v, w, color = "red")
+            else:
+                sub_graph.add_edge(v, w, color = "black")
+
+    sub_graph2 = nx.DiGraph()
+    tips = set()
+    tips.add(path[0])
+    sub_graph_r = sub_graph.reverse()
+    visited = set()
+    ct = 0
+    is_branch = is_branch_node(sub_graph, path[0]) #if the start node is a branch node
+    if is_branch:
+        n = tips.pop()
+        e = sub_graph.out_edges([n])[0] #pick one path to build the subgraph
+        sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
+        if e[1] not in visited:
+            last_node = e[1]
+            visited.add(e[1])
+            r_id, orientation = e[1].split(":")
+            orientation = "E" if orientation == "B" else "E"
+            visited.add( r_id +":" + orientation)
+            if not is_branch_node(sub_graph_r, e[1]): 
+                tips.add(e[1])
+        
+    while len(tips) != 0:
+        n = tips.pop()
+        out_edges = sub_graph.out_edges([n])
+        if len(out_edges) == 1:
+            e = out_edges[0]
+            sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
+            last_node = e[1]
+            if e[1] not in visited:                       
+                visited.add(e[1])
+                r_id, orientation = e[1].split(":")
+                orientation = "E" if orientation == "B" else "E"
+                visited.add( r_id +":" + orientation)
+                if not is_branch_node(sub_graph_r, e[1]): 
+                    tips.add(e[1])
+        else:
+        
+            is_branch = is_branch_node(sub_graph, n)
+            if not is_branch:
+                for e in out_edges:
+                    sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
+                    last_node = e[1]
+                    if e[1] not in visited:
+                        r_id, orientation = e[1].split(":")
+                        visited.add(e[1])
+                        orientation = "E" if orientation == "B" else "E"
+                        visited.add( r_id +":" + orientation)
+                        if not is_branch_node(sub_graph_r, e[1]):
+                            tips.add(e[1])
+        ct += 1
+    last_node = None
+    longest_len = 0
+        
+    sub_graph2_nodes = sub_graph2.nodes()
+    sub_graph2_edges = sub_graph2.edges()
+
+
+    new_path = [path[0]]
+    for n in sub_graph2_nodes:
+        if len(sub_graph2.out_edges(n)) == 0 :
+            path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
+            path_len = len(path_t)
+            if path_len > longest_len:
+                last_node = n
+                longest_len = path_len
+                new_path = path_t
+
+    if last_node == None:
+        for n in sub_graph2_nodes:
+            path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
+            path_len = len(path_t)
+            if path_len > longest_len:
+                last_node = n
+                longest_len = path_len
+                new_path = path_t
+
+
+    path = new_path
+
+    # clean up sub_graph2 according to new begin and end
+    sub_graph2_r = sub_graph2.reverse()
+    down_path = nx.ego_graph(sub_graph2, path[0], radius=len(path), undirected=False)
+    up_path = nx.ego_graph(sub_graph2_r, path[-1], radius=len(path), undirected=False)
+    subgraph_nodes = set(down_path) & set(up_path)
+    for v in sub_graph2_nodes:
+        if v not in subgraph_nodes:
+            sub_graph2.remove_node(v)
+    
+    if DEBUG_LOG_LEVEL > 1:
+        print "new_path", path[0], last_node, len(sub_graph2_nodes), path
+
+
+    bundle_paths = [path]
+    p_nodes = set(path)
+    p_edges = set(zip(path[:-1], path[1:]))
+
+    sub_graph2_nodes = sub_graph2.nodes()
+    sub_graph2_edges = sub_graph2.edges()
+
+    nodes_idx = dict( [ (n[1], n[0]) for n in enumerate(path) ]  )
+    
+         
+    # create a list of subpaths that have no branches
+    non_branch_subpaths = []
+    wi = 0
+    vi = 0
+    v = path[0]
+    while v != path[-1] and wi < len(path)-1:
+        wi += 1
+        w = path[wi]
+        while len( sub_graph2.successors(w) ) == 1 and len( sub_graph2.predecessors(w) ) == 1 and wi < len(path)-1:
+            wi += 1
+            w = path[wi]
+        if  len( sub_graph2.successors(v) )!= 1 or len( sub_graph2.predecessors(w) )!= 1:
+            branched = True
+        else:
+            branched = False
+
+        if not branched:
+            non_branch_subpaths.append( path[vi:wi+1] )
+        v = w
+        vi = wi
+
+    # create the associate_graph that contains the edges of the alternative (off-path) subpaths
+    
+    associate_graph = nx.DiGraph()
+    for v, w in sub_graph2.edges_iter():
+        if (v, w) not in p_edges:
+            associate_graph.add_edge(v, w, n_weight = sub_graph2[v][w]["n_weight"])
+
+    if DEBUG_LOG_LEVEL > 1:
+        print "associate_graph size:", len(associate_graph)           
+        print "non_branch_subpaths",len(non_branch_subpaths), non_branch_subpaths
+
+    # construct the bundle graph                
+    associate_graph_nodes = set(associate_graph.nodes())
+    bundle_graph = nx.DiGraph()
+    bundle_graph.add_path( path )
+    for i in range(len(non_branch_subpaths)-1):
+        if len(non_branch_subpaths[i]) == 0 or len( non_branch_subpaths[i+1] ) == 0:
+            continue
+        e1, e2 = non_branch_subpaths[i: i+2]
+        v = e1[-1]
+        w = e2[0]
+        if v == w:
+            continue
+        in_between_node_count = nodes_idx[w] - nodes_idx[v] 
+        if v in associate_graph_nodes and w in associate_graph_nodes:
+            try:
+                a_path = nx.shortest_path(associate_graph, v, w, "n_weight")    
+            except nx.NetworkXNoPath:
+                continue
+            bundle_graph.add_path( a_path )      
+            bundle_paths.append( a_path )
+
+    return bundle_graph, bundle_paths, sub_graph2_edges
+            
+def get_bundles(u_edges):
+    
+    """
+    input: all unitig edges
+    output: the assembled primary_tigs.fa and all_tigs.fa
+    """
+
+    ASM_graph = nx.DiGraph()
+    out_f = open("primary_tigs.fa", "w")
+    main_tig_paths = open("primary_tigs_paths","w")
+    sv_tigs = open("all_tigs.fa","w")
+    sv_tig_paths = open("all_tigs_paths","w")
+    max_weight = 0 
+    for v, w in u_edges:
+        x = max( [len(s[1]) for s in u_edges[ (v,w) ] ] )
+        if DEBUG_LOG_LEVEL > 1:
+            print "W", v, w, x
+        if x > max_weight:
+            max_weight = x
+            
+    in_edges = {}
+    out_edges = {}
+    for v, w in u_edges:
+        in_edges.setdefault(w, []) 
+        out_edges.setdefault(w, []) 
+        in_edges[w].append( (v, w) )
+
+        out_edges.setdefault(v, [])
+        in_edges.setdefault(v, [])
+        out_edges[v].append( (v, w) )
+
+    u_graph = nx.DiGraph()
+    for v,w in u_edges:
+
+        u_graph.add_edge(v, w, n_weight = max_weight - max( [len(s[1]) for s in  u_edges[ (v,w) ] ] ) )
+    
+    bundle_edge_out = open("bundle_edges","w")
+    bundle_index = 0
+    G = u_graph.copy()
+    visited_u_edges = set()
+    while len(G) > 0:
+        
+        root_nodes = set() 
+        for n in G: 
+            if G.in_degree(n) == 0: 
+                root_nodes.add(n) 
+
+        if len(root_nodes) == 0:
+            for n in G:
+                if G.in_degree(n) != 1:
+                    root_nodes.add(n)
+        
+        if len(root_nodes) == 0:  
+            root_nodes.add( G.nodes()[0] ) 
+        
+        candidates = [] 
+        
+        for n in list(root_nodes): 
+            sp =nx.single_source_shortest_path_length(G, n) 
+            sp = sp.items() 
+            sp.sort(key=lambda x : x[1]) 
+            longest = sp[-1] 
+            if DEBUG_LOG_LEVEL > 2:
+                print "L", n, longest[0]
+            if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop 
+                continue
+            candidates.append ( (longest[1], n, longest[0]) ) 
+
+        if len(candidates) == 0:
+            print "no more candiate", len(G.edges()), len(G.nodes())
+            if len(G.edges()) > 0:
+                path = G.edges()[0] 
+                print path
+            else:
+                break
+        else:
+            candidates.sort() 
+            
+            candidate = candidates[-1] 
+            
+            if candidate[1] == candidate[2]: 
+                G.remove_node(candidate[1]) 
+                continue 
+         
+            path = nx.shortest_path(G, candidate[1], candidate[2], "n_weight") 
+
+        if DEBUG_LOG_LEVEL > 1:
+            print "X", path[0], path[-1], len(path)
+        
+        cmp_edges = set()
+        g_edges = set(G.edges())
+        new_path = []  
+        tail = True
+        # avoid confusion due to long palindromic sequences
+        if len(path) > 2:
+            for i in range( 0, len( path ) - 1 ):
+                v_n, w_n = path[i:i+2]
+                new_path.append(v_n)
+                # the commented-out code below might be useful for filtering out some high-connectivity nodes
+                #if (v_n, w_n) in cmp_edges or\
+                #    len(u_graph.out_edges(w_n)) > 5 or\
+                #    len(u_graph.in_edges(w_n)) > 5:
+                if (v_n, w_n) in cmp_edges: 
+                    tail = False
+                    break
+
+                r_id, end = v_n.split(":")
+                end = "E" if end == "B" else "B" 
+                v_n2 = r_id + ":" + end 
+
+                r_id, end = w_n.split(":")
+                end = "E" if end == "B" else "B" 
+                w_n2 = r_id + ":" + end 
+
+                if (w_n2, v_n2) in g_edges:
+                    cmp_edges.add( (w_n2, v_n2) )
+
+            if tail:
+                new_path.append(w_n)
+        else:
+            new_path = path[:]
+                
+        
+        if len(new_path) > 1:
+            path = new_path
+            
+            if DEBUG_LOG_LEVEL > 2:
+                print "Y", path[0], path[-1], len(path)
+
+            bundle_graph, bundle_paths, bundle_graph_edges = get_bundle( path, G )
+            for bg_edge in bundle_graph_edges:
+                print >> bundle_edge_out, bundle_index, "edge", bg_edge[0], bg_edge[1]
+            for path_ in bundle_paths:
+                print >>bundle_edge_out, "path", bundle_index, " ".join(path_) 
+
+            edges_to_be_removed = set()
+            if DEBUG_LOG_LEVEL > 2:
+                print "Z", bundle_paths[0][0], bundle_paths[0][-1]
+                print bundle_index, len(path), len(bundle_paths[0]), len(bundle_paths), len(bundle_graph_edges)
+
+            if len(bundle_graph_edges) > 0:
+
+                ASM_graph.add_path(bundle_paths[0], ctg="%04d" % bundle_index)
+                extra_u_edges = []
+                
+                print >> main_tig_paths, ">%04d %s" % ( bundle_index, " ".join(bundle_paths[0]) )
+                subseqs = []
+            
+                for i in range(len(bundle_paths[0]) - 1): 
+                    v, w = bundle_paths[0][i:i+2]
+                    edges_to_be_removed.add( (v,w) )
+                    uedges = u_edges[ (v,w) ]
+                    uedges.sort( key= lambda x: len(x[0]) )
+                    subseqs.append( uedges[-1][1] )
+                    visited_u_edges.add( "-".join(uedges[-1][0]) ) 
+                    for ue in uedges:
+                        if "-".join(ue[0]) not in visited_u_edges:
+                            visited_u_edges.add("-".join(ue[0]))
+                            extra_u_edges.append(ue)
+                seq = "".join(subseqs)        
+                sv_tig_idx = 0
+                print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(bundle_paths[0]) )
+                if len(seq) > 0:
+                    print >> out_f, ">%04d %s-%s" % (bundle_index, bundle_paths[0][0], bundle_paths[0][-1])
+                    print >> out_f, seq
+                    print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, bundle_paths[0][0], bundle_paths[0][-1])
+                    print >> sv_tigs, "".join(subseqs)
+
+                sv_tig_idx += 1
+
+                for sv_path in bundle_paths[1:]:
+                    print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(sv_path) )
+                    ASM_graph.add_path(sv_path, ctg="%04d" % bundle_index)
+                    subseqs = []
+                    for i in range(len(sv_path) - 1): 
+                        v, w = sv_path[i:i+2]
+                        edges_to_be_removed.add( (v,w) )
+                        uedges = u_edges[ (v,w) ]
+                        uedges.sort( key= lambda x: len(x[0]) )
+                        subseqs.append( uedges[-1][1] )
+                        visited_u_edges.add( "-".join(uedges[-1][0]) ) 
+                        for ue in uedges:
+                            if "-".join(ue[0]) not in visited_u_edges:
+                                visited_u_edges.add("-".join(ue[0]))
+                                extra_u_edges.append(ue)
+                    seq = "".join(subseqs)        
+                    if len(seq) > 0: 
+                        print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, sv_path[0], sv_path[-1])
+                        print >> sv_tigs, "".join(subseqs)
+                    sv_tig_idx += 1
+                for u_path, seq in extra_u_edges:
+                    #u_path = u_path.split("-")
+                    ASM_graph.add_edge(u_path[0], u_path[-1], ctg="%04d" % bundle_index)
+                    print >> sv_tig_paths, ">%04d-%04d-u %s" % ( bundle_index, sv_tig_idx, " ".join(u_path) )
+                    print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, u_path[0], u_path[-1])
+                    print >> sv_tigs, seq
+                    sv_tig_idx += 1
+                    
+                
+                bundle_index += 1
+        else:
+            #TODO, consolidate code here
+            v, w = path
+            edges_to_be_removed = set()
+            uedges = u_edges[ (v, w) ]
+            uedges.sort( key= lambda x: len(x[0]) )
+            subseqs = [ uedges[-1][1] ]
+            seq = "".join(subseqs)
+            sv_tig_idx = 0
+            print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(path) )
+            print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, path[0], path[-1])
+            print >> sv_tigs, seq
+            sv_tig_idx += 1
+            bundle_index += 1
+            bundle_graph_edges = zip(path[:-1],path[1:])
+        
+        #clean up the graph
+
+        edges = set(G.edges())
+        edges_to_be_removed |= set(bundle_graph_edges)
+
+        if DEBUG_LOG_LEVEL > 2:
+            print "BGE",bundle_graph_edges
+        
+        edge_remove_count = 0
+        for v, w in edges_to_be_removed:
+            if (v, w) in edges:
+                G.remove_edge( v, w )
+                edge_remove_count += 1
+                if DEBUG_LOG_LEVEL > 2:
+                    print "remove edge", bundle_index, w, v
+                
+        edges = set(G.edges())
+        for v, w in edges_to_be_removed:
+
+            r_id, end = v.split(":")
+            end = "E" if end == "B" else "B"
+            v = r_id + ":" + end
+
+            r_id, end = w.split(":")
+            end = "E" if end == "B" else "B"
+            w = r_id + ":" + end
+
+            if (w, v) in edges:
+                G.remove_edge( w, v )
+                edge_remove_count += 1
+                if DEBUG_LOG_LEVEL > 2:
+                    print "remove edge", bundle_index, w, v
+
+        if edge_remove_count == 0:
+            break
+            
+        nodes = G.nodes()
+        for n in nodes:
+            if G.in_degree(n) == 0 and G.out_degree(n) == 0:
+                G.remove_node(n)
+                if DEBUG_LOG_LEVEL > 2:
+                    print "remove node", n 
+
+    sv_tig_paths.close()
+    sv_tigs.close()
+    main_tig_paths.close()
+    out_f.close()
+    bundle_edge_out.close()
+    return ASM_graph
+
+
+
+def SGToNXG(sg):
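+    """Convert a StringGraph into a networkx DiGraph containing only the
+    non-reduced edges, and dump the edge list to the file "edges_list".
+    """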
+    G=nx.DiGraph()
+
+    max_score = max([ sg.edges[ e ].attr["score"] for e in sg.edges if sg.e_reduce[e] != True ])
+    out_f = open("edges_list","w")
+    for v, w in sg.edges:
+        if sg.e_reduce[(v, w)] != True:
+        ##if 1:
+            out_degree = len(sg.nodes[v].out_edges)
+            G.add_node( v, size = out_degree )
+            G.add_node( w, size = out_degree )
+            label = sg.edges[ (v, w) ].attr["label"]
+            score = sg.edges[ (v, w) ].attr["score"]
+            print >>out_f, v, w, label, score 
+            G.add_edge( v, w, label = label, weight = 0.001*score, n_weight = max_score - score )
+            #print in_node_name, out_node_name
+    out_f.close()
+    return G
+
+if __name__ == "__main__":
+
+    import argparse
+    
+    parser = argparse.ArgumentParser(description='an example string graph assembler designed for handling diploid genomes')
+    parser.add_argument('overlap_file', help='a file that contains the overlap information.')
+    parser.add_argument('read_fasta', help='the file that contains the sequence to be assembled')
+    parser.add_argument('--min_len', type=int, default=4000, 
+                        help='minimum length of the reads to be considered for assembling')
+    parser.add_argument('--min_idt', type=float, default=96,
+                        help='minimum alignment identity of the reads to be considered for assembling')
+    parser.add_argument('--disable_chimer_prediction', action="store_true", default=False,
+                        help='you may want to disable this, as some reads can be falsely identified as chimeras in low-coverage cases')
+
+    args = parser.parse_args()
+
+
+    overlap_file = args.overlap_file
+    read_fasta = args.read_fasta
+
+    seqs = {}
+    # load all p-reads into memory
+    f = FastaReader(read_fasta)
+    for r in f:
+        seqs[r.name] = r.sequence.upper()
+
+    G=nx.Graph()
+    edges =set()
+    overlap_data = []
+    contained_reads = set()
+    overlap_count = {}
+
+
+    # loop through the overlap data and load it into a Python list;
+    # contained reads are identified along the way
+
+    with open(overlap_file) as f:
+        for l in f:
+            l = l.strip().split()
+
+            # work around some ill-formed data records
+            if len(l) != 13:
+                continue
+            
+            f_id, g_id, score, identity = l[:4]
+            if f_id == g_id:  # don't need self-self overlapping
+                continue
+
+            if g_id not in seqs: 
+                continue
+
+            if f_id not in seqs:
+                continue
+
+            score = int(score)
+            identity = float(identity)
+            contained = l[12]
+            if contained == "contained":
+                contained_reads.add(f_id)
+                continue
+            if contained == "contains":
+                contained_reads.add(g_id)
+                continue
+            if contained == "none":
+                continue
+
+            if identity < args.min_idt: # only keep records with identity above the --min_idt threshold (default 96%)
+                continue
+            #if score > -2000:
+            #    continue
+            f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
+            g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
+
+            # only use reads longer than the minimum length (default 4 kb) for assembly
+            if f_len < args.min_len: continue
+            if g_len < args.min_len: continue
+            
+            # double check for proper overlap
+            if f_start > 24 and f_len - f_end > 24:  # allow a 24-base tolerance at either end of the overlap
+                continue
+            
+            if g_start > 24 and g_len - g_end > 24:
+                continue
+            
+            if g_strain == 0:
+                if f_start < 24 and g_len - g_end > 24:
+                    continue
+                if g_start < 24 and f_len - f_end > 24:
+                    continue
+            else:
+                if f_start < 24 and g_start > 24:
+                    continue
+                if g_start < 24 and f_start > 24:
+                    continue
+
+            overlap_data.append( (f_id, g_id, score, identity,
+                                  f_strain, f_start, f_end, f_len,
+                                  g_strain, g_start, g_end, g_len) )
+
+            overlap_count[f_id] = overlap_count.get(f_id,0)+1
+            overlap_count[g_id] = overlap_count.get(g_id,0)+1
+
+    overlap_set = set()
+    sg = StringGraph()
+    for od in overlap_data:
+        f_id, g_id, score, identity = od[:4]
+        if f_id in contained_reads:
+            continue
+        if g_id in contained_reads:
+            continue
+        f_s, f_b, f_e, f_l = od[4:8]
+        g_s, g_b, g_e, g_l = od[8:12]
+        overlap_pair = [f_id, g_id]
+        overlap_pair.sort()
+        overlap_pair = tuple( overlap_pair )
+        if overlap_pair in overlap_set:  # don't allow duplicated records
+            continue
+        else:
+            overlap_set.add(overlap_pair)
+
+        
+        if g_s == 1: # reversed alignment; swap the begin and end coordinates
+            g_b, g_e = g_e, g_b
+        
+        # build the string graph edges for each overlap
+        if f_b > 24:
+            if g_b < g_e:
+                """
+                     f.B         f.E
+                  f  ----------->
+                  g         ------------->
+                            g.B           g.E
+                """
+                if f_b == 0 or g_e - g_l == 0:
+                    continue
+                sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0), 
+                                                           length = abs(f_b-0),
+                                                           score = -score)
+                sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_e, g_l), 
+                                                           length = abs(g_e-g_l),
+                                                           score = -score)
+            else:
+                """
+                     f.B         f.E
+                  f  ----------->
+                  g         <-------------
+                            g.E           g.B           
+                """
+                if f_b == 0 or g_e == 0:
+                    continue
+                sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0), 
+                                                           length = abs(f_b -0),
+                                                           score = -score)
+                sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_e, 0), 
+                                                           length = abs(g_e- 0),
+                                                           score = -score)
+        else:
+            if g_b < g_e:
+                """
+                                    f.B         f.E
+                  f                 ----------->
+                  g         ------------->
+                            g.B           g.E
+                """
+                if g_b == 0 or f_e - f_l == 0:
+                    continue
+                sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_b, 0), 
+                                                           length = abs(g_b - 0),
+                                                           score = -score)
+                sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l), 
+                                                           length = abs(f_e-f_l),
+                                                           score = -score)
+            else:
+                """
+                                    f.B         f.E
+                  f                 ----------->
+                  g         <-------------
+                            g.E           g.B           
+                """
+                if g_b - g_l == 0 or f_e - f_l ==0:
+                    continue
+                sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_b, g_l), 
+                                                           length = abs(g_b - g_l),
+                                                           score = -score)
+                sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l), 
+                                                           length = abs(f_e - f_l),
+                                                           score = -score)
+
+    
+    sg.init_reduce_dict()
+    if not args.disable_chimer_prediction:
+        sg.mark_chimer_edge()
+    sg.mark_spur_edge()
+    sg.mark_tr_edges() # mark edges that are transitively redundant
+
+    if DEBUG_LOG_LEVEL > 1:
+        print sum( [1 for c in sg.e_reduce.values() if c == True] )
+        print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+    sg.mark_best_overlap() # mark those edges that are best overlap edges
+
+    if DEBUG_LOG_LEVEL > 1:
+        print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+
+    G = SGToNXG(sg)
+    #nx.write_gexf(G, "string_graph.gexf") # output the raw string graph for visualization
+    nx.write_adjlist(G, "string_graph.adj") # write out the full adjacency list of the string graph
+
+    u_edges = generate_unitig(sg, seqs, out_fn = "unitgs.fa") # reduce the string graph into a unitig graph
+    ASM_graph = get_bundles(u_edges )  # get the assembly
+    #nx.write_gexf(ASM_graph, "asm_graph.gexf")
diff --git a/src/py_scripts_v0.1/falcon_asm_s.py b/src/py_scripts_v0.1/falcon_asm_s.py
new file mode 100755
index 0000000..720b2e2
--- /dev/null
+++ b/src/py_scripts_v0.1/falcon_asm_s.py
@@ -0,0 +1,1220 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from pbcore.io import FastaReader
+import networkx as nx
+import os
+import shlex
+import sys
+import subprocess
+
+DEBUG_LOG_LEVEL = 0
+
+class SGNode(object):
+    """
+    class representing a node in the string graph
+    """
+    def __init__(self, node_name):
+        self.name = node_name
+        self.out_edges = []
+        self.in_edges = []
+    def add_out_edge(self, out_edge):
+        self.out_edges.append(out_edge)
+    def add_in_edge(self, in_edge):
+        self.in_edges.append(in_edge)
+
+class SGEdge(object):
+    """
+    class representing an edge in the string graph
+    """
+    def __init__(self, in_node, out_node):
+        self.in_node = in_node
+        self.out_node = out_node
+        self.attr = {}
+    def set_attribute(self, attr, value):
+        self.attr[attr] = value
+
+def reverse_end( node_id ):
+    node_id, end = node_id.split(":")
+    new_end = "B" if end == "E" else "E"
+    return node_id + ":" + new_end
+
+class StringGraph(object):
+    """
+    class representing the string graph
+    """
+    def __init__(self):
+        self.nodes = {}
+        self.edges = {}
+        self.n_mark = {}
+        self.e_reduce = {}
+        self.repeat_overlap = {}
+        
+    def add_node(self, node_name):
+        """ 
+        add a node to the graph given a node name
+        """
+        if node_name not in self.nodes:
+            self.nodes[node_name] = SGNode(node_name)
+    
+    def add_edge(self, in_node_name, out_node_name, **attributes):
+        """ 
+        add an edge to the graph given a pair of node names
+        """
+        if (in_node_name, out_node_name) not in self.edges:
+        
+            self.add_node(in_node_name)
+            self.add_node(out_node_name)
+            in_node = self.nodes[in_node_name]
+            out_node = self.nodes[out_node_name]    
+            
+            edge = SGEdge(in_node, out_node)
+            self.edges[ (in_node_name, out_node_name) ] = edge
+            in_node.add_out_edge(edge)
+            out_node.add_in_edge(edge)
+        edge =  self.edges[ (in_node_name, out_node_name) ]
+        for k, v in attributes.items():
+            edge.attr[k] = v
+
+    def init_reduce_dict(self):
+        for e in self.edges:
+            self.e_reduce[e] = False
+
+    def mark_chimer_edge(self):
+
+        for e_n, e in self.edges.items():
+            v = e_n[0]
+            w = e_n[1]
+            overlap_count = 0
+            for w_out_e in self.nodes[w].out_edges:
+                w_out_n = w_out_e.out_node.name
+                if (v, w_out_n) in self.edges:
+                    overlap_count += 1
+            for v_in_e in self.nodes[v].in_edges:
+                v_in_n = v_in_e.in_node.name
+                if (v_in_n, w) in self.edges:
+                    overlap_count += 1
+            if self.e_reduce[ (v, w) ] != True:
+                if overlap_count == 0:
+                    self.e_reduce[(v, w)] = True
+                    #print "XXX: chimer edge %s %s removed" % (v, w)
+                    v, w = reverse_end(w), reverse_end(v)
+                    self.e_reduce[(v, w)] = True
+                    #print "XXX: chimer edge %s %s removed" % (v, w)
+
+
+
+    def mark_spur_edge(self):
+
+        for  v in self.nodes:
+            if len(self.nodes[v].out_edges) > 1:
+                for out_edge in self.nodes[v].out_edges:
+                    w = out_edge.out_node.name
+                    
+                    if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
+                        #print "XXX: spur edge %s %s removed" % (v, w)
+                        self.e_reduce[(v, w)] = True
+                        v2, w2 = reverse_end(w), reverse_end(v)
+                        #print "XXX: spur edge %s %s removed" % (v2, w2)
+                        self.e_reduce[(v2, w2)] = True
+
+            if len(self.nodes[v].in_edges) > 1:
+                for in_edge in self.nodes[v].in_edges:
+                    w = in_edge.in_node.name
+                    if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
+                        #print "XXX: spur edge %s %s removed" % (w, v)
+                        self.e_reduce[(w, v)] = True
+                        v2, w2 = reverse_end(w), reverse_end(v)
+                        #print "XXX: spur edge %s %s removed" % (w2, v2)
+                        self.e_reduce[(w2, v2)] = True
+
+
+    def mark_tr_edges(self):
+        """
+        transitive reduction
+        """
+        n_mark = self.n_mark
+        e_reduce = self.e_reduce
+        FUZZ = 500
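+        # Sketch of the scheme below (essentially the transitive reduction used
+        # for string graphs): for each node, mark all direct successors as
+        # "inplay", then eliminate any successor that can also be reached via a
+        # shorter two-edge path within the FUZZ length tolerance; edges into
+        # eliminated nodes are marked as reduced.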
+        for n in self.nodes:
+            n_mark[n] = "vacant"
+    
+        for n_name, node in self.nodes.items():
+
+            out_edges = node.out_edges
+            if len(out_edges) == 0:
+                continue
+            
+            out_edges.sort(key=lambda x: x.attr["length"])
+            
+            for e in out_edges:
+                w = e.out_node
+                n_mark[ w.name ] = "inplay"
+            
+            max_len = out_edges[-1].attr["length"]
+                
+            max_len += FUZZ
+            
+            for e in out_edges:
+                e_len = e.attr["length"]
+                w = e.out_node
+                if n_mark[w.name] == "inplay":
+                    w.out_edges.sort( key=lambda x: x.attr["length"] )
+                    for e2 in w.out_edges:
+                        if e2.attr["length"] + e_len < max_len:
+                            x = e2.out_node
+                            if n_mark[x.name] == "inplay":
+                                n_mark[x.name] = "eliminated"
+            
+            for e in out_edges:
+                e_len = e.attr["length"]
+                w = e.out_node
+                w.out_edges.sort( key=lambda x: x.attr["length"] )
+                if len(w.out_edges) > 0:
+                    x = w.out_edges[0].out_node
+                    if n_mark[x.name] == "inplay":
+                        n_mark[x.name] = "eliminated"
+                for e2 in w.out_edges:
+                    if e2.attr["length"] < FUZZ:
+                        x = e2.out_node
+                        if n_mark[x.name] == "inplay":
+                            n_mark[x.name] = "eliminated"
+                            
+            for out_edge in out_edges:
+                v = out_edge.in_node
+                w = out_edge.out_node
+                if n_mark[w.name] == "eliminated":
+                    e_reduce[ (v.name, w.name) ] = True
+                    #print "XXX: tr edge %s %s removed" % (v.name, w.name)
+                    v_name, w_name = reverse_end(w.name), reverse_end(v.name)
+                    e_reduce[(v_name, w_name)] = True
+                    #print "XXX: tr edge %s %s removed" % (v_name, w_name)
+                n_mark[w.name] = "vacant"
+                
+
+    def mark_best_overlap(self):
+        """
+        keep only edges that are the best overlap (by score) for at least one of their endpoints; all others are marked as reduced
+        """
+
+        best_edges = set()
+
+        for v in self.nodes:
+
+            out_edges = self.nodes[v].out_edges
+            if len(out_edges) > 0:
+                out_edges.sort(key=lambda e: e.attr["score"])
+                e = out_edges[-1]
+                best_edges.add( (e.in_node.name, e.out_node.name) )
+
+            in_edges = self.nodes[v].in_edges
+            if len(in_edges) > 0:
+                in_edges.sort(key=lambda e: e.attr["score"])
+                e = in_edges[-1]
+                best_edges.add( (e.in_node.name, e.out_node.name) )
+
+        if DEBUG_LOG_LEVEL > 1:
+            print "X", len(best_edges)
+
+        for e_n, e in self.edges.items():
+            v = e_n[0]
+            w = e_n[1]
+            if self.e_reduce[ (v, w) ] != True:
+                if (v, w) not in best_edges:
+                    self.e_reduce[(v, w)] = True
+                    #print "XXX: in best edge %s %s removed" % (v, w)
+                    v2, w2 = reverse_end(w), reverse_end(v)
+                    #print "XXX: in best edge %s %s removed" % (v2, w2)
+                    self.e_reduce[(v2, w2)] = True
+                
+    def get_out_edges_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].out_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        return rtn
+        
+        
+    def get_in_edges_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].in_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        return rtn
+
+    def get_best_out_edge_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].out_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        rtn.sort(key=lambda e: e.attr["score"])
+
+        return rtn[-1]
+
+    def get_best_in_edge_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].in_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        rtn.sort(key=lambda e: e.attr["score"])
+        return rtn[-1]
+        
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
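+# RCMAP maps each base (and the N/gap characters) to its complement, so a
+# reverse complement can be formed as, e.g., "".join(RCMAP[c] for c in "AACG"[::-1]),
+# which yields "CGTT".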
+def generate_seq_from_path(sg, seqs, path):
+    subseqs = []
+    r_id, end = path[0].split(":")
+    
+    count = 0
+    for i in range( len( path ) -1 ):
+        w_n, v_n = path[i:i+2]
+        edge = sg.edges[ (w_n, v_n ) ]
+        read_id, coor = edge.attr["label"].split(":")
+        b,e = coor.split("-")
+        b = int(b)
+        e = int(e)
+        if b < e:
+            subseqs.append( seqs[read_id][b:e] )
+        else:
+            subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
+
+    return "".join(subseqs)
+
+
+def reverse_path( path ):
+    new_path = []
+    for n in list(path[::-1]):
+        rid, end = n.split(":")
+        new_end = "B" if end == "E" else "E"
+        new_path.append( rid+":"+new_end)
+    return new_path
+
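+# For illustration: reverse_path(["r1:B", "r2:E"]) returns ["r2:B", "r1:E"];
+# the node order is reversed and every end marker is flipped.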
+
+def generate_unitig(sg, seqs, out_fn, connected_nodes = None):
+
+    """
+    given a string graph (sg) and the sequences (seqs), write the unitig fasta file to out_fn
+    the function returns a dict of unitig edges representing the reduced string graph, where each edge is a unitig
+    
+    some extra files generated: 
+        unit_edges.dat : an easy-to-parse file with the unitig data
+        unit_edge_paths : contains the node path of every unitig
+        uni_graph.gexf : the unitig graph in gexf format for visualization (this output is currently commented out)
+    """
+
+    G = SGToNXG(sg)
+    if connected_nodes != None:
+        connected_nodes = set(sg.nodes)
+    out_fasta = open(out_fn, "w")
+    nodes_for_tig = set()
+    sg_edges = set()
+    for v, w in sg.edges:
+        if sg.e_reduce[(v, w)] != True:
+            sg_edges.add( (v, w) )
+    count = 0
+    edges_in_tigs = set()
+
+    uni_edges = {}
+    path_f = open("unit_edge_paths","w")
+    uni_edge_f = open("unit_edges.dat", "w")
+    while len(sg_edges) > 0:
+        v, w = sg_edges.pop()
+
+        #nodes_for_tig.remove(n)
+        upstream_nodes = []
+        
+        c_node = v
+        p_in_edges = sg.get_in_edges_for_node(c_node)
+        p_out_edges = sg.get_out_edges_for_node(c_node)
+        while len(p_in_edges) == 1 and len(p_out_edges) == 1:
+            p_node = p_in_edges[0].in_node
+            upstream_nodes.append(p_node.name)
+            if (p_node.name, c_node) not in  sg_edges:
+                break
+            p_in_edges = sg.get_in_edges_for_node(p_node.name)
+            p_out_edges = sg.get_out_edges_for_node(p_node.name)
+            c_node = p_node.name
+
+        upstream_nodes.reverse()  
+            
+        downstream_nodes = []
+        c_node = w 
+        n_out_edges = sg.get_out_edges_for_node(c_node)
+        n_in_edges = sg.get_in_edges_for_node(c_node)
+        while len(n_out_edges) == 1 and len(n_in_edges) == 1:
+            n_node = n_out_edges[0].out_node
+            downstream_nodes.append(n_node.name)
+            if (c_node, n_node.name) not in  sg_edges:
+                break
+            n_out_edges = sg.get_out_edges_for_node(n_node.name)
+            n_in_edges = sg.get_in_edges_for_node(n_node.name)
+            c_node = n_node.name 
+        
+        whole_path = upstream_nodes + [v, w] + downstream_nodes
+        count += 1
+        subseq = generate_seq_from_path(sg, seqs, whole_path) 
+        #subseq = ""
+        uni_edges.setdefault( (whole_path[0], whole_path[-1]), [] )
+        uni_edges[(whole_path[0], whole_path[-1])].append(  ( whole_path, subseq ) )
+        print >> uni_edge_f, whole_path[0], whole_path[-1], "-".join(whole_path), subseq
+        print >>path_f, ">%05dc-%s-%s-%d %s" % (count, whole_path[0], whole_path[-1], len(whole_path), " ".join(whole_path))
+        print >>out_fasta, ">%05dc-%s-%s-%d" % (count, whole_path[0], whole_path[-1], len(whole_path))
+        print >>out_fasta, subseq
+        for i in range( len( whole_path ) -1 ):
+            w_n, v_n = whole_path[i:i+2]
+            try:
+                sg_edges.remove( (w_n, v_n) )
+            except KeyError: #if an edge is already deleted, ignore it
+                pass
+
+        r_whole_path = reverse_path( whole_path )
+        count += 1
+        subseq = generate_seq_from_path(sg, seqs, r_whole_path) 
+        #subseq = ""
+        uni_edges.setdefault( (r_whole_path[0], r_whole_path[-1]), [] )
+        uni_edges[(r_whole_path[0], r_whole_path[-1])].append(  ( r_whole_path, subseq ) )
+        print >> uni_edge_f, r_whole_path[0], r_whole_path[-1], "-".join(r_whole_path), subseq
+        print >>path_f, ">%05dc-%s-%s-%d %s" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path), " ".join(r_whole_path))
+        print >>out_fasta, ">%05dc-%s-%s-%d" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path))
+        print >>out_fasta, subseq
+        for i in range( len( r_whole_path ) -1 ):
+            w_n, v_n = r_whole_path[i:i+2]
+            try:
+                sg_edges.remove( (w_n, v_n) )
+            except KeyError: #if an edge is already deleted, ignore it
+                pass
+
+
+    path_f.close()
+    uni_edge_f.close()
+    #uni_graph = nx.DiGraph()
+    #for n1, n2 in uni_edges.keys():
+    #    uni_graph.add_edge(n1, n2, count = len( uni_edges[ (n1,n2) ] ))
+    #nx.write_gexf(uni_graph, "uni_graph.gexf")
+
+    out_fasta.close()
+    return uni_edges
+
+def neighbor_bound(G, v, w, radius):
+    """
+    test if the node v and the node w are connected within a radius in graph G
+    """
+    g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
+    g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
+    if len(set(g1.edges()) & set(g2.edges())) > 0:
+        return True
+    else:
+        return False
+
+
+def is_branch_node(G, n):
+    """
+    test whether node n is a "branch node", i.e. a node for which the paths from
+    at least two of its successors do not intersect within a given radius
+    """
+    out_edges = G.out_edges([n])
+    n2 = [ e[1] for e in out_edges ]
+    is_branch = False
+    for i in range(len(n2)):
+        for j in range(i+1, len(n2)):
+            v = n2[i]
+            w = n2[j]
+            if neighbor_bound(G, v, w, 10) == False:
+                is_branch = True
+                break
+        if is_branch == True:
+            break
+    return is_branch
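+# Note: a node with fewer than two out-edges can never be flagged as a branch
+# node; the test only fires when two of its successors fail to share any edge
+# within an ego-graph radius of 10.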
+
+
+def get_bundle( path, u_graph, u_graph_r ):
+
+    """ 
+    find a sub-graph containing the nodes between the start and the end of the path
+    inputs: 
+        u_graph : a unitig graph
+    returns:
+        bundle_graph: the whole bundle graph 
+        bundle_paths: the paths in the bundle graph 
+        sub_graph2_edges: all edges of the bundle graph
+    
+    """
+
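+    # u_graph_r is assumed to be u_graph.reverse(); the candidate sub-graph is
+    # the intersection of the forward ego graph from the path start and the
+    # backward ego graph from the path end.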
+    p_start, p_end = path[0], path[-1]
+    p_nodes = set(path)
+    p_edges = set(zip(path[:-1], path[1:]))
+
+    down_path = nx.ego_graph(u_graph, p_start, radius=len(p_nodes), undirected=False)
+    up_path = nx.ego_graph(u_graph_r, p_end, radius=len(p_nodes), undirected=False)
+    subgraph_nodes = set(down_path) & set(up_path)
+    
+
+    sub_graph = nx.DiGraph()
+    for v, w in u_graph.edges_iter():
+        if v in subgraph_nodes and w in subgraph_nodes:            
+            if (v, w) in p_edges:
+                sub_graph.add_edge(v, w, color = "red")
+            else:
+                sub_graph.add_edge(v, w, color = "black")
+
+    sub_graph2 = nx.DiGraph()
+    tips = set()
+    tips.add(path[0])
+    sub_graph_r = sub_graph.reverse()
+    visited = set()
+    ct = 0
+    is_branch = is_branch_node(sub_graph, path[0]) #if the start node is a branch node
+    if is_branch:
+        n = tips.pop()
+        e = sub_graph.out_edges([n])[0] # pick one path to build the subgraph
+        sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
+        if e[1] not in visited:
+            last_node = e[1]
+            visited.add(e[1])
+            r_id, orientation = e[1].split(":")
+            orientation = "E" if orientation == "B" else "E"
+            visited.add( r_id +":" + orientation)
+            if not is_branch_node(sub_graph_r, e[1]): 
+                tips.add(e[1])
+        
+    while len(tips) != 0:
+        n = tips.pop()
+        out_edges = sub_graph.out_edges([n])
+        if len(out_edges) == 1:
+            e = out_edges[0]
+            sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
+            last_node = e[1]
+            if e[1] not in visited:                       
+                visited.add(e[1])
+                r_id, orientation = e[1].split(":")
+                orientation = "E" if orientation == "B" else "E"
+                visited.add( r_id +":" + orientation)
+                if not is_branch_node(sub_graph_r, e[1]): 
+                    tips.add(e[1])
+        else:
+        
+            is_branch = is_branch_node(sub_graph, n)
+            if not is_branch:
+                for e in out_edges:
+                    sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
+                    last_node = e[1]
+                    if e[1] not in visited:
+                        r_id, orientation = e[1].split(":")
+                        visited.add(e[1])
+                        orientation = "E" if orientation == "B" else "E"
+                        visited.add( r_id +":" + orientation)
+                        if not is_branch_node(sub_graph_r, e[1]):
+                            tips.add(e[1])
+        ct += 1
+    last_node = None
+    longest_len = 0
+        
+    sub_graph2_nodes = sub_graph2.nodes()
+    sub_graph2_edges = sub_graph2.edges()
+
+
+    new_path = [path[0]]
+    for n in sub_graph2_nodes:
+        if len(sub_graph2.out_edges(n)) == 0 :
+            path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
+            path_len = len(path_t)
+            if path_len > longest_len:
+                last_node = n
+                longest_len = path_len
+                new_path = path_t
+
+    if last_node == None:
+        for n in sub_graph2_nodes:
+            path_t = nx.shortest_path(sub_graph2, source = path[0], target = n, weight = "n_weight")
+            path_len = len(path_t)
+            if path_len > longest_len:
+                last_node = n
+                longest_len = path_len
+                new_path = path_t
+
+
+    path = new_path
+
+    # clean up sub_graph2 according to new begin and end
+    sub_graph2_r = sub_graph2.reverse()
+    down_path = nx.ego_graph(sub_graph2, path[0], radius=len(path), undirected=False)
+    up_path = nx.ego_graph(sub_graph2_r, path[-1], radius=len(path), undirected=False)
+    subgraph_nodes = set(down_path) & set(up_path)
+    for v in sub_graph2_nodes:
+        if v not in subgraph_nodes:
+            sub_graph2.remove_node(v)
+    
+    if DEBUG_LOG_LEVEL > 1:
+        print "new_path", path[0], last_node, len(sub_graph2_nodes), path
+
+
+    bundle_paths = [path]
+    p_nodes = set(path)
+    p_edges = set(zip(path[:-1], path[1:]))
+
+    sub_graph2_nodes = sub_graph2.nodes()
+    sub_graph2_edges = sub_graph2.edges()
+
+    nodes_idx = dict( [ (n[1], n[0]) for n in enumerate(path) ]  )
+    
+         
+    # create a list of subpaths that have no branches
+    non_branch_subpaths = []
+    wi = 0
+    vi = 0
+    v = path[0]
+    while v != path[-1] and wi < len(path)-1:
+        wi += 1
+        w = path[wi]
+        while len( sub_graph2.successors(w) ) == 1 and len( sub_graph2.predecessors(w) ) == 1 and wi < len(path)-1:
+            wi += 1
+            w = path[wi]
+        if  len( sub_graph2.successors(v) )!= 1 or len( sub_graph2.predecessors(w) )!= 1:
+            branched = True
+        else:
+            branched = False
+
+        if not branched:
+            non_branch_subpaths.append( path[vi:wi+1] )
+        v = w
+        vi = wi
+
+    # create the associate_graph that holds the alternative subpaths (edges that are not on the main path)
+    
+    associate_graph = nx.DiGraph()
+    for v, w in sub_graph2.edges_iter():
+        if (v, w) not in p_edges:
+            associate_graph.add_edge(v, w, n_weight = sub_graph2[v][w]["n_weight"])
+
+    if DEBUG_LOG_LEVEL > 1:
+        print "associate_graph size:", len(associate_graph)           
+        print "non_branch_subpaths",len(non_branch_subpaths), non_branch_subpaths
+
+    # construct the bundle graph                
+    associate_graph_nodes = set(associate_graph.nodes())
+    bundle_graph = nx.DiGraph()
+    bundle_graph.add_path( path )
+    for i in range(len(non_branch_subpaths)-1):
+        if len(non_branch_subpaths[i]) == 0 or len( non_branch_subpaths[i+1] ) == 0:
+            continue
+        e1, e2 = non_branch_subpaths[i: i+2]
+        v = e1[-1]
+        w = e2[0]
+        if v == w:
+            continue
+        in_between_node_count = nodes_idx[w] - nodes_idx[v] 
+        if v in associate_graph_nodes and w in associate_graph_nodes:
+            try:
+                a_path = nx.shortest_path(associate_graph, v, w, "n_weight")    
+            except nx.NetworkXNoPath:
+                continue
+            bundle_graph.add_path( a_path )      
+            bundle_paths.append( a_path )
+
+    return bundle_graph, bundle_paths, sub_graph2_edges
+            
+def get_bundles(u_edges):
+    
+    """
+    input: all unitig edges
+    output: the assembled primary_tigs.fa and all_tigs.fa
+    """
+
+    ASM_graph = nx.DiGraph()
+    out_f = open("primary_tigs.fa", "w")
+    main_tig_paths = open("primary_tigs_paths","w")
+    sv_tigs = open("all_tigs.fa","w")
+    sv_tig_paths = open("all_tigs_paths","w")
+    max_weight = 0 
+    for v, w in u_edges:
+        x = max( [len(s[1]) for s in u_edges[ (v,w) ] ] )
+        if DEBUG_LOG_LEVEL > 1:
+            print "W", v, w, x
+        if x > max_weight:
+            max_weight = x
+            
+    in_edges = {}
+    out_edges = {}
+    for v, w in u_edges:
+        in_edges.setdefault(w, []) 
+        out_edges.setdefault(w, []) 
+        in_edges[w].append( (v, w) )
+
+        out_edges.setdefault(v, [])
+        in_edges.setdefault(v, [])
+        out_edges[v].append( (v, w) )
+
+    u_graph = nx.DiGraph()
+    for v,w in u_edges:
+
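+        # n_weight is max_weight minus the longest unitig sequence length for
+        # this edge, so shortest-path searches over "n_weight" prefer routes
+        # backed by longer sequences.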
+        u_graph.add_edge(v, w, n_weight = max_weight - max( [len(s[1]) for s in  u_edges[ (v,w) ] ] ) )
+    
+    bundle_edge_out = open("bundle_edges","w")
+    bundle_index = 0
+
+
+    components = nx.weakly_connected_component_subgraphs(u_graph)
+    components = [ (len(c), c) for c in components ]
+    components.sort()
+    #components.reverse()
+    allS = len(u_graph)
+    ssG = 0.0
+    processed_overlaps = set()
+    for sG, G in components:
+
+        ssG += sG
+        print "process graph of size ", sG, "%0.2f %0.2f" % (ssG, ssG/allS)
+        G_edges = set(G.edges())
+
+        dual_component = False
+        
+        for v, w in list(G_edges):
+            v = v.split(":")[0]
+            w = w.split(":")[0]
+            if (v, w) in processed_overlaps:
+                dual_component = True
+                break
+
+        if dual_component == True:
+            continue
+
+        for v, w in list(G_edges):
+            v = v.split(":")[0]
+            w = w.split(":")[0]
+            processed_overlaps.add( (v,w) )
+            processed_overlaps.add( (w,v) )
+
+        G_r = G.reverse()
+        visited_u_edges = set()
+
+        while len(G) > 0:
+            out_f.flush()
+            main_tig_paths.flush()
+            sv_tigs.flush()
+            sv_tig_paths.flush()
+            
+            
+            #root_nodes = set() 
+            candidates = [] 
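+            # Heuristic sketch: for each node, take the farthest node reachable
+            # in G (and, symmetrically, the farthest node reaching it in the
+            # reversed graph) and record the span as a candidate backbone path
+            # for the next primary contig; the longest candidate wins.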
+            for n in G: 
+                sp =nx.single_source_shortest_path_length(G, n) 
+                sp = sp.items() 
+                sp.sort(key=lambda x : x[1]) 
+                longest = sp[-1] 
+                if DEBUG_LOG_LEVEL > 2:
+                    print "L", n, longest[0]
+                if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop 
+                    continue
+                candidates.append ( (longest[1], n, longest[0]) ) 
+
+                n = longest[0]
+                sp =nx.single_source_shortest_path_length(G_r, n) 
+                sp = sp.items() 
+                sp.sort(key=lambda x : x[1]) 
+                longest = sp[-1] 
+                if DEBUG_LOG_LEVEL > 2:
+                    print "L", n, longest[0]
+                if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop 
+                    continue
+                candidates.append ( (longest[1], longest[0], n) ) 
+                if len(candidates) != 0:
+                    break
+
+            if len(candidates) == 0:
+                print "no more candiate", len(G.edges()), len(G.nodes())
+                if len(G_edges) > 0:
+                    path = G_edges.pop()
+                    G_edges.add(path)
+                    print path
+                else:
+                    break
+            else:
+                candidates.sort() 
+                
+                candidate = candidates[-1] 
+                
+                if candidate[1] == candidate[2]: 
+                    G.remove_node(candidate[1]) 
+                    G_r.remove_node(candidate[1])
+                    continue 
+             
+                path = nx.shortest_path(G, candidate[1], candidate[2], "n_weight") 
+
+            if DEBUG_LOG_LEVEL > 1:
+                print "X", path[0], path[-1], len(path)
+            
+            cmp_edges = set()
+            #g_edges = set(G.edges())
+            new_path = []  
+            tail = True
+            # avoid confusion due to long palindromic sequences
+            if len(path) > 2:
+                for i in range( 0, len( path ) - 1 ):
+                    v_n, w_n = path[i:i+2]
+                    new_path.append(v_n)
+                    # the commented-out code below might be useful for filtering out some high-connectivity nodes
+                    #if (v_n, w_n) in cmp_edges or\
+                    #    len(u_graph.out_edges(w_n)) > 5 or\
+                    #    len(u_graph.in_edges(w_n)) > 5:
+                    if (v_n, w_n) in cmp_edges: 
+                        tail = False
+                        break
+
+                    r_id, end = v_n.split(":")
+                    end = "E" if end == "B" else "B" 
+                    v_n2 = r_id + ":" + end 
+
+                    r_id, end = w_n.split(":")
+                    end = "E" if end == "B" else "B" 
+                    w_n2 = r_id + ":" + end 
+
+                    if (w_n2, v_n2) in G_edges:
+                        cmp_edges.add( (w_n2, v_n2) )
+
+                if tail:
+                    new_path.append(w_n)
+            else:
+                new_path = path[:]
+                    
+            
+            if len(new_path) > 1:
+                path = new_path
+                
+                if DEBUG_LOG_LEVEL > 2:
+                    print "Y", path[0], path[-1], len(path)
+
+                bundle_graph, bundle_paths, bundle_graph_edges = get_bundle( path, G, G_r )
+                for bg_edge in bundle_graph_edges:
+                    print >> bundle_edge_out, bundle_index, "edge", bg_edge[0], bg_edge[1]
+                for path_ in bundle_paths:
+                    print >>bundle_edge_out, "path", bundle_index, " ".join(path_) 
+
+                edges_to_be_removed = set()
+                if DEBUG_LOG_LEVEL > 2:
+                    print "Z", bundle_paths[0][0], bundle_paths[0][-1]
+                    print bundle_index, len(path), len(bundle_paths[0]), len(bundle_paths), len(bundle_graph_edges)
+
+                if len(bundle_graph_edges) > 0:
+
+                    ASM_graph.add_path(bundle_paths[0], ctg="%04d" % bundle_index)
+                    extra_u_edges = []
+                    
+                    print >> main_tig_paths, ">%04d %s" % ( bundle_index, " ".join(bundle_paths[0]) )
+                    subseqs = []
+                
+                    for i in range(len(bundle_paths[0]) - 1): 
+                        v, w = bundle_paths[0][i:i+2]
+                        edges_to_be_removed.add( (v,w) )
+                        uedges = u_edges[ (v,w) ]
+                        uedges.sort( key= lambda x: len(x[0]) )
+                        subseqs.append( uedges[-1][1] )
+                        visited_u_edges.add( "-".join(uedges[-1][0]) ) 
+                        for ue in uedges:
+                            if "-".join(ue[0]) not in visited_u_edges:
+                                visited_u_edges.add("-".join(ue[0]))
+                                extra_u_edges.append(ue)
+                    seq = "".join(subseqs)        
+                    sv_tig_idx = 0
+                    print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(bundle_paths[0]) )
+                    if len(seq) > 0:
+                        print >> out_f, ">%04d %s-%s" % (bundle_index, bundle_paths[0][0], bundle_paths[0][-1])
+                        print >> out_f, seq
+                        print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, bundle_paths[0][0], bundle_paths[0][-1])
+                        print >> sv_tigs, "".join(subseqs)
+
+                    sv_tig_idx += 1
+
+                    for sv_path in bundle_paths[1:]:
+                        print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(sv_path) )
+                        ASM_graph.add_path(sv_path, ctg="%04d" % bundle_index)
+                        subseqs = []
+                        for i in range(len(sv_path) - 1): 
+                            v, w = sv_path[i:i+2]
+                            edges_to_be_removed.add( (v,w) )
+                            uedges = u_edges[ (v,w) ]
+                            uedges.sort( key= lambda x: len(x[0]) )
+                            subseqs.append( uedges[-1][1] )
+                            visited_u_edges.add( "-".join(uedges[-1][0]) ) 
+                            for ue in uedges:
+                                if "-".join(ue[0]) not in visited_u_edges:
+                                    visited_u_edges.add("-".join(ue[0]))
+                                    extra_u_edges.append(ue)
+                        seq = "".join(subseqs)        
+                        if len(seq) > 0: 
+                            print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, sv_path[0], sv_path[-1])
+                            print >> sv_tigs, "".join(subseqs)
+                        sv_tig_idx += 1
+                    for u_path, seq in extra_u_edges:
+                        #u_path = u_path.split("-")
+                        ASM_graph.add_edge(u_path[0], u_path[-1], ctg="%04d" % bundle_index)
+                        print >> sv_tig_paths, ">%04d-%04d-u %s" % ( bundle_index, sv_tig_idx, " ".join(u_path) )
+                        print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, u_path[0], u_path[-1])
+                        print >> sv_tigs, seq
+                        sv_tig_idx += 1
+                        
+                    
+                    bundle_index += 1
+            else:
+                # TODO: consolidate this code with the bundle branch above
+                v,w = path
+                uedges = u_edges[ (v,w) ]
+                uedges.sort( key= lambda x: len(x[0]) )
+                subseqs = [ uedges[-1][1] ]
+                seq = "".join(subseqs)
+                sv_tig_idx = 0
+                print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(path) )
+                print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, path[0], path[-1])
+                print >> sv_tigs, seq
+                sv_tig_idx += 1
+                bundle_index += 1
+                bundle_graph_edges = zip(path[:-1],path[1:])
+            
+            #clean up the graph
+
+            edges = set(G.edges())
+            edges_to_be_removed |= set(bundle_graph_edges)
+
+            if DEBUG_LOG_LEVEL > 2:
+                print "BGE",bundle_graph_edges
+            
+            edge_remove_count = 0
+            for v, w in edges_to_be_removed:
+                if (v, w) in edges:
+                    G.remove_edge( v, w )
+                    G_r.remove_edge( w, v )
+                    G_edges.remove( (v, w) )
+                    edge_remove_count += 1
+                    if DEBUG_LOG_LEVEL > 2:
+                        print "remove edge", bundle_index, w, v
+                    
+            edges = set(G.edges())
+            for v, w in edges_to_be_removed:
+
+                r_id, end = v.split(":")
+                end = "E" if end == "B" else "B"
+                v = r_id + ":" + end
+
+                r_id, end = w.split(":")
+                end = "E" if end == "B" else "B"
+                w = r_id + ":" + end
+
+                if (w, v) in edges:
+                    G.remove_edge( w, v )
+                    G_edges.remove( (w, v) )
+                    G_r.remove_edge( v, w )
+                    edge_remove_count += 1
+                    if DEBUG_LOG_LEVEL > 2:
+                        print "remove edge", bundle_index, w, v
+
+            if edge_remove_count == 0:
+                break
+                
+            nodes = G.nodes()
+            for n in nodes:
+                if G.in_degree(n) == 0 and G.out_degree(n) == 0:
+                    G.remove_node(n)
+                    G_r.remove_node(n)
+                    if DEBUG_LOG_LEVEL > 2:
+                        print "remove node", n 
+
+    sv_tig_paths.close()
+    sv_tigs.close()
+    main_tig_paths.close()
+    out_f.close()
+    bundle_edge_out.close()
+    return ASM_graph
+
+
+
+def SGToNXG(sg):
+    G=nx.DiGraph()
+
+    max_score = max([ sg.edges[ e ].attr["score"] for e in sg.edges if sg.e_reduce[e] != True ])
+    out_f = open("edges_list","w")
+    for v, w in sg.edges:
+        if sg.e_reduce[(v, w)] != True:
+        ##if 1:
+            out_degree = len(sg.nodes[v].out_edges)
+            G.add_node( v, size = out_degree )
+            G.add_node( w, size = out_degree )
+            label = sg.edges[ (v, w) ].attr["label"]
+            score = sg.edges[ (v, w) ].attr["score"]
+            print >>out_f, v, w, label, score 
+            G.add_edge( v, w, label = label, weight = 0.001*score, n_weight = max_score - score )
+            #print in_node_name, out_node_name
+    out_f.close()
+    return G
+
+if __name__ == "__main__":
+
+    import argparse
+    
+    parser = argparse.ArgumentParser(description='an example string graph assembler designed for handling diploid genomes')
+    parser.add_argument('overlap_file', help='a file that contains the overlap information.')
+    parser.add_argument('read_fasta', help='the file that contains the sequence to be assembled')
+    parser.add_argument('--min_len', type=int, default=4000, 
+                        help='minimum length of the reads to be considered for assembling')
+    parser.add_argument('--min_idt', type=float, default=96,
+                        help='minimum alignment identity of the reads to be considered for assembling')
+    parser.add_argument('--disable_chimer_prediction', action="store_true", default=False,
+                        help='you may want to disable this, as some reads can be falsely identified as chimeric in low-coverage cases')
+
+    args = parser.parse_args()
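+    # Example invocation (file names are placeholders):
+    #   python falcon_asm_s.py overlaps.txt preads.fa
+    # Besides the two positional files, this variant expects a read
+    # classification file named "rc_out_all" in the working directory; its
+    # second column flags contained reads ("1") and chimeric reads ("2"),
+    # as read below.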
+
+
+    overlap_file = args.overlap_file
+    read_fasta = args.read_fasta
+
+    contained_reads = set()
+    chimer_ids = set()
+
+    with open("rc_out_all") as f:
+        for l in f:
+            l = l.strip().split()
+            if l[1] == "2":
+                chimer_ids.add(l[0])
+            if l[1] == "1":
+                contained_reads.add(l[0])
+    print len(chimer_ids)
+    
+    seqs = {}
+    # load all p-reads into memory
+    f = FastaReader(read_fasta)
+    for r in f:
+        if r.name in contained_reads:
+            continue
+        if r.name in chimer_ids:
+            continue
+        seqs[r.name] = r.sequence.upper()
+
+    G=nx.Graph()
+    edges =set()
+    overlap_data = []
+    contained_reads = set()
+    overlap_count = {}
+
+
+    # loop through the overlap data and load the records into a python list;
+    # contained reads are identified along the way
+
+    with open(overlap_file) as f:
+        for l in f:
+            l = l.strip().split()
+
+            # work around some ill-formed data records
+            if len(l) != 13:
+                continue
+            
+            f_id, g_id, score, identity = l[:4]
+            if f_id == g_id:  # don't need self-self overlapping
+                continue
+
+            if g_id not in seqs: 
+                continue
+
+            if f_id not in seqs:
+                continue
+
+            score = int(score)
+            identity = float(identity)
+            contained = l[12]
+            if contained == "contained":
+                contained_reads.add(f_id)
+                continue
+            if contained == "contains":
+                contained_reads.add(g_id)
+                continue
+            if contained == "none":
+                continue
+
+            if identity < args.min_idt: # only keep records that meet the minimum identity threshold
+                continue
+            #if score > -2000:
+            #    continue
+            f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
+            g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
+
+            # only use reads longer than the minimum length for assembly
+            if f_len < args.min_len: continue
+            if g_len < args.min_len: continue
+            
+            # double check for proper overlap
+            if f_start > 24 and f_len - f_end > 24:  # allow a 24-base tolerance on both sides of the overlap
+                continue
+            
+            if g_start > 24 and g_len - g_end > 24:
+                continue
+            
+            if g_strain == 0:
+                if f_start < 24 and g_len - g_end > 24:
+                    continue
+                if g_start < 24 and f_len - f_end > 24:
+                    continue
+            else:
+                if f_start < 24 and g_start > 24:
+                    continue
+                if g_start < 24 and f_start > 24:
+                    continue
+
+            overlap_data.append( (f_id, g_id, score, identity,
+                                  f_strain, f_start, f_end, f_len,
+                                  g_strain, g_start, g_end, g_len) )
+
+            overlap_count[f_id] = overlap_count.get(f_id,0)+1
+            overlap_count[g_id] = overlap_count.get(g_id,0)+1
+            
+    print "###", len(overlap_data), len(contained_reads)
+    overlap_set = set()
+    sg = StringGraph()
+    for od in overlap_data:
+        f_id, g_id, score, identity = od[:4]
+        if f_id in contained_reads:
+            continue
+        if g_id in contained_reads:
+            continue
+        f_s, f_b, f_e, f_l = od[4:8]
+        g_s, g_b, g_e, g_l = od[8:12]
+        overlap_pair = [f_id, g_id]
+        overlap_pair.sort()
+        overlap_pair = tuple( overlap_pair )
+        if overlap_pair in overlap_set:  # don't allow duplicated records
+            continue
+        else:
+            overlap_set.add(overlap_pair)
+
+        
+        if g_s == 1: # reversed alignment; swap the begin and end coordinates
+            g_b, g_e = g_e, g_b
+        
+        # build the string graph edges for each overlap
+        if f_b > 24:
+            if g_b < g_e:
+                """
+                     f.B         f.E
+                  f  ----------->
+                  g         ------------->
+                            g.B           g.E
+                """
+                if f_b == 0 or g_e - g_l == 0:
+                    continue
+                sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0), 
+                                                           length = abs(f_b-0),
+                                                           score = -score)
+                sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_e, g_l), 
+                                                           length = abs(g_e-g_l),
+                                                           score = -score)
+            else:
+                """
+                     f.B         f.E
+                  f  ----------->
+                  g         <-------------
+                            g.E           g.B           
+                """
+                if f_b == 0 or g_e == 0:
+                    continue
+                sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = "%s:%d-%d" % (f_id, f_b, 0), 
+                                                           length = abs(f_b -0),
+                                                           score = -score)
+                sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_e, 0), 
+                                                           length = abs(g_e- 0),
+                                                           score = -score)
+        else:
+            if g_b < g_e:
+                """
+                                    f.B         f.E
+                  f                 ----------->
+                  g         ------------->
+                            g.B           g.E
+                """
+                if g_b == 0 or f_e - f_l == 0:
+                    continue
+                sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = "%s:%d-%d" % (g_id, g_b, 0), 
+                                                           length = abs(g_b - 0),
+                                                           score = -score)
+                sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l), 
+                                                           length = abs(f_e-f_l),
+                                                           score = -score)
+            else:
+                """
+                                    f.B         f.E
+                  f                 ----------->
+                  g         <-------------
+                            g.E           g.B           
+                """
+                if g_b - g_l == 0 or f_e - f_l ==0:
+                    continue
+                sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = "%s:%d-%d" % (g_id, g_b, g_l), 
+                                                           length = abs(g_b - g_l),
+                                                           score = -score)
+                sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l), 
+                                                           length = abs(f_e - f_l),
+                                                           score = -score)
+
+
+    sg.init_reduce_dict()
+    #if not args.disable_chimer_prediction:
+    #    sg.mark_chimer_edge()
+    sg.mark_spur_edge()
+    sg.mark_tr_edges() # mark edges that are transitively redundant
+
+    #if DEBUG_LOG_LEVEL > 1:
+    if 1:
+        print sum( [1 for c in sg.e_reduce.values() if c == True] )
+        print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+    sg.mark_best_overlap() # mark those edges that are best overlap edges
+
+    if DEBUG_LOG_LEVEL > 1:
+        print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+
+    G = SGToNXG(sg)
+    nx.write_gexf(G, "string_graph.gexf") # output the raw string string graph for visuliation
+    nx.write_adjlist(G, "string_graph.adj") # write out the whole adjacent list of the string graph
+
+    u_edges = generate_unitig(sg, seqs, out_fn = "unitgs.fa") # reduce the string graph into a unitig graph
+    ASM_graph = get_bundles(u_edges )  # get the assembly
+    nx.write_gexf(ASM_graph, "asm_graph.gexf")
diff --git a/src/py_scripts_v0.1/falcon_dedup.py b/src/py_scripts_v0.1/falcon_dedup.py
new file mode 100644
index 0000000..cbf04aa
--- /dev/null
+++ b/src/py_scripts_v0.1/falcon_dedup.py
@@ -0,0 +1,119 @@
+import subprocess
+from pbcore.io import FastaReader
+
+def get_matches(seq0, seq1):
+    with open("tmp_seq0.fa","w") as f:
+        print >>f, ">seq0"
+        print >>f, seq0
+    with open("tmp_seq1.fa","w") as f:
+        print >>f, ">seq1"
+        print >>f, seq1
+    mgaps_out=subprocess.check_output("mummer -maxmatch -c -b -l 24 tmp_seq0.fa tmp_seq1.fa | mgaps ", stderr = open("/dev/null", "w"), shell=True)
+
+    matches = []
+    cluster = []
+    for l in mgaps_out.split("\n"):
+        l = l.strip().split()
+        if len(l) == 0:
+            continue
+        if l[0] == ">":
+            seq_id = l[1]
+            
+            if len(cluster) != 0:
+                matches.append(cluster)
+            
+            cluster = []
+            continue
+        if l[0] == "#":
+            if len(cluster) != 0:
+                matches.append(cluster)            
+            cluster = []
+            continue
+        len_ = int(l[2])
+        r_s = int(l[0])
+        q_s = int(l[1])
+        r_e = r_s + len_
+        q_e = q_s + len_
+        cluster.append( ((r_s, r_e), (q_s, q_e)) )
+    if len(cluster) != 0:
+        matches.append(cluster)
+    return matches
+
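+# get_matches shells out to the MUMmer programs "mummer" and "mgaps", which
+# must be available on PATH; each returned cluster is a list of
+# ((ref_start, ref_end), (query_start, query_end)) match tuples.
+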
+
+u_edges = {}
+with open("./unit_edges.dat") as f:
+    for l in f:
+        v, w, path, seq = l.strip().split()
+        u_edges.setdefault( (v, w), [] )
+        u_edges[ (v, w) ].append( (path, seq) )
+        
+
+p_tig_path = {}
+a_tig_path = {}
+with open("primary_tigs_paths_c") as f:
+    for l in f:
+        l = l.strip().split()
+        id_ = l[0][1:]
+        path = l[1:]
+        p_tig_path[id_] = path
+
+with open("all_tigs_paths") as f:
+    for l in f:
+        l = l.strip().split()
+        id_ = l[0][1:]
+        path = l[1:]
+        a_tig_path[id_] = path
+
+p_tig_seqs = {}
+for r in FastaReader("primary_tigs_c.fa"):
+    p_tig_seqs[r.name] = r.sequence
+
+a_tig_seqs = {}
+for r in FastaReader("all_tigs.fa"):
+    a_tig_seqs[r.name.split()[0]] = r.sequence
+
+p_tig_to_node_pos = {}
+node_pos = []
+with open("primary_tigs_node_pos_c") as f:
+    for l in f:
+        l = l.strip().split()
+        p_tig_to_node_pos.setdefault( l[0], [])
+        p_tig_to_node_pos[l[0]].append( (l[1], int(l[2])))
+
+duplicate_a_tigs = []
+with open("a_nodup.fa","w") as out_f:
+    for p_tig_id in p_tig_path:
+        main_path = p_tig_path[p_tig_id]
+        main_path_nodes = set(main_path[:])
+        p_tig_seq = p_tig_seqs[p_tig_id]
+        a_node = []
+        a_node_range = []
+        a_node_range_map = {}
+        node_to_pos = dict( p_tig_to_node_pos[p_tig_id] )
+        for id_ in a_tig_path:
+            if id_[:4] != p_tig_id[:4]:
+                continue
+            if id_.split("-")[1] == "0000":
+                continue
+            
+            a_path = a_tig_path[id_]
+            if a_path[0] in main_path_nodes and a_path[-1] in main_path_nodes:
+                #print p_tig_id, id_, a_path[0], a_path[-1]
+                s, e = node_to_pos[a_path[0]], node_to_pos[a_path[-1]]
+                p_seq = p_tig_seq[s:e]
+                a_seq = a_tig_seqs[id_] 
+                seq_match = get_matches(p_seq, a_seq)
+                if len(seq_match) > 1:
+                    print >>out_f, ">"+id_
+                    print >>out_f,  a_seq
+                    continue
+                try:
+                    r_s, r_e = seq_match[0][0][0][0], seq_match[0][-1][0][1]
+                except IndexError:
+                    # no match cluster found; skip rather than use undefined coordinates below
+                    print "XXX", seq_match
+                    continue
+                if 1.0* (r_e - r_s) / (e - s) > 98:
+                    print >>out_f, ">"+id_
+                    print >>out_f, a_seq
+                    continue
+                duplicate_a_tigs.append(id_)
+
diff --git a/src/py_scripts_v0.1/falcon_fixasm.py b/src/py_scripts_v0.1/falcon_fixasm.py
new file mode 100644
index 0000000..9475cef
--- /dev/null
+++ b/src/py_scripts_v0.1/falcon_fixasm.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+import networkx as nx
+from pbcore.io import FastaReader
+
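+# neighbor_bound() returns True when the radius-bounded ego graphs around v and w share an
+# edge (or either is smaller than the radius), i.e. the two branches reconverge locally;
+# is_branch_node() flags a node as a real branch point when some pair of its out-neighbors
+# does not reconverge.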
+def neighbor_bound(G, v, w, radius):
+    g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
+    g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
+    if len(g1) < radius or len(g2) < radius:
+        return True
+    print v, len(g1), w, len(g2), radius
+    if len(set(g1.edges()) & set(g2.edges())) > 0:
+        return True
+    else:
+        return False
+    
+def is_branch_node(G, n):
+    out_edges = G.out_edges([n])
+    n2 = [ e[1] for e in out_edges ]
+    is_branch = False
+    for i in range(len(n2)):
+        for j in range(i+1, len(n2)):
+            v = n2[i]
+            w = n2[j]
+            if neighbor_bound(G, v, w, 20) == False:
+                is_branch = True
+                break
+        if is_branch == True:
+            break
+    return is_branch
+
+
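+# Convert a unitig path (list of node names) into a read-level tiling path: for each
+# consecutive node pair, look up the read edge label "read_id:start-end" and record the
+# cumulative position along the contig.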
+def get_r_path(r_edges, u_path):
+    tiling_path = []
+    pos = 0
+     
+    for i in range( len(u_path) - 1): 
+        v, w = u_path[i:i+2]
+        r_edge_label, overlap = r_edges[ (v, w) ]
+        r_edge_seq_id, range_ = r_edge_label.split(":")
+        range_ = range_.split("-")
+        s, e = int(range_[0]), int(range_[1])
+        pos += abs(e-s)
+        tiling_path.append( (pos, w, s, e) )
+    return tiling_path
+
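+# Build the contig sequence for a path: for each consecutive node pair, pick the unitig
+# edge with the longest path, concatenate its sequence, and accumulate the node positions
+# plus the full read-level tiling path.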
+def get_seq(u_edges, r_edges, path):
+    subseqs = []
+    pos = []
+    cur_pos = 0
+    full_tiling_path = []
+
+    for i in range( len(path) - 1):
+        v, w = path[i:i+2]
+        pos.append( (v, cur_pos) )
+        uedges = u_edges[ (v, w) ]
+        uedges.sort( key= lambda x: len(x[0]) )
+        subseqs.append( uedges[-1][1] )
+        r_path = get_r_path( r_edges, uedges[-1][0].split("-") )
+        r_path = [ ( x[0] + cur_pos, x[1], x[2], x[3]) for x in r_path ]
+        full_tiling_path.extend( r_path )
+        cur_pos += len( uedges[-1][1] )
+    pos.append( (w, cur_pos) ) 
+    return "".join(subseqs), pos, full_tiling_path
+
+
+u_edges = {}
+with open("unit_edges.dat") as f:
+    for l in f:
+        v, w, path, seq = l.strip().split()
+        u_edges.setdefault( (v, w), [] )
+        u_edges[ (v, w) ].append( (path, seq) )
+len(u_edges)
+
+
+r_edges = {}
+with open("edges_list") as f:
+    for l in f:
+        v, w, edge_label, overlap = l.strip().split()
+        r_edges[ (v, w) ] = (edge_label, int(overlap) ) 
+
+
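+# Read the primary contig paths and build a node-level directed graph of consecutive path
+# nodes (paths shorter than three nodes are skipped); also record which contigs begin and
+# end at each node.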
+primary_tigs_path = {}
+primary_path_graph = nx.DiGraph()
+begin_nodes = {}
+end_nodes ={}
+with open("primary_tigs_paths") as f:
+    for l in f:
+        l = l.strip().split()
+        name = l[0][1:]
+        path = l[1:]
+        primary_tigs_path[name] = path
+        if len(path) < 3:
+            continue
+        for i in range(len(path)-1):
+            n1 = path[i].split(":")[0]
+            n2 = path[i+1].split(":")[0]
+            primary_path_graph.add_edge( n1, n2)
+        begin_nodes.setdefault(path[0], [])
+        begin_nodes[path[0]].append( name )
+        end_nodes.setdefault(path[-1], [])
+        end_nodes[path[-1]].append( name )
+
+
+
+path_names = primary_tigs_path.keys()
+path_names.sort()
+primary_path_graph_r = primary_path_graph.reverse()
+path_f = open("primary_tigs_paths_c","w")
+pos_f = open("primary_tigs_node_pos_c", "w")
+tiling_path_f = open("all_tiling_path_c", "w")
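+# Walk each primary contig path and break it at branch nodes (judged on the node-level
+# graph and its reverse); every resulting sub-path is emitted as a new contig record:
+# sequence, path, node positions, and read tiling path.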
+with open("primary_tigs_c.fa","w") as out_f:
+    for name in path_names:
+        sub_idx = 0
+        c_path = [ primary_tigs_path[name][0] ]
+        for v in primary_tigs_path[name][1:]:
+            break_path = False
+            
+            vn = v.split(":")[0]
+
+            if primary_path_graph.out_degree(vn) > 1:
+                break_path = is_branch_node(primary_path_graph, vn)
+            if primary_path_graph.in_degree(vn) > 1:
+                break_path = is_branch_node(primary_path_graph_r, vn)
+            if break_path:
+                c_path.append(v)
+                seq, pos, full_tiling_path = get_seq(u_edges, r_edges, c_path)
+                for p, w, s, e in full_tiling_path:
+                    print >> tiling_path_f, "%s_%02d" % (name, sub_idx), p, w, s, e
+                #if len(full_tiling_path) <= 5:
+                #    continue
+                print >>out_f, ">%s_%02d" % (name, sub_idx)
+                print >>out_f, seq
+                print >>path_f, ">%s_%02d" % (name, sub_idx), " ".join(c_path)
+                #print c_path
+                for node, p in pos:
+                    print >> pos_f, "%s_%02d %s %d" % (name, sub_idx, node, p)
+                c_path = [v]
+                sub_idx += 1
+            else:
+                c_path.append(v)
+                
+        if len(c_path) > 1:
+            seq, pos, full_tiling_path = get_seq(u_edges, r_edges, c_path)
+            for p, w, s, e in full_tiling_path:
+                print >> tiling_path_f, "%s_%02d" % (name, sub_idx), p, w, s, e
+            if len(full_tiling_path) <= 5:
+                continue
+            print >>out_f, ">%s_%02d" % (name, sub_idx)
+            print >>out_f, seq
+            print >>path_f, ">%s_%02d" % (name, sub_idx), " ".join(c_path)
+            for node, p in pos:
+                print >> pos_f, "%s_%02d %s %d" % (name, sub_idx, node, p)
+
+with open("all_tigs_paths") as f:
+    for l in f:
+        l = l.strip().split()
+        name = l[0][1:]
+        name = name.split("-")
+        if name[1] == "0000":
+            continue
+        if len(name) == 2:
+            path = l[1:]
+            seq, pos, full_tiling_path = get_seq(u_edges, r_edges, path)
+            for p, w, s, e in full_tiling_path:
+                print >> tiling_path_f, "%s" % ("-".join(name)), p, w, s, e
+        else:
+            path = l[1:]
+            full_tiling_path = get_r_path(r_edges, path)
+            for p, w, s, e in full_tiling_path:
+                print >> tiling_path_f, "%s" % ("-".join(name)), p, w, s, e
+
+            
+path_f.close()
+tiling_path_f.close()
+pos_f.close()
diff --git a/src/py_scripts_v0.1/falcon_overlap.py b/src/py_scripts_v0.1/falcon_overlap.py
new file mode 100755
index 0000000..c6ae2a5
--- /dev/null
+++ b/src/py_scripts_v0.1/falcon_overlap.py
@@ -0,0 +1,328 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from falcon_kit import * 
+from pbcore.io import FastaReader
+import numpy as np
+import collections
+import sys
+import multiprocessing as mp
+from multiprocessing import sharedctypes
+from ctypes import *
+
+global sa_ptr, sda_ptr, lk_ptr
+global q_seqs, seqs
+RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
+
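+# Rough overlap detection between two reads: index seq0 with an 8-mer lookup, find the
+# best k-mer match range for seq1, classify the geometry (contains/contained/overlap),
+# and, when it looks like a real overlap, run the banded alignment to confirm and score it.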
+def get_ovelap_alignment(seq1, seq0):
+
+    K = 8
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
+    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
+    aln_range = aln_range_ptr[0]
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s0, e0 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2  
+    e1 += K + K/2
+    e0 += K + K/2
+    kup.free_aln_range(aln_range)
+    len_1 = len(seq1)
+    len_0 = len(seq0)
+    if e1 > len_1: 
+        e1 = len_1
+    if e0 > len_0:
+        e0 = len_0
+    do_aln = False
+    contain_status = "none" 
+    #print s0, e0, s1, e1 
+    if e1 - s1 > 500:
+        if s0 < s1 and s0 > 24:
+            do_aln = False
+        elif s1 <= s0 and s1 > 24:
+            do_aln = False
+        elif s1 < 24 and len_1 - e1 < 24:
+            do_aln = True
+            contain_status = "contains"
+            #print "X1"
+        elif s0 < 24 and len_0 - e0 < 24:
+            do_aln = True
+            contain_status = "contained"
+            #print "X2"
+        else:
+            do_aln = True
+            if s0 < s1:
+                s1 -= s0 #assert s1 > 0
+                s0 = 0
+                e1 = len_1
+                #if len_1 - s1 >= len_0:
+                #    do_aln = False
+                #    contain_status = "contains"
+                #    print "X3", s0, e0, len_0, s1, e1, len_1
+
+                
+            elif s1 <= s0:
+                s0 -= s1 #assert s1 > 0
+                s1 = 0
+                e0 = len_0
+                #print s0, e0, s1, e1
+                #if len_0 - s0 >= len_1:
+                #    do_aln = False
+                #    contain_status = "contained"
+                #    print "X4"
+        #if abs( (e1 - s1) - (e0 - s0 ) ) > 200:  #avoid overlap alignment for big indels
+        #    do_aln = False
+
+        if do_aln:
+            alignment = DWA.align(seq1[s1:e1], e1-s1,
+                                  seq0[s0:e0], e0-s0,
+                                  500, 0)
+            #print seq1[s1:e1]
+            #print seq0[s2:e2]
+            #if alignment[0].aln_str_size > 500:
+    
+            #aln_str1 = alignment[0].q_aln_str
+            #aln_str0 = alignment[0].t_aln_str
+            aln_size = alignment[0].aln_str_size
+            aln_dist = alignment[0].dist
+            aln_q_s = alignment[0].aln_q_s
+            aln_q_e = alignment[0].aln_q_e
+            aln_t_s = alignment[0].aln_t_s
+            aln_t_e = alignment[0].aln_t_e
+            assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
+            #print aln_str1
+            #print aln_str0
+            if aln_size > 500 and contain_status == "none": 
+                contain_status = "overlap"            
+            DWA.free_alignment(alignment)
+        
+    kup.free_seq_addr_array(sda_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+
+    if do_aln:
+        if s1 > 1000 and s0 > 1000:
+            return 0, 0, 0, 0, 0, 0, "none"
+        if len_1 - (s1+aln_q_e-aln_q_s) > 1000 and len_0 - (s0+aln_t_e-aln_t_s) > 1000:
+            return 0, 0, 0, 0, 0, 0, "none"
+
+
+
+
+    if e1 - s1 > 500 and do_aln and aln_size > 500:
+        #return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist, x, y
+        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_dist, contain_status
+    else:
+        return 0, 0, 0, 0, 0, 0, contain_status 
+
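+# For one query read, take the k-mer hit indices (forward and reverse complement), keep
+# target chunks with more than 50 hits, and run the detailed overlap alignment against
+# each distinct target read; results are returned as tuples describing each candidate
+# overlap.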
+def get_candidate_aln(hit_input):
+
+    global q_seqs
+    q_name, hit_index_f, hit_index_r = hit_input
+    q_seq = q_seqs[q_name]
+
+    rtn = []
+
+    hit_index = hit_index_f 
+    c = collections.Counter(hit_index)
+    s = [c[0] for c in c.items() if c[1] >50]
+    #s.sort()
+    targets = set()
+    for p in s:
+        hit_id = seqs[p][0]
+        if hit_id in targets or hit_id == q_name:
+            continue
+        targets.add(hit_id)
+        seq1, seq0 = q_seq, q_seqs[hit_id]
+        aln_data = get_ovelap_alignment(seq1, seq0)
+        #rtn = get_alignment(seq1, seq0)
+        if rtn != None:
+            
+            s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
+            #print >>f, name, 0, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
+            rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)), 
+                          0, s2, e2, len(seq0), 
+                          0, s1, e1, len(seq1), c_status ) )
+
+    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
+    
+    hit_index = hit_index_r 
+    c = collections.Counter(hit_index)
+    s = [c[0] for c in c.items() if c[1] >50]
+    #s.sort()
+    targets = set()
+    for p in s:
+        hit_id = seqs[p][0]
+        if hit_id in targets or hit_id == q_name:
+            continue
+        targets.add(hit_id)
+        seq1, seq0 = r_q_seq, q_seqs[hit_id]
+        aln_data = get_ovelap_alignment(seq1, seq0)
+        #rtn = get_alignment(seq1, seq0)
+        if rtn != None:
+            s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
+            #print >>f, name, 1, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
+            rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)), 
+                          0, s2, e2, len(seq0), 
+                          1, len(seq1) - e1, len(seq1)- s1, len(seq1), c_status ) )
+
+    return rtn
+
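+# Build the shared k-mer index: the read chunks are packed into shared-memory arrays
+# (1000 bases per chunk) so worker processes can query the index without copying it;
+# k-mers occurring more than 512 times are masked as repeats.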
+def build_look_up(seqs, K):
+    global sa_ptr, sda_ptr, lk_ptr
+
+    total_index_base = len(seqs) * 1000
+    sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
+    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
+    kup.init_seq_array(c_sa_ptr, total_index_base)
+
+    sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
+    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
+
+    lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
+    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
+    kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
+
+    start = 0
+    for r_name, seq in seqs:
+        kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
+        start += 1000
+
+    kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 512)
+    
+    #return sda_ptr, sa_ptr, lk_ptr
+
+
+
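+# Map a query read (and its reverse complement) against the shared index; dividing each
+# k-mer hit's target position by 1000 recovers the index of the 1 kb chunk it hit.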
+def get_candidate_hits(q_name):
+
+    global sa_ptr, sda_ptr, lk_ptr
+    global q_seqs
+
+    K = 14
+    q_seq = q_seqs[q_name]
+
+    rtn = []
+
+    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
+    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
+    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    count = kmer_match.count
+    hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
+    kup.free_kmer_match(kmer_match_ptr)
+
+    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
+    
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    count = kmer_match.count
+    hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
+    kup.free_kmer_match(kmer_match_ptr)
+    return  q_name, hit_index_f, hit_index_r
+
+
+def q_names( q_seqs ):
+    for q_name, q_seq in q_seqs.items():
+        yield q_name
+
+
+def lookup_data_iterator( q_seqs, m_pool ):
+    for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
+        yield mr
+
+
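+# Example invocation (the file names are illustrative, not part of this repository):
+#   python falcon_overlap.py reads.fa --min_len 4000 --n_core 8 --d_core 4 > overlaps.txt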
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
+    parser.add_argument('fasta_file', help='a fasta file; all reads in it are overlapped pairwise against each other')
+    parser.add_argument('--min_len', type=int, default=4000, 
+                        help='minimum length of the reads to be considered for overlapping')
+    parser.add_argument('--n_core', type=int, default=1,
+                        help='number of processes used for detailed overlapping evaluation')
+    parser.add_argument('--d_core', type=int, default=1, 
+                        help='number of processes used for k-mer matching')
+
+
+    args = parser.parse_args()
+
+    seqs = []
+    q_seqs = {}
+    f = FastaReader(args.fasta_file) # read the fasta file given on the command line
+
+    if args.min_len < 2200:
+        args.min_len = 2200
+
+    idx = 0
+    for r in f:
+        if len(r.sequence) < args.min_len:
+            continue
+        seq = r.sequence.upper()
+        for start in range(0, len(seq), 1000):
+            if start+1000 > len(seq):
+                break
+            seqs.append( (r.name, seq[start: start+1000]) )
+            idx += 1
+        
+        #seqs.append( (r.name, seq[:1000]) )
+        seqs.append( (r.name, seq[-1000:]) )
+        idx += 1
+
+        q_seqs[r.name] = seq
+
+
+    total_index_base = len(seqs) * 1000
+    pool = mp.Pool(args.n_core)
+    K = 14
+    build_look_up(seqs, K)
+    m_pool = mp.Pool(args.d_core)
+
+    
+    #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
+    for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs, m_pool)):
+        for h in r:
+            print " ".join([str(x) for x in h]) 
+
diff --git a/src/py_scripts_v0.1/falcon_overlap2.py b/src/py_scripts_v0.1/falcon_overlap2.py
new file mode 100755
index 0000000..9ffbf56
--- /dev/null
+++ b/src/py_scripts_v0.1/falcon_overlap2.py
@@ -0,0 +1,337 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from falcon_kit import * 
+from pbcore.io import FastaReader
+import numpy as np
+import collections
+import sys
+import multiprocessing as mp
+from multiprocessing import sharedctypes
+from ctypes import *
+
+global sa_ptr, sda_ptr, lk_ptr
+global q_seqs,t_seqs, seqs
+RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
+
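+# This script is a variant of falcon_overlap.py that overlaps a query fasta against a
+# separate target fasta (rather than all-vs-all within one file); the indexing and
+# alignment code is largely shared, with a lower repeat-masking cutoff (256) and hits
+# classified as "none" dropped from the output.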
+def get_ovelap_alignment(seq1, seq0):
+
+    K = 8
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
+    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
+    aln_range = aln_range_ptr[0]
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s0, e0 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2  
+    e1 += K + K/2
+    e0 += K + K/2
+    kup.free_aln_range(aln_range)
+    len_1 = len(seq1)
+    len_0 = len(seq0)
+    if e1 > len_1: 
+        e1 = len_1
+    if e0 > len_0:
+        e0 = len_0
+    do_aln = False
+    contain_status = "none" 
+    #print s0, e0, s1, e1 
+    if e1 - s1 > 500:
+        if s0 < s1 and s0 > 24:
+            do_aln = False
+        elif s1 <= s0 and s1 > 24:
+            do_aln = False
+        elif s1 < 24 and len_1 - e1 < 24:
+            do_aln = True
+            contain_status = "contains"
+            #print "X1"
+        elif s0 < 24 and len_0 - e0 < 24:
+            do_aln = True
+            contain_status = "contained"
+            #print "X2"
+        else:
+            do_aln = True
+            if s0 < s1:
+                s1 -= s0 #assert s1 > 0
+                s0 = 0
+                e1 = len_1
+                #if len_1 - s1 >= len_0:
+                #    do_aln = False
+                #    contain_status = "contains"
+                #    print "X3", s0, e0, len_0, s1, e1, len_1
+
+                
+            elif s1 <= s0:
+                s0 -= s1 #assert s1 > 0
+                s1 = 0
+                e0 = len_0
+                #print s0, e0, s1, e1
+                #if len_0 - s0 >= len_1:
+                #    do_aln = False
+                #    contain_status = "contained"
+                #    print "X4"
+        #if abs( (e1 - s1) - (e0 - s0 ) ) > 200:  #avoid overlap alignment for big indels
+        #    do_aln = False
+
+        if do_aln:
+            alignment = DWA.align(seq1[s1:e1], e1-s1,
+                                  seq0[s0:e0], e0-s0,
+                                  500, 0)
+            #print seq1[s1:e1]
+            #print seq0[s2:e2]
+            #if alignment[0].aln_str_size > 500:
+    
+            #aln_str1 = alignment[0].q_aln_str
+            #aln_str0 = alignment[0].t_aln_str
+            aln_size = alignment[0].aln_str_size
+            aln_dist = alignment[0].dist
+            aln_q_s = alignment[0].aln_q_s
+            aln_q_e = alignment[0].aln_q_e
+            aln_t_s = alignment[0].aln_t_s
+            aln_t_e = alignment[0].aln_t_e
+            assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
+            #print aln_str1
+            #print aln_str0
+            if aln_size > 500 and contain_status == "none": 
+                contain_status = "overlap"            
+            DWA.free_alignment(alignment)
+        
+    kup.free_seq_addr_array(sda_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+
+    if do_aln:
+        if s1 > 1000 and s0 > 1000:
+            return 0, 0, 0, 0, 0, 0, "none"
+        if len_1 - (s1+aln_q_e-aln_q_s) > 1000 and len_0 - (s0+aln_t_e-aln_t_s) > 1000:
+            return 0, 0, 0, 0, 0, 0, "none"
+
+    if e1 - s1 > 500 and do_aln and aln_size > 500:
+        #return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist, x, y
+        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_dist, contain_status
+    else:
+        return 0, 0, 0, 0, 0, 0, contain_status 
+
+def get_candidate_aln(hit_input):
+
+    global q_seqs, seqs, t_seqs
+    q_name, hit_index_f, hit_index_r = hit_input
+    q_seq = q_seqs[q_name]
+
+    rtn = []
+
+    hit_index = hit_index_f 
+    c = collections.Counter(hit_index)
+    s = [c[0] for c in c.items() if c[1] >50]
+    #s.sort()
+    targets = set()
+    for p in s:
+        hit_id = seqs[p][0]
+        if hit_id in targets or hit_id == q_name:
+            continue
+        targets.add(hit_id)
+        seq1, seq0 = q_seq, t_seqs[hit_id]
+        aln_data = get_ovelap_alignment(seq1, seq0)
+        #rtn = get_alignment(seq1, seq0)
+        if rtn != None:
+             
+            s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
+            if c_status == "none":
+                continue
+            #print >>f, name, 0, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
+            rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)), 
+                          0, s2, e2, len(seq0), 
+                          0, s1, e1, len(seq1), c_status ) )
+
+    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
+    
+    hit_index = hit_index_r 
+    c = collections.Counter(hit_index)
+    s = [c[0] for c in c.items() if c[1] >50]
+    #s.sort()
+    targets = set()
+    for p in s:
+        hit_id = seqs[p][0]
+        if hit_id in targets or hit_id == q_name:
+            continue
+        targets.add(hit_id)
+        seq1, seq0 = r_q_seq, t_seqs[hit_id]
+        aln_data = get_ovelap_alignment(seq1, seq0)
+        #rtn = get_alignment(seq1, seq0)
+        if rtn != None:
+            s1, e1, s2, e2, aln_size, aln_dist, c_status = aln_data
+            if c_status == "none":
+                continue
+            #print >>f, name, 1, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
+            rtn.append( ( hit_id, q_name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)), 
+                          0, s2, e2, len(seq0), 
+                          1, len(seq1) - e1, len(seq1)- s1, len(seq1), c_status ) )
+
+    return rtn
+
+def build_look_up(seqs, K):
+    global sa_ptr, sda_ptr, lk_ptr
+
+    total_index_base = len(seqs) * 1000
+    sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
+    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
+    kup.init_seq_array(c_sa_ptr, total_index_base)
+
+    sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
+    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
+
+    lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
+    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
+    kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
+
+    start = 0
+    for r_name, seq in seqs:
+        kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
+        start += 1000
+
+    kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 256)
+    
+    #return sda_ptr, sa_ptr, lk_ptr
+
+
+
+def get_candidate_hits(q_name):
+
+    global sa_ptr, sda_ptr, lk_ptr
+    global q_seqs
+
+    K = 14
+    q_seq = q_seqs[q_name]
+
+    rtn = []
+
+    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
+    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
+    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    count = kmer_match.count
+    hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
+    kup.free_kmer_match(kmer_match_ptr)
+
+    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
+    
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    count = kmer_match.count
+    hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
+    kup.free_kmer_match(kmer_match_ptr)
+    return  q_name, hit_index_f, hit_index_r
+
+
+def q_names( q_seqs ):
+    for q_name, q_seq in q_seqs.items():
+        yield q_name
+
+
+def lookup_data_iterator( q_seqs, m_pool ):
+    for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
+        yield mr
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
+    parser.add_argument('query_fa', help='a fasta file whose reads are overlapped against the target sequences')
+    parser.add_argument('target_fa', help='a fasta file providing the target sequences for overlapping')
+    parser.add_argument('--min_len', type=int, default=4000, 
+                        help='minimum length of the reads to be considered for overlapping')
+    parser.add_argument('--n_core', type=int, default=1,
+                        help='number of processes used for detailed overlapping evaluation')
+    parser.add_argument('--d_core', type=int, default=1, 
+                        help='number of processes used for k-mer matching')
+
+
+    args = parser.parse_args()
+
+    seqs = []
+    q_seqs = {}
+    t_seqs = {}
+    f = FastaReader(args.target_fa) # read the target fasta file given on the command line
+
+    if args.min_len < 2200:
+        args.min_len = 2200
+
+    idx = 0
+    for r in f:
+        if len(r.sequence) < args.min_len:
+            continue
+        seq = r.sequence.upper()
+        for start in range(0, len(seq), 1000):
+            if start+1000 > len(seq):
+                break
+            seqs.append( (r.name, seq[start: start+1000]) )
+            idx += 1
+        
+        seqs.append( (r.name, seq[-1000:]) )
+        idx += 1
+
+        t_seqs[r.name] = seq
+
+    f = FastaReader(args.query_fa) # read the query fasta file given on the command line
+    for r in f:
+        if len(r.sequence) < args.min_len:
+            continue
+        seq = r.sequence.upper()
+        q_seqs[r.name] = seq
+
+
+    total_index_base = len(seqs) * 1000
+    pool = mp.Pool(args.n_core)
+    K = 14
+    build_look_up(seqs, K)
+    m_pool = mp.Pool(args.d_core)
+
+    
+    #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
+    for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs, m_pool)):
+        for h in r:
+            print " ".join([str(x) for x in h]) 
+
diff --git a/src/py_scripts_v0.1/falcon_qrm.py b/src/py_scripts_v0.1/falcon_qrm.py
new file mode 100755
index 0000000..5196b65
--- /dev/null
+++ b/src/py_scripts_v0.1/falcon_qrm.py
@@ -0,0 +1,370 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from falcon_kit import * 
+from pbcore.io import FastaReader
+import numpy as np
+import collections
+import sys
+import multiprocessing as mp
+from multiprocessing import sharedctypes
+from ctypes import *
+import math
+
+global sa_ptr, sda_ptr, lk_ptr
+global q_seqs,t_seqs, seqs
+global n_candidates, max_candidates
+
+seqs = []
+RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
+
+all_fivemers = []
+cmap = {0:"A", 1:"T", 2:"C", 3:"G"}
+for i in range(1024):
+    mer = []
+    for j in range(5):
+        mer.append( cmap[ i >> (2 *j) & 3 ])
+    all_fivemers.append("".join(mer))
+
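+# Smoothed 5-mer entropy of a sequence (add-one counts over all 1024 possible 5-mers);
+# used below, apparently as a low-complexity screen (the filters are commented out here).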
+def fivemer_entropy(seq):
+    five_mer_count = {}
+
+    for i in range(len(seq)-5):
+        five_mer = seq[i:i+5]
+        five_mer_count.setdefault(five_mer, 0)
+        five_mer_count[five_mer] += 1
+    
+    entropy = 0.0
+    for five_mer in all_fivemers:
+        p = five_mer_count.get(five_mer, 0) + 1.0
+        p /= len(seq)
+        entropy += - p * math.log(p)
+
+    return entropy
+
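+# Quick match scoring for read-to-read mapping: instead of running the banded aligner,
+# estimate the alignment extent from the best k-mer match range and derive a score from
+# the k-mer match score (km_score * 48); ranges under 500 bases, or candidates with more
+# than 1 kb of unmatched sequence at the start of both reads or at the end of both reads,
+# are rejected.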
+def get_alignment(seq1, seq0):
+
+    K = 8 
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+
+    kup.mask_k_mer(1 << (K * 2), lk_ptr, 16)
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range2(kmer_match_ptr, K, K*50, 25)
+    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
+    aln_range = aln_range_ptr[0]
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s0, e0, km_score = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2, aln_range.score  
+    e1 += K + K/2
+    e0 += K + K/2
+    kup.free_aln_range(aln_range)
+    len_1 = len(seq1)
+    len_0 = len(seq0)
+    if e1 > len_1: 
+        e1 = len_1
+    if e0 > len_0:
+        e0 = len_0
+
+    aln_size = 1
+    if e1 - s1 > 500:
+
+        aln_size = max( e1-s1, e0-s0 )
+        aln_score = int(km_score * 48)
+        aln_q_s = s1
+        aln_q_e = e1
+        aln_t_s = s0
+        aln_t_e = e0
+        
+    kup.free_seq_addr_array(sda_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+
+    if s1 > 1000 and s0 > 1000:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+    if len_1 - e1 > 1000 and len_0 - e0 > 1000:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+
+    if e1 - s1 > 500 and aln_size > 500:
+        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_score, "aln"
+    else:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+def get_candidate_aln(hit_input):
+    
+    global q_seqs, seqs, t_seqs, q_len
+    global max_candidates
+    global n_candidates
+    q_name, hit_index_f, hit_index_r = hit_input
+    q_seq = q_seqs[q_name]
+
+    rtn = []
+    hit_index = hit_index_f
+    c = collections.Counter(hit_index)
+    s = [(c[0],c[1]) for c in c.items() if c[1] > 4]
+    
+    hit_data = {}
+    #hit_ids = set()
+
+    for p, hit_count in s:
+        hit_id = seqs[p][0]
+        hit_data.setdefault(hit_id, [0, 0 ,0])
+        hit_data[hit_id][0] += hit_count;
+        if hit_count > hit_data[hit_id][1]:
+            hit_data[hit_id][1] = hit_count
+        hit_data[hit_id][2] += 1
+
+    hit_data = hit_data.items()
+
+    hit_data.sort( key=lambda x:-x[1][0] )
+
+    target_count = {}
+    total_hit = 0
+
+    for hit in hit_data[:n_candidates]:
+        hit_id = hit[0]
+        hit_count = hit[1][0]
+        target_count.setdefault(hit_id, 0)
+        if target_count[hit_id] > max_candidates:
+            continue
+        if total_hit > max_candidates:
+            continue
+        seq1, seq0 = q_seq, t_seqs[hit_id]
+        aln_data = get_alignment(seq1, seq0)
+        if rtn != None:
+             
+            s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
+            if c_status == "none":
+                continue
+            target_count[hit_id] += 1
+            total_hit += 1
+            rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)), 
+                          0, s1, e1, len(seq1), 
+                          0, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
+
+    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
+    
+    hit_index = hit_index_r 
+    c = collections.Counter(hit_index)
+    s = [(c[0],c[1]) for c in c.items() if c[1] > 4]
+
+    hit_data = {}
+    #hit_ids = set()
+
+    for p, hit_count in s:
+        hit_id = seqs[p][0]
+        hit_data.setdefault(hit_id, [0, 0 ,0])
+        hit_data[hit_id][0] += hit_count;
+        if hit_count > hit_data[hit_id][1]:
+            hit_data[hit_id][1] = hit_count
+        hit_data[hit_id][2] += 1
+
+    hit_data = hit_data.items()
+
+    hit_data.sort( key=lambda x:-x[1][0] )
+
+
+    target_count = {}
+    total_hit = 0
+
+    for hit in hit_data[:n_candidates]:
+        hit_id = hit[0] 
+        hit_count = hit[1][0]
+        target_count.setdefault(hit_id, 0)
+        if target_count[hit_id] > max_candidates:
+            continue
+        if total_hit > max_candidates:
+            continue
+        seq1, seq0 = r_q_seq, t_seqs[hit_id]
+        aln_data = get_alignment(seq1, seq0)
+        if rtn != None:
+            s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
+            if c_status == "none":
+                continue
+            target_count[hit_id] += 1
+            total_hit += 1
+            rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)), 
+                          0, len(seq1) - e1, len(seq1) - s1, len(seq1), 
+                          1, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
+
+    return rtn
+
+def build_look_up(seqs, K):
+    global sa_ptr, sda_ptr, lk_ptr
+
+    total_index_base = len(seqs) * 1000
+    sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
+    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
+    kup.init_seq_array(c_sa_ptr, total_index_base)
+
+    sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
+    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
+
+    lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
+    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
+    kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
+
+    start = 0
+    for r_name, seq in seqs:
+        kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
+        start += 1000
+
+    kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 1024)
+    
+    #return sda_ptr, sa_ptr, lk_ptr
+
+
+
+def get_candidate_hits(q_name):
+
+    global sa_ptr, sda_ptr, lk_ptr
+    global q_seqs
+
+    K = 14
+    q_seq = q_seqs[q_name]
+
+    rtn = []
+
+    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
+    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
+    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    count = kmer_match.count
+    hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
+    kup.free_kmer_match(kmer_match_ptr)
+
+    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
+    
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    count = kmer_match.count
+    hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
+    kup.free_kmer_match(kmer_match_ptr)
+    return  q_name, hit_index_f, hit_index_r
+
+
+def q_names( q_seqs ):
+    for q_name, q_seq in q_seqs.items():
+        yield q_name
+
+
+def lookup_data_iterator( q_seqs, m_pool ):
+    for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
+        yield mr
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
+    parser.add_argument('target_fofn', help='a fofn (file of fasta file names) providing the target sequences for overlapping')
+    parser.add_argument('query_fofn', help='a fofn of fasta files whose reads are matched against the target sequences')
+    parser.add_argument('--min_len', type=int, default=4000, 
+                        help='minimum length of the reads to be considered for overlapping')
+    parser.add_argument('--n_core', type=int, default=1,
+                        help='number of processes used for detailed overlapping evaluation')
+    parser.add_argument('--d_core', type=int, default=1, 
+                        help='number of processes used for k-mer matching')
+    parser.add_argument('--n_candidates', type=int, default=128,
+                        help='number of top candidate targets considered for each read')
+    parser.add_argument('--max_candidates', type=int, default=64,
+                        help='maximum number of matches reported for each read')
+
+
+
+    args = parser.parse_args()
+
+    max_candidates = args.max_candidates
+    n_candidates = args.n_candidates
+
+    q_seqs = {}
+    t_seqs = {}
+    if args.min_len < 1200:
+        args.min_len = 1200
+
+    with open(args.target_fofn) as fofn:
+        for fn in fofn:
+            fn = fn.strip()
+            f = FastaReader(fn) # open each fasta file listed in the fofn
+            for r in f:
+                if len(r.sequence) < args.min_len:
+                    continue
+                seq = r.sequence.upper()
+                for start in range(0, len(seq), 1000):
+                    if start+1000 > len(seq):
+                        break
+                    subseq = seq[start: start+1000]
+                    #if fivemer_entropy(subseq) < 4:
+                    #    continue
+                    seqs.append( (r.name, subseq) )
+                subseq = seq[-1000:]
+                #if fivemer_entropy(subseq) < 4:
+                #    continue
+                #seqs.append( (r.name, seq[:1000]) )
+                seqs.append( (r.name, subseq) )
+
+                t_seqs[r.name] = seq
+
+    with open(args.query_fofn) as fofn:
+        for fn in fofn:
+            fn = fn.strip()
+            f = FastaReader(fn) # open each fasta file listed in the fofn
+            for r in f:
+                seq = r.sequence.upper()
+                #if fivemer_entropy(seq) < 4:
+                #    continue
+                q_seqs[r.name] = seq
+
+
+    pool = mp.Pool(args.n_core)
+    K = 14
+    build_look_up(seqs, K)
+    m_pool = mp.Pool(args.d_core)
+
+    
+    #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
+    for r in pool.imap(get_candidate_aln, lookup_data_iterator(q_seqs, m_pool)):
+        for h in r:
+            print " ".join([str(x) for x in h]) 
+
diff --git a/src/py_scripts_v0.1/falcon_qrm_0.py b/src/py_scripts_v0.1/falcon_qrm_0.py
new file mode 100755
index 0000000..c07496f
--- /dev/null
+++ b/src/py_scripts_v0.1/falcon_qrm_0.py
@@ -0,0 +1,378 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from falcon_kit import * 
+from pbcore.io import FastaReader
+import numpy as np
+import collections
+import sys
+import multiprocessing as mp
+from multiprocessing import sharedctypes
+from ctypes import *
+import math
+
+global sa_ptr, sda_ptr, lk_ptr
+global q_seqs,t_seqs, seqs
+
+seqs = []
+RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
+
+all_fivemers = []
+cmap = {0:"A", 1:"T", 2:"C", 3:"G"}
+for i in range(1024):
+    mer = []
+    for j in range(5):
+        mer.append( cmap[ i >> (2 *j) & 3 ])
+    all_fivemers.append("".join(mer))
+
+def fivemer_entropy(seq):
+    five_mer_count = {}
+
+    for i in range(len(seq)-5):
+        five_mer = seq[i:i+5]
+        five_mer_count.setdefault(five_mer, 0)
+        five_mer_count[five_mer] += 1
+    
+    entropy = 0.0
+    for five_mer in all_fivemers:
+        p = five_mer_count.get(five_mer, 0) + 1.0
+        p /= len(seq)
+        entropy += - p * math.log(p)
+
+    return entropy
+
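+# A second variant of the falcon_qrm.py scorer: here get_alignment() still runs the banded
+# aligner on the k-mer match range and scores it as 4*aln_len - 5*edit_dist, instead of the
+# pure k-mer-based score used above.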
+def get_alignment(seq1, seq0):
+
+    K = 8 
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+
+    kup.mask_k_mer(1 << (K * 2), lk_ptr, 16)
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range2(kmer_match_ptr, K, K*50, 25)
+    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
+    aln_range = aln_range_ptr[0]
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s0, e0, km_score = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2, aln_range.score  
+    e1 += K + K/2
+    e0 += K + K/2
+    kup.free_aln_range(aln_range)
+    len_1 = len(seq1)
+    len_0 = len(seq0)
+    if e1 > len_1: 
+        e1 = len_1
+    if e0 > len_0:
+        e0 = len_0
+
+    aln_size = 1
+    if e1 - s1 > 500:
+
+        #aln_size = max( e1-s1, e0-s0 )
+        #aln_score = int(km_score * 2)
+        #aln_q_s = s1
+        #aln_q_e = e1
+        #aln_t_s = s0
+        #aln_t_e = e0
+        
+        alignment = DWA.align(seq1[s1:e1], e1-s1,
+                              seq0[s0:e0], e0-s0,
+                              500, 0)
+        aln_size = alignment[0].aln_str_size
+        aln_score = 4 * alignment[0].aln_str_size - 5 * alignment[0].dist
+        aln_q_s = alignment[0].aln_q_s
+        aln_q_e = alignment[0].aln_q_e
+        aln_t_s = alignment[0].aln_t_s
+        aln_t_e = alignment[0].aln_t_e
+        assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
+        #print aln_str1
+        #print aln_str0
+
+        if aln_size > 500: 
+            contain_status = "overlap"            
+        DWA.free_alignment(alignment)
+        
+    kup.free_seq_addr_array(sda_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+
+    if e1 - s1 > 500 and aln_size > 500:
+        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_score, "aln"
+    else:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+def get_candidate_aln(hit_input):
+    
+    global q_seqs, seqs, t_seqs, q_len
+    q_name, hit_index_f, hit_index_r = hit_input
+    q_seq = q_seqs[q_name]
+
+    rtn = []
+    hit_index = hit_index_f
+    c = collections.Counter(hit_index)
+    s = [(c[0],c[1]) for c in c.items() if c[1] > 6]
+    
+    hit_data = []
+    hit_ids = set()
+    for p, hit_count in s:
+        hit_id = seqs[p][0]
+        if hit_id == q_name or hit_id in hit_ids:
+            continue
+        if hit_id not in hit_ids:
+            hit_ids.add(hit_id)
+            hit_data.append( (hit_id, t_seqs[hit_id], len(t_seqs[hit_id]), hit_count) )
+
+    hit_data.sort( key=lambda x:-x[2] )
+
+    target_count = {}
+    total_hit = 0
+
+    for hit in hit_data:
+        hit_id = hit[0]
+        hit_count = hit[3]
+        target_count.setdefault(hit_id, 0)
+        if target_count[hit_id] > 64:
+            continue
+        if total_hit > 64:
+            continue
+        seq1, seq0 = q_seq, hit[1] 
+        aln_data = get_alignment(seq1, seq0)
+        if rtn != None:
+             
+            s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
+            if c_status == "none":
+                continue
+            """
+            if e1 - s1 < 5000:
+                if -aln_score > -8000:
+                    continue
+                if (100.0*aln_score/(aln_size+1)) < 150:
+                    continue
+            """
+            target_count[hit_id] += 1
+            total_hit += 1
+            rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)), 
+                          0, s1, e1, len(seq1), 
+                          0, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
+
+    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
+    
+    hit_index = hit_index_r 
+    c = collections.Counter(hit_index)
+    s = [(c[0],c[1]) for c in c.items() if c[1] > 6]
+
+    hit_data = []
+    hit_ids = set()
+    for p, hit_count in s:
+        hit_id = seqs[p][0]
+        if hit_id == q_name or hit_id in hit_ids:
+            continue
+        if hit_id not in hit_ids:
+            hit_ids.add(hit_id)
+            hit_data.append( (hit_id, t_seqs[hit_id], len(t_seqs[hit_id]), hit_count) )
+
+    hit_data.sort( key=lambda x:-x[2] )
+
+    target_count = {}
+    total_hit = 0
+
+    for hit in hit_data:
+        hit_id = hit[0] 
+        hit_count = hit[3]
+        target_count.setdefault(hit_id, 0)
+        if target_count[hit_id] > 64:
+            continue
+        if total_hit > 64:
+            continue
+        seq1, seq0 = r_q_seq, hit[1]
+        aln_data = get_alignment(seq1, seq0)
+        if rtn != None:
+            s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
+            if c_status == "none":
+                continue
+            """
+            if e1 - s1 < 5000:
+                if -aln_score > -8000:
+                    continue
+                if (100.0*aln_score/(aln_size+1)) < 150:
+                    continue
+            """
+            target_count[hit_id] += 1
+            total_hit += 1
+            rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)), 
+                          0, len(seq1) - e1, len(seq1) - s1, len(seq1), 
+                          1, s2, e2, len(seq0), c_status + " %d" % hit_count ) )
+
+    return rtn
+
+def build_look_up(seqs, K):
+    global sa_ptr, sda_ptr, lk_ptr
+
+    total_index_base = len(seqs) * 1000
+    sa_ptr = sharedctypes.RawArray(base_t, total_index_base)
+    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
+    kup.init_seq_array(c_sa_ptr, total_index_base)
+
+    sda_ptr = sharedctypes.RawArray(seq_coor_t, total_index_base)
+    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
+
+    lk_ptr = sharedctypes.RawArray(KmerLookup, 1 << (K*2))
+    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
+    kup.init_kmer_lookup(c_lk_ptr, 1 << (K*2))
+
+    start = 0
+    for r_name, seq in seqs:
+        kup.add_sequence( start, K, seq, 1000, c_sda_ptr, c_sa_ptr, c_lk_ptr)
+        start += 1000
+
+    kup.mask_k_mer(1 << (K * 2), c_lk_ptr, 1024)
+    
+    #return sda_ptr, sa_ptr, lk_ptr
+
+
+
+def get_candidate_hits(q_name):
+
+    global sa_ptr, sda_ptr, lk_ptr
+    global q_seqs
+
+    K = 14
+    q_seq = q_seqs[q_name]
+
+    rtn = []
+
+    c_sda_ptr = cast(sda_ptr, POINTER(seq_coor_t))
+    c_sa_ptr = cast(sa_ptr, POINTER(base_t))
+    c_lk_ptr = cast(lk_ptr, POINTER(KmerLookup))
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, c_sda_ptr, c_lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    count = kmer_match.count
+    hit_index_f = np.array(kmer_match.target_pos[0:count])/1000
+    kup.free_kmer_match(kmer_match_ptr)
+
+    r_q_seq = "".join([RC_MAP[c] for c in q_seq[::-1]])
+    
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, c_sda_ptr, c_lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    count = kmer_match.count
+    hit_index_r = np.array(kmer_match.target_pos[0:count])/1000
+    kup.free_kmer_match(kmer_match_ptr)
+    return  q_name, hit_index_f, hit_index_r
+
+
+def q_names( q_seqs ):
+    for q_name, q_seq in q_seqs.items():
+        yield q_name
+
+
+def lookup_data_iterator( q_seqs, m_pool ):
+    for mr in m_pool.imap( get_candidate_hits, q_names(q_seqs)):
+        yield mr
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description='a simple multi-processor overlapper for sequence reads')
+    parser.add_argument('target_fofn', help='a fofn (file of fasta file names) providing the target sequences for overlapping')
+    parser.add_argument('query_fofn', help='a fofn of fasta files whose reads are matched against the target sequences')
+    parser.add_argument('--min_len', type=int, default=4000, 
+                        help='minimum length of the reads to be considered for overlapping')
+    parser.add_argument('--n_core', type=int, default=1,
+                        help='number of processes used for detailed overlapping evaluation')
+    parser.add_argument('--d_core', type=int, default=1, 
+                        help='number of processes used for k-mer matching')
+
+
+    args = parser.parse_args()
+
+    seqs = []
+    q_seqs = {}
+    t_seqs = {}
+    if args.min_len < 1200:
+        args.min_len = 1200
+
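+    # Index the target reads: each read passing the length filter is cut into 1000-base
+    # chunks (plus the final 1000 bases) so that a k-mer hit position divided by 1000
+    # maps back to the chunk it came from.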
+    with open(args.target_fofn) as fofn:
+        for fn in fofn:
+            fn = fn.strip()
+            f = FastaReader(fn) # read each fasta file listed in the fofn
+            for r in f:
+                if len(r.sequence) < args.min_len:
+                    continue
+                seq = r.sequence.upper()
+                for start in range(0, len(seq), 1000):
+                    if start+1000 > len(seq):
+                        break
+                    subseq = seq[start: start+1000]
+                    #if fivemer_entropy(subseq) < 4:
+                    #    continue
+                    seqs.append( (r.name, subseq) )
+                subseq = seq[-1000:]
+                #if fivemer_entropy(subseq) < 4:
+                #    continue
+                #seqs.append( (r.name, seq[:1000]) )
+                seqs.append( (r.name, subseq) )
+
+                t_seqs[r.name] = seq
+
+    with open(args.query_fofn) as fofn:
+        for fn in fofn:
+            fn = fn.strip()
+            f = FastaReader(fn) # read each fasta file listed in the fofn
+            for r in f:
+                #if len(r.sequence) < args.min_len:
+                #    continue
+                seq = r.sequence.upper()
+                if fivemer_entropy(seq) < 4:
+                    continue
+                q_seqs[r.name] = seq
+
+
+    pool = mp.Pool(args.n_core)
+    K = 14
+    build_look_up(seqs, K)
+    m_pool = mp.Pool(args.d_core)
+
+    
+    #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
+    for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs, m_pool)):
+        for h in r:
+            print " ".join([str(x) for x in h]) 
+
diff --git a/src/py_scripts_v0.1/falcon_sense.py b/src/py_scripts_v0.1/falcon_sense.py
new file mode 100644
index 0000000..f2386ae
--- /dev/null
+++ b/src/py_scripts_v0.1/falcon_sense.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from ctypes import *
+import sys
+from multiprocessing import Pool
+import os
+import falcon_kit
+
+module_path = falcon_kit.__path__[0]
+
+falcon = CDLL(os.path.join(module_path, "falcon.so"))
+
+falcon.generate_consensus.argtypes = [ POINTER(c_char_p), c_uint, c_uint, c_uint, c_uint, c_uint, c_double ]
+falcon.generate_consensus.restype = POINTER(falcon_kit.ConsensusData)
+falcon.free_consensus_data.argtypes = [ POINTER(falcon_kit.ConsensusData) ]
+
+
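+# get_alignment() estimates how a read aligns to the seed using only k-mer matching
+# (find_best_aln_range2), extends the end coordinates by roughly 1.5*K, and returns "none"
+# unless the matched range reaches to within edge_tolerance of a sequence boundary on both
+# sides; the reported score is a scaled k-mer match count rather than a full alignment score.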
+def get_alignment(seq1, seq0, edge_tolerance = 1000):
+
+    kup = falcon_kit.kup
+    K = 8 
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+
+    kup.mask_k_mer(1 << (K * 2), lk_ptr, 16)
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range2(kmer_match_ptr, K, K*50, 25)
+    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
+    aln_range = aln_range_ptr[0]
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s0, e0, km_score = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2, aln_range.score  
+    e1 += K + K/2
+    e0 += K + K/2
+    kup.free_aln_range(aln_range)
+    len_1 = len(seq1)
+    len_0 = len(seq0)
+    if e1 > len_1: 
+        e1 = len_1
+    if e0 > len_0:
+        e0 = len_0
+
+    aln_size = 1
+    if e1 - s1 > 500:
+
+        aln_size = max( e1-s1, e0-s0 )
+        aln_score = int(km_score * 48)
+        aln_q_s = s1
+        aln_q_e = e1
+        aln_t_s = s0
+        aln_t_e = e0
+        
+    kup.free_seq_addr_array(sda_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+
+    if s1 > edge_tolerance and s0 > edge_tolerance:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+    if len_1 - e1 > edge_tolerance and len_0 - e0 > edge_tolerance:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+
+    if e1 - s1 > 500 and aln_size > 500:
+        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_score, "aln"
+    else:
+        return 0, 0, 0, 0, 0, 0, "none"
+
+def get_consensus_without_trim( c_input ):
+    seqs, seed_id, config = c_input
+    min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
+    if len(seqs) > max_n_read:
+        seqs = seqs[:max_n_read]
+    seqs_ptr = (c_char_p * len(seqs))()
+    seqs_ptr[:] = seqs
+    consensus_data_ptr = falcon.generate_consensus( seqs_ptr, len(seqs), min_cov, K, 
+                                                    local_match_count_window, local_match_count_threshold, min_idt )
+
+    consensus = string_at(consensus_data_ptr[0].sequence)[:]
+    eff_cov = consensus_data_ptr[0].eff_cov[:len(consensus)]
+    falcon.free_consensus_data( consensus_data_ptr )
+    del seqs_ptr
+    return consensus, seed_id
+
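+# get_consensus_with_trim() first maps every supporting read onto the seed with
+# get_alignment(), keeps only reads with a sufficiently long and well-scoring match, trims
+# trim_size bases from both ends of the matched interval, and then calls the same C
+# consensus routine on the trimmed reads (longest alignments first, seed kept in front).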
+def get_consensus_with_trim( c_input ):
+    seqs, seed_id, config = c_input
+    min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
+    trim_seqs = []
+    seed = seqs[0]
+    for seq in seqs[1:]:
+        aln_data = get_alignment(seq, seed, edge_tolerance)
+        s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
+        if c_status == "none":
+            continue
+        if aln_score > 1000 and e1 - s1 > 500:
+            e1 -= trim_size
+            s1 += trim_size
+            trim_seqs.append( (e1-s1, seq[s1:e1]) )
+    trim_seqs.sort(key = lambda x:-x[0]) #use longest alignment first
+    trim_seqs = [x[1] for x in trim_seqs]
+        
+    if len(trim_seqs) > max_n_read:
+        trim_seqs = trim_seqs[:max_n_read]
+
+    trim_seqs = [seed] + trim_seqs
+
+
+    seqs_ptr = (c_char_p * len(trim_seqs))()
+    seqs_ptr[:] = trim_seqs
+    consensus_data_ptr = falcon.generate_consensus( seqs_ptr, len(trim_seqs), min_cov, K, 
+                                               local_match_count_window, local_match_count_threshold, min_idt )
+    consensus = string_at(consensus_data_ptr[0].sequence)[:]
+    eff_cov = consensus_data_ptr[0].eff_cov[:len(consensus)]
+    falcon.free_consensus_data( consensus_data_ptr )
+    del seqs_ptr
+    return consensus, seed_id
+
+
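+# get_seq_data() reads per-seed read groups from stdin (the format emitted by e.g.
+# get_rdata.py): each line is "read_id sequence", the first sequence of a group is the
+# seed, a line starting with "+" closes the group (yielding it for consensus), and a line
+# starting with "-" ends the input.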
+def get_seq_data(config):
+    min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
+    seqs = []
+    seed_id = None
+    seqs_data = []
+    read_ids = set()
+    with sys.stdin as f:
+        for l in f:
+            l = l.strip().split()
+            if len(l) != 2:
+                continue
+            if l[0] not in ("+", "-"):
+                if len(l[1]) > 100:
+                    if len(seqs) == 0:
+                        seqs.append(l[1]) #the "seed"
+                        seed_id = l[0]
+                    if l[0] not in read_ids: #avoiding using the same read twice
+                        read_ids.add(l[0])
+                        seqs.append(l[1])
+            elif l[0] == "+":
+                if len(seqs) > 10:
+                    seqs.sort( key=lambda x: -len(x) )
+                    yield (seqs[:max_n_read], seed_id, config) 
+                #seqs_data.append( (seqs, seed_id) ) 
+                seqs = []
+                read_ids = set()
+                seed_id = None
+            elif l[0] == "-":
+                #yield (seqs, seed_id)
+                #seqs_data.append( (seqs, seed_id) )
+                break
+
+if __name__ == "__main__":
+    import argparse
+    import re
+    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
+    parser.add_argument('--n_core', type=int, default=24,
+                        help='number of processes used for generating consensus')
+    parser.add_argument('--local_match_count_window', type=int, default=12,
+                        help='local match window size')
+    parser.add_argument('--local_match_count_threshold', type=int, default=6,
+                        help='local match count threshold')
+    parser.add_argument('--min_cov', type=int, default=6,
+                        help='minimum coverage to break the consensus')
+    parser.add_argument('--max_n_read', type=int, default=500,
+                        help='maximum number of reads used in generating the consensus')
+    parser.add_argument('--trim', action="store_true", default=False,
+                        help='trim the input sequences with k-mer sparse dynamic programming to find the mapped range')
+    parser.add_argument('--output_full', action="store_true", default=False,
+                        help='output uncorrected regions too')
+    parser.add_argument('--output_multi', action="store_true", default=False,
+                        help='output multiple corrected regions')
+    parser.add_argument('--min_idt', type=float, default=0.70,
+                        help='minimum identity of the alignments used for correction')
+    parser.add_argument('--edge_tolerance', type=int, default=1000,
+                        help='for trimming, if a read has an unaligned edge longer than edge_tolerance, ignore the read')
+    parser.add_argument('--trim_size', type=int, default=50,
+                        help='the size for trimming both ends of the initial sparse aligned region')
+    good_region = re.compile("[ACGT]+")
+    args = parser.parse_args()
+    exe_pool = Pool(args.n_core)
+    if args.trim:
+        get_consensus = get_consensus_with_trim
+    else:
+        get_consensus = get_consensus_without_trim
+
+    K = 8
+    config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold,\
+             args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size
+    for res in exe_pool.imap(get_consensus, get_seq_data(config)):  
+        cns, seed_id = res
+        if args.output_full == True:
+            if len(cns) > 500:
+                print ">"+seed_id+"_f"
+                print cns
+        else:
+            cns = good_region.findall(cns)
+            if len(cns) == 0:
+                continue
+            if args.output_multi == True:
+                seq_i = 0
+                for cns_seq in cns:
+                    if len(cns_seq) > 500:
+                        print ">"+seed_id+"_%d" % seq_i
+                        print cns_seq
+                    seq_i += 1
+            else:
+                cns.sort(key = lambda x: len(x))
+                if len(cns[-1]) > 500:
+                    print ">"+seed_id
+                    print cns[-1]
+
diff --git a/src/py_scripts_v0.1/falcon_ucns_data.py b/src/py_scripts_v0.1/falcon_ucns_data.py
new file mode 100644
index 0000000..aecd33a
--- /dev/null
+++ b/src/py_scripts_v0.1/falcon_ucns_data.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+import sys
+import os
+
+
+rcmap = dict(zip("ACGTacgtNn-", "TGCAtgcaNn-"))
+
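+# This script dumps a layout for unitig consensus: for every primary contig (and then
+# every deduplicated associated contig) it prints "tig_id 0 tig_sequence" followed by one
+# "read_id offset read_sequence" line per tiling-path read (reverse-complemented for "B"
+# ends), with "+ + +" between tigs and "- - -" at the end; this is the format consumed
+# from stdin by falcon_utgcns.py.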
+if __name__ == "__main__":
+    import argparse
+    import re
+    from pbcore.io import FastaReader
+    
+    tiling_path = {}
+    with open("all_tiling_path_c") as f:
+        for l in f:
+            l = l.strip().split()
+            tiling_path.setdefault( l[0], [])
+
+            offset = int(l[1])
+            node_id = l[2].split(":")
+            s = int(l[3])
+            e = int(l[4])
+
+            tiling_path[ l[0] ].append( (offset, node_id[0], node_id[1], s, e) )
+
+    f = FastaReader("preads.fa")
+    seq_db = {}
+    for r in f:
+         seq_db[r.name] = r.sequence
+
+    f = FastaReader("primary_tigs_c.fa")
+    p_tigs_db = {}
+    for r in f:
+         p_tigs_db[r.name] = r.sequence
+
+    for p_tig_id in p_tigs_db:
+        pread_data = {}
+        offsets = []
+        seqs = []
+        p_tig = p_tigs_db[p_tig_id]
+        #if len(tiling_path[p_tig_id]) <= 2:
+        #    continue
+        print p_tig_id, 0, p_tig
+        for offset, s_id, end, s, e in tiling_path[p_tig_id]:
+            seq = seq_db[s_id]
+            if end == "B":
+                s, e = e, s
+                offset = offset - len(seq) 
+                seq = "".join([rcmap[c] for c in seq[::-1]])
+            else:
+                offset = offset - len(seq)
+            print s_id, offset, seq
+        
+        print "+ + +"
+
+    f = FastaReader("a_nodup.fa")
+    a_tigs_db = {}
+    for r in f:
+         a_tigs_db[r.name] = r.sequence
+
+    for a_tig_id in a_tigs_db:
+        pread_data = {}
+        offsets = []
+        seqs = []
+        a_tig = a_tigs_db[a_tig_id]
+        #if len(tiling_path[a_tig_id]) <= 2:
+        #    continue
+        print a_tig_id, 0, a_tig
+        for offset, s_id, end, s, e in tiling_path[a_tig_id]:
+            seq = seq_db[s_id]
+            if end == "B":
+                s, e = e, s
+                offset = offset - len(seq) 
+                seq = "".join([rcmap[c] for c in seq[::-1]])
+            else:
+                offset = offset - len(seq)
+            print s_id, offset, seq
+        
+        print "+ + +"
+
+    print "- - -"
+
diff --git a/src/py_scripts_v0.1/falcon_utgcns.py b/src/py_scripts_v0.1/falcon_utgcns.py
new file mode 100644
index 0000000..57b8db5
--- /dev/null
+++ b/src/py_scripts_v0.1/falcon_utgcns.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from ctypes import *
+import sys
+from multiprocessing import Pool
+import os
+import falcon_kit
+
+module_path = falcon_kit.__path__[0]
+
+falcon = CDLL(os.path.join(module_path, "falcon.so"))
+"""
+consensus_data * generate_utg_consensus( char ** input_seq, 
+                           seq_coor_t *offset,
+                           unsigned int n_seq, 
+                           unsigned min_cov, 
+                           unsigned K,
+                           double min_idt) {
+"""
+falcon.generate_utg_consensus.argtypes = [ POINTER(c_char_p), POINTER(falcon_kit.seq_coor_t), c_uint, c_uint, c_uint, c_double ]
+falcon.generate_utg_consensus.restype = POINTER(falcon_kit.ConsensusData)
+falcon.free_consensus_data.argtypes = [ POINTER(falcon_kit.ConsensusData) ]
+
+rcmap = dict(zip("ACGTacgtNn-", "TGCAtgcaNn-"))
+
+def get_consensus(c_input):
+    t_id, seqs, offsets, config = c_input 
+    K = config[0]
+    seqs_ptr = (c_char_p * len(seqs))()
+    seqs_ptr[:] = seqs
+    offset_ptr = (c_long * len(seqs))( *offsets )
+    consensus_data_ptr = falcon.generate_utg_consensus( seqs_ptr, offset_ptr, len(seqs), 0, K, 0.)
+    consensus = string_at(consensus_data_ptr[0].sequence)[:]
+    del seqs_ptr
+    del offset_ptr
+    falcon.free_consensus_data( consensus_data_ptr )
+    return consensus, t_id
+
+def echo(c_input):
+
+    t_id, seqs, offsets, config = c_input 
+
+    return len(seqs), "test"
+
+def get_seq_data(config):
+    seqs = []
+    offsets = []
+    seed_id = None
+    with sys.stdin as f:
+        for l in f:
+            l = l.strip().split()
+            if len(l) != 3:
+                continue
+            if l[0] not in ("+", "-"):
+                if len(seqs) == 0:
+                    seqs.append(l[2]) #the "seed"
+                    offsets.append( int(l[1]) )
+                    seed_id = l[0]
+                else:
+                    seqs.append(l[2])
+                    offsets.append( int(l[1]) )
+            elif l[0] == "+":
+                yield (seed_id, seqs, offsets, config) 
+                seqs = []
+                offsets = []
+                seed_id = None
+            elif l[0] == "-":
+                break
+
+if __name__ == "__main__":
+    import argparse
+    import re
+    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
+    parser.add_argument('--n_core', type=int, default=4,
+                        help='number of processes used for generating consensus')
+    args = parser.parse_args()
+    exe_pool = Pool(args.n_core)
+    K = 8
+    config = (K, )
+    for res in exe_pool.imap(get_consensus, get_seq_data(config)):  
+    #for res in exe_pool.imap(echo, get_seq_data(config)):  
+    #for res in map(echo, get_seq_data(config)):  
+    #for res in map(get_consensus, get_seq_data(config)):  
+        cns, t_id = res
+        print ">"+t_id+"|tigcns"
+        print cns
+
diff --git a/src/py_scripts_v0.1/get_ovl.sh b/src/py_scripts_v0.1/get_ovl.sh
new file mode 100644
index 0000000..417f03b
--- /dev/null
+++ b/src/py_scripts_v0.1/get_ovl.sh
@@ -0,0 +1,7 @@
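+# Three-pass overlap filtering over the .las alignment files (one job per file via parallel):
+# step1 writes per-file .ignore lists (merged into all.ignore), step2 writes per-read
+# classifications (merged into rc_out_all), and step3 emits the final .ovl overlap records.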
+/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {}  | python overlap_filter_step1.py > {}.ignore" ::: *.las
+rm -f all.ignore
+cat *.ignore > all.ignore
+/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {}  | python overlap_filter_step2.py > {}.rc" ::: *.las
+cat *.rc > rc_out_all
+rm *.rc
+/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {}  | python overlap_filter_step3.py > {}.ovl" ::: *.las
diff --git a/src/py_scripts_v0.1/get_rdata.py b/src/py_scripts_v0.1/get_rdata.py
new file mode 100755
index 0000000..14704a4
--- /dev/null
+++ b/src/py_scripts_v0.1/get_rdata.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+import sys
+import glob
+#import pkg_resources
+import uuid
+from datetime import datetime
+
+from collections import Counter
+from multiprocessing import Pool
+#from pbtools.pbdagcon.q_sense import *
+import os
+
+"""
+try:
+    __p4revision__ = "$Revision: #4 $"
+    __p4change__ = "$Change: 121571 $"
+    revNum = int(__p4revision__.strip("$").split(" ")[1].strip("#"))
+    changeNum = int(__p4change__.strip("$").split(":")[-1])
+    __version__ = "%s-r%d-c%d" % ( pkg_resources.require("pbtools.pbhgap")[0].version, revNum, changeNum )
+except:
+    __version__ = "pbtools.hbar-dtk-github"
+"""
+
+query_fasta_fn = sys.argv[1]
+target_fasta_fn = sys.argv[2]
+m4_fofn = sys.argv[3]
+bestn = int(sys.argv[4])
+group_id = int(sys.argv[5])
+num_chunk = int(sys.argv[6])
+min_cov = int(sys.argv[7])
+max_cov = int(sys.argv[8])
+trim_align = int(sys.argv[9])
+trim_plr = int(sys.argv[10])
+
+
+rmap = dict(zip("ACGTNacgt-","TGCANntgca-"))
+def rc(seq):
+    return "".join([rmap[c] for c in seq[::-1]])
+
+"""0x239fb832/0_590 0x722a1e26 -1843 81.6327 0 62 590 590 0 6417 6974 9822 254 11407 -74.5375 -67.9 1"""
+query_to_target = {}
+with open(m4_fofn) as fofn:
+    for fn in fofn:
+        fn = fn.strip()
+        with open(fn) as m4_f:
+            for l in m4_f:
+                d = l.strip().split()
+                id1, id2 = d[:2]
+                #if -noSplitSubread not used, we will need the following line    
+                #id1 = id1.split("/")[0]
+                if id1 == id2:
+                    continue
+                if hash(id2) % num_chunk != group_id:
+                    continue
+                if int(d[2]) > -1000: continue
+                if int(d[11]) < 4000: continue
+                query_to_target.setdefault(id1, [])
+                query_to_target[id1].append( (int(d[2]), l) )
+
+target_to_query = {}
+for id1 in query_to_target:
+    query_to_target[id1].sort()
+    rank = 0
+    for s, ll in query_to_target[id1][:bestn]:
+        l = ll.strip()
+        d = l.split()
+        id1, id2 = d[:2]
+        target_to_query.setdefault(id2,[])
+        target_to_query[id2].append( ( (int(d[5])-int(d[6]), int(d[2])), l ) )
+        #target_to_query[id2].append( ( int(d[2]), l ) )
+        #rank += 1
+
+from pbcore.io import FastaIO
+query_data = {}
+with open(query_fasta_fn) as fofn:
+    for fa_fn in fofn:
+        fa_fn = fa_fn.strip()
+        f_s = FastaIO.FastaReader(fa_fn)
+        for s in f_s:
+            id1 = s.name
+            if id1 not in query_to_target:
+                continue
+            query_data[id1]=s.sequence
+        f_s.file.close()
+
+target_data = {}
+with open(target_fasta_fn) as fofn:
+    for fa_fn in fofn:
+        fa_fn = fa_fn.strip()
+        f_s = FastaIO.FastaReader(fa_fn)
+        for s in f_s:
+            id2 = s.name
+            if hash(id2) % num_chunk != group_id:
+                continue
+            target_data[id2]=s.sequence
+        f_s.file.close()
+
+
+ec_data = []
+base_count = Counter()
+r_count =0
+
+for id2 in target_to_query:
+    if len(target_to_query[id2])<10:
+        continue
+    if id2 not in target_data:
+        continue
+
+    ref_data = (id2, target_data[id2]) 
+    ref_len = len(target_data[id2])
+    base_count.clear()
+    base_count.update( target_data[id2] )
+    if 1.0*base_count.most_common(1)[0][1]/ref_len > 0.8:  # skip pre-assembly if >80% of a read is the same base
+        continue
+    read_data = []
+    
+    query_alignment = target_to_query[id2]
+    query_alignment.sort() # get better alignment
+    total_bases = 0
+    max_cov_bases = max_cov * ref_len * 1.2
+    #min_cov_bases = min_cov * ref_len * 3
+    
+    for rank_score, l in query_alignment:
+        rank, score = rank_score
+        #score = rank_score
+        l = l.split()
+        id1 = l[0]
+        #if -noSplitSubread not used, we will need the following line    
+        #id1 = id1.split("/")[0]
+        q_s = int(l[5]) + trim_align
+        q_e = int(l[6]) - trim_align
+        strand = int(l[8])
+        t_s = int(l[9])
+        t_e = int(l[10])
+        t_l = int(l[11])
+        #if strand == 1:
+        #    t_s, t_e = t_l - t_e, t_l - t_s
+        #    t_s += trim_align
+        #    t_e -= trim_align
+
+        if q_e - q_s < 400:
+            continue
+        total_bases += q_e - q_s
+        if total_bases > max_cov_bases:
+            break
+        q_seq = query_data[id1][q_s:q_e]
+        read_data.append( ( "%s/0/%d_%d" % (id1, q_s, q_e), q_s, q_e, q_seq, strand, t_s, t_e) )
+
+    if len(read_data) > 5:
+        r_count += 1
+        t_id, t_seq = ref_data 
+        t_len = len(t_seq)
+        print t_id, t_seq
+        for r in read_data:
+            q_id, q_s, q_e, q_seq, strand, t_s, t_e = r
+            if strand == 1:
+                q_seq = rc(q_seq)
+            print q_id, q_seq
+        #if r_count > 600:
+        #    break
+        print "+ +"
+print "- -"
+
+#output_dir,dumb = os.path.split( os.path.abspath( output_file ) )
+#output_log = open ( os.path.join( output_dir, "j%02d.log" % group_id ), "w" )
+
+
+
+
diff --git a/src/py_scripts_v0.1/overlapper.py b/src/py_scripts_v0.1/overlapper.py
new file mode 100644
index 0000000..30f8fa8
--- /dev/null
+++ b/src/py_scripts_v0.1/overlapper.py
@@ -0,0 +1,216 @@
+from falcon_kit import kup, falcon, DWA, get_consensus, get_alignment
+from pbcore.io import FastaReader
+import numpy as np
+import collections
+import sys
+
+seqs = []
+q_seqs = {}
+f = FastaReader(sys.argv[1]) # take one command line argument: the input fasta file name
+
+for r in f:
+    if len(r.sequence) < 6000:
+        continue
+    seq = r.sequence.upper()
+    seqs.append( (r.name, seq[:500], seq[-500:] ) )
+    q_seqs[r.name] = seq
+
+
+total_index_base = len(seqs) * 1000
+print total_index_base
+sa_ptr = kup.allocate_seq( total_index_base )
+sda_ptr = kup.allocate_seq_addr( total_index_base )
+K=14
+lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+
+start = 0
+for r_name, prefix, suffix in seqs:
+    kup.add_sequence( start, K, prefix, 500, sda_ptr, sa_ptr, lk_ptr)
+    start += 500
+    kup.add_sequence( start, K, suffix, 500, sda_ptr, sa_ptr, lk_ptr)
+    start += 500
+#kup.mask_k_mer(1 << (K * 2), lk_ptr, 256)
+
+kup.mask_k_mer(1 << (K * 2), lk_ptr, 64)
+
+def get_alignment(seq1, seq0):
+
+    K = 8
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
+    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
+    if e1 - s1 > 500:
+        #s1 = 0 if s1 < 14 else s1 - 14
+        #s2 = 0 if s2 < 14 else s2 - 14
+        e1 = len(seq1) if e1 >= len(seq1)-2*K else e1 + K*2
+        e2 = len(seq0) if e2 >= len(seq0)-2*K else e2 + K*2
+        
+        alignment = DWA.align(seq1[s1:e1], e1-s1,
+                              seq0[s2:e2], e2-s2,
+                              100, 0)
+        #print seq1[s1:e1]
+        #print seq0[s2:e2]
+        #if alignment[0].aln_str_size > 500:
+
+        #aln_str1 = alignment[0].q_aln_str
+        #aln_str0 = alignment[0].t_aln_str
+        aln_size = alignment[0].aln_str_size
+        aln_dist = alignment[0].dist
+        aln_q_s = alignment[0].aln_q_s
+        aln_q_e = alignment[0].aln_q_e
+        aln_t_s = alignment[0].aln_t_s
+        aln_t_e = alignment[0].aln_t_e
+        assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
+        #print aln_str1
+        #print aln_str0
+    
+        DWA.free_alignment(alignment)
+
+    kup.free_seq_addr_array(sda_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+    if e1 - s1 > 500 and aln_size > 500:
+        return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist
+    else:
+        return None
+
+
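+# get_ovelap_alignment() first locates a rough match range by k-mer chaining, classifies
+# the pair as "contains"/"contained" when one read is nearly fully covered, and only runs
+# the banded DWA alignment for proper dovetail candidates, reporting "overlap" when the
+# aligned size exceeds 500 bases.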
+def get_ovelap_alignment(seq1, seq0):
+
+    K = 8
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
+    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s0, e0 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2  
+    len_1 = len(seq1)
+    len_0 = len(seq0)
+    do_aln = False
+    contain_status = "none" 
+    if e1 - s1 > 500:
+        if s1 < 100 and len_1 - e1 < 100:
+            do_aln = False
+            contain_status = "contains"
+        elif s0 < 100 and len_0 - e0 < 100:
+            do_aln = False
+            contain_status = "contained"
+        else:
+            do_aln = True
+            if s0 < s1:
+                s1 -= s0 #assert s1 > 0
+                s0 = 0
+                e1 = len_1
+                e0 = len_1 - s1 if len_1 - s1 < len_0 else len_0
+                if e0 == len_0:
+                    do_aln = False
+                    contain_status = "contained"
+                
+            if s1 <= s0:
+                s0 -= s1 #assert s1 > 0
+                s1 = 0
+                e0 = len_0
+                e1 = len_0 - s0 if len_0 - s0 < len_1 else len_1
+                if e1 == len_1:
+                    do_aln = False
+                    contain_status = "contains"
+
+
+        if do_aln:
+            alignment = DWA.align(seq1[s1:e1], e1-s1,
+                                  seq0[s0:e0], e0-s0,
+                                  500, 0)
+            #print seq1[s1:e1]
+            #print seq0[s2:e2]
+            #if alignment[0].aln_str_size > 500:
+    
+            #aln_str1 = alignment[0].q_aln_str
+            #aln_str0 = alignment[0].t_aln_str
+            aln_size = alignment[0].aln_str_size
+            aln_dist = alignment[0].dist
+            aln_q_s = alignment[0].aln_q_s
+            aln_q_e = alignment[0].aln_q_e
+            aln_t_s = alignment[0].aln_t_s
+            aln_t_e = alignment[0].aln_t_e
+            assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
+            #print aln_str1
+            #print aln_str0
+            if aln_size > 500: 
+                contain_status = "overlap"            
+            DWA.free_alignment(alignment)
+        
+    kup.free_seq_addr_array(sda_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+
+    if e1 - s1 > 500 and do_aln and aln_size > 500:
+        #return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist, x, y
+        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_dist, contain_status
+    else:
+        return 0, 0, 0, 0, 0, 0, contain_status 
+
+rc_map = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
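+# For each query read (both strands), collect candidate targets whose indexed 500-base
+# prefix/suffix chunks share more than 50 k-mer hits, then write one record per pair to
+# test_ovlp.dat: hit id, query id, score, % identity, target strand/start/end/length,
+# query strand/start/end/length, and the containment status.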
+with open("test_ovlp.dat","w") as f:
+    for name, q_seq in q_seqs.items():
+        kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr, lk_ptr)
+        kmer_match = kmer_match_ptr[0]
+        count = kmer_match.count
+        hit_index = np.array(kmer_match.target_pos[0:count])/500
+        kup.free_kmer_match(kmer_match_ptr)
+        
+        c = collections.Counter(hit_index)
+        s = [k for k, v in c.items() if v > 50]
+        #s.sort()
+        targets = set()
+        for p in s:
+            hit_id = seqs[p/2][0]
+            if hit_id in targets or hit_id == name:
+                continue
+            targets.add(hit_id)
+            seq1, seq0 = q_seq, q_seqs[hit_id ]
+            rtn = get_ovelap_alignment(seq1, seq0)
+            #rtn = get_alignment(seq1, seq0)
+            if rtn != None:
+                
+                s1, e1, s2, e2, aln_size, aln_dist, c_status = rtn
+                #print >>f, name, 0, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
+                print >>f, hit_id, name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)), 0, s2, e2, len(seq0), 0, s1, e1, len(seq1), c_status
+                
+        r_q_seq = "".join([rc_map[c] for c in q_seq[::-1]])
+        
+        kmer_match_ptr = kup.find_kmer_pos_for_seq(r_q_seq, len(r_q_seq), K, sda_ptr, lk_ptr)
+        kmer_match = kmer_match_ptr[0]
+        count = kmer_match.count
+        hit_index = np.array(kmer_match.target_pos[0:count])/500
+        kup.free_kmer_match(kmer_match_ptr)
+        
+        c = collections.Counter(hit_index)
+        s = [k for k, v in c.items() if v > 50]
+        #s.sort()
+        targets = set()
+        for p in s:
+            hit_id = seqs[p/2][0]
+            if hit_id in targets or hit_id == name:
+                continue
+            targets.add(hit_id)
+            seq1, seq0 = r_q_seq, q_seqs[hit_id]
+            rtn = get_ovelap_alignment(seq1, seq0)
+            #rtn = get_alignment(seq1, seq0)
+            if rtn != None:
+                s1, e1, s2, e2, aln_size, aln_dist, c_status = rtn
+                #print >>f, name, 1, s1, e1, len(seq1), hit_id, 0, s2, e2, len(seq0),  aln_size, aln_dist
+                print >>f, hit_id, name, aln_dist - aln_size, "%0.2f" % (100 - 100.0*aln_dist/(aln_size+1)), 0, s2, e2, len(seq0), 1, len(seq1) - e1, len(seq1)- s1, len(seq1), c_status
+
diff --git a/src/py_scripts_v0.1/ovlp_filter.sh b/src/py_scripts_v0.1/ovlp_filter.sh
new file mode 100644
index 0000000..608389e
--- /dev/null
+++ b/src/py_scripts_v0.1/ovlp_filter.sh
@@ -0,0 +1,6 @@
+source /mnt/secondary/Share/HBAR_03202013/bin/activate
+parallel -j 24 "LA4Falcon -mo -H10000 {}  | python overlap_filter_step1.py > {}.ignore" ::: *.las
+cat *.ignore > all.ignore
+parallel -j 24 "LA4Falcon -mo -H10000 {}  | python overlap_filter_step2.py > {}.rc" ::: *.las
+cat *.rc > rc_out_all
+parallel -j 24 "LA4Falcon -mo -H10000 {}  | python overlap_filter_step3.py > {}.ovl" ::: *.las
diff --git a/src/py_scripts_v0.1/redis_graph.py b/src/py_scripts_v0.1/redis_graph.py
new file mode 100644
index 0000000..41cc785
--- /dev/null
+++ b/src/py_scripts_v0.1/redis_graph.py
@@ -0,0 +1,79 @@
+import redis
+import sys
+from pbcore.io import FastaReader
+
+
+r = redis.StrictRedis(host='localhost', port=6379, db=0)
+
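+# RedisList and RedisDict are thin wrappers that keep a Python-list/dict-like object in
+# the Redis server, keyed by "pid:<id(self)>"; the backing key is deleted when the wrapper
+# is garbage collected, so the data lives only as long as the Python object.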
+class RedisList(object):
+
+    def __init__(self, rs):
+        self._rs = rs
+        self.id_ = "pid:" + str( id(self) )
+
+    def append(self, value):
+        self._rs.rpush( self.id_, value)
+
+    def __len__(self):
+        return self._rs.llen( self.id_ )
+
+    def __getitem__(self, i):
+        return self._rs.lrange( self.id_, i, i)
+
+    def pylist(self):
+        return self._rs.lrange( self.id_, 0, -1)
+
+    def __del__(self):
+        self._rs.delete(self.id_)
+
+class RedisDict(object):
+
+    def __init__(self, rs):
+        self._rs = rs
+        self.id_ = "pid:" + str( id(self) )
+
+    def __setitem__(self, key, value):
+        self._rs.hset( self.id_, key, value )
+
+    def __getitem__(self, key):
+        return self._rs.hget( self.id_, key )
+
+    def __delitem__(self, key):
+        return self._rs.hdel( self.id_, key)
+
+
+    def __len__(self):
+        return self._rs.hlen( self.id_ )
+    
+    def keys(self):
+        return self._rs.hgetall( self.id_ ).keys()
+
+    def values(self):
+        return self._rs.hgetall( self.id_ ).values()
+
+    def pydict(self):
+        return self._rs.hgetall( self.id_ )
+
+    def __del__(self):
+        self._rs.delete(self.id_)
+
+def test_list():
+    x = RedisList(r)
+    x.append( "1" )
+    x.append( "2" )
+    print len(x)
+    print x.pylist()
+    del x
+
+    y = RedisDict(r)
+    y["a"] = "b"
+    y["b"] = 1
+    print y["a"]
+    del y["a"]
+    print y.values()
+    print y.keys()
+    print y.pydict()
+    del y
+
+if __name__ == "__main__":
+    test_list()
diff --git a/src/py_scripts_v0.1/remove_dup_ctg.py b/src/py_scripts_v0.1/remove_dup_ctg.py
new file mode 100755
index 0000000..3164eb6
--- /dev/null
+++ b/src/py_scripts_v0.1/remove_dup_ctg.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+import pbcore.io
+
+import sys
+"""nucmer -maxmatch all_tigs.fa all_tigs.fa -p all_tigs_self >& /dev/null"""
+"""show-coords -o -H -T all_tigs_self.delta | grep CONTAINS | awk '$7>96' | awk '{print $9}' | sort -u > all_tigs_duplicated_ids"""
+
+id_to_remove = set()
+with open("all_tigs_duplicated_ids") as f:
+    for l in f:
+        l = l.strip().split("-")
+        major, minor = l[:2]
+        id_to_remove.add ( (major, minor) )
+
+f = pbcore.io.FastaReader("all_tigs.fa")
+with open("a-tigs_nodup.fa", "w") as f_out:
+    for r in f:
+        major, minor = r.name.split()[0].split("-")[:2]
+        if minor == "0000":
+            continue
+        if (major, minor) in id_to_remove:
+            continue
+        if len(r.sequence) < 500:
+            continue
+        print >>f_out, ">"+r.name
+        print >>f_out, r.sequence
+
+f = pbcore.io.FastaReader("primary_tigs_c.fa")
+with open("p-tigs_nodup.fa", "w") as f_out:
+    for r in f:
+        major, minor = r.name.split()[0].split("_")[:2]
+        if (major, "0000") in id_to_remove:
+            continue
+        if len(r.sequence) < 500:
+            continue
+        print >>f_out, ">"+r.name
+        print >>f_out, r.sequence
diff --git a/src/utils/fetch_preads.py b/src/utils/fetch_preads.py
new file mode 100644
index 0000000..c5ba7d2
--- /dev/null
+++ b/src/utils/fetch_preads.py
@@ -0,0 +1,70 @@
+from pbcore.io import FastaReader
+import networkx as nx
+import sys
+
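+# Given a primary contig id (argv[1]) and the pread fasta (argv[2]), this utility collects
+# the string-graph nodes on that contig's main path plus any associated-contig paths whose
+# ends land on it, writes the unitig graph and the expanded string graph as GEXF files for
+# inspection, and dumps the sequences of the involved preads to p_sgraph_nodes.fa.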
+u_graph = nx.DiGraph()
+u_edges = {}
+with open("./unit_edges.dat") as f:
+    for l in f:
+        v, w, path, seq = l.strip().split()
+        u_edges.setdefault( (v, w), [] )
+        u_edges[ (v, w) ].append( (path, seq) )
+        u_graph.add_edge(v, w)
+        
+len(u_edges)
+u_graph_r = u_graph.reverse()
+
+
+p_tig_path = {}
+a_tig_path = {}
+with open("primary_tigs_paths_c") as f:
+    for l in f:
+        l = l.strip().split()
+        id_ = l[0][1:]
+        path = l[1:]
+        p_tig_path[id_] = path
+
+with open("all_tigs_paths") as f:
+    for l in f:
+        l = l.strip().split()
+        id_ = l[0][1:]
+        path = l[1:]
+        a_tig_path[id_] = path
+
+p_ugraph = nx.DiGraph()
+p_sgraph = nx.DiGraph()
+p_tig_id = sys.argv[1]
+
+main_path = p_tig_path["%s_00" % p_tig_id]
+all_nodes = set(main_path[:])
+main_path_nodes = set(main_path[:])
+p_ugraph.add_path(main_path)
+for id_ in a_tig_path:
+    if id_[:4] == p_tig_id:
+        a_path = a_tig_path[id_]
+        if a_path[0] in main_path_nodes and a_path[-1] in main_path_nodes:
+            p_ugraph.add_path(a_path)
+            for pp in a_path:
+                all_nodes.add(pp)
+        
+for v, w in u_edges:
+    if v in all_nodes and w in all_nodes:
+        for p, s in u_edges[(v,w)]:
+            p = p.split("-")
+            p_sgraph.add_path(p)
+            #print p
+            for pp in p:
+                all_nodes.add(pp)
+
+nx.write_gexf(p_ugraph, "p_ugraph.gexf")
+nx.write_gexf(p_sgraph, "p_sgraph.gexf")
+
+
+preads = FastaReader(sys.argv[2])
+
+all_nodes_ids = set( [s.split(":")[0] for s in list(all_nodes)] )
+with open("p_sgraph_nodes.fa","w") as f:
+    for r in preads:
+        if r.name in all_nodes_ids:
+            print >>f, ">"+r.name
+            print >>f, r.sequence
diff --git a/test/test_actg_coordinate.py b/test/test_actg_coordinate.py
new file mode 100644
index 0000000..03b6324
--- /dev/null
+++ b/test/test_actg_coordinate.py
@@ -0,0 +1,9 @@
+import falcon_kit.mains.actg_coordinate as mod
+
+'''
+def test_help():
+    try:
+        mod.main(*['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_consensus.py b/test/test_consensus.py
new file mode 100644
index 0000000..ac4ee4e
--- /dev/null
+++ b/test/test_consensus.py
@@ -0,0 +1,7 @@
+import falcon_kit.mains.consensus as mod
+
+def test_help():
+    try:
+        mod.main(*['prog', '--help'])
+    except SystemExit:
+        pass
diff --git a/test/test_contig_annotate.py b/test/test_contig_annotate.py
new file mode 100644
index 0000000..5e5ffec
--- /dev/null
+++ b/test/test_contig_annotate.py
@@ -0,0 +1,9 @@
+import falcon_kit.mains.contig_annotate as mod
+
+'''
+def test_help():
+    try:
+        mod.main(*['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_ctg_link_analysis.py b/test/test_ctg_link_analysis.py
new file mode 100644
index 0000000..416e409
--- /dev/null
+++ b/test/test_ctg_link_analysis.py
@@ -0,0 +1,9 @@
+import falcon_kit.mains.ctg_link_analysis as mod
+
+'''
+def test_help():
+    try:
+        mod.main(*['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_graph_to_contig.py b/test/test_graph_to_contig.py
new file mode 100644
index 0000000..1aab6b1
--- /dev/null
+++ b/test/test_graph_to_contig.py
@@ -0,0 +1,9 @@
+import falcon_kit.mains.graph_to_contig as mod
+
+'''
+def test_help():
+    try:
+        mod.main(*['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_graph_to_utgs.py b/test/test_graph_to_utgs.py
new file mode 100644
index 0000000..322e9d9
--- /dev/null
+++ b/test/test_graph_to_utgs.py
@@ -0,0 +1,9 @@
+import falcon_kit.mains.graph_to_utgs as mod
+
+'''
+def test_help():
+    try:
+        mod.main(*['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_ovlp_filter.py b/test/test_ovlp_filter.py
new file mode 100644
index 0000000..65eb466
--- /dev/null
+++ b/test/test_ovlp_filter.py
@@ -0,0 +1,30 @@
+import falcon_kit.mains.ovlp_filter as mod
+from nose.tools import assert_equal
+
+def test_help():
+    try:
+        mod.main(*['prog', '--help'])
+    except SystemExit:
+        pass
+def test():
+    readlines = data.strip().splitlines
+    max_diff, max_ovlp, min_ovlp, min_len = 1000, 1000, 1, 1
+    stage1 = mod.filter_stage1(readlines, max_diff, max_ovlp, min_ovlp, min_len)
+    print stage1
+    assert_equal(expected1,  stage1)
+expected1 = ['000000000', '000000001', '000000002', '000000003', '000000017', '000000028']
+expected1 = [None, '000000001', '000000002', '000000003', '000000017', '000000028'] # BUGGY!
+data = """
+000000000 000000001 -1807 100.00 0 181 1988 1988 0 0 1807 1989 overlap
+000000000 000000002 -823 99.88 0 0 823 1988 0 1166 1989 1989 overlap
+000000000 000000003 -50 99.94 0 0 50 1988 0 0 50 50 overlap
+000000000 000000017 -61 98.36 0 0 61 1988 0 1928 1989 1989 overlap
+000000000 000000028 -1952 79.95 0 0 1952 1988 0 37 1989 1989 overlap
+000000001 000000000 -1807 100.00 0 0 1807 1989 0 181 1988 1988 overlap
+000000001 000000002 -642 99.84 0 0 642 1989 0 1347 1989 1989 overlap
+000000002 000000000 -823 99.88 0 1166 1989 1989 0 0 823 1988 overlap
+000000002 000000001 -642 99.84 0 1347 1989 1989 0 0 642 1989 overlap
+000000003 000000000 -50 99.94 0 0 50 50 0 0 50 1988 overlap
+000000017 000000000 -61 98.36 0 1928 1989 1989 0 0 61 1988 overlap
+000000028 000000000 -1952 79.95 0 37 1989 1989 0 0 1952 1988 overlap
+"""
diff --git a/test/test_ovlp_stats.py b/test/test_ovlp_stats.py
new file mode 100644
index 0000000..de92c3b
--- /dev/null
+++ b/test/test_ovlp_stats.py
@@ -0,0 +1,29 @@
+import falcon_kit.mains.ovlp_stats as mod
+from nose.tools import assert_equal
+
+def test_help():
+    try:
+        mod.main(*['prog', '--help'])
+    except SystemExit:
+        pass
+def test():
+    readlines = data.strip().splitlines
+    stats = mod.filter_stats(readlines, min_len=62)
+    #print stats
+    assert_equal(expected,  stats)
+
+expected = [('000000000', 1988, 2, 1), ('000000001', 1989, 2, 0), ('000000002', 1989, 0, 2), ('000000017', 1989, 0, 1)]
+data = """
+000000000 000000001 -1807 100.00 0 181 1988 1988 0 0 1807 1989 overlap
+000000000 000000002 -823 99.88 0 0 823 1988 0 1166 1989 1989 overlap
+000000000 000000003 -50 99.94 0 0 50 1988 0 0 50 50 overlap
+000000000 000000017 -61 98.36 0 0 61 1988 0 1928 1989 1989 overlap
+000000000 000000028 -1952 79.95 0 0 1952 1988 0 37 1989 1989 overlap
+000000001 000000000 -1807 100.00 0 0 1807 1989 0 181 1988 1988 overlap
+000000001 000000002 -642 99.84 0 0 642 1989 0 1347 1989 1989 overlap
+000000002 000000000 -823 99.88 0 1166 1989 1989 0 0 823 1988 overlap
+000000002 000000001 -642 99.84 0 1347 1989 1989 0 0 642 1989 overlap
+000000003 000000000 -50 99.94 0 0 50 50 0 0 50 1988 overlap
+000000017 000000000 -61 98.36 0 1928 1989 1989 0 0 61 1988 overlap
+000000028 000000000 -1952 79.95 0 37 1989 1989 0 0 1952 1988 overlap
+"""
diff --git a/test/test_ovlp_to_graph.py b/test/test_ovlp_to_graph.py
new file mode 100644
index 0000000..21b7e45
--- /dev/null
+++ b/test/test_ovlp_to_graph.py
@@ -0,0 +1,7 @@
+import falcon_kit.mains.ovlp_to_graph as mod
+
+def test_help():
+    try:
+        mod.main(*['prog', '--help'])
+    except SystemExit:
+        pass
diff --git a/test/test_run.py b/test/test_run.py
new file mode 100644
index 0000000..18450eb
--- /dev/null
+++ b/test/test_run.py
@@ -0,0 +1,9 @@
+#import falcon_kit.mains.run as mod
+
+'''
+def test_help():
+    try:
+        mod.main(*['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_run_LG.py b/test/test_run_LG.py
new file mode 100644
index 0000000..831c339
--- /dev/null
+++ b/test/test_run_LG.py
@@ -0,0 +1,9 @@
+#import falcon_kit.mains.run_LG as mod
+
+'''
+def test_help():
+    try:
+        mod.main(*['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test_data/t1.fa b/test_data/t1.fa
new file mode 100755
index 0000000..3a20a43
--- /dev/null
+++ b/test_data/t1.fa
@@ -0,0 +1,2 @@
+>30a5633d_129405_0
+AAAAGAGAGAGATCGCCCAATTTGGATTACAGTTAGGCACGCCGCTTGTTTTTTTTTTTATTTGCTTTTCGCAGAAAGGTTCTTTCCTTTAATCAGCGCCTCTTTGATTAATGGCGTCTCCGGCAATTGACAGGATTTGTTGTTTTGCAGTAAAAGGAGAAAAAAAATGAGTATGCCACGAATAACTAGAAATAGGGCTAAAAATGTTGCCAAGATCTTTGTGGCTCGGCCAGAGACAAGCGAGCAATGAGACAAAATTGGTCGCCAGATTTTTCTCTTTCTTTTGGATTTTTTTTTTTCTTATTTTCCAATGCCGTCTGCGGCATTCAAATATGCAACAGCAAAGGGCGCGGAAAAAGCAAGGAAAAATGGTGAAAATGGGGTTGGGTGAGAGATGCCTGGGCATGCCAAAGTAGCTGCCAATTTATTTTGGGCATTTTGCTTGGCTGATAGTTGGCCATCTTTATACTCTTCCCAAAAGTGTGAAAGAAT [...]
diff --git a/test_data/t1.fofn b/test_data/t1.fofn
new file mode 100755
index 0000000..1b88fb0
--- /dev/null
+++ b/test_data/t1.fofn
@@ -0,0 +1 @@
+./t1.fa
diff --git a/test_data/t2.fa b/test_data/t2.fa
new file mode 100755
index 0000000..d8fc441
--- /dev/null
+++ b/test_data/t2.fa
@@ -0,0 +1,2 @@
+>5d64830a_48915_0
+AGTAGAGATCATCTAAACTTTGGTGGTATTTGGCTAACTTGCTTATGTACACATATTAATTTAATTATACGAGTAAACTATTTCCATATTAGCGTATAGCAGCTACGCATAGTTTATAGAACAATAAAAATGAAATATTTTCGGCGACTTTGAACAAATGACGCTTTAGGGGCCTAACGGAGTATTTTTATGTGATAGACGATTTTTTGGCGGGCCAAAAAAAATAAAAGGGAAATTGGTGCTGCGCATAAAATTGAAAGCAGGCTTGCCCTCCAACCCCGCGTCTGCCCTCCCCCCCCCCCCCGCAGATCAAGAGATTATGCTATCCCGCAATAATTCGCGCCTTGCCCGCTTAACTACGTTGGCCATGCGTCGGGGGCGGGCGTCTATGCAATGGTTCAATTGGGCGTTGACTGGCCGCTGGCTAGTGTAAGCCCAGTTTTGCGGCTTATTGCCGCTACTCGGCTCGGGCAATCACATCGAGGTCATTAA [...]
diff --git a/test_data/t2.fofn b/test_data/t2.fofn
new file mode 100755
index 0000000..de317c5
--- /dev/null
+++ b/test_data/t2.fofn
@@ -0,0 +1 @@
+./t2.fa
diff --git a/travis.sh b/travis.sh
new file mode 100755
index 0000000..5693676
--- /dev/null
+++ b/travis.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# -e: fail on error
+# -v: show commands
+# -x: show expanded commands
+set -vex
+
+#env | sort
+mkdir -p fc-env
+rm -f fc-env/bin/python
+virtualenv -p python2.7 fc-env || ../virtualenv/virtualenv.py fc-env
+. fc-env/bin/activate
+python setup.py -v install
+python -c 'import falcon_kit; print falcon_kit.falcon'
+
+# When the doctests are passing, enable the --with-doctest run commented out below:
+pip install nose
+nosetests -v test/
+#nosetests -v --with-doctest fc-env/lib/python2.7/site-packages/falcon_kit

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/falconkit.git


