[med-svn] [falconkit] 01/09: Imported Upstream version 0.4.2

Afif Elghraoui afif at moszumanska.debian.org
Wed Mar 9 07:38:40 UTC 2016


This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository falconkit.

commit 7e22b666583f917a6ffe8d0fb13510a1e5a3d107
Author: Afif Elghraoui <afif at ghraoui.name>
Date:   Tue Mar 8 22:40:21 2016 -0800

    Imported Upstream version 0.4.2
---
 .gitignore                                         |   10 +
 .gitmodules                                        |    0
 .travis.yml                                        |   26 +
 src/py_scripts/remove_dup_ctg.py => LICENSE        |   41 +-
 README.md                                          |  156 +--
 doc/README                                         |    3 +
 doc/falcon_icon.svg                                |   24 +
 doc/falcon_icon2.png                               |  Bin 0 -> 58715 bytes
 doc/file_format_note.md                            |  113 --
 examples/Dmel_asm.md                               |  250 ----
 examples/HBAR.cfg                                  |   72 -
 examples/StarCluster.cfg                           |   24 -
 examples/build_env.sh                              |   22 +
 examples/build_env2.sh                             |   22 +
 examples/ecoli_asm_graph_exploration.ipynb         |  716 ++++++++++
 examples/fc_run_LG.cfg                             |   36 +
 examples/fc_run_arab.cfg                           |   36 +
 examples/fc_run_dmel.cfg                           |   36 +
 examples/fc_run_ecoli.cfg                          |   35 +
 examples/fc_run_ecoli_2.cfg                        |   35 +
 examples/install_note.sh                           |   84 --
 examples/readme.md                                 |   92 --
 examples/run_asm.sh                                |   24 -
 examples/run_ecoli_test.sh                         |   11 +
 setup.py                                           |   70 +-
 src/c/DW_banded.c                                  |    4 +-
 src/c/Makefile                                     |    2 +-
 src/c/common.h                                     |    5 +-
 src/c/ext_falcon.c                                 |   13 +
 src/c/falcon.c                                     |  622 ++++++---
 src/c/kmer_lookup.c                                |   33 +-
 src/py/FastaReader.py                              |  260 ++++
 src/py/falcon_kit.py                               |   24 +-
 src/py/fc_asm_graph.py                             |  212 +++
 src/py/functional.py                               |   89 ++
 src/py/mains/__init__.py                           |    0
 src/py/mains/actg_coordinate.py                    |   27 +
 .../falcon_sense.py => py/mains/consensus.py}      |  144 +-
 src/py/mains/contig_annotate.py                    |   29 +
 src/py/mains/ctg_link_analysis.py                  |   80 ++
 src/py/mains/dedup_a_tigs.py                       |   23 +
 src/py/mains/graph_to_contig.py                    |  297 ++++
 src/py/mains/graph_to_utgs.py                      |  160 +++
 src/py/mains/ovlp_filter.py                        |  265 ++++
 src/py/mains/ovlp_stats.py                         |  115 ++
 src/py/mains/ovlp_to_graph.py                      | 1441 ++++++++++++++++++++
 src/py/mains/run.py                                |  646 +++++++++
 src/py/mains/tasks.py                              |   31 +
 src/py/multiproc.py                                |   25 +
 src/py/run_support.py                              |  475 +++++++
 src/py/util/__init__.py                            |    0
 src/py/util/io.py                                  |  162 +++
 src/py_scripts/fc_actg_coordinate.py               |    5 +
 src/py_scripts/fc_consensus.py                     |    5 +
 src/py_scripts/fc_contig_annotate.py               |    5 +
 src/py_scripts/fc_ctg_link_analysis.py             |    5 +
 src/py_scripts/fc_dedup_a_tigs.py                  |    5 +
 src/py_scripts/fc_graph_to_contig.py               |    5 +
 src/py_scripts/fc_graph_to_utgs.py                 |    5 +
 src/py_scripts/fc_ovlp_filter.py                   |    5 +
 src/py_scripts/fc_ovlp_stats.py                    |    5 +
 src/py_scripts/fc_ovlp_to_graph.py                 |    5 +
 src/py_scripts/fc_run.py                           |    5 +
 src/{py_scripts => py_scripts_v0.1}/falcon_asm.py  |    0
 .../falcon_asm_s.py}                               |  869 +++++++-----
 .../falcon_dedup.py                                |    0
 .../falcon_fixasm.py                               |    4 +-
 .../falcon_overlap.py                              |    0
 .../falcon_overlap2.py                             |    0
 src/{py_scripts => py_scripts_v0.1}/falcon_qrm.py  |    0
 .../falcon_qrm_0.py}                               |  132 +-
 .../falcon_sense.py                                |    9 +-
 .../falcon_ucns_data.py                            |    8 +-
 .../falcon_utgcns.py                               |    0
 src/py_scripts_v0.1/get_ovl.sh                     |    7 +
 src/{py_scripts => py_scripts_v0.1}/get_rdata.py   |    0
 src/{py_scripts => py_scripts_v0.1}/overlapper.py  |    0
 src/py_scripts_v0.1/ovlp_filter.sh                 |    6 +
 src/py_scripts_v0.1/redis_graph.py                 |   79 ++
 .../remove_dup_ctg.py                              |    0
 test/HPCdaligner_synth0.sh                         |   11 +
 test/test_actg_coordinate.py                       |    9 +
 test/test_consensus.py                             |    7 +
 test/test_contig_annotate.py                       |    9 +
 test/test_ctg_link_analysis.py                     |    9 +
 test/test_functional.py                            |   31 +
 test/test_graph_to_contig.py                       |    9 +
 test/test_graph_to_utgs.py                         |    9 +
 test/test_ovlp_filter.py                           |   65 +
 test/test_ovlp_stats.py                            |   29 +
 test/test_ovlp_to_graph.py                         |    7 +
 test/test_run.py                                   |    9 +
 test/test_run_LG.py                                |    9 +
 travis.sh                                          |   18 +
 94 files changed, 6952 insertions(+), 1570 deletions(-)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7a42f33
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+/build/
+/dist/
+falcon_kit.egg-info/
+*.pyc
+*.pyo
+*.swp
+*.so
+*.dylib
+*.dll
+*.egg
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..e69de29
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..dd67f33
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,26 @@
+# Build matrix / environment variable are explained on:
+# http://about.travis-ci.org/docs/user/build-configuration/
+# This file can be validated on:
+# http://lint.travis-ci.org/
+
+#before_install: sudo apt-get install -y cmake
+# cmake is pre-installed in Travis for both linux and osx
+
+#before_install:
+#  - sudo apt-get update -qq
+#  - sudo apt-get install -qq valgrind
+#sudo: required
+os:
+  - linux
+language: python
+compiler:
+  - clang  # hmm. distutils uses 'gcc' anyway
+#  - gcc
+script: ./travis.sh
+#env:
+#  matrix:
+#    - SHARED_LIB=ON  STATIC_LIB=ON CMAKE_PKG=ON  BUILD_TYPE=release VERBOSE_MAKE=false
+#    - SHARED_LIB=OFF STATIC_LIB=ON CMAKE_PKG=OFF BUILD_TYPE=debug   VERBOSE_MAKE=true VERBOSE
+notifications:
+  email: false
+sudo: false
diff --git a/src/py_scripts/remove_dup_ctg.py b/LICENSE
old mode 100755
new mode 100644
similarity index 59%
copy from src/py_scripts/remove_dup_ctg.py
copy to LICENSE
index 3164eb6..94e4fd5
--- a/src/py_scripts/remove_dup_ctg.py
+++ b/LICENSE
@@ -1,7 +1,5 @@
-#!/usr/bin/env python
-
 #################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
 #
 # All rights reserved.
 #
@@ -36,40 +34,3 @@
 # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #################################################################################$$
-
-import pbcore.io
-
-import sys
-"""nucmer -maxmatch all_tigs.fa all_tigs.fa -p all_tigs_self >& /dev/null"""
-"""show-coords -o -H -T all_tigs_self.delta | grep CONTAINS | awk '$7>96' | awk '{print $9}' | sort -u > all_tigs_duplicated_ids"""
-
-id_to_remove = set()
-with open("all_tigs_duplicated_ids") as f:
-    for l in f:
-        l = l.strip().split("-")
-        major, minor = l[:2]
-        id_to_remove.add ( (major, minor) )
-
-f = pbcore.io.FastaReader("all_tigs.fa")
-with open("a-tigs_nodup.fa", "w") as f_out:
-    for r in f:
-        major, minor = r.name.split()[0].split("-")[:2]
-        if minor == "0000":
-            continue
-        if (major, minor) in id_to_remove:
-            continue
-        if len(r.sequence) < 500:
-            continue
-        print >>f_out, ">"+r.name
-        print >>f_out, r.sequence
-
-f = pbcore.io.FastaReader("primary_tigs_c.fa")
-with open("p-tigs_nodup.fa", "w") as f_out:
-    for r in f:
-        major, minor = r.name.split()[0].split("_")[:2]
-        if (major, "0000") in id_to_remove:
-            continue
-        if len(r.sequence) < 500:
-            continue
-        print >>f_out, ">"+r.name
-        print >>f_out, r.sequence
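For reference, the deleted helper above is Python 2 only (print-chevron syntax). A minimal Python 3 sketch of the same duplicate-contig filtering, assuming `pbcore.io.FastaReader` as in the original and the `all_tigs_duplicated_ids` list produced by the quoted nucmer/show-coords pipeline:

    import pbcore.io

    # IDs flagged as contained duplicates by the nucmer/show-coords pipeline
    # quoted in the original script.
    id_to_remove = set()
    with open("all_tigs_duplicated_ids") as f:
        for line in f:
            major, minor = line.strip().split("-")[:2]
            id_to_remove.add((major, minor))

    # a-tigs: drop primary contigs (minor id "0000"), flagged duplicates, and short contigs.
    with open("a-tigs_nodup.fa", "w") as f_out:
        for r in pbcore.io.FastaReader("all_tigs.fa"):
            major, minor = r.name.split()[0].split("-")[:2]
            if minor == "0000" or (major, minor) in id_to_remove or len(r.sequence) < 500:
                continue
            print(">" + r.name, file=f_out)
            print(r.sequence, file=f_out)

    # p-tigs: drop primary contigs whose bundle was flagged, and short contigs.
    with open("p-tigs_nodup.fa", "w") as f_out:
        for r in pbcore.io.FastaReader("primary_tigs_c.fa"):
            major, minor = r.name.split()[0].split("_")[:2]
            if (major, "0000") in id_to_remove or len(r.sequence) < 500:
                continue
            print(">" + r.name, file=f_out)
            print(r.sequence, file=f_out)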
diff --git a/README.md b/README.md
index de97d3a..0bb7bc3 100644
--- a/README.md
+++ b/README.md
@@ -8,157 +8,27 @@ efficient assembly algorithm for haploid and diploid genomes. It has some back-e
 code implemented in C for speed and some simple front-end written in Python for
 convenience. 
 
-Please take a look at the `readme.md` file inside the `examples` directory. It shows 
-how to do assembly using `HBAR-DTK` + `Falcon` on Amazon EC2 with a `StarCluster` 
-setup. If anyone knows anything comparable to `StarCluster` for Google Compute 
-Engine, please let me know. I can build a VM there too.
 
-FILES
------
+DOCUMENTATION
+-------------
 
-Here is a brief description of the files in the package
+The default branch is now "master", which contains the latest code.
 
-Several C files for implementing sequence matching, alignment and consensus:
+The latest integrated release is v0.3.0. Check the [v0.3.0 Integration Installation Guide](https://github.com/PacificBiosciences/FALCON-integrate/wiki/Installation-for-v0.3.0) for installation.
 
-    kmer_lookup.c  # kmer match code for quickly identifying potential hits
-    DW_banded.c    # function for detailed sequence alignment
-                   # It is based on Eugene Myers' Paper 
-                   # "An O(ND) Difference Algorithm and Its Variations", 1986, 
-                   # http://dx.doi.org/10.1007/BF01840446
-    falcon.c       # functions for generating consensus sequences for a set of multiple sequence alignments
-    common.h       # header file for common declaration
+For the pre-Jun 2015 v0.2.2 version, please check [v0.2.2 release github repository](https://github.com/PacificBiosciences/FALCON/tree/v0.2.2). We will no longer address issues that are specific to that branch unless they also impact the current master branch.
 
-A python wrapper library using Python's ctypes to call the C functions: falcon_kit.py
-
-Some python scripts for (1) overlapping reads, (2) generating consensus, and (3) generating 
-assembly contigs:
-
-    falcon_overlap.py   # an overlapper
-    falcon_wrap.py      # generate consensus from a group of reads
-    get_rdata.py        # a utility for preparing data for falcon_wrap.py
-    falcon_asm.py       # takes the overlapping information and the sequences to generate assembled contigs
-    falcon_fixasm.py    # a script analyzing the assembly graph and breaking contigs at potential mis-assembly points
-    remove_dup_ctg.py   # a utility to remove duplicated contigs in the assembly results
-
-
-INSTALLATION
-------------
-
-You need to install `pbcore` and `networkx` first. You might want to install
-the `HBAR-DTK` if you want to assemble genomes from raw PacBio data.  
-
-On a Linux box, you should be able to use the standard `python setup.py
-install` to compile the C code and install the python package. There is no standard
-way to install the shared objects from the C code inside a python package, so I
-did some hacking to make it work.  It might have some unexpected behavior. You can
-simply install the `.so` files in a path where the operating system can find them
-(e.g. by setting the environment variable `LD_LIBRARY_PATH`), and remove all
-prefixes in the Python `ctypes` `CDLL` function calls.
-
-
-EXAMPLES
---------
-
-Example for generating pre-assembled reads:
-
-    python get_rdata.py queries.fofn targets.fofn m4.fofn 72 0 16 8 64 50 50 | falcon_wrap.py > p-reads-0.fa
-    
-    bestn : 72
-    group_id : 0
-    num_chunk : 16
-    min_cov : 8
-    max_cov : 64
-    trim_align : 50
-    trim_plr : 50
-
-    It is designed to be used with the m4 alignment information generated by blasr + HBAR_WF2.py (https://github.com/PacificBiosciences/HBAR-DTK)
-
-Example for generating overlap data:
-
-    falcon_overlap.py --min_len 4000 --n_core 24 --d_core 3 preads.fa > preads.ovlp
-
-Example for generating an assembly:
-
-    falcon_asm.py preads.ovlp  preads.fa 
-
-The following files will be generated by `falcon_asm.py` in the same directory:
-
-    full_string_graph.adj  # the adjacent nodes of the edges in the full string graph
-    string_graph.gexf      # the gexf file of the string graph for graph visualization
-    string_graph.adj       # the adjacent nodes of the edges in the string graph after transitive reduction
-    edges_list             # full edge list 
-    paths                  # path for the unitigs
-    unit_edges.dat         # path and sequence of the unitigs
-    uni_graph.gexf         # unitig graph in gexf format 
-    unitgs.fa              # fasta files of the unitigs
-    all_tigs_paths         # paths for all final contigs (= primary contigs + associated contigs)
-    all_tigs.fa            # fasta file for all contigs
-    primary_tigs_paths     # paths for all primary contigs 
-    primary_tigs.fa        # fasta file for the primary contigs
-    asm_graph.gexf         # the assembly graph where the edges are the contigs
-
-Although I have tested this tool kit on genomes up to 150 Mb and obtained
-reasonably good assembly results, this tool kit is still highly experimental and
-is not meant to be used by novices. If you would like to try it out, you will very
-likely need to know more detail about it and be able to tweak the code to adapt it
-to your computation cluster.  I hope that I can provide more details and
-clean the code up a little in the future so it can be useful to more people. 
-
-The principle of the layout algorithm is also available at 
-https://speakerdeck.com/jchin/string-graph-assembly-for-diploid-genomes-with-long-reads
+- [wiki pages](https://github.com/PacificBiosciences/FALCON/wiki)
+- [Developer Installation Guide](https://github.com/PacificBiosciences/FALCON/wiki/Setup:-Installation-and-Environment)
+- [v0.3.0 Integration Installation Guide](https://github.com/PacificBiosciences/FALCON-integrate/wiki/Installation-for-v0.3.0)
+- [Documentation is here.](https://github.com/PacificBiosciences/FALCON/wiki/Manual)
+- [FAQs](https://github.com/PacificBiosciences/FALCON/wiki/FAQs)
+- [v0.2.2 release github repository](https://github.com/PacificBiosciences/FALCON/tree/v0.2.2)
 
 ABOUT THE LICENSE
 ------------------
 
-The major part of the coding work was done in my own time and on my own MacBook(R)
-Air. However, as a PacBio(R) employee, most of the testing was done with data
-generated by PacBio and with PacBio's computational resources, so it is fair that the
-code is released under PacBio's version of an open source licence. If you are from
-a competitor and try to take advantage of any open source code from PacBio, the
-only way you can really justify such a practice is to release your real data in
-public and your code as open source too. 
-
-Also, releasing this code to the public is fully at my own discretion. If my employer
-has any concern about this, I might have to take it down.
-
-Standard PacBio Open Source License that is associated with this package:
-
-    #################################################################################$$
-    # Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-    #
-    # All rights reserved.
-    #
-    # Redistribution and use in source and binary forms, with or without
-    # modification, are permitted (subject to the limitations in the
-    # disclaimer below) provided that the following conditions are met:
-    #
-    #  * Redistributions of source code must retain the above copyright
-    #  notice, this list of conditions and the following disclaimer.
-    #
-    #  * Redistributions in binary form must reproduce the above
-    #  copyright notice, this list of conditions and the following
-    #  disclaimer in the documentation and/or other materials provided
-    #  with the distribution.
-    #
-    #  * Neither the name of Pacific Biosciences nor the names of its
-    #  contributors may be used to endorse or promote products derived
-    #  from this software without specific prior written permission.
-    #
-    # NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-    # GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-    # BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-    # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-    # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-    # DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-    # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-    # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-    # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-    # USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-    # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-    # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-    # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-    # SUCH DAMAGE.
-    #################################################################################$$
+Standard PacBio ["Open Source License"](LICENSE).
 
---Jason Chin, Dec 16, 2013
+July 9th, 2015
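
The installation note removed above alludes to how the compiled `.so` files are located via Python's `ctypes`. A small illustrative sketch of the two loading styles it describes (the library name `falcon.so` here is an assumption, not necessarily the package's actual file name):

    import os
    from ctypes import CDLL

    # Style 1: load by an explicit path prefix, as the setup.py hack does.
    lib = CDLL(os.path.join(os.path.dirname(os.path.abspath(__file__)), "falcon.so"))

    # Style 2: load by bare name and let the dynamic linker search LD_LIBRARY_PATH;
    # this is what "remove all prefixes in the CDLL calls" refers to.
    lib = CDLL("falcon.so")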
 
diff --git a/doc/README b/doc/README
new file mode 100644
index 0000000..d835a13
--- /dev/null
+++ b/doc/README
@@ -0,0 +1,3 @@
+The images here are used by
+
+  https://github.com/PacificBiosciences/FALCON/wiki/Manual
diff --git a/doc/falcon_icon.svg b/doc/falcon_icon.svg
new file mode 100644
index 0000000..8dffb35
--- /dev/null
+++ b/doc/falcon_icon.svg
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0" y="0" width="258.666" height="245.667" viewBox="0, 0, 258.666, 245.667">
+  <defs>
+    <linearGradient id="Gradient_1" gradientUnits="userSpaceOnUse" x1="225.627" y1="404.121" x2="225.627" y2="192.454">
+      <stop offset="0.02" stop-color="#1D3884"/>
+      <stop offset="1" stop-color="#FFFFFF"/>
+    </linearGradient>
+    <linearGradient id="Gradient_2" gradientUnits="userSpaceOnUse" x1="139.88" y1="377.62" x2="313.76" y2="226.54">
+      <stop offset="0.026" stop-color="#D90000"/>
+      <stop offset="1" stop-color="#FFFFFF" stop-opacity="0.798"/>
+    </linearGradient>
+  </defs>
+  <g id="Layer_1" transform="translate(-96.294, -175.454)">
+    <g>
+      <path d="M151.651,192.454 L299.603,192.454 C320.787,192.454 337.96,208.634 337.96,228.592 L337.96,367.982 C337.96,387.941 320.787,404.121 299.603,404.121 L151.651,404.121 C130.467,404.121 113.294,387.941 113.294,367.982 L113.294,228.592 C113.294,208.634 130.467,192.454 151.651,192.454 z" fill="url(#Gradient_1)"/>
+      <path d="M151.651,192.454 L299.603,192.454 C320.787,192.454 337.96,208.634 337.96,228.592 L337.96,367.982 C337.96,387.941 320.787,404.121 299.603,404.121 L151.651,404.121 C130.467,404.121 113.294,387.941 113.294,367.982 L113.294,228.592 C113.294,208.634 130.467,192.454 151.651,192.454 z" fill-opacity="0" stroke="#FFFFFF" stroke-width="9"/>
+    </g>
+    <g>
+      <path d="M144.25,337.917 C144.25,337.917 163.5,314.5 210.833,301.833 C232.522,296.029 309.5,281.167 309.5,281.167 C309.5,281.167 284.667,318.5 272.167,331.833 C259.667,345.167 253,328 218.5,337.5 C184,347 171.833,370.167 171.833,370.167 C171.833,370.167 151.833,373.833 133.833,382.833 C115.833,391.833 109.04,410.241 107.833,408.833 C105.629,406.262 98.75,380.75 110.75,364.75 C122.75,348.75 129,349.583 134.25,332.25 C139.5,314.917 135.5,290.5 177.5,258.5 C219.5,226.5 347.583,195.25  [...]
+      <path d="M144.25,337.917 C144.25,337.917 163.5,314.5 210.833,301.833 C232.522,296.029 309.5,281.167 309.5,281.167 C309.5,281.167 284.667,318.5 272.167,331.833 C259.667,345.167 253,328 218.5,337.5 C184,347 171.833,370.167 171.833,370.167 C171.833,370.167 151.833,373.833 133.833,382.833 C115.833,391.833 109.04,410.241 107.833,408.833 C105.629,406.262 98.75,380.75 110.75,364.75 C122.75,348.75 129,349.583 134.25,332.25 C139.5,314.917 135.5,290.5 177.5,258.5 C219.5,226.5 347.583,195.25  [...]
+    </g>
+  </g>
+</svg>
diff --git a/doc/falcon_icon2.png b/doc/falcon_icon2.png
new file mode 100644
index 0000000..38b3abd
Binary files /dev/null and b/doc/falcon_icon2.png differ
diff --git a/doc/file_format_note.md b/doc/file_format_note.md
deleted file mode 100644
index 6cbcdd4..0000000
--- a/doc/file_format_note.md
+++ /dev/null
@@ -1,113 +0,0 @@
-Quick Note on FALCON Assembly Output Format
-============================================
-
-After running `falcon_asm.py`, the following files will be generated
-
-- `edges_list`: the list of edges in the assembled string graph
-- `unit_edge_paths`: the path of each unitig
-- `unit_edges.dat`: the path and the sequence of each unitig
-- `unitgs.fa`: fasta file of all unitigs
-- `all_tigs_paths`: the path of all contigs
-- `all_tigs.fa`: the sequences of all contigs
-- `primary_tigs_paths`: the path of the primary contigs
-- `primary_tigs.fa`: the sequences of the initial primary contigs
-- `bundle_edges`: the edges and paths of each "string bundles"
-
-After running `falcon_fixasm.py`, it generates the following files
-
-- `primary_tigs_c.fa`: the final primary contigs
-- `primary_tigs_paths_c`: the path of the final primary contigs
-- `all_tiling_path_c`: the "tiling" path of all contigs
-- `primary_tigs_node_pos_c`: the positions of the nodes in each of the primary contigs
-
-The format of each node is the identifier of the DNA fragment followed by `:B` or `:E` indicating
-which end of the read corresponds to the node.
-
-The `edges_list` file has a simple 4-column format: `in_node out_node edge_label overlap_length`.
- 
-Here is an example of how edges are represented in the `edges_list` file:
-	
-	00099576_1:B 00101043_0:B 00101043_0:1991-0 14333
-	00215514_0:E 00025025_0:B 00025025_0:99-0 14948
-	00223367_0:E 00146924_0:B 00146924_0:1188-0 8452
-	00205542_0:E 00076625_0:B 00076625_0:396-0 11067
-
-The `edge_label`, e.g. `00101043_0:1991-0`, encodes the corresponding sequence of the edge from the DNA fragment. The
-edge `00099576_1:B -> 00101043_0:B` has a sequence from read `00101043_0` base 1991 to 0.
-
-
-The `unit_edge_paths` file contains the path of each unitig. Each line represents 
-a unitig. For example, the unitig `00001c` is represented as:
-
-	>00001c-00169881_0:B-00121915_0:E-133 00169881_0:B 00201238_0:E 00137179_0:E 00142410_0:B 
-     00223493_0:B 00208425_0:B 00102538_0:E 00160115_0:E  ... 00122905_0:E 00121915_0:E
-
-The full unitig id `00001c-00169881_0:B-00121915_0:E-133` includes the unique serial number `00001c`, the begin node `00169881_0:B` and the end node `00121915_0:E` followed by the number of nodes 133 in the path. The rest of the fields list the full path node by node.
-
-The `primary_tigs_paths` and `all_tigs_paths` files have the same format as `unit_edge_paths`, except that the edges in the path are the unitig edges rather than the edges in the original string graph.
-
-The `unit_edges.dat` file contains not only the begin nodes, the end nodes and the paths of the unitigs but also the full sequences of the unitigs.  It has a simple 4-column format: `begin node`, `end node`, `path`, `sequence`. The different nodes in the path are delimited by `-`.  
-
-The sequence identifiers in `all_tigs.fa` also encode the relationship between different contigs. For example:
-
-	$ grep ">" all_tigs.fa | head -15
-	>0000-0000 2e8a7078_130260_0:B-02eca7b8_135520_0:E
-	>0000-0001 6edbcd5c_128868_0:E-3353572d_72448_963:E
-	>0000-0002 2f1c350c_15083_0:E-8c92434f_60400_0:E
-	>0000-0003 02eca7b8_135520_0:B-02030999_5577_0:B
-	>0000-0004-u 53756d78_87035_13099:B-d850f3f2_135807_0:E
-	>0000-0005-u 80ae02b0_43730_1168:B-4901e842_5163_2833:B
-	>0000-0006-u e1709413_155764_0:E-e55b636f_50757_0:E
-	>0000-0007-u e56a70f0_80897_1520:E-06734432_150537_0:E
-	>0000-0008-u 1ab64aad_59082_807:E-6f9ad27e_23458_5638:E
-	>0000-0009-u 1a88ddf4_21715_0:B-9eb4f7d7_79023_11041:E
-	>0000-0010-u ada57c82_24446_0:E-4ce44ebc_41426_0:E
-	>0000-0011-u 49704ee2_54679_0:B-a9ced3cc_90191_1410:E
-	>0000-0012-u b3728b6f_59022_233:E-bd1579e4_160424_0:B
-
-All these sequences have the same first field `0000`, which means all these contigs are initialized from the same "string bundle". If the second field is `0000`, that sequence is the primary contig of this bundle. The rest are the "associated contigs". The second column of the identifier line simply indicates the begin and end nodes of the contig.
-
-After running `falcon_fixasm.py`, some of the primary contigs could be broken apart into smaller pieces. For example:
-	
-	$ grep ">" primary_tigs_c.fa |  head -15
-	>0000_00
-	>0001_00
-	>0001_01
-	>0001_02
-	>0002_00
-	>0002_01
-
-In this case, the initial primary contig `0000` (`0000-0000` in the `all_tigs.fa` file) is intact. However, the `0001-0000` has been broken into 3 primary contigs `0001_00`, `0001_01`, and `0001_02`.
-
-Some of the associated contigs might be caused by sequencing / consensus errors or missing overlapping information. Running `falcon_dedup.py` compares the associated contigs to the corresponding sequences in the primary contigs. If the identity is high, namely no large-scale variants are found, they will be removed. The MUMmer3 (nucmer) package is used and is necessary for this step. `falcon_dedup.py` generates a file called `a_nodup.fa` which contains the non-redundant associated contigs.
-
-
-Input File Format For FalconSense
----------------------------------
-
-The `falcon_sense.py` script generates consensus sequences from sets of raw sequences.
-
-The input is a stream of sequences. Each row has two columns.  Different sets of reads are delimited by `- -` and the file should be terminated by `+ +`.  Here is an example:
-
-	seed_id1 ACTACATACATACTTA...
-	read_id2 TCTGGCAACACTACTTA...
-	...
-	- -
-	seed_id2 ACTACATACATACTTA...
-	read_id3 TCTGGCAACACTACTTA...
-	...
-	- -
-	+ +
-
-In this case, if there is enough coverage to correct `seed_id1` and `seed_id2`, `falcon_sense.py` will generate two consensus sequences (labeled `seed_id1` and `seed_id2`) in fasta format to `stdout`.
-
-Final Note
-----------
-
-1. Typically, the size of `unitgs.fa` will be roughly twice the genome size, since the file contains both dual edges of each overlap. In the assembly process, only one of the dual edges will be used in the final contigs.  
-
-2. The relation between the associated contigs and the primary contigs can be identified simply by the begin and end nodes of the associated contigs. One can easily construct the corresponding sequences in the primary contigs to identify the variants between them.
-
-3. One can construct a unitig graph from the `unit_edge_paths` file; this graph is typically much smaller than the initial string graph, which makes it more convenient to visualize for understanding the assembly/genome structure.
-
-4. The `-` and `:` characters are used as delimiters for parsing, so the initial read identifiers should not contain these two characters. 
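
A small sketch of how the `edges_list` records described in the removed note above can be parsed; field names follow the 4-column description, and the function name is illustrative:

    def parse_edges_list(path):
        """Yield (in_node, out_node, read_id, begin, end, overlap_len) per edge.

        The edge_label, e.g. "00101043_0:1991-0", names the read supplying the
        edge sequence and the base range (begin-end) taken from it.
        """
        with open(path) as f:
            for line in f:
                in_node, out_node, edge_label, overlap_len = line.split()
                read_id, coords = edge_label.split(":")
                begin, end = (int(x) for x in coords.split("-"))
                yield in_node, out_node, read_id, begin, end, int(overlap_len)

    for edge in parse_edges_list("edges_list"):
        print(edge)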
diff --git a/examples/Dmel_asm.md b/examples/Dmel_asm.md
deleted file mode 100644
index f8d20f1..0000000
--- a/examples/Dmel_asm.md
+++ /dev/null
@@ -1,250 +0,0 @@
-Dmel Assembly with FALCON on Amazon EC2
-=========================================
-
-Preparation for Running StarCluster
------------------------------------
-
-I use a development version of StarCluster since the stable version does
-not support the kind of instance that we need to use in AWS EC2.
-
-You can install the development version by directly cloning 
-StarCluster's GitHub repository. The following is a simple example
-for installing the development version. You might have to install
-other python packages that StarCluster depends on.
-
-```
-    git clone https://github.com/jtriley/StarCluster.git
-    cd StarCluster
-    # you can check out the exact revision that I am using for this document
-    git checkout 4149bbed292b0298478756d778d8fbf1dd210daf 
-    python setup.py install
-```
-
-To use StarCluster to create an SGE cluster in AWS EC2, I assume you already know how
-to create an AWS EC2 account and have gone through the tutorial for running VMs on
-EC2.
-
-I have built a public EC2 EBS snapshot. You should create a new EBS volume
-using the `PacBio_Dmel_Asm / snap-19e7a0df` snapshot. It already contains the
-raw sequence fasta files and an example assembly.
-
-Here is an example of the configuration for StarCluster:
-
-```
-    [aws info]
-    aws_access_key_id = your_access_key
-    aws_secret_access_key = your_secret_access_key
-    aws_user_id = your_user_id
-
-    [volume DMEL]
-    volume_id=your_dmel_data_EBS_id #e.g volume_id=vol-c9df3b85
-    mount_path=/mnt/dmel_asm
-
-    [cluster falcon-pre-asm]
-    keyname = starcluster
-    cluster_size = 1
-    cluster_user = sgeadmin
-    cluster_shell = bash
-    master_image_id = ami-ef3c0e86
-    master_instance_type = c3.8xlarge
-    node_image_id = ami-ef3c0e86
-    node_instance_type = c3.8xlarge
-    availability_zone = us-east-1a
-    volumes = DMEL
-
-    [cluster falcon-bigmem]
-    keyname = starcluster
-    cluster_size = 1
-    cluster_user = sgeadmin
-    cluster_shell = bash
-    master_image_id = ami-73d2d21a
-    master_instance_type = cr1.8xlarge
-    node_image_id = ami-73d2d21a
-    node_instance_type = cr1.8xlarge
-    availability_zone = us-east-1a
-    volumes = DMEL
-
-    [global]
-    default_template = falcon-bigmem
-    ENABLE_EXPERIMENTAL=True
-```
-
-I set up two cluster configurations for different parts of the assembly process.
-If you want to run end-to-end on one kind of instance, you can just use 
-`falcon-bigmem` for the whole assembly. It costs a little bit more.
-
-The AMI images (ami-ef3c0e86 and ami-73d2d21a) are pre-built with most packages
-necessary for the assembly work. If you would like to build your own, you can
-consult this script:
-
-```
-    https://raw.github.com/PacificBiosciences/FALCON/v0.1.1/examples/install_note.sh
-```
-
-Get preassembled reads
-------------------------
-
-"Pre-assembly" is the process of error-correcting PacBio reads to generate
-"preassembled reads" (p-reads), which are accurate enough to be assembled by
-traditional Overlap-Layout-Consensus assembly algorithms directly. In these
-instructions, we use experimental code, `falcon_qrm.py`, to match the reads for
-error correction. It is much faster than using `blasr` for the same purpose, but
-it may not be as robust as `blasr` at generating high quality results yet, as many
-statistical properties of the algorithm are not fully studied.
-
-
-First, let's start an EC2 cluster of one node to set up a few things by running 
-the following `starcluster` command:
-
-```
-    starcluster start -c falcon-pre-asm falcon
-```
-
-Once the cluster is built, one can log in to the master node with:
-
-```
-    starcluster sshmaster falcon
-```
-
-We will need the following steps to set up the running environment:
-
-1. update SGE environment
-
-    ```
-        cd /mnt/dmel_asm/sge_setup
-        bash sge_setup.sh
-    ```
-
-2. set up the HBAR-DTK environment
-
-    ```
-        . /home/HBAR_ENV/bin/activate
-    ```
-
-3. update HBAR-DTK and falcon_asm
-
-    ```
-        cd /mnt/dmel_asm/packages/pbtools.hbar-dtk-0.1.5
-        python setup.py install
-        cd /mnt/dmel_asm/packages/falcon_kit-0.1.1
-        #edit falcon_asm.py to set identity threshold for overlapping at 98%, it is done in the EBS snapshot
-        python setup.py install
-    ```
-
-If you want to do an assembly in `/mnt/dmel_asm/new_asm/`, just clone the 
-configuration in `/mnt/dmel_asm/asm_template/` to `/mnt/dmel_asm/new_asm/`:
-
-```
-    cd /mnt/dmel_asm/
-    cp -a asm_template/ new_asm/
-    cd new_asm/
-```
-
-An example of the assembly result can be found in `/mnt/dmel_asm/asm_example`.
-
-You can start the pre-assembly stage by running the `HBAR_WF3.py` script as follows:
-
-```
-    python HBAR_WF3.py HBAR_step1.cfg
-```
-
-It will take a while to prepare the fasta files for pre-assembly. Once that is
-done, SGE jobs for matching reads will be submitted. Once the SGE jobs are
-submitted, you can add more nodes to run the jobs concurrently to speed up
-the process by issuing this command on your local host:
-
-    starcluster addnode -n 15 falcon # add 15 nodes 
-
-When all nodes are up, you can run the load balancer so that once the jobs are
-done, the nodes are terminated automatically to save some money.
-
-    starcluster loadbalance -k 9 -K -m 16 -n 1 falcon
-
-I found I had to comment out one line of code in `starcluster/plugins/sge.py`
-to make it remove unused nodes properly:
-    
-    class SGEPlugin(clustersetup.DefaultClusterSetup):
-        def _remove_from_sge(self, node):
-            #comment out the following line in the code
-            #master.ssh.execute('qconf -de %s' % node.alias)
-
-If you use 16 nodes, it will take about 4 hours to finish all jobs.  When all
-pre-assembly jobs finish, the cluster will be terminated automatically, but the
-results will be kept in the EBS volume.
-
-The generated p-reads will be in `/mnt/dmel_asm/new_asm/2-preads-falcon/pread_*.fa`.
-
-Assembling the p-reads
-------------------------
-
-We use a different instance type, which has more memory, to assemble the genome. We
-only need one node for the assembly part.  We still use SGE, as the code was written 
-to run end-to-end assembly on a general SGE cluster. First, start a single-node cluster by
-running this command on the local host:
-
-```
-    starcluster start -c falcon-bigmem falcon
-```
-
-Repeat the setup process:
-
-```
-    cd /mnt/dmel_asm/sge_setup
-    bash sge_setup.sh
-
-    . /home/HBAR_ENV/bin/activate
-
-    cd /mnt/dmel_asm/packages/pbtools.hbar-dtk-0.1.5
-    python setup.py install
-    cd /mnt/dmel_asm/packages/falcon_kit-0.1.1
-    #edit falcon_asm.py to set identity threshold for overlapping at 98%, it is done in the EBS snapshot
-    python setup.py install
-```
-
-You can start the assembly stage by running the `HBAR_WF3.py` script as follows:
-
-```
-    cd /mnt/dmel_asm/new_asm/
-    python HBAR_WF3.py HBAR_step2.cfg
-```
-
-It takes about two hours for the assembly process to finish. The results will 
-be in `/mnt/dmel_asm/new_asm/3-asm-falcon`. 
-
-Here is a list of the output files:
-
-```
-    full_string_graph.adj  # the adjacent nodes of the edges in the full string graph
-    string_graph.gexf      # the gexf file of the string graph for graph visualization
-    string_graph.adj       # the adjacent nodes of the edges in the string graph after transitive reduction
-    edges_list             # full edge list 
-    paths                  # path for the unitigs
-    unit_edges.dat         # path and sequence of the unitigs
-    uni_graph.gexf         # unitig graph in gexf format 
-    unitgs.fa              # fasta files of the unitigs
-    all_tigs_paths         # paths for all final contigs (= primary contigs + associated contigs)
-    all_tigs.fa            # fasta file for all contigs
-    primary_tigs_paths     # paths for all primary contigs 
-    primary_tigs.fa        # fasta file for the primary contigs
-    primary_tigs_paths_c   # paths for all primary contigs, detectable mis-assemblies are broken 
-    primary_tigs_c.fa      # fasta file for the primary contigs, detectable mis-assemblies are broken
-    asm_graph.gexf         # the assembly graph where the edges are the contigs
-```
-
-There might be redundant contigs. The following script can be used to remove
-them:
-
-```
-    export PATH=$PATH:/home/HBAR_ENV/MUMmer3.23
-    nucmer -mum all_tigs.fa all_tigs.fa -p all_tigs_self >& /dev/null
-    show-coords -o -H -T all_tigs_self.delta | grep CONTAINS | awk '$7>96' | awk '{print $9}' | sort -u > all_tigs_duplicated_ids
-    remove_dup_ctg.py
-    cat p-tigs_nodup.fa a-tigs_nodup.fa > pa-tigs_nodup.fa
-```
-
-The non-redundant set of contigs in `pa-tigs_nodup.fa` will be suitable for further correction
-by the Quiver algorithm. 
-
--
-Jason Chin, March 9, 2014
-
diff --git a/examples/HBAR.cfg b/examples/HBAR.cfg
deleted file mode 100755
index 2257294..0000000
--- a/examples/HBAR.cfg
+++ /dev/null
@@ -1,72 +0,0 @@
-[General]
-# list of files of the initial bas.h5 files
-input_fofn = input.fofn
-
-# The length cutoff used for seed reads used for initial mapping
-length_cutoff = 10000 
-
-# The length cutoff used for seed reads used for pre-assembly
-length_cutoff_pr = 10000
-
-# The read quality cutoff used for seed reads
-RQ_threshold = 0.75
-
-# SGE job option for distributed mapping 
-sge_option_dm = -pe smp 32 -q all.q
-
-# SGE job option for m4 filtering
-sge_option_qf = -pe smp 4 -q all.q
-
-# SGE job option for pre-assembly
-sge_option_pa = -pe smp 32 -q all.q
-
-# SGE job option for CA 
-sge_option_ca = -pe smp 8 -q all.q
-
-# SGE job option for Quiver
-sge_option_qv = -pe smp 32 -q all.q
-
-# blasr for initial read-read mapping for each chunk (do not specify the "-out" option). 
-# One might need to tune the bestn parameter to match the number of distributed chunks to get more optimized results 
-blasr_opt = -nCandidates 64 -minMatch 12 -maxLCPLength 15 -bestn 48 -minPctIdentity 75.0 -maxScore -1000 -nproc 32 
-
-#This is used for running quiver
-SEYMOUR_HOME = /mnt/secondary/Smrtpipe/builds/Assembly_Mainline_Nightly_Archive/build470-116466/
-
-#The number of best alignment hits used for pre-assembly
-#It should be about the same as the final PLR coverage; slightly higher might be OK.
-bestn = 64
-
-# target choices are "pre_assembly", "draft_assembly", "all"
-# "mapping": initial mapping
-# "pre_assembly" : generate pre_assembly for any long read assembler to use
-# "draft_assembly": automatic submit CA assembly job when pre-assembly is done
-# "all" : submit job for using Quiver to do final polish, not working yet
-target = pre_assembly
-
-
-# number of chunks for pre-assembly. 
-preassembly_num_chunk = 1 
-
-
-q_chunk_size = 1
-t_chunk_size = 3
-
-# "tmpdir" is for preassembly. A lot of small files are created and deleted during this process. 
-# It would be great to use ramdisk for this. Setting tmpdir to an NFS mount will probably give very bad performance.
-tmpdir = /tmp
-
-# "big_tmpdir" is for quiver, better in a big disk
-big_tmpdir = /tmp
-
-# various trimming parameters
-min_cov = 8
-max_cov = 64
-trim_align = 50
-trim_plr = 50
-
-# number of processes used by blasr during the preassembly process
-q_nproc = 16 
-
-
-concurrent_jobs = 1
diff --git a/examples/StarCluster.cfg b/examples/StarCluster.cfg
deleted file mode 100644
index db5f1f9..0000000
--- a/examples/StarCluster.cfg
+++ /dev/null
@@ -1,24 +0,0 @@
-[aws info]
-aws_access_key_id = your_key
-aws_secret_access_key = your_access_key
-aws_user_id = your_user_id
-
-[key starcluster]
-key_location = ~/.ec2/starcluster.rsa
-
-[cluster falcon]
-#The AMI image is based on ami-765b3e1f us-east-1 starcluster-base-ubuntu-12.04-x86_64 
-keyname = starcluster
-cluster_size = 1
-cluster_user = sgeadmin
-cluster_shell = bash
-master_image_id = ami-ef3c0e86
-master_instance_type = c3.8xlarge
-node_image_id = ami-ef3c0e86
-node_instance_type = c3.8xlarge
-availability_zone = us-east-1c
-
-[global]
-default_template = falcon
-ENABLE_EXPERIMENTAL=True
-
diff --git a/examples/build_env.sh b/examples/build_env.sh
new file mode 100644
index 0000000..f2663a3
--- /dev/null
+++ b/examples/build_env.sh
@@ -0,0 +1,22 @@
+virtualenv --no-site-packages  --always-copy   $PWD/fc_env
+. $PWD/fc_env/bin/activate
+git clone https://github.com/pb-jchin/pypeFLOW
+cd pypeFLOW
+python setup.py install
+
+cd ..
+git clone https://github.com/PacificBiosciences/FALCON.git
+cd FALCON
+python setup.py install
+
+cd ..
+git clone https://github.com/pb-jchin/DAZZ_DB.git
+cd DAZZ_DB/
+make
+cp DBrm DBshow DBsplit DBstats fasta2DB ../fc_env/bin/
+
+cd ..
+git clone https://github.com/pb-jchin/DALIGNER.git
+cd DALIGNER
+make
+cp daligner daligner_p DB2Falcon HPCdaligner LA4Falcon LAmerge LAsort  ../fc_env/bin
diff --git a/examples/build_env2.sh b/examples/build_env2.sh
new file mode 100644
index 0000000..a5e404c
--- /dev/null
+++ b/examples/build_env2.sh
@@ -0,0 +1,22 @@
+virtualenv --no-site-packages  --always-copy   $PWD/fc_env
+. $PWD/fc_env/bin/activate
+
+cd FALCON
+git submodule init
+git submodule update
+
+cd pypeFLOW
+python setup.py install
+cd ..
+
+python setup.py install
+
+cd DAZZ_DB/
+make
+cp DBrm DBshow DBsplit DBstats fasta2DB ../../fc_env/bin/
+cd ..
+
+cd DALIGNER
+make
+cp daligner daligner_p DB2Falcon HPCdaligner LA4Falcon LAmerge LAsort  ../../fc_env/bin
+cd ../..
diff --git a/examples/ecoli_asm_graph_exploration.ipynb b/examples/ecoli_asm_graph_exploration.ipynb
new file mode 100644
index 0000000..1a7160f
--- /dev/null
+++ b/examples/ecoli_asm_graph_exploration.ipynb
@@ -0,0 +1,716 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:585e8268126c8365dec7b581d0673bd9a5a1e83a88800e51c0f9028479318042"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "heading",
+     "level": 1,
+     "metadata": {},
+     "source": [
+      "FALCON Assembly Graph Processing and Visualization Example\n"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "This notebook shows how to fetch the graph data from the assembler and how to examine the bubbles in the assembly graph."
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "You will need to have `matplotlib`, `networkx` and `pygraphviz` as Python libraries. Of course, you will also need to have the `graphviz` command line tool installed on your system. "
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Some setup to change the working directory to where the data is."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "%cd /mnt/projects/bifxanalysis/jchin_asm_jobs/DA_develop_test/Ecoli/2-asm-falcon/\n",
+      "%matplotlib inline\n",
+      "%pylab inline"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "/mnt/projects/bifxanalysis/jchin_asm_jobs/DA_develop_test/Ecoli/2-asm-falcon\n",
+        "Populating the interactive namespace from numpy and matplotlib\n"
+       ]
+      }
+     ],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Some boilerplate code for loading the necessary module and classes"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import networkx as nx\n",
+      "from IPython.display import display, HTML, SVG, Image"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Load the `AsmGraph` class, which parses the graph data files and loads them into python objects. "
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from falcon_kit.fc_asm_graph import AsmGraph"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 3
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Load data into `G_asm`. "
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "G_asm = AsmGraph(\"sg_edges_list\", \"utg_data\", \"ctg_paths\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 4
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Show all contig identifiers. The `000000R` is the dual contig of `000000F`."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print G_asm.ctg_data.keys()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "['000000F', '000000R']\n"
+       ]
+      }
+     ],
+     "prompt_number": 5
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Print out the content of the contig `000000F`. The output is a tuple of `contig identifier`, `start_utg`, `end_node`, `number of bases`, `number of overlapped bases`, `unitigs`."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print G_asm.ctg_data[\"000000F\"]\n",
+      "print\n",
+      "print \"number of unitigs in the contig:\", len(G_asm.ctg_data[\"000000F\"][-1])"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "('ctg_linear', '000011445:B~NA~000003425:E', '000011445:B', 4642293, 28970832, (['000011445:B', 'NA', '000003425:E'], ['000003425:E', '000009719:E', '000006955:E'], ['000006955:E', 'NA', '000017147:B'], ['000017147:B', '000012473:B', '000010757:B'], ['000010757:B', 'NA', '000015636:E'], ['000015636:E', '000004093:E', '000015696:B'], ['000015696:B', 'NA', '000016941:B'], ['000016941:B', '000003353:B', '000008783:B'], ['000008783:B', 'NA', '000006338:B'], ['000006338:B', '00000493 [...]
+        "\n",
+        "number of unitigs in the contig: 16\n"
+       ]
+      }
+     ],
+     "prompt_number": 6
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "If the starting node is the same as the ending node, then the contig may be circular too. (The contig type only applies to a \"simple unitig\". Any contig containing more than one unitig is classified as `ctg_linear`. This convention may change in the future.)"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "utgs = G_asm.ctg_data[\"000000F\"][-1]\n",
+      "if utgs[0][0] == utgs[-1][-1]:\n",
+      "    print \"the contig is circular\"\n",
+      "else:\n",
+      "    print \"the contig is not circular\""
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "the contig is circular\n"
+       ]
+      }
+     ],
+     "prompt_number": 7
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Check the number of nodes in the string subgraph for this contig. We can get the full string subgraph using the method `get_sg_for_ctg()`.  This is the full string graph. Each node is the 5' or 3' end of a read."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "sg = G_asm.get_sg_for_ctg(\"000000F\")\n",
+      "print \"number of nodes:\", len(sg.nodes())\n",
+      "print \"number of edges:\", len(sg.edges())"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "number of nodes: 1361\n",
+        "number of edges: 1369\n"
+       ]
+      }
+     ],
+     "prompt_number": 8
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Call the `neato` layout code from Graphviz to lay out the contig string graph. We can see a circle with a couple of small bubbles."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "position=nx.graphviz_layout(sg, prog='neato') "
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 9
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "figure(figsize=(10,10))\n",
+      "ax=subplot(1,1,1)\n",
+      "minx = 10\n",
+      "miny = 10\n",
+      "count = 0\n",
+      "for e in sg.edges():\n",
+      "    yy, xx = zip(position[e[0]], position[e[1]]) \n",
+      "\n",
+      "    xx = -array(xx)\n",
+      "    yy = -array(yy)\n",
+      "    ax.plot( xx, yy, \".-b\" ) \n",
+      "\n",
+      "    if min(xx) < minx:\n",
+      "        minx = min(xx)\n",
+      "    if min(yy) < miny:\n",
+      "        miny = min(yy)\n",
+      "\n",
+      "    #print x,y\n",
+      "xlim(minx*1.1,-minx*0.1)\n",
+      "ylim(miny*1.1,-miny*0.1)\n",
+      "ax.get_xaxis().set_visible(False)\n",
+      "ax.get_yaxis().set_visible(False)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAjwAAAI8CAYAAAD1D3GaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3U13HNWZB/CnLQdhiGNJSIkA2y1b2NZq1iwI38KLfMKc\nIxazmy8gLziD59iZ1WCDsYCJjV+wgYRX2z2LSo+6ql+klrq7qu79/c7hRFUtoIIl9V/P89x7O71e\nLwAAUnaq7gcAAJg3gQcASJ7AAwAkT+ABAJIn8AAAyRN4AIDknZ70YqfTsWYdAGiNXq/XGXV/YuD5\n1984+6cBAJixTmdk1okILS0AIAMCDwCQPIEHAEiewAMAJE/gAQCSJ/AAAMkTeACA5Ak8AEDyBB4A\nIHkCDwCQPIEHAEiewAMAJE/gAQCSJ/AAAMkTeACA5Ak8AEDyBB4AIHkCDwCQPIEHAEiewAMAJE/g\nAQCSJ/AAA [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a28ac310>"
+       ]
+      }
+     ],
+     "prompt_number": 10
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Create the assembly graph, in which each edge is a simple or a compound unitig rather than a raw string graph edge. "
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "utgs = G_asm.ctg_data[\"000000F\"][-1]\n",
+      "utg_g = nx.DiGraph()\n",
+      "for s,v,t in utgs:\n",
+      "    utg_g.add_edge(s,t)\n",
+      "position=nx.graphviz_layout(utg_g, prog='neato') \n",
+      "figure(figsize=(10,10))\n",
+      "ax=subplot(1,1,1)\n",
+      "minx = 10\n",
+      "miny = 10\n",
+      "count = 0\n",
+      "for e in utg_g.edges():\n",
+      "    yy, xx = zip(position[e[0]], position[e[1]]) \n",
+      "\n",
+      "    xx = -array(xx)\n",
+      "    yy = -array(yy)\n",
+      "\n",
+      "    ax.plot( xx, yy, \".-b\" ) \n",
+      "\n",
+      "    if min(xx) < minx:\n",
+      "        minx = min(xx)\n",
+      "    if min(yy) < miny:\n",
+      "        miny = min(yy)\n",
+      "\n",
+      "    #print x,y\n",
+      "xlim(minx*1.1,-minx*0.1)\n",
+      "ylim(miny*1.1,-miny*0.1)\n",
+      "ax.get_xaxis().set_visible(False)\n",
+      "ax.get_yaxis().set_visible(False)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAjwAAAI8CAYAAAD1D3GaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xd4lWW2/vF7Q2ihoyAKCIoUCzI4YKGLAoJKFRRQSqLY\nxgLzG71mPE7RGR05E3DErkmoUlQQREGUJqAOAwoWBkVFiiDSQTrJ/v2xJieWEDbJ3vvZ+3m/n+vi\nQoSZs847Ibmz3udZKxQOhwUAAOCzEq4LAAAAiDUCDwAA8B6BBwAAeI/AAwAAvEfgAQAA3iPwAAAA\n76UU9puhUIg76wAAIGmEw+FQQf++0MDz3/9g9KsBAACIslCowKwjiVdaAAAgAAg8AADAewQeAADg\nPQIPAADwHoEHAAB4j8ADAAC8R+ABAADeI/AAAADvEXgAAID3CDwAAMB7BB4AAOA9Ag8AAPAegQcA\nAHiPwAMAA [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc19bc98550>"
+       ]
+      }
+     ],
+     "prompt_number": 11
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Load sequences for each string graph edge so we can generate a sequence given a path in the graph."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "G_asm.load_sg_seq(\"preads4falcon.fasta\")"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 12
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "This shows how to get sequences for each simple or compound unitig within a contig."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "compound_path_seqs = {}\n",
+      "print \"s\", \"v\", \"t\", \"type\", \"seq len\", \"path len\"\n",
+      "for s,v,t in G_asm.ctg_data[\"000000F\"][-1]:\n",
+      "    utg_data = G_asm.utg_data[(s,t,v)]\n",
+      "    type_, length, score, path_or_edges = utg_data\n",
+      "    \n",
+      "    if type_ == \"simple\":\n",
+      "        path = path_or_edges.split(\"~\")\n",
+      "        seq = G_asm.get_seq_from_path( path )\n",
+      "        print s, v, t, utg_data[0], len(seq)\n",
+      "    else:\n",
+      "        c_graph = nx.DiGraph()\n",
+      "        simple_utgs = [ e.split(\"~\") for e in path_or_edges.split(\"|\")]\n",
+      "        \n",
+      "        for ss, tt, vv in simple_utgs:\n",
+      "            type_, length, score, sub_path = G_asm.utg_data[ (ss,vv,tt) ]\n",
+      "            sub_path = sub_path.split(\"~\")\n",
+      "            v1 = sub_path[0]\n",
+      "            for v2 in sub_path[1:]:\n",
+      "                c_graph.add_edge( v1, v2, score = 10000000 - G_asm.sg_edges[ (v1, v2) ][1]  )\n",
+      "                v1 = v2\n",
+      "        seqs = []\n",
+      "        while 1:\n",
+      "            try:\n",
+      "                shortest_path = nx.shortest_path( c_graph, s, t, weight = \"score\" )\n",
+      "            except nx.exception.NetworkXNoPath:\n",
+      "                break\n",
+      "            seq = G_asm.get_seq_from_path( shortest_path )\n",
+      "            seqs.append( (seq, shortest_path) )\n",
+      "            \n",
+      "            n0 = shortest_path[0]\n",
+      "            for n1 in shortest_path[1:]:\n",
+      "                c_graph.remove_edge(n0, n1)\n",
+      "                n0 = n1\n",
+      "        \n",
+      "        compound_path_seqs[(s,v,t)] = seqs\n",
+      "        for seq, subpath in seqs:            \n",
+      "            print s, v, t, utg_data[0], len(seq), len(subpath)\n"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "s v t type seq len path len\n",
+        "000011445:B NA 000003425:E compound 2736 3\n",
+        "000011445:B NA 000003425:E compound 2681 3\n",
+        "000003425:E 000009719:E 000006955:E simple 84196\n",
+        "000006955:E NA 000017147:B compound 26315 7\n",
+        "000006955:E NA 000017147:B compound 26357 7\n",
+        "000017147:B 000012473:B 000010757:B simple 601191\n",
+        "000010757:B NA 000015636:E compound 16169 3\n",
+        "000010757:B NA 000015636:E compound 16194 3\n",
+        "000015636:E 000004093:E 000015696:B simple 1469288\n",
+        "000015696:B NA 000016941:B compound 16458 3\n",
+        "000015696:B NA 000016941:B compound 16438 8\n",
+        "000016941:B 000003353:B 000008783:B simple 88381\n",
+        "000008783:B NA 000006338:B compound 26074 3\n",
+        "000008783:B NA 000006338:B compound 26058 10\n",
+        "000006338:B 000004932:B 000010623:B simple 206164\n",
+        "000010623:B NA 000014991:B compound 30158 3\n",
+        "000010623:B NA 000014991:B compound 30148 8\n",
+        "000014991:B 000013790:E 000002926:B simple 392373\n",
+        "000002926:B NA 000011761:B compound 25736 3\n",
+        "000002926:B NA 000011761:B compound 25814 12\n",
+        "000011761:B 000003659:E 000014184:E simple 184084\n",
+        "000014184:E NA 000012028:E compound 14792 3\n",
+        "000014184:E NA 000012028:E compound 14895 4\n",
+        "000012028:E 000013461:E 000011445:B simple 1447372\n"
+       ]
+      }
+     ],
+     "prompt_number": 13
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      " Boilerplate code for using an aligner within `falcon_kit` for dot plots and alignment."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from falcon_kit import kup, falcon, DWA\n",
+      "rcmap = dict(zip(\"ACGTacgtNn-\",\"TGCATGCANN-\"))\n",
+      "def rc(seq):\n",
+      "    return \"\".join([rcmap[c] for c in seq[::-1]])\n",
+      "\n",
+      "def get_aln_data(t_seq, q_seq):\n",
+      "    aln_data = []\n",
+      "    K = 8\n",
+      "    seq0 = t_seq\n",
+      "    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )\n",
+      "    sa_ptr = kup.allocate_seq( len(seq0) )\n",
+      "    sda_ptr = kup.allocate_seq_addr( len(seq0) )\n",
+      "    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)\n",
+      "    q_id = \"dummy\"\n",
+      "    \n",
+      "    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr, lk_ptr)\n",
+      "    kmer_match = kmer_match_ptr[0]\n",
+      "    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 12)\n",
+      "    aln_range = aln_range_ptr[0]\n",
+      "    x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count)] )\n",
+      "    kup.free_kmer_match(kmer_match_ptr)\n",
+      "    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2\n",
+      "    \n",
+      "    if e1 - s1 > 100:\n",
+      "\n",
+      "        alignment = DWA.align(q_seq[s1:e1], e1-s1,\n",
+      "                              seq0[s2:e2], e2-s2,\n",
+      "                              100,1)\n",
+      "\n",
+      "        if alignment[0].aln_str_size > 100:\n",
+      "            aln_data.append( ( q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist ) )\n",
+      "            aln_str1 = alignment[0].q_aln_str\n",
+      "            aln_str0 = alignment[0].t_aln_str\n",
+      "\n",
+      "        DWA.free_alignment(alignment)\n",
+      "\n",
+      "    kup.free_kmer_lookup(lk_ptr)\n",
+      "    kup.free_seq_array(sa_ptr)\n",
+      "    kup.free_seq_addr_array(sda_ptr)\n",
+      "    return aln_data, x, y"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 14
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Plot the subgraph of each compound path and do alignment between different subpaths within the compound path.  One can see 7 out of 8 compound path might caused by errors where the identity between the alternative paths are high. However, the path starts at `000006955:E` and ends at `000017147:B` shows a 2kb inversion. This appears to be realy bioligcal polymorphism in a population of E. coli cells."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print \"s\", \"v\", \"t\", \"aln_identity\", \"aln_coverage\"\n",
+      "\n",
+      "for s,v,t in compound_path_seqs:\n",
+      "    \n",
+      "    sg = G_asm.get_sg_for_utg((s,t,v))\n",
+      "    position=nx.graphviz_layout(sg, prog='dot') \n",
+      "    figure(figsize=(10,4))\n",
+      "    ax=subplot(1,2,1)\n",
+      "    minx = 10\n",
+      "    miny = 10\n",
+      "    count = 0\n",
+      "    for e in sg.edges():\n",
+      "        yy, xx = zip(position[e[0]], position[e[1]]) \n",
+      "        col = \"k\"\n",
+      "        xx = -array(xx)\n",
+      "        yy = -array(yy)\n",
+      "        ax.plot( xx, yy, \".-\"+col ) \n",
+      "        if min(xx) < minx:\n",
+      "            minx = min(xx)\n",
+      "        if min(yy) < miny:\n",
+      "            miny = min(yy)\n",
+      "\n",
+      "        #print x,y\n",
+      "    xlim(minx*1.1,-minx*0.1)\n",
+      "    ylim(miny*1.1,-miny*0.1)\n",
+      "    ax.get_xaxis().set_visible(False)\n",
+      "    ax.get_yaxis().set_visible(False)\n",
+      "    \n",
+      "    seqs = compound_path_seqs[(s,v,t)]\n",
+      "    seq0 = seqs[0][0]\n",
+      "    ax=subplot(1,2,2)\n",
+      "    for seq, path in seqs[1:]:\n",
+      "        aln_data, x, y = get_aln_data(seq0, seq)\n",
+      "        print s,v,t, 1.0-1.0*aln_data[-1][-1] / aln_data[-1][-2], 1.0*(aln_data[-1][3]-aln_data[-1][2])/aln_data[-1][4]\n",
+      "        \n",
+      "        plot(x, y,'.b', markersize=0.2)\n",
+      "        seq = rc(seq)\n",
+      "        aln_data, rx, ry = get_aln_data(seq0, seq)\n",
+      "        rx = np.array(rx)\n",
+      "        rx = len(seq) - rx\n",
+      "        plot(rx, ry,'.r', markersize=0.2)\n",
+      "        text(0, 0, \"%s %s %s\" % (s, v, t))\n",
+      "        \n",
+      "        \n",
+      "        \n"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "s v t aln_identity aln_coverage\n",
+        "000011445:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000003425:E 0.97798972854 0.995151063036\n",
+        "000010757:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000015636:E 0.99697456162 0.999382487341\n",
+        "000014184:E"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000012028:E 0.98953722334 0.999261497147\n",
+        "000006955:E"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000017147:B 0.9952 0.923473839967\n",
+        "000015696:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000016941:B 0.995871782419 0.999391653486\n",
+        "000002926:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000011761:B 0.990612686394 0.999612613311\n",
+        "000008783:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000006338:B 0.99254387642 0.999616240694\n",
+        "000010623:B"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        " NA 000014991:B 0.992332352844 0.999601963646\n"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAkgAAAEACAYAAABI/YkzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xd4VFX+x/H3CUWqIlUgCEhZpKTQIooSRFAEUVAElGZD\nkea6Ls0fK3aBVdRVgaUXRVQUQaQJRKQIJCZBybLomiAgEPpKIguE8/tjZmIIoYXJ3Cmf1/PMw82d\nOzPfuQl3PnPuOecaay0iIiIi8ocwpwsQERER8TcKSCIiIiK5KCCJiIiI5KKAJCIiIpKLApKIiIhI\nLgpIIiIiIrkoIImIzxhjihljNhpjkowxKcaYV93ryxpjVhhjthtjlhtjyuR4zAhjzI/GmG3GmHY5\n1jcxxnzvvu8tJ96PiAQvBSQR8Rlr7XGgtbU2CogAWhtjWgLDgRXW2rrASvfPGGPqA92A+sAdwHvG\nGON+ugnAI [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a3fb6750>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXm4FMXVuN8CRaVdEPWaCC6YoEFlc0OUyDVR1E8+F9xA\nWUVBENQYNAK/KCrcCBIRDReDgLK4gEsEUQFFrwgim3A1ELcILrhcxSXaxo/F+v0xPXN7enqp7ulZ\n7tx6n8fHudVVp05VN1NnzjlVLaSUaDQajUaj0WjcaVBoBTQajUaj0WiKGW0saTQajUaj0figjSWN\nRqPRaDQaH7SxpNFoNBqNRuODNpY0Go1Go9FofNDGkkaj0Wg0Go0P2ljSaDSxI4SYLoT4Qgjxlq3s\nRCHEKiHEOiHEaiHECbZrw4UQ7wkh3hZCdLGVHyeEeMu6NtFWvpsQYo5V/roQ4tD8jU6j0dQ3tLGk\n0WhywYPAW [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc19bb58850>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXmYFcXVuN9iFa8EFxxUcOGLS8SoUYMY12FVIjKyBBQX\nFBQDimAinzgmMqiACPLhLkRRiKLgikEEQZ2IMYpLfqgggigqKIyiBrmyU78/7jI9fXup3m7fO1Pv\n8/h4p7qW09VN1+lzTp0WUko0Go1Go9FoNNbUi1sAjUaj0Wg0mkJGK0sajUaj0Wg0DmhlSaPRaDQa\njcYBrSxpNBqNRqPROKCVJY1Go9FoNBoHtLKk0Wg0Go1G44BWljQaTegIIaYJITYIIT40lQ8VQnws\nhPhICDHeUH6jEGKVEGKFEKKLofwkIcSH6WN3GcobCyFmpcvfEkIcmp8z02g0dRGtLGk0mih4BDjH\nWCCEaA90B [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a2a7f050>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsvX2cXkV5N/4dEgLsPJQK2WMRYrI1UAg/g5QikaKGgihP\nKolCeRNKSagVBALaKCZEWUnCawXyRN5UJLyIVBC0jWABjTSi8vKAtbzGms0PqHA2BMXMUiQyzx9n\nZs41c2bmnHuzm3v33vl+xL3vc+blmjkn9/U93+s6M0xKiYSEhISEhISEBD+2abcBCQkJCQkJCQkj\nGYksJSQkJCQkJCREkMhSQkJCQkJCQkIEiSwlJCQkJCQkJESQyFJCQkJCQkJCQgSJLCUkJCQkJCQk\nRJDIUkJCwpCCMbY9Y+xnjLHHGWNPMsYuVMd3Zozdyxh7ljH2b4yxPyZ1PscYW8sYe5oxdjg5vj9j\n7Bfq3JXk+ [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a2adab90>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnX28FVW9/9/LCJH54VN6NEUevFmKkVdNEVM4ZoJ0EZLL\nw6EiFIoQU7yFJXCTowWJcE20pEuJiS/lHLxCGAmS6BEx5EHNFLS0wJCUo5APDClP6/fHzOwze/Y8\nrJk9e88++6z36+XLfWbWw3fWbGZ99vf7XWuElBKNRqPRaDQajT+HZG2ARqPRaDQaTSWjxZJGo9Fo\nNBpNCFosaTQajUaj0YSgxZJGo9FoNBpNCFosaTQajUaj0YSgxZJGo9FoNBpNCFosaTSa1BFCzBdC\n7BBCvOg6dq4QYr0Q4nkhxAYhxDmuc5OFEK8KIV4RQvRzHT9bCPGifW6O6/ihQohG+/gzQoiu5bs6\njUbT1tBiS [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc19bf9d190>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsvXmcHUW5///uAIFMiWyZw04yegHFSxBzES8iBMImMSTI\nBcKenBG8ol9A9ApBUJCfqKC43S+4ZUgEgiBwWZQdjahfkF2QyHbvJDcGoSeAYmrCkqR/f3RXd1V1\nVfeZkGQmZ+rzemHO6a7lqep2ns/5PE9VRUmSEBAQEBAQEBAQ4MaIwTYgICAgICAgIGAoI5ClgICA\ngICAgIAKBLIUEBAQEBAQEFCBQJYCAgICAgICAioQyFJAQEBAQEBAQAUCWQoICAgICAgIqEAgSwEB\nAasVURRtFEXRH6IoejyKovlRFH0tu755FEV3R1H0bBRFd0VRtKlWZ2YURc9FUfR0FEUHadfHR1H0\nZHbvu9r1D [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc19bbd2d90>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsvXmcXUWZPv5UiGC6WATSV1mT/DDMDGtIWEZHsBloBFGD\nCCSKECYBBsK+ClGWyKYsIsg2CAyEXUHZhQQ0aBggLEEMiyPf6WRCWE4HUEhdcAKp3x/nVJ236lTV\nObfT3bf7dj2fcXLvObW8VefQ73Of960qJqVERERERERERESEG8OabUBERERERERExEBGJEsRERER\nEREREQFEshQREREREREREUAkSxERERERERERAUSyFBERERERERERQCRLEREREREREREBRLIUERHR\nq2CMfZox9jRj7AXG2MuMsQuy6+sxxuYwxv6bMTabMfYZUud0xthfGGOvMsb2INcnMMb+lN27jFxf\ngzF2Z3b9K [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a01bafd0>"
+       ]
+      },
+      {
+       "metadata": {},
+       "output_type": "display_data",
+       "png": "iVBORw0KGgoAAAANSUhEUgAAAksAAAEACAYAAACjyjIwAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsvXu8JUV57/0tLsqs4ibs1YoOzN6vDkGIvioxoJJkUBFN\njgocDqhBkdscJXh7o0bBzxGNlyiexLvJiDdUVKJGPaDGWxA4hssQiApBjdl7HBDptQdBqTUgYL9/\n1KWfqq5ee4MzbNhTv88HZ63u6uqnqtv9/Nbveeop1TQNBQUFBQUFBQUFeWy31AYUFBQUFBQUFNyf\nUchSQUFBQUFBQcEEFLJUUFBQUFBQUDABhSwVFBQUFBQUFExAIUsFBQUFBQUFBRNQyFJBQUFBQUFB\nwQQUslRQULBFoZTaSSl1mVLqaqXUtUqpt7vjZyqlrldKXeX+e5a45vVKqZ8opa5TSj1DHD9QKfUD\nd+494viDl [...]
+       "text": [
+        "<matplotlib.figure.Figure at 0x7fc1a0311e90>"
+       ]
+      }
+     ],
+     "prompt_number": 15
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Author: Jason Chin, Dec. 5, 2014"
+     ]
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file
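Note on the notebook above: the aln_identity and aln_coverage columns printed by the last cell are derived from the tuples that get_aln_data() appends, i.e. (q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), aln_str_size, dist). A minimal sketch of the same arithmetic, assuming only that tuple layout; the helper name is hypothetical and not part of falcon_kit:

    # hypothetical helper mirroring the notebook's inline expressions
    def summarize_alignment(rec):
        q_id, _, s1, e1, q_len, s2, e2, t_len, aln_size, dist = rec
        identity = 1.0 - float(dist) / aln_size  # 1 minus differences per aligned column
        coverage = float(e1 - s1) / q_len        # fraction of the query spanned by the alignment
        return identity, coverage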
diff --git a/examples/fc_run_LG.cfg b/examples/fc_run_LG.cfg
new file mode 100755
index 0000000..0615ce9
--- /dev/null
+++ b/examples/fc_run_LG.cfg
@@ -0,0 +1,36 @@
+[General]
+# file listing the paths of the initial bas.h5 files (a file of file names)
+input_fofn = input.fofn
+#input_fofn = preads.fofn
+
+input_type = raw
+#input_type = preads
+
+# The length cutoff used for seed reads used for initial mapping
+length_cutoff = 10000
+
+# The length cutoff used for seed reads used for pre-assembly
+length_cutoff_pr = 7000
+
+
+jobqueue = bigmem
+sge_option_da = -pe smp 4 -q %(jobqueue)s
+sge_option_la = -pe smp 16 -q %(jobqueue)s
+sge_option_pda = -pe smp 8 -q %(jobqueue)s
+sge_option_pla = -pe smp 16 -q %(jobqueue)s
+sge_option_fc = -pe smp 24 -q %(jobqueue)s
+sge_option_cns = -pe smp 16 -q %(jobqueue)s
+
+pa_concurrent_jobs = 96
+cns_concurrent_jobs = 96
+ovlp_concurrent_jobs = 96
+
+pa_HPCdaligner_option =  -v -dal128 -t16 -e.70 -l1000 -s1000
+ovlp_HPCdaligner_option = -v -dal128 -t32 -h60 -e.96 -l500 -s1000
+
+pa_DBsplit_option = -x500 -s400
+ovlp_DBsplit_option = -x500 -s400
+
+falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 2 --max_n_read 200 --n_core 16
+
+overlap_filtering_setting = --max_diff 60 --max_cov 60 --min_cov 2 --n_core 24
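Note on the sge_option_* values above: the %(jobqueue)s tokens are standard Python ConfigParser interpolation, so each option expands against the jobqueue key defined in the same [General] section. A minimal sketch of that expansion (this only illustrates ConfigParser behaviour and assumes nothing about how fc_run.py itself loads the file):

    # illustration of ConfigParser interpolation of %(jobqueue)s
    import ConfigParser  # 'configparser' on Python 3

    cfg = ConfigParser.SafeConfigParser()
    cfg.read('fc_run_LG.cfg')
    print cfg.get('General', 'sge_option_da')   # -> '-pe smp 4 -q bigmem'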
diff --git a/examples/fc_run_arab.cfg b/examples/fc_run_arab.cfg
new file mode 100644
index 0000000..8f5bc80
--- /dev/null
+++ b/examples/fc_run_arab.cfg
@@ -0,0 +1,36 @@
+[General]
+# file listing the paths of the initial bas.h5 files (a file of file names)
+input_fofn = input.fofn
+#input_fofn = preads.fofn
+
+input_type = raw
+#input_type = preads
+
+# The length cutoff used for seed reads used for initial mapping
+length_cutoff = 15000
+
+# The length cutoff used for seed reads used for pre-assembly
+length_cutoff_pr = 15000
+
+
+jobqueue = your_queue
+sge_option_da = -pe smp 8 -q %(jobqueue)s
+sge_option_la = -pe smp 2 -q %(jobqueue)s
+# 6 seems too small... 8 might be better for Dmel
+sge_option_pda = -pe smp 8 -q %(jobqueue)s
+sge_option_pla = -pe smp 2 -q %(jobqueue)s
+sge_option_fc = -pe smp 24 -q %(jobqueue)s
+sge_option_cns = -pe smp 8 -q %(jobqueue)s
+
+pa_concurrent_jobs = 32
+ovlp_concurrent_jobs = 32
+
+pa_HPCdaligner_option =  -v -dal128 -t16 -e.70 -l1000 -s1000
+ovlp_HPCdaligner_option = -v -dal128 -t32 -h60 -e.96 -l500 -s1000
+
+pa_DBsplit_option = -x500 -s400
+ovlp_DBsplit_option = -x500 -s400
+
+falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 2 --max_n_read 200 --n_core 6
+
+overlap_filtering_setting = --max_diff 100 --max_cov 100 --min_cov 1 --bestn 10 --n_core 24
diff --git a/examples/fc_run_dmel.cfg b/examples/fc_run_dmel.cfg
new file mode 100644
index 0000000..2623812
--- /dev/null
+++ b/examples/fc_run_dmel.cfg
@@ -0,0 +1,36 @@
+[General]
+# file listing the paths of the initial bas.h5 files (a file of file names)
+input_fofn = input.fofn
+#input_fofn = preads.fofn
+
+input_type = raw
+#input_type = preads
+
+# The length cutoff used for seed reads used for initial mapping
+length_cutoff = 12000
+
+# The length cutoff used for seed reads used for pre-assembly
+length_cutoff_pr = 12000
+
+
+jobqueue = your_queue
+sge_option_da = -pe smp 8 -q %(jobqueue)s
+sge_option_la = -pe smp 2 -q %(jobqueue)s
+# 6 seems too small... 8 might be better for Dmel
+sge_option_pda = -pe smp 8 -q %(jobqueue)s
+sge_option_pla = -pe smp 2 -q %(jobqueue)s
+sge_option_fc = -pe smp 24 -q %(jobqueue)s
+sge_option_cns = -pe smp 8 -q %(jobqueue)s
+
+pa_concurrent_jobs = 32
+ovlp_concurrent_jobs = 32
+
+pa_HPCdaligner_option =  -v -dal128 -t16 -e.70 -l1000 -s1000
+ovlp_HPCdaligner_option = -v -dal128 -t32 -h60 -e.96 -l500 -s1000
+
+pa_DBsplit_option = -x500 -s400
+ovlp_DBsplit_option = -x500 -s400
+
+falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 2 --max_n_read 200 --n_core 6
+
+overlap_filtering_setting = --max_diff 30 --max_cov 60 --min_cov 5 --n_core 24
diff --git a/examples/fc_run_ecoli.cfg b/examples/fc_run_ecoli.cfg
new file mode 100644
index 0000000..dddc6d6
--- /dev/null
+++ b/examples/fc_run_ecoli.cfg
@@ -0,0 +1,35 @@
+[General]
+# file listing the paths of the initial bas.h5 files (a file of file names)
+input_fofn = input.fofn
+#input_fofn = preads.fofn
+
+input_type = raw
+#input_type = preads
+
+# The length cutoff used for seed reads used for initial mapping
+length_cutoff = 12000
+
+# The length cutoff used for seed reads used for pre-assembly
+length_cutoff_pr = 12000
+
+
+jobqueue = your_queue
+sge_option_da = -pe smp 8 -q %(jobqueue)s
+sge_option_la = -pe smp 2 -q %(jobqueue)s
+sge_option_pda = -pe smp 8 -q %(jobqueue)s
+sge_option_pla = -pe smp 2 -q %(jobqueue)s
+sge_option_fc = -pe smp 24 -q %(jobqueue)s
+sge_option_cns = -pe smp 8 -q %(jobqueue)s
+
+pa_concurrent_jobs = 32
+ovlp_concurrent_jobs = 32
+
+pa_HPCdaligner_option =  -v -dal4 -t16 -e.70 -l1000 -s1000
+ovlp_HPCdaligner_option = -v -dal4 -t32 -h60 -e.96 -l500 -s1000
+
+pa_DBsplit_option = -x500 -s50
+ovlp_DBsplit_option = -x500 -s50
+
+falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 2 --max_n_read 200 --n_core 6
+
+overlap_filtering_setting = --max_diff 100 --max_cov 100 --min_cov 20 --bestn 10 --n_core 24
diff --git a/examples/fc_run_ecoli_2.cfg b/examples/fc_run_ecoli_2.cfg
new file mode 100644
index 0000000..78cf397
--- /dev/null
+++ b/examples/fc_run_ecoli_2.cfg
@@ -0,0 +1,35 @@
+[General]
+# file listing the paths of the initial bas.h5 files (a file of file names)
+input_fofn = input.fofn
+#input_fofn = preads.fofn
+
+input_type = raw
+#input_type = preads
+
+# The length cutoff used for seed reads used for initial mapping
+length_cutoff = 12000
+
+# The length cutoff used for seed reads used for pre-assembly
+length_cutoff_pr = 12000
+
+
+jobqueue = your_queue
+sge_option_da = -pe smp 8 -q %(jobqueue)s
+sge_option_la = -pe smp 2 -q %(jobqueue)s
+sge_option_pda = -pe smp 8 -q %(jobqueue)s
+sge_option_pla = -pe smp 2 -q %(jobqueue)s
+sge_option_fc = -pe smp 24 -q %(jobqueue)s
+sge_option_cns = -pe smp 8 -q %(jobqueue)s
+
+pa_concurrent_jobs = 32
+ovlp_concurrent_jobs = 32
+
+pa_HPCdaligner_option =  -v -dal24 -t16 -e.70 -l1000 -s1000
+ovlp_HPCdaligner_option = -v -dal24 -t32 -h60 -e.96 -l500 -s1000
+
+pa_DBsplit_option = -x500 -s200
+ovlp_DBsplit_option = -x500 -s200
+
+falcon_sense_option = --output_multi --min_idt 0.70 --min_cov 4 --local_match_count_threshold 2 --max_n_read 200 --n_core 6
+
+overlap_filtering_setting = --max_diff 100 --max_cov 100 --min_cov 20 --bestn 10 --n_core 24
diff --git a/examples/install_note.sh b/examples/install_note.sh
deleted file mode 100644
index 785358d..0000000
--- a/examples/install_note.sh
+++ /dev/null
@@ -1,84 +0,0 @@
-# This is the script that will build everything needed to generate an assembly 
-# on top of the StarCluster Ubuntu AMI 
-HBAR_ROOT=/home
-mkdir -p $HBAR_ROOT/HBAR_ENV
-export HBAR_HOME=$HBAR_ROOT/HBAR_ENV/
-sudo apt-get install python-virtualenv
-virtualenv -p /usr/bin/python2.7 $HBAR_HOME
-cd $HBAR_HOME
-. bin/activate
-pip install numpy==1.6.2
-sudo apt-get install python-dev
-pip install numpy==1.6.2
-wget http://www.hdfgroup.org/ftp/HDF5/prev-releases/hdf5-1.8.9/src/hdf5-1.8.9.tar.gz
-tar zxvf hdf5-1.8.9.tar.gz
-cd hdf5-1.8.9
-./configure --prefix=$HBAR_HOME --enable-cxx
-make install
-cd ..
-wget http://h5py.googlecode.com/files/h5py-2.0.1.tar.gz
-tar zxvf h5py-2.0.1.tar.gz
-cd h5py-2.0.1
-python setup.py build --hdf5=$HBAR_HOME
-python setup.py install
-cd ..
-pip install git+https://github.com/PacificBiosciences/pbcore.git#pbcore
-sudo apt-get install git
-pip install git+https://github.com/PacificBiosciences/pbcore.git#pbcore
-pip install git+https://github.com/PacificBiosciences/pbdagcon.git#pbdagcon
-pip install git+https://github.com/PacificBiosciences/pbh5tools.git#pbh5tools
-pip install git+https://github.com/cschin/pypeFLOW.git#pypeflow
-pip install rdflib==3.4.0
-pip install git+https://github.com/PacificBiosciences/HBAR-DTK.git#hbar-dtk
-pip install git+https://github.com/PacificBiosciences/FALCON.git#falcon
-
-git clone https://github.com/PacificBiosciences/blasr.git
-cd blasr
-export HDF5INCLUDEDIR=/home/HBAR_ENV/include/
-export HDF5LIBDIR=/home/HBAR_ENV/lib/
-make
-cp alignment/bin/blasr ../bin/
-cp alignment/bin/sawriter ../bin/
-cp pbihdfutils/bin/samFilter  ../bin
-cp pbihdfutils/bin/samtoh5  ../bin
-cd ..
-
-
-wget http://downloads.sourceforge.net/project/boost/boost/1.47.0/boost_1_47_0.tar.gz
-tar zxvf boost_1_47_0.tar.gz
-cd boost_1_47_0/
-bash bootstrap.sh
-./b2 install -j 24 --prefix=$HBAR_ROOT/HBAR_ENV/boost
-cd ..
-
-sudo apt-get install libpcre3 libpcre3-dev
-wget http://downloads.sourceforge.net/project/swig/swig/swig-2.0.11/swig-2.0.11.tar.gz
-tar zxvf swig-2.0.11.tar.gz
-cd swig-2.0.11
-./configure --prefix=$HBAR_ROOT/HBAR_ENV
-make
-make install
-cd ..
-
-git clone https://github.com/PacificBiosciences/ConsensusCore.git
-cd ConsensusCore/
-python setup.py install --swig=$HBAR_ROOT/HBAR_ENV/bin/swig --boost=$HBAR_ROOT/HBAR_ENV/boost/include/
-cd ..
-
-pip install git+https://github.com/PacificBiosciences/GenomicConsensus.git#GenomicConsensus
-pip install git+https://github.com/PacificBiosciences/pbalign#pbaligno
-
-wget http://downloads.sourceforge.net/project/mummer/mummer/3.23/MUMmer3.23.tar.gz
-tar zxvf MUMmer3.23.tar.gz
-cd MUMmer3.23/
-make install
-cd ..
-export PATH=$PATH:/home/HBAR_ENV/MUMmer3.23
-
-
-wget http://downloads.sourceforge.net/project/samtools/samtools/0.1.19/samtools-0.1.19.tar.bz2
-tar jxvf samtools-0.1.19.tar.bz2
-cd samtools-0.1.19
-make
-cp samtools ../bin
-cd ..
diff --git a/examples/readme.md b/examples/readme.md
deleted file mode 100644
index 0c83259..0000000
--- a/examples/readme.md
+++ /dev/null
@@ -1,92 +0,0 @@
-Running an Amazon EC2 instance that has HBAR-DTK + Falcon pre-installed
-=======================================================================
-
-1. Install the latest version of StarCluster
-```
-    git clone https://github.com/jtriley/StarCluster.git
-    cd StarCluster
-    python setup.py install #better in virtualenv
-```
-The stable version of StarCluster does not support the `c3` instances.  For
-assembly, using one node of the `c3.8xlarge` instance type is more convenient. In my
-test, I can finish a single E. coli genome within about one hour. Namely, one can
-assemble a bacterial genome for less than 5 bucks.
-
-2. Use the `StarCluster.cfg` as the configuration file for `StarCluster` to
-setup a `falcon` cluster
-
-3. Start the cluster 
-```
-    starcluster start falcon
-```
-
-4. login to the cluster
-```
-    starcluster sshmaster falcon
-```
-
-5. set up the SGE
-```
-    cd /home/sge_setup
-    bash sge_setup.sh
-```
-
-6. There is already an existing assembly result in `/home/Ecoli_ASM/`. Here I
-show how to reproduce it. First, create a new assembly working directory in
-`/mnt`, set it up, and run HBAR_WF3.py to get preassembled reads
-```
-    cd /mnt
-    mkdir test_asm
-    cd test_asm
-    cp /home/Ecoli_ASM/HBAR.cfg .
-    cp /home/Ecoli_ASM/input.fofn .
-    source /home/HBAR_ENV/bin/activate
-    HBAR_WF3.py HBAR.cfg
-```
-
-7. The next part of the assembly does not start automatically yet. The detailed
-steps are in the `run_asm.sh` script, which one can use to get contigs and
-consensus. 
-```
-    cp /home/Ecoli_ASM/run_asm.sh .
-    bash run_asm.sh
-```
-The consensus result is in `/mnt/consensus.fasta`. Since we did not do any
-consensus after the unitig step, one more run of quiver consensus may further
-improve the final assembly accuracy.
-
-8. A yeast (S. cerevisiae W303) data set is also included in the AMI. One can try
-to assemble it with a larger cluster setting.
-
-
-9. Here is the result of a timing test:
-```
-    (HBAR_ENV)root at master:/mnt/test_asm# time HBAR_WF3.py HBAR.cfg
-    
-    Your job 1 ("mapping_task_q00002_t000011416727c") has been submitted
-    Your job 2 ("qf_task_q00002a3e75f4c") has been submitted
-    Your job 3 ("mapping_task_q00003_t00001b667b504") has been submitted
-    Your job 4 ("qf_task_q000036974ef22") has been submitted
-    Your job 5 ("mapping_task_q00001_t000017bf52d9c") has been submitted
-    Your job 6 ("qf_task_q000010b31d960") has been submitted
-    Your job 7 ("pa_task_000001ee38aee") has been submitted
-    
-    
-    
-    real    26m51.030s
-    user    1m10.152s
-    sys     0m11.993s
-    
-    (HBAR_ENV)root at master:/mnt/test_asm# time bash run_asm.sh
-    [WARNING] This .cmp.h5 file lacks some of the QV data tracks that are required for optimal performance of the Quiver algorithm.  For optimal results use the ResequencingQVs workflow in SMRTPortal with bas.h5 files from an instrument using software version 1.3.1 or later.
-
-    real    13m2.945s
-    user    244m44.322s
-    sys     2m7.032s
-```
-For better results, one might run `quiver` twice. It is possible to get the whole assembly within one hour (~ 26 + 13 * 2 = 52 minutes). With the overhead of setting up, file transfer, etc., one can in principle assemble a bacterial genome on EC2 for less than 5 bucks.
-
-
---
-Jason Chin, 01/18/2014
-
diff --git a/examples/run_asm.sh b/examples/run_asm.sh
deleted file mode 100644
index 35f7323..0000000
--- a/examples/run_asm.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-# This script does the assembly and generates the quiver consensus after one gets preassembled reads
-# Modifications will be needed for larger genomes and different computational cluster setups
-
-# It should be run within the assembly working directory
-
-mkdir 3-asm-falcon/
-cd 3-asm-falcon/
-cat ../2-preads-falcon/pread_*.fa > preads.fa
-falcon_overlap.py  --min_len 8000 --n_core 24 --d_core 1 preads.fa > preads.ovlp
-falcon_asm.py preads.ovlp preads.fa
-falcon_fixasm.py
-
-export PATH=$PATH:/home/HBAR_ENV/MUMmer3.23
-nucmer -maxmatch all_tigs.fa all_tigs.fa -p all_tigs_self >& /dev/null
-show-coords -o -H -T all_tigs_self.delta | grep CONTAINS | awk '$7>96' | awk '{print $9}' | sort -u > all_tigs_duplicated_ids
-remove_dup_ctg.py
-cat p-tigs_nodup.fa a-tigs_nodup.fa > pa-tigs_nodup.fa
-cat p-tigs_nodup.fa a-tigs_nodup.fa > /mnt/pa-tigs_nodup.fa
-
-find /home/data/Ecoli/ -name "*.bax.h5" > /mnt/h5_input.fofn
-cd /mnt
-pbalign.py --forQuiver --nproc 32  --tmpDir /mnt --maxHits 1  h5_input.fofn pa-tigs_nodup.fa output.cmp.h5 
-samtools faidx pa-tigs_nodup.fa
-quiver -j 24 output.cmp.h5 -r pa-tigs_nodup.fa -o variants.gff -o consensus.fasta
diff --git a/examples/run_ecoli_test.sh b/examples/run_ecoli_test.sh
new file mode 100644
index 0000000..17c723d
--- /dev/null
+++ b/examples/run_ecoli_test.sh
@@ -0,0 +1,11 @@
+mkdir ecoli_test/
+cd ecoli_test/
+mkdir data
+cd data
+wget https://www.dropbox.com/s/tb78i5i3nrvm6rg/m140913_050931_42139_c100713652400000001823152404301535_s1_p0.1.subreads.fasta
+wget https://www.dropbox.com/s/v6wwpn40gedj470/m140913_050931_42139_c100713652400000001823152404301535_s1_p0.2.subreads.fasta
+wget https://www.dropbox.com/s/j61j2cvdxn4dx4g/m140913_050931_42139_c100713652400000001823152404301535_s1_p0.3.subreads.fasta
+cd ..
+find $PWD/data -name "*.fasta" > input.fofn
+cp ../FALCON/examples/fc_run_ecoli_2.cfg  .
+fc_run.py fc_run_ecoli_2.cfg
diff --git a/setup.py b/setup.py
index 9784b2b..3dd7bf2 100755
--- a/setup.py
+++ b/setup.py
@@ -1,36 +1,56 @@
 #!/usr/bin/env python
 
-from setuptools import setup
+from setuptools import setup, Extension
 
-from distutils.core import Extension
+import glob
+
+#install_requires=[ "pbcore >= 0.6.3", "networkx >= 1.7" ]
+install_requires=[ "networkx >= 1.7" ]
+
+scripts = glob.glob("src/py_scripts/*.py")
 
 setup(name='falcon_kit',
-      version='0.1.3',
+      version='0.4.0',
       description='a small toolkit for DNA sequence alignment, overlapping, and assembly',
       author='Jason Chin',
       author_email='jchin at pacificbiosciences.com',
-      packages=['falcon_kit'],
+      packages=['falcon_kit',
+          'falcon_kit.mains',
+          'falcon_kit.util',
+          ],
       package_dir={'falcon_kit':'src/py/'},
-      ext_modules=[Extension('falcon_kit.DW_align', ['src/c/DW_banded.c'], 
-                   extra_link_args=["-fPIC",  "-O3"]),
-                   Extension('falcon_kit.kmer_lookup', ['src/c/kmer_lookup.c'],
-                   extra_link_args=["-fPIC",  "-O3"]),
-                   Extension('falcon_kit.falcon', ['src/c/DW_banded.c', 'src/c/kmer_lookup.c', 'src/c/falcon.c'],
-                   extra_link_args=["-fPIC",  "-O3"]),
-                   ],
-      scripts = ["src/py_scripts/falcon_asm.py", 
-                 "src/py_scripts/falcon_asm_dev.py",
-                 "src/py_scripts/falcon_overlap.py",
-                 "src/py_scripts/falcon_overlap2.py",
-                 "src/py_scripts/falcon_qrm.py",
-                 "src/py_scripts/falcon_fixasm.py",
-                 "src/py_scripts/falcon_dedup.py",
-                 "src/py_scripts/falcon_ucns_data.py",
-                 "src/py_scripts/falcon_utgcns.py",
-                 "src/py_scripts/falcon_sense.py",
-                 "src/py_scripts/get_rdata.py",
-                 "src/py_scripts/remove_dup_ctg.py"],
+      ext_modules=[
+                   Extension('falcon_kit.ext_falcon', ['src/c/ext_falcon.c', 'src/c/DW_banded.c', 'src/c/kmer_lookup.c', 'src/c/falcon.c'],
+                    extra_link_args=[],
+                    extra_compile_args=['-fPIC', '-O3', '-fno-omit-frame-pointer'],
+                    # '-fno-omit-frame-pointer' can help with gperftools.
+                    #libraries=['profiler'],
+                    #include_dirs=['/home/cdunn/local/include'],
+                    #library_dirs=['/home/cdunn/local/lib'],
+                    #language="c++", # c for now
+                    #export_symbols=['generate_consensus'], # for windows?
+                   ),
+                  ],
+      entry_points = {'console_scripts': [
+          'falcon-task=falcon_kit.mains.tasks:main',
+          'fc_actg_coordinate=falcon_kit.mains.actg_coordinate:main',
+          'fc_consensus=falcon_kit.mains.consensus:main',
+          'fc_contig_annotate=falcon_kit.mains.contig_annotate:main',
+          'fc_ctg_link_analysis=falcon_kit.mains.ctg_link_analysis:main',
+          'fc_dedup_a_tigs=falcon_kit.mains.dedup_a_tigs:main',
+          'fc_graph_to_contig=falcon_kit.mains.graph_to_contig:main',
+          'fc_graph_to_utgs=falcon_kit.mains.graph_to_utgs:main',
+          'fc_ovlp_filter=falcon_kit.mains.ovlp_filter:main',
+          'fc_ovlp_stats=falcon_kit.mains.ovlp_stats:main',
+          'fc_ovlp_to_graph=falcon_kit.mains.ovlp_to_graph:main',
+          'fc_run=falcon_kit.mains.run:main',
+          ],
+      },
+      extras_require = {
+          'falcon-task':  ['falcon_kit'],
+      },
+      scripts = scripts,
       zip_safe = False,
-      install_requires=[ "pbcore >= 0.6.3", "networkx >= 1.7" ]
+      setup_requires=install_requires,
+      install_requires=install_requires
      )
-
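Note on the entry_points block added above: setuptools turns each 'name = package.module:function' item under console_scripts into an executable wrapper on the PATH, so after installation `fc_run some.cfg` dispatches to falcon_kit.mains.run:main (the examples/run_ecoli_test.sh script still calls the older fc_run.py from the glob'd scripts list). A rough, illustrative sketch of what such a generated wrapper does (not the literal script setuptools writes):

    #!/usr/bin/env python
    # rough equivalent of the auto-generated 'fc_run' console script
    import sys
    from falcon_kit.mains.run import main

    if __name__ == '__main__':
        sys.exit(main())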
diff --git a/src/c/DW_banded.c b/src/c/DW_banded.c
index 44a6168..db8d1e0 100755
--- a/src/c/DW_banded.c
+++ b/src/c/DW_banded.c
@@ -93,7 +93,7 @@ d_path_data2 * get_dpath_idx( seq_coor_t d, seq_coor_t k, unsigned long max_idx,
 void print_d_path(  d_path_data2 * base, unsigned long max_idx) {
     unsigned long idx;
     for (idx = 0; idx < max_idx; idx++){
-        printf("dp %ld %ld %ld %ld %ld %ld %ld %ld\n",idx, (base+idx)->d, (base+idx)->k, (base+idx)->x1, (base+idx)->y1, (base+idx)->x2, (base+idx)->y2, (base+idx)->pre_k);
+        printf("dp %ld %d %d %d %d %d %d %d\n",idx, (base+idx)->d, (base+idx)->k, (base+idx)->x1, (base+idx)->y1, (base+idx)->x2, (base+idx)->y2, (base+idx)->pre_k);
     }
 }
 
@@ -169,7 +169,7 @@ alignment * align(char * query_seq, seq_coor_t q_len,
  
         for (k = min_k; k <= max_k;  k += 2) {
 
-            if ( k == min_k || k != max_k && V[ k - 1 + k_offset ] < V[ k + 1 + k_offset] ) {
+            if ( (k == min_k) || ((k != max_k) && (V[ k - 1 + k_offset ] < V[ k + 1 + k_offset])) ) {
                 pre_k = k + 1;
                 x = V[ k + 1 + k_offset];
             } else {
diff --git a/src/c/Makefile b/src/c/Makefile
index 607dcde..3f13a2f 100755
--- a/src/c/Makefile
+++ b/src/c/Makefile
@@ -14,7 +14,7 @@ falcon.so: falcon.c common.h DW_banded.c kmer_lookup.c
 #	gcc DW_banded_2.c kmer_lookup.c falcon.c -O3 -shared -fPIC -o falcon2.so 
 
 clean:
-	rm falcon *.so
+	rm  *.so
 
 all: DW_align.so kmer_lookup.so falcon.so
 
diff --git a/src/c/common.h b/src/c/common.h
index e694c3b..95c38f6 100755
--- a/src/c/common.h
+++ b/src/c/common.h
@@ -54,7 +54,7 @@
  #################################################################################$$
  */
 
-typedef long int seq_coor_t; 
+typedef int seq_coor_t; 
 
 typedef struct {    
     seq_coor_t aln_str_size ;
@@ -122,7 +122,7 @@ typedef struct {
 
 typedef struct {
     char * sequence;
-    unsigned int * eff_cov;
+    int * eqv;
 } consensus_data;
 
 kmer_lookup * allocate_kmer_lookup (seq_coor_t);
@@ -151,6 +151,7 @@ kmer_match * find_kmer_pos_for_seq( char *,
                                     seq_addr_array, 
                                     kmer_lookup * );
 
+void free_kmer_match( kmer_match * ptr);
 void free_kmer_lookup(kmer_lookup * );
 
 
diff --git a/src/c/ext_falcon.c b/src/c/ext_falcon.c
new file mode 100644
index 0000000..6439cb4
--- /dev/null
+++ b/src/c/ext_falcon.c
@@ -0,0 +1,13 @@
+#include "Python.h"
+static PyMethodDef SpamMethods[] = {
+    {NULL, NULL, 0, NULL}        /* Sentinel */
+};
+PyMODINIT_FUNC
+initext_falcon(void)
+{
+    PyObject *m;
+
+    m = Py_InitModule("falcon_kit.ext_falcon", SpamMethods);
+    if (m == NULL)
+        return;
+}
diff --git a/src/c/falcon.c b/src/c/falcon.c
index ba7eb9c..e39e071 100755
--- a/src/c/falcon.c
+++ b/src/c/falcon.c
@@ -58,13 +58,17 @@
 #include <limits.h>
 #include <string.h>
 #include <assert.h>
+#include <stdint.h>
 #include "common.h"
 
 typedef struct {
     seq_coor_t t_pos;
-    unsigned int delta;
+    uint8_t delta;
     char q_base;
-    unsigned int q_id;
+    seq_coor_t p_t_pos;   // the tag position of the previous base
+    uint8_t p_delta; // the tag delta of the previous base
+    char p_q_base;        // the previous base
+    unsigned q_id;
 } align_tag_t;
 
 typedef struct {
@@ -74,42 +78,51 @@ typedef struct {
 
 
 typedef struct {
-    seq_coor_t len;
-    char * name;
-    char * seq;
+    uint16_t size;
+    uint16_t n_link;
+    seq_coor_t * p_t_pos;   // the tag position of the previous base
+    uint8_t * p_delta; // the tag delta of the previous base
+    char * p_q_base;        // the previous base
+    uint16_t * link_count;
+    uint16_t count;
+    seq_coor_t best_p_t_pos;
+    uint8_t best_p_delta;
+    uint8_t best_p_q_base; // encoded base
+    double score;
+} align_tag_col_t;
+
+typedef struct {
+    align_tag_col_t * base;
+} msa_base_group_t;
 
-} consensusn_seq_t;
+typedef struct {
+    uint8_t size;
+    uint8_t max_delta;
+    msa_base_group_t * delta;
+} msa_delta_group_t;
 
+typedef msa_delta_group_t * msa_pos_t;
 
 align_tags_t * get_align_tags( char * aln_q_seq, 
                                char * aln_t_seq, 
                                seq_coor_t aln_seq_len,
                                aln_range * range,
-                               unsigned long q_id,
-                               unsigned long local_match_count_window,
-                               unsigned long local_match_count_threshold,
+                               unsigned q_id,
                                seq_coor_t t_offset) {
-
-#define LONGEST_INDEL_ALLOWED 6 
-
-    char q_base;
-    char t_base;
+    char p_q_base;
     align_tags_t * tags;
-    seq_coor_t i, j, jj, k;
-    seq_coor_t match_count;
+    seq_coor_t i, j, jj, k, p_j, p_jj;
 
     tags = calloc( 1, sizeof(align_tags_t) );
     tags->len = aln_seq_len; 
     tags->align_tags = calloc( aln_seq_len + 1, sizeof(align_tag_t) );
     i = range->s1 - 1;
     j = range->s2 - 1;
-    match_count = 0;
     jj = 0;
-    for (k = 0; k< local_match_count_window && k < aln_seq_len; k++) {
-        if (aln_q_seq[k]  == aln_t_seq[k] ) {
-            match_count ++;
-        }
-    }
+    p_j = -1;
+    p_jj = 0;
+    p_q_base = '.';
+
     for (k = 0; k < aln_seq_len; k++) {
         if (aln_q_seq[k] != '-') {
             i ++;
@@ -119,41 +132,29 @@ align_tags_t * get_align_tags( char * aln_q_seq,
             j ++;
             jj = 0;
         }
+        //printf("t %d %d %d %c %c\n", q_id, j, jj, aln_t_seq[k], aln_q_seq[k]);
        
-        if (local_match_count_threshold > 0) {
-            if (k < aln_seq_len - local_match_count_window && aln_q_seq[k + local_match_count_window]  == aln_t_seq[k + local_match_count_window] ) {
-                match_count ++;
-            }
-
-            if (k > local_match_count_window && aln_q_seq[k - local_match_count_window] == aln_t_seq[k - local_match_count_window] ) {
-                match_count --;
-            }
-
-            if (match_count < 0) {
-                match_count = 0;
-            }
-        }
        
-        if ( j + t_offset >= 0) {
+        if ( j + t_offset >= 0 && jj < UINT8_MAX && p_jj < UINT8_MAX) {
             (tags->align_tags[k]).t_pos = j + t_offset;
             (tags->align_tags[k]).delta = jj;
-            if (local_match_count_threshold > 0 && jj == 0 && match_count < local_match_count_threshold) {
-                (tags->align_tags[k]).q_base = '*';
-            } else {
-                (tags->align_tags[k]).q_base = aln_q_seq[k];
-            }
+            (tags->align_tags[k]).p_t_pos = p_j + t_offset;
+            (tags->align_tags[k]).p_delta = p_jj;
+            (tags->align_tags[k]).p_q_base = p_q_base;
+            (tags->align_tags[k]).q_base = aln_q_seq[k];
             (tags->align_tags[k]).q_id = q_id;
+            
+            p_j = j;
+            p_jj = jj;
+            p_q_base = aln_q_seq[k];
         }
-        //if (jj > LONGEST_INDEL_ALLOWED) {
-        //   break;
-        //}
     }
     // sentinel at the end
     //k = aln_seq_len;
     tags->len = k; 
-    (tags->align_tags[k]).t_pos = -1;
-    (tags->align_tags[k]).delta = -1;
-    (tags->align_tags[k]).q_base = ' ';
+    (tags->align_tags[k]).t_pos = UINT_MAX;
+    (tags->align_tags[k]).delta = UINT8_MAX;
+    (tags->align_tags[k]).q_base = '.';
     (tags->align_tags[k]).q_id = UINT_MAX;
     return tags;
 }
@@ -164,174 +165,394 @@ void free_align_tags( align_tags_t * tags) {
 }
 
 
-int compare_tags(const void * a, const void * b)
-{
-    const align_tag_t * arg1 = a;
-    const align_tag_t * arg2 = b;
-    if (arg1->delta - arg2->delta == 0) {
-        return  arg1->q_base - arg2->q_base;
-    } else {
-        return arg1->delta - arg2->delta;
-    }
+void allocate_aln_col( align_tag_col_t * col) {
+    col->p_t_pos = ( seq_coor_t * ) calloc(col->size, sizeof( seq_coor_t ));
+    col->p_delta = ( uint8_t * ) calloc(col->size, sizeof( uint8_t ));
+    col->p_q_base = ( char * )calloc(col->size, sizeof( char ));
+    col->link_count = ( uint16_t * ) calloc(col->size, sizeof( uint16_t ));
 }
 
-consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs, unsigned long n_tag_seqs, unsigned t_len, unsigned min_cov ) {
+void realloc_aln_col( align_tag_col_t * col ) {
+    col->p_t_pos = (seq_coor_t *) realloc( col->p_t_pos, (col->size) * sizeof( seq_coor_t ));
+    col->p_delta = ( uint8_t *)  realloc( col->p_delta, (col->size) * sizeof( uint8_t ));
+    col->p_q_base = (char *) realloc( col->p_q_base, (col->size) * sizeof( char ));
+    col->link_count = ( uint16_t *) realloc( col->link_count, (col->size) * sizeof( uint16_t ));
+}
 
-    seq_coor_t i, j, t_pos, tmp_pos;
-    unsigned int * coverage;
-    unsigned int * local_nbase;
-    unsigned int * aux_index;
-
-    unsigned int cur_delta;
-    unsigned int counter[5] = {0, 0, 0, 0, 0};
-    unsigned int k;
-    unsigned int max_count;
-    unsigned int max_count_index;
-    seq_coor_t consensus_index;
-    seq_coor_t c_start, c_end, max_start;
-    unsigned int cov_score, max_cov_score;
-    consensus_data * consensus;
-    //char * consensus;
+void free_aln_col( align_tag_col_t * col) {
+    free(col->p_t_pos);
+    free(col->p_delta);
+    free(col->p_q_base);
+    free(col->link_count);
+}
 
 
+void allocate_delta_group( msa_delta_group_t * g) {
+    int i,j;
+    g->max_delta = 0;
+    g->delta = (msa_base_group_t *) calloc( g->size, sizeof(msa_base_group_t));
+    for (i = 0; i< g->size; i++) {
+        g->delta[i].base = ( align_tag_col_t * ) calloc( 5, sizeof(align_tag_col_t ) );
+        for (j = 0; j < 5; j++ ) {
+             g->delta[i].base[j].size = 8;
+             allocate_aln_col(&(g->delta[i].base[j]));
+        }
+    }
+}
 
-    align_tag_t ** tag_seq_index;
+void realloc_delta_group( msa_delta_group_t * g, uint16_t new_size ) {
+    int i, j, bs, es;
+    bs = g->size;
+    es = new_size;
+    g->delta = (msa_base_group_t *) realloc(g->delta, new_size * sizeof(msa_base_group_t));
+    for (i=bs; i < es; i++) {
+        g->delta[i].base = ( align_tag_col_t *) calloc( 5, sizeof(align_tag_col_t ) );
+        for (j = 0; j < 5; j++ ) {
+             g->delta[i].base[j].size = 8;
+             allocate_aln_col(&(g->delta[i].base[j]));
+        }
+    }
+    g->size = new_size;
+}
 
-    coverage = calloc( t_len, sizeof(unsigned int) );
-    local_nbase = calloc( t_len, sizeof(unsigned int) );
-    aux_index = calloc( t_len, sizeof(unsigned int) );
-    tag_seq_index = calloc( t_len, sizeof(align_tag_t *) );
+void free_delta_group( msa_delta_group_t * g) {
+    // free every column allocated for this delta group
+    int i, j;
+    for (i = 0; i < g->size; i++) {
+        for (j = 0; j < 5; j++) {
+            free_aln_col( &(g->delta[i].base[j]) );
+        }
+        free(g->delta[i].base);
+    }
+    free(g->delta);
+}
 
-    for (i = 0; i < n_tag_seqs; i++) {
-        for (j = 0; j < tag_seqs[i]->len; j++) {
-            if (tag_seqs[i]->align_tags[j].delta == 0 && tag_seqs[i]->align_tags[j].q_base != '*') {
-                t_pos = tag_seqs[i]->align_tags[j].t_pos;
-                coverage[ t_pos ] ++;
+void update_col( align_tag_col_t * col, seq_coor_t p_t_pos, uint8_t p_delta, char p_q_base) {
+    int updated = 0;
+    int kk;
+    col->count += 1;
+    for (kk = 0; kk < col->n_link; kk++) {
+        if ( p_t_pos == col->p_t_pos[kk] &&
+             p_delta == col->p_delta[kk] &&
+             p_q_base == col->p_q_base[kk] ) {
+            col->link_count[kk] ++;
+            updated = 1;
+            break;
+        }
+    }
+    if (updated == 0) {
+        if (col->n_link + 1 > col->size) {
+            if (col->size < (UINT16_MAX >> 1)-1) {  // grow geometrically while below half of UINT16_MAX
+                col->size *= 2;
+            } else {
+                col->size += 256;
             }
-            local_nbase[ tag_seqs[i]->align_tags[j].t_pos ] ++;
+            assert( col->size < UINT16_MAX-1 );
+            realloc_aln_col(col);
         }
+        kk = col->n_link;
+
+        col->p_t_pos[kk] = p_t_pos;
+        col->p_delta[kk] = p_delta;
+        col->p_q_base[kk] = p_q_base;
+        col->link_count[kk] = 1;
+        col->n_link++;
     }
+}
 
 
-    for (i = 0; i < t_len; i++) {
-        tag_seq_index[i] = calloc( local_nbase[i] + 1, sizeof(align_tag_t) );
+msa_pos_t * get_msa_working_sapce(unsigned int max_t_len) {
+    msa_pos_t * msa_array;
+    unsigned int i;
+    msa_array = calloc(max_t_len, sizeof(msa_pos_t *));
+    for (i = 0; i < max_t_len; i++) {
+        msa_array[i] = calloc(1, sizeof(msa_delta_group_t));
+        msa_array[i]->size = 8;
+        allocate_delta_group(msa_array[i]);
     }
+    return msa_array;
+}
 
-    for (i = 0; i < n_tag_seqs; i++) {
-        for (j = 0; j < tag_seqs[i]->len; j++) {
-            t_pos = tag_seqs[i]->align_tags[j].t_pos;
-            tag_seq_index[ t_pos ][ aux_index[ t_pos ] ] = tag_seqs[i]->align_tags[j];
-            aux_index[ t_pos ] ++;
+void clean_msa_working_space( msa_pos_t * msa_array, unsigned int max_t_len) {
+    unsigned int i,j,k;
+    align_tag_col_t * col;
+    for (i = 0; i < max_t_len; i++) {
+        for (j =0; j < msa_array[i]->max_delta + 1; j++) {
+            for (k = 0; k < 5; k++ ) {
+                col = msa_array[i]->delta[j].base + k;
+                /*
+                for (c =0; c < col->size; c++) {
+                    col->p_t_pos[c] = 0;
+                    col->p_delta[c] = 0;
+                    col->p_q_base[c] = 0;
+                    col->link_count[c] =0;
+                }
+                */
+                col->n_link = 0;
+                col->count = 0;
+                col->best_p_t_pos = 0;
+                col->best_p_delta = 0;
+                col->best_p_q_base = 0;
+                col->score = 0;
+            }
         }
+        msa_array[i]->max_delta = 0;
     }
+}
 
+#define STATIC_ALLOCATE
+//#undef STATIC_ALLOCATE
 
-    consensus_index = 0;
+consensus_data * get_cns_from_align_tags( align_tags_t ** tag_seqs, 
+                                          unsigned n_tag_seqs, 
+                                          unsigned t_len, 
+                                          unsigned min_cov ) {
 
-    
-    consensus = calloc( 1, sizeof(consensus_data) );
-    consensus->sequence = calloc( t_len * 2 + 1, sizeof(char) );
-    consensus->eff_cov = calloc( t_len * 2 + 1, sizeof(unsigned int) );
+    seq_coor_t i, j;
+    seq_coor_t t_pos = 0;
+    unsigned int * coverage;
+    unsigned int * local_nbase;
+
+    consensus_data * consensus;
+    //char * consensus;
+    align_tag_t * c_tag;
+    static msa_pos_t * msa_array = NULL;
+
+    coverage = calloc( t_len, sizeof(unsigned int) );
+    local_nbase = calloc( t_len, sizeof(unsigned int) );
+
+#ifndef STATIC_ALLOCATE
+
+    msa_array = calloc(t_len, sizeof(msa_pos_t *));
 
     for (i = 0; i < t_len; i++) {
-        qsort(tag_seq_index[i], local_nbase[i], sizeof(align_tag_t), compare_tags);
-        cur_delta = 0;
-        for (j = 0; j <= local_nbase[i]; j++) {
-            max_count = 0;
-            max_count_index = 0;
-            if (j == local_nbase[i] || tag_seq_index[i][j].delta != cur_delta) {
-                for (k = 0; k < 5; k ++) {
-                    if (counter[k] > max_count) {
-                        max_count = counter[k];
-                        max_count_index = k;
-                    }
-                    //reset counter
-                    counter[k] = 0;
-                    cur_delta = tag_seq_index[i][j].delta;
+        msa_array[i] = calloc(1, sizeof(msa_delta_group_t));
+        msa_array[i]->size = 8;
+        allocate_delta_group(msa_array[i]);
+    }
+
+#endif    
+
+#ifdef STATIC_ALLOCATE
+
+    if ( msa_array == NULL) {
+        msa_array = get_msa_working_sapce( 100000 );
+    } 
+
+    assert(t_len < 100000);
+
+#endif    
+
+    
+    // loop through every alignment
+    //printf("XX %d\n", n_tag_seqs);
+    for (i = 0; i < n_tag_seqs; i++) {
+
+        // for each alignment position, insert the alignment tag to msa_array
+        for (j = 0; j < tag_seqs[i]->len; j++) {
+            c_tag = tag_seqs[i]->align_tags + j;
+            unsigned int delta;
+            delta = c_tag->delta;
+            if (delta == 0) {
+                t_pos = c_tag->t_pos;
+                coverage[ t_pos ] ++;
+            }
+            // Assume t_pos was set on earlier iteration.
+            // (Otherwise, use its initial value, which might be an error. ~cd)
+            if (delta > msa_array[t_pos]->max_delta) {
+                msa_array[t_pos]->max_delta = delta;
+                if (msa_array[t_pos]->max_delta + 4 > msa_array[t_pos]->size ) {
+                    realloc_delta_group(msa_array[t_pos], msa_array[t_pos]->max_delta + 8);
                 }
-                if (max_count > coverage[i] * 0.5) { 
-                    switch (max_count_index) {
-                        case 0:
-                            if (coverage[i] < min_cov + 1) {
-                                consensus->sequence[consensus_index] = 'a';
-                            } else {
-                                consensus->sequence[consensus_index] = 'A';
-                            }
-                            consensus->eff_cov[consensus_index] = coverage[i] ;
-                            consensus_index ++;
-                            break;
-                        case 1:
-                            if (coverage[i] < min_cov + 1) {
-                                consensus->sequence[consensus_index] = 'c';
-                            } else {
-                                consensus->sequence[consensus_index] = 'C';
+            }
+            
+            unsigned int base = -1;
+            switch (c_tag->q_base) {
+                case 'A': base = 0; break;
+                case 'C': base = 1; break;
+                case 'G': base = 2; break;
+                case 'T': base = 3; break;
+                case '-': base = 4; break;
+            }
+            // Note: On bad input, base may be -1.
+            update_col( &(msa_array[t_pos]->delta[delta].base[base]), c_tag->p_t_pos, c_tag->p_delta, c_tag->p_q_base);
+            local_nbase[ t_pos ] ++;
+        }
+    }
+
+    // propagate the score through the alignment links and set up the backtracking information
+    align_tag_col_t * g_best_aln_col = 0;
+    unsigned int g_best_ck = 0;
+    seq_coor_t g_best_t_pos = 0;
+    {
+        int kk; 
+        int ck;
+        // char base;
+        int best_i;
+        int best_j;
+        int best_b;
+        int best_ck = -1;
+        double score;
+        double best_score;
+        double g_best_score;
+        // char best_mark;
+
+        align_tag_col_t * aln_col;
+        
+        g_best_score = -1;
+
+        for (i = 0; i < t_len; i++) {  //loop through every template base
+            //printf("max delta: %d %d\n", i, msa_array[i]->max_delta);
+            for (j = 0; j <= msa_array[i]->max_delta; j++) { // loop through every delta position
+                for (kk = 0; kk < 5; kk++) {  // loop through the different bases at the same delta position
+                    /*
+                    switch (kk) {
+                        case 0: base = 'A'; break;
+                        case 1: base = 'C'; break;
+                        case 2: base = 'G'; break;
+                        case 3: base = 'T'; break;
+                        case 4: base = '-'; break;
+                    }
+                    */
+                    aln_col = msa_array[i]->delta[j].base + kk;
+                    if (aln_col->count >= 0) {
+                        best_score = -1;
+                        best_i = -1;
+                        best_j = -1;
+                        best_b = -1;
+
+                        for (ck = 0; ck < aln_col->n_link; ck++) { // loop through the different links to the previous column
+                            int pi;
+                            int pj;
+                            int pkk;
+                            pi = aln_col->p_t_pos[ck];
+                            pj = aln_col->p_delta[ck];
+                            switch (aln_col->p_q_base[ck]) {
+                                case 'A': pkk = 0; break;
+                                case 'C': pkk = 1; break;
+                                case 'G': pkk = 2; break;
+                                case 'T': pkk = 3; break;
+                                case '-': pkk = 4; break;
+                                default: pkk = 4;
                             }
-                            consensus->eff_cov[consensus_index] = coverage[i] ;
-                            consensus_index ++;
-                            break;
-                        case 2:
-                            if (coverage[i] < min_cov + 1) {
-                                consensus->sequence[consensus_index] = 'g';
+
+                            if (aln_col->p_t_pos[ck] == -1) {
+                                score =  (double) aln_col->link_count[ck] - (double) coverage[i] * 0.5;
                             } else {
-                                consensus->sequence[consensus_index] = 'G';
+                                score = msa_array[pi]->delta[pj].base[pkk].score + 
+                                        (double) aln_col->link_count[ck] - (double) coverage[i] * 0.5;
                             }
-                            consensus->eff_cov[consensus_index] = coverage[i] ;
-                            consensus_index ++;
-                            break;
-                        case 3:
-                            if (coverage[i] < min_cov + 1) {
-                                consensus->sequence[consensus_index] = 't';
-                            } else {
-                                consensus->sequence[consensus_index] = 'T';
+                            // best_mark = ' ';
+                            if (score > best_score) {
+                                best_score = score;
+                                aln_col->best_p_t_pos = best_i = pi;
+                                aln_col->best_p_delta = best_j = pj;
+                                aln_col->best_p_q_base = best_b = pkk;
+                                best_ck = ck;
+                                // best_mark = '*';
                             }
-                            consensus->eff_cov[consensus_index] = coverage[i] ;
-                            consensus_index ++;
-                            break;
-                        default:
-                            break;
+                            /*
+                            printf("X %d %d %d %c %d %d %d %c %d %lf %c\n", coverage[i], i, j, base, aln_col->count, 
+                                                                  aln_col->p_t_pos[ck], 
+                                                                  aln_col->p_delta[ck], 
+                                                                  aln_col->p_q_base[ck], 
+                                                                  aln_col->link_count[ck],
+                                                                  score, best_mark);
+                            */
+                        }
+                        aln_col->score = best_score;
+                        if (best_score > g_best_score) {
+                            g_best_score = best_score;
+                            g_best_aln_col = aln_col;
+                            g_best_ck = best_ck;
+                            g_best_t_pos = i;
+                            //printf("GB %d %d %d %d\n", i, j, ck, g_best_aln_col);
+                        }
                     }
-                    //printf("c:%c\n", consensus[consensus_index-1]);
                 }
-
-            } 
-
-            if (j == local_nbase[i]) break;
-
-            switch (tag_seq_index[i][j].q_base) {
-                case 'A':
-                    counter[0] ++;
-                    break;
-                case 'C':
-                    counter[1] ++;
-                    break;
-                case 'G':
-                    counter[2] ++;
-                    break;
-                case 'T':
-                    counter[3] ++;
-                    break;
-                case '-':
-                    counter[4] ++;
-                    break;
-                default:
-                    break;
             }
-            /*
-            printf("%ld %ld %ld %u %c %u\n", i, j, tag_seq_index[i][j].t_pos,
-                                                   tag_seq_index[i][j].delta,
-                                                   tag_seq_index[i][j].q_base,
-                                                   tag_seq_index[i][j].q_id);
-            */
         }
     }
-   
-    //printf("%s\n", consensus);
+    assert(g_best_score != -1);
+
+    // reconstruct the sequences
+    unsigned int index;
+    char bb = '$';
+    int ck;
+    char * cns_str;
+    int * eqv;
+    double score0;
+    
+    consensus = calloc( 1, sizeof(consensus_data) );
+    consensus->sequence = calloc( t_len * 2 + 1, sizeof(char) );
+    consensus->eqv = calloc( t_len * 2 + 1, sizeof(unsigned int) );
+    cns_str = consensus->sequence;
+    eqv =  consensus->eqv;
+
+    index = 0;
+    ck = g_best_ck;
+    i = g_best_t_pos;
+
+    while (1) {
+        if (coverage[i] > min_cov) {
+            switch (ck) {
+                case 0: bb = 'A'; break;
+                case 1: bb = 'C'; break;
+                case 2: bb = 'G'; break;
+                case 3: bb = 'T'; break;
+                case 4: bb = '-'; break;
+            }
+        } else {
+            switch (ck) {
+                case 0: bb = 'a'; break;
+                case 1: bb = 'c'; break;
+                case 2: bb = 'g'; break;
+                case 3: bb = 't'; break;
+                case 4: bb = '-'; break;
+            }
+        }
+        // Note: On bad input, bb will keep previous value, possibly '$'.
+
+        score0 = g_best_aln_col->score;
+        i = g_best_aln_col->best_p_t_pos;
+        if (i == -1 || index >= t_len * 2) break;
+        j = g_best_aln_col->best_p_delta;
+        ck = g_best_aln_col->best_p_q_base;
+        g_best_aln_col = msa_array[i]->delta[j].base + ck;
+
+        if (bb != '-') {
+            cns_str[index] = bb;
+            eqv[index] = (int) score0 - (int) g_best_aln_col->score;
+            //printf("C %d %d %c %lf %d %d\n", i, index, bb, g_best_aln_col->score, coverage[i], eqv[index] );
+            index ++;
+        }
+    }
+    
+    // reverse the sequence
+    for (i = 0; i < index/2; i++) {
+        cns_str[i] = cns_str[i] ^ cns_str[index-i-1];
+        cns_str[index-i-1] = cns_str[i] ^ cns_str[index-i-1];
+        cns_str[i] = cns_str[i] ^ cns_str[index-i-1];
+        eqv[i] = eqv[i] ^ eqv[index-i-1];
+        eqv[index-i-1] = eqv[i] ^ eqv[index-i-1];
+        eqv[i] = eqv[i] ^ eqv[index-i-1];
+    }
 
+    cns_str[index] = 0;
+    //printf("%s\n", cns_str);
+#ifndef STATIC_ALLOCATE
     for (i = 0; i < t_len; i++) {
-        free(tag_seq_index[i]);
+        free_delta_group(msa_array[i]);
+        free(msa_array[i]);
     }
-    free(tag_seq_index);
-    free(aux_index);
+    
+    free(msa_array);
+#endif
+
+#ifdef STATIC_ALLOCATE
+    clean_msa_working_space(msa_array, t_len+1);
+#endif
+    
     free(coverage);
     free(local_nbase);
     return consensus;
@@ -346,18 +567,17 @@ consensus_data * generate_consensus( char ** input_seq,
                            unsigned long local_match_count_window,
                            unsigned long local_match_count_threshold,
                            double min_idt) {
+    // local_match_count_window and local_match_count_threshold are obsolete; the interface is kept for now
 
-    unsigned int i, j, k;
+    unsigned int j;
     unsigned int seq_count;
     unsigned int aligned_seq_count;
     kmer_lookup * lk_ptr;
     seq_array sa_ptr;
     seq_addr_array sda_ptr;
     kmer_match * kmer_match_ptr;
-    aln_range * arange_;
     aln_range * arange;
     alignment * aln;
-    align_tags_t * tags;
     align_tags_t ** tags_list;
     //char * consensus;
     consensus_data * consensus;
@@ -365,6 +585,7 @@ consensus_data * generate_consensus( char ** input_seq,
     max_diff = 1.0 - min_idt;
 
     seq_count = n_seq;
+    //printf("XX n_seq %d\n", n_seq);
     //for (j=0; j < seq_count; j++) {
     //    printf("seq_len: %u %u\n", j, strlen(input_seq[j]));
     //};
@@ -393,9 +614,10 @@ consensus_data * generate_consensus( char ** input_seq,
 
         //printf("2:%ld %ld %ld %ld\n\n", arange->s1, arange->e1, arange->s2, arange->e2);
         
-#define INDEL_ALLOWENCE_1 400
+#define INDEL_ALLOWENCE_1 0.10
         if (arange->e1 - arange->s1 < 100 || arange->e2 - arange->s2 < 100 ||
-            abs( (arange->e1 - arange->s1 ) - (arange->e2 - arange->s2) ) > INDEL_ALLOWENCE_1) {
+            abs( (arange->e1 - arange->s1 ) - (arange->e2 - arange->s2) ) > 
+                   (int) (0.5 * INDEL_ALLOWENCE_1 * (arange->e1 - arange->s1 + arange->e2 - arange->s2))) {
             free_kmer_match( kmer_match_ptr);
             free_aln_range(arange);
             continue;
@@ -414,8 +636,6 @@ consensus_data * generate_consensus( char ** input_seq,
                                                            aln->t_aln_str, 
                                                            aln->aln_str_size, 
                                                            arange, j, 
-                                                           local_match_count_window,
-                                                           local_match_count_threshold,
                                                            0); 
             aligned_seq_count ++;
         }
@@ -431,7 +651,14 @@ consensus_data * generate_consensus( char ** input_seq,
         free_kmer_match( kmer_match_ptr);
     }
 
-    consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, strlen(input_seq[0]), min_cov );
+    if (aligned_seq_count > 0) {
+        consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, strlen(input_seq[0]), min_cov );
+    } else {
+        // allocate an empty consensus sequence
+        consensus = calloc( 1, sizeof(consensus_data) );
+        consensus->sequence = calloc( 1, sizeof(char) );
+        consensus->eqv = calloc( 1, sizeof(unsigned int) );
+    }
     //free(consensus);
     free_seq_addr_array(sda_ptr);
     free_seq_array(sa_ptr);
@@ -450,12 +677,11 @@ consensus_data * generate_utg_consensus( char ** input_seq,
                            unsigned K,
                            double min_idt) {
 
-    unsigned int i, j, k;
+    unsigned int j;
     unsigned int seq_count;
     unsigned int aligned_seq_count;
     aln_range * arange;
     alignment * aln;
-    align_tags_t * tags;
     align_tags_t ** tags_list;
     //char * consensus;
     consensus_data * consensus;
@@ -482,8 +708,7 @@ consensus_data * generate_utg_consensus( char ** input_seq,
     arange->s2 = 0;
     arange->e2 = strlen(input_seq[0]); 
     tags_list[aligned_seq_count] = get_align_tags( input_seq[0], input_seq[0], 
-                                                   strlen(input_seq[0]), arange, 0, 
-                                                   12, 0, 0); 
+                                                   strlen(input_seq[0]), arange, 0, 0); 
     aligned_seq_count += 1;
     for (j=1; j < seq_count; j++) {
         arange->s1 = 0;
@@ -530,13 +755,20 @@ consensus_data * generate_utg_consensus( char ** input_seq,
         if (aln->aln_str_size > 500 && ((double) aln->dist / (double) aln->aln_str_size) < max_diff) {
             tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str, aln->t_aln_str, 
                                                            aln->aln_str_size, arange, j, 
-                                                           12, 0, offset[j]); 
+                                                           offset[j]); 
             aligned_seq_count ++;
         }
         free_alignment(aln);
     }
     free_aln_range(arange);
-    consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, utg_len, 0 );
+    if (aligned_seq_count > 0) {
+        consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, utg_len, 0 );
+    } else {
+        // allocate an empty consensus sequence
+        consensus = calloc( 1, sizeof(consensus_data) );
+        consensus->sequence = calloc( 1, sizeof(char) );
+        consensus->eqv = calloc( 1, sizeof(unsigned int) );
+    }
     //free(consensus);
     for (j=0; j < aligned_seq_count; j++) {
         free_align_tags(tags_list[j]);
@@ -548,7 +780,7 @@ consensus_data * generate_utg_consensus( char ** input_seq,
 
 void free_consensus_data( consensus_data * consensus ){
     free(consensus->sequence);
-    free(consensus->eff_cov);
+    free(consensus->eqv);
     free(consensus);
 }
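
For reference, the new relative indel filter in generate_consensus() (INDEL_ALLOWENCE_1 changed from a fixed 400 bases to 0.10) can be sketched in Python as follows; the function and argument names are illustrative, and the arithmetic mirrors the C condition in the hunk above:

    def range_is_usable(s1, e1, s2, e2, indel_allowance=0.10, min_span=100):
        # both aligned spans must cover at least 100 bases, and their length
        # difference must stay within 10% of the mean aligned span
        len1 = e1 - s1
        len2 = e2 - s2
        if len1 < min_span or len2 < min_span:
            return False
        return abs(len1 - len2) <= 0.5 * indel_allowance * (len1 + len2)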
 
diff --git a/src/c/kmer_lookup.c b/src/c/kmer_lookup.c
index d901b03..e19e200 100755
--- a/src/c/kmer_lookup.c
+++ b/src/c/kmer_lookup.c
@@ -70,7 +70,6 @@ int compare_seq_coor(const void * a, const void * b) {
 
 kmer_lookup * allocate_kmer_lookup ( seq_coor_t size ) {
     kmer_lookup * kl;
-    seq_coor_t i;
 
     //printf("%lu is allocated for kmer lookup\n", size);
     kl = (kmer_lookup *)  malloc( size * sizeof(kmer_lookup) );
@@ -82,8 +81,8 @@ void init_kmer_lookup ( kmer_lookup * kl,  seq_coor_t size ) {
     seq_coor_t i;
     //printf("%lu is allocated for kmer lookup\n", size);
     for (i=0; i<size; i++) {
-        kl[i].start = LONG_MAX;
-        kl[i].last = LONG_MAX;
+        kl[i].start = INT_MAX;
+        kl[i].last = INT_MAX;
         kl[i].count = 0;
     }
 }
@@ -102,7 +101,7 @@ seq_array allocate_seq(seq_coor_t size) {
 
 void init_seq_array( seq_array sa, seq_coor_t size) {
     seq_coor_t i;
-    for (i=0; i++; i<size) {
+    for (i=0; i<size; i++) {
         sa[i] = 0xff;
     }
 }
@@ -175,7 +174,7 @@ void add_sequence ( seq_coor_t start,
     for (i = 0; i < seq_len - K;  i++) {
         //printf("%lu %lu\n", i, kmer_bv);
         //printf("lk before init: %lu %lu %lu\n", kmer_bv, lk[kmer_bv].start, lk[kmer_bv].last);
-        if (lk[kmer_bv].start == LONG_MAX) {
+        if (lk[kmer_bv].start == INT_MAX) {
             lk[kmer_bv].start = start + i;
             lk[kmer_bv].last = start + i;
             lk[kmer_bv].count += 1;
@@ -197,8 +196,8 @@ void mask_k_mer(seq_coor_t size, kmer_lookup * kl, seq_coor_t threshold) {
     seq_coor_t i;
     for (i=0; i<size; i++) {
         if (kl[i].count > threshold) {
-            kl[i].start = LONG_MAX;
-            kl[i].last = LONG_MAX;
+            kl[i].start = INT_MAX;
+            kl[i].last = INT_MAX;
             //kl[i].count = 0;
         }
     }
@@ -252,7 +251,7 @@ kmer_match * find_kmer_pos_for_seq( char * seq, seq_coor_t seq_len, unsigned int
     half_K = K >> 1;
     for (i = 0; i < seq_len - K;  i += half_K) {
         kmer_bv = get_kmer_bitvector(sa + i, K);
-        if (lk[kmer_bv].start == LONG_MAX) {  //for high count k-mers
+        if (lk[kmer_bv].start == INT_MAX) {  //for high count k-mers
             continue;
         }
         kmer_pos = lk[ kmer_bv ].start;
@@ -310,19 +309,15 @@ aln_range* find_best_aln_range(kmer_match * km_ptr,
     long int max_k_mer_count;
     long int max_k_mer_bin;
     seq_coor_t cur_start;
-    seq_coor_t cur_pos;
-    seq_coor_t max_start;
-    seq_coor_t max_end;
-    seq_coor_t kmer_dist;
 
     arange = calloc(1 , sizeof(aln_range));
 
-    q_min = LONG_MAX;
+    q_min = INT_MAX;
     q_max = 0;
-    t_min = LONG_MAX;
+    t_min = INT_MAX;
     t_max = 0;
 
-    d_min = LONG_MAX;
+    d_min = INT_MAX;
     d_max = LONG_MIN;
 
     for (i = 0; i <  km_ptr->count; i++ ) {
@@ -355,13 +350,13 @@ aln_range* find_best_aln_range(kmer_match * km_ptr,
     for (i = 0; i <  km_ptr->count; i++ ) {
         d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]);
         d_count[ (d - d_min)/ (long int) bin_size ] += 1;
-        q_coor[i] = LONG_MAX;
-        t_coor[i] = LONG_MAX;
+        q_coor[i] = INT_MAX;
+        t_coor[i] = INT_MAX;
     }
 
     j = 0;
     max_k_mer_count = 0;
-    max_k_mer_bin = LONG_MAX;
+    max_k_mer_bin = INT_MAX;
     for (i = 0; i <  km_ptr->count; i++ ) {
         d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]);
         if ( d_count[ (d - d_min)/ (long int) bin_size ] > max_k_mer_count) {
@@ -371,7 +366,7 @@ aln_range* find_best_aln_range(kmer_match * km_ptr,
     }
     //printf("k_mer: %lu %lu\n" , max_k_mer_count, max_k_mer_bin);
     
-    if ( max_k_mer_bin != LONG_MAX && max_k_mer_count > count_th ) {
+    if ( max_k_mer_bin != INT_MAX && max_k_mer_count > count_th ) {
         for (i = 0; i <  km_ptr->count; i++ ) {
             d = (long int) (km_ptr->query_pos[i]) - (long int) (km_ptr->target_pos[i]);
             if ( abs( ( (d - d_min)/ (long int) bin_size ) - max_k_mer_bin ) > 5 ) {
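
For orientation, find_best_aln_range() bins k-mer hits by their diagonal d = query_pos - target_pos and keeps only hits close to the most populated diagonal band (the "> 5" bin rejection above). A rough Python sketch of that idea, using an assumed list of (query_pos, target_pos) pairs rather than the C structs:

    from collections import Counter

    def best_diagonal_band(hits, bin_size, count_th):
        # hits: list of (query_pos, target_pos) pairs from the k-mer lookup
        if not hits:
            return []
        bins = Counter((q - t) // bin_size for q, t in hits)
        best_bin, best_count = bins.most_common(1)[0]
        if best_count <= count_th:
            return []  # no diagonal band is supported well enough
        # keep hits whose diagonal bin lies within 5 bins of the best one
        return [(q, t) for q, t in hits
                if abs((q - t) // bin_size - best_bin) <= 5]
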
diff --git a/src/py/FastaReader.py b/src/py/FastaReader.py
new file mode 100644
index 0000000..65085bd
--- /dev/null
+++ b/src/py/FastaReader.py
@@ -0,0 +1,260 @@
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from os.path import abspath, expanduser
+from cStringIO import StringIO
+import md5
+import re
+
+def splitFastaHeader( name ):
+    """
+    Split a FASTA/FASTQ header into its id and metadata components
+    """
+    nameParts = re.split('\s', name, maxsplit=1)
+    id_ = nameParts[0]
+    if len(nameParts) > 1:
+        metadata = nameParts[1].strip()
+    else:
+        metadata = None
+    return (id_, metadata)
+
+def splitFileContents(f, delimiter, BLOCKSIZE=8192):
+    """
+    Same semantics as f.read().split(delimiter), but with memory usage
+    determined by largest chunk rather than entire file size
+    """
+    remainder = StringIO()
+    while True:
+        block = f.read(BLOCKSIZE)
+        if not block:
+            break
+        parts = block.split(delimiter)
+        remainder.write(parts[0])
+        for part in parts[1:]:
+            yield remainder.getvalue()
+            remainder = StringIO()
+            remainder.write(part)
+    yield remainder.getvalue()
+
+def isFileLikeObject(o):
+    return hasattr(o, "read") and hasattr(o, "write")
+
+def getFileHandle(filenameOrFile, mode="r"):
+    """
+    Given a filename not ending in ".gz", open the file with the
+    appropriate mode.
+    Given a filename ending in ".gz", return a filehandle to the
+    unzipped stream.
+    Given a file object, return it unless the mode is incorrect--in
+    that case, raise an exception.
+    """
+    assert mode in ("r", "w")
+
+    if isinstance(filenameOrFile, basestring):
+        filename = abspath(expanduser(filenameOrFile))
+        if filename.endswith(".gz"):
+            return gzip.open(filename, mode)
+        else:
+            return open(filename, mode)
+    elif isFileLikeObject(filenameOrFile):
+        return filenameOrFile
+    else:
+        raise Exception("Invalid type to getFileHandle")
+
+
+class ReaderBase(object):
+    def __init__(self, f):
+        """
+        Prepare for iteration through the records in the file
+        """
+        self.file = getFileHandle(f, "r")
+
+    def close(self):
+        """
+        Close the underlying file
+        """
+        self.file.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+
+class FastaRecord(object):
+    """
+    A FastaRecord object models a named sequence in a FASTA file.
+    """
+    DELIMITER = ">"
+    COLUMNS   = 60
+
+    def __init__(self, name, sequence):
+        try:
+            assert "\n" not in name
+            assert "\n" not in sequence
+            assert self.DELIMITER not in sequence
+            self._name = name
+            self._sequence = sequence
+            self._md5 = md5.md5(self.sequence).hexdigest()
+            self._id, self._metadata = splitFastaHeader(name)
+        except AssertionError:
+            raise ValueError("Invalid FASTA record data")
+
+    @property
+    def name(self):
+        """
+        The name of the sequence in the FASTA file, equal to the entire
+        FASTA header following the '>' character
+        """
+        return self._name
+
+    @property
+    def id(self):
+        """
+        The id of the sequence in the FASTA file, equal to the FASTA header
+        up to the first whitespace.
+        """
+        return self._id
+
+    @property
+    def metadata(self):
+        """
+        The metadata associated with the sequence in the FASTA file, equal to
+        the contents of the FASTA header following the first whitespace
+        """
+        return self._metadata
+
+    @property
+    def sequence(self):
+        """
+        The sequence for the record as present in the FASTA file.
+        (Newlines are removed but otherwise no sequence normalization
+        is performed).
+        """
+        return self._sequence
+
+    @property
+    def length(self):
+        """
+        Get the length of the FASTA sequence
+        """
+        return len(self._sequence)
+
+    @property
+    def md5(self):
+        """
+        The MD5 checksum (hex digest) of `sequence`
+        """
+        return self._md5
+
+    @classmethod
+    def fromString(cls, s):
+        """
+        Interprets a string as a FASTA record.  Does not make any
+        assumptions about wrapping of the sequence string.
+        """
+        try:
+            lines = s.splitlines()
+            assert len(lines) > 1
+            assert lines[0][0] == cls.DELIMITER
+            name = lines[0][1:]
+            sequence = "".join(lines[1:])
+            return FastaRecord(name, sequence)
+        except AssertionError:
+            raise ValueError("String not recognized as a valid FASTA record")
+
+    def reverseComplement(self, preserveHeader=False):
+        """
+        Return a new FastaRecord with the reverse-complemented DNA sequence.
+        Optionally, the original header can be preserved.
+        """
+        rcSequence = sequences.reverseComplement(self.sequence)
+        if preserveHeader:
+            return FastaRecord(self.name, rcSequence)
+        else:
+            rcName = '{0} [revcomp]'.format(self.name.strip())
+            return FastaRecord(rcName, rcSequence)
+
+    def __eq__(self, other):
+        if isinstance(other, self.__class__):
+            return (self.name     == other.name and
+                    self.sequence == other.sequence)
+        else:
+            return False
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __str__(self):
+        """
+        Output a string representation of this FASTA record, observing
+        standard conventions about sequence wrapping.
+        """
+        return (">%s\n" % self.name) + \
+            wrap(self.sequence, self.COLUMNS)
+
+
+class FastaReader(ReaderBase):
+    """
+    Streaming reader for FASTA files, usable as a one-shot iterator
+    over FastaRecord objects.  Agnostic about line wrapping.
+    Example:
+    .. doctest::
+        TODO: Get data.
+        > from pbcore import data
+        > filename = data.getTinyFasta()
+        > r = FastaReader(filename)
+        > for record in r:
+        ...     print record.name, len(record.sequence), record.md5
+        ref000001|EGFR_Exon_2 183 e3912e9ceacd6538ede8c1b2adda7423
+        ref000002|EGFR_Exon_3 203 4bf218da37175a91869033024ac8f9e9
+        ref000003|EGFR_Exon_4 215 245bc7a046aad0788c22b071ed210f4d
+        ref000004|EGFR_Exon_5 157 c368b8191164a9d6ab76fd328e2803ca
+        >>> r.close()
+    """
+    DELIMITER = ">"
+
+    def __iter__(self):
+        try:
+            parts = splitFileContents(self.file, ">")
+            assert "" == next(parts)
+            for part in parts:
+                yield FastaRecord.fromString(">" + part)
+        except AssertionError:
+            raise ValueError("Invalid FASTA file")
+
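
Since the doctest above is still marked TODO, here is a small usage sketch for this reader; the file name is made up, and the import path follows how the other new modules in this patch reference FastaReader:

    from falcon_kit.FastaReader import FastaReader

    reader = FastaReader("preads4falcon.fasta")  # any FASTA file, wrapped or not
    for record in reader:
        print record.id, record.length, record.md5
    reader.close()
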
diff --git a/src/py/falcon_kit.py b/src/py/falcon_kit.py
index 46b776e..32a85ac 100644
--- a/src/py/falcon_kit.py
+++ b/src/py/falcon_kit.py
@@ -35,14 +35,18 @@
 # OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #################################################################################$$
-
+__all__ = [
+    'kup', 'DWA', 'falcon',
+    'KmerLookup', 'KmerMatch', 'AlnRange', 'ConsensusData',
+    'Alignment', 'get_alignment',
+    ]
 
 from ctypes import *
-import os
-module_path = os.path.split(__file__)[0]
+from . import ext_falcon
+#module_path = os.path.split(__file__)[0]
 
 
-seq_coor_t = c_long
+seq_coor_t = c_int
 base_t = c_uint8
 
 class KmerLookup(Structure):
@@ -66,7 +70,10 @@ class ConsensusData(Structure):
     _fields_ = [ ("sequence", c_char_p),
                  ("eff_cov", POINTER(c_uint)) ]
 
-kup = CDLL(os.path.join(module_path, "kmer_lookup.so"))
+
+falcon_dll = CDLL(ext_falcon.__file__)
+
+kup = falcon_dll
 
 kup.allocate_kmer_lookup.argtypes =  [seq_coor_t] 
 kup.allocate_kmer_lookup.restype = POINTER(KmerLookup)
@@ -121,22 +128,21 @@ class Alignment(Structure):
                  ("t_aln_str", c_char_p)]
 
 
-DWA = CDLL(os.path.join(module_path, "DW_align.so"))
+DWA = falcon_dll
+
 DWA.align.argtypes = [ POINTER(c_char), c_long, POINTER(c_char), c_long, c_long, c_int ] 
 DWA.align.restype = POINTER(Alignment)
 DWA.free_alignment.argtypes = [POINTER(Alignment)]
 
 
 
-falcon = CDLL(os.path.join(module_path,"falcon.so"))
+falcon = falcon_dll
 
 falcon.generate_consensus.argtypes = [POINTER(c_char_p), c_uint, c_uint, c_uint, c_uint, c_uint, c_double  ]
 falcon.generate_consensus.restype = POINTER(ConsensusData)
 falcon.free_consensus_data.argtypes = [ POINTER(ConsensusData) ]
 
 
-
-
 def get_alignment(seq1, seq0):
     K = 8
     lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
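
One consequence of this change, sketched below: kup, DWA and falcon are no longer three separately loaded shared objects but one ctypes handle to the ext_falcon extension (assuming the falcon_kit package exposes them the way the other new scripts in this patch import them):

    from falcon_kit import kup, DWA, falcon

    # all three names now refer to the single ext_falcon shared library
    print kup is DWA is falcon  # expected: True
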
diff --git a/src/py/fc_asm_graph.py b/src/py/fc_asm_graph.py
new file mode 100644
index 0000000..8f7d235
--- /dev/null
+++ b/src/py/fc_asm_graph.py
@@ -0,0 +1,212 @@
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#  notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above
+#  copyright notice, this list of conditions and the following
+#  disclaimer in the documentation and/or other materials provided
+#  with the distribution.
+#
+#  * Neither the name of Pacific Biosciences nor the names of its
+#  contributors may be used to endorse or promote products derived
+#  from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+import networkx as nx
+from FastaReader import FastaReader
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+
+def reverse_end( node_id ):
+    node_id, end = node_id.split(":")
+    new_end = "B" if end == "E" else "E"
+    return node_id + ":" + new_end
+
+class AsmGraph(object):
+
+    def __init__(self, sg_file, utg_file, ctg_file):
+        self.sg_edges = {}
+        self.sg_edge_seqs = {}
+        self.utg_data = {}
+        self.ctg_data ={}
+        self.utg_to_ctg = {}
+        self.node_to_ctg = {}
+        self.node_to_utg = {}
+
+        self.load_sg_data(sg_file)
+        self.load_utg_data(utg_file)
+        self.load_ctg_data(ctg_file)
+
+        self.build_node_map()
+
+    def load_sg_data(self, sg_file):
+
+        with open(sg_file) as f:
+            for l in f:
+                l = l.strip().split()
+                v, w = l[0:2]
+                seq_id, b, e = l[2:5]
+                b, e = int(b), int(e)
+                score, idt = l[5:7]
+                score, idt = int(score), float(idt)
+                type_ = l[7]
+                self.sg_edges[ (v, w) ] = ( (seq_id, b, e), score, idt, type_)
+
+    def load_sg_seq(self, fasta_fn):
+
+        all_read_ids = set() # read ids in the graph
+
+        for v, w in self.sg_edges:
+            type_ = self.sg_edges[ (v, w) ][-1]
+            if type_ != "G":
+                continue
+            v = v.split(":")[0]
+            w = w.split(":")[0]
+            all_read_ids.add(v)
+            all_read_ids.add(w)
+
+        seqs = {}
+        # load the sequences of all p-reads in the graph into memory
+        f = FastaReader(fasta_fn)
+        for r in f:
+            if r.name not in all_read_ids:
+                continue
+            seqs[r.name] = r.sequence.upper()
+
+
+        for v, w in self.sg_edges:
+            seq_id, s, t = self.sg_edges[ (v, w) ][0]
+            type_ = self.sg_edges[ (v, w) ][-1]
+
+            if type_ != "G":
+                continue
+
+            if s < t:
+                e_seq = seqs[ seq_id ][ s:t ]
+            else:
+                e_seq = "".join([ RCMAP[c] for c in seqs[ seq_id ][ s:t:-1 ] ])
+            self.sg_edge_seqs[ (v, w) ] = e_seq
+
+    def get_seq_from_path(self, path):
+        if len(self.sg_edge_seqs) == 0:
+            return ""
+        v = path[0]
+        seqs = []
+        for w in path[1:]:
+            seqs.append( self.sg_edge_seqs[ (v, w) ] )
+            v = w
+        return "".join(seqs)
+
+
+    def load_utg_data(self, utg_file):
+
+        with open(utg_file) as f:
+            for l in f:
+                l = l.strip().split()
+                s, v, t = l[0:3]
+                type_, length, score = l[3:6]
+                length, score = int(length), int(score)
+                path_or_edges = l[6]
+                self.utg_data[ (s,t,v) ] = ( type_, length, score, path_or_edges)
+
+
+    def load_ctg_data(self, ctg_file):
+
+        with open(ctg_file) as f:
+            for l in f:
+                l = l.strip().split()
+                ctg_id, ctg_type = l[0:2]
+                start_edge = l[2]
+                end_node = l[3]
+                length = int(l[4])
+                score = int(l[5])
+                path = tuple( ( e.split("~") for e in l[6].split("|") ) )
+                self.ctg_data[ ctg_id ] = ( ctg_type, start_edge, end_node,  length, score, path )
+                for u in path:
+                    s, v, t = u
+                    #print s, v, t
+                    type_, length, score, path_or_edges =  self.utg_data[ (s,t,v) ]
+                    if type_ != "compound":
+                        self.utg_to_ctg[ (s, t, v) ] = ctg_id
+                    else:
+                        for svt in path_or_edges.split("|"):
+                            s, v, t = svt.split("~")
+                            self.utg_to_ctg[ (s, t, v) ] = ctg_id
+
+
+    def get_sg_for_utg(self, utg_id):
+        sg = nx.DiGraph()
+        type_, length, score, path_or_edges =  self.utg_data[ utg_id ]
+        if type_ == "compound":
+            for svt in path_or_edges.split("|"):
+                s, v, t = svt.split("~")
+                type_, length, score, one_path =  self.utg_data[ (s, t, v) ]
+                one_path = one_path.split("~")
+                sg.add_path(one_path)
+        else:
+            one_path = path_or_edges.split("~")
+            sg.add_path(one_path)
+        return sg
+
+
+    def get_sg_for_ctg(self, ctg_id):
+        sg = nx.DiGraph()
+        utgs = []
+        path = self.ctg_data[ctg_id][-1]
+        for s, v, t in path:
+            type_, length, score, path_or_edges =  self.utg_data[ (s, t, v) ]
+            utgs.append( (type_, path_or_edges) )
+
+        for t, utg in utgs:
+            if t == "simple":
+                one_path = utg.split("~")
+                sg.add_path(one_path)
+            elif t == "compound":
+                for svt in utg.split("|"):
+                    s, v, t = svt.split("~")
+                    type_, length, score, one_path =  self.utg_data[ (s, t, v) ]
+                    one_path = one_path.split("~")
+                    sg.add_path(one_path)
+
+        return sg
+
+
+    def build_node_map(self):
+
+        for ctg_id in self.ctg_data:
+            sg = self.get_sg_for_ctg( ctg_id )
+            for n in sg.nodes():
+                self.node_to_ctg.setdefault(n, set())
+                self.node_to_ctg[n].add(ctg_id)
+
+
+        for u_id in self.utg_data:
+            if self.utg_data[u_id][0] == "compound":
+                continue
+            sg = self.get_sg_for_utg( u_id )
+            for n in sg.nodes():
+                self.node_to_utg.setdefault(n, set())
+                self.node_to_utg[n].add( u_id )
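
A short usage sketch for AsmGraph; the three input file names are the ones the other new scripts in this patch read from the assembly working directory:

    from falcon_kit.fc_asm_graph import AsmGraph

    G = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
    for ctg_id in sorted(G.ctg_data):
        ctg_type, start_edge, end_node, length, score, path = G.ctg_data[ctg_id]
        print ctg_id, ctg_type, length, score, len(path)
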
diff --git a/src/py/functional.py b/src/py/functional.py
new file mode 100644
index 0000000..020c241
--- /dev/null
+++ b/src/py/functional.py
@@ -0,0 +1,89 @@
+"""Purely functional code.
+"""
+import collections
+import re
+
+def _verify_pairs(pairs1, pairs2):
+    if pairs1 != pairs2:
+        print('pair2dali:', pairs1)
+        print('pair2sort:', pairs2)
+        print('pair2dali:', len(pairs1))
+        print('pair2sort:', len(pairs2))
+        assert pairs1 == pairs2
+
+def get_daligner_job_descriptions(run_jobs_stream, db_prefix):
+    """Return a dict of job-desc-tuple -> HPCdaligner bash-job.
+
+    E.g., each item will look like:
+      (2, 1, 2, 3): 'daligner ...; LAsort ...; LAmerge ...; rm ...'
+
+    Rationale
+    ---------
+    For i/o efficiency, we will combine daligner calls with LAsort lines, which include 0-level merge.
+    Example:
+      daligner -v -t16 -H12000 -e0.7 -s1000 raw_reads.2 raw_reads.1 raw_reads.2
+    That would be combined with two LAsort lines:
+      LAsort -v raw_reads.2.raw_reads.1.C0 ...
+      LAsort -v raw_reads.2.raw_reads.2.C0 ...
+    For each returned job, the result of
+      daligner X A B C; LAsort*
+    will then be
+      L.1.X.A, L.1.X.B, and L.1.X.C
+    where A, B, or C could be X.
+    (In the example, X=2 A=1 B=2.)
+
+    Comments and lines starting with LAmerge are ignored.
+    """
+    re_block_dali = re.compile(r'%s\.(\d+)' %db_prefix)
+    def blocks_dali(line):
+        return [mo.group(1) for mo in re_block_dali.finditer(line)]
+    # X == blocks[0]; A/B/C = blocks[...]
+
+    re_pair_sort = re.compile(r'%s\.(\d+)\.%s\.(\d+)' %(db_prefix, db_prefix))
+    def LAsort_pair(line):
+        return re_pair_sort.search(line).group(1, 2)
+
+    lines = [line.strip() for line in run_jobs_stream]
+    assert any(len(l) > 1 for l in lines) # in case caller passed filename, not stream
+    lines_dali = [l for l in lines if l.startswith('daligner')] # could be daligner_p
+    lines_sort = [l for l in lines if l.startswith('LAsort')]
+    pair2dali = {}
+    for line in lines_dali:
+        blocks = blocks_dali(line)
+        for block in blocks[1:]:
+            pair = (blocks[0], block)
+            pair2dali[pair] = line
+            if block != blocks[0]:
+                # Then we have a reverse comparison too.
+                # https://dazzlerblog.wordpress.com/2014/07/10/dalign-fast-and-sensitive-detection-of-all-pairwise-local-alignments/
+                rpair = (block, blocks[0])
+                pair2dali[rpair] = line
+    pair2sort = {}
+    for line in lines_sort:
+        pair = LAsort_pair(line)
+        pair2sort[pair] = line
+    _verify_pairs(sorted(pair2dali.keys()), sorted(pair2sort.keys()))
+    dali2pairs = collections.defaultdict(set)
+    for pair, dali in pair2dali.items():
+        dali2pairs[dali].add(pair)
+    result = {}
+    for dali, pairs in dali2pairs.items():
+        sorts = [pair2sort[pair] for pair in sorted(pairs, key=lambda k: (int(k[0]), int(k[1])))]
+        id = tuple(map(int, blocks_dali(dali)))
+        script = '\n'.join([dali] + sorts) + '\n'
+        result[id] = script
+    return result
+
+_re_sub_daligner = re.compile(r'^daligner\b', re.MULTILINE)
+def xform_script_for_preads(script):
+    daligner_exe = 'daligner_p'
+    return _re_sub_daligner.sub(daligner_exe, script) #, flags=re.MULTILINE) # flags in py2.7
+
+def xform_script_for_raw_reads(script):
+    return script
+
+def get_script_xformer(pread_aln):
+    if pread_aln:
+        return xform_script_for_preads
+    else:
+        return xform_script_for_raw_reads
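
A toy example of the grouping performed by get_daligner_job_descriptions(); the daligner/LAsort lines are abbreviated, not real HPCdaligner output, and the falcon_kit.functional import path is assumed from this patch's layout:

    from StringIO import StringIO
    from falcon_kit import functional

    run_jobs = StringIO("""\
    daligner raw_reads.2 raw_reads.1 raw_reads.2
    LAsort raw_reads.1.raw_reads.2.C0
    LAsort raw_reads.2.raw_reads.1.C0
    LAsort raw_reads.2.raw_reads.2.C0
    """)
    jobs = functional.get_daligner_job_descriptions(run_jobs, "raw_reads")
    for job_id, script in sorted(jobs.items()):
        print job_id  # e.g. (2, 1, 2)
        print script  # the daligner call followed by its LAsort lines
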
diff --git a/src/py/mains/__init__.py b/src/py/mains/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/py/mains/actg_coordinate.py b/src/py/mains/actg_coordinate.py
new file mode 100644
index 0000000..aba479c
--- /dev/null
+++ b/src/py/mains/actg_coordinate.py
@@ -0,0 +1,27 @@
+from falcon_kit.FastaReader import FastaReader
+
+
+def main(argv=None):
+  p_ctg_coor_map = {}
+  with open("p_ctg_tiling_path") as f:
+    for row in f:
+        row = row.strip().split()
+        ctg_id, v, w, edge_rid, b, e  = row[:6]
+        if ctg_id not in p_ctg_coor_map:
+            coor = 0   # the p_ctg_tiling_path should be sorted by contig, then by the order of the edges in the tiling path
+            p_ctg_coor_map[ctg_id] = {}
+            p_ctg_coor_map[ctg_id][v] = 0
+            coor += abs(int(b) - int(e))
+            p_ctg_coor_map[ctg_id][w] = coor
+            continue
+        else:
+            coor += abs(int(b) - int(e))
+            p_ctg_coor_map[ctg_id][w] = coor 
+
+
+  a_ctg_fasta = FastaReader("a_ctg.fa")
+  for r in a_ctg_fasta:
+    rid = r.name.split()
+    rid, v, w = rid[:3]
+    pid = rid.split("-")[0]
+    print rid, p_ctg_coor_map[pid][v], p_ctg_coor_map[pid][w]
diff --git a/src/py_scripts/falcon_sense.py b/src/py/mains/consensus.py
similarity index 65%
copy from src/py_scripts/falcon_sense.py
copy to src/py/mains/consensus.py
index c23b7bf..42a720c 100644
--- a/src/py_scripts/falcon_sense.py
+++ b/src/py/mains/consensus.py
@@ -1,51 +1,12 @@
-#!/usr/bin/env python
-
-#################################################################################$$
-# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted (subject to the limitations in the
-# disclaimer below) provided that the following conditions are met:
-#
-#  * Redistributions of source code must retain the above copyright
-#  notice, this list of conditions and the following disclaimer.
-#
-#  * Redistributions in binary form must reproduce the above
-#  copyright notice, this list of conditions and the following
-#  disclaimer in the documentation and/or other materials provided
-#  with the distribution.
-#
-#  * Neither the name of Pacific Biosciences nor the names of its
-#  contributors may be used to endorse or promote products derived
-#  from this software without specific prior written permission.
-#
-# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
-# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
-# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
-# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
-# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-# SUCH DAMAGE.
-#################################################################################$$
-
-from ctypes import *
-import sys
-from multiprocessing import Pool
+from ctypes import (POINTER, c_char_p, c_uint, c_uint, c_uint, c_uint, c_uint, c_double, string_at)
+from falcon_kit.multiproc import Pool
+from falcon_kit import falcon
+import argparse
 import os
+import re
+import sys
 import falcon_kit
 
-module_path = falcon_kit.__path__[0]
-
-falcon = CDLL(os.path.join(module_path, "falcon.so"))
 
 falcon.generate_consensus.argtypes = [ POINTER(c_char_p), c_uint, c_uint, c_uint, c_uint, c_uint, c_double ]
 falcon.generate_consensus.restype = POINTER(falcon_kit.ConsensusData)
@@ -155,52 +116,78 @@ def get_consensus_with_trim( c_input ):
     return consensus, seed_id
 
 
-def get_seq_data(config):
+def get_seq_data(config, min_cov_aln, min_len_aln):
+    max_len = 100000
+    min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
     seqs = []
     seed_id = None
     seqs_data = []
+    read_ids = set()
     with sys.stdin as f:
         for l in f:
             l = l.strip().split()
             if len(l) != 2:
                 continue
-            if l[0] not in ("+", "-"):
-                if len(l[1]) > 100:
+
+            read_id = l[0]
+            seq = l[1]
+            if len(seq) > max_len:
+                seq = seq[:max_len-1]
+
+            if read_id not in ("+", "-", "*"):
+                if len(seq) >= min_len_aln:
                     if len(seqs) == 0:
-                        seqs.append(l[1]) #the "seed"
+                        seqs.append(seq) #the "seed"
                         seed_id = l[0]
-                    seqs.append(l[1])
+                    if read_id not in read_ids: #avoid using the same read twice; the seed is used again here by design
+                        seqs.append(seq)
+                        read_ids.add(read_id)
             elif l[0] == "+":
-                if len(seqs) > 10:
-                    yield (seqs, seed_id, config) 
+                if len(seqs) >= min_cov_aln:
+                    seqs = seqs[:1] + sorted(seqs[1:], key=lambda x: -len(x))
+                    yield (seqs[:max_n_read], seed_id, config) 
                 #seqs_data.append( (seqs, seed_id) ) 
                 seqs = []
+                read_ids = set()
+                seed_id = None
+            elif l[0] == "*":
+                seqs = []
+                read_ids = set()
                 seed_id = None
             elif l[0] == "-":
                 #yield (seqs, seed_id)
                 #seqs_data.append( (seqs, seed_id) )
                 break
+def format_seq(seq, col):
+    return "\n".join( [ seq[i:(i+col)] for i in xrange(0, len(seq), col) ] )
 
-if __name__ == "__main__":
-    import argparse
-    import re
+def main(argv=sys.argv):
     parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
     parser.add_argument('--n_core', type=int, default=24,
-                        help='number of processes used for generating consensus')
+                        help='number of processes used for generating consensus; '
+                        '0 for main process only (default=%(default)s)')
     parser.add_argument('--local_match_count_window', type=int, default=12,
-                        help='local match window size')
+                        help='local match window size (obsolete, no effect)')
     parser.add_argument('--local_match_count_threshold', type=int, default=6,
-                        help='local match count threshold')
+                        help='local match count threshold (obsolete, no effect)')
     parser.add_argument('--min_cov', type=int, default=6,
                         help='minimum coverage to break the consensus')
+    parser.add_argument('--min_cov_aln', type=int, default=10,
+                        help='minimum coverage of alignment data; an alignment with fewer reads will be completely ignored')
+    parser.add_argument('--min_len_aln', type=int, default=100,
+                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
     parser.add_argument('--max_n_read', type=int, default=500,
-                        help='minimum number of reads used in generating the consensus')
+                        help='maximum number of reads used in generating the consensus')
     parser.add_argument('--trim', action="store_true", default=False,
                         help='trim the input sequence with k-mer sparse dynamic programming to find the mapped range')
     parser.add_argument('--output_full', action="store_true", default=False,
                         help='output uncorrected regions too')
     parser.add_argument('--output_multi', action="store_true", default=False,
-                        help='output multi correct regions')
+                        help='output multiple corrected regions; implies --output_dformat unless --output_simple_fasta_header')
+    parser.add_argument('--output_dformat', action="store_true", default=True,
+                        help='output daligner-compatible headers; only works with --output_multi; DEPRECATED and ignored, as this is now the default')
+    parser.add_argument('--output_simple_fasta_header', action='store_true', default=False,
+                        help='Turn off --output_dformat. This was only needed for DALIGNER versions older than spring 2015.')
     parser.add_argument('--min_idt', type=float, default=0.70,
                         help='minimum identity of the alignments used for correction')
     parser.add_argument('--edge_tolerance', type=int, default=1000,
@@ -208,8 +195,10 @@ if __name__ == "__main__":
     parser.add_argument('--trim_size', type=int, default=50,
                         help='the size for trimming both ends of the initial sparse aligned region')
     good_region = re.compile("[ACGT]+")
-    args = parser.parse_args()
-    exe_pool = Pool(args.n_core)
+    args = parser.parse_args(argv[1:])
+    def Start():
+        print>>sys.stderr, 'Started a worker in %d from parent %d' %(os.getpid(), os.getppid())
+    exe_pool = Pool(args.n_core, initializer=Start)
     if args.trim:
         get_consensus = get_consensus_with_trim
     else:
@@ -218,26 +207,35 @@ if __name__ == "__main__":
     K = 8
     config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold,\
              args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size
-    for res in exe_pool.imap(get_consensus, get_seq_data(config)):  
+    # TODO: pass config object, not tuple, so we can add fields
+    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_cov_aln, args.min_len_aln)):
         cns, seed_id = res
-        if args.output_full == True:
-            if len(cns) > 500:
-                print ">"+seed_id+"_f"
-                print cns
+        if len(cns) < 500:
+            continue
+
+
+        if args.output_full:
+            print ">"+seed_id+"_f"
+            print cns
         else:
             cns = good_region.findall(cns)
             if len(cns) == 0:
                 continue
-            if args.output_multi == True:
+            if args.output_multi:
                 seq_i = 0
                 for cns_seq in cns:
-                    if len(cns_seq) > 500:
+                    if len(cns_seq) < 500:
+                        continue
+                    if not args.output_simple_fasta_header:
+                        if seq_i >= 10:
+                            break
+                        print ">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq))
+                        print format_seq(cns_seq, 80)
+                    else:
                         print ">"+seed_id+"_%d" % seq_i
                         print cns_seq
                     seq_i += 1
             else:
                 cns.sort(key = lambda x: len(x))
-                if len(cns[-1]) > 500:
-                    print ">"+seed_id
-                    print cns[-1]
-
+                print ">"+seed_id
+                print cns[-1]
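
For reference, a sketch of the stdin stream that get_seq_data() parses: every record line has two whitespace-separated fields, and the separator tokens are exactly the ones the code tests for. Read ids and sequences below are toy values:

    example_stream = "\n".join([
        "seed001 ACTGACTGACTGACTG",  # first sequence of a group is the seed
        "read002 ACTGACCGACTGACTG",  # supporting reads for the same seed
        "read003 ACTGACTGACTGACTT",
        "+ +",                       # end of group: a consensus job is emitted
        "seed004 GGGTACGTACGTACGT",
        "* *",                       # drop the current group without emitting
        "- -",                       # end of input
    ])
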
diff --git a/src/py/mains/contig_annotate.py b/src/py/mains/contig_annotate.py
new file mode 100644
index 0000000..9313a7c
--- /dev/null
+++ b/src/py/mains/contig_annotate.py
@@ -0,0 +1,29 @@
+import networkx as nx
+from falcon_kit.fc_asm_graph import AsmGraph
+
+
+def main(argv=None):
+  G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
+
+
+  p_ctg_coor_map = {}
+  for fn in ("p_ctg_tiling_path", "a_ctg_tiling_path"):
+    f = open(fn)
+    for row in f:
+        row = row.strip().split()
+        ctg_id, v, w, edge_rid, b, e  = row[:6]
+        if ctg_id not in p_ctg_coor_map:
+            coor = 0   # the p_ctg_tiling_path should be sorted by contig, then by the order of the edges in the tiling path
+            p_ctg_coor_map[ctg_id] = {}
+            p_ctg_coor_map[ctg_id][v] = 0
+            coor += abs(int(b) - int(e))
+            p_ctg_coor_map[ctg_id][w] = coor
+            G_asm.node_to_ctg[w]
+            print ctg_id, v, 0, " ".join(list(G_asm.node_to_ctg[v]))
+            print ctg_id, w, coor, " ".join(list(G_asm.node_to_ctg[w]))
+            continue
+        else:
+            coor += abs(int(b) - int(e))
+            p_ctg_coor_map[ctg_id][w] = coor 
+            print ctg_id, w, coor, " ".join(list(G_asm.node_to_ctg[w]))
+    f.close()
diff --git a/src/py/mains/ctg_link_analysis.py b/src/py/mains/ctg_link_analysis.py
new file mode 100644
index 0000000..39bc84f
--- /dev/null
+++ b/src/py/mains/ctg_link_analysis.py
@@ -0,0 +1,80 @@
+from falcon_kit import fc_asm_graph 
+
+def main(argv=None):
+  AsmGraph = fc_asm_graph.AsmGraph
+
+  G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
+
+  sg_edges = G_asm.sg_edges
+  node_to_ctg = G_asm.node_to_ctg
+  node_to_utg = G_asm.node_to_utg
+
+  ctg_data = G_asm.ctg_data
+  utg_data = G_asm.utg_data
+
+  ctg_pair_links = {}
+  for v, w in sg_edges.keys():
+    if v in node_to_ctg and w in node_to_ctg:
+        for ctg1 in list(node_to_ctg[v]):
+            for ctg2 in list(node_to_ctg[w]):
+                if ctg1 == ctg2:
+                    continue
+                ctg_pair_links.setdefault((ctg1, ctg2), set())
+                ctg_pair_links[ (ctg1, ctg2) ].add( (v,w) )    
+
+                    
+  utg_pair_links = {}
+  for v, w in sg_edges.keys():
+    if v in node_to_utg and w in node_to_utg:
+        for u1 in list(node_to_utg[v]):
+            for u2 in list(node_to_utg[w]):
+                if u1 == u2:
+                    continue
+                utg_pair_links.setdefault((u1, u2), set())
+                utg_pair_links[(u1,u2)].add( (v, w) )
+
+
+  for ctg1, ctg2 in ctg_pair_links:
+    links = ctg_pair_links[ ( ctg1, ctg2 ) ]
+    count = len(links)
+    if count > 0:
+        path1 = ctg_data[ctg1][-1][-5:]
+        path2 = ctg_data[ctg2][-1][:5]
+        utg1 = []
+        utg2 = []
+        for s1, v1, t1 in path1:
+            u1 = (s1, t1, v1)
+            type_, length, score, path_or_edges =  utg_data[ u1 ]
+            if type_ == "compound":
+                for u in path_or_edges.split("|"):
+                    ss, vv, tt = u.split("~")
+                    utg1.append( (ss, tt, vv) )
+            else:
+               utg1.append(u1)
+        for s2, v2, t2 in path2:
+            u2 = (s2, t2, v2)
+            type_, length, score, path_or_edges =  utg_data[ u2 ]
+            if type_ == "compound":
+                for u in path_or_edges.split("|"):
+                    ss, vv, tt = u.split("~")
+                    utg2.append( (ss, tt, vv) )
+            else:
+               utg2.append(u2) 
+        #print path1
+        #print path2
+        #print len(utg1), len(utg2)
+        for u1 in utg1:
+            for u2 in utg2:
+                u1 = tuple(u1)
+                u2 = tuple(u2)
+                c = utg_pair_links.get( (u1, u2), set() )
+                if len(c) == 0:
+                    continue
+                s1,t1,v1 = u1
+                s2,t2,v2 = u2
+                len_1 = ctg_data[ ctg1 ][ 3 ]
+                len_2 = ctg_data[ ctg2 ][ 3 ]
+                print ctg1, ctg2, len_1, len_2, len(utg1), len(utg2), len(links), "~".join( (s1,v1,t1) ),  "~".join( (s2,v2,t2) ), len(c)
+        
+
+
diff --git a/src/py/mains/dedup_a_tigs.py b/src/py/mains/dedup_a_tigs.py
new file mode 100644
index 0000000..95fb979
--- /dev/null
+++ b/src/py/mains/dedup_a_tigs.py
@@ -0,0 +1,23 @@
+from falcon_kit.FastaReader import FastaReader
+import argparse
+import sys
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='remove duplicate a-tigs; assumes the working directory contains the a_ctg_all.fa file')
+    parser.add_argument('--max_idt', type=int, help="keep a-tig if the identity (in %) to the primary contig is <= max_idt", default = 96)
+    parser.add_argument('--max_aln_cov', type=int, help="keep a-tig if the alignment coverage (in %) on the a-tig is <= max_aln_cov", default = 97)
+    parser.add_argument('--min_len_diff', type=int, help="keep a-tig if the length difference > min_len_diff", default = 500)
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    reads = FastaReader("a_ctg_all.fa")
+    with open("a_ctg.fa","w") as f:
+        for r in reads:
+            tig_id, v, w, len_, ovl, ne, delta_l, idt, cov = r.name.split()
+            if 100*float(idt) > args.max_idt and 100*float(cov) > args.max_aln_cov and\
+               abs(int(delta_l)) < args.min_len_diff:
+                   continue
+            print >>f, ">"+r.name
+            print >>f, r.sequence
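
The keep/drop rule above, restated as a small predicate; field names follow the a_ctg_all.fa header parsed in main(), and the default thresholds are the argparse defaults:

    def keep_a_tig(idt, cov, delta_l, max_idt=96, max_aln_cov=97, min_len_diff=500):
        # an a-tig is dropped only when it is highly identical to the primary
        # contig, almost fully covered by the alignment, and nearly the same length
        drop = (100 * float(idt) > max_idt and
                100 * float(cov) > max_aln_cov and
                abs(int(delta_l)) < min_len_diff)
        return not drop
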
diff --git a/src/py/mains/graph_to_contig.py b/src/py/mains/graph_to_contig.py
new file mode 100644
index 0000000..0a1ecc1
--- /dev/null
+++ b/src/py/mains/graph_to_contig.py
@@ -0,0 +1,297 @@
+import networkx as nx
+#from pbcore.io import FastaReader
+from falcon_kit.FastaReader import FastaReader
+from falcon_kit import kup, falcon, DWA
+
+read_fasta = "preads4falcon.fasta"
+edge_data_file = "sg_edges_list"
+utg_data_file = "utg_data"
+ctg_data_file = "ctg_paths"
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+def rc(seq):
+    return "".join([RCMAP[c] for c in seq[::-1]])
+
+def get_aln_data(t_seq, q_seq):
+    aln_data = []
+    x = []
+    y = []
+    K = 8
+    seq0 = t_seq
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+    q_id = "dummy"
+
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+
+    if kmer_match.count != 0:
+        aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 12)
+        aln_range = aln_range_ptr[0]
+        x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count)] )
+
+        s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
+
+        if e1 - s1 > 100:
+
+            alignment = DWA.align(q_seq[s1:e1], e1-s1,
+                                  seq0[s2:e2], e2-s2,
+                                  1500,1)
+
+            if alignment[0].aln_str_size > 100:
+                aln_data.append( ( q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist ) )
+                aln_str1 = alignment[0].q_aln_str
+                aln_str0 = alignment[0].t_aln_str
+
+            DWA.free_alignment(alignment)
+
+        kup.free_aln_range(aln_range_ptr)
+
+    kup.free_kmer_match(kmer_match_ptr)
+    kup.free_kmer_lookup(lk_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_seq_addr_array(sda_ptr)
+    return aln_data, x, y
+
+def reverse_end( node_id ):
+    node_id, end = node_id.split(":")
+    new_end = "B" if end == "E" else "E"
+    return node_id + ":" + new_end
+
+def main(argv=None):
+    reads_in_layout = set()
+    with open(edge_data_file) as f:
+        for l in f:
+            l = l.strip().split()
+            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62"""
+            v, w, rid, s, t, aln_score, idt, type_ = l
+            if type_ != "G":
+                continue
+            r1 = v.split(":")[0]
+            reads_in_layout.add(r1)
+            r2 = w.split(":")[0]
+            reads_in_layout.add(r2)
+
+    seqs = {}
+    # load the sequences of all p-reads used in the layout into memory
+    f = FastaReader(read_fasta)
+    for r in f:
+        if r.name not in reads_in_layout:
+            continue
+        seqs[r.name] = r.sequence.upper()
+
+    edge_data = {}
+    with open(edge_data_file) as f:
+        for l in f:
+            l = l.strip().split()
+            """001039799:E 000333411:E 000333411 17524 20167 17524 99.62"""
+            v, w, rid, s, t, aln_score, idt, type_ = l
+
+            if type_ != "G":
+                continue
+            r1 = v.split(":")[0]
+            reads_in_layout.add(r1)
+            r2 = w.split(":")[0]
+            reads_in_layout.add(r2)
+
+            s = int(s)
+            t = int(t)
+            aln_score = int(aln_score)
+            idt = float(idt)
+
+            if s < t:
+                e_seq = seqs[ rid ][ s:t ]
+            else:
+                e_seq = "".join([ RCMAP[c] for c in seqs[ rid ][ s:t:-1 ] ])
+            edge_data[ (v, w) ] = ( rid, s, t, aln_score, idt, e_seq )
+
+    utg_data = {}
+    with open(utg_data_file) as f:
+        for l in f:
+            l = l.strip().split()
+            s, v, t, type_, length, score, path_or_edges = l
+            if type_ not in ["compound", "simple", "contained"]:
+                continue
+            length = int(length)
+            score = int(score)
+            if type_ in ("simple", "contained"):
+                path_or_edges = path_or_edges.split("~")
+            else:
+                path_or_edges = [ tuple(e.split("~")) for e in path_or_edges.split("|") ]
+            utg_data[ (s,v,t) ] = type_, length, score, path_or_edges
+
+    p_ctg_out = open("p_ctg.fa","w")
+    a_ctg_out = open("a_ctg_all.fa","w")
+    a_ctg_base_out = open("a_ctg_base.fa","w")
+    p_ctg_t_out = open("p_ctg_tiling_path","w")
+    a_ctg_t_out = open("a_ctg_tiling_path","w")
+    a_ctg_base_t_out = open("a_ctg_base_tiling_path","w")
+    layout_ctg = set()
+
+    with open(ctg_data_file) as f:
+        for l in f:
+            l = l.strip().split()
+            ctg_id, c_type_, i_utig, t0, length, score, utgs = l
+            ctg_id = ctg_id
+            s0 = i_utig.split("~")[0]
+
+            if (reverse_end(t0), reverse_end(s0)) in layout_ctg:
+                continue
+            else:
+                layout_ctg.add( (s0, t0) )
+
+            ctg_label = i_utig+"~"+t0
+            length = int(length)
+            utgs = utgs.split("|")
+            one_path = []
+            total_score = 0
+            total_length =0
+
+            #a_ctg_data = []
+            a_ctg_group = {}
+
+            for utg in utgs:
+                s,v,t  = utg.split("~")
+                type_, length, score, path_or_edges = utg_data[ (s,v,t) ]
+                total_score += score
+                total_length += length
+                if type_ == "simple":
+                    if len(one_path) != 0:
+                        one_path.extend ( path_or_edges[1:] )
+                    else:
+                        one_path.extend ( path_or_edges )
+                if type_ == "compound":
+
+                    c_graph = nx.DiGraph()
+
+                    all_alt_path = []
+                    for ss, vv, tt in path_or_edges:
+                        type_, length, score, sub_path = utg_data[ (ss,vv,tt) ]
+
+                        v1 = sub_path[0]
+                        for v2 in sub_path[1:]:
+                            c_graph.add_edge( v1, v2, e_score = edge_data[ (v1, v2) ][3]  )
+                            v1 = v2
+
+                    shortest_path = nx.shortest_path( c_graph, s, t, "e_score" )
+                    score = nx.shortest_path_length( c_graph, s, t, "e_score" )
+                    all_alt_path.append( (score, shortest_path) )
+
+                    #a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
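+                    # enumerate alternative paths: repeatedly strip the edges of the path
+                    # just found and search again until no s -> t path remains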
+                    while 1:
+                        n0 = shortest_path[0]
+                        for n1 in shortest_path[1:]:
+                            c_graph.remove_edge(n0, n1)
+                            n0 = n1
+                        try:
+                            shortest_path = nx.shortest_path( c_graph, s, t, "e_score" )
+                            score = nx.shortest_path_length( c_graph, s, t, "e_score" )
+                            #a_ctg_data.append( (s, t, shortest_path) )
+                            all_alt_path.append( (score, shortest_path) )
+
+                        except nx.exception.NetworkXNoPath:
+                            break
+                        #if len(shortest_path) < 2:
+                        #    break
+                    all_alt_path.sort()
+                    all_alt_path.reverse()
+                    shortest_path = all_alt_path[0][1]
+                    if len(one_path) != 0:
+                        one_path.extend ( shortest_path[1:] )
+                    else:
+                        one_path.extend ( shortest_path )
+
+                    a_ctg_group[ (s, t) ] = all_alt_path
+
+            if len(one_path) == 0:
+                continue
+
+            one_path_edges = zip(one_path[:-1], one_path[1:])
+
+            sub_seqs = []
+            for vv, ww in one_path_edges:
+                rid, s, t, aln_score, idt, e_seq = edge_data[ (vv, ww) ]
+                sub_seqs.append( e_seq )
+                print >> p_ctg_t_out, "%s %s %s %s %d %d %d %0.2f" % (ctg_id, vv, ww, rid, s, t, aln_score, idt)
+            print >> p_ctg_out, ">%s %s %s %d %d" % (ctg_id, ctg_label, c_type_, total_length, total_score)
+            print >> p_ctg_out, "".join(sub_seqs)
+
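+            # emit associated contigs: for each compound region the first (base) path is
+            # the one already used in the primary contig; every alternative path is
+            # aligned back to it to report length difference, identity and coverage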
+            a_id = 1
+            for v, w in a_ctg_group:
+                #get the base sequence used in the primary contig
+                #count = len( [x for x in a_ctg_group[ (v, w) ] if len(x[1]) > 3] )
+                #if count < 2:
+                #    continue
+                atig_output = []
+
+                score, atig_path = a_ctg_group[ (v, w) ][0]
+                atig_path_edges = zip(atig_path[:-1], atig_path[1:])
+                sub_seqs = []
+                total_length = 0
+                total_score = 0
+                for vv, ww in atig_path_edges:
+                    rid, s, t, aln_score, idt, e_seq = edge_data[ (vv, ww) ]
+                    sub_seqs.append( e_seq )
+                    total_length += abs(s-t)
+                    total_score += aln_score
+
+                base_seq = "".join(sub_seqs)
+                atig_output.append( (v, w, atig_path, total_length, total_score, base_seq, atig_path_edges, 0, 1, 1) )
+
+                for score, atig_path in a_ctg_group[ (v, w) ][1:]:
+                    atig_path_edges = zip(atig_path[:-1], atig_path[1:])
+                    sub_seqs = []
+                    total_length = 0
+                    total_score = 0
+                    for vv, ww in atig_path_edges:
+                        rid, s, t, aln_score, idt, e_seq = edge_data[ (vv, ww) ]
+                        sub_seqs.append( e_seq )
+                        total_length += abs(s-t)
+                        total_score += aln_score
+
+                    seq = "".join(sub_seqs)
+
+                    delta_len = len(seq) - len(base_seq)
+                    idt = 0.0
+                    cov = 0.0
+                    if len(base_seq) > 2000 and len(seq) > 2000:
+                        aln_data, x, y = get_aln_data(base_seq, seq)
+                        if len( aln_data ) != 0:
+                            idt =  1.0-1.0*aln_data[-1][-1] / aln_data[-1][-2]
+                            cov = 1.0*(aln_data[-1][3]-aln_data[-1][2])/aln_data[-1][4]
+
+                    atig_output.append( (v, w, atig_path, total_length, total_score, seq, atig_path_edges, delta_len, idt, cov) )
+
+                if len(atig_output) == 1:
+                    continue
+
+                sub_id = 0
+                for data in atig_output:
+                    v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, delta_len, a_idt, cov = data
+                    for vv, ww in atig_path_edges:
+                        rid, s, t, aln_score, idt, e_seq = edge_data[ (vv, ww) ]
+                        if sub_id != 0:
+                            print >> a_ctg_t_out, "%s-%03d-%02d %s %s %s %d %d %d %0.2f" % (ctg_id, a_id, sub_id, vv, ww, rid, s, t, aln_score, idt)
+                        else:
+                            print >> a_ctg_base_t_out, "%s-%03d-%02d %s %s %s %d %d %d %0.2f" % (ctg_id, a_id, sub_id, vv, ww, rid, s, t, aln_score, idt)
+
+                    if sub_id != 0:
+                        print >> a_ctg_out, ">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" % (ctg_id, a_id, sub_id, v0, w0, total_length, total_score, len(atig_path_edges), delta_len, a_idt, cov )
+                        print >> a_ctg_out, seq
+                    else:
+                        print >> a_ctg_base_out, ">%s-%03d-%02d %s %s %d %d %d %d %0.2f %0.2f" % (ctg_id, a_id, sub_id, v0, w0, total_length, total_score, len(atig_path_edges), delta_len, a_idt, cov )
+                        print >> a_ctg_base_out, seq
+
+                    sub_id += 1
+
+                a_id += 1
+
+    a_ctg_out.close()
+    a_ctg_base_out.close()
+    p_ctg_out.close()
+    a_ctg_t_out.close()
+    a_ctg_base_t_out.close()
+    p_ctg_t_out.close()
diff --git a/src/py/mains/graph_to_utgs.py b/src/py/mains/graph_to_utgs.py
new file mode 100644
index 0000000..d0fc548
--- /dev/null
+++ b/src/py/mains/graph_to_utgs.py
@@ -0,0 +1,160 @@
+from falcon_kit import kup, falcon, DWA
+from falcon_kit.fc_asm_graph import AsmGraph
+import networkx as nx
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+def rc(seq):
+    return "".join([RCMAP[c] for c in seq[::-1]])
+
+def get_aln_data(t_seq, q_seq):
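+    """Roughly align q_seq to t_seq: seed with 8-mer matches (kup), pick the best
+    matching range, then run the banded dynamic-programming aligner (DWA) over
+    that range.  Returns (aln_data, x, y), where x/y are the matched k-mer
+    coordinate lists."""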
+    aln_data = []
+    K = 8
+    seq0 = t_seq
+    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
+    sa_ptr = kup.allocate_seq( len(seq0) )
+    sda_ptr = kup.allocate_seq_addr( len(seq0) )
+    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
+    q_id = "dummy"
+    
+    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr, lk_ptr)
+    kmer_match = kmer_match_ptr[0]
+    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 12)
+    aln_range = aln_range_ptr[0]
+    x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count)] )
+    kup.free_kmer_match(kmer_match_ptr)
+    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
+    
+    if e1 - s1 > 100:
+
+        alignment = DWA.align(q_seq[s1:e1], e1-s1,
+                              seq0[s2:e2], e2-s2,
+                              1500,1)
+
+        if alignment[0].aln_str_size > 100:
+            aln_data.append( ( q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist ) )
+            aln_str1 = alignment[0].q_aln_str
+            aln_str0 = alignment[0].t_aln_str
+
+        DWA.free_alignment(alignment)
+
+    kup.free_kmer_lookup(lk_ptr)
+    kup.free_seq_array(sa_ptr)
+    kup.free_seq_addr_array(sda_ptr)
+    return aln_data, x, y
+
+def main(argv=None):
+  G_asm = AsmGraph("sg_edges_list", "utg_data", "ctg_paths")
+  G_asm.load_sg_seq("preads4falcon.fasta")
+
+  utg_out = open("utgs.fa","w")
+
+
+  for utg in G_asm.utg_data:
+    s,t,v  = utg
+    type_, length, score, path_or_edges = G_asm.utg_data[ (s,t,v) ]
+    if type_ == "simple":
+        path_or_edges = path_or_edges.split("~")
+        seq = G_asm.get_seq_from_path( path_or_edges )
+        print >> utg_out, ">%s~%s~%s-%d %d %d" % (s, v, t, 0, length, score ) 
+        print >> utg_out, seq
+
+    if type_ == "compound":
+
+        c_graph = nx.DiGraph()
+
+        all_alt_path = []
+        path_or_edges = [ c.split("~") for c in path_or_edges.split("|")]
+        for ss, vv, tt in path_or_edges:
+            type_, length, score, sub_path = G_asm.utg_data[ (ss,tt,vv) ]
+             
+            sub_path = sub_path.split("~")
+            v1 = sub_path[0]
+            for v2 in sub_path[1:]:
+                c_graph.add_edge( v1, v2, e_score = G_asm.sg_edges[ (v1, v2) ][1]  )
+                v1 = v2
+        
+        shortest_path = nx.shortest_path( c_graph, s, t, "e_score" )
+        score = nx.shortest_path_length( c_graph, s, t, "e_score" )
+        all_alt_path.append( (score, shortest_path) )
+        
+
+        #a_ctg_data.append( (s, t, shortest_path) ) #first path is the same as the one used in the primary contig
+        while 1:
+            if s == t:
+                break
+            n0 = shortest_path[0]
+            for n1 in shortest_path[1:]:
+                c_graph.remove_edge(n0, n1)
+                n0 = n1
+            try:
+                shortest_path = nx.shortest_path( c_graph, s, t, "e_score" )
+                score = nx.shortest_path_length( c_graph, s, t, "e_score" )
+                #a_ctg_data.append( (s, t, shortest_path) )
+                all_alt_path.append( (score, shortest_path) )
+
+            except nx.exception.NetworkXNoPath:
+                break
+            #if len(shortest_path) < 2:
+            #    break
+
+        all_alt_path.sort()
+        all_alt_path.reverse()
+        shortest_path = all_alt_path[0][1]
+
+        
+        score, atig_path = all_alt_path[0]
+
+        atig_output = []
+
+        atig_path_edges = zip(atig_path[:-1], atig_path[1:])
+        sub_seqs = []
+        total_length = 0
+        total_score = 0
+        for vv, ww in atig_path_edges:
+            r, aln_score, idt, typs_  = G_asm.sg_edges[ (vv, ww) ]
+            e_seq  = G_asm.sg_edge_seqs[ (vv, ww) ]
+            rid, ss, tt = r
+            sub_seqs.append( e_seq )
+            total_length += abs(ss-tt)
+            total_score += aln_score
+
+        base_seq = "".join(sub_seqs)
+        atig_output.append( (s, t, atig_path, total_length, total_score, base_seq, atig_path_edges, 1, 1) )
+
+
+        duplicated = True
+        for score, atig_path in all_alt_path[1:]:
+            atig_path_edges = zip(atig_path[:-1], atig_path[1:])
+            sub_seqs = []
+            total_length = 0
+            total_score = 0
+            for vv, ww in atig_path_edges:
+                r, aln_score, idt, type_ = G_asm.sg_edges[ (vv, ww) ]
+                e_seq  = G_asm.sg_edge_seqs[ (vv, ww) ]
+                rid, ss, tt = r
+                sub_seqs.append( e_seq )
+                total_length += abs(ss-tt)
+                total_score += aln_score
+
+            seq = "".join(sub_seqs)
+
+            aln_data, x, y = get_aln_data(base_seq, seq)
+            if len( aln_data ) != 0:
+                idt =  1.0-1.0*aln_data[-1][-1] / aln_data[-1][-2]
+                cov = 1.0*(aln_data[-1][3]-aln_data[-1][2])/aln_data[-1][4]
+                if idt < 0.96 or cov < 0.98:
+                    duplicated = False
+                    atig_output.append( (s, t, atig_path, total_length, total_score, seq, atig_path_edges, idt, cov) )
+            else:
+                duplicated = False
+                atig_output.append( (s, t, atig_path, total_length, total_score, seq, atig_path_edges, 0, 0) )
+
+        #if len(atig_output) == 1:
+        #    continue
+
+        sub_id = 0
+        for data in atig_output:
+            v0, w0, tig_path, total_length, total_score, seq, atig_path_edges, a_idt, cov = data
+            print >> utg_out, ">%s~%s~%s-%d %d %d" % (v0, "NA", w0, sub_id,  total_length, total_score ) 
+            print >> utg_out, seq
+            sub_id += 1
diff --git a/src/py/mains/ovlp_filter.py b/src/py/mains/ovlp_filter.py
new file mode 100644
index 0000000..4b0e7cd
--- /dev/null
+++ b/src/py/mains/ovlp_filter.py
@@ -0,0 +1,265 @@
+from falcon_kit.multiproc import Pool
+import falcon_kit.util.io as io
+import argparse
+import sys
+
+Reader = io.CapturedProcessReaderContext
+
+
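+# The filter makes three passes over the LA4Falcon overlap records:
+#   stage 1: flag reads whose 5'/3' overlap counts are unbalanced or outside the
+#            [min_cov, max_cov] range
+#   stage 2: identify contained reads, skipping reads flagged in stage 1
+#   stage 3: output the retained overlaps, keeping at least the best n per read
+#            end and skipping flagged and contained reads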
+def run_filter_stage1(db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len):
+    cmd = "LA4Falcon -mo %s %s" % (db_fn, fn)
+    reader = Reader(cmd)
+    with reader:
+        return fn, filter_stage1(reader.readlines, max_diff, max_ovlp, min_ovlp, min_len)
+def filter_stage1(readlines, max_diff, max_ovlp, min_ovlp, min_len):
+        def ignore(overlap_data):
+            left_count = overlap_data["5p"]
+            right_count = overlap_data["3p"]
+            if abs(left_count - right_count) > max_diff:
+                return True
+            elif left_count > max_ovlp or right_count > max_ovlp:
+                return True
+            elif left_count < min_ovlp or right_count < min_ovlp:
+                return True
+
+        ignore_rtn = []
+        current_q_id = None
+        ave_idt = 0.0
+        all_over_len = 0.0
+        overlap_data = {"5p":0, "3p":0}
+        q_id = None
+        for l in readlines():
+            l = l.strip().split()
+            q_id, t_id = l[:2]
+
+            if q_id != current_q_id:
+                if current_q_id is not None:
+                    if ignore(overlap_data):
+                        ignore_rtn.append( current_q_id )
+                overlap_data = {"5p":0, "3p":0}
+                current_q_id = q_id
+                ave_idt = 0.0
+                all_over_len = 0.0
+
+            overlap_len = -int(l[2])
+            idt = float(l[3])
+            q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+            t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+            if idt < 90.0:
+                continue
+            if q_l < min_len or t_l < min_len:
+                continue
+            if l[-1] in ("contains", "overlap"):
+                ave_idt += idt * overlap_len
+                all_over_len += overlap_len
+            if q_s == 0:
+                overlap_data["5p"] += 1
+            if q_e == q_l:
+                overlap_data["3p"] += 1
+        if q_id is not None:
+            if ignore(overlap_data):
+                ignore_rtn.append( current_q_id )
+        return ignore_rtn
+
+def run_filter_stage2(db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set):
+    cmd = "LA4Falcon -mo %s %s" % (db_fn, fn)
+    reader = Reader(cmd)
+    with reader:
+        return fn, filter_stage2(reader.readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set)
+def filter_stage2(readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set):
+        contained_id = set()
+        for l in readlines():
+            l = l.strip().split()
+            q_id, t_id = l[:2]
+
+            q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+            t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+            idt = float(l[3])
+            if idt < 90:
+                continue
+
+            if q_l < min_len or t_l < min_len:
+                continue
+
+            if q_id in ignore_set:
+                continue
+            if t_id in ignore_set:
+                continue
+            if l[-1] == "contained":
+                contained_id.add(q_id)
+            if l[-1] == "contains":
+                contained_id.add(t_id)
+        return contained_id 
+
+def run_filter_stage3(db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn):
+    cmd = "LA4Falcon -mo %s %s" % (db_fn, fn)
+    reader = Reader(cmd)
+    with reader:
+        return fn, filter_stage3(reader.readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn)
+def filter_stage3(readlines, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn):
+        ovlp_output = []
+        overlap_data = {"5p":[], "3p":[]}
+        current_q_id = None
+        for l in readlines():
+            l = l.strip().split()
+            q_id, t_id = l[:2]
+
+
+            if current_q_id == None:
+                current_q_id = q_id
+                overlap_data = {"5p":[], "3p":[]}
+
+            elif q_id != current_q_id:
+
+                left = overlap_data["5p"]
+                right = overlap_data["3p"]
+                left.sort()
+                right.sort()
+
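+                # lists are sorted so the longest overlaps come first; keep emitting
+                # until at least bestn overlaps are out and the unaligned part of the
+                # target read exceeds 1 kb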
+                for i in xrange(len(left)):
+                    score, m_range, ovlp = left[i]
+                    ovlp_output.append(ovlp)
+                    #print " ".join(ovlp), read_end_data[current_q_id] 
+                    if i >= bestn and m_range > 1000:
+                        break
+                
+                for i in xrange(len(right)):
+                    score, m_range, ovlp = right[i]
+                    ovlp_output.append(ovlp)
+                    #print " ".join(ovlp), read_end_data[current_q_id]
+                    if i >= bestn and m_range > 1000:
+                        break
+
+                overlap_data = {"5p":[], "3p":[]}
+                current_q_id = q_id
+
+            if q_id in contained_set:
+                continue
+            if t_id in contained_set:
+                continue
+            if q_id in ignore_set:
+                continue
+            if t_id in ignore_set:
+                continue
+
+            overlap_len = -int(l[2])
+            idt = float(l[3])
+            q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+            t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+            if idt < 90:
+                continue
+            if q_l < min_len or t_l < min_len:
+                continue
+
+            if q_s == 0:
+                overlap_data["5p"].append( (-overlap_len,  t_l - (t_e - t_s),  l) )
+            elif q_e == q_l:
+                overlap_data["3p"].append( (-overlap_len, t_l - (t_e - t_s), l) )
+
+        left = overlap_data["5p"]
+        right = overlap_data["3p"]
+        left.sort()
+        right.sort()
+
+
+        for i in xrange(len(left)):
+            score, m_range, ovlp = left[i]
+            ovlp_output.append(ovlp)
+            #print " ".join(ovlp), read_end_data[current_q_id] 
+            if i >= bestn and m_range > 1000:
+                break
+
+        for i in xrange(len(right)):
+            score, m_range, ovlp = right[i]
+            ovlp_output.append(ovlp)
+            #print " ".join(ovlp), read_end_data[current_q_id]
+            if i >= bestn and m_range > 1000:
+                break
+
+        return ovlp_output
+
+def run_ovlp_filter(exe_pool, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
+    io.LOG('preparing filter_stage1')
+    io.logstats()
+    inputs = []
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append( (run_filter_stage1, db_fn, fn, max_diff, max_cov, min_cov, min_len) )
+    
+    ignore_all = []
+    for res in exe_pool.imap(io.run_func, inputs):  
+        ignore_all.extend( res[1] )
+
+    io.LOG('preparing filter_stage2')
+    io.logstats()
+    inputs = []
+    ignore_all = set(ignore_all)
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append( (run_filter_stage2, db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all) )
+    contained = set()
+    for res in exe_pool.imap(io.run_func, inputs):  
+        contained.update(res[1])
+        #print res[0], len(res[1]), len(contained)
+
+    #print "all", len(contained)
+    io.LOG('preparing filter_stage3')
+    io.logstats()
+    inputs = []
+    ignore_all = set(ignore_all)
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append( (run_filter_stage3, db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all, contained, bestn) )
+    for res in exe_pool.imap(io.run_func, inputs):  
+        for l in res[1]:
+            print " ".join(l)
+    io.logstats()
+
+def try_run_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
+    io.LOG('starting ovlp_filter')
+    file_list = io.validated_fns(fofn)
+    io.LOG('fofn %r: %r' %(fofn, file_list))
+    n_core = min(n_core, len(file_list))
+    exe_pool = Pool(n_core)
+    try:
+        run_ovlp_filter(exe_pool, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn)
+        io.LOG('finished ovlp_filter')
+    except KeyboardInterrupt:
+        io.LOG('terminating ovlp_filter workers...')
+        exe_pool.terminate()
+
+def ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn, debug, silent, stream):
+    if debug:
+        n_core = 0
+        silent = False
+    if silent:
+        io.LOG = io.write_nothing
+    if stream:
+        global Reader
+        Reader = io.StreamedProcessReaderContext
+    try_run_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn)
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='a simple multi-process LAS overlap data filter')
+    parser.add_argument('--n_core', type=int, default=4,
+                        help='number of processes used for overlap filtering; '
+                        '0 for main process only (default=%(default)s)')
+    parser.add_argument('--fofn', type=str, help='file containing the paths of all LAS files to be processed in parallel')
+    parser.add_argument('--db', type=str, dest='db_fn', help='read db file path')
+    parser.add_argument('--max_diff', type=int, help="max difference between the 5' and 3' overlap counts")
+    parser.add_argument('--max_cov', type=int, help="max coverage at the 5' or 3' end")
+    parser.add_argument('--min_cov', type=int, help="min coverage at the 5' or 3' end")
+    parser.add_argument('--min_len', type=int, default=2500, help="min length of the reads (default=%(default)s)")
+    parser.add_argument('--bestn', type=int, default=10, help="output at least the best n overlaps at the 5' and 3' ends, if possible (default=%(default)s)")
+    parser.add_argument('--stream', action='store_true', help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
+    parser.add_argument('--debug', '-g', action='store_true', help="single-threaded, plus other aids to debugging")
+    parser.add_argument('--silent', action='store_true', help="suppress cmd reporting on stderr")
+    args = parser.parse_args(argv[1:])
+    return args
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    ovlp_filter(**vars(args))
diff --git a/src/py/mains/ovlp_stats.py b/src/py/mains/ovlp_stats.py
new file mode 100644
index 0000000..403f756
--- /dev/null
+++ b/src/py/mains/ovlp_stats.py
@@ -0,0 +1,115 @@
+from falcon_kit.multiproc import Pool
+import falcon_kit.util.io as io
+import argparse
+import shlex
+import subprocess as sp
+import sys
+
+Reader = io.CapturedProcessReaderContext
+
+
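+# For every read, count the overlaps that reach its 5' and 3' ends and report one
+# line per read: read_id read_length count_5p count_3p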
+def filter_stats(readlines, min_len):
+        current_q_id = None
+        ave_idt = 0.0
+        all_over_len = 0.0
+        overlap_data = {"5p":0, "3p":0}
+        q_id = None
+        rtn_data = []
+        q_l = 0
+        for l in readlines():
+            l = l.strip().split()
+            q_id, t_id = l[:2]
+
+            if q_id != current_q_id:
+                left_count = overlap_data["5p"]
+                right_count = overlap_data["3p"]
+                if (current_q_id != None and
+                        (left_count > 0 or right_count > 0)):
+                    rtn_data.append( (current_q_id, q_l, left_count, right_count  ) )
+                overlap_data = {"5p":0, "3p":0}
+                current_q_id = q_id
+                ave_idt = 0.0
+                all_over_len = 0.0
+
+            overlap_len = -int(l[2])
+            idt = float(l[3])
+            q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+            t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+            if q_l < min_len or t_l < min_len:
+                continue
+
+            if idt < 90:
+                continue
+
+            if l[-1] in ("contains", "overlap"):
+                ave_idt += idt * overlap_len
+                all_over_len += overlap_len
+            if q_s == 0:
+                overlap_data["5p"] += 1
+            if q_e == q_l:
+                overlap_data["3p"] += 1
+
+        if q_id != None:
+            left_count = overlap_data["5p"]
+            right_count = overlap_data["3p"]
+            if (left_count > 0 or right_count > 0):
+                rtn_data.append( (q_id, q_l, left_count, right_count  ) )
+
+        return rtn_data
+
+
+def run_filter_stats(fn, min_len):
+    cmd = "LA4Falcon -mo ../1-preads_ovl/preads.db %s" % fn
+    reader = Reader(cmd)
+    with reader:
+        return fn, filter_stats(reader.readlines, min_len)
+
+def run_ovlp_stats(exe_pool, file_list, min_len):
+    inputs = []
+    for fn in file_list:
+        if len(fn) != 0:
+            inputs.append( (run_filter_stats, fn, min_len ) )
+    for res in exe_pool.imap(io.run_func, inputs):
+        for l in res[1]:
+            print " ".join([str(c) for c in l])
+
+def try_run_ovlp_stats(n_core, fofn, min_len):
+    io.LOG('starting ovlp_stats')
+    file_list = io.validated_fns(fofn)
+    io.LOG('fofn %r: %r' %(fofn, file_list))
+    n_core = min(n_core, len(file_list))
+    exe_pool = Pool(n_core)
+    try:
+        run_ovlp_stats(exe_pool, file_list, min_len)
+        io.LOG('finished ovlp_stats')
+    except KeyboardInterrupt:
+        io.LOG('terminating ovlp_stats workers...')
+        exe_pool.terminate()
+
+def ovlp_stats(fofn, min_len, n_core, stream, debug, silent):
+    if debug:
+        n_core = 0
+        silent = False
+    if silent:
+        io.LOG = io.write_nothing
+    if stream:
+        global Reader
+        Reader = io.StreamedProcessReaderContext
+    try_run_ovlp_stats(n_core, fofn, min_len)
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='a simple multi-process LAS overlap statistics tool')
+    parser.add_argument('--n_core', type=int, default=4,
+                        help='number of processes used for computing overlap statistics; '
+                        '0 for main process only (default=%(default)s)')
+    parser.add_argument('--fofn', type=str, help='file containing the paths of all LAS files to be processed in parallel')
+    parser.add_argument('--min_len', type=int, default=2500, help="min length of the reads")
+    parser.add_argument('--stream', action='store_true', help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
+    parser.add_argument('--debug', '-g', action='store_true', help="single-threaded, plus other aids to debugging")
+    parser.add_argument('--silent', action='store_true', help="suppress cmd reporting on stderr")
+    return parser.parse_args(argv[1:])
+
+def main(argv=sys.argv):
+    args = parse_args(argv)
+    ovlp_stats(**vars(args))
diff --git a/src/py/mains/ovlp_to_graph.py b/src/py/mains/ovlp_to_graph.py
new file mode 100644
index 0000000..371d2aa
--- /dev/null
+++ b/src/py/mains/ovlp_to_graph.py
@@ -0,0 +1,1441 @@
+#from pbcore.io import FastaReader
+import networkx as nx
+import os
+import shlex
+import subprocess
+import sys
+
+DEBUG_LOG_LEVEL = 0
+
+class SGNode(object):
+    """
+    class representing a node in the string graph
+    """
+    def __init__(self, node_name):
+        self.name = node_name
+        self.out_edges = []
+        self.in_edges = []
+    def add_out_edge(self, out_edge):
+        self.out_edges.append(out_edge)
+    def add_in_edge(self, in_edge):
+        self.in_edges.append(in_edge)
+
+class SGEdge(object):
+    """
+    class representing an edge in the string graph
+    """
+    def __init__(self, in_node, out_node):
+        self.in_node = in_node
+        self.out_node = out_node
+        self.attr = {}
+    def set_attribute(self, attr, value):
+        self.attr[attr] = value
+
+def reverse_end( node_id ):
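+    """Flip a node's end label, e.g. "000123:B" -> "000123:E"."""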
+    node_id, end = node_id.split(":")
+    new_end = "B" if end == "E" else "E"
+    return node_id + ":" + new_end
+
+class StringGraph(object):
+    """
+    class representing the string graph
+    """
+    def __init__(self):
+        self.nodes = {}
+        self.edges = {}
+        self.n_mark = {}
+        self.e_reduce = {}
+        self.repeat_overlap = {}
+        self.best_out = {}
+        self.best_in = {}
+        
+    def add_node(self, node_name):
+        """ 
+        add a node into the graph by given a node name
+        """
+        if node_name not in self.nodes:
+            self.nodes[node_name] = SGNode(node_name)
+    
+    def add_edge(self, in_node_name, out_node_name, **attributes):
+        """ 
+        add an edge into the graph by given a pair of nodes
+        """
+        if (in_node_name, out_node_name) not in self.edges:
+        
+            self.add_node(in_node_name)
+            self.add_node(out_node_name)
+            in_node = self.nodes[in_node_name]
+            out_node = self.nodes[out_node_name]    
+            
+            edge = SGEdge(in_node, out_node)
+            self.edges[ (in_node_name, out_node_name) ] = edge
+            in_node.add_out_edge(edge)
+            out_node.add_in_edge(edge)
+        edge =  self.edges[ (in_node_name, out_node_name) ]
+        for k, v in attributes.items():
+            edge.attr[k] = v
+
+    def init_reduce_dict(self):
+        for e in self.edges:
+            self.e_reduce[e] = False
+
+    def mark_chimer_edges(self):
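+        """
+        mark all edges of likely chimeric nodes as reduced: a node is treated as
+        chimeric when none of its in-neighbors has an out-edge to any of its
+        out-neighbors
+        """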
+
+        for n_name in self.nodes:
+            n = self.nodes[n_name]
+            
+            out_nodes = set( [ e.out_node for e in n.out_edges ] )
+            in_nodes = [e.in_node for e in n.in_edges ] 
+            is_chimer = True
+            for in_node in in_nodes:
+                for v in [e.out_node for e in in_node.out_edges]:
+                    if v in out_nodes:
+                        is_chimer = False
+                        break
+
+            if is_chimer == True:
+                for e in n.out_edges:
+                    v, w =  e.in_node.name, e.out_node.name
+                    self.e_reduce[ (v, w) ] = True
+                for e in n.in_edges:
+                    v, w =  e.in_node.name, e.out_node.name
+                    self.e_reduce[ (v, w) ] = True
+
+
+            # need to remove the node from the graph rather than just marking its edges as "reduced"?
+
+
+    def mark_spur_edge(self):
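+        """
+        mark spur edges as reduced: at branching nodes, drop edges that lead to
+        (or come from) dead-end neighbors, together with their reverse complements
+        """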
+
+        removed_edges = set()
+        for  v in self.nodes:
+            if len(self.nodes[v].out_edges) > 1:
+                for out_edge in self.nodes[v].out_edges:
+                    w = out_edge.out_node.name
+                    
+                    if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
+                        self.e_reduce[(v, w)] = True
+                        removed_edges.add( (v, w) )
+                        v2, w2 = reverse_end(w), reverse_end(v)
+                        self.e_reduce[(v2, w2)] = True
+                        removed_edges.add( (v2, w2) )
+
+            if len(self.nodes[v].in_edges) > 1:
+                for in_edge in self.nodes[v].in_edges:
+                    w = in_edge.in_node.name
+                    if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
+                        self.e_reduce[(w, v)] = True
+                        removed_edges.add( (w, v) )
+                        v2, w2 = reverse_end(w), reverse_end(v)
+                        self.e_reduce[(w2, v2)] = True
+                        removed_edges.add( (w2, v2) )
+        return removed_edges
+
+    def mark_tr_edges(self):
+        """
+        transitive reduction
+        """
+        n_mark = self.n_mark
+        e_reduce = self.e_reduce
+        FUZZ = 500
+        for n in self.nodes:
+            n_mark[n] = "vacant"
+    
+        for n_name, node in self.nodes.items():
+
+            out_edges = node.out_edges
+            if len(out_edges) == 0:
+                continue
+            
+            out_edges.sort(key=lambda x: x.attr["length"])
+            
+            for e in out_edges:
+                w = e.out_node
+                n_mark[ w.name ] = "inplay"
+            
+            max_len = out_edges[-1].attr["length"]
+                
+            max_len += FUZZ
+            
+            for e in out_edges:
+                e_len = e.attr["length"]
+                w = e.out_node
+                if n_mark[w.name] == "inplay":
+                    w.out_edges.sort( key=lambda x: x.attr["length"] )
+                    for e2 in w.out_edges:
+                        if e2.attr["length"] + e_len < max_len:
+                            x = e2.out_node
+                            if n_mark[x.name] == "inplay":
+                                n_mark[x.name] = "eliminated"
+            
+            for e in out_edges:
+                e_len = e.attr["length"]
+                w = e.out_node
+                w.out_edges.sort( key=lambda x: x.attr["length"] )
+                if len(w.out_edges) > 0:
+                    x = w.out_edges[0].out_node
+                    if n_mark[x.name] == "inplay":
+                        n_mark[x.name] = "eliminated"
+                for e2 in w.out_edges:
+                    if e2.attr["length"] < FUZZ:
+                        x = e2.out_node
+                        if n_mark[x.name] == "inplay":
+                            n_mark[x.name] = "eliminated"
+                            
+            for out_edge in out_edges:
+                v = out_edge.in_node
+                w = out_edge.out_node
+                if n_mark[w.name] == "eliminated":
+                    e_reduce[ (v.name, w.name) ] = True
+                    v_name, w_name = reverse_end(w.name), reverse_end(v.name)
+                    e_reduce[(v_name, w_name)] = True
+                n_mark[w.name] = "vacant"
+                
+
+    def mark_best_overlap(self):
+        """
+        find the best overlapped edges
+        """
+
+        best_edges = set()
+        removed_edges = set()
+
+        for v in self.nodes:
+
+            out_edges = self.nodes[v].out_edges
+            if len(out_edges) > 0:
+                out_edges.sort(key=lambda e: -e.attr["score"])
+                for e in out_edges:
+                    if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
+                        best_edges.add( (e.in_node.name, e.out_node.name) )
+                        self.best_out[v] = e.out_node.name
+                        break
+
+            in_edges = self.nodes[v].in_edges
+            if len(in_edges) > 0:
+                in_edges.sort(key=lambda e: -e.attr["score"])
+                for e in in_edges:
+                    if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
+                        best_edges.add( (e.in_node.name, e.out_node.name) )
+                        self.best_in[v] = e.in_node.name
+                        break
+
+        if DEBUG_LOG_LEVEL > 1:
+            print "X", len(best_edges)
+
+        for e_n, e in self.edges.items():
+            v = e_n[0]
+            w = e_n[1]
+            if self.e_reduce[ (v, w) ] != True:
+                if (v, w) not in best_edges:
+                    self.e_reduce[(v, w)] = True
+                    removed_edges.add( (v, w) )
+                    v2, w2 = reverse_end(w), reverse_end(v)
+                    self.e_reduce[(v2, w2)] = True
+                    removed_edges.add( (v2, w2) )
+                
+        return removed_edges
+
+    def resolve_repeat_edges(self):
+
+
+        edges_to_reduce = []
+        nodes_to_test = set()
+        for v_n, v in self.nodes.items():
+            
+            out_nodes = []
+            for e in v.out_edges:
+                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+                    out_nodes.append( e.out_node.name )
+
+            in_nodes = []
+            for e in v.in_edges:
+                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+                    in_nodes.append( e.in_node.name )
+
+            if len(out_nodes) == 1 and len(in_nodes)  == 1:
+                nodes_to_test.add(v_n)
+        
+        for v_n in list( nodes_to_test ):
+            
+            v = self.nodes[v_n]
+
+            out_nodes = []
+            for e in v.out_edges:
+                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+                    out_nodes.append( e.out_node.name )
+
+            in_nodes = []
+            for e in v.in_edges:
+                if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+                    in_nodes.append( e.in_node.name )
+
+            in_node_name = in_nodes[0] 
+
+            for out_edge in self.nodes[in_node_name].out_edges:
+                vv = out_edge.in_node.name
+                ww = out_edge.out_node.name
+
+                ww_out = self.nodes[ww].out_edges
+                v_out = self.nodes[v_n].out_edges
+                ww_out_nodes = set( [ n.out_node.name for n in ww_out] )
+                v_out_nodes = set(  [ n.out_node.name for n in v_out] )
+                o_overlap = len( ww_out_nodes & v_out_nodes )
+
+                ww_in_count = 0
+                for e in self.nodes[ww].in_edges:
+                    if self.e_reduce[ ( e.in_node.name, e.out_node.name ) ] == False:
+                        ww_in_count += 1
+
+                if ww != v_n and\
+                   self.e_reduce[ (vv, ww) ] == False and\
+                   ww_in_count > 1 and\
+                   ww not in nodes_to_test and\
+                   o_overlap == 0:
+                    edges_to_reduce.append( (vv, ww) )
+
+            out_node_name = out_nodes[0]
+
+            for in_edge in self.nodes[out_node_name].in_edges:
+                vv = in_edge.in_node.name
+                ww = in_edge.out_node.name
+
+                vv_in = self.nodes[vv].in_edges
+                v_in = self.nodes[v_n].in_edges
+                vv_in_nodes = set( [ n.in_node.name for n in vv_in] )
+                v_in_nodes = set(  [ n.in_node.name for n in v_in] )
+                i_overlap = len( vv_in_nodes & v_in_nodes )
+
+                vv_out_count = 0
+                for e in self.nodes[vv].out_edges:
+                    if self.e_reduce[ ( e.in_node.name, e.out_node.name )] == False:
+                        vv_out_count += 1
+
+                if vv != v_n and\
+                   self.e_reduce[ (vv, ww) ] == False and\
+                   vv_out_count > 1 and\
+                   vv not in nodes_to_test and\
+                   i_overlap == 0:
+                    edges_to_reduce.append( (vv, ww) )
+
+        removed_edges = set()
+        for e in edges_to_reduce:
+            self.e_reduce[e] = True
+            removed_edges.add(e)
+
+        return removed_edges
+
+    def get_out_edges_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].out_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        return rtn
+        
+        
+    def get_in_edges_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].in_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        return rtn
+
+    def get_best_out_edge_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].out_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        rtn.sort(key=lambda e: e.attr["score"])
+
+        return rtn[-1]
+
+    def get_best_in_edge_for_node(self, name, mask=True):
+        rtn = []
+        for e in self.nodes[name].in_edges:
+            v = e.in_node
+            w = e.out_node
+            if self.e_reduce[ (v.name, w.name) ] == False:
+                rtn.append(e)
+        rtn.sort(key=lambda e: e.attr["score"])
+        return rtn[-1]
+        
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+def generate_seq_from_path(sg, seqs, path):
+    subseqs = []
+    r_id, end = path[0].split(":")
+    
+    count = 0
+    for i in range( len( path ) -1 ):
+        w_n, v_n = path[i:i+2]
+        edge = sg.edges[ (w_n, v_n ) ]
+        read_id, coor = edge.attr["label"].split(":")
+        b,e = coor.split("-")
+        b = int(b)
+        e = int(e)
+        if b < e:
+            subseqs.append( seqs[read_id][b:e] )
+        else:
+            subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
+
+    return "".join(subseqs)
+
+
+def reverse_edge( e ):
+    e1, e2 = e
+    return reverse_end(e2), reverse_end(e1)
+
+def reverse_path( p ):
+    p = p[::-1]
+    return [reverse_end(n) for n in p]
+
+    
+def find_bundle(ug, u_edge_data, start_node, depth_cutoff, width_cutoff, length_cutoff):
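+    """Try to grow a "bundle" from start_node: the set of unitig-graph edges that
+    reconverges on a single end node within the given depth, width and length
+    cutoffs.  Returns (converged, data, data_r) where data is
+    (start_node, end_node, bundle_edges, length, score, depth) and data_r is
+    currently always None."""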
+
+    tips = set()
+    bundle_edges = set()
+    bundle_nodes = set()
+
+    local_graph = nx.ego_graph(ug, start_node, depth_cutoff, undirected=False)
+    length_to_node = {start_node:0}
+    score_to_node = {start_node:0}
+
+    v = start_node
+    end_node = start_node
+
+    if DEBUG_LOG_LEVEL > 1: 
+        print
+        print 
+        print "start", start_node
+
+    bundle_nodes.add(v)
+    for vv, ww, kk in local_graph.out_edges(v, keys = True):
+        max_score = 0
+        max_length = 0
+
+        if (vv, ww, kk) not in bundle_edges and\
+                reverse_end(ww) not in bundle_nodes:
+
+            bundle_edges.add( (vv, ww, kk) )
+            tips.add(ww)
+
+    for v in list(tips):
+        bundle_nodes.add(v)
+
+    depth = 1
+    width = 1.0
+    converage = False
+
+
+    while 1:
+        if DEBUG_LOG_LEVEL > 1:
+            print "# of tips", len(tips)
+
+        if len(tips) > 4:
+            converage = False
+            break
+
+        if len(tips) == 1:
+            end_node = tips.pop()
+
+            if DEBUG_LOG_LEVEL > 1:
+                print "end", end_node
+
+            if end_node not in length_to_node:
+                v = end_node
+                max_score_edge = None
+                max_score = 0
+                for uu, vv, kk in local_graph.in_edges(v, keys=True):
+                    if uu not in length_to_node:
+                        continue
+
+                    score = u_edge_data[ (uu, vv, kk) ][1]
+
+                    if score > max_score:
+
+                        max_score = score
+                        max_score_edge = (uu, vv, kk)
+
+                length_to_node[v] = length_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][0]
+                score_to_node[v] = score_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][1]
+                
+
+            converage = True
+            break
+        
+
+        depth += 1
+        width = 1.0 * len(bundle_edges) / depth
+
+        if depth > 10 and width > width_cutoff:
+            converage = False
+            break
+
+        if depth > depth_cutoff:
+            converage = False
+            break
+        
+        tips_list = list(tips)
+
+        tip_updated = False
+        loop_detect = False
+        length_limit_reached = False
+
+        for v in tips_list:
+            if DEBUG_LOG_LEVEL > 1:
+                print "process", v
+
+            if len(local_graph.out_edges(v, keys=True)) == 0: # dead end route
+                print "no out edge", v
+                continue
+
+            max_score_edge = None
+            max_score = 0
+
+            extend_tip = True
+
+            for uu, vv, kk in local_graph.in_edges(v, keys=True):
+                if DEBUG_LOG_LEVEL > 1: 
+                    print "in_edges", uu, vv, kk
+                    print uu, "in length_to_node",  uu in length_to_node
+
+                if uu not in length_to_node:
+                    extend_tip = False
+                    break
+
+                score = u_edge_data[ (uu, vv, kk) ][1]
+
+                if score > max_score:
+
+                    max_score = score
+                    max_score_edge = (uu, vv, kk)
+            
+            if extend_tip:
+            
+                length_to_node[v] = length_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][0]
+                score_to_node[v] = score_to_node[max_score_edge[0]] +  u_edge_data[ max_score_edge ][1]
+
+                if length_to_node[v] > length_cutoff:
+                    length_limit_reached = True
+                    converage = False
+                    break
+
+                v_updated = False
+                for vv, ww, kk in local_graph.out_edges(v, keys=True):
+
+                    if DEBUG_LOG_LEVEL > 1:
+                        print "test", vv, ww, kk
+
+                    if ww in length_to_node:
+                        loop_detect = True
+                        if DEBUG_LOG_LEVEL > 1:
+                            print "loop_detect", ww
+                        break
+
+                    if (vv, ww, kk) not in bundle_edges and\
+                            reverse_end(ww) not in bundle_nodes:
+
+                        if DEBUG_LOG_LEVEL > 1:
+                            print "add", ww
+
+                        tips.add(ww)
+                        bundle_edges.add( (vv, ww, kk) )
+                        tip_updated = True
+                        v_updated = True
+
+                if v_updated:
+
+                    if DEBUG_LOG_LEVEL > 1:
+                        print "remove", v
+
+                    tips.remove(v)
+
+                    if len(tips) == 1:
+                        break
+
+            if loop_detect:
+                converage = False
+                break
+
+        if length_limit_reached:
+            converage = False
+            break
+
+        if loop_detect:
+            converage = False
+            break
+
+        if not tip_updated:
+            converage = False
+            break
+
+        for v in list(tips):
+            bundle_nodes.add(v)
+
+        
+
+    data = start_node, end_node, bundle_edges, length_to_node[end_node], score_to_node[end_node], depth
+    
+    data_r = None
+
+    if DEBUG_LOG_LEVEL > 1:
+        print converage, data, data_r
+    return converage, data, data_r
+
+def generate_string_graph(args):
+
+    overlap_file = args.overlap_file
+
+    contained_reads = set()
+    chimer_ids = set()
+
+    filter_reads = False
+    
+    seqs = set()
+
+    G=nx.Graph()
+    edges =set()
+    overlap_data = []
+    contained_reads = set()
+    overlap_count = {}
+
+
+    # loop through the overlap data and load it into a Python list;
+    # contained reads are identified along the way
+
+    with open(overlap_file) as f:
+        for l in f:
+            l = l.strip().split()
+
+            # work around some ill-formed data records
+            #if len(l) != 13:
+            #    continue
+            
+            f_id, g_id, score, identity = l[:4]
+
+            if f_id == g_id:  # don't need self-self overlapping
+                continue
+            
+            if filter_reads:
+
+                if g_id not in seqs: 
+                    continue
+
+                if f_id not in seqs:
+                    continue
+
+            score = int(score)
+            identity = float(identity)
+            contained = l[12]
+            if contained == "contained":
+                contained_reads.add(f_id)
+                continue
+            if contained == "contains":
+                contained_reads.add(g_id)
+                continue
+            if contained == "none":
+                continue
+
+            if identity < args.min_idt: # only keep overlap records with identity >= min_idt
+                continue
+            f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
+            g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
+
+            # only use reads longer than min_len for assembly
+            if f_len < args.min_len: continue
+            if g_len < args.min_len: continue
+            
+            """
+            # double check for proper overlap
+            # this is not necessary when using DALIGNER for overlapper
+            # it may be useful if other overlappers give fuzzier alignment boundary
+            if f_start > 24 and f_len - f_end > 24:  # allow 24 base tolerance on both sides of the overlapping
+                continue
+            
+            if g_start > 24 and g_len - g_end > 24:
+                continue
+            
+            if g_strain == 0:
+                if f_start < 24 and g_len - g_end > 24:
+                    continue
+                if g_start < 24 and f_len - f_end > 24:
+                    continue
+            else:
+                if f_start < 24 and g_start > 24:
+                    continue
+                if g_start < 24 and f_start > 24:
+                    continue
+            """
+
+            overlap_data.append( (f_id, g_id, score, identity,
+                                  f_strain, f_start, f_end, f_len,
+                                  g_strain, g_start, g_end, g_len) )
+
+            overlap_count[f_id] = overlap_count.get(f_id,0)+1
+            overlap_count[g_id] = overlap_count.get(g_id,0)+1
+            
+    overlap_set = set()
+    sg = StringGraph()
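+    # build the string graph: every retained dovetail overlap between reads f and g
+    # produces a pair of edges (one per strand); the ":B"/":E" suffixes denote the
+    # begin and end of a read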
+    for od in overlap_data:
+        f_id, g_id, score, identity = od[:4]
+        if f_id in contained_reads:
+            continue
+        if g_id in contained_reads:
+            continue
+        f_s, f_b, f_e, f_l = od[4:8]
+        g_s, g_b, g_e, g_l = od[8:12]
+        overlap_pair = [f_id, g_id]
+        overlap_pair.sort()
+        overlap_pair = tuple( overlap_pair )
+        if overlap_pair in overlap_set:  # don't allow duplicated records
+            continue
+        else:
+            overlap_set.add(overlap_pair)
+
+        
+        if g_s == 1: # reversed alignment, swap the begin and end coordinates
+            g_b, g_e = g_e, g_b
+        
+        # build the string graph edges for each overlap
+        if f_b > 1:
+            if g_b < g_e:
+                """
+                     f.B         f.E
+                  f  ----------->
+                  g         ------------->
+                            g.B           g.E
+                """
+                if f_b == 0 or g_e - g_l == 0:
+                    continue
+                sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0), 
+                                                           length = abs(f_b-0),
+                                                           score = -score, 
+                                                           identity = identity )
+                sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = (g_id, g_e, g_l), 
+                                                           length = abs(g_e-g_l),
+                                                           score = -score,
+                                                           identity = identity)
+            else:
+                """
+                     f.B         f.E
+                  f  ----------->
+                  g         <-------------
+                            g.E           g.B           
+                """
+                if f_b == 0 or g_e == 0:
+                    continue
+                sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0), 
+                                                           length = abs(f_b -0),
+                                                           score = -score,
+                                                           identity = identity)
+                sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = (g_id, g_e, 0), 
+                                                           length = abs(g_e- 0),
+                                                           score = -score,
+                                                           identity = identity)
+        else:
+            if g_b < g_e:
+                """
+                                    f.B         f.E
+                  f                 ----------->
+                  g         ------------->
+                            g.B           g.E
+                """
+                if g_b == 0 or f_e - f_l == 0:
+                    continue
+                sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = (g_id, g_b, 0), 
+                                                           length = abs(g_b - 0),
+                                                           score = -score,
+                                                           identity = identity)
+                sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l), 
+                                                           length = abs(f_e-f_l),
+                                                           score = -score,
+                                                           identity = identity)
+            else:
+                """
+                                    f.B         f.E
+                  f                 ----------->
+                  g         <-------------
+                            g.E           g.B           
+                """
+                if g_b - g_l == 0 or f_e - f_l ==0:
+                    continue
+                sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = (g_id, g_b, g_l), 
+                                                           length = abs(g_b - g_l),
+                                                           score = -score,
+                                                           identity = identity)
+                sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l), 
+                                                           length = abs(f_e - f_l),
+                                                           score = -score,
+                                                           identity = identity)
+
+
+    sg.init_reduce_dict()
+
+    #if not args.disable_chimer_prediction:
+    #    sg.mark_chimer_edges()
+    #sg.mark_spur_edge()
+    
+
+    sg.mark_tr_edges() # mark edges that are transitively redundant
+
+    if DEBUG_LOG_LEVEL > 1:
+        print sum( [1 for c in sg.e_reduce.values() if c == True] )
+        print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+
+    removed_edges = set()
+    if args.lfc == True:
+        removed_edges = sg.resolve_repeat_edges()  
+    else:
+        removed_edges = sg.mark_best_overlap() # keep only the best overlap edges; mark the rest as reduced
+
+    spur_edges = sg.mark_spur_edge()
+
+    if DEBUG_LOG_LEVEL > 1:
+        print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+    out_f = open("sg_edges_list", "w")
+    nxsg = nx.DiGraph()
+    edge_data = {}
+    for v, w in sg.edges:
+        e = sg.edges[ (v, w) ]
+        rid, sp, tp = e.attr["label"]
+        score = e.attr["score"]
+        identity = e.attr["identity"]
+        length = abs(sp-tp)
+
+
+        if  sg.e_reduce[(v, w)] != True:
+            type_ = "G"
+            label = "%s:%d-%d" % (rid, sp, tp)
+            nxsg.add_edge(v, w, label = label, length = length, score = score)
+            edge_data[ (v, w) ] = (rid, sp, tp, length, score, identity, type_)
+            if w in sg.best_in:
+                nxsg.node[w]["best_in"] = v
+        elif (v, w) in removed_edges:
+            type_ = "R"
+        elif (v, w) in spur_edges:
+            type_ = "S"
+        elif sg.e_reduce[(v, w)] == True:
+            type_ = "TR"
+
+        print >>out_f, v, w, rid, sp, tp, score, identity, type_
+
+
+        
+    out_f.close()
+    nxsg_r = nxsg.reverse()    
+
+    return nxsg, nxsg_r, edge_data
+
+
+
+def construct_compound_paths(ug, u_edge_data):
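+    """Identify "compound" unitigs: for every branching node try to grow a bundle
+    of edges that reconverges (see find_bundle), then discard bundles that overlap
+    previously accepted ones or that lack a reverse-complement counterpart."""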
+
+    source_nodes = set()
+    sink_nodes = set()
+    simple_nodes = set()
+    branch_nodes = set()
+
+    all_nodes = ug.nodes()
+    for n in all_nodes:
+        in_degree = len( ug.in_edges(n) )
+        out_degree = len( ug.out_edges(n) )
+        if in_degree == 0:
+            source_nodes.add(n)
+        if out_degree == 0:
+            sink_nodes.add(n)
+        if in_degree == 1 and out_degree == 1:
+            simple_nodes.add(n)
+        if in_degree > 1 or out_degree > 1:
+            branch_nodes.add(n)
+
+    #print "#", len(all_nodes),len(source_nodes), len(sink_nodes), len(simple_nodes), len(branch_nodes)
+    compound_paths_0 = []
+    for p in list(branch_nodes):
+        if ug.out_degree(p) > 1:
+            coverage, data, data_r =  find_bundle(ug, u_edge_data, p, 48, 16, 500000)
+            if coverage == True:
+                start_node, end_node, bundle_edges, length, score, depth = data
+                compound_paths_0.append(  (start_node, "NA", end_node, 1.0*len(bundle_edges)/depth, length, score, bundle_edges ) )
+
+    compound_paths_0.sort( key=lambda x: -len(x[6]) )
+
+
+    edge_to_cpath = {}
+    compound_paths_1 = {}
+    for s, v, t, width, length, score, bundle_edges in compound_paths_0:
+        if DEBUG_LOG_LEVEL > 1:
+            print "constructing utg, test ", s,v, t
+        
+        overlapped = False
+        for vv, ww, kk in list(bundle_edges):
+            if (vv, ww, kk) in edge_to_cpath:
+                if DEBUG_LOG_LEVEL > 1:
+                    print "remove overlapped utg", (s, v, t), (vv, ww, kk)
+                overlapped = True
+                break
+            rvv = reverse_end(vv)
+            rww = reverse_end(ww)
+            rkk = reverse_end(kk)
+            if (rww, rvv, rkk) in edge_to_cpath:
+                if DEBUG_LOG_LEVEL > 1:
+                    print "remove overlapped r utg", (s, v, t),  (rww, rvv, rkk)
+                overlapped = True
+                break
+            
+
+        if not overlapped:
+            if DEBUG_LOG_LEVEL > 1:
+                print "constructing", s,v, t
+
+            bundle_edges_r = []
+            rs = reverse_end(t)
+            rt = reverse_end(s)
+
+            for vv, ww, kk in list(bundle_edges):
+                edge_to_cpath.setdefault( (vv, ww, kk), set() )
+                edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
+                rvv = reverse_end(ww)
+                rww = reverse_end(vv)
+                rkk = reverse_end(kk)
+                edge_to_cpath.setdefault( (rvv, rww, rkk), set() )
+                edge_to_cpath[ (rvv, rww, rkk) ].add( (rs, rt, v) ) #assert v == "NA"
+                bundle_edges_r.append(  (rvv, rww, rkk) )
+            
+            compound_paths_1[ ( s, v, t) ] = width, length, score, bundle_edges
+            compound_paths_1[ ( rs, v, rt) ] = width, length, score, bundle_edges_r
+
+             
+    compound_paths_2 = {}
+    edge_to_cpath = {}
+    for s, v, t in compound_paths_1:
+        rs = reverse_end(t)
+        rt = reverse_end(s)
+        if (rs, "NA", rt) not in compound_paths_1:
+            if DEBUG_LOG_LEVEL > 1:
+                print "non_complement bundle", s, v, t, len(compound_paths_1[( s, v, t)][-1])
+            continue
+        width, length, score, bundle_edges = compound_paths_1[ (s, v, t) ]
+        compound_paths_2[ (s, v, t) ] = width, length, score, bundle_edges
+        for vv, ww, kk in list(bundle_edges):
+            edge_to_cpath.setdefault( (vv, ww, kk), set() )
+            edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
+
+
+    compound_paths_3 = {}
+    for k, val in compound_paths_2.items():
+        
+        start_node, NA, end_node = k
+        rs = reverse_end(end_node)
+        rt = reverse_end(start_node)
+        assert (rs, "NA", rt) in compound_paths_2
+        
+        contained = False
+        for vv, ww, kk in ug.out_edges(start_node, keys=True):
+            if len(edge_to_cpath.get( (vv, ww, kk), [] )) > 1: 
+                contained = True
+
+        if not contained:
+            compound_paths_3[k] = val
+            if DEBUG_LOG_LEVEL > 1:
+                print "compound", k 
+
+    compound_paths = {}
+    for s, v, t in compound_paths_3:
+        rs = reverse_end(t)
+        rt = reverse_end(s)
+        if (rs, "NA", rt) not in compound_paths_3:
+            continue
+        compound_paths[ (s, v, t) ] = compound_paths_3[ (s, v, t) ]
+
+    return compound_paths
+
+def main(argv=sys.argv):
+    import argparse
+    
+    parser = argparse.ArgumentParser(description='an example string graph assembler designed for handling diploid genomes')
+    parser.add_argument('overlap_file', help='a file that contains the overlap information.')
+
+    parser.add_argument('--min_len', type=int, default=4000, 
+                        help='minimum length of the reads to be considered for assembling')
+    parser.add_argument('--min_idt', type=float, default=96,
+                        help='minimum alignment identity of the reads to be considered for assembling')
+    parser.add_argument('--lfc', action="store_true", default=False,
+                        help='use local flow constraint method rather than best overlap method to resolve knots in string graph')
+
+    args = parser.parse_args(argv[1:])
+
+
+    # transitivity reduction, remove spurs, remove putative edges caused by repeats
+    sg, sg_r, edge_data = generate_string_graph(args)
+
+
+    simple_paths = {}
+    dual_path = {}
+
+
+    sg2 = nx.DiGraph()
+
+    for v, w in edge_data:
+
+        assert (reverse_end(w), reverse_end(v)) in edge_data
+        
+        #if (v, w) in masked_edges:
+        #    continue
+
+        rid, sp, tp, length, score, identity, type_ = edge_data[ (v, w) ]
+        if type_ != "G":
+            continue
+
+        label = "%s:%d-%d" % (rid, sp, tp)
+        sg2.add_edge( v, w, label = label, length = length, score = score)
+
+        
+    # utg construction phase 1, identify all simple paths
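+    # A simple path is a maximal chain through nodes whose in-degree and
+    # out-degree are both 1; each chain, together with its reverse-complement
+    # twin, becomes a single unitig ("utg") edge below.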
+    s_nodes = set()
+    t_nodes = set()
+    simple_nodes = set()
+
+    all_nodes = sg2.nodes()
+    for n in all_nodes:
+        in_degree = len( sg2.in_edges(n) )
+        out_degree = len( sg2.out_edges(n) )
+        if in_degree == 1 and out_degree == 1:
+            simple_nodes.add(n)
+        else:
+            if out_degree != 0:
+                s_nodes.add(n)
+            if in_degree != 0:
+                t_nodes.add(n)
+
+
+    free_edges = set(sg2.edges())
+
+    if DEBUG_LOG_LEVEL > 1: 
+        for s in list(simple_nodes):
+            print "simple_node", s
+        for s in list(s_nodes):
+            print "s_node", s
+        for s in list(t_nodes):
+            print "t_node", s
+
+        for v,w in free_edges:
+            if (reverse_end(w), reverse_end(v) ) not in free_edges:
+                print "bug", v,w
+                print reverse_end(w), reverse_end(v)
+
+    while len(free_edges) != 0:
+        if len(s_nodes) != 0:
+            n = s_nodes.pop()
+            if DEBUG_LOG_LEVEL > 1:
+                print "initial utg 1", n
+        else:
+            e = free_edges.pop()
+            free_edges.add(e)
+            n = e[0]
+            if DEBUG_LOG_LEVEL > 1:
+                print "initial utg 2", n
+
+        path = []
+        path_length =0
+        path_score = 0 
+        for v, w in sg2.out_edges(n):
+            if (v, w) not in free_edges:
+                continue
+            rv = reverse_end(v)
+            rw = reverse_end(w)
+
+            path_length = 0
+            path_score = 0
+            v0 = v
+            w0 = w
+            path = [v, w]
+            path_edges = set()
+            path_edges.add( (v, w) )
+            path_length += edge_data[ (v, w) ][3]
+            path_score += edge_data[ (v, w) ][4]
+            free_edges.remove( (v, w) )
+
+            r_path_length = 0
+            r_path_score = 0
+            rv0 = rv
+            rw0 = rw
+            r_path = [rv, rw] # need to reverse again
+            r_path_edges = set()
+            r_path_edges.add( (rw, rv) )
+            r_path_length += edge_data[ (rw, rv) ][3]
+            r_path_score += edge_data[ (rw, rv) ][4]
+            free_edges.remove( (rw, rv) )
+
+            while w in simple_nodes:
+                w, w_ = sg2.out_edges(w)[0]
+                if (w, w_) not in free_edges:
+                    break
+                rw_, rw = reverse_end(w_), reverse_end(w)
+
+                if ( rw_, rw ) in path_edges:
+                    break
+
+                path.append(w_)
+                path_edges.add( (w, w_) )
+                path_length += edge_data[ (w, w_) ][3]
+                path_score += edge_data[ (w, w_) ][4]
+                free_edges.remove( (w, w_) )
+                
+                r_path.append(rw_)
+                r_path_edges.add( (rw_, rw) )
+                r_path_length += edge_data[ (rw_, rw) ][3]
+                r_path_score += edge_data[ (rw_, rw) ][4]
+                free_edges.remove( (rw_, rw) )
+                
+
+                w = w_
+
+            simple_paths[ (v0, w0, path[-1]) ] = path_length, path_score, path
+            r_path.reverse()
+            assert r_path[0] == reverse_end(path[-1])
+            simple_paths[ (r_path[0], rw0, rv0) ] = r_path_length, r_path_score, r_path
+
+            if DEBUG_LOG_LEVEL > 1:
+                print  path_length, path_score, path
+
+            dual_path[ (r_path[0], rw0, rv0) ] = (v0, w0, path[-1])
+            dual_path[ (v0, w0, path[-1]) ] = (r_path[0], rw0, rv0)
+
+
+
+    ug = nx.MultiDiGraph()
+    u_edge_data = {}
+    circular_path = set()
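+    # Each simple path becomes one "simple" unitig edge keyed by its second
+    # node; a path that starts and ends on the same node is set aside as a
+    # circular path and emitted separately at the very end.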
+
+    for s, v, t in simple_paths:
+        length, score, path = simple_paths[ (s, v, t) ]
+        u_edge_data[ (s, t, v) ] = (length, score, path, "simple")
+        if s != t:
+            ug.add_edge(s, t, key = v, type_ = "simple", via = v, length = length, score = score)
+        else:
+            circular_path.add( (s, t, v) )
+
+
+    if DEBUG_LOG_LEVEL > 1:
+        with open("utg_data0","w") as f:
+            for s, t, v in u_edge_data:
+                rs = reverse_end(t)
+                rt = reverse_end(s)
+                rv = reverse_end(v)
+                assert (rs, rt, rv) in u_edge_data
+                length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+                
+                if type_ == "compound":
+                    path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
+                else:
+                    path_or_edges = "~".join( path_or_edges )
+                print >>f, s, v, t, type_, length, score, path_or_edges
+
+    # identify spurs in the utg graph
+    # Currently, we use ad-hoc logic to filter out shorter utgs, but we can
+    # add proper alignment comparison later to remove redundant utgs
+
+    utg_spurs = set()
+    all_nodes = ug.nodes()
+
+    ug2 = ug.copy()
+    spur_edges = set()
+    edges_to_remove = set()
+
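+    # A unitig is flagged as a spur ("spur:2") if it is short (no more than
+    # 50 kb, or made of only a few edges), hangs off a node with no other
+    # in- (or out-) edges, and its far end attaches to a node that still has
+    # alternative connections; both the unitig and its reverse complement are
+    # then removed from the working graph ug2.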
+    for n in s_nodes:
+        if ug.in_degree(n) != 0:
+            continue
+        for s, t, v in ug.out_edges(n, keys=True):
+            length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+            if length > 50000 and len(edges) > 3:
+                continue
+            in_degree = len( set( e[0] for e in ug.in_edges(t))  ) # ignore multi-edges
+            out_degree = len( set( e[1] for e in ug.out_edges(t)) )
+            if in_degree > 1 and out_degree > 0:
+                spur_edges.add( (s, t, v) )
+                edges_to_remove.add( (s, t, v) )
+                u_edge_data[ (s, t, v) ] = length, score, edges, "spur:2"
+                rs = reverse_end(t)
+                rt = reverse_end(s)
+                rv = reverse_end(v)
+                edges_to_remove.add( (rs, rt, rv) )
+                length, score, edges, type_ = u_edge_data[ (rs, rt, rv) ]
+                u_edge_data[ (rs, rt, rv) ] = length, score, edges, "spur:2"
+
+    for n in t_nodes:
+        if ug.out_degree(n) != 0:
+            continue
+        for s, t, v in ug.in_edges(n, keys=True):
+            length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+            if length > 50000 and len(edges) > 3:
+                continue
+            in_degree = len( set( e[0] for e in ug.in_edges(s))  ) # ignore multi-edges
+            out_degree = len( set( e[1] for e in ug.out_edges(s)) )
+            if in_degree > 0 and out_degree > 1:
+                spur_edges.add( (s, t, v) )
+                edges_to_remove.add( (s, t, v) )
+                u_edge_data[ (s, t, v) ] = length, score, edges, "spur:2"
+                rs = reverse_end(t)
+                rt = reverse_end(s)
+                rv = reverse_end(v)
+                edges_to_remove.add( (rs, rt, rv) )
+                length, score, edges, type_ = u_edge_data[ (rs, rt, rv) ]
+                u_edge_data[ (rs, rt, rv) ] = length, score, edges, "spur:2"
+
+    for s, t, v in list(edges_to_remove):
+        ug2.remove_edge( s, t, key= v)
+
+    #phase 2, finding all "consistent" compound paths
+    compound_paths = construct_compound_paths(ug2, u_edge_data)
+    compound_path_file = open("c_path","w")
+
+    ug2_edges = set(ug2.edges(keys = True))
+    edges_to_remove  = set()
+    for s, v, t in compound_paths:
+        width, length, score, bundle_edges =  compound_paths[ (s, v, t) ] 
+        print >> compound_path_file, s,v,t, width, length, score, "|".join( [e[0]+"~"+e[2]+"~"+e[1] for e in bundle_edges] )
+        for ss, tt, vv in bundle_edges:
+            if (ss, tt, vv) in ug2_edges:
+                edges_to_remove.add( (ss, tt, vv) )
+
+    
+    for s, t, v in edges_to_remove:
+        ug2.remove_edge( s, t ,v )
+        length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+        if type_ != "spur":
+            u_edge_data[ (s, t, v) ] = length, score, edges, "contained"
+
+
+    for s, v, t in compound_paths:
+        width, length, score, bundle_edges =  compound_paths[ (s, v, t) ] 
+        u_edge_data[ (s, t, v) ] = (length, score, bundle_edges, "compound")
+        ug2.add_edge( s, t, key = v, via = v, type_="compound", length = length, score = score)
+
+        assert v == "NA"
+        rs = reverse_end(t)
+        rt = reverse_end(s)
+        assert (rs, v, rt) in compound_paths
+        dual_path[ (s, v, t) ] = (rs, v, rt)
+        dual_path[ (rs, v, rt) ] = (s, v, t)
+
+    compound_path_file.close()
+
+
+    # remove short utg using local flow consistent rule
+    """
+      a short UTG like this can be removed; this kind of utg is likely an artifact of repeats
+      >____           _____>
+           \__UTG_>__/
+      <____/         \_____<
+    """
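+    # Only bridges shorter than 60 kb whose endpoints show the 1-in/2-out and
+    # 2-in/1-out pattern above are removed; they are re-labelled
+    # "repeat_bridge" in u_edge_data.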
+    ug_edge_to_remove = set() 
+    for s, t, v in ug.edges(keys=True):
+        if ug2.in_degree(s) == 1 and ug2.out_degree(s) == 2 and \
+           ug2.in_degree(t) == 2 and ug2.out_degree(t) == 1:
+            length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+            if length < 60000: 
+                rs = reverse_end(t)
+                rt = reverse_end(s)
+                rv = reverse_end(v)
+                ug_edge_to_remove.add( (s, t, v) )
+                ug_edge_to_remove.add( (rs, rt, rv) )
+    for s, t, v in list(ug_edge_to_remove):
+        ug2.remove_edge(s, t, key=v)
+        length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+        u_edge_data[ (s, t, v) ] = length, score, edges, "repeat_bridge"
+
+    ug = ug2
+
+    with open("utg_data","w") as f:
+        for s, t, v in u_edge_data:
+            length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+            
+            if v == "NA":
+                path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
+            else:
+                path_or_edges = "~".join( path_or_edges )
+            print >>f, s, v, t, type_, length, score, path_or_edges
+
+    # contig construction from utgs
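+    # Walk the unitig graph from each source/branching node, extending along
+    # nodes that have exactly one out-edge; when a node has multiple in-edges
+    # the walk only continues if the current path carries the string graph's
+    # "best_in" overlap for that node (see the check inside the loop below).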
+
+    s_nodes = set()
+    t_nodes = set()
+    simple_nodes = set()
+    simple_out = set()
+    simple_in = set()
+
+    all_nodes = ug.nodes()
+    for n in all_nodes:
+        in_degree = len( ug.in_edges(n) )
+        out_degree = len( ug.out_edges(n) )
+        if in_degree == 1 and out_degree == 1:
+            simple_nodes.add(n)
+        else:
+            if out_degree != 0:
+                s_nodes.add(n)
+            if in_degree != 0:
+                t_nodes.add(n)
+        if out_degree == 1:
+            simple_out.add(n)
+        if in_degree == 1:
+            simple_in.add(n)
+
+    all_nodes = set(all_nodes)
+    c_path = []
+    
+    free_edges = set()
+    for s, t, v in ug.edges(keys=True):
+        free_edges.add( (s, t, v) )
+
+    while len(free_edges) != 0:
+
+        if len(s_nodes) != 0:
+            n = s_nodes.pop()
+        else:
+            e = free_edges.pop()
+            n = e[0]
+        
+        for s, t, v in ug.out_edges(n, keys=True):
+            path_start = n
+            path_end = None
+            path_key = None
+            path = []
+            path_length = 0
+            path_score = 0
+            path_nodes = set()
+            path_nodes.add(s)
+            if DEBUG_LOG_LEVEL > 1:
+                print "check 1", s, t, v
+            path_key = t
+            t0 = s
+            while t in simple_out:
+                if t in path_nodes:
+                    break
+                rt = reverse_end(t)
+                if rt in path_nodes:
+                    break
+
+                length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
+
+               
+                """
+                If the next node has more than one in-edge and the current path carries the
+                best overlap, we extend the contig; otherwise we terminate the extension.
+                This can help reduce some mis-assemblies, but it can still construct long
+                contigs when there is an opportunity (assuming the best overlap is the most
+                likely to be correct).
+                """
+                if len(ug.in_edges(t, keys=True)) > 1:
+                    best_in_node = sg.node[t]["best_in"] 
+                    
+                    if type_ == "simple" and best_in_node != path_or_edges[-2]:
+                        break
+                    if type_ == "compound":
+                        t_in_nodes = set()
+                        for ss, vv, tt in path_or_edges:
+                            if tt != t:
+                                continue
+                            length, score, path_or_edges, type_ = u_edge_data[ (ss,vv,tt) ]
+                            if path_or_edges[-1] == tt:
+                                t_in_nodes.add(path_or_edges[-2])
+                        if best_in_node not in t_in_nodes:
+                            break
+                # ----------------
+
+
+                path.append( (t0, t, v) )
+                path_nodes.add(t)
+                path_length += length
+                path_score += score
+                assert len( ug.out_edges( t, keys=True ) ) == 1 # t is "simple_out" node
+                t0, t, v = ug.out_edges( t, keys=True )[0] 
+
+            path.append( (t0, t, v) )
+            length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
+            path_length += length
+            path_score += score
+            path_nodes.add(t)
+            path_end = t
+
+            c_path.append( (path_start, path_key, path_end, path_length, path_score, path, len(path)) ) 
+            if DEBUG_LOG_LEVEL > 1:
+                print "c_path", path_start, path_key, path_end, path_length, path_score, len(path)
+            for e in path:
+                if e in free_edges:
+                    free_edges.remove( e )
+ 
+    if DEBUG_LOG_LEVEL > 1:
+        print "left over edges:", len(free_edges)
+
+
+
+    free_edges = set()
+    for s, t, v in ug.edges(keys=True):
+        free_edges.add( (s, t, v) )
+
+
+    ctg_id = 0
+
+    ctg_paths = open("ctg_paths","w")
+
+    c_path.sort( key=lambda x: -x[3] )
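+    # Greedily turn candidate paths into contigs, longest path first; each
+    # unitig edge (and its reverse complement) may be used by only one contig,
+    # so a path is truncated at the first already-claimed edge.  Each kept path
+    # is written to "ctg_paths" twice, once per strand (suffix F/R).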
+
+    
+    for path_start, path_key, path_end, p_len, p_score, path, n_edges in c_path:
+        length = 0
+        score = 0
+        length_r = 0
+        score_r = 0
+
+        non_overlapped_path = []
+        non_overlapped_path_r = []
+        for s, t, v in path:
+            if v != "NA": 
+                rs, rt, rv = reverse_end(t), reverse_end(s), reverse_end(v)
+            else:
+                rs, rt, rv = reverse_end(t), reverse_end(s), "NA"
+            if (s, t, v) in free_edges and (rs, rt, rv) in free_edges:
+                non_overlapped_path.append( (s,t,v) )
+                non_overlapped_path_r.append( (rs, rt, rv)  )
+                length += u_edge_data[ (s, t, v) ][0]
+                score += u_edge_data[ (s, t, v) ][1]
+                length_r += u_edge_data[ (rs, rt, rv) ][0]
+                score_r += u_edge_data[ (rs, rt, rv) ][1]
+            else:
+                break
+
+        if len(non_overlapped_path) == 0:
+            continue
+        s0, t0, v0 = non_overlapped_path[0]
+        end_node = non_overlapped_path[-1][1]
+
+        print >> ctg_paths, "%06dF" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length, score, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path ] )
+        non_overlapped_path_r.reverse()
+        s0, t0, v0 = non_overlapped_path_r[0]
+        end_node = non_overlapped_path_r[-1][1]
+        print >> ctg_paths, "%06dR" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length_r, score_r, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path_r ] )
+        ctg_id += 1
+        for e in non_overlapped_path:
+            if e in free_edges:
+                free_edges.remove(e)
+        for e in non_overlapped_path_r:
+            if e in free_edges:
+                free_edges.remove(e)
+
+
+
+    for s, t, v in list(circular_path):
+        length, score, path, type_ = u_edge_data[ (s, t, v) ]
+        print >> ctg_paths, "%6d" % ctg_id, "ctg_circular", s+"~"+v+"~"+t, t, length, score, s+"~"+v+"~"+t
+        ctg_id += 1
+
+    ctg_paths.close()
+
diff --git a/src/py/mains/run.py b/src/py/mains/run.py
new file mode 100644
index 0000000..65a0644
--- /dev/null
+++ b/src/py/mains/run.py
@@ -0,0 +1,646 @@
+from .. import run_support as support
+from ..functional import get_daligner_job_descriptions, get_script_xformer
+from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn
+from pypeflow.task import PypeTask, PypeThreadTaskBase, PypeTaskBase
+from pypeflow.controller import PypeThreadWorkflow
+from falcon_kit.FastaReader import FastaReader
+import glob
+import os
+import re
+import sys
+import time
+
+
+wait_time = 5
+fc_run_logger = None
+
+def system(call, check=False):
+    fc_run_logger.debug('$(%s)' %repr(call))
+    rc = os.system(call)
+    msg = "Call %r returned %d." % (call, rc)
+    if rc:
+        fc_run_logger.warning(msg)
+        if check:
+            raise Exception(msg)
+    else:
+        fc_run_logger.debug(msg)
+    return rc
+
+def _qsub_script(job_data, specific):
+        script_fn = job_data["script_fn"]
+        job_name = job_data["job_name"]
+        cwd = job_data["cwd"]
+        sge_option = job_data["sge_option"]
+        sge_cmd="qsub -N {job_name} {sge_option} -o {cwd}/sge_log {specific}\
+                 -S /bin/bash {script}".format(job_name=job_name,
+                                               cwd=os.getcwd(),
+                                               specific=specific,
+                                               sge_option=sge_option,
+                                               script=script_fn)
+        system(sge_cmd, check=True)
+
+def _run_script_sge(job_data):
+    specific = '-j y'
+    _qsub_script(job_data, specific)
+
+def _run_script_torque(job_data):
+    # See https://github.com/PacificBiosciences/FALCON/pull/227
+    specific = '-j oe'
+    _qsub_script(job_data, specific)
+
+def _run_script_slurm(job_data):
+        script_fn = job_data["script_fn"]
+        job_name = job_data["job_name"]
+        cwd = job_data["cwd"]
+        sge_option = job_data["sge_option"]
+        with open(script_fn, 'r') as original: data = original.read()
+        with open(script_fn, 'w') as modified: modified.write("#!/bin/sh" + "\n" + data)
+        sge_cmd="sbatch -J {job_name} {sge_option} {script}".format(job_name=job_name, cwd=os.getcwd(),sge_option=sge_option, script=script_fn)
+        system(sge_cmd, check=True)
+
+def _run_script_local(job_data):
+        script_fn = job_data["script_fn"]
+        job_name = job_data["job_name"]
+        log_fn = '{0}.log'.format(script_fn)
+        cmd = "bash {0} 1> {1} 2>&1".format(script_fn, log_fn)
+        try:
+            system(cmd, check=True)
+        except Exception:
+            out = open(log_fn).read()
+            fc_run_logger.exception('Contents of %r:\n%s' %(log_fn, out))
+            raise
+
+_run_scripts = {
+        'SGE': _run_script_sge,
+        'TORQUE': _run_script_torque,
+        'SLURM': _run_script_slurm,
+        'LOCAL': _run_script_local,
+}
+
+def run_script(job_data, job_type = "SGE" ):
+    """For now, we actually modify the script before running it.
+    This assumes a simple bash script.
+    We will have a better solution eventually.
+    """
+    try:
+        _run_script = _run_scripts[job_type.upper()]
+    except LookupError as e:
+        msg = 'Unknown job_type=%s' %repr(job_type)
+        fc_run_logger.exception(msg)
+        raise
+    job_name = job_data["job_name"]
+    script_fn = job_data["script_fn"]
+    support.update_env_in_script(script_fn,
+        ['PATH', 'PYTHONPATH', 'LD_LIBRARY_PATH'])
+    fc_run_logger.info('(%s) %r' %(job_type, script_fn))
+    fc_run_logger.debug('%s (job %r)' %(_run_script.__name__, job_name))
+    rc = _run_script(job_data)
+    # Someday, we might trap exceptions here, as a failure would be caught later anyway.
+
+def wait_for_file(filename, task, job_name = ""):
+    """We could be in the thread or sub-process which spawned a qsub job,
+    so we must check for the shutdown_event.
+    """
+    while 1:
+        time.sleep(wait_time)
+        # We prefer all jobs to rely on `*done.exit`, but not all do yet. So we check for that first.
+        exit_fn = filename + '.exit'
+        if os.path.exists(exit_fn):
+            fc_run_logger.info( "%r found." % (exit_fn) )
+            fc_run_logger.debug( " job: %r exited." % (job_name) )
+            os.unlink(exit_fn) # to allow a restart later, if not done
+            if not os.path.exists(filename):
+                fc_run_logger.warning( "%r is missing. job: %r failed!" % (filename, job_name) )
+            break
+        if os.path.exists(filename) and not os.path.exists(exit_fn):
+            # (rechecked exit_fn to avoid race condition)
+            fc_run_logger.info( "%r not found, but job is done." % (exit_fn) )
+            fc_run_logger.debug( " job: %r exited." % (job_name) )
+            break
+        if task.shutdown_event is not None and task.shutdown_event.is_set():
+            fc_run_logger.warning( "shutdown_event received (Keyboard Interrupt maybe?), %r not finished."
+                % (job_name) )
+            if support.job_type == "SGE":
+                fc_run_logger.info( "deleting the job by `qdel` now..." )
+                system("qdel %s" % job_name) # Failure is ok.
+            if support.job_type == "SLURM":
+                fc_run_logger.info( "Deleting the job by 'scancel' now...")
+                system("scancel -n %s" % job_name)
+            break
+
+def task_make_fofn_abs_raw(self):
+    return support.make_fofn_abs(self.i_fofn.path, self.o_fofn.path)
+
+def task_make_fofn_abs_preads(self):
+    return support.make_fofn_abs(self.i_fofn.path, self.o_fofn.path)
+
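+# The task_build_*/task_run_* functions below all follow the same pattern:
+# render a shell script via run_support, submit it with run_script() under the
+# configured job_type (SGE, TORQUE, SLURM, or LOCAL), then block in
+# wait_for_file() until the job's "done" sentinel file appears.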
+def task_build_rdb(self):
+    input_fofn_fn = fn(self.input_fofn)
+    job_done = fn(self.rdb_build_done)
+    work_dir = self.parameters["work_dir"]
+    config = self.parameters["config"]
+    sge_option_da = config["sge_option_da"]
+
+    script_fn = os.path.join( work_dir, "prepare_rdb.sh" )
+    args = {
+        'input_fofn_fn': input_fofn_fn,
+        'work_dir': work_dir,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+        'run_jobs_fn': fn(self.run_jobs),
+    }
+    support.build_rdb(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = sge_option_da
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+def task_build_pdb(self):  # essentially the same as build_rdb(), but the subtle differences are tricky to consolidate into one function
+    input_fofn_fn = fn(self.pread_fofn)
+    job_done = fn(self.pdb_build_done)
+    work_dir = self.parameters["work_dir"]
+    config = self.parameters["config"]
+    sge_option_pda = config["sge_option_pda"]
+
+    script_fn = os.path.join( work_dir, "prepare_pdb.sh" )
+    args = {
+        'input_fofn_fn': input_fofn_fn,
+        'work_dir': work_dir,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+        'run_jobs_fn': fn(self.run_jobs),
+    }
+    support.build_pdb(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = sge_option_pda
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+def task_run_falcon_asm(self):
+    wd = self.parameters["wd"]
+    #p_merge_done = self.p_merge_done
+    db_file = fn(self.db_file)
+    job_done = fn(self.falcon_asm_done)
+    config = self.parameters["config"]
+    pread_dir = self.parameters["pread_dir"]
+    script_dir = os.path.join( wd )
+    script_fn =  os.path.join( script_dir ,"run_falcon_asm.sh" )
+    args = {
+        'pread_dir': pread_dir,
+        'db_file': db_file,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+    }
+    support.run_falcon_asm(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = config["sge_option_fc"]
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+def task_run_daligner(self):
+    job_done = fn(self.job_done)
+    daligner_cmd = self.parameters["daligner_cmd"]
+    job_uid = self.parameters["job_uid"]
+    cwd = self.parameters["cwd"]
+    db_prefix = self.parameters["db_prefix"]
+    nblock = self.parameters["nblock"]
+    config = self.parameters["config"]
+    sge_option_da = config["sge_option_da"]
+    script_dir = os.path.join( cwd )
+    script_fn =  os.path.join( script_dir , "rj_%s.sh" % (job_uid))
+    args = {
+        'daligner_cmd': daligner_cmd,
+        'db_prefix': db_prefix,
+        'nblock': nblock,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+    }
+    support.run_daligner(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = sge_option_da
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+def task_run_las_merge(self):
+    p_script_fn = self.parameters["merge_script"]
+    job_id = self.parameters["job_id"]
+    cwd = self.parameters["cwd"]
+    job_done = fn(self.job_done)
+    config = self.parameters["config"]
+    sge_option_la = config["sge_option_la"]
+
+    script_dir = os.path.join( cwd )
+    script_fn =  os.path.join( script_dir , "rp_%05d.sh" % (job_id))
+    args = {
+        'p_script_fn': p_script_fn,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+    }
+    support.run_las_merge(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = sge_option_la
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+def task_run_consensus(self):
+    out_file_fn = fn(self.out_file)
+    job_id = self.parameters["job_id"]
+    cwd = self.parameters["cwd"]
+    config = self.parameters["config"]
+    prefix = self.parameters["prefix"]
+    sge_option_cns = config["sge_option_cns"]
+    script_dir = os.path.join( cwd )
+    job_done = os.path.join( cwd, "c_%05d_done" % job_id )
+    script_fn =  os.path.join( script_dir , "c_%05d.sh" % (job_id))
+    args = {
+        'job_id': job_id,
+        'out_file_fn': out_file_fn,
+        'prefix': prefix,
+        'config': config,
+        'job_done': job_done,
+        'script_fn': script_fn,
+    }
+    support.run_consensus(**args)
+
+    job_data = support.make_job_data(self.URL, script_fn)
+    job_data["sge_option"] = sge_option_cns
+    run_script(job_data, job_type = config["job_type"])
+    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
+
+def mkdir(d):
+    if not os.path.isdir(d):
+        os.makedirs(d)
+
+def task_daligner_gather(self):
+    da_done = fn(self.da_done)
+    main_dir = os.path.dirname(da_done)
+    out_dict = self.inputDataObjs
+    nblock = self.parameters['nblock']
+    fc_run_logger.debug('nblock=%d, out_dict:\n%s'%(nblock, out_dict))
+
+    # Create m_* dirs.
+    for block in xrange(1, nblock+1):
+        mdir = os.path.join(main_dir, 'm_%05d' %block) # By convention. pbsmrtpipe works differently.
+        mkdir(mdir)
+        # TODO: Remove existing symlinks?
+
+    # Symlink all daligner *.las.
+    # Could be L1.* or preads.*
+    re_las = re.compile(r'\.(\d*)(\.\d*)?\.las$')
+    for dal_done in out_dict.values():
+        job_rundir = os.path.dirname(fn(dal_done))
+        for las_fn in os.listdir(job_rundir):
+            mo = re_las.search(las_fn)
+            if not mo:
+                continue
+            block = int(mo.group(1)) # We will merge in the m_* dir of the left block.
+            mdir = os.path.join(main_dir, 'm_%05d' %block) # By convention. pbsmrtpipe works differently.
+            las_path = os.path.join('..', os.path.basename(job_rundir), las_fn)
+            cmd = 'ln -sf {} {}'.format(las_path, mdir)
+            system(cmd)
+    system("touch %s" %da_done)
+
+def get_nblock(db_file):
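+    """Return the number of blocks recorded in the database's text header
+    (the "blocks = N" line); default to 1 if the file or the line is missing.
+    """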
+    nblock = 1
+    new_db = True
+    if os.path.exists(db_file):
+        with open(db_file) as f:
+            for l in f:
+                l = l.strip().split()
+                if l[0] == "blocks" and l[1] == "=":
+                    nblock = int(l[2])
+                    new_db = False
+                    break
+    # Ignore new_db for now.
+    return nblock
+
+def create_daligner_tasks(run_jobs_fn, wd, db_prefix, nblock, rdb_build_done, config, pread_aln=False):
+    job_id = 0
+    tasks = []
+    tasks_out = {}
+
+    xform_script = get_script_xformer(pread_aln)
+
+    line_count = 0
+    job_descs = get_daligner_job_descriptions(open(run_jobs_fn), db_prefix)
+    for desc, bash in job_descs.iteritems():
+        job_uid = '%04x' %line_count
+        line_count += 1
+
+        support.make_dirs(os.path.join( wd, "./job_%s" % job_uid))
+        call = "cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (wd, job_uid, db_prefix, db_prefix, db_prefix)
+        rc = system(call)
+        if rc:
+            raise Exception("Failure in system call: %r -> %d" %(call, rc))
+        job_done = makePypeLocalFile(os.path.abspath( "%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)  ))
+        bash = xform_script(bash)
+        parameters =  {"daligner_cmd": bash,
+                        "cwd": os.path.join(wd, "job_%s" % job_uid),
+                        "job_uid": job_uid,
+                        "config": config,
+                        "nblock": nblock,
+                        "db_prefix": db_prefix}
+        make_daligner_task = PypeTask( inputs = {"rdb_build_done": rdb_build_done},
+                                        outputs = {"job_done": job_done},
+                                        parameters = parameters,
+                                        TaskType = PypeThreadTaskBase,
+                                        URL = "task://localhost/d_%s_%s" % (job_uid, db_prefix) )
+        daligner_task = make_daligner_task( task_run_daligner )
+        tasks.append( daligner_task )
+        tasks_out[ "ajob_%s" % job_uid ] = job_done
+        job_id += 1
+    return tasks, tasks_out
+
+def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config):
+    merge_tasks = []
+    consensus_tasks = []
+    merge_out = {}
+    consensus_out ={}
+    mjob_data = {}
+
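+    # Parse the HPCdaligner-generated run_jobs.sh.  LAsort lines only register
+    # a block id (the sorting is already done by daligner); LAmerge and mv
+    # lines are grouped per block, using the block id embedded in the .las
+    # file name (the field after the optional "L<n>" level component), and
+    # become one small merge script per block.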
+    with open(run_jobs_fn) as f :
+        for l in f:
+            l = l.strip().split()
+            if l[0] not in ( "LAsort", "LAmerge", "mv" ):
+                continue
+            if l[0] == "LAsort":
+                # We now run this part w/ daligner, but we still need
+                # a small script for some book-keeping.
+                p_id = int( l[2].split(".")[1] )
+                mjob_data.setdefault( p_id, [] )
+                #mjob_data[p_id].append(  " ".join(l) ) # Already done w/ daligner!
+            if l[0] == "LAmerge":
+                l2 = l[2].split(".")
+                if l2[1][0] == "L":
+                    p_id = int(  l[2].split(".")[2] )
+                    mjob_data.setdefault( p_id, [] )
+                    mjob_data[p_id].append(  " ".join(l) )
+                else:
+                    p_id = int( l[2].split(".")[1] )
+                    mjob_data.setdefault( p_id, [] )
+                    mjob_data[p_id].append(  " ".join(l) )
+            if l[0] == "mv":
+                l2 = l[1].split(".")
+                if l2[1][0] == "L":
+                    p_id = int(  l[1].split(".")[2] )
+                    mjob_data.setdefault( p_id, [] )
+                    mjob_data[p_id].append(  " ".join(l) )
+                else:
+                    p_id = int( l[1].split(".")[1] )
+                    mjob_data.setdefault( p_id, [] )
+                    mjob_data[p_id].append(  " ".join(l) )
+
+    for p_id in mjob_data:
+        s_data = mjob_data[p_id]
+
+        support.make_dirs("%s/m_%05d" % (wd, p_id))
+        support.make_dirs("%s/preads" % (wd) )
+        support.make_dirs("%s/las_files" % (wd) )
+
+        merge_script_file = os.path.abspath( "%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id) )
+        with open(merge_script_file, "w") as merge_script:
+            #print >> merge_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (db_prefix, p_id, db_prefix)
+            for l in s_data:
+                print >> merge_script, l
+            print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (p_id, db_prefix, p_id) 
+            print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (p_id, db_prefix, p_id) 
+            
+        job_done = makePypeLocalFile(os.path.abspath( "%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)  ))
+        parameters =  {"merge_script": merge_script_file, 
+                       "cwd": os.path.join(wd, "m_%05d" % p_id),
+                       "job_id": p_id,
+                       "config": config}
+
+        make_merge_task = PypeTask( inputs = {"input_dep": input_dep},
+                                       outputs = {"job_done": job_done},
+                                       parameters = parameters,
+                                       TaskType = PypeThreadTaskBase,
+                                       URL = "task://localhost/m_%05d_%s" % (p_id, db_prefix) )
+        merge_task = make_merge_task ( task_run_las_merge)
+
+        merge_out["mjob_%d" % p_id] = job_done
+        merge_tasks.append(merge_task)
+
+
+        out_file = makePypeLocalFile(os.path.abspath( "%s/preads/out.%05d.fasta" % (wd, p_id)  ))
+        out_done = makePypeLocalFile(os.path.abspath( "%s/preads/c_%05d_done" % (wd, p_id)  ))
+        parameters =  {"cwd": os.path.join(wd, "preads" ),
+                       "job_id": p_id, 
+                       "prefix": db_prefix,
+                       "config": config}
+        make_c_task = PypeTask( inputs = {"job_done": job_done},
+                                outputs = {"out_file": out_file, "out_done": out_done },
+                                parameters = parameters,
+                                TaskType = PypeThreadTaskBase,
+                                URL = "task://localhost/ct_%05d" % p_id )
+        
+        c_task = make_c_task( task_run_consensus)
+        consensus_tasks.append(c_task)
+        consensus_out["cjob_%d" % p_id] = out_done 
+
+    return merge_tasks, merge_out, consensus_tasks, consensus_out
+
+
+
+def main1(prog_name, input_config_fn, logger_config_fn=None):
+    global fc_run_logger
+    fc_run_logger = support.setup_logger(logger_config_fn)
+
+    fc_run_logger.info( "fc_run started with configuration %s", input_config_fn ) 
+    config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn))
+    rawread_dir = os.path.abspath("./0-rawreads")
+    pread_dir = os.path.abspath("./1-preads_ovl")
+    falcon_asm_dir  = os.path.abspath("./2-asm-falcon")
+    script_dir = os.path.abspath("./scripts")
+    sge_log_dir = os.path.abspath("./sge_log")
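+    # Stage directories: 0-rawreads holds the raw-read DB, daligner overlaps
+    # and error-corrected preads; 1-preads_ovl holds the pread DB and
+    # pread-vs-pread overlaps; 2-asm-falcon holds the final string-graph
+    # assembly outputs.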
+
+    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
+        support.make_dirs(d)
+
+    concurrent_jobs = config["pa_concurrent_jobs"]
+    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
+    wf = PypeThreadWorkflow()
+
+    input_fofn_plf = makePypeLocalFile(os.path.basename(config["input_fofn_fn"]))
+    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
+    make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf},
+                                  outputs = {"o_fofn": rawread_fofn_plf},
+                                  parameters = {},
+                                  TaskType = PypeThreadTaskBase)
+    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)
+
+    wf.addTasks([fofn_abs_task])
+    wf.refreshTargets([fofn_abs_task])
+
+    if config["input_type"] == "raw":
+        #### import sequences into daligner DB
+        sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") )
+        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) 
+        run_jobs = makePypeLocalFile( os.path.join( rawread_dir, "run_jobs.sh") ) 
+        parameters = {"work_dir": rawread_dir,
+                      "config": config}
+
+        make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf},
+                                      outputs = {"rdb_build_done": rdb_build_done,
+                                                 "run_jobs": run_jobs}, 
+                                      parameters = parameters,
+                                      TaskType = PypeThreadTaskBase)
+        build_rdb_task = make_build_rdb_task(task_build_rdb)
+
+        wf.addTasks([build_rdb_task])
+        wf.refreshTargets([rdb_build_done]) 
+
+        db_file = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" ))
+        raw_reads_nblock = get_nblock(fn(db_file))
+        #### run daligner
+        daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), rawread_dir, "raw_reads", raw_reads_nblock, rdb_build_done, config) 
+
+        wf.addTasks(daligner_tasks)
+        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs
+        r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") )
+
+        parameters =  {
+                "nblock": raw_reads_nblock,
+        }
+        make_daligner_gather = PypeTask(
+                   inputs = daligner_out, 
+                   outputs =  {"da_done":r_da_done},
+                   parameters = parameters,
+                   TaskType = PypeThreadTaskBase,
+                   URL = "task://localhost/rda_check" )
+        check_r_da_task = make_daligner_gather(task_daligner_gather)
+        wf.addTask(check_r_da_task)
+        wf.refreshTargets(updateFreq = wait_time) # a larger number is better when there are more jobs; we must refresh here to run these jobs before the concurrency limit is changed
+        
+        concurrent_jobs = config["cns_concurrent_jobs"]
+        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
+        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
+        wf.addTasks( merge_tasks )
+        if config["target"] == "overlapping":
+            wf.refreshTargets(updateFreq = wait_time) # a larger number is better when there are more jobs; we must refresh here to run these jobs before the concurrency limit is changed
+            sys.exit(0)
+        wf.addTasks( consensus_tasks )
+
+        r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") )
+        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )
+
+        @PypeTask( inputs = consensus_out, 
+                   outputs =  {"cns_done":r_cns_done, "pread_fofn": pread_fofn},
+                   TaskType = PypeThreadTaskBase,
+                   URL = "task://localhost/cns_check" )
+        def check_r_cns_task(self):
+            with open(fn(self.pread_fofn),  "w") as f:
+                fn_list =  glob.glob("%s/preads/out*.fasta" % rawread_dir)
+                fn_list.sort()
+                for fa_fn in fn_list:
+                    print >>f, fa_fn
+            system("touch %s" % fn(self.cns_done))
+
+        wf.addTask(check_r_cns_task)
+        wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs
+
+    if config["target"] == "pre-assembly":
+        sys.exit(0)
+
+    # build pread database
+    if config["input_type"] == "preads":
+        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
+        make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf},
+                                     outputs = {"o_fofn": pread_fofn},
+                                     parameters = {},
+                                     TaskType = PypeThreadTaskBase)
+        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
+        wf.addTasks([fofn_abs_task])
+        wf.refreshTargets([fofn_abs_task])
+
+    pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") ) 
+    parameters = {"work_dir": pread_dir,
+                  "config": config}
+
+    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
+    make_build_pdb_task  = PypeTask(inputs = { "pread_fofn": pread_fofn },
+                                    outputs = { "pdb_build_done": pdb_build_done,
+                                                "run_jobs": run_jobs},
+                                    parameters = parameters,
+                                    TaskType = PypeThreadTaskBase,
+                                    URL = "task://localhost/build_pdb")
+    build_pdb_task = make_build_pdb_task(task_build_pdb)
+
+    wf.addTasks([build_pdb_task])
+    wf.refreshTargets([pdb_build_done]) 
+
+
+
+    db_file = makePypeLocalFile(os.path.join( pread_dir, "%s.db" % "preads" ))
+    preads_nblock = get_nblock(fn(db_file))
+    #### run daligner
+    concurrent_jobs = config["ovlp_concurrent_jobs"]
+    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
+    config["sge_option_da"] = config["sge_option_pda"]
+    config["sge_option_la"] = config["sge_option_pla"]
+    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), pread_dir, "preads", preads_nblock, pdb_build_done, config, pread_aln= True) 
+    wf.addTasks(daligner_tasks)
+    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs
+
+    p_da_done = makePypeLocalFile( os.path.join( pread_dir, "da_done") )
+    parameters =  {
+            "nblock": preads_nblock,
+    }
+    make_daligner_gather = PypeTask(
+                inputs = daligner_out, 
+                outputs =  {"da_done":p_da_done},
+                parameters = parameters,
+                TaskType = PypeThreadTaskBase,
+                URL = "task://localhost/pda_check" )
+    check_p_da_task = make_daligner_gather(task_daligner_gather)
+    wf.addTask(check_p_da_task)
+
+    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(fn(run_jobs), pread_dir, "preads", p_da_done, config)
+    wf.addTasks( merge_tasks )
+    #wf.refreshTargets(updateFreq = 30) #all
+
+    p_merge_done = makePypeLocalFile( os.path.join( pread_dir, "p_merge_done") )
+
+    @PypeTask( inputs = merge_out, 
+               outputs =  {"p_merge_done":p_merge_done},
+               TaskType = PypeThreadTaskBase,
+               URL = "task://localhost/pmerge_check" )
+    def check_p_merge_check_task(self):
+        system("touch %s" % fn(self.p_merge_done))
+    
+    wf.addTask(check_p_merge_check_task)
+    wf.refreshTargets(updateFreq = wait_time) #all
+
+    
+    falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") )
+    make_run_falcon_asm = PypeTask(
+               inputs = {"p_merge_done": p_merge_done, "db_file":db_file},
+               outputs =  {"falcon_asm_done":falcon_asm_done},
+               parameters = {"wd": falcon_asm_dir,
+                             "config": config,
+                             "pread_dir": pread_dir},
+               TaskType = PypeThreadTaskBase,
+               URL = "task://localhost/falcon" )
+    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
+    wf.refreshTargets(updateFreq = wait_time) #all
+
+
+def main(argv=sys.argv):
+    print(argv)
+    if len(argv) < 2:
+        sys.stderr.write( """
+you need to specify a configuration file
+usage: fc_run fc_run.cfg [logging.cfg]
+""")
+        sys.exit(2)
+    main1(*argv)
diff --git a/src/py/mains/tasks.py b/src/py/mains/tasks.py
new file mode 100644
index 0000000..7783a21
--- /dev/null
+++ b/src/py/mains/tasks.py
@@ -0,0 +1,31 @@
+"""Executable tasks.
+
+To be called by pbsmrtpipe.
+
+pypeFLOW uses its own adaptors instead.
+"""
+from .. import run_support as support
+import sys
+
+
+def help():
+    print("""
+Usage:
+    falcon-task [task] <[task-args]>
+
+tasks:
+    make-fofn-abs
+""")
+    sys.exit(2)
+
+def main_make_fofn_abs(i_fofn_fn, o_fofn_fn):
+    support.make_fofn_abs(i_fofn_fn, o_fofn_fn)
+
+def main(argv=sys.argv):
+    if len(argv) < 2 or argv[1].startswith('-'):
+        help()
+    task = argv[1]
+    tasks = {
+        'make-fofn-abs': main_make_fofn_abs,
+    }
+    return tasks[task](*argv[2:])
diff --git a/src/py/multiproc.py b/src/py/multiproc.py
new file mode 100644
index 0000000..408ea06
--- /dev/null
+++ b/src/py/multiproc.py
@@ -0,0 +1,25 @@
+"""Job pools for multiprocessing.
+"""
+import multiprocessing
+import itertools
+
+class FakePool(object):
+    """Fake version of multiprocessing.Pool
+    """
+    def map(self, func, iterable, chunksize=None):
+        return map(func, iterable)
+    def imap(self, func, iterable, chunksize=None):
+        return itertools.imap(func, iterable)
+    def __init__(self, initializer=None, initargs=[], *args, **kwds):
+        if initializer:
+            initializer(*initargs)
+
+def Pool(processes, *args, **kwds):
+    """Pool factory.
+    If 'not processes', return our FakePool;
+    otherwise, a multiprocessing.Pool.
+    """
+    if processes:
+        return multiprocessing.Pool(processes, *args, **kwds)
+    else:
+        return FakePool(*args, **kwds)
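+
+# A minimal usage sketch (the names here are illustrative, not part of this
+# module): Pool(0) or Pool(None) yields the serial FakePool, which is handy
+# for debugging, while Pool(8) behaves like multiprocessing.Pool(8):
+#
+#   exe_pool = Pool(n_core)
+#   for result in exe_pool.imap(do_work, work_items):
+#       handle(result)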
diff --git a/src/py/run_support.py b/src/py/run_support.py
new file mode 100644
index 0000000..32005dc
--- /dev/null
+++ b/src/py/run_support.py
@@ -0,0 +1,475 @@
+import ConfigParser
+import logging
+import logging.config
+import os
+import StringIO
+import sys
+import tempfile
+import time
+import uuid
+
+job_type = None
+logger = None
+
+def _prepend_env_paths(content, names):
+    """
+    E.g.
+      names = ['PATH', 'PYTHONPATH']
+      content =
+        echo hi
+      =>
+        export PATH=current:path:${PATH}
+        export PYTHONPATH=current:path:${PYTHONPATH}
+        echo hi
+    """
+    export_env_vars = ['export %(k)s=%(v)s:${%(k)s}' %dict(
+        k=name, v=os.environ.get(name, '')) for name in names]
+    return '\n'.join(export_env_vars + [content])
+
+def update_env_in_script(fn, names):
+    """Modify fn in place using _prepend_env_paths().
+    """
+    with open(fn) as ifs:
+        content = ifs.read()
+    content = _prepend_env_paths(content, names)
+    with open(fn, 'w') as ofs:
+        ofs.write(content)
+
+def use_tmpdir_for_files(basenames, src_dir, link_dir):
+    """Generate script to copy db files to tmpdir (for speed).
+    - Choose tmp_dir, based on src_dir name.
+    - rsync basenames into tmp_dir  # after 'flock', per file
+    - symlink from link_dir into tmp_dir.
+    Return list of script lines, sans linefeed.
+    """
+    script = list()
+    unique = os.path.abspath(src_dir).replace('/', '_')
+    root = tempfile.gettempdir()
+    tmp_dir = os.path.join(root, 'falcon', unique)
+    script.append('mkdir -p %s' %tmp_dir)
+    for basename in basenames:
+        src = os.path.join(src_dir, basename)
+        dst = os.path.join(tmp_dir, basename)
+        rm_cmd = 'rm -f %s' %basename
+        # Wait on lock for up to 10 minutes, in case of very large files.
+        rsync_cmd = "flock -w 600 %s.lock -c 'rsync -av %s %s'" %(dst, src, dst)
+        ln_cmd = 'ln -sf %s %s' %(dst, basename)
+        script.extend([rm_cmd, rsync_cmd, ln_cmd])
+    return script
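+
+# For example (paths illustrative, assuming /tmp is the system tmp dir),
+# use_tmpdir_for_files(['raw_reads.db'], '/wd/0-rawreads', '.') yields roughly:
+#   mkdir -p /tmp/falcon/_wd_0-rawreads
+#   rm -f raw_reads.db
+#   flock -w 600 /tmp/falcon/_wd_0-rawreads/raw_reads.db.lock -c 'rsync -av /wd/0-rawreads/raw_reads.db /tmp/falcon/_wd_0-rawreads/raw_reads.db'
+#   ln -sf /tmp/falcon/_wd_0-rawreads/raw_reads.db raw_reads.db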
+
+def make_job_data(url, script_fn):
+    """Choose defaults.
+    Run in same directory as script_fn.
+    Base job_name on script_fn.
+    """
+    wd = os.path.dirname(script_fn)
+    job_name = '{0}-{1}-{2}'.format(
+            os.path.basename(script_fn),
+            url.split("/")[-1],
+            str(uuid.uuid4())[:8],
+            )
+    job_data = {"job_name": job_name,
+                "cwd": wd,
+                "script_fn": script_fn }
+    return job_data
+
+def validate_config_dict(cd):
+    pass
+
+def get_config(config):
+    """Temporary version for pbsmrtpipe.
+    This will add missing (but currently required) options and use
+    get_dict_from_old_falcon_cfg() below.
+    The plan is to pass a simpler config from pbsmrtpipe,
+    but that will be in a different commit.
+    Side-effect: Update 'config'.
+    """
+    section = 'General'
+    def add(name, val):
+        if not config.has_option(section, name):
+            config.set(section, name, val)
+    add('input_fofn', 'NA')
+    add('target', 'assembly')
+    add('sge_option_da', 'NA')
+    add('sge_option_la', 'NA')
+    add('sge_option_pda', 'NA')
+    add('sge_option_pla', 'NA')
+    add('sge_option_fc', 'NA')
+    add('sge_option_cns', 'NA')
+    return get_dict_from_old_falcon_cfg(config)
+
+def parse_config(config_fn):
+    config = ConfigParser.ConfigParser()
+    config.read(config_fn)
+    return config
+
+def get_dict_from_old_falcon_cfg(config):
+    global job_type  # TODO: Stop using global for wait_for_file().
+    job_type = "SGE"
+    if config.has_option('General', 'job_type'):
+        job_type = config.get('General', 'job_type')
+
+    pa_concurrent_jobs = 8
+    if config.has_option('General', 'pa_concurrent_jobs'):
+        pa_concurrent_jobs = config.getint('General', 'pa_concurrent_jobs')
+
+    cns_concurrent_jobs = 8
+    if config.has_option('General', 'cns_concurrent_jobs'):
+        cns_concurrent_jobs = config.getint('General', 'cns_concurrent_jobs')
+
+    ovlp_concurrent_jobs = 8
+    if config.has_option('General', 'ovlp_concurrent_jobs'):
+        ovlp_concurrent_jobs = config.getint('General', 'ovlp_concurrent_jobs')
+
+    #appending = False
+    #if config.has_option('General', 'appending'):
+    #    appending = config.get('General', 'appending')
+    #    if appending == "True":
+    #        appending = True
+
+    openending = False
+    if config.has_option('General', 'openending'):
+        openending = config.get('General', 'openending')
+        if openending == "True":
+            openending = True
+
+    input_type = "raw"
+    if config.has_option('General', 'input_type'):
+        input_type = config.get('General', 'input_type')
+
+    overlap_filtering_setting =  """--max_diff 1000 --max_cov 1000 --min_cov 2"""
+    if config.has_option('General', 'overlap_filtering_setting'):
+        overlap_filtering_setting = config.get('General', 'overlap_filtering_setting')
+
+    pa_HPCdaligner_option = """-v -dal4 -t16 -e.70 -l1000 -s100"""
+    if config.has_option('General', 'pa_HPCdaligner_option'):
+        pa_HPCdaligner_option = config.get('General', 'pa_HPCdaligner_option')
+
+    ovlp_HPCdaligner_option = """ -v -dal24 -t32 -h60 -e.96 -l500 -s1000"""
+    if config.has_option('General', 'ovlp_HPCdaligner_option'):
+        ovlp_HPCdaligner_option = config.get('General', 'ovlp_HPCdaligner_option')
+
+    pa_DBsplit_option = """ -x500 -s200"""
+    if config.has_option('General', 'pa_DBsplit_option'):
+        pa_DBsplit_option = config.get('General', 'pa_DBsplit_option')
+
+    ovlp_DBsplit_option = """ -x500 -s200"""
+    if config.has_option('General', 'ovlp_DBsplit_option'):
+        ovlp_DBsplit_option = config.get('General', 'ovlp_DBsplit_option')
+
+    falcon_sense_option = """ --output_multi --min_idt 0.70 --min_cov 2 --local_match_count_threshold 0 --max_n_read 1800 --n_core 6"""
+    if config.has_option('General', 'falcon_sense_option'):
+        falcon_sense_option = config.get('General', 'falcon_sense_option')
+
+    falcon_sense_skip_contained = False
+    if config.has_option('General', 'falcon_sense_skip_contained'):
+        falcon_sense_skip_contained = config.get('General', 'falcon_sense_skip_contained')
+        if falcon_sense_skip_contained in ["True", "true", "1"]:
+            falcon_sense_skip_contained = True
+        else:
+            falcon_sense_skip_contained = False
+
+    length_cutoff = config.getint('General', 'length_cutoff')
+    input_fofn_fn = config.get('General', 'input_fofn')
+
+    length_cutoff_pr = config.getint('General', 'length_cutoff_pr')
+
+    bestn = 12
+    if config.has_option('General', 'bestn'):
+        bestn = config.getint('General', 'bestn')
+
+    if config.has_option('General', 'target'):
+        target = config.get('General', 'target')
+        if target not in ["overlapping", "pre-assembly", "assembly"]:
+            msg = """ Target has to be "overlapping", "pre-assembly" or "assembly" in this verison. You have an unknown target %s in the configuration file.  """ % target
+            raise Exception(msg)
+    else:
+        logger.info(""" No target specified, assuming "assembly" as target """)
+        target = "assembly"
+
+    if config.has_option('General', 'use_tmpdir'):
+        use_tmpdir = config.getboolean('General','use_tmpdir')
+    else:
+        use_tmpdir = False
+
+    hgap_config = {"input_fofn_fn" : input_fofn_fn,
+                   "target" : target,
+                   "job_type" : job_type,
+                   "input_type": input_type,
+                   "openending": openending,
+                   "pa_concurrent_jobs" : pa_concurrent_jobs,
+                   "ovlp_concurrent_jobs" : ovlp_concurrent_jobs,
+                   "cns_concurrent_jobs" : cns_concurrent_jobs,
+                   "overlap_filtering_setting": overlap_filtering_setting,
+                   "length_cutoff" : length_cutoff,
+                   "length_cutoff_pr" : length_cutoff_pr,
+                   "sge_option_da": config.get('General', 'sge_option_da'),
+                   "sge_option_la": config.get('General', 'sge_option_la'),
+                   "sge_option_pda": config.get('General', 'sge_option_pda'),
+                   "sge_option_pla": config.get('General', 'sge_option_pla'),
+                   "sge_option_fc": config.get('General', 'sge_option_fc'),
+                   "sge_option_cns": config.get('General', 'sge_option_cns'),
+                   "pa_HPCdaligner_option": pa_HPCdaligner_option,
+                   "ovlp_HPCdaligner_option": ovlp_HPCdaligner_option,
+                   "pa_DBsplit_option": pa_DBsplit_option,
+                   "ovlp_DBsplit_option": ovlp_DBsplit_option,
+                   "falcon_sense_option": falcon_sense_option,
+                   "falcon_sense_skip_contained": falcon_sense_skip_contained,
+                   "use_tmpdir": use_tmpdir,
+                   }
+
+    hgap_config["install_prefix"] = sys.prefix
+
+    return hgap_config
+
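+# Editor's note: a minimal sketch of the .cfg file that parse_config() /
+# get_dict_from_old_falcon_cfg() read; option names are the ones looked up
+# above, the values are purely illustrative:
+#
+#     [General]
+#     input_fofn = input.fofn
+#     length_cutoff = 12000
+#     length_cutoff_pr = 12000
+#     pa_concurrent_jobs = 32
+#     ovlp_concurrent_jobs = 32
+#
+# input_fofn, length_cutoff, and length_cutoff_pr are read here without a
+# fallback, so they must be present (or injected by get_config()); everything
+# else falls back to the defaults above.
+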
+default_logging_config = """
+[loggers]
+keys=root,pypeflow,fc_run
+
+[handlers]
+keys=stream,file_pypeflow,file_fc
+
+[formatters]
+keys=form01,form02
+
+[logger_root]
+level=NOTSET
+handlers=stream
+
+[logger_pypeflow]
+level=DEBUG
+handlers=file_pypeflow
+qualname=pypeflow
+propagate=1
+
+[logger_fc_run]
+level=NOTSET
+handlers=file_fc
+qualname=.
+propagate=1
+
+[handler_stream]
+class=StreamHandler
+level=INFO
+formatter=form02
+args=(sys.stderr,)
+
+[handler_file_pypeflow]
+class=FileHandler
+level=DEBUG
+formatter=form01
+args=('pypeflow.log',)
+
+[handler_file_fc]
+class=FileHandler
+level=DEBUG
+formatter=form01
+args=('fc.log',)
+
+[formatter_form01]
+format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
+
+[formatter_form02]
+format=[%(levelname)s]%(message)s
+"""
+
+def setup_logger(logging_config_fn):
+    """See https://docs.python.org/2/library/logging.config.html
+    """
+    logging.Formatter.converter = time.gmtime # cannot be done in .ini
+
+    if logging_config_fn:
+        logger_fileobj = open(logging_config_fn)
+    else:
+        logger_fileobj = StringIO.StringIO(default_logging_config)
+    defaults = {
+    }
+    logging.config.fileConfig(logger_fileobj, defaults=defaults, disable_existing_loggers=False)
+
+    global logger
+    logger = logging.getLogger("fc_run")
+    return logger
+
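+# Editor's note: a small usage sketch. With logging_config_fn=None the
+# default_logging_config above is used, so messages go to stderr and to
+# pypeflow.log / fc.log in the current directory:
+#
+#     log = setup_logger(None)
+#     log.info('logging configured')
+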
+def make_fofn_abs(i_fofn_fn, o_fofn_fn):
+    """Copy i_fofn to o_fofn, but with relative filenames expanded for CWD.
+    """
+    assert os.path.abspath(o_fofn_fn) != os.path.abspath(i_fofn_fn)
+    with open(i_fofn_fn) as ifs, open(o_fofn_fn, 'w') as ofs:
+        for line in ifs:
+            ifn = line.strip()
+            if not ifn: continue
+            abs_ifn = os.path.abspath(ifn)
+            ofs.write('%s\n' %abs_ifn)
+    #return o_fofn_fn
+
+def make_dirs(d):
+    if not os.path.isdir(d):
+        os.makedirs(d)
+
+def build_rdb(input_fofn_fn, work_dir, config, job_done, script_fn, run_jobs_fn):
+    length_cutoff = config["length_cutoff"]
+    pa_HPCdaligner_option = config["pa_HPCdaligner_option"]
+    pa_DBsplit_option = config["pa_DBsplit_option"]
+    openending = config["openending"]
+
+    last_block = 1
+    new_db = True
+    if os.path.exists( os.path.join(work_dir, "raw_reads.db") ):
+        with open(  os.path.join(work_dir, "raw_reads.db") ) as f:
+            for l in f:
+                l = l.strip().split()
+                if l[0] == "blocks" and l[1] == "=":
+                    last_block = int(l[2])
+                    new_db = False
+                    break
+
+    with open(script_fn,"w") as script_file:
+        script_file.write("set -vex\n")
+        script_file.write("trap 'touch {job_done}.exit' EXIT\n".format(job_done = job_done))
+        script_file.write("cd {work_dir}\n".format(work_dir = work_dir))
+        script_file.write("hostname\n")
+        script_file.write("date\n")
+        #script_file.write("for f in `cat {input_fofn_fn}`; do fasta2DB raw_reads $f; done\n".format(input_fofn_fn = input_fofn_fn))
+        script_file.write("fasta2DB -v raw_reads -f{input_fofn_fn}\n".format(input_fofn_fn = input_fofn_fn))
+        if new_db  == True:
+            script_file.write("DBsplit %s raw_reads\n" % pa_DBsplit_option)
+            script_file.write("date\n")
+        if openending == True:
+            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3-1}')\n""")
+        else:
+            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3}')\n""")
+        script_file.write("HPCdaligner %s -H%d raw_reads %d-$LB > %s\n" %(
+            pa_HPCdaligner_option, length_cutoff, last_block, run_jobs_fn))
+        script_file.write("date\n")
+        script_file.write("touch {job_done}\n".format(job_done = job_done))
+
+def build_pdb(input_fofn_fn, work_dir, config, job_done, script_fn, run_jobs_fn):
+    length_cutoff_pr = config["length_cutoff_pr"]
+    ovlp_HPCdaligner_option = config["ovlp_HPCdaligner_option"]
+    ovlp_DBsplit_option = config["ovlp_DBsplit_option"]
+
+    with open(script_fn,"w") as script_file:
+        script_file.write("set -vex\n")
+        script_file.write("trap 'touch {job_done}.exit' EXIT\n".format(job_done = job_done))
+        script_file.write("cd {work_dir}\n".format(work_dir = work_dir))
+        script_file.write("hostname\n")
+        script_file.write("date\n")
+        script_file.write("fasta2DB -v preads -f{input_fofn_fn}\n".format(input_fofn_fn = input_fofn_fn))
+        script_file.write("DBsplit %s preads\n" % (ovlp_DBsplit_option))
+        script_file.write("HPCdaligner %s -H%d preads > %s\n" %(
+            ovlp_HPCdaligner_option, length_cutoff_pr, run_jobs_fn))
+        script_file.write("touch {job_done}\n".format(job_done = job_done))
+
+def run_falcon_asm(pread_dir, db_file, config, job_done, script_fn):
+    wd = os.path.dirname(script_fn)
+    overlap_filtering_setting = config["overlap_filtering_setting"]
+    length_cutoff_pr = config["length_cutoff_pr"]
+
+    script = []
+    script.append( "set -vex" )
+    script.append( "trap 'touch %s.exit' EXIT" % job_done )
+    script.append( "cd %s" % pread_dir )
+    script.append("date")
+    # Given preads.db,
+    # write preads4falcon.fasta, in 1-preads_ovl:
+    script.append( "DB2Falcon -U preads")
+    script.append("date")
+    script.append( "cd %s" % wd )
+    # Generate las.fofn:
+    script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir )
+    # Given, las.fofn,
+    # write preads.ovl:
+    script.append( """fc_ovlp_filter --db %s --fofn las.fofn %s --min_len %d > preads.ovl""" %\
+            (db_file, overlap_filtering_setting, length_cutoff_pr) )
+    script.append("date")
+    script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir)
+    # TODO: Figure out which steps need preads4falcon.fasta.
+
+    # Given preads.ovl,
+    # write sg_edges_list, c_path, utg_data, ctg_paths.
+    script.append( """fc_ovlp_to_graph preads.ovl --min_len %d > fc_ovlp_to_graph.log""" % length_cutoff_pr) # TODO: drop this logfile
+    script.append("date")
+    # Given sg_edges_list, utg_data, ctg_paths,
+    # Write p_ctg.fa and a_ctg_all.fa,
+    # plus a_ctg_base.fa, p_ctg_tiling_path, a_ctg_tiling_path, a_ctg_base_tiling_path:
+    script.append( """fc_graph_to_contig""" )
+    script.append("date")
+    # Given a_ctg_all.fa, write a_ctg.fa:
+    script.append( """fc_dedup_a_tigs""" )
+    script.append("date")
+    script.append( """touch %s""" % job_done)
+
+    with open(script_fn, "w") as script_file:
+        script_file.write("\n".join(script) + '\n')
+
+def run_daligner(daligner_cmd, db_prefix, nblock, config, job_done, script_fn):
+    cwd = os.path.dirname(script_fn)
+
+    script = []
+    script.append( "set -vex" )
+    script.append( "trap 'touch {job_done}.exit' EXIT".format(job_done = job_done) )
+    script.append( "cd %s" % cwd )
+    script.append( "hostname" )
+    script.append( "date" )
+    if config['use_tmpdir']:
+        basenames = [pattern.format(db_prefix) for pattern in ('.{}.idx', '.{}.bps', '{}.db')]
+        dst_dir = os.path.abspath(cwd)
+        src_dir = os.path.abspath(os.path.dirname(cwd)) # by convention
+        script.extend(use_tmpdir_for_files(basenames, src_dir, dst_dir))
+    script.append( "time "+ daligner_cmd )
+    script.append( "date" )
+    script.append( "touch {job_done}".format(job_done = job_done) )
+
+    with open(script_fn,"w") as script_file:
+        script_file.write("\n".join(script) + '\n')
+
+def run_las_merge(p_script_fn, job_done, config, script_fn):
+    cwd = os.path.dirname(script_fn)
+    script = []
+    script.append( "set -vex" )
+    script.append( "trap 'touch {job_done}.exit' EXIT".format(job_done = job_done) )
+    script.append( "cd %s" % cwd )
+    script.append( "hostname" )
+    script.append( "time bash %s" % p_script_fn )
+    script.append( "touch {job_done}".format(job_done = job_done) )
+
+    with open(script_fn,"w") as script_file:
+        script_file.write("date\n")
+        script_file.write("\n".join(script) + '\n')
+        script_file.write("date\n")
+
+def run_consensus(job_id, out_file_fn, prefix, config, job_done, script_fn):
+    cwd = os.path.dirname(script_fn)
+    falcon_sense_option = config["falcon_sense_option"]
+    length_cutoff = config["length_cutoff"]
+
+    script = []
+    script.append("set -vex")
+    script.append("set -o pipefail")
+    script.append("trap 'touch {job_done}.exit' EXIT".format(job_done = job_done))
+    script.append("cd ..")
+    script.append( "date" )
+    pipe = ''
+    if config["falcon_sense_skip_contained"]:
+        pipe += """LA4Falcon -H%d -fso %s las_files/%s.%d.las | """ % (length_cutoff, prefix, prefix, job_id)
+    else:
+        pipe += """LA4Falcon -H%d -fo %s las_files/%s.%d.las | """ % (length_cutoff, prefix, prefix, job_id)
+    pipe += """fc_consensus %s > %s""" % (falcon_sense_option, out_file_fn)
+    script.append(pipe)
+    script.append("date")
+    script.append("touch {job_done}".format(job_done = job_done))
+
+    c_script_fn = os.path.join(cwd, "cp_%05d.sh" % job_id)
+    with open(c_script_fn, "w") as f:
+        f.write('\n'.join(script + ['']))
+
+    script = []
+    script.append( "set -vex" )
+    script.append( "cd %s" % cwd )
+    script.append( "hostname" )
+    script.append( "date" )
+    script.append( "time bash %s" %os.path.basename(c_script_fn) )
+    script.append( "date" )
+
+    with open(script_fn,"w") as f:
+        f.write("\n".join(script + ['']))
diff --git a/src/py/util/__init__.py b/src/py/util/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/py/util/io.py b/src/py/util/io.py
new file mode 100644
index 0000000..2054604
--- /dev/null
+++ b/src/py/util/io.py
@@ -0,0 +1,162 @@
+"""I/O utilities
+Not specific to FALCON.
+"""
+import os
+import resource
+import shlex
+import subprocess as sp
+import sys
+import traceback
+
+def write_nothing(*args):
+    """
+    To use,
+      LOG = write_nothing
+    """
+
+def write_with_pid(*args):
+    msg = '[%d]%s\n' %(os.getpid(), ' '.join(args))
+    sys.stderr.write(msg)
+
+LOG = write_with_pid
+
+def logstats():
+    """This is useful 'atexit'.
+    """
+    LOG('maxrss:%9d' %(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))
+
+def reprarg(arg):
+    if (isinstance(arg, set) or isinstance(arg, list)
+            or isinstance(arg, tuple) or isinstance(arg, dict)):
+        if len(arg) > 9:
+            return '%s(%d elem)' %(type(arg).__name__, len(arg))
+    return repr(arg)
+
+def run_func(args):
+    """Wrap multiprocessing.Pool calls.
+    Usage:
+        pool.imap(run_func, [(func, arg0, arg1, ...), ...])
+    """
+    func = args[0]
+    try:
+        func_name = func.__name__
+    except:
+        func_name = repr(func) # but since it must be pickle-able, this should never happen.
+    args = args[1:]
+    try:
+        LOG('starting %s(%s)' %(func_name, ', '.join(reprarg(a) for a in args)))
+        logstats()
+        ret = func(*args)
+        logstats()
+        LOG('finished %s(%s)' %(func_name, ', '.join(reprarg(a) for a in args)))
+        return ret
+    except Exception:
+        raise Exception(traceback.format_exc())
+    except: # KeyboardInterrupt, SystemExit
+        LOG('interrupted %s(%s)' %(func_name, ', '.join(reprarg(a) for a in args)))
+        return
+
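+# Editor's note: a hedged usage sketch for run_func with multiprocessing.Pool;
+# 'work' and the argument values are hypothetical. Each work item is a tuple
+# whose first element is a picklable function and the rest are its arguments:
+#
+#     import multiprocessing
+#
+#     def work(x, y):
+#         return x + y
+#
+#     pool = multiprocessing.Pool(2)
+#     for result in pool.imap(run_func, [(work, i, i) for i in range(4)]):
+#         LOG('result=%d' % result)
+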
+def syscall(cmd):
+    """Return stdout, fully captured.
+    Wait for subproc to finish.
+    Log a warning if the output is empty.
+    Raise on non-zero exit-code.
+    """
+    LOG('$ %s >' %cmd)
+    output = sp.check_output(shlex.split(cmd))
+    if not output:
+        msg = '%r failed to produce any output.' %cmd
+        LOG('WARNING: %s' %msg)
+    return output
+
+def slurplines(cmd):
+    return syscall(cmd).splitlines()
+
+def streamlines(cmd):
+    """Stream stdout from cmd.
+    Let stderr fall through.
+    The returned reader will stop yielding when the subproc exits.
+    Note: We do not detect a failure in the underlying process.
+    """
+    LOG('$ %s |' %cmd)
+    proc = sp.Popen(shlex.split(cmd), stdout=sp.PIPE)
+    return proc.stdout
+
+class DataReaderContext(object):
+    def readlines(self):
+        output = self.data.strip()
+        for line in output.splitlines():
+            yield line
+    def __enter__(self):
+        pass
+    def __exit__(self, *args):
+        self.returncode = 0
+    def __init__(self, data):
+        self.data = data
+class ProcessReaderContext(object):
+    """Prefer this to slurplines() or streamlines().
+    """
+    def __enter__(self):
+        self.proc = sp.Popen(shlex.split(self.cmd), stdout=sp.PIPE)
+    def __exit__(self, etype, evalue, etb):
+        if etype is None:
+            self.proc.wait()
+        else:
+            # Exception was raised in "with-block".
+            # We cannot wait on proc b/c it might never finish!
+            pass
+        self.returncode = self.proc.returncode
+        if self.returncode:
+            msg = "%r <- %r" %(self.returncode, self.cmd)
+            raise Exception(msg)
+        del self.proc
+    def __init__(self, cmd):
+        self.cmd = cmd
+class CapturedProcessReaderContext(ProcessReaderContext):
+    def readlines(self):
+        """Usage:
+
+            cmd = 'ls -l'
+            reader = CapturedProcessReaderContext(cmd)
+            with reader:
+                for line in reader.readlines():
+                    print line
+
+        Any exception within the 'with-block' is propagated.
+        Otherwise, after all lines are read, if 'cmd' failed, Exception is raised.
+        """
+        output, _ = self.proc.communicate()
+        for line in output.splitlines():
+            yield line
+class StreamedProcessReaderContext(ProcessReaderContext):
+    def readlines(self):
+        """Usage:
+
+            cmd = 'ls -l'
+            reader = StreamedProcessReaderContext(cmd)
+            with reader:
+                for line in reader.readlines():
+                    print line
+
+        Any exception within the 'with-block' is propagated.
+        Otherwise, after all lines are read, if 'cmd' failed, Exception is raised.
+        """
+        for line in self.proc.stdout:
+            yield line
+
+def filesize(fn):
+    """In bytes.
+    Raise if fn does not exist.
+    """
+    statinfo = os.stat(fn)
+    return statinfo.st_size
+
+def validated_fns(fofn):
+    """Return list of filenames from fofn.
+    Assert none are empty or non-existent.
+    """
+    fns = open(fofn).read().strip().split("\n")
+    for fn in fns:
+        assert fn
+        assert os.path.isfile(fn)
+        assert filesize(fn)
+    return fns
diff --git a/src/py_scripts/fc_actg_coordinate.py b/src/py_scripts/fc_actg_coordinate.py
new file mode 100644
index 0000000..4a2df92
--- /dev/null
+++ b/src/py_scripts/fc_actg_coordinate.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.actg_coordinate import main
+import sys
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/src/py_scripts/fc_consensus.py b/src/py_scripts/fc_consensus.py
new file mode 100644
index 0000000..8f3e7c1
--- /dev/null
+++ b/src/py_scripts/fc_consensus.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.consensus import main
+import sys
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/src/py_scripts/fc_contig_annotate.py b/src/py_scripts/fc_contig_annotate.py
new file mode 100644
index 0000000..05b44ba
--- /dev/null
+++ b/src/py_scripts/fc_contig_annotate.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.contig_annotate import main
+import sys
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/src/py_scripts/fc_ctg_link_analysis.py b/src/py_scripts/fc_ctg_link_analysis.py
new file mode 100644
index 0000000..e55ba46
--- /dev/null
+++ b/src/py_scripts/fc_ctg_link_analysis.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.ctg_link_analysis import main
+import sys
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/src/py_scripts/fc_dedup_a_tigs.py b/src/py_scripts/fc_dedup_a_tigs.py
new file mode 100644
index 0000000..0ed9453
--- /dev/null
+++ b/src/py_scripts/fc_dedup_a_tigs.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.dedup_a_tigs import main
+import sys
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/src/py_scripts/fc_graph_to_contig.py b/src/py_scripts/fc_graph_to_contig.py
new file mode 100644
index 0000000..6330503
--- /dev/null
+++ b/src/py_scripts/fc_graph_to_contig.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.graph_to_contig import main
+import sys
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/src/py_scripts/fc_graph_to_utgs.py b/src/py_scripts/fc_graph_to_utgs.py
new file mode 100644
index 0000000..f50cd57
--- /dev/null
+++ b/src/py_scripts/fc_graph_to_utgs.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.graph_to_utgs import main
+import sys
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/src/py_scripts/fc_ovlp_filter.py b/src/py_scripts/fc_ovlp_filter.py
new file mode 100644
index 0000000..fdef7e8
--- /dev/null
+++ b/src/py_scripts/fc_ovlp_filter.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.ovlp_filter import main
+import sys
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/src/py_scripts/fc_ovlp_stats.py b/src/py_scripts/fc_ovlp_stats.py
new file mode 100644
index 0000000..84b774e
--- /dev/null
+++ b/src/py_scripts/fc_ovlp_stats.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.ovlp_stats import main
+import sys
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/src/py_scripts/fc_ovlp_to_graph.py b/src/py_scripts/fc_ovlp_to_graph.py
new file mode 100644
index 0000000..37aa38e
--- /dev/null
+++ b/src/py_scripts/fc_ovlp_to_graph.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.ovlp_to_graph import main
+import sys
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/src/py_scripts/fc_run.py b/src/py_scripts/fc_run.py
new file mode 100644
index 0000000..3f15bb6
--- /dev/null
+++ b/src/py_scripts/fc_run.py
@@ -0,0 +1,5 @@
+from falcon_kit.mains.run import main
+import sys
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/src/py_scripts/falcon_asm.py b/src/py_scripts_v0.1/falcon_asm.py
similarity index 100%
rename from src/py_scripts/falcon_asm.py
rename to src/py_scripts_v0.1/falcon_asm.py
diff --git a/src/py_scripts/falcon_asm_dev.py b/src/py_scripts_v0.1/falcon_asm_s.py
similarity index 55%
rename from src/py_scripts/falcon_asm_dev.py
rename to src/py_scripts_v0.1/falcon_asm_s.py
index 610a89f..720b2e2 100755
--- a/src/py_scripts/falcon_asm_dev.py
+++ b/src/py_scripts_v0.1/falcon_asm_s.py
@@ -37,7 +37,6 @@
 # SUCH DAMAGE.
 #################################################################################$$
 
-
 from pbcore.io import FastaReader
 import networkx as nx
 import os
@@ -45,7 +44,12 @@ import shlex
 import sys
 import subprocess
 
+DEBUG_LOG_LEVEL = 0
+
 class SGNode(object):
+    """
+    class representing a node in the string graph
+    """
     def __init__(self, node_name):
         self.name = node_name
         self.out_edges = []
@@ -56,6 +60,9 @@ class SGNode(object):
         self.in_edges.append(in_edge)
 
 class SGEdge(object):
+    """
+    class representing an edge in the string graph
+    """
     def __init__(self, in_node, out_node):
         self.in_node = in_node
         self.out_node = out_node
@@ -63,7 +70,15 @@ class SGEdge(object):
     def set_attribute(self, attr, value):
         self.attr[attr] = value
 
+def reverse_end( node_id ):
+    node_id, end = node_id.split(":")
+    new_end = "B" if end == "E" else "E"
+    return node_id + ":" + new_end
+
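+# Editor's note: reverse_end flips the orientation tag of a node id used
+# throughout this file ("<read_id>:B" or "<read_id>:E"), e.g.
+# reverse_end("000123:E") -> "000123:B".
+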
 class StringGraph(object):
+    """
+    class representing the string graph
+    """
     def __init__(self):
         self.nodes = {}
         self.edges = {}
@@ -72,10 +87,16 @@ class StringGraph(object):
         self.repeat_overlap = {}
         
     def add_node(self, node_name):
+        """ 
+        add a node to the graph given a node name
+        """
         if node_name not in self.nodes:
             self.nodes[node_name] = SGNode(node_name)
     
     def add_edge(self, in_node_name, out_node_name, **attributes):
+        """ 
+        add an edge to the graph given a pair of node names
+        """
         if (in_node_name, out_node_name) not in self.edges:
         
             self.add_node(in_node_name)
@@ -90,15 +111,69 @@ class StringGraph(object):
         edge =  self.edges[ (in_node_name, out_node_name) ]
         for k, v in attributes.items():
             edge.attr[k] = v
-            
+
+    def init_reduce_dict(self):
+        for e in self.edges:
+            self.e_reduce[e] = False
+
+    def mark_chimer_edge(self):
+
+        for e_n, e in self.edges.items():
+            v = e_n[0]
+            w = e_n[1]
+            overlap_count = 0
+            for w_out_e in self.nodes[w].out_edges:
+                w_out_n = w_out_e.out_node.name
+                if (v, w_out_n) in self.edges:
+                    overlap_count += 1
+            for v_in_e in self.nodes[v].in_edges:
+                v_in_n = v_in_e.in_node.name
+                if (v_in_n, w) in self.edges:
+                    overlap_count += 1
+            if self.e_reduce[ (v, w) ] != True:
+                if overlap_count == 0:
+                    self.e_reduce[(v, w)] = True
+                    #print "XXX: chimer edge %s %s removed" % (v, w)
+                    v, w = reverse_end(w), reverse_end(v)
+                    self.e_reduce[(v, w)] = True
+                    #print "XXX: chimer edge %s %s removed" % (v, w)
+
+
+
+    def mark_spur_edge(self):
+
+        for  v in self.nodes:
+            if len(self.nodes[v].out_edges) > 1:
+                for out_edge in self.nodes[v].out_edges:
+                    w = out_edge.out_node.name
+                    
+                    if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
+                        #print "XXX: spur edge %s %s removed" % (v, w)
+                        self.e_reduce[(v, w)] = True
+                        v2, w2 = reverse_end(w), reverse_end(v)
+                        #print "XXX: spur edge %s %s removed" % (v2, w2)
+                        self.e_reduce[(v2, w2)] = True
+
+            if len(self.nodes[v].in_edges) > 1:
+                for in_edge in self.nodes[v].in_edges:
+                    w = in_edge.in_node.name
+                    if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
+                        #print "XXX: spur edge %s %s removed" % (w, v)
+                        self.e_reduce[(w, v)] = True
+                        v2, w2 = reverse_end(w), reverse_end(v)
+                        #print "XXX: spur edge %s %s removed" % (w2, v2)
+                        self.e_reduce[(w2, v2)] = True
+
+
     def mark_tr_edges(self):
+        """
+        transitive reduction
+        """
         n_mark = self.n_mark
         e_reduce = self.e_reduce
         FUZZ = 500
         for n in self.nodes:
             n_mark[n] = "vacant"
-        for e in self.edges:
-            e_reduce[e] = False
     
         for n_name, node in self.nodes.items():
 
@@ -113,7 +188,6 @@ class StringGraph(object):
                 n_mark[ w.name ] = "inplay"
             
             max_len = out_edges[-1].attr["length"]
-            #longest_edge = out_edges[-1]
                 
             max_len += FUZZ
             
@@ -147,62 +221,17 @@ class StringGraph(object):
                 w = out_edge.out_node
                 if n_mark[w.name] == "eliminated":
                     e_reduce[ (v.name, w.name) ] = True
+                    #print "XXX: tr edge %s %s removed" % (v.name, w.name)
+                    v_name, w_name = reverse_end(w.name), reverse_end(v.name)
+                    e_reduce[(v_name, w_name)] = True
+                    #print "XXX: tr edge %s %s removed" % (v_name, w_name)
                 n_mark[w.name] = "vacant"
                 
-    def mark_repeat_overlap(self):
-        repeat_overlap = self.repeat_overlap
-        in_degree = {}
-        for n in self.nodes:
-            c = 0
-            for e in self.nodes[n].in_edges:
-                v = e.in_node
-                w = e.out_node
-                if self.e_reduce[(v.name, w.name)] == False:
-                    c += 1
-            in_degree[n] = c
-            #print n,c
-        #print len([x for x in in_degree.items() if x[1]>1])
-         
-        for e_n, e in self.edges.items():
-            v = e.in_node
-            w = e.out_node
-            if self.e_reduce[(v.name, w.name)] == False:
-                repeat_overlap[ (v.name, w.name) ] = False
-            else:
-                repeat_overlap[ (v.name, w.name) ] = True
-            
-        for n in self.nodes:
-            if len(self.nodes[n].out_edges) < 2:
-                continue
-            min_in_deg = None
-            for e in self.nodes[n].out_edges:
-                v = e.in_node
-                w = e.out_node
-                #print n, v.name, w.name
-                if self.e_reduce[ (v.name, w.name) ] == True:
-                    continue
-                if min_in_deg == None:
-                    min_in_deg = in_degree[w.name]
-                    continue
-                if in_degree[w.name] < min_in_deg:
-                    min_in_deg = in_degree[w.name]
-                #print n, w.name, in_degree[w.name]
-            for e in self.nodes[n].out_edges:
-                v = e.in_node
-                w = e.out_node
-                assert (v.name, w.name) in self.edges
-                if in_degree[w.name] > min_in_deg:
-                    if self.e_reduce[(v.name, w.name)] == False:
-                        repeat_overlap[ (v.name, w.name) ] = True
-                        
-                    
-        for e_n, e in self.edges.items():
-            v = e.in_node
-            w = e.out_node
-            if repeat_overlap[ (v.name, w.name) ] == True:
-                self.e_reduce[(v.name, w.name)] == True
 
     def mark_best_overlap(self):
+        """
+        find the best overlapped edges
+        """
 
         best_edges = set()
 
@@ -220,7 +249,8 @@ class StringGraph(object):
                 e = in_edges[-1]
                 best_edges.add( (e.in_node.name, e.out_node.name) )
 
-        print "X", len(best_edges)
+        if DEBUG_LOG_LEVEL > 1:
+            print "X", len(best_edges)
 
         for e_n, e in self.edges.items():
             v = e_n[0]
@@ -228,22 +258,10 @@ class StringGraph(object):
             if self.e_reduce[ (v, w) ] != True:
                 if (v, w) not in best_edges:
                     self.e_reduce[(v, w)] = True
-
-    def mark_best_overlap_2(self):
-        best_edges = set()
-        for e in self.edges:
-            v, w = e
-            if w == self.get_best_out_edge_for_node(v).out_node.name and\
-               v == self.get_best_in_edge_for_node(w).in_node.name:
-                   best_edges.add( (v, w) )
-
-        for e_n, e in self.edges.items():
-            v = e_n[0]
-            w = e_n[1]
-            if self.e_reduce[ (v, w) ] != True:
-                if (v, w) not in best_edges:
-                    self.e_reduce[(v, w)] = True
-                    #print sum( [1 for e_n in self.edges if self.e_reduce[ e_n ] == False] )
+                    #print "XXX: in best edge %s %s removed" % (v, w)
+                    v2, w2 = reverse_end(w), reverse_end(v)
+                    #print "XXX: in best edge %s %s removed" % (v2, w2)
+                    self.e_reduce[(v2, w2)] = True
                 
     def get_out_edges_for_node(self, name, mask=True):
         rtn = []
@@ -287,13 +305,9 @@ class StringGraph(object):
         
 
 RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
-def generate_contig_from_path(sg, seqs, path):
+def generate_seq_from_path(sg, seqs, path):
     subseqs = []
     r_id, end = path[0].split(":")
-    if end == "B":
-        subseqs= [ "".join( [RCMAP[c] for c in seqs[r_id][::-1]] ) ]
-    else:
-        subseqs=[ seqs[r_id] ]
     
     count = 0
     for i in range( len( path ) -1 ):
@@ -311,7 +325,27 @@ def generate_contig_from_path(sg, seqs, path):
     return "".join(subseqs)
 
 
+def reverse_path( path ):
+    new_path = []
+    for n in list(path[::-1]):
+        rid, end = n.split(":")
+        new_end = "B" if end == "E" else "E"
+        new_path.append( rid+":"+new_end)
+    return new_path
+
+
 def generate_unitig(sg, seqs, out_fn, connected_nodes = None):
+
+    """
+    given a string graph (sg) and the sequences (seqs), write the unitig fasta file into out_fn
+    the function returns uni_edges, a dict of unitig paths and sequences keyed by the (begin, end) node pair
+
+    some extra files generated:
+        unit_edges.dat : an easy-to-parse file for unitig data
+        unit_edge_paths : the file containing the path of every unitig
+        uni_graph.gexf : the unitig graph in gexf format for visualization (writing it is currently commented out)
+    """
+
     G = SGToNXG(sg)
     if connected_nodes != None:
         connected_nodes = set(sg.nodes)
@@ -325,7 +359,7 @@ def generate_unitig(sg, seqs, out_fn, connected_nodes = None):
     edges_in_tigs = set()
 
     uni_edges = {}
-    path_f = open("paths","w")
+    path_f = open("unit_edge_paths","w")
     uni_edge_f = open("unit_edges.dat", "w")
     while len(sg_edges) > 0:
         v, w = sg_edges.pop()
@@ -341,7 +375,6 @@ def generate_unitig(sg, seqs, out_fn, connected_nodes = None):
             upstream_nodes.append(p_node.name)
             if (p_node.name, c_node) not in  sg_edges:
                 break
-            sg_edges.remove( (p_node.name, c_node) )
             p_in_edges = sg.get_in_edges_for_node(p_node.name)
             p_out_edges = sg.get_out_edges_for_node(p_node.name)
             c_node = p_node.name
@@ -357,53 +390,59 @@ def generate_unitig(sg, seqs, out_fn, connected_nodes = None):
             downstream_nodes.append(n_node.name)
             if (c_node, n_node.name) not in  sg_edges:
                 break
-            sg_edges.remove( (c_node, n_node.name) )
             n_out_edges = sg.get_out_edges_for_node(n_node.name)
             n_in_edges = sg.get_in_edges_for_node(n_node.name)
             c_node = n_node.name 
         
         whole_path = upstream_nodes + [v, w] + downstream_nodes
-        #print len(whole_path)
         count += 1
-        subseqs = []
-        for i in range( len( whole_path ) - 1):
-            v_n, w_n = whole_path[i:i+2]
-            
-            edge = sg.edges[ (v_n, w_n ) ]
-            edges_in_tigs.add( (v_n, w_n ) )
-            #print n, next_node.name, e.attr["label"]
-            
-            read_id, coor = edge.attr["label"].split(":")
-            b,e = coor.split("-")
-            b = int(b)
-            e = int(e)
-            if b < e:
-                subseqs.append( seqs[read_id][b:e] )
-            else:
-                try:
-                    subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
-                except:
-                    print seqs[read_id]
-            
+        subseq = generate_seq_from_path(sg, seqs, whole_path) 
+        #subseq = ""
         uni_edges.setdefault( (whole_path[0], whole_path[-1]), [] )
-        uni_edges[(whole_path[0], whole_path[-1])].append(  ( whole_path, "".join(subseqs) ) )
-        print >> uni_edge_f, whole_path[0], whole_path[-1], "-".join(whole_path), "".join(subseqs)
-
+        uni_edges[(whole_path[0], whole_path[-1])].append(  ( whole_path, subseq ) )
+        print >> uni_edge_f, whole_path[0], whole_path[-1], "-".join(whole_path), subseq
         print >>path_f, ">%05dc-%s-%s-%d %s" % (count, whole_path[0], whole_path[-1], len(whole_path), " ".join(whole_path))
-
         print >>out_fasta, ">%05dc-%s-%s-%d" % (count, whole_path[0], whole_path[-1], len(whole_path))
-        print >>out_fasta,"".join(subseqs)
+        print >>out_fasta, subseq
+        for i in range( len( whole_path ) -1 ):
+            w_n, v_n = whole_path[i:i+2]
+            try:
+                sg_edges.remove( (w_n, v_n) )
+            except KeyError: #if an edge is already deleted, ignore it
+                pass
+
+        r_whole_path = reverse_path( whole_path )
+        count += 1
+        subseq = generate_seq_from_path(sg, seqs, r_whole_path) 
+        #subseq = ""
+        uni_edges.setdefault( (r_whole_path[0], r_whole_path[-1]), [] )
+        uni_edges[(r_whole_path[0], r_whole_path[-1])].append(  ( r_whole_path, subseq ) )
+        print >> uni_edge_f, r_whole_path[0], r_whole_path[-1], "-".join(r_whole_path), subseq
+        print >>path_f, ">%05dc-%s-%s-%d %s" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path), " ".join(r_whole_path))
+        print >>out_fasta, ">%05dc-%s-%s-%d" % (count, r_whole_path[0], r_whole_path[-1], len(r_whole_path))
+        print >>out_fasta, subseq
+        for i in range( len( r_whole_path ) -1 ):
+            w_n, v_n = r_whole_path[i:i+2]
+            try:
+                sg_edges.remove( (w_n, v_n) )
+            except KeyError: #if an edge is already deleted, ignore it
+                pass
+
+
     path_f.close()
     uni_edge_f.close()
-    uni_graph = nx.DiGraph()
-    for n1, n2 in uni_edges.keys():
-        uni_graph.add_edge(n1, n2, weight = len( uni_edges[ (n1,n2) ] ))
-    nx.write_gexf(uni_graph, "uni_graph.gexf")
+    #uni_graph = nx.DiGraph()
+    #for n1, n2 in uni_edges.keys():
+    #    uni_graph.add_edge(n1, n2, count = len( uni_edges[ (n1,n2) ] ))
+    #nx.write_gexf(uni_graph, "uni_graph.gexf")
 
     out_fasta.close()
     return uni_edges
 
 def neighbor_bound(G, v, w, radius):
+    """
+    test if the node v and the node w are connected within a radius in graph G
+    """
     g1 = nx.ego_graph(G, v, radius=radius, undirected=False)
     g2 = nx.ego_graph(G, w, radius=radius, undirected=False)
     if len(set(g1.edges()) & set(g2.edges())) > 0:
@@ -413,6 +452,10 @@ def neighbor_bound(G, v, w, radius):
 
 
 def is_branch_node(G, n):
+    """
+    test whether the node n is a "branch node" which the paths from any of two of 
+    its offsprings do not intersect within a given radius
+    """
     out_edges = G.out_edges([n])
     n2 = [ e[1] for e in out_edges ]
     is_branch = False
@@ -428,18 +471,26 @@ def is_branch_node(G, n):
     return is_branch
 
 
-def get_bundle( path, u_graph, u_edges ):
-    
-    # find a sub-graph contain the nodes between the start and the end of the path
+def get_bundle( path, u_graph, u_graph_r ):
+
+    """ 
+    find a sub-graph containing the nodes between the start and the end of the path
+    inputs: 
+        u_graph : a unitig graph
+    returns:
+        bundle_graph: the whole bundle graph 
+        bundle_paths: the paths in the bundle graph 
+        sub_graph2_edges: all edges of the bundle graph
     
+    """
+
     p_start, p_end = path[0], path[-1]
     p_nodes = set(path)
     p_edges = set(zip(path[:-1], path[1:]))
-    u_graph_r = u_graph.reverse()
+
     down_path = nx.ego_graph(u_graph, p_start, radius=len(p_nodes), undirected=False)
     up_path = nx.ego_graph(u_graph_r, p_end, radius=len(p_nodes), undirected=False)
     subgraph_nodes = set(down_path) & set(up_path)
-    #print len(path), len(down_path), len(up_path), len(bundle_nodes)
     
 
     sub_graph = nx.DiGraph()
@@ -472,10 +523,7 @@ def get_bundle( path, u_graph, u_edges ):
         
     while len(tips) != 0:
         n = tips.pop()
-        #print "n", n
         out_edges = sub_graph.out_edges([n])
-        #out_edges = u_graph.out_edges([n])
-        #print out_edges 
         if len(out_edges) == 1:
             e = out_edges[0]
             sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
@@ -486,12 +534,10 @@ def get_bundle( path, u_graph, u_edges ):
                 orientation = "E" if orientation == "B" else "E"
                 visited.add( r_id +":" + orientation)
                 if not is_branch_node(sub_graph_r, e[1]): 
-                #if not is_branch_node(u_graph_r, e[1]): 
                     tips.add(e[1])
         else:
         
             is_branch = is_branch_node(sub_graph, n)
-            #is_branch = is_branch_node(u_graph, n)
             if not is_branch:
                 for e in out_edges:
                     sub_graph2.add_edge(e[0], e[1], n_weight = u_graph[e[0]][e[1]]["n_weight"])
@@ -502,15 +548,13 @@ def get_bundle( path, u_graph, u_edges ):
                         orientation = "E" if orientation == "B" else "E"
                         visited.add( r_id +":" + orientation)
                         if not is_branch_node(sub_graph_r, e[1]):
-                        #if not is_branch_node(u_graph_r, e[1]):
                             tips.add(e[1])
         ct += 1
-        #print ct, len(tips)
     last_node = None
     longest_len = 0
+        
     sub_graph2_nodes = sub_graph2.nodes()
     sub_graph2_edges = sub_graph2.edges()
-        
 
 
     new_path = [path[0]]
@@ -533,39 +577,63 @@ def get_bundle( path, u_graph, u_edges ):
                 new_path = path_t
 
 
-    #new_path = nx.shortest_path(sub_graph2, path[0], last_node, "n_weight")
     path = new_path
-    print "new_path", path[0], last_node, len(sub_graph2_nodes), path
+
+    # clean up sub_graph2 according to new begin and end
+    sub_graph2_r = sub_graph2.reverse()
+    down_path = nx.ego_graph(sub_graph2, path[0], radius=len(path), undirected=False)
+    up_path = nx.ego_graph(sub_graph2_r, path[-1], radius=len(path), undirected=False)
+    subgraph_nodes = set(down_path) & set(up_path)
+    for v in sub_graph2_nodes:
+        if v not in subgraph_nodes:
+            sub_graph2.remove_node(v)
+    
+    if DEBUG_LOG_LEVEL > 1:
+        print "new_path", path[0], last_node, len(sub_graph2_nodes), path
 
 
     bundle_paths = [path]
     p_nodes = set(path)
     p_edges = set(zip(path[:-1], path[1:]))
+
+    sub_graph2_nodes = sub_graph2.nodes()
+    sub_graph2_edges = sub_graph2.edges()
+
     nodes_idx = dict( [ (n[1], n[0]) for n in enumerate(path) ]  )
     
          
     # create a list of subpath that has no branch
-    non_branch_subpaths = [ [] ]
-    non_branch_edges = set()
-    mtg_edges = set()
-    
-    for i in range(len(path)-1):
-        v, w = path[i:i+2]
-        if len(sub_graph2.successors(v)) == 1 and len(sub_graph2.predecessors(w)) == 1:
-            non_branch_subpaths[-1].append( (v, w) )
-            non_branch_edges.add( (v, w) )
+    non_branch_subpaths = []
+    wi = 0
+    vi = 0
+    v = path[0]
+    while v != path[-1] and wi < len(path)-1:
+        wi += 1
+        w = path[wi]
+        while len( sub_graph2.successors(w) ) == 1 and len( sub_graph2.predecessors(w) ) == 1 and wi < len(path)-1:
+            wi += 1
+            w = path[wi]
+        if  len( sub_graph2.successors(v) )!= 1 or len( sub_graph2.predecessors(w) )!= 1:
+            branched = True
         else:
-            if len(non_branch_subpaths[-1]) != 0:
-                non_branch_subpaths.append([])
-                
+            branched = False
+
+        if not branched:
+            non_branch_subpaths.append( path[vi:wi+1] )
+        v = w
+        vi = wi
+
     # create the accompany_graph that has the path of the alternative subpaths
     
     associate_graph = nx.DiGraph()
     for v, w in sub_graph2.edges_iter():
         if (v, w) not in p_edges:
             associate_graph.add_edge(v, w, n_weight = sub_graph2[v][w]["n_weight"])
-    #print "associate_graph size:", len(associate_graph)           
-    #print "non_branch_subpaths", non_branch_subpaths
+
+    if DEBUG_LOG_LEVEL > 1:
+        print "associate_graph size:", len(associate_graph)           
+        print "non_branch_subpaths",len(non_branch_subpaths), non_branch_subpaths
+
     # construct the bundle graph                
     associate_graph_nodes = set(associate_graph.nodes())
     bundle_graph = nx.DiGraph()
@@ -574,26 +642,28 @@ def get_bundle( path, u_graph, u_edges ):
         if len(non_branch_subpaths[i]) == 0 or len( non_branch_subpaths[i+1] ) == 0:
             continue
         e1, e2 = non_branch_subpaths[i: i+2]
-        v = e1[-1][-1]
-        w = e2[0][0]
+        v = e1[-1]
+        w = e2[0]
         if v == w:
             continue
-        #print v, w
         in_between_node_count = nodes_idx[w] - nodes_idx[v] 
         if v in associate_graph_nodes and w in associate_graph_nodes:
             try:
-                #print "p2",v, w, nx.shortest_path(accommpany_graph, v, w)
-                #print "p1",v, w, nx.shortest_path(bundle_graph, v, w)
                 a_path = nx.shortest_path(associate_graph, v, w, "n_weight")    
             except nx.NetworkXNoPath:
                 continue
             bundle_graph.add_path( a_path )      
             bundle_paths.append( a_path )
-    #bundle_graph_nodes = bundle_graph.nodes()
+
     return bundle_graph, bundle_paths, sub_graph2_edges
             
 def get_bundles(u_edges):
     
+    """
+    input: all unitig edges
+    output: the assembled primary_tigs.fa and all_tigs.fa
+    """
+
     ASM_graph = nx.DiGraph()
     out_f = open("primary_tigs.fa", "w")
     main_tig_paths = open("primary_tigs_paths","w")
@@ -602,7 +672,8 @@ def get_bundles(u_edges):
     max_weight = 0 
     for v, w in u_edges:
         x = max( [len(s[1]) for s in u_edges[ (v,w) ] ] )
-        print "W", v, w, x
+        if DEBUG_LOG_LEVEL > 1:
+            print "W", v, w, x
         if x > max_weight:
             max_weight = x
             
@@ -622,117 +693,162 @@ def get_bundles(u_edges):
 
         u_graph.add_edge(v, w, n_weight = max_weight - max( [len(s[1]) for s in  u_edges[ (v,w) ] ] ) )
     
+    bundle_edge_out = open("bundle_edges","w")
     bundle_index = 0
-    G = u_graph.copy()
-    visited_u_edges = set()
-    while len(G) > 0:
-        
-        root_nodes = set() 
-        for n in G: 
-            if G.in_degree(n) != 1 or G.out_degree(n) !=1 : 
-                root_nodes.add(n) 
-        
-        if len(root_nodes) == 0:  
-            root_nodes.add( G.nodes()[0] ) 
-        
-        candidates = [] 
-        
-        for n in list(root_nodes): 
-            sp =nx.single_source_shortest_path_length(G, n) 
-            sp = sp.items() 
-            sp.sort(key=lambda x : x[1]) 
-            longest = sp[-1] 
-            print "L", n, longest[0]
-            if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop 
-                continue
-            candidates.append ( (longest[1], n, longest[0]) ) 
 
-        if len(candidates) == 0:
-            print "no more candiate", len(G.edges()), len(G.nodes())
-            if len(G.edges()) > 0:
-                path = G.edges()[0] 
-            else:
-                break
-        else:
-            candidates.sort() 
-            
-            candidate = candidates[-1] 
-            
-            if candidate[1] == candidate[2]: 
-                G.remove_node(candidate[1]) 
-                continue 
-         
-            path = nx.shortest_path(G, candidate[1], candidate[2], "n_weight") 
-        print "X", path[0], path[-1], len(path)
-        
-        cmp_edges = set()
-        g_edges = set(G.edges())
-        new_path = []  
-        tail = True
-        # avioid confusion due to long palindrome sequence
-        for i in range( 0, len( path ) - 1 ):
-            v_n, w_n = path[i:i+2]
-            new_path.append(v_n)
-            #if (v_n, w_n) in cmp_edges or\
-            #    len(u_graph.out_edges(w_n)) > 5 or\
-            #    len(u_graph.in_edges(w_n)) > 5:
-            if (v_n, w_n) in cmp_edges: 
-                tail = False
-                break
 
-            r_id, end = v_n.split(":")
-            end = "E" if end == "B" else "B" 
-            v_n2 = r_id + ":" + end 
+    components = nx.weakly_connected_component_subgraphs(u_graph)
+    components = [ (len(c), c) for c in components ]
+    components.sort()
+    #components.reverse()
+    allS = len(u_graph)
+    ssG = 0.0
+    processed_overlaps = set()
+    for sG, G in components:
 
-            r_id, end = w_n.split(":")
-            end = "E" if end == "B" else "B" 
-            w_n2 = r_id + ":" + end 
+        ssG += sG
+        print "process graph of size ", sG, "%0.2f %0.2f" % (ssG, ssG/allS)
+        G_edges = set(G.edges())
 
-            if (w_n2, v_n2) in g_edges:
-                cmp_edges.add( (w_n2, v_n2) )
-        if tail:
-            new_path.append(w_n)
-                
+        dual_component = False
         
-        if len(new_path) > 1:
-            path = new_path
-            
-            print "Y", path[0], path[-1], len(path)
-            #bundle_graph, bundle_paths, bundle_graph_edges = get_bundle( path, u_graph, u_edges )
+        for v, w in list(G_edges):
+            v = v.split(":")[0]
+            w = w.split(":")[0]
+            if (v, w) in processed_overlaps:
+                dual_component = True
+                break
+
+        if dual_component == True:
+            continue
+
+        for v, w in list(G_edges):
+            v = v.split(":")[0]
+            w = w.split(":")[0]
+            processed_overlaps.add( (v,w) )
+            processed_overlaps.add( (w,v) )
 
-            bundle_graph, bundle_paths, bundle_graph_edges = get_bundle( path, G, G.edges() )
-            print "Z", bundle_paths[0][0], bundle_paths[0][-1]
-            print bundle_index, len(path), len(bundle_paths[0]), len(bundle_paths), len(bundle_graph_edges)
-            if len(bundle_graph_edges) > 0:
+        G_r = G.reverse()
+        visited_u_edges = set()
 
-                #ASM_graph.add_path(bundle_paths[0], ctg="%04d" % bundle_index)
-                extra_u_edges = []
+        while len(G) > 0:
+            out_f.flush()
+            main_tig_paths.flush()
+            sv_tigs.flush()
+            sv_tig_paths.flush()
+            
+            
+            #root_nodes = set() 
+            candidates = [] 
+            for n in G: 
+                sp =nx.single_source_shortest_path_length(G, n) 
+                sp = sp.items() 
+                sp.sort(key=lambda x : x[1]) 
+                longest = sp[-1] 
+                if DEBUG_LOG_LEVEL > 2:
+                    print "L", n, longest[0]
+                if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop 
+                    continue
+                candidates.append ( (longest[1], n, longest[0]) ) 
+
+                n = longest[0]
+                sp =nx.single_source_shortest_path_length(G_r, n) 
+                sp = sp.items() 
+                sp.sort(key=lambda x : x[1]) 
+                longest = sp[-1] 
+                if DEBUG_LOG_LEVEL > 2:
+                    print "L", n, longest[0]
+                if longest[0].split(":")[0] == n.split(":")[0]: #avoid a big loop 
+                    continue
+                candidates.append ( (longest[1], longest[0], n) ) 
+                if len(candidates) != 0:
+                    break
+
+            if len(candidates) == 0:
+                print "no more candiate", len(G.edges()), len(G.nodes())
+                if len(G_edges) > 0:
+                    path = G_edges.pop()
+                    G_edges.add(path)
+                    print path
+                else:
+                    break
+            else:
+                candidates.sort() 
+                
+                candidate = candidates[-1] 
                 
-                print >> main_tig_paths, ">%04d %s" % ( bundle_index, " ".join(bundle_paths[0]) )
-                subseqs = []
+                if candidate[1] == candidate[2]: 
+                    G.remove_node(candidate[1]) 
+                    G_r.remove_node(candidate[1])
+                    continue 
+             
+                path = nx.shortest_path(G, candidate[1], candidate[2], "n_weight") 
+
+            if DEBUG_LOG_LEVEL > 1:
+                print "X", path[0], path[-1], len(path)
             
-                for i in range(len(bundle_paths[0]) - 1): 
-                    v, w = bundle_paths[0][i:i+2]
-                    uedges = u_edges[ (v,w) ]
-                    uedges.sort( key= lambda x: len(x[0]) )
-                    subseqs.append( uedges[-1][1] )
-                    visited_u_edges.add( "-".join(uedges[-1][0]) ) 
-                    for ue in uedges:
-                        if "-".join(ue[0]) not in visited_u_edges:
-                            visited_u_edges.add("-".join(ue[0]))
-                            extra_u_edges.append(ue)
-                seq = "".join(subseqs)        
-                if len(seq) > 0:
-                    print >> out_f, ">%04d %s-%s" % (bundle_index, bundle_paths[0][0], bundle_paths[0][-1])
-                    print >> out_f, seq
+            cmp_edges = set()
+            #g_edges = set(G.edges())
+            new_path = []  
+            tail = True
+            # avoid confusion due to long palindrome sequences
+            if len(path) > 2:
+                for i in range( 0, len( path ) - 1 ):
+                    v_n, w_n = path[i:i+2]
+                    new_path.append(v_n)
+                    # the commented-out code below might be useful for filtering out some high-connectivity nodes
+                    #if (v_n, w_n) in cmp_edges or\
+                    #    len(u_graph.out_edges(w_n)) > 5 or\
+                    #    len(u_graph.in_edges(w_n)) > 5:
+                    if (v_n, w_n) in cmp_edges: 
+                        tail = False
+                        break
+
+                    r_id, end = v_n.split(":")
+                    end = "E" if end == "B" else "B" 
+                    v_n2 = r_id + ":" + end 
+
+                    r_id, end = w_n.split(":")
+                    end = "E" if end == "B" else "B" 
+                    w_n2 = r_id + ":" + end 
+
+                    if (w_n2, v_n2) in G_edges:
+                        cmp_edges.add( (w_n2, v_n2) )
+
+                if tail:
+                    new_path.append(w_n)
+            else:
+                new_path = path[:]
+                    
+            
+            if len(new_path) > 1:
+                path = new_path
                 
-                sv_tig_idx = 0
-                for sv_path in bundle_paths:
-                    print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(sv_path) )
-                    ASM_graph.add_path(sv_path, ctg="%04d" % bundle_index)
+                if DEBUG_LOG_LEVEL > 2:
+                    print "Y", path[0], path[-1], len(path)
+
+                bundle_graph, bundle_paths, bundle_graph_edges = get_bundle( path, G, G_r )
+                for bg_edge in bundle_graph_edges:
+                    print >> bundle_edge_out, bundle_index, "edge", bg_edge[0], bg_edge[1]
+                for path_ in bundle_paths:
+                    print >>bundle_edge_out, "path", bundle_index, " ".join(path_) 
+
+                edges_to_be_removed = set()
+                if DEBUG_LOG_LEVEL > 2:
+                    print "Z", bundle_paths[0][0], bundle_paths[0][-1]
+                    print bundle_index, len(path), len(bundle_paths[0]), len(bundle_paths), len(bundle_graph_edges)
+
+                if len(bundle_graph_edges) > 0:
+
+                    ASM_graph.add_path(bundle_paths[0], ctg="%04d" % bundle_index)
+                    extra_u_edges = []
+                    
+                    print >> main_tig_paths, ">%04d %s" % ( bundle_index, " ".join(bundle_paths[0]) )
                     subseqs = []
-                    for i in range(len(sv_path) - 1): 
-                        v, w = sv_path[i:i+2]
+                
+                    for i in range(len(bundle_paths[0]) - 1): 
+                        v, w = bundle_paths[0][i:i+2]
+                        edges_to_be_removed.add( (v,w) )
                         uedges = u_edges[ (v,w) ]
                         uedges.sort( key= lambda x: len(x[0]) )
                         subseqs.append( uedges[-1][1] )
@@ -742,68 +858,113 @@ def get_bundles(u_edges):
                                 visited_u_edges.add("-".join(ue[0]))
                                 extra_u_edges.append(ue)
                     seq = "".join(subseqs)        
-                    if len(seq) > 0: 
-                        print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, sv_path[0], sv_path[-1])
+                    sv_tig_idx = 0
+                    print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(bundle_paths[0]) )
+                    if len(seq) > 0:
+                        print >> out_f, ">%04d %s-%s" % (bundle_index, bundle_paths[0][0], bundle_paths[0][-1])
+                        print >> out_f, seq
+                        print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, bundle_paths[0][0], bundle_paths[0][-1])
                         print >> sv_tigs, "".join(subseqs)
+
                     sv_tig_idx += 1
-                for u_path, seq in extra_u_edges:
-                    #u_path = u_path.split("-")
-                    ASM_graph.add_edge(u_path[0], u_path[-1], ctg="%04d" % bundle_index)
-                    print >> sv_tig_paths, ">%04d-%04d-u %s" % ( bundle_index, sv_tig_idx, " ".join(u_path) )
-                    print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, u_path[0], u_path[-1])
-                    print >> sv_tigs, seq
-                    sv_tig_idx += 1
+
+                    for sv_path in bundle_paths[1:]:
+                        print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(sv_path) )
+                        ASM_graph.add_path(sv_path, ctg="%04d" % bundle_index)
+                        subseqs = []
+                        for i in range(len(sv_path) - 1): 
+                            v, w = sv_path[i:i+2]
+                            edges_to_be_removed.add( (v,w) )
+                            uedges = u_edges[ (v,w) ]
+                            uedges.sort( key= lambda x: len(x[0]) )
+                            subseqs.append( uedges[-1][1] )
+                            visited_u_edges.add( "-".join(uedges[-1][0]) ) 
+                            for ue in uedges:
+                                if "-".join(ue[0]) not in visited_u_edges:
+                                    visited_u_edges.add("-".join(ue[0]))
+                                    extra_u_edges.append(ue)
+                        seq = "".join(subseqs)        
+                        if len(seq) > 0: 
+                            print >> sv_tigs, ">%04d-%04d %s-%s" % (bundle_index, sv_tig_idx, sv_path[0], sv_path[-1])
+                            print >> sv_tigs, "".join(subseqs)
+                        sv_tig_idx += 1
+                    for u_path, seq in extra_u_edges:
+                        #u_path = u_path.split("-")
+                        ASM_graph.add_edge(u_path[0], u_path[-1], ctg="%04d" % bundle_index)
+                        print >> sv_tig_paths, ">%04d-%04d-u %s" % ( bundle_index, sv_tig_idx, " ".join(u_path) )
+                        print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, u_path[0], u_path[-1])
+                        print >> sv_tigs, seq
+                        sv_tig_idx += 1
+                        
                     
-                
-                bundle_index += 1
+                    bundle_index += 1
             else:
+                #TODO, consolidate code here
+                v, w = path
+                uedges = u_edges[ (v,w) ]
+                uedges.sort( key= lambda x: len(x[0]) )
+                subseqs = [ uedges[-1][1] ]  # start a fresh subseq list for this two-node path
+                seq = "".join(subseqs)
+                sv_tig_idx = 0
+                print >> sv_tig_paths, ">%04d-%04d %s" % ( bundle_index, sv_tig_idx, " ".join(path) )
+                print >> sv_tigs, ">%04d-%04d-u %s-%s" % (bundle_index, sv_tig_idx, path[0], path[-1])
+                print >> sv_tigs, seq
+                sv_tig_idx += 1
+                bundle_index += 1
                 bundle_graph_edges = zip(path[:-1],path[1:])
-        else:
-            bundle_graph_edges = zip(path[:-1],path[1:])
-        
-        #clean up the graph
-
-        edges = set(G.edges())
-        edges_to_be_removed = list(set(bundle_graph_edges))
-        print "BGE",bundle_graph_edges
-        
-        edge_remove_count = 0
-        for v, w in edges_to_be_removed:
-            if (v, w) in edges:
-                G.remove_edge( v, w )
-                edge_remove_count += 1
-                print "remove edge", w, v
-                
-        edges = set(G.edges())
-        for v, w in edges_to_be_removed:
-
-            r_id, end = v.split(":")
-            end = "E" if end == "B" else "B"
-            v = r_id + ":" + end
-
-            r_id, end = w.split(":")
-            end = "E" if end == "B" else "B"
-            w = r_id + ":" + end
+            
+            #clean up the graph
 
-            if (w, v) in edges:
-                G.remove_edge( w, v )
-                edge_remove_count += 1
-                print "remove edge", w, v
+            edges = set(G.edges())
+            edges_to_be_removed |= set(bundle_graph_edges)
 
-        if edge_remove_count == 0:
-            print "premature termination", len(edges), len(G.nodes())
-            break
+            if DEBUG_LOG_LEVEL > 2:
+                print "BGE",bundle_graph_edges
             
-        nodes = G.nodes()
-        for n in nodes:
-            if G.in_degree(n) == 0 and G.out_degree(n) == 0:
-                G.remove_node(n)
-                print "remove node", n 
+            edge_remove_count = 0
+            for v, w in edges_to_be_removed:
+                if (v, w) in edges:
+                    G.remove_edge( v, w )
+                    G_r.remove_edge( w, v )
+                    G_edges.remove( (v, w) )
+                    edge_remove_count += 1
+                    if DEBUG_LOG_LEVEL > 2:
+                        print "remove edge", bundle_index, w, v
+                    
+            edges = set(G.edges())
+            for v, w in edges_to_be_removed:
+
+                r_id, end = v.split(":")
+                end = "E" if end == "B" else "B"
+                v = r_id + ":" + end
+
+                r_id, end = w.split(":")
+                end = "E" if end == "B" else "B"
+                w = r_id + ":" + end
+
+                if (w, v) in edges:
+                    G.remove_edge( w, v )
+                    G_edges.remove( (w, v) )
+                    G_r.remove_edge( v, w )
+                    edge_remove_count += 1
+                    if DEBUG_LOG_LEVEL > 2:
+                        print "remove edge", bundle_index, w, v
+
+            if edge_remove_count == 0:
+                break
+                
+            nodes = G.nodes()
+            for n in nodes:
+                if G.in_degree(n) == 0 and G.out_degree(n) == 0:
+                    G.remove_node(n)
+                    G_r.remove_node(n)
+                    if DEBUG_LOG_LEVEL > 2:
+                        print "remove node", n 
 
     sv_tig_paths.close()
     sv_tigs.close()
     main_tig_paths.close()
     out_f.close()
+    bundle_edge_out.close()
     return ASM_graph
 
 
@@ -828,14 +989,45 @@ def SGToNXG(sg):
     return G
 
 if __name__ == "__main__":
+
+    import argparse
     
-    overlap_file = sys.argv[1]
-    read_fasta = sys.argv[2]
+    parser = argparse.ArgumentParser(description='an example string graph assembler designed for handling diploid genomes')
+    parser.add_argument('overlap_file', help='a file that contains the overlap information')
+    parser.add_argument('read_fasta', help='the file that contains the sequences to be assembled')
+    parser.add_argument('--min_len', type=int, default=4000, 
+                        help='minimum read length to be considered for assembly')
+    parser.add_argument('--min_idt', type=float, default=96,
+                        help='minimum alignment identity for an overlap to be used in assembly')
+    parser.add_argument('--disable_chimer_prediction', action="store_true", default=False,
+                        help='disable chimer prediction; useful when reads are falsely flagged as chimers in low-coverage cases')
+
+    args = parser.parse_args()
 
+
+    overlap_file = args.overlap_file
+    read_fasta = args.read_fasta
+
+    contained_reads = set()
+    chimer_ids = set()
+
+    with open("rc_out_all") as f:
+        for l in f:
+            l = l.strip().split()
+            if l[1] == "2":
+                chimer_ids.add(l[0])
+            if l[1] == "1":
+                contained_reads.add(l[0])
+    print len(chimer_ids)
+    
     seqs = {}
-    #f = FastaReader("pre_assembled_reads.fa")
+    # load all p-reads into memory
     f = FastaReader(read_fasta)
     for r in f:
+        if r.name in contained_reads:
+            continue
+        if r.name in chimer_ids:
+            continue
         seqs[r.name] = r.sequence.upper()
 
     G=nx.Graph()
@@ -843,18 +1035,29 @@ if __name__ == "__main__":
     overlap_data = []
     contained_reads = set()
     overlap_count = {}
+
+
+    # loop through the overlap data and load it into a Python list;
+    # contained reads are identified along the way
+
     with open(overlap_file) as f:
         for l in f:
             l = l.strip().split()
+
+            # work around some ill-formed data records
             if len(l) != 13:
                 continue
+            
             f_id, g_id, score, identity = l[:4]
-            if f_id == g_id:
+            if f_id == g_id:  # skip self-overlaps
                 continue
-            if g_id not in seqs:
+
+            if g_id not in seqs: 
                 continue
+
             if f_id not in seqs:
                 continue
+
             score = int(score)
             identity = float(identity)
             contained = l[12]
@@ -866,17 +1069,20 @@ if __name__ == "__main__":
                 continue
             if contained == "none":
                 continue
-            if identity < 96:
+
+            if identity < args.min_idt: # only keep overlaps at or above the minimum identity (default 96%)
                 continue
             #if score > -2000:
             #    continue
             f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
             g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
-            if f_len < 4000: continue
-            if g_len < 4000: continue
+
+            # only use reads longer than the minimum length (default 4 kb) for assembly
+            if f_len < args.min_len: continue
+            if g_len < args.min_len: continue
             
             # double check for proper overlap
-            if f_start > 24 and f_len - f_end > 24:
+            if f_start > 24 and f_len - f_end > 24:  # allow a 24-base tolerance at each end of the overlap
                 continue
             
             if g_start > 24 and g_len - g_end > 24:
@@ -893,41 +1099,37 @@ if __name__ == "__main__":
                 if g_start < 24 and f_start > 24:
                     continue
 
-            #if g_strain != 0:
-            #    continue
             overlap_data.append( (f_id, g_id, score, identity,
                                   f_strain, f_start, f_end, f_len,
                                   g_strain, g_start, g_end, g_len) )
 
             overlap_count[f_id] = overlap_count.get(f_id,0)+1
             overlap_count[g_id] = overlap_count.get(g_id,0)+1
-
+            
+    print "###", len(overlap_data), len(contained_reads)
     overlap_set = set()
     sg = StringGraph()
-    #G=nx.Graph()
     for od in overlap_data:
         f_id, g_id, score, identity = od[:4]
         if f_id in contained_reads:
             continue
         if g_id in contained_reads:
             continue
-        #if overlap_count.get(f_id, 0) < 3 or overlap_count.get(f_id, 0) > 400:
-        #    continue
-        #if overlap_count.get(g_id, 0) < 3 or overlap_count.get(g_id, 0) > 400:
-        #    continue
         f_s, f_b, f_e, f_l = od[4:8]
         g_s, g_b, g_e, g_l = od[8:12]
         overlap_pair = [f_id, g_id]
         overlap_pair.sort()
         overlap_pair = tuple( overlap_pair )
-        if overlap_pair in overlap_set:
+        if overlap_pair in overlap_set:  # don't allow duplicated records
             continue
         else:
             overlap_set.add(overlap_pair)
 
         
-        if g_s == 1:
+        if g_s == 1: # reversed alignment; swap the begin and end coordinates
             g_b, g_e = g_e, g_b
+        
+        # build the string graph edges for each overlap
         if f_b > 24:
             if g_b < g_e:
                 """
@@ -990,26 +1192,29 @@ if __name__ == "__main__":
                 sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = "%s:%d-%d" % (f_id, f_e, f_l), 
                                                            length = abs(f_e - f_l),
                                                            score = -score)
-        
-    sg.mark_tr_edges()
-    print sum( [1 for c in sg.e_reduce.values() if c == True] )
-    print sum( [1 for c in sg.e_reduce.values() if c == False] )
-    G = SGToNXG(sg)
-    nx.write_adjlist(G, "full_string_graph.adj")
-    sg.mark_best_overlap()
-    print sum( [1 for c in sg.e_reduce.values() if c == False] )
-    #sg.mark_repeat_overlap()
-    #print sum( [1 for c in sg.repeat_overlap.values() if c == True] )
-    #print sum( [1 for c in sg.repeat_overlap.values() if c == False] )
-    #print len(sg.e_reduce), len(sg.repeat_overlap)
 
 
+    sg.init_reduce_dict()
+    #if not args.disable_chimer_prediction:
+    #    sg.mark_chimer_edge()
+    sg.mark_spur_edge()
+    sg.mark_tr_edges() # mark transitively redundant edges
+
+    #if DEBUG_LOG_LEVEL > 1:
+    if 1:
+        print sum( [1 for c in sg.e_reduce.values() if c == True] )
+        print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+    sg.mark_best_overlap() # mark the best-overlap edges
+
+    if DEBUG_LOG_LEVEL > 1:
+        print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
 
     G = SGToNXG(sg)
-    nx.write_gexf(G, "string_graph.gexf")
-    nx.write_adjlist(G, "string_graph.adj")
+    nx.write_gexf(G, "string_graph.gexf") # output the raw string graph for visualization
+    nx.write_adjlist(G, "string_graph.adj") # write out the full adjacency list of the string graph
 
-    #generate_max_contig(sg, seqs, out_fn="max_tigs.fa")
-    u_edges = generate_unitig(sg, seqs, out_fn = "unitgs.fa")
-    ASM_graph = get_bundles(u_edges )
+    u_edges = generate_unitig(sg, seqs, out_fn = "unitgs.fa") # reduce the string graph to a unitig graph
+    ASM_graph = get_bundles(u_edges)  # get the assembly
     nx.write_gexf(ASM_graph, "asm_graph.gexf")
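
For readers tracing the edge clean-up above: every read contributes two string-graph nodes, "<read_id>:B" and "<read_id>:E", and each overlap edge (v, w) has a reverse-complement dual obtained by flipping the B/E suffix of both nodes and reversing their order, which is why the removal loop flips the ends before looking up (w, v). A minimal sketch of that convention (the helper names below are illustrative, not part of falcon_kit):

    def flip_end(node):
        # "000123:B" <-> "000123:E"
        r_id, end = node.split(":")
        return r_id + ":" + ("E" if end == "B" else "B")

    def dual_edge(v, w):
        # the reverse-complement mirror of edge (v, w)
        return flip_end(w), flip_end(v)

    assert dual_edge("000001:B", "000002:E") == ("000002:B", "000001:E")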
diff --git a/src/py_scripts/falcon_dedup.py b/src/py_scripts_v0.1/falcon_dedup.py
similarity index 100%
rename from src/py_scripts/falcon_dedup.py
rename to src/py_scripts_v0.1/falcon_dedup.py
diff --git a/src/py_scripts/falcon_fixasm.py b/src/py_scripts_v0.1/falcon_fixasm.py
similarity index 98%
rename from src/py_scripts/falcon_fixasm.py
rename to src/py_scripts_v0.1/falcon_fixasm.py
index 33c1b8c..9475cef 100644
--- a/src/py_scripts/falcon_fixasm.py
+++ b/src/py_scripts_v0.1/falcon_fixasm.py
@@ -164,8 +164,8 @@ with open("primary_tigs_c.fa","w") as out_f:
                 seq, pos, full_tiling_path = get_seq(u_edges, r_edges, c_path)
                 for p, w, s, e in full_tiling_path:
                     print >> tiling_path_f, "%s_%02d" % (name, sub_idx), p, w, s, e
-                if len(full_tiling_path) <= 5:
-                    continue
+                #if len(full_tiling_path) <= 5:
+                #    continue
                 print >>out_f, ">%s_%02d" % (name, sub_idx)
                 print >>out_f, seq
                 print >>path_f, ">%s_%02d" % (name, sub_idx), " ".join(c_path)
diff --git a/src/py_scripts/falcon_overlap.py b/src/py_scripts_v0.1/falcon_overlap.py
similarity index 100%
rename from src/py_scripts/falcon_overlap.py
rename to src/py_scripts_v0.1/falcon_overlap.py
diff --git a/src/py_scripts/falcon_overlap2.py b/src/py_scripts_v0.1/falcon_overlap2.py
similarity index 100%
rename from src/py_scripts/falcon_overlap2.py
rename to src/py_scripts_v0.1/falcon_overlap2.py
diff --git a/src/py_scripts/falcon_qrm.py b/src/py_scripts_v0.1/falcon_qrm.py
similarity index 100%
copy from src/py_scripts/falcon_qrm.py
copy to src/py_scripts_v0.1/falcon_qrm.py
diff --git a/src/py_scripts/falcon_qrm.py b/src/py_scripts_v0.1/falcon_qrm_0.py
similarity index 80%
rename from src/py_scripts/falcon_qrm.py
rename to src/py_scripts_v0.1/falcon_qrm_0.py
index 5196b65..c07496f 100755
--- a/src/py_scripts/falcon_qrm.py
+++ b/src/py_scripts_v0.1/falcon_qrm_0.py
@@ -49,7 +49,6 @@ import math
 
 global sa_ptr, sda_ptr, lk_ptr
 global q_seqs,t_seqs, seqs
-global n_candidates, max_candidates
 
 seqs = []
 RC_MAP = dict( zip("ACGTacgtNn-", "TGCAtgcaNn-") )
@@ -107,24 +106,34 @@ def get_alignment(seq1, seq0):
     aln_size = 1
     if e1 - s1 > 500:
 
-        aln_size = max( e1-s1, e0-s0 )
-        aln_score = int(km_score * 48)
-        aln_q_s = s1
-        aln_q_e = e1
-        aln_t_s = s0
-        aln_t_e = e0
+        #aln_size = max( e1-s1, e0-s0 )
+        #aln_score = int(km_score * 2)
+        #aln_q_s = s1
+        #aln_q_e = e1
+        #aln_t_s = s0
+        #aln_t_e = e0
+        
+        alignment = DWA.align(seq1[s1:e1], e1-s1,
+                              seq0[s0:e0], e0-s0,
+                              500, 0)
+        aln_size = alignment[0].aln_str_size
+        aln_score = 4 * alignment[0].aln_str_size - 5 * alignment[0].dist
+        aln_q_s = alignment[0].aln_q_s
+        aln_q_e = alignment[0].aln_q_e
+        aln_t_s = alignment[0].aln_t_s
+        aln_t_e = alignment[0].aln_t_e
+        assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
+        #print aln_str1
+        #print aln_str0
+
+        if aln_size > 500: 
+            contain_status = "overlap"            
+        DWA.free_alignment(alignment)
         
     kup.free_seq_addr_array(sda_ptr)
     kup.free_seq_array(sa_ptr)
     kup.free_kmer_lookup(lk_ptr)
 
-    if s1 > 1000 and s0 > 1000:
-        return 0, 0, 0, 0, 0, 0, "none"
-
-    if len_1 - e1 > 1000 and len_0 - e0 > 1000:
-        return 0, 0, 0, 0, 0, 0, "none"
-
-
     if e1 - s1 > 500 and aln_size > 500:
         return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_score, "aln"
     else:
@@ -133,49 +142,51 @@ def get_alignment(seq1, seq0):
 def get_candidate_aln(hit_input):
     
     global q_seqs, seqs, t_seqs, q_len
-    global max_candidates
-    global n_candidates
     q_name, hit_index_f, hit_index_r = hit_input
     q_seq = q_seqs[q_name]
 
     rtn = []
     hit_index = hit_index_f
     c = collections.Counter(hit_index)
-    s = [(c[0],c[1]) for c in c.items() if c[1] > 4]
+    s = [(c[0],c[1]) for c in c.items() if c[1] > 6]
     
-    hit_data = {}
-    #hit_ids = set()
-
+    hit_data = []
+    hit_ids = set()
     for p, hit_count in s:
         hit_id = seqs[p][0]
-        hit_data.setdefault(hit_id, [0, 0 ,0])
-        hit_data[hit_id][0] += hit_count;
-        if hit_count > hit_data[hit_id][1]:
-            hit_data[hit_id][1] = hit_count
-        hit_data[hit_id][2] += 1
-
-    hit_data = hit_data.items()
+        if hit_id == q_name or hit_id in hit_ids:
+            continue
+        if hit_id not in hit_ids:
+            hit_ids.add(hit_id)
+            hit_data.append( (hit_id, t_seqs[hit_id], len(t_seqs[hit_id]), hit_count) )
 
-    hit_data.sort( key=lambda x:-x[1][0] )
+    hit_data.sort( key=lambda x:-x[2] )
 
     target_count = {}
     total_hit = 0
 
-    for hit in hit_data[:n_candidates]:
+    for hit in hit_data:
         hit_id = hit[0]
-        hit_count = hit[1][0]
+        hit_count = hit[3]
         target_count.setdefault(hit_id, 0)
-        if target_count[hit_id] > max_candidates:
+        if target_count[hit_id] > 64:
             continue
-        if total_hit > max_candidates:
+        if total_hit > 64:
             continue
-        seq1, seq0 = q_seq, t_seqs[hit_id]
+        seq1, seq0 = q_seq, hit[1] 
         aln_data = get_alignment(seq1, seq0)
         if rtn != None:
              
             s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
             if c_status == "none":
                 continue
+            """
+            if e1 - s1 < 5000:
+                if -aln_score > -8000:
+                    continue
+                if (100.0*aln_score/(aln_size+1)) < 150:
+                    continue
+            """
             target_count[hit_id] += 1
             total_hit += 1
             rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)), 
@@ -186,41 +197,44 @@ def get_candidate_aln(hit_input):
     
     hit_index = hit_index_r 
     c = collections.Counter(hit_index)
-    s = [(c[0],c[1]) for c in c.items() if c[1] > 4]
-
-    hit_data = {}
-    #hit_ids = set()
+    s = [(c[0],c[1]) for c in c.items() if c[1] > 6]
 
+    hit_data = []
+    hit_ids = set()
     for p, hit_count in s:
         hit_id = seqs[p][0]
-        hit_data.setdefault(hit_id, [0, 0 ,0])
-        hit_data[hit_id][0] += hit_count;
-        if hit_count > hit_data[hit_id][1]:
-            hit_data[hit_id][1] = hit_count
-        hit_data[hit_id][2] += 1
-
-    hit_data = hit_data.items()
-
-    hit_data.sort( key=lambda x:-x[1][0] )
+        if hit_id == q_name or hit_id in hit_ids:
+            continue
+        if hit_id not in hit_ids:
+            hit_ids.add(hit_id)
+            hit_data.append( (hit_id, t_seqs[hit_id], len(t_seqs[hit_id]), hit_count) )
 
+    hit_data.sort( key=lambda x:-x[2] )
 
     target_count = {}
     total_hit = 0
 
-    for hit in hit_data[:n_candidates]:
+    for hit in hit_data:
         hit_id = hit[0] 
-        hit_count = hit[1][0]
+        hit_count = hit[3]
         target_count.setdefault(hit_id, 0)
-        if target_count[hit_id] > max_candidates:
+        if target_count[hit_id] > 64:
             continue
-        if total_hit > max_candidates:
+        if total_hit > 64:
             continue
-        seq1, seq0 = r_q_seq, t_seqs[hit_id]
+        seq1, seq0 = r_q_seq, hit[1]
         aln_data = get_alignment(seq1, seq0)
         if rtn != None:
             s1, e1, s2, e2, aln_size, aln_score, c_status = aln_data
             if c_status == "none":
                 continue
+            """
+            if e1 - s1 < 5000:
+                if -aln_score > -8000:
+                    continue
+                if (100.0*aln_score/(aln_size+1)) < 150:
+                    continue
+            """
             target_count[hit_id] += 1
             total_hit += 1
             rtn.append( ( q_name, hit_id, -aln_score, "%0.2f" % (100.0*aln_score/(aln_size+1)), 
@@ -306,18 +320,10 @@ if __name__ == "__main__":
                         help='number of processes used for detailed overlapping evalution')
     parser.add_argument('--d_core', type=int, default=1, 
                         help='number of processes used for k-mer matching')
-    parser.add_argument('--n_candidates', type=int, default=128, 
-                        help='number of candidates for read matching')
-    parser.add_argument('--max_candidates', type=int, default=64, 
-                        help='max number for read matching to output')
-
 
 
     args = parser.parse_args()
 
-    max_candidates = args.max_candidates
-    n_candidates = args.n_candidates
-
     q_seqs = {}
     t_seqs = {}
     if  args.min_len < 1200:
@@ -351,9 +357,11 @@ if __name__ == "__main__":
             fn = fn.strip()
             f = FastaReader(fn) # take one commnad line argument of the input fasta file name
             for r in f:
-                seq = r.sequence.upper()
-                #if fivemer_entropy(seq) < 4:
+                #if len(r.sequence) < args.min_len:
                 #    continue
+                seq = r.sequence.upper()
+                if fivemer_entropy(seq) < 4:
+                    continue
                 q_seqs[r.name] = seq
 
 
@@ -364,7 +372,7 @@ if __name__ == "__main__":
 
     
     #for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs)):
-    for r in pool.imap(get_candidate_aln, lookup_data_iterator(q_seqs, m_pool)):
+    for r in pool.imap(get_candidate_aln, lookup_data_iterator( q_seqs, m_pool)):
         for h in r:
             print " ".join([str(x) for x in h]) 
 
diff --git a/src/py_scripts/falcon_sense.py b/src/py_scripts_v0.1/falcon_sense.py
similarity index 95%
rename from src/py_scripts/falcon_sense.py
rename to src/py_scripts_v0.1/falcon_sense.py
index c23b7bf..f2386ae 100644
--- a/src/py_scripts/falcon_sense.py
+++ b/src/py_scripts_v0.1/falcon_sense.py
@@ -156,9 +156,11 @@ def get_consensus_with_trim( c_input ):
 
 
 def get_seq_data(config):
+    min_cov, K, local_match_count_window, local_match_count_threshold, max_n_read, min_idt, edge_tolerance, trim_size = config
     seqs = []
     seed_id = None
     seqs_data = []
+    read_ids = set()
     with sys.stdin as f:
         for l in f:
             l = l.strip().split()
@@ -169,12 +171,15 @@ def get_seq_data(config):
                     if len(seqs) == 0:
                         seqs.append(l[1]) #the "seed"
                         seed_id = l[0]
-                    seqs.append(l[1])
+                    if l[0] not in read_ids: # avoid using the same read twice
+                        read_ids.add(l[0])
+                        seqs.append(l[1])
             elif l[0] == "+":
                 if len(seqs) > 10:
-                    yield (seqs, seed_id, config) 
+                    seqs.sort( key=lambda x: -len(x) )
+                    yield (seqs[:max_n_read], seed_id, config) 
                 #seqs_data.append( (seqs, seed_id) ) 
                 seqs = []
+                read_ids = set()
                 seed_id = None
             elif l[0] == "-":
                 #yield (seqs, seed_id)
diff --git a/src/py_scripts/falcon_ucns_data.py b/src/py_scripts_v0.1/falcon_ucns_data.py
similarity index 96%
rename from src/py_scripts/falcon_ucns_data.py
rename to src/py_scripts_v0.1/falcon_ucns_data.py
index feae510..aecd33a 100644
--- a/src/py_scripts/falcon_ucns_data.py
+++ b/src/py_scripts_v0.1/falcon_ucns_data.py
@@ -76,8 +76,8 @@ if __name__ == "__main__":
         offsets = []
         seqs = []
         p_tig = p_tigs_db[p_tig_id]
-        if len(tiling_path[p_tig_id]) <= 5:
-            continue
+        #if len(tiling_path[p_tig_id]) <= 2:
+        #    continue
         print p_tig_id, 0, p_tig
         for offset, s_id, end, s, e in tiling_path[p_tig_id]:
             seq = seq_db[s_id]
@@ -101,8 +101,8 @@ if __name__ == "__main__":
         offsets = []
         seqs = []
         a_tig = a_tigs_db[a_tig_id]
-        if len(tiling_path[a_tig_id]) <= 5:
-            continue
+        #if len(tiling_path[a_tig_id]) <= 2:
+        #    continue
         print a_tig_id, 0, a_tig
         for offset, s_id, end, s, e in tiling_path[a_tig_id]:
             seq = seq_db[s_id]
diff --git a/src/py_scripts/falcon_utgcns.py b/src/py_scripts_v0.1/falcon_utgcns.py
similarity index 100%
rename from src/py_scripts/falcon_utgcns.py
rename to src/py_scripts_v0.1/falcon_utgcns.py
diff --git a/src/py_scripts_v0.1/get_ovl.sh b/src/py_scripts_v0.1/get_ovl.sh
new file mode 100644
index 0000000..417f03b
--- /dev/null
+++ b/src/py_scripts_v0.1/get_ovl.sh
@@ -0,0 +1,7 @@
+/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {}  | python overlap_filter_step1.py > {}.ignore" ::: *.las
+rm all.ignore
+cat *.ignore > all.ignore
+/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {}  | python overlap_filter_step2.py > {}.rc" ::: *.las
+cat *.rc > rc_out_all
+rm *.rc
+/mnt/secondary/Share/HBAR_03202013/bin/parallel -j 32 "LA4Falcon -mo -H2000 {}  | python overlap_filter_step3.py > {}.ovl" ::: *.las
diff --git a/src/py_scripts/get_rdata.py b/src/py_scripts_v0.1/get_rdata.py
similarity index 100%
rename from src/py_scripts/get_rdata.py
rename to src/py_scripts_v0.1/get_rdata.py
diff --git a/src/py_scripts/overlapper.py b/src/py_scripts_v0.1/overlapper.py
similarity index 100%
rename from src/py_scripts/overlapper.py
rename to src/py_scripts_v0.1/overlapper.py
diff --git a/src/py_scripts_v0.1/ovlp_filter.sh b/src/py_scripts_v0.1/ovlp_filter.sh
new file mode 100644
index 0000000..608389e
--- /dev/null
+++ b/src/py_scripts_v0.1/ovlp_filter.sh
@@ -0,0 +1,6 @@
+source /mnt/secondary/Share/HBAR_03202013/bin/activate
+parallel -j 24 "LA4Falcon -mo -H10000 {}  | python overlap_filter_step1.py > {}.ignore" ::: *.las
+cat *.ignore > all.ignore
+parallel -j 24 "LA4Falcon -mo -H10000 {}  | python overlap_filter_step2.py > {}.rc" ::: *.las
+cat *.rc > rc_out_all
+parallel -j 24 "LA4Falcon -mo -H10000 {}  | python overlap_filter_step3.py > {}.ovl" ::: *.las
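
The rc_out_all file assembled here (from the per-.las *.rc outputs of overlap_filter_step2.py) is the per-read classification that the string-graph assembler reads back in before building the graph. A minimal sketch of the consuming side, mirroring the loop in the assembler above (column 2 codes: "1" = contained read, "2" = suspected chimer):

    contained_reads = set()
    chimer_ids = set()
    with open("rc_out_all") as f:
        for line in f:
            read_id, code = line.strip().split()[:2]
            if code == "1":
                contained_reads.add(read_id)
            elif code == "2":
                chimer_ids.add(read_id)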
diff --git a/src/py_scripts_v0.1/redis_graph.py b/src/py_scripts_v0.1/redis_graph.py
new file mode 100644
index 0000000..41cc785
--- /dev/null
+++ b/src/py_scripts_v0.1/redis_graph.py
@@ -0,0 +1,79 @@
+import redis
+import sys
+from pbcore.io import FastaReader
+
+
+r = redis.StrictRedis(host='localhost', port=6379, db=0)
+
+class RedisList(object):
+
+    def __init__(self, rs):
+        self._rs = rs
+        self.id_ = "pid:" + str( id(self) )
+
+    def append(self, value):
+        self._rs.rpush( self.id_, value)
+
+    def __len__(self):
+        return self._rs.llen( self.id_ )
+
+    def __getitem__(self, i):
+        return self._rs.lrange( self.id_, i, i)
+
+    def pylist(self):
+        return self._rs.lrange( self.id_, 0, -1)
+
+    def __del__(self):
+        self._rs.delete(self.id_)
+
+class RedisDict(object):
+
+    def __init__(self, rs):
+        self._rs = rs
+        self.id_ = "pid:" + str( id(self) )
+
+    def __setitem__(self, key, value):
+        self._rs.hset( self.id_, key, value )
+
+    def __getitem__(self, key):
+        return self._rs.hget( self.id_, key )
+
+    def __delitem__(self, key):
+        return self._rs.hdel( self.id_, key)
+
+
+    def __len__(self):
+        return self._rs.hlen( self.id_ )
+    
+    def keys(self):
+        return self._rs.hgetall( self.id_ ).keys()
+
+    def values(self):
+        return self._rs.hgetall( self.id_ ).values()
+
+    def pydict(self):
+        return self._rs.hgetall( self.id_ )
+
+    def __del__(self):
+        self._rs.delete(self.id_)
+
+def test_list():
+    x = RedisList(r)
+    x.append( "1" )
+    x.append( "2" )
+    print len(x)
+    print x.pylist()
+    del x
+
+    y = RedisDict(r)
+    y["a"] = "b"
+    y["b"] = 1
+    print y["a"]
+    del y["a"]
+    print y.values()
+    print y.keys()
+    print y.pydict()
+    del y
+
+if __name__ == "__main__":
+    test_list()
diff --git a/src/py_scripts/remove_dup_ctg.py b/src/py_scripts_v0.1/remove_dup_ctg.py
similarity index 100%
rename from src/py_scripts/remove_dup_ctg.py
rename to src/py_scripts_v0.1/remove_dup_ctg.py
diff --git a/test/HPCdaligner_synth0.sh b/test/HPCdaligner_synth0.sh
new file mode 100644
index 0000000..d6f6857
--- /dev/null
+++ b/test/HPCdaligner_synth0.sh
@@ -0,0 +1,11 @@
+# Daligner jobs (2)
+daligner -v -h1 -t16 -H1 -e0.7 -l1 -s1000 raw_reads.1 raw_reads.1
+daligner -v -h1 -t16 -H1 -e0.7 -l1 -s1000 raw_reads.2 raw_reads.1 raw_reads.2
+# Initial sort jobs (4)
+LAsort -v raw_reads.1.raw_reads.1.C0 raw_reads.1.raw_reads.1.N0 && LAmerge -v L1.1.1 raw_reads.1.raw_reads.1.C0.S raw_reads.1.raw_reads.1.N0.S && rm raw_reads.1.raw_reads.1.C0.S.las raw_reads.1.raw_reads.1.N0.S.las
+LAsort -v raw_reads.1.raw_reads.2.C0 raw_reads.1.raw_reads.2.N0 && LAmerge -v L1.1.2 raw_reads.1.raw_reads.2.C0.S raw_reads.1.raw_reads.2.N0.S && rm raw_reads.1.raw_reads.2.C0.S.las raw_reads.1.raw_reads.2.N0.S.las
+LAsort -v raw_reads.2.raw_reads.1.C0 raw_reads.2.raw_reads.1.N0 && LAmerge -v L1.2.1 raw_reads.2.raw_reads.1.C0.S raw_reads.2.raw_reads.1.N0.S && rm raw_reads.2.raw_reads.1.C0.S.las raw_reads.2.raw_reads.1.N0.S.las
+LAsort -v raw_reads.2.raw_reads.2.C0 raw_reads.2.raw_reads.2.N0 && LAmerge -v L1.2.2 raw_reads.2.raw_reads.2.C0.S raw_reads.2.raw_reads.2.N0.S && rm raw_reads.2.raw_reads.2.C0.S.las raw_reads.2.raw_reads.2.N0.S.las
+# Level 1 jobs (2)
+LAmerge -v raw_reads.1 L1.1.1 L1.1.2 && rm L1.1.1.las L1.1.2.las
+LAmerge -v raw_reads.2 L1.2.1 L1.2.2 && rm L1.2.1.las L1.2.2.las
diff --git a/test/test_actg_coordinate.py b/test/test_actg_coordinate.py
new file mode 100644
index 0000000..ed2c6ea
--- /dev/null
+++ b/test/test_actg_coordinate.py
@@ -0,0 +1,9 @@
+import falcon_kit.mains.actg_coordinate as mod
+
+'''
+def test_help():
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_consensus.py b/test/test_consensus.py
new file mode 100644
index 0000000..01a5fdd
--- /dev/null
+++ b/test/test_consensus.py
@@ -0,0 +1,7 @@
+import falcon_kit.mains.consensus as mod
+
+def test_help():
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
diff --git a/test/test_contig_annotate.py b/test/test_contig_annotate.py
new file mode 100644
index 0000000..41d00d6
--- /dev/null
+++ b/test/test_contig_annotate.py
@@ -0,0 +1,9 @@
+import falcon_kit.mains.contig_annotate as mod
+
+'''
+def test_help():
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_ctg_link_analysis.py b/test/test_ctg_link_analysis.py
new file mode 100644
index 0000000..4c8ed06
--- /dev/null
+++ b/test/test_ctg_link_analysis.py
@@ -0,0 +1,9 @@
+import falcon_kit.mains.ctg_link_analysis as mod
+
+'''
+def test_help():
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_functional.py b/test/test_functional.py
new file mode 100644
index 0000000..d6e5468
--- /dev/null
+++ b/test/test_functional.py
@@ -0,0 +1,31 @@
+from nose.tools import eq_
+import falcon_kit.functional as f
+import StringIO
+import os
+
+thisdir = os.path.dirname(os.path.abspath(__file__))
+example_HPCdaligner = open(os.path.join(thisdir, 'HPCdaligner_synth0.sh'))
+
+def test_get_daligner_job_descriptions():
+    result = f.get_daligner_job_descriptions(
+            example_HPCdaligner, 'raw_reads')
+    assert result
+    eq_(result[(1, 1)], "daligner -v -h1 -t16 -H1 -e0.7 -l1 -s1000 raw_reads.1 raw_reads.1\nLAsort -v raw_reads.1.raw_reads.1.C0 raw_reads.1.raw_reads.1.N0 && LAmerge -v L1.1.1 raw_reads.1.raw_reads.1.C0.S raw_reads.1.raw_reads.1.N0.S && rm raw_reads.1.raw_reads.1.C0.S.las raw_reads.1.raw_reads.1.N0.S.las\n")
+    eq_(result[(2, 1, 2)], "daligner -v -h1 -t16 -H1 -e0.7 -l1 -s1000 raw_reads.2 raw_reads.1 raw_reads.2\nLAsort -v raw_reads.1.raw_reads.2.C0 raw_reads.1.raw_reads.2.N0 && LAmerge -v L1.1.2 raw_reads.1.raw_reads.2.C0.S raw_reads.1.raw_reads.2.N0.S && rm raw_reads.1.raw_reads.2.C0.S.las raw_reads.1.raw_reads.2.N0.S.las\nLAsort -v raw_reads.2.raw_reads.1.C0 raw_reads.2.raw_reads.1.N0 && LAmerge -v L1.2.1 raw_reads.2.raw_reads.1.C0.S raw_reads.2.raw_reads.1.N0.S && rm raw_reads.2.raw_read [...]
+    eq_(len(result), 2)
+
+def test_xform_script_for_preads():
+    # Technically, we never have more than one daligner in a script, but that
+    # could change in pbsmrtpipe, since it limits the number of chunks.
+    script = 'daligner x y\nLAsort a b\ndaligner x1 y1\n'
+    expected = 'daligner_p x y\nLAsort a b\ndaligner_p x1 y1\n'
+    result = f.xform_script_for_preads(script)
+    eq_(result, expected)
+
+    script = 'daligner x y\nLAsort a b\ndaligner x1 y1\n'
+    expected = script # no-op
+    result = f.xform_script_for_raw_reads(script)
+    eq_(result, expected)
+
+    eq_(f.get_script_xformer(True), f.xform_script_for_preads)
+    eq_(f.get_script_xformer(False), f.xform_script_for_raw_reads)
diff --git a/test/test_graph_to_contig.py b/test/test_graph_to_contig.py
new file mode 100644
index 0000000..69feb60
--- /dev/null
+++ b/test/test_graph_to_contig.py
@@ -0,0 +1,9 @@
+import falcon_kit.mains.graph_to_contig as mod
+
+'''
+def test_help():
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_graph_to_utgs.py b/test/test_graph_to_utgs.py
new file mode 100644
index 0000000..0cea568
--- /dev/null
+++ b/test/test_graph_to_utgs.py
@@ -0,0 +1,9 @@
+import falcon_kit.mains.graph_to_utgs as mod
+
+'''
+def test_help():
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_ovlp_filter.py b/test/test_ovlp_filter.py
new file mode 100644
index 0000000..cd1e64c
--- /dev/null
+++ b/test/test_ovlp_filter.py
@@ -0,0 +1,65 @@
+import falcon_kit.mains.ovlp_filter as mod
+from nose.tools import assert_equal
+
+def test_help():
+    """Calls itself 'nosetests', but reports
+    proper help message otherwise.
+    """
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
+
+def test_several():
+    expected = ['000000001', '000000002', '000000017', '000000028']
+    data = """\
+000000000 000000001 -1807 100.00 0 181 1988 1988 0 0 1807 1989 overlap
+000000000 000000002 -823 99.88 0 0 823 1988 0 1166 1989 1989 overlap
+000000000 000000003 -50 99.94 0 0 50 1988 0 0 50 50 overlap
+000000000 000000017 -61 98.36 0 0 61 1988 0 1928 1989 1989 overlap
+000000000 000000028 -1952 79.95 0 0 1952 1988 0 37 1989 1989 overlap
+000000001 000000000 -1807 100.00 0 0 1807 1989 0 181 1988 1988 overlap
+000000001 000000002 -642 99.84 0 0 642 1989 0 1347 1989 1989 overlap
+000000002 000000000 -823 99.88 0 1166 1989 1989 0 0 823 1988 overlap
+000000002 000000001 -642 99.84 0 1347 1989 1989 0 0 642 1989 overlap
+000000003 000000000 -50 99.94 0 0 50 50 0 0 50 1988 overlap
+000000017 000000000 -61 98.36 0 1928 1989 1989 0 0 61 1988 overlap
+000000028 000000000 -1952 79.95 0 37 1989 1989 0 0 1952 1988 overlap
+"""
+    readlines = data.strip().splitlines
+    max_diff, max_ovlp, min_ovlp, min_len = 1000, 1000, 1, 1
+    got = mod.filter_stage1(readlines, max_diff, max_ovlp, min_ovlp, min_len)
+    assert_equal(expected, got)
+
+def test_one_not_ignored():
+    """This is the same as a line dropped in the earlier test.
+    """
+    expected = []
+    data = """\
+000000003 000000000 -50 99.94 0 0 50 50 0 0 50 1988 overlap
+"""
+    readlines = data.strip().splitlines
+    max_diff, max_ovlp, min_ovlp, min_len = 1000, 1000, 1, 1
+    got = mod.filter_stage1(readlines, max_diff, max_ovlp, min_ovlp, min_len)
+    assert_equal(expected, got)
+
+def test_one_line_ignored():
+    """This is the same as a line kept in the earlier test.
+    """
+    expected = ['000000017']
+    data = """\
+000000017 000000000 -61 98.36 0 1928 1989 1989 0 0 61 1988 overlap
+"""
+    readlines = data.strip().splitlines
+    max_diff, max_ovlp, min_ovlp, min_len = 1000, 1000, 1, 1
+    got = mod.filter_stage1(readlines, max_diff, max_ovlp, min_ovlp, min_len)
+    assert_equal(expected, got)
+
+def test_empty():
+    expected = []
+    data = """\
+"""
+    readlines = data.strip().splitlines
+    max_diff, max_ovlp, min_ovlp, min_len = 1000, 1000, 1, 1
+    got = mod.filter_stage1(readlines, max_diff, max_ovlp, min_ovlp, min_len)
+    assert_equal(expected, got)
diff --git a/test/test_ovlp_stats.py b/test/test_ovlp_stats.py
new file mode 100644
index 0000000..78fdae8
--- /dev/null
+++ b/test/test_ovlp_stats.py
@@ -0,0 +1,29 @@
+import falcon_kit.mains.ovlp_stats as mod
+from nose.tools import assert_equal
+
+def test_help():
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
+def test():
+    readlines = data.strip().splitlines
+    stats = mod.filter_stats(readlines, min_len=62)
+    #print stats
+    assert_equal(expected,  stats)
+
+expected = [('000000000', 1988, 2, 1), ('000000001', 1989, 2, 0), ('000000002', 1989, 0, 2), ('000000017', 1989, 0, 1)]
+data = """
+000000000 000000001 -1807 100.00 0 181 1988 1988 0 0 1807 1989 overlap
+000000000 000000002 -823 99.88 0 0 823 1988 0 1166 1989 1989 overlap
+000000000 000000003 -50 99.94 0 0 50 1988 0 0 50 50 overlap
+000000000 000000017 -61 98.36 0 0 61 1988 0 1928 1989 1989 overlap
+000000000 000000028 -1952 79.95 0 0 1952 1988 0 37 1989 1989 overlap
+000000001 000000000 -1807 100.00 0 0 1807 1989 0 181 1988 1988 overlap
+000000001 000000002 -642 99.84 0 0 642 1989 0 1347 1989 1989 overlap
+000000002 000000000 -823 99.88 0 1166 1989 1989 0 0 823 1988 overlap
+000000002 000000001 -642 99.84 0 1347 1989 1989 0 0 642 1989 overlap
+000000003 000000000 -50 99.94 0 0 50 50 0 0 50 1988 overlap
+000000017 000000000 -61 98.36 0 1928 1989 1989 0 0 61 1988 overlap
+000000028 000000000 -1952 79.95 0 37 1989 1989 0 0 1952 1988 overlap
+"""
diff --git a/test/test_ovlp_to_graph.py b/test/test_ovlp_to_graph.py
new file mode 100644
index 0000000..6d0e18f
--- /dev/null
+++ b/test/test_ovlp_to_graph.py
@@ -0,0 +1,7 @@
+import falcon_kit.mains.ovlp_to_graph as mod
+
+def test_help():
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
diff --git a/test/test_run.py b/test/test_run.py
new file mode 100644
index 0000000..813ccca
--- /dev/null
+++ b/test/test_run.py
@@ -0,0 +1,9 @@
+#import falcon_kit.mains.run as mod
+
+'''
+def test_help():
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/test/test_run_LG.py b/test/test_run_LG.py
new file mode 100644
index 0000000..91d18b9
--- /dev/null
+++ b/test/test_run_LG.py
@@ -0,0 +1,9 @@
+#import falcon_kit.mains.run_LG as mod
+
+'''
+def test_help():
+    try:
+        mod.main(['prog', '--help'])
+    except SystemExit:
+        pass
+'''
diff --git a/travis.sh b/travis.sh
new file mode 100755
index 0000000..5693676
--- /dev/null
+++ b/travis.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# -e: fail on error
+# -v: show commands
+# -x: show expanded commands
+set -vex
+
+#env | sort
+mkdir -p fc-env
+rm -f fc-env/bin/python
+virtualenv -p python2.7 fc-env || ../virtualenv/virtualenv.py fc-env
+. fc-env/bin/activate
+python setup.py -v install
+python -c 'import falcon_kit; print falcon_kit.falcon'
+
+# When doctests are passing, add this:
+pip install nose
+nosetests -v test/
+#nosetests -v --with-doctest fc-env/lib/python2.7/site-packages/falcon_kit

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/falconkit.git


